金典七步

step1: 需求分析(知道有哪些主题,哪些指标,哪些维度

step2: 建模分析(是否要压缩,行/列存储,索引,分区,分桶

step3: 建模操作,建库建表

step4: 数据采集(MySQL业务数据库 -> Hive(ODS层如何验真: 总量校验,条件校验,抽样校验

step5: 数据清洗转换(空值过滤,空值转换,列值裁剪,切割

step6: 数据分析操作,结合主题,维度指标完成各种需求

step7: 数据导出操作,导出分析后的结果数据到mysql中,之后对接FineBI进行可视化展示

ODS层(建模规则为: edu_ods.表名
-- 表有: 前两张事实表,中间6张维度表,最后1张访问咨询表(这里没写web_chat_ems是访问咨询主题中的表)
	客户关系表 customer_relationship
	客户线索表 customer_clue
	客户表 customer
	客户申诉表 customer_appeal
	学科表 itcast_subject
	校区表 itcast_school
	员工表 employee
	部门表 scrm_department
	访问咨询表web_chat_ems
	

DWD层(建模规则为: edu_dwd.表名
	-- 表有两张事实表(要基于ODS层的表建成拉链表)
	-- 将ODS层的数据加载到DWD层(清洗转换)
	将create_date_time 通过substring函数转换为 yearinfo monthinfo dayinfo hourinfo
	将origin_type 转换为 origin_type_state(NETSERVICE,PRESIGNUP 认为线上,其余线下)
	将clue_state 转换为 clue_state_stat(值为VALID_NEW_CLUES为新客户)
	将校区和学科的id字段进行转换, 如果为 0 或者 null 转换为 -1
	将无效线索过滤,关联投诉表,判断clue_appeal_state有效无效
-- create_date_time 字段类型是,比如2011-08-19 09:24:57.0需要转换为year_info,quarter_info......
	yearinfo,monthinfo,dayinfo都是分区字段(表customer_relationship)

-- origin_type 判断字段是线上还是线下 转为origin_type_state   
	'NETSERVICE', 'PRESIGNUP'都是线上,其余的是线下(表customer_relationship)

-- clue_state 判断字段是新用户还是老用户 转为clue_state_stat   
	'VALID_NEW_CLUES'  		--新客户新线索
	'VALID_PUBLIC_NEW_CLUE'  	--老客户新线索
	(表customer_clue cc)
	
-- 将校区和学科的id进行转换  
	 itcast_school_id 	'校区Id',
 	 itcast_school  		'校区',
 	 itcast_subject_id	'学科Id',
 	 itcast_subject  	'学科',
	(表customer_clue cc)
-- 将无效的线索过滤
	customer_clue 中clue_appeal_state 判断有效无效: 0 有效  1无效(表customer_clue cc)

-- 要再次基础上构建拉链表, 多增加两个字段: start_time 和 end_time
	分区字段为start_time会话开始时间
	end_time为会话结束时间

-- 第七个需求涉及到web_chat_ems访问咨询表(ODS层需要建表)


	
DWB层(建模规则为: edu_dwb.表名
	-- 降维,形成宽表,只有一张宽表itcast_intention_dwb
	
DM层(建模规则为: edu_dm.表名
	-- 主要是进行分析,只有一张表

Sqoop脚本

#!/bin/bash

# 全量导入脚本
#  sqoop 脚本(从MySQL导入数据到ODS层)

if [ $# == 1 ]
then
	DATE_DT = $1  # 如果传入有1个参数,则取第一个参数
else
	DATE_DT = `date -d '1 days ago' +'%Y-%m-%d'`  # 如果传入的参数大于1个,或者没有参数,获取昨天的时间
fi  # if判断结束

export SQOOP_HOME=/usr/bin/sqoop

jdbcUrl='jdbc:mysql://106.75.33.59:3306/scrm?useUnicode=true&characterEncoding=UTF-8&autoReconnect=true'
username='bjclass58'
password='bjclass58_itcast'


# 客户关系表 customer_relationship全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as dt from customer_relationship where 1=1 and (create_date_time  between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table customer_relationship \
-m 1

# 客户线索表 customer_clue全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as dt from customer_clue where 1=1 and (create_date_time  between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table customer_clue \
-m 1

# 客户表 customer全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from customer where 1=1 and (create_date_time  between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table customer \
-m 1

# 学科表 itcast_subject全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from itcast_subject where 1=1 and (create_date_time  between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table itcast_subject \
-m 1

# 校区表 itcast_school全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from itcast_school where 1=1 and (create_date_time  between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table itcast_school \
-m 1

# 员工表 employee全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from employee where 1=1 and (create_date_time  between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table employee \
-m 1

# 部门表 scrm_department全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from scrm_department where 1=1 and (create_date_time  between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table scrm_department \
-m 1

# 客户申诉表 customer_appeal全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from customer_appeal where 1=1 and (create_date_time  between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table customer_appeal \
-m 1