金典七步

  • step1: 需求分析(知道有哪些主题,哪些指标,哪些维度
  • step2: 建模分析(是否要压缩,行/列存储,索引,分区,分桶
  • step3: 建模操作,建库建表
  • step4: 数据采集(MySQL业务数据库 -> Hive(ODS层 如何验真: 总量校验,条件校验,抽样校验
  • step5: 数据清洗转换(空值过滤,空值转换,列值裁剪,切割
  • step6: 数据分析操作,结合主题,维度指标完成各种需求
  • step7: 数据导出操作,导出分析后的结果数据到mysql中,之后对接FineBI进行可视化展示
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
ODS层(建模规则为: edu_ods.表名
-- 表有: 前两张事实表,中间6张维度表,最后1张访问咨询表(这里没写web_chat_ems是访问咨询主题中的表)
客户关系表 customer_relationship
客户线索表 customer_clue
客户表 customer
客户申诉表 customer_appeal
学科表 itcast_subject
校区表 itcast_school
员工表 employee
部门表 scrm_department
访问咨询表web_chat_ems


DWD层(建模规则为: edu_dwd.表名
-- 表有两张事实表(要基于ODS层的表建成拉链表)
-- 将ODS层的数据加载到DWD层(清洗转换)
将create_date_time 通过substring函数转换为 yearinfo monthinfo dayinfo hourinfo
将origin_type 转换为 origin_type_state(NETSERVICE,PRESIGNUP 认为线上,其余线下)
将clue_state 转换为 clue_state_stat(值为VALID_NEW_CLUES为新客户)
将校区和学科的id字段进行转换, 如果为 0 或者 null 转换为 -1
将无效线索过滤,关联投诉表,判断clue_appeal_state有效无效
-- create_date_time 字段类型是,比如2011-08-19 09:24:57.0需要转换为year_info,quarter_info......
yearinfo,monthinfo,dayinfo都是分区字段(表customer_relationship)

-- origin_type 判断字段是线上还是线下 转为origin_type_state
'NETSERVICE', 'PRESIGNUP'都是线上,其余的是线下(表customer_relationship)

-- clue_state 判断字段是新用户还是老用户 转为clue_state_stat
'VALID_NEW_CLUES' --新客户新线索
'VALID_PUBLIC_NEW_CLUE' --老客户新线索
(表customer_clue cc)

-- 将校区和学科的id进行转换
itcast_school_id '校区Id',
itcast_school '校区',
itcast_subject_id '学科Id',
itcast_subject '学科',
(表customer_clue cc)
-- 将无效的线索过滤
customer_clue 中clue_appeal_state 判断有效无效: 0 有效 1无效(表customer_clue cc)

-- 要再次基础上构建拉链表, 多增加两个字段: start_time 和 end_time
分区字段为start_time会话开始时间
end_time为会话结束时间

-- 第七个需求涉及到web_chat_ems访问咨询表(ODS层需要建表)



DWB层(建模规则为: edu_dwb.表名
-- 降维,形成宽表,只有一张宽表itcast_intention_dwb

DM层(建模规则为: edu_dm.表名
-- 主要是进行分析,只有一张表

Sqoop脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/bin/bash

# 全量导入脚本
# sqoop 脚本(从MySQL导入数据到ODS层)

if [ $# == 1 ]
then
DATE_DT = $1 # 如果传入有1个参数,则取第一个参数
else
DATE_DT = `date -d '1 days ago' +'%Y-%m-%d'` # 如果传入的参数大于1个,或者没有参数,获取昨天的时间
fi # if判断结束

export SQOOP_HOME=/usr/bin/sqoop

jdbcUrl='jdbc:mysql://106.75.33.59:3306/scrm?useUnicode=true&characterEncoding=UTF-8&autoReconnect=true'
username='bjclass58'
password='bjclass58_itcast'


# 客户关系表 customer_relationship全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as dt from customer_relationship where 1=1 and (create_date_time between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table customer_relationship \
-m 1

# 客户线索表 customer_clue全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as dt from customer_clue where 1=1 and (create_date_time between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table customer_clue \
-m 1

# 客户表 customer全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from customer where 1=1 and (create_date_time between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table customer \
-m 1

# 学科表 itcast_subject全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from itcast_subject where 1=1 and (create_date_time between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table itcast_subject \
-m 1

# 校区表 itcast_school全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from itcast_school where 1=1 and (create_date_time between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table itcast_school \
-m 1

# 员工表 employee全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from employee where 1=1 and (create_date_time between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table employee \
-m 1

# 部门表 scrm_department全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from scrm_department where 1=1 and (create_date_time between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table scrm_department \
-m 1

# 客户申诉表 customer_appeal全量导入
/usr/bin/sqoop import "-Dorg.apache.sqoop.splitter.allow_text_splitter=true" \
--connect ${jdbcUrl} \
--username ${username} \
--password ${password} \
--query "select *,'${DATE_DT}' as start_time from customer_appeal where 1=1 and (create_date_time between '1970-01-01 00:00:00' and '${DATE_DT} 23:59:59') and \$CONDITIONS" \
--hcatalog-database edu_ods \
--hcatalog-table customer_appeal \
-m 1