1.1 DWD 层(业务数据)
1.1.1 商品维度表(全量表)
1)建表语句
DROP TABLE IF EXISTS `dwd_dim_sku_info`; CREATE EXTERNAL TABLE `dwd_dim_sku_info` ( `id` string COMMENT \'商品 id\', `spu_id` string COMMENT \'spuid\', `price` double COMMENT \'商品价格\', `sku_name` string COMMENT \'商品名称\', `sku_desc` string COMMENT \'商品描述\', `weight` double COMMENT \'重量\', `tm_id` string COMMENT \'品牌 id\', `tm_name` string COMMENT \'品牌名称\', `category3_id` string COMMENT \'三级分类 id\', `category2_id` string COMMENT \'二级分类 id\', `category1_id` string COMMENT \'一级分类 id\', `category3_name` string COMMENT \'三级分类名称\', `category2_name` string COMMENT \'二级分类名称\', `category1_name` string COMMENT \'一级分类名称\', `spu_name` string COMMENT \'spu 名称\', `create_time` string COMMENT \'创建时间\' ) COMMENT \'商品维度表\' PARTITIONED BY (`dt` string) stored as parquet location \'/warehouse/gmall/dwd/dwd_dim_sku_info/\' tblproperties ("parquet.compression"="lzo");
2)数据装载
insert overwrite table dwd_dim_sku_info partition(dt=\'2020-03-10\') select sku.id, sku.spu_id, sku.price, sku.sku_name, sku.sku_desc, sku.weight, sku.tm_id, ob.tm_name, sku.category3_id, c2.id category2_id, c1.id category1_id, c3.name category3_name, c2.name category2_name, c1.name category1_name, spu.spu_name, sku.create_time from ( select * from ods_sku_info where dt=\'2020-03-10\' )sku join ( select * from ods_base_trademark where dt=\'2020-03-10\' )ob on sku.tm_id=ob.tm_id join ( select * from ods_spu_info where dt=\'2020-03-10\' )spu on spu.id = sku.spu_id join ( select * from ods_base_category3 where dt=\'2020-03-10\' )c3 on sku.category3_id=c3.id join ( select * from ods_base_category2 where dt=\'2020-03-10\' )c2 on c3.category2_id=c2.id join ( select * from ods_base_category1 where dt=\'2020-03-10\' )c1 on c2.category1_id=c1.id;
1.1.2 优惠券信息表(全量)
把 ODS 层 ods_coupon_info 表数据导入到 DWD 层优惠卷信息表,在导入过程中可以做适当的清洗
1)建表语句
drop table if exists dwd_dim_coupon_info; create external table dwd_dim_coupon_info( `id` string COMMENT \'购物券编号\', `coupon_name` string COMMENT \'购物券名称\', `coupon_type` string COMMENT \'购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券\', `condition_amount` string COMMENT \'满额数\', `condition_num` string COMMENT \'满件数\', `activity_id` string COMMENT \'活动编号\', `benefit_amount` string COMMENT \'减金额\', `benefit_discount` string COMMENT \'折扣\', `create_time` string COMMENT \'创建时间\', `range_type` string COMMENT \'范围类型 1、商品 2、品类 3、品牌\', `spu_id` string COMMENT \'商品 id\', `tm_id` string COMMENT \'品牌 id\', `category3_id` string COMMENT \'品类 id\', `limit_num` string COMMENT \'最多领用次数\', `operate_time` string COMMENT \'修改时间\', `expire_time` string COMMENT \'过期时间\' ) COMMENT \'优惠券信息表\' PARTITIONED BY (`dt` string) row format delimited fields terminated by \'\t\' stored as parquet location \'/warehouse/gmall/dwd/dwd_dim_coupon_info/\' tblproperties ("parquet.compression"="lzo");
2)数据装载
insert overwrite table dwd_dim_coupon_info partition(dt=\'2020-03-10\') select id, coupon_name, coupon_type, condition_amount, condition_num, activity_id, benefit_amount, benefit_discount, create_time, range_type, spu_id, tm_id, category3_id, limit_num, operate_time, expire_time from ods_coupon_info where dt=\'2020-03-10\';
3)查询加载结果
select * from dwd_dim_coupon_info where dt=\'2020-03-10\';
1.1.3 活动维度表(全量)
1)建表语句
drop table if exists dwd_dim_activity_info; create external table dwd_dim_activity_info( `id` string COMMENT \'编号\', `activity_name` string COMMENT \'活动名称\', `activity_type` string COMMENT \'活动类型\', `condition_amount` string COMMENT \'满减金额\', `condition_num` string COMMENT \'满减件数\', `benefit_amount` string COMMENT \'优惠金额\', `benefit_discount` string COMMENT \'优惠折扣\', `benefit_level` string COMMENT \'优惠级别\', `start_time` string COMMENT \'开始时间\', `end_time` string COMMENT \'结束时间\', `create_time` string COMMENT \'创建时间\' ) COMMENT \'活动信息表\' PARTITIONED BY (`dt` string) row format delimited fields terminated by \'\t\' stored as parquet location \'/warehouse/gmall/dwd/dwd_dim_activity_info/\' tblproperties ("parquet.compression"="lzo");
2)数据装载
insert overwrite table dwd_dim_activity_info partition(dt=\'2020-03-10\') select info.id, info.activity_name, info.activity_type, rule.condition_amount, rule.condition_num, rule.benefit_amount, rule.benefit_discount, rule.benefit_level, info.start_time, info.end_time, info.create_time from ( select * from ods_activity_info where dt=\'2020-03-10\' )info left join ( select * from ods_activity_rule where dt=\'2020-03-10\' )rule on info.id = rule.activity_id;
3)查询加载结果
select * from dwd_dim_activity_info where dt=\'2020-03-10\';
1.1.4 地区维度表(特殊)
1)建表语句
DROP TABLE IF EXISTS `dwd_dim_base_province`; CREATE EXTERNAL TABLE `dwd_dim_base_province` ( `id` string COMMENT \'id\', `province_name` string COMMENT \'省市名称\', `area_code` string COMMENT \'地区编码\', `iso_code` string COMMENT \'ISO 编码\', `region_id` string COMMENT \'地区 id\', `region_name` string COMMENT \'地区名称\' ) COMMENT \'地区省市表\' stored as parquet location \'/warehouse/gmall/dwd/dwd_dim_base_province/\' tblproperties ("parquet.compression"="lzo");
2)数据装载
insert overwrite table dwd_dim_base_province select bp.id, bp.name, bp.area_code, bp.iso_code, bp.region_id, br.region_name from ods_base_province bp join ods_base_region br on bp.region_id=br.id;
1.1.5 时间维度表(特殊)(预留)
1)建表语句
DROP TABLE IF EXISTS `dwd_dim_date_info`; CREATE EXTERNAL TABLE `dwd_dim_date_info`( `date_id` string COMMENT \'日\', `week_id` int COMMENT \'周\', `week_day` int COMMENT \'周的第几天\', `day` int COMMENT \'每月的第几天\', `month` int COMMENT \'第几月\', `quarter` int COMMENT \'第几季度\', `year` int COMMENT \'年\', `is_workday` int COMMENT \'是否是周末\', `holiday_id` int COMMENT \'是否是节假日\' ) row format delimited fields terminated by \'\t\' stored as parquet location \'/warehouse/gmall/dwd/dwd_dim_date_info/\' tblproperties ("parquet.compression"="lzo");
2)把 date_info.txt 文件上传到 node01 的 /opt/modules/db_log/路径
3)数据装载
load data local inpath \'/opt/modules/db_log/date_info.txt\' into table dwd_dim_date_info;
4)查询加载结果
select * from dwd_dim_date_info;
1.1.6 订单明细事实表(事务型快照事实表)
1)建表语句
drop table if exists dwd_fact_order_detail; create external table dwd_fact_order_detail ( `id` string COMMENT \'订单编号\', `order_id` string COMMENT \'订单号\', `user_id` string COMMENT \'用户 id\', `sku_id` string COMMENT \'sku 商品 id\', `sku_name` string COMMENT \'商品名称\', `order_price` decimal(10,2) COMMENT \'商品价格\', `sku_num` bigint COMMENT \'商品数量\', `create_time` string COMMENT \'创建时间\', `province_id` string COMMENT \'省份 ID\', `total_amount` decimal(20,2) COMMENT \'订单总金额\' ) PARTITIONED BY (`dt` string) stored as parquet location \'/warehouse/gmall/dwd/dwd_fact_order_detail/\' tblproperties ("parquet.compression"="lzo");
2)数据装载
insert overwrite table dwd_fact_order_detail partition(dt=\'2020-03-10\') select od.id, od.order_id, od.user_id, od.sku_id, od.sku_name, od.order_price, od.sku_num, od.create_time, oi.province_id, od.order_price*od.sku_num from ( select * from ods_order_detail where dt=\'2020-03-10\' ) od join ( select * from ods_order_info where dt=\'2020-03-10\' ) oi on od.order_id=oi.id;
3)查询加载结果
select * from dwd_fact_order_detail where dt=\'2020-03-10\';
1.1.7 支付事实表(事务型快照事实表)
1)建表语句
drop table if exists dwd_fact_payment_info; create external table dwd_fact_payment_info ( `id` string COMMENT \'\', `out_trade_no` string COMMENT \'对外业务编号\', `order_id` string COMMENT \'订单编号\', `user_id` string COMMENT \'用户编号\', `alipay_trade_no` string COMMENT \'支付宝交易流水编号\', `payment_amount` decimal(16,2) COMMENT \'支付金额\', `subject` string COMMENT \'交易内容\', `payment_type` string COMMENT \'支付类型\', `payment_time` string COMMENT \'支付时间\', `province_id` string COMMENT \'省份 ID\' ) PARTITIONED BY (`dt` string) stored as parquet location \'/warehouse/gmall/dwd/dwd_fact_payment_info/\' tblproperties ("parquet.compression"="lzo");
2)数据装载
insert overwrite table dwd_fact_payment_info partition(dt=\'2020-03-10\') select pi.id, pi.out_trade_no, pi.order_id, pi.user_id, pi.alipay_trade_no, pi.total_amount, pi.subject, pi.payment_type, pi.payment_time, oi.province_id from ( select * from ods_payment_info where dt=\'2020-03-10\' )pi join ( select id, province_id from ods_order_info where dt=\'2020-03-10\' )oi on pi.order_id = oi.id;
3)查询加载结果
select * from dwd_fact_payment_info where dt=\'2020-03-10\';
1.1.8 退款事实表(事务型快照事实表)
把 ODS 层 ods_order_refund_info 表数据导入到 DWD 层退款事实表,在导入过程中可以做适当的清洗
1)建表语句
drop table if exists dwd_fact_order_refund_info; create external table dwd_fact_order_refund_info( `id` string COMMENT \'编号\', `user_id` string COMMENT \'用户 ID\', `order_id` string COMMENT \'订单 ID\', `sku_id` string COMMENT \'商品 ID\', `refund_type` string COMMENT \'退款类型\', `refund_num` bigint COMMENT \'退款件数\', `refund_amount` decimal(16,2) COMMENT \'退款金额\', `refund_reason_type` string COMMENT \'退款原因类型\', `create_time` string COMMENT \'退款时间\' ) COMMENT \'退款事实表\' PARTITIONED BY (`dt` string) row format delimited fields terminated by \'\t\' location \'/warehouse/gmall/dwd/dwd_fact_order_refund_info/\';
2)数据装载
insert overwrite table dwd_fact_order_refund_info partition(dt=\'2020-03-10\') select id, user_id, order_id, sku_id, refund_type, refund_num, refund_amount, refund_reason_type, create_time from ods_order_refund_info where dt=\'2020-03-10\';
3)查询加载结果
select * from dwd_fact_order_refund_info where dt=\'2020-03-10\';
1.1.9 评价事实表(事务型快照事实表)
把 ODS 层 ods_comment_info 表数据导入到 DWD 层评价事实表,在导入过程中可以做适当的清洗
1)建表语句
drop table if exists dwd_fact_comment_info; create external table dwd_fact_comment_info( `id` string COMMENT \'编号\', `user_id` string COMMENT \'用户 ID\', `sku_id` string COMMENT \'商品 sku\', `spu_id` string COMMENT \'商品 spu\', `order_id` string COMMENT \'订单 ID\', `appraise` string COMMENT \'评价\', `create_time` string COMMENT \'评价时间\' ) COMMENT \'评价事实表\' PARTITIONED BY (`dt` string) row format delimited fields terminated by \'\t\' location \'/warehouse/gmall/dwd/dwd_fact_comment_info/\';
2)数据装载
insert overwrite table dwd_fact_comment_info partition(dt=\'2020-03-10\') select id, user_id, sku_id, spu_id, order_id, appraise, create_time from ods_comment_info where dt=\'2020-03-10\';
1.1.10 加购事实表(周期型快照事实表,每日快照)
由于购物车的数量是会发生变化,所以导增量不合适
每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增
周期型快照事实表劣势:存储的数据量会比较大
解决方案:周期型快照事实表存储的数据比较讲究时效性,时间太久了的意义不大,可以删除以前的数据
1)建表语句
drop table if exists dwd_fact_cart_info; create external table dwd_fact_cart_info( `id` string COMMENT \'编号\', `user_id` string COMMENT \'用户 id\', `sku_id` string COMMENT \'skuid\', `cart_price` string COMMENT \'放入购物车时价格\', `sku_num` string COMMENT \'数量\', `sku_name` string COMMENT \'sku 名称 (冗余)\', `create_time` string COMMENT \'创建时间\', `operate_time` string COMMENT \'修改时间\', `is_ordered` string COMMENT \'是否已经下单。1 为已下单;0 为未下单\', `order_time` string COMMENT \'下单时间\' ) COMMENT \'加购事实表\' PARTITIONED BY (`dt` string) row format delimited fields terminated by \'\t\' location \'/warehouse/gmall/dwd/dwd_fact_cart_info/\';
2)数据装载
insert overwrite table dwd_fact_cart_info partition(dt=\'2020-03-10\') select id, user_id, sku_id, cart_price, sku_num, sku_name, create_time, operate_time, is_ordered, order_time from ods_cart_info where dt=\'2020-03-10\';
3)查询加载结果
select * from dwd_fact_cart_info where dt=\'2020-03-10\';
1.1.11 收藏事实表(周期型快照事实表,每日快照)
收藏的标记,是否取消,会发生变化,做增量不合适
每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增
1)建表语句
drop table if exists dwd_fact_favor_info; create external table dwd_fact_favor_info( `id` string COMMENT \'编号\', `user_id` string COMMENT \'用户 id\', `sku_id` string COMMENT \'skuid\', `spu_id` string COMMENT \'spuid\', `is_cancel` string COMMENT \'是否取消\', `create_time` string COMMENT \'收藏时间\', `cancel_time` string COMMENT \'取消时间\' ) COMMENT \'收藏事实表\' PARTITIONED BY (`dt` string) row format delimited fields terminated by \'\t\' location \'/warehouse/gmall/dwd/dwd_fact_favor_info/\';
2)数据装载
insert overwrite table dwd_fact_favor_info partition(dt=\'2020-03-10\') select id, user_id, sku_id, spu_id, is_cancel, create_time, cancel_time from ods_favor_info where dt=\'2020-03-10\';
3)查询加载结果
select * from dwd_fact_favor_info where dt=\'2020-03-10\';
1.1.12 优惠券领用事实表(累积型快照事实表)
优惠卷的生命周期:领取优惠卷-》用优惠卷下单-》优惠卷参与支付
累积型快照事实表使用:统计优惠卷领取次数、优惠卷下单次数、优惠卷参与支付次数
1)建表语句
drop table if exists dwd_fact_coupon_use; create external table dwd_fact_coupon_use( `id` string COMMENT \'编号\', `coupon_id` string COMMENT \'优惠券 ID\', `user_id` string COMMENT \'userid\', `order_id` string COMMENT \'订单 id\', `coupon_status` string COMMENT \'优惠券状态\', `get_time` string COMMENT \'领取时间\', `using_time` string COMMENT \'使用时间(下单)\', `used_time` string COMMENT \'使用时间(支付)\' ) COMMENT \'优惠券领用事实表\' PARTITIONED BY (`dt` string) row format delimited fields terminated by \'\t\' location \'/warehouse/gmall/dwd/dwd_fact_coupon_use/\';
注意:dt 是按照优惠卷领用时间 get_time 做为分区
2)数据装载
set hive.exec.dynamic.partition.mode=nonstrict; insert overwrite table dwd_fact_coupon_use partition(dt) select if(new.id is null,old.id,new.id), if(new.coupon_id is null,old.coupon_id,new.coupon_id), if(new.user_id is null,old.user_id,new.user_id), if(new.order_id is null,old.order_id,new.order_id), if(new.coupon_status is null,old.coupon_status,new.coupon_status), if(new.get_time is null,old.get_time,new.get_time), if(new.using_time is null,old.using_time,new.using_time), if(new.used_time is null,old.used_time,new.used_time), date_format(if(new.get_time is null,old.get_time,new.get_time),\'yyyy-MM-dd\') from ( select id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time from dwd_fact_coupon_use where dt in ( select date_format(get_time,\'yyyy-MM-dd\') from ods_coupon_use where dt=\'2020-03-10\' ) )old full outer join ( select id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time from ods_coupon_use where dt=\'2020-03-10\' )new on old.id=new.id;
1.1.13 订单事实表(累积型快照事实表)
1)concat 函数
concat 函数在连接字符串的时候,只要其中一个是 NULL,那么将返回 NULL
hive> select concat(\'a\',\'b\'); ab hive> select concat(\'a\',\'b\',null); NULL
2)concat_ws 函数
concat_ws 函数在连接字符串的时候,只要有一个字符串不是 NULL,就不会返回 NULL。concat_ws 函数需要指定分隔符
hive> select concat_ws(\'-\',\'a\',\'b\'); a-b hive> select concat_ws(\'-\',\'a\',\'b\',null); a-b hive> select concat_ws(\'\',\'a\',\'b\',null); ab
3)STR_TO_MAP 函数
- (1)语法描述
STR_TO_MAP(VARCHAR text, VARCHAR listDelimiter, VARCHAR keyValueDelimiter)
- (2)功能描述
使用 listDelimiter 将 text 分隔成 K-V 对,然后使用 keyValueDelimiter 分隔每个 K-V 对,
组装成 MAP 返回。默认 listDelimiter 为( ,),keyValueDelimiter 为(=)。
- (3)案例
str_to_map(‘1001=2020-03-10,1002=2020-03-10’, ‘,’ , ‘=’)
输出{“1001”:“2020-03-10”,“1002”:“2020-03-10”}
4)建表语句
订单生命周期:创建时间=》支付时间=》取消时间=》完成时间=》退款时间=》退款完成时间
由于 ODS 层订单表只有创建时间和操作时间两个状态,不能表达所有时间含义,所以需要关联订单状态表。订单事实表里面增加了活动 id,所以需要关联活动订单表
drop table if exists dwd_fact_order_info; create external table dwd_fact_order_info ( `id` string COMMENT \'订单编号\', `order_status` string COMMENT \'订单状态\', `user_id` string COMMENT \'用户 id\', `out_trade_no` string COMMENT \'支付流水号\', `create_time` string COMMENT \'创建时间(未支付状态)\', `payment_time` string COMMENT \'支付时间(已支付状态)\', `cancel_time` string COMMENT \'取消时间(已取消状态)\', `finish_time` string COMMENT \'完成时间(已完成状态)\', `refund_time` string COMMENT \'退款时间(退款中状态)\', `refund_finish_time` string COMMENT \'退款完成时间(退款完成状态)\', `province_id` string COMMENT \'省份 ID\', `activity_id` string COMMENT \'活动 ID\', `original_total_amount` string COMMENT \'原价金额\', `benefit_reduce_amount` string COMMENT \'优惠金额\', `feight_fee` string COMMENT \'运费\', `final_total_amount` decimal(10,2) COMMENT \'订单金额\' ) PARTITIONED BY (`dt` string) stored as parquet location \'/warehouse/gmall/dwd/dwd_fact_order_info/\' tblproperties ("parquet.compression"="lzo");
5)数据装载
5)常用函数
更多函数请点击博客【HIve】Hive入门解析(五)
6)数据装载
set hive.exec.dynamic.partition.mode=nonstrict; insert overwrite table dwd_fact_order_info partition(dt) select if(new.id is null,old.id,new.id), if(new.order_status is null,old.order_status,new.order_status), if(new.user_id is null,old.user_id,new.user_id), if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no), if(new.tms[\'1001\'] is null,old.create_time,new.tms[\'1001\']),--1001 对应未支付状态 if(new.tms[\'1002\'] is null,old.payment_time,new.tms[\'1002\']), if(new.tms[\'1003\'] is null,old.cancel_time,new.tms[\'1003\']), if(new.tms[\'1004\'] is null,old.finish_time,new.tms[\'1004\']), if(new.tms[\'1005\'] is null,old.refund_time,new.tms[\'1005\']), if(new.tms[\'1006\'] is null,old.refund_finish_time,new.tms[\'1006\']), if(new.province_id is null,old.province_id,new.province_id), if(new.activity_id is null,old.activity_id,new.activity_id), if(new.original_total_amount is null,old.original_total_amount,new.original_total_amount), if(new.benefit_reduce_amount is null,old.benefit_reduce_amount,new.benefit_reduce_amount), if(new.feight_fee is null,old.feight_fee,new.feight_fee), if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount), date_format(if(new.tms[\'1001\'] is null,old.create_time,new.tms[\'1001\']),\'yyyy-MM-dd\') from ( select id, order_status, user_id, out_trade_no, create_time, payment_time, cancel_time, finish_time, refund_time, refund_finish_time, province_id, activity_id, original_total_amount, benefit_reduce_amount, feight_fee, final_total_amount from dwd_fact_order_info where dt in ( select date_format(create_time,\'yyyy-MM-dd\') from ods_order_info where dt=\'2020-03-10\' ) )old full outer join ( select info.id, info.order_status, info.user_id, info.out_trade_no, info.province_id, act.activity_id, log.tms, info.original_total_amount, info.benefit_reduce_amount, info.feight_fee, info.final_total_amount from ( select order_id, str_to_map(concat_ws(\',\',collect_set(concat(order_status,\'=\',operate_time))),\',\',\'=\') tms from ods_order_status_log where dt=\'2020-03-10\' group by order_id )log join ( select * from ods_order_info where dt=\'2020-03-10\' )info on log.order_id=info.id left join ( select * from ods_activity_order where dt=\'2020-03-10\' )act on log.order_id=act.order_id )new on old.id=new.id;
1.1.14 用户维度表(拉链表)
用户表中的数据每日既有可能新增,也有可能修改,但修改频率并不高,属于缓慢变化
维度,此处采用拉链表存储用户维度数据
1)什么是拉链表
2)为什么要做拉链表
<ignore_js_op>
3)拉链表形成过程
4)拉链表制作过程图
5)拉链表制作过程
步骤 0:初始化拉链表(首次独立执行)
(1)建立拉链表
drop table if exists dwd_dim_user_info_his; create external table dwd_dim_user_info_his( `id` string COMMENT \'用户 id\', `name` string COMMENT \'姓名\', `birthday` string COMMENT \'生日\', `gender` string COMMENT \'性别\', `email` string COMMENT \'邮箱\', `user_level` string COMMENT \'用户等级\', `create_time` string COMMENT \'创建时间\', `operate_time` string COMMENT \'操作时间\', `start_date` string COMMENT \'有效开始日期\', `end_date` string COMMENT \'有效结束日期\' ) COMMENT \'订单拉链表\' stored as parquet location \'/warehouse/gmall/dwd/dwd_dim_user_info_his/\' tblproperties ("parquet.compression"="lzo");
(2)初始化拉链表
insert overwrite table dwd_dim_user_info_his select id, name, birthday, gender, email, user_level, create_time, operate_time, \'2020-03-10\', \'9999-99-99\' from ods_user_info oi where oi.dt=\'2020-03-10\';
步骤 1:制作当日变动数据(包括新增,修改)每日执行
(1)如何获得每日变动表
- a.最好表内有创建时间和变动时间(Lucky!)
- b.如果没有,可以利用第三方工具监控比如 canal,监控 MySQL 的实时变化进行记录(麻烦)
- c.逐行对比前后两天的数据,检查 md5(concat(全部有可能变化的字段))是否相同(low)
- d.要求业务数据库提供变动流水(人品,颜值)
(2)因为 ods_order_info 本身导入过来就是新增变动明细的表,所以不用处理
- a)数据库中新增 2020-03-11 一天的数据
- b)通过 Sqoop 把 2020-03-11 日所有数据导入mysqlTohdfs.sh all 2020-03-11
- c)ods 层数据导入hdfs_to_ods_db.sh all 2020-03-11
步骤 2:先合并变动信息,再追加新增信息,插入到临时表中
1)建立临时表
drop table if exists dwd_dim_user_info_his_tmp; create external table dwd_dim_user_info_his_tmp( `id` string COMMENT \'用户 id\', `name` string COMMENT \'姓名\', `birthday` string COMMENT \'生日\', `gender` string COMMENT \'性别\', `email` string COMMENT \'邮箱\', `user_level` string COMMENT \'用户等级\', `create_time` string COMMENT \'创建时间\', `operate_time` string COMMENT \'操作时间\', `start_date` string COMMENT \'有效开始日期\', `end_date` string COMMENT \'有效结束日期\' ) COMMENT \'订单拉链临时表\' stored as parquet location \'/warehouse/gmall/dwd/dwd_dim_user_info_his_tmp/\' tblproperties ("parquet.compression"="lzo");
2)导入脚本
insert overwrite table dwd_dim_user_info_his_tmp select * from ( select id, name, birthday, gender, email, user_level, create_time, operate_time, \'2020-03-11\' start_date, \'9999-99-99\' end_date from ods_user_info where dt=\'2020-03-11\' union all select uh.id, uh.name, uh.birthday, uh.gender, uh.email, uh.user_level, uh.create_time, uh.operate_time, uh.start_date, if(ui.id is not null and uh.end_date=\'9999-99-99\', date_add(ui.dt,-1), uh.end_date) end_date from dwd_dim_user_info_his uh left join ( select * from ods_user_info where dt=\'2020-03-11\' ) ui on uh.id=ui.id )his order by his.id, start_date;
1.1.15 DWD 层数据导入脚本
1)vim ods_to_dwd_db.sh
#!/bin/bash APP=gmall hive=/opt/modules/hive/bin/hive # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天 if [ -n "$2" ] ;then do_date=$2 else do_date=`date -d "-1 day" +%F` fi sql1=" set hive.exec.dynamic.partition.mode=nonstrict; insert overwrite table ${APP}.dwd_dim_sku_info partition(dt=\'$do_date\') select sku.id, sku.spu_id, sku.price, sku.sku_name, sku.sku_desc, sku.weight, sku.tm_id, ob.tm_name, sku.category3_id, c2.id category2_id, c1.id category1_id, c3.name category3_name, c2.name category2_name, c1.name category1_name, spu.spu_name, sku.create_time from ( select * from ${APP}.ods_sku_info where dt=\'$do_date\' )sku join ( select * from ${APP}.ods_base_trademark where dt=\'$do_date\' )ob on sku.tm_id=ob.tm_id join ( select * from ${APP}.ods_spu_info where dt=\'$do_date\' )spu on spu.id = sku.spu_id join ( select * from ${APP}.ods_base_category3 where dt=\'$do_date\' )c3 on sku.category3_id=c3.id join ( select * from ${APP}.ods_base_category2 where dt=\'$do_date\' )c2 on c3.category2_id=c2.id join ( select * from ${APP}.ods_base_category1 where dt=\'$do_date\' )c1 on c2.category1_id=c1.id; insert overwrite table ${APP}.dwd_dim_coupon_info partition(dt=\'$do_date\') select id, coupon_name, coupon_type, condition_amount, condition_num, activity_id, benefit_amount, benefit_discount, create_time, range_type, spu_id, tm_id, category3_id, limit_num, operate_time, expire_time from ${APP}.ods_coupon_info where dt=\'$do_date\'; insert overwrite table ${APP}.dwd_dim_activity_info partition(dt=\'$do_date\') select info.id, info.activity_name, info.activity_type, rule.condition_amount, rule.condition_num, rule.benefit_amount, rule.benefit_discount, rule.benefit_level, info.start_time, info.end_time, info.create_time from ( select * from ${APP}.ods_activity_info where dt=\'$do_date\' )info left join ( select * from ${APP}.ods_activity_rule where dt=\'$do_date\' )rule on info.id = rule.activity_id; insert overwrite table ${APP}.dwd_fact_order_detail partition(dt=\'$do_date\') select od.id, od.order_id, od.user_id, od.sku_id, od.sku_name, od.order_price, od.sku_num, od.create_time, oi.province_id, od.order_price*od.sku_num from ( select * from ${APP}.ods_order_detail where dt=\'$do_date\' ) od join ( select * from ${APP}.ods_order_info where dt=\'$do_date\' ) oi on od.order_id=oi.id; insert overwrite table ${APP}.dwd_fact_payment_info partition(dt=\'$do_date\') select pi.id, pi.out_trade_no, pi.order_id, pi.user_id, pi.alipay_trade_no, pi.total_amount, pi.subject, pi.payment_type, pi.payment_time, oi.province_id from ( select * from ${APP}.ods_payment_info where dt=\'$do_date\' )pi join ( select id, province_id from ${APP}.ods_order_info where dt=\'$do_date\' )oi on pi.order_id = oi.id; insert overwrite table ${APP}.dwd_fact_order_refund_info partition(dt=\'$do_date\') select id, user_id, order_id, sku_id, refund_type, refund_num, refund_amount, refund_reason_type, create_time from ${APP}.ods_order_refund_info where dt=\'$do_date\'; insert overwrite table ${APP}.dwd_fact_comment_info partition(dt=\'$do_date\') select id, user_id, sku_id, spu_id, order_id, appraise, create_time from ${APP}.ods_comment_info where dt=\'$do_date\'; insert overwrite table ${APP}.dwd_fact_cart_info partition(dt=\'$do_date\') select id, user_id, sku_id, cart_price, sku_num, sku_name, create_time, operate_time, is_ordered, order_time from ${APP}.ods_cart_info where dt=\'$do_date\'; insert overwrite table ${APP}.dwd_fact_favor_info partition(dt=\'$do_date\') select id, user_id, sku_id, spu_id, is_cancel, create_time, cancel_time from ${APP}.ods_favor_info where dt=\'$do_date\'; insert overwrite table ${APP}.dwd_fact_coupon_use partition(dt) select if(new.id is null,old.id,new.id), if(new.coupon_id is null,old.coupon_id,new.coupon_id), if(new.user_id is null,old.user_id,new.user_id), if(new.order_id is null,old.order_id,new.order_id), if(new.coupon_status is null,old.coupon_status,new.coupon_status), if(new.get_time is null,old.get_time,new.get_time), if(new.using_time is null,old.using_time,new.using_time), if(new.used_time is null,old.used_time,new.used_time), date_format(if(new.get_time is null,old.get_time,new.get_time),\'yyyy-MM-dd\') from ( select id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time from ${APP}.dwd_fact_coupon_use where dt in ( select date_format(get_time,\'yyyy-MM-dd\') from ${APP}.ods_coupon_use where dt=\'$do_date\' ) )old full outer join ( select id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time from ${APP}.ods_coupon_use where dt=\'$do_date\' )new on old.id=new.id; insert overwrite table ${APP}.dwd_fact_order_info partition(dt) select if(new.id is null,old.id,new.id), if(new.order_status is null,old.order_status,new.order_status), if(new.user_id is null,old.user_id,new.user_id), if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no), if(new.tms[\'1001\'] is null,old.create_time,new.tms[\'1001\']),--1001 对应未支付状态 if(new.tms[\'1002\'] is null,old.payment_time,new.tms[\'1002\']), if(new.tms[\'1003\'] is null,old.cancel_time,new.tms[\'1003\']), if(new.tms[\'1004\'] is null,old.finish_time,new.tms[\'1004\']), if(new.tms[\'1005\'] is null,old.refund_time,new.tms[\'1005\']), if(new.tms[\'1006\'] is null,old.refund_finish_time,new.tms[\'1006\']), if(new.province_id is null,old.province_id,new.province_id), if(new.activity_id is null,old.activity_id,new.activity_id), if(new.original_total_amount is null,old.original_total_amount,new.original_total_amount), if(new.benefit_reduce_amount is null,old.benefit_reduce_amount,new.benefit_reduce_amount), if(new.feight_fee is null,old.feight_fee,new.feight_fee), if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount), date_format(if(new.tms[\'1001\'] is null,old.create_time,new.tms[\'1001\']),\'yyyy-MM-dd\') from ( select id, order_status, user_id, out_trade_no, create_time, payment_time, cancel_time, finish_time, refund_time, refund_finish_time, province_id, activity_id, original_total_amount, benefit_reduce_amount, feight_fee, final_total_amount from ${APP}.dwd_fact_order_info where dt in ( select date_format(create_time,\'yyyy-MM-dd\') from ${APP}.ods_order_info where dt=\'$do_date\' ) )old full outer join ( select info.id, info.order_status, info.user_id, info.out_trade_no, info.province_id, act.activity_id, log.tms, info.original_total_amount, info.benefit_reduce_amount, info.feight_fee, info.final_total_amount from ( select order_id, str_to_map(concat_ws(\',\',collect_set(concat(order_status,\'=\',operate_time))),\',\',\' =\') tms from ${APP}.ods_order_status_log where dt=\'$do_date\' group by order_id )log join ( select * from ${APP}.ods_order_info where dt=\'$do_date\' )info on log.order_id=info.id left join ( select * from ${APP}.ods_activity_order where dt=\'$do_date\' )act on log.order_id=act.order_id )new on old.id=new.id; insert overwrite table ${APP}.dwd_dim_user_info_his_tmp select * from ( select id, name, birthday, gender, email, user_level, create_time, operate_time, \'$do_date\' start_date, \'9999-99-99\' end_date from ${APP}.ods_user_info where dt=\'$do_date\' union all select uh.id, uh.name, uh.birthday, uh.gender, uh.email, uh.user_level, uh.create_time, uh.operate_time, uh.start_date, if(ui.id is not null and uh.end_date=\'9999-99-99\', date_add(ui.dt,-1), uh.end_date) end_date from ${APP}.dwd_dim_user_info_his uh left join ( select * from ${APP}.ods_user_info where dt=\'$do_date\' ) ui on uh.id=ui.id )his order by his.id, start_date; insert overwrite table ${APP}.dwd_dim_user_info_his select * from ${APP}.dwd_dim_user_info_his_tmp; " sql2=" insert overwrite table ${APP}.dwd_dim_base_province select bp.id, bp.name, bp.area_code, bp.iso_code, bp.region_id, br.region_name from ${APP}.ods_base_province bp join ${APP}.ods_base_region br on bp.region_id=br.id; " case $1 in "first"){ $hive -e "$sql1" $hive -e "$sql2" };; "all"){ $hive -e "$sql1" };; esac
2)增加脚本执行权限
chmod 770 ods_to_dwd_db.sh
3)执行脚本导入数据
ods_to_dwd_db.sh all 2020-03-11
4)查看导入数据
select * from dwd_fact_order_info where dt=\'2020-03-11\'; select * from dwd_fact_order_detail where dt=\'2020-03-11\'; select * from dwd_fact_comment_info where dt=\'2020-03-11\'; select * from dwd_fact_order_refund_info where dt=\'2020-03-11\';