huanghanyu

1.1 DWD 层(业务数据)



1.1.1 商品维度表(全量表)



1)建表语句

DROP TABLE IF EXISTS `dwd_dim_sku_info`;
CREATE EXTERNAL TABLE `dwd_dim_sku_info` (
`id` string COMMENT \'商品 id\',
`spu_id` string COMMENT \'spuid\',
`price` double COMMENT \'商品价格\',
`sku_name` string COMMENT \'商品名称\',
`sku_desc` string COMMENT \'商品描述\',
`weight` double COMMENT \'重量\',
`tm_id` string COMMENT \'品牌 id\',
`tm_name` string COMMENT \'品牌名称\',
`category3_id` string COMMENT \'三级分类 id\',
`category2_id` string COMMENT \'二级分类 id\',
`category1_id` string COMMENT \'一级分类 id\',
`category3_name` string COMMENT \'三级分类名称\',
`category2_name` string COMMENT \'二级分类名称\',
`category1_name` string COMMENT \'一级分类名称\',
`spu_name` string COMMENT \'spu 名称\',
`create_time` string COMMENT \'创建时间\'
)
COMMENT \'商品维度表\'
PARTITIONED BY (`dt` string)
stored as parquet
location \'/warehouse/gmall/dwd/dwd_dim_sku_info/\'
tblproperties ("parquet.compression"="lzo");

2)数据装载

insert overwrite table dwd_dim_sku_info partition(dt=\'2020-03-10\')
select
sku.id,
sku.spu_id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.tm_id,
ob.tm_name,
sku.category3_id,
c2.id category2_id,
c1.id category1_id,
c3.name category3_name,
c2.name category2_name,
c1.name category1_name,
spu.spu_name,
sku.create_time
from
(
select * from ods_sku_info where dt=\'2020-03-10\'
)sku
join
(
select * from ods_base_trademark where dt=\'2020-03-10\'
)ob on sku.tm_id=ob.tm_id
join
(
select * from ods_spu_info where dt=\'2020-03-10\'
)spu on spu.id = sku.spu_id
join
(
select * from ods_base_category3 where dt=\'2020-03-10\'
)c3 on sku.category3_id=c3.id
join
(
select * from ods_base_category2 where dt=\'2020-03-10\'
)c2 on c3.category2_id=c2.id
join
(
select * from ods_base_category1 where dt=\'2020-03-10\'
)c1 on c2.category1_id=c1.id;

1.1.2 优惠券信息表(全量)

把 ODS 层 ods_coupon_info 表数据导入到 DWD 层优惠卷信息表,在导入过程中可以做适当的清洗

1)建表语句

drop table if exists dwd_dim_coupon_info;
create external table dwd_dim_coupon_info(
`id` string COMMENT \'购物券编号\',
`coupon_name` string COMMENT \'购物券名称\',
`coupon_type` string COMMENT \'购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券\',
`condition_amount` string COMMENT \'满额数\',
`condition_num` string COMMENT \'满件数\',
`activity_id` string COMMENT \'活动编号\',
`benefit_amount` string COMMENT \'减金额\',
`benefit_discount` string COMMENT \'折扣\',
`create_time` string COMMENT \'创建时间\',
`range_type` string COMMENT \'范围类型 1、商品 2、品类 3、品牌\',
`spu_id` string COMMENT \'商品 id\',
`tm_id` string COMMENT \'品牌 id\',
`category3_id` string COMMENT \'品类 id\',
`limit_num` string COMMENT \'最多领用次数\',
`operate_time` string COMMENT \'修改时间\',
`expire_time` string COMMENT \'过期时间\'
) COMMENT \'优惠券信息表\'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by \'\t\'
stored as parquet
location \'/warehouse/gmall/dwd/dwd_dim_coupon_info/\'
tblproperties ("parquet.compression"="lzo"); 

2)数据装载

insert overwrite table dwd_dim_coupon_info partition(dt=\'2020-03-10\')
select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
spu_id,
tm_id,
category3_id,
limit_num,
operate_time,
expire_time
from ods_coupon_info
where dt=\'2020-03-10\';

3)查询加载结果

select * from dwd_dim_coupon_info where dt=\'2020-03-10\';

1.1.3 活动维度表(全量)



1)建表语句

drop table if exists dwd_dim_activity_info;
create external table dwd_dim_activity_info(
`id` string COMMENT \'编号\',
`activity_name` string COMMENT \'活动名称\',
`activity_type` string COMMENT \'活动类型\',
`condition_amount` string COMMENT \'满减金额\',
`condition_num` string COMMENT \'满减件数\',
`benefit_amount` string COMMENT \'优惠金额\',
`benefit_discount` string COMMENT \'优惠折扣\',
`benefit_level` string COMMENT \'优惠级别\',
`start_time` string COMMENT \'开始时间\',
`end_time` string COMMENT \'结束时间\',
`create_time` string COMMENT \'创建时间\'
) COMMENT \'活动信息表\'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by \'\t\'
stored as parquet
location \'/warehouse/gmall/dwd/dwd_dim_activity_info/\'
tblproperties ("parquet.compression"="lzo"); 

2)数据装载

insert overwrite table dwd_dim_activity_info partition(dt=\'2020-03-10\')
select
info.id,
info.activity_name,
info.activity_type,
rule.condition_amount,
rule.condition_num,
rule.benefit_amount,
rule.benefit_discount,
rule.benefit_level,
info.start_time,
info.end_time,
info.create_time
from
(
select * from ods_activity_info where dt=\'2020-03-10\'
)info
left join
(
select * from ods_activity_rule where dt=\'2020-03-10\'
)rule on info.id = rule.activity_id;

3)查询加载结果

select * from dwd_dim_activity_info where dt=\'2020-03-10\';

 1.1.4 地区维度表(特殊)



1)建表语句

DROP TABLE IF EXISTS `dwd_dim_base_province`;
CREATE EXTERNAL TABLE `dwd_dim_base_province` (
`id` string COMMENT \'id\',
`province_name` string COMMENT \'省市名称\',
`area_code` string COMMENT \'地区编码\',
`iso_code` string COMMENT \'ISO 编码\',
`region_id` string COMMENT \'地区 id\',
`region_name` string COMMENT \'地区名称\'
)
COMMENT \'地区省市表\'
stored as parquet
location \'/warehouse/gmall/dwd/dwd_dim_base_province/\'
tblproperties ("parquet.compression"="lzo"); 

2)数据装载

insert overwrite table dwd_dim_base_province
select
bp.id,
bp.name,
bp.area_code,
bp.iso_code,
bp.region_id,
br.region_name
from ods_base_province bp
join ods_base_region br
on bp.region_id=br.id;

1.1.5 时间维度表(特殊)(预留)

1)建表语句

DROP TABLE IF EXISTS `dwd_dim_date_info`;
CREATE EXTERNAL TABLE `dwd_dim_date_info`(
`date_id` string COMMENT \'\',
`week_id` int COMMENT \'\',
`week_day` int COMMENT \'周的第几天\',
`day` int COMMENT \'每月的第几天\',
`month` int COMMENT \'第几月\',
`quarter` int COMMENT \'第几季度\',
`year` int COMMENT \'\',
`is_workday` int COMMENT \'是否是周末\',
`holiday_id` int COMMENT \'是否是节假日\'
)
row format delimited fields terminated by \'\t\'
stored as parquet
location \'/warehouse/gmall/dwd/dwd_dim_date_info/\'
tblproperties ("parquet.compression"="lzo"); 

2)把 date_info.txt 文件上传到 node01 的 /opt/modules/db_log/路径

3)数据装载

load data local inpath \'/opt/modules/db_log/date_info.txt\' into table dwd_dim_date_info; 

4)查询加载结果

select * from dwd_dim_date_info;

1.1.6 订单明细事实表(事务型快照事实表)





1)建表语句

drop table if exists dwd_fact_order_detail;
create external table dwd_fact_order_detail (
`id` string COMMENT \'订单编号\',
`order_id` string COMMENT \'订单号\',
`user_id` string COMMENT \'用户 id\',
`sku_id` string COMMENT \'sku 商品 id\',
`sku_name` string COMMENT \'商品名称\',
`order_price` decimal(10,2) COMMENT \'商品价格\',
`sku_num` bigint COMMENT \'商品数量\',
`create_time` string COMMENT \'创建时间\',
`province_id` string COMMENT \'省份 ID\',
`total_amount` decimal(20,2) COMMENT \'订单总金额\'
)
PARTITIONED BY (`dt` string)
stored as parquet
location \'/warehouse/gmall/dwd/dwd_fact_order_detail/\'
tblproperties ("parquet.compression"="lzo"); 

2)数据装载

insert overwrite table dwd_fact_order_detail partition(dt=\'2020-03-10\')
select
od.id,
od.order_id,
od.user_id,
od.sku_id,
od.sku_name,
od.order_price,
od.sku_num,
od.create_time,
oi.province_id,
od.order_price*od.sku_num
from
(
select * from ods_order_detail where dt=\'2020-03-10\'
) od
join
(
select * from ods_order_info where dt=\'2020-03-10\'
) oi
on od.order_id=oi.id;

3)查询加载结果

select * from dwd_fact_order_detail where dt=\'2020-03-10\';

1.1.7 支付事实表(事务型快照事实表)




1)建表语句

drop table if exists dwd_fact_payment_info;
create external table dwd_fact_payment_info (
`id` string COMMENT \'\',
`out_trade_no` string COMMENT \'对外业务编号\',
`order_id` string COMMENT \'订单编号\',
`user_id` string COMMENT \'用户编号\',
`alipay_trade_no` string COMMENT \'支付宝交易流水编号\',
`payment_amount` decimal(16,2) COMMENT \'支付金额\',
`subject` string COMMENT \'交易内容\',
`payment_type` string COMMENT \'支付类型\',
`payment_time` string COMMENT \'支付时间\',
`province_id` string COMMENT \'省份 ID\'
)
PARTITIONED BY (`dt` string)
stored as parquet
location \'/warehouse/gmall/dwd/dwd_fact_payment_info/\'
tblproperties ("parquet.compression"="lzo");

2)数据装载

insert overwrite table dwd_fact_payment_info partition(dt=\'2020-03-10\')
select
pi.id,
pi.out_trade_no,
pi.order_id,
pi.user_id,
pi.alipay_trade_no,
pi.total_amount,
pi.subject,
pi.payment_type,
pi.payment_time,
oi.province_id
from
(
select * from ods_payment_info where dt=\'2020-03-10\'
)pi
join
(
select id, province_id from ods_order_info where dt=\'2020-03-10\'
)oi
on pi.order_id = oi.id;

3)查询加载结果

select * from dwd_fact_payment_info where dt=\'2020-03-10\';

1.1.8 退款事实表(事务型快照事实表)

把 ODS 层 ods_order_refund_info 表数据导入到 DWD 层退款事实表,在导入过程中可以做适当的清洗



1)建表语句

drop table if exists dwd_fact_order_refund_info;
create external table dwd_fact_order_refund_info(
`id` string COMMENT \'编号\',
`user_id` string COMMENT \'用户 ID\',
`order_id` string COMMENT \'订单 ID\',
`sku_id` string COMMENT \'商品 ID\',
`refund_type` string COMMENT \'退款类型\',
`refund_num` bigint COMMENT \'退款件数\',
`refund_amount` decimal(16,2) COMMENT \'退款金额\',
`refund_reason_type` string COMMENT \'退款原因类型\',
`create_time` string COMMENT \'退款时间\'
) COMMENT \'退款事实表\'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by \'\t\'
location \'/warehouse/gmall/dwd/dwd_fact_order_refund_info/\'; 

2)数据装载

insert overwrite table dwd_fact_order_refund_info partition(dt=\'2020-03-10\')
select
id,
user_id,
order_id,
sku_id,
refund_type,
refund_num,
refund_amount,
refund_reason_type,
create_time
from ods_order_refund_info
where dt=\'2020-03-10\';

3)查询加载结果

select * from dwd_fact_order_refund_info where dt=\'2020-03-10\';

1.1.9 评价事实表(事务型快照事实表)

把 ODS 层 ods_comment_info 表数据导入到 DWD 层评价事实表,在导入过程中可以做适当的清洗



1)建表语句

drop table if exists dwd_fact_comment_info;
create external table dwd_fact_comment_info(
`id` string COMMENT \'编号\',
`user_id` string COMMENT \'用户 ID\',
`sku_id` string COMMENT \'商品 sku\',
`spu_id` string COMMENT \'商品 spu\',
`order_id` string COMMENT \'订单 ID\',
`appraise` string COMMENT \'评价\',
`create_time` string COMMENT \'评价时间\'
) COMMENT \'评价事实表\'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by \'\t\'
location \'/warehouse/gmall/dwd/dwd_fact_comment_info/\'; 

2)数据装载

insert overwrite table dwd_fact_comment_info partition(dt=\'2020-03-10\')
select
id,
user_id,
sku_id,
spu_id,
order_id,
appraise,
create_time
from ods_comment_info
where dt=\'2020-03-10\';

1.1.10 加购事实表(周期型快照事实表,每日快照)

由于购物车的数量是会发生变化,所以导增量不合适
每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增
周期型快照事实表劣势:存储的数据量会比较大
解决方案:周期型快照事实表存储的数据比较讲究时效性,时间太久了的意义不大,可以删除以前的数据



1)建表语句

drop table if exists dwd_fact_cart_info;
create external table dwd_fact_cart_info(
`id` string COMMENT \'编号\',
`user_id` string COMMENT \'用户 id\',
`sku_id` string COMMENT \'skuid\',
`cart_price` string COMMENT \'放入购物车时价格\',
`sku_num` string COMMENT \'数量\',
`sku_name` string COMMENT \'sku 名称 (冗余)\',
`create_time` string COMMENT \'创建时间\',
`operate_time` string COMMENT \'修改时间\',
`is_ordered` string COMMENT \'是否已经下单。1 为已下单;0 为未下单\',
`order_time` string COMMENT \'下单时间\'
) COMMENT \'加购事实表\'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by \'\t\'
location \'/warehouse/gmall/dwd/dwd_fact_cart_info/\'; 

2)数据装载

insert overwrite table dwd_fact_cart_info partition(dt=\'2020-03-10\')
select
id,
user_id,
sku_id,
cart_price,
sku_num,
sku_name,
create_time,
operate_time,
is_ordered,
order_time
from ods_cart_info
where dt=\'2020-03-10\';

3)查询加载结果

select * from dwd_fact_cart_info where dt=\'2020-03-10\';

1.1.11 收藏事实表(周期型快照事实表,每日快照)

收藏的标记,是否取消,会发生变化,做增量不合适
每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增


1)建表语句

drop table if exists dwd_fact_favor_info;
create external table dwd_fact_favor_info(
`id` string COMMENT \'编号\',
`user_id` string COMMENT \'用户 id\',
`sku_id` string COMMENT \'skuid\',
`spu_id` string COMMENT \'spuid\',
`is_cancel` string COMMENT \'是否取消\',
`create_time` string COMMENT \'收藏时间\',
`cancel_time` string COMMENT \'取消时间\'
) COMMENT \'收藏事实表\'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by \'\t\'
location \'/warehouse/gmall/dwd/dwd_fact_favor_info/\';

2)数据装载

insert overwrite table dwd_fact_favor_info partition(dt=\'2020-03-10\')
select
id,
user_id,
sku_id,
spu_id,
is_cancel,
create_time,
cancel_time
from ods_favor_info
where dt=\'2020-03-10\';

3)查询加载结果

select * from dwd_fact_favor_info where dt=\'2020-03-10\';

1.1.12 优惠券领用事实表(累积型快照事实表)



优惠卷的生命周期:领取优惠卷-》用优惠卷下单-》优惠卷参与支付

累积型快照事实表使用:统计优惠卷领取次数、优惠卷下单次数、优惠卷参与支付次数
1)建表语句

drop table if exists dwd_fact_coupon_use;
create external table dwd_fact_coupon_use(
`id` string COMMENT \'编号\',
`coupon_id` string COMMENT \'优惠券 ID\',
`user_id` string COMMENT \'userid\',
`order_id` string COMMENT \'订单 id\',
`coupon_status` string COMMENT \'优惠券状态\',
`get_time` string COMMENT \'领取时间\',
`using_time` string COMMENT \'使用时间(下单)\',
`used_time` string COMMENT \'使用时间(支付)\'
) COMMENT \'优惠券领用事实表\'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by \'\t\'
location \'/warehouse/gmall/dwd/dwd_fact_coupon_use/\'; 

注意:dt 是按照优惠卷领用时间 get_time 做为分区

2)数据装载

set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_fact_coupon_use partition(dt)
select
if(new.id is null,old.id,new.id),
if(new.coupon_id is null,old.coupon_id,new.coupon_id),
if(new.user_id is null,old.user_id,new.user_id),
if(new.order_id is null,old.order_id,new.order_id),
if(new.coupon_status is null,old.coupon_status,new.coupon_status),
if(new.get_time is null,old.get_time,new.get_time),
if(new.using_time is null,old.using_time,new.using_time),
if(new.used_time is null,old.used_time,new.used_time),
date_format(if(new.get_time is null,old.get_time,new.get_time),\'yyyy-MM-dd\')
from
(
select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from dwd_fact_coupon_use
where dt in
(
select
date_format(get_time,\'yyyy-MM-dd\')
from ods_coupon_use
where dt=\'2020-03-10\'
)
)old
full outer join
(
select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from ods_coupon_use
where dt=\'2020-03-10\'
)new
on old.id=new.id;

1.1.13 订单事实表(累积型快照事实表)

1)concat 函数

concat 函数在连接字符串的时候,只要其中一个是 NULL,那么将返回 NULL

hive> select concat(\'a\',\'b\');
ab
hive> select concat(\'a\',\'b\',null);
NULL

2)concat_ws 函数

concat_ws 函数在连接字符串的时候,只要有一个字符串不是 NULL,就不会返回 NULL。concat_ws 函数需要指定分隔符

hive> select concat_ws(\'-\',\'a\',\'b\');
a-b
hive> select concat_ws(\'-\',\'a\',\'b\',null);
a-b
hive> select concat_ws(\'\',\'a\',\'b\',null);
ab

3)STR_TO_MAP 函数

  • (1)语法描述


STR_TO_MAP(VARCHAR text, VARCHAR listDelimiter, VARCHAR keyValueDelimiter)

  • (2)功能描述


使用 listDelimiter 将 text 分隔成 K-V 对,然后使用 keyValueDelimiter 分隔每个 K-V 对,
组装成 MAP 返回。默认 listDelimiter 为( ,),keyValueDelimiter 为(=)。

  • (3)案例


str_to_map(‘1001=2020-03-10,1002=2020-03-10’, ‘,’ , ‘=’)
输出{“1001”:“2020-03-10”,“1002”:“2020-03-10”}

4)建表语句



订单生命周期:创建时间=》支付时间=》取消时间=》完成时间=》退款时间=》退款完成时间

由于 ODS 层订单表只有创建时间和操作时间两个状态,不能表达所有时间含义,所以需要关联订单状态表。订单事实表里面增加了活动 id,所以需要关联活动订单表

drop table if exists dwd_fact_order_info;
create external table dwd_fact_order_info (
`id` string COMMENT \'订单编号\',
`order_status` string COMMENT \'订单状态\',
`user_id` string COMMENT \'用户 id\',
`out_trade_no` string COMMENT \'支付流水号\',
`create_time` string COMMENT \'创建时间(未支付状态)\',
`payment_time` string COMMENT \'支付时间(已支付状态)\',
`cancel_time` string COMMENT \'取消时间(已取消状态)\',
`finish_time` string COMMENT \'完成时间(已完成状态)\',
`refund_time` string COMMENT \'退款时间(退款中状态)\',
`refund_finish_time` string COMMENT \'退款完成时间(退款完成状态)\',
`province_id` string COMMENT \'省份 ID\',
`activity_id` string COMMENT \'活动 ID\',
`original_total_amount` string COMMENT \'原价金额\',
`benefit_reduce_amount` string COMMENT \'优惠金额\',
`feight_fee` string COMMENT \'运费\',
`final_total_amount` decimal(10,2) COMMENT \'订单金额\'
)
PARTITIONED BY (`dt` string)
stored as parquet
location \'/warehouse/gmall/dwd/dwd_fact_order_info/\'
tblproperties ("parquet.compression"="lzo");

5)数据装载



5)常用函数

更多函数请点击博客【HIve】Hive入门解析(五)

6)数据装载

set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_fact_order_info partition(dt)
select
if(new.id is null,old.id,new.id),
if(new.order_status is null,old.order_status,new.order_status),
if(new.user_id is null,old.user_id,new.user_id),
if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),
if(new.tms[\'1001\'] is null,old.create_time,new.tms[\'1001\']),--1001 对应未支付状态
if(new.tms[\'1002\'] is null,old.payment_time,new.tms[\'1002\']),
if(new.tms[\'1003\'] is null,old.cancel_time,new.tms[\'1003\']),
if(new.tms[\'1004\'] is null,old.finish_time,new.tms[\'1004\']),
if(new.tms[\'1005\'] is null,old.refund_time,new.tms[\'1005\']),
if(new.tms[\'1006\'] is null,old.refund_finish_time,new.tms[\'1006\']),
if(new.province_id is null,old.province_id,new.province_id),
if(new.activity_id is null,old.activity_id,new.activity_id),
if(new.original_total_amount is
null,old.original_total_amount,new.original_total_amount),
if(new.benefit_reduce_amount is
null,old.benefit_reduce_amount,new.benefit_reduce_amount),
if(new.feight_fee is null,old.feight_fee,new.feight_fee),
if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount),
date_format(if(new.tms[\'1001\'] is
null,old.create_time,new.tms[\'1001\']),\'yyyy-MM-dd\')
from
(
select
id,
order_status,
user_id,
out_trade_no,
create_time,
payment_time,
cancel_time,
finish_time,
refund_time,
refund_finish_time,
province_id,
activity_id,
original_total_amount,
benefit_reduce_amount,
feight_fee,
final_total_amount
from dwd_fact_order_info
where dt
in
(
select
date_format(create_time,\'yyyy-MM-dd\')
from ods_order_info
where dt=\'2020-03-10\'
)
)old
full outer join
(
select
info.id,
info.order_status,
info.user_id,
info.out_trade_no,
info.province_id,
act.activity_id,
log.tms,
info.original_total_amount,
info.benefit_reduce_amount,
info.feight_fee,
info.final_total_amount
from
(
select
order_id,
str_to_map(concat_ws(\',\',collect_set(concat(order_status,\'=\',operate_time))),\',\',\'=\')
tms
from ods_order_status_log
where dt=\'2020-03-10\'
group by order_id
)log
join
(
select * from ods_order_info where dt=\'2020-03-10\'
)info
on log.order_id=info.id
left join
(
select * from ods_activity_order where dt=\'2020-03-10\'
)act
on log.order_id=act.order_id
)new
on old.id=new.id;

1.1.14 用户维度表(拉链表)

用户表中的数据每日既有可能新增,也有可能修改,但修改频率并不高,属于缓慢变化
维度,此处采用拉链表存储用户维度数据

1)什么是拉链表



2)为什么要做拉链表

<ignore_js_op>


3)拉链表形成过程



4)拉链表制作过程图



5)拉链表制作过程

步骤 0:初始化拉链表(首次独立执行)

(1)建立拉链表

drop table if exists dwd_dim_user_info_his;
create external table dwd_dim_user_info_his(
`id` string COMMENT \'用户 id\',
`name` string COMMENT \'姓名\',
`birthday` string COMMENT \'生日\',
`gender` string COMMENT \'性别\',
`email` string COMMENT \'邮箱\',
`user_level` string COMMENT \'用户等级\',
`create_time` string COMMENT \'创建时间\',
`operate_time` string COMMENT \'操作时间\',
`start_date` string COMMENT \'有效开始日期\',
`end_date` string COMMENT \'有效结束日期\'
) COMMENT \'订单拉链表\'
stored as parquet
location \'/warehouse/gmall/dwd/dwd_dim_user_info_his/\'
tblproperties ("parquet.compression"="lzo");

(2)初始化拉链表

insert overwrite table dwd_dim_user_info_his
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
\'2020-03-10\',
\'9999-99-99\'
from ods_user_info oi
where oi.dt=\'2020-03-10\';

步骤 1:制作当日变动数据(包括新增,修改)每日执行

(1)如何获得每日变动表

  • a.最好表内有创建时间和变动时间(Lucky!)
  • b.如果没有,可以利用第三方工具监控比如 canal,监控 MySQL 的实时变化进行记录(麻烦)
  • c.逐行对比前后两天的数据,检查 md5(concat(全部有可能变化的字段))是否相同(low)
  • d.要求业务数据库提供变动流水(人品,颜值)


(2)因为 ods_order_info 本身导入过来就是新增变动明细的表,所以不用处理

  • a)数据库中新增 2020-03-11 一天的数据
  • b)通过 Sqoop 把 2020-03-11 日所有数据导入mysqlTohdfs.sh all 2020-03-11
  • c)ods 层数据导入hdfs_to_ods_db.sh all 2020-03-11


步骤 2:先合并变动信息,再追加新增信息,插入到临时表中

1)建立临时表

drop table if exists dwd_dim_user_info_his_tmp;
create external table dwd_dim_user_info_his_tmp(
`id` string COMMENT \'用户 id\',
`name` string COMMENT \'姓名\',
`birthday` string COMMENT \'生日\',
`gender` string COMMENT \'性别\',
`email` string COMMENT \'邮箱\',
`user_level` string COMMENT \'用户等级\',
`create_time` string COMMENT \'创建时间\',
`operate_time` string COMMENT \'操作时间\',
`start_date` string COMMENT \'有效开始日期\',
`end_date` string COMMENT \'有效结束日期\'
) COMMENT \'订单拉链临时表\'
stored as parquet
location \'/warehouse/gmall/dwd/dwd_dim_user_info_his_tmp/\'
tblproperties ("parquet.compression"="lzo");

2)导入脚本

insert overwrite table dwd_dim_user_info_his_tmp
select * from
(
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
\'2020-03-11\' start_date,
\'9999-99-99\' end_date
from ods_user_info where dt=\'2020-03-11\'
union all
select
uh.id,
uh.name,
uh.birthday,
uh.gender,
uh.email,
uh.user_level,
uh.create_time,
uh.operate_time,
uh.start_date,
if(ui.id is not null and uh.end_date=\'9999-99-99\', date_add(ui.dt,-1),
uh.end_date) end_date
from dwd_dim_user_info_his uh left join
(
select
*
from ods_user_info
where dt=\'2020-03-11\'
) ui on uh.id=ui.id
)his
order by his.id, start_date;

1.1.15 DWD 层数据导入脚本

1)vim ods_to_dwd_db.sh

#!/bin/bash
APP=gmall
hive=/opt/modules/hive/bin/hive
# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$2" ] ;then
do_date=$2
else
do_date=`date -d "-1 day" +%F`
fi
sql1="
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table ${APP}.dwd_dim_sku_info partition(dt=\'$do_date\')
select
sku.id,
sku.spu_id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.tm_id,
ob.tm_name,
sku.category3_id,
c2.id category2_id,
c1.id category1_id,
c3.name category3_name,
c2.name category2_name,
c1.name category1_name,
spu.spu_name,
sku.create_time
from
(
select * from ${APP}.ods_sku_info where dt=\'$do_date\'
)sku
join
(
select * from ${APP}.ods_base_trademark where dt=\'$do_date\'
)ob on sku.tm_id=ob.tm_id
join
(
select * from ${APP}.ods_spu_info where dt=\'$do_date\'
)spu on spu.id = sku.spu_id
join
(
select * from ${APP}.ods_base_category3 where dt=\'$do_date\'
)c3 on sku.category3_id=c3.id
join
(
select * from ${APP}.ods_base_category2 where dt=\'$do_date\'
)c2 on c3.category2_id=c2.id
join
(
select * from ${APP}.ods_base_category1 where dt=\'$do_date\'
)c1 on c2.category1_id=c1.id;


insert overwrite table ${APP}.dwd_dim_coupon_info partition(dt=\'$do_date\')
select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
spu_id,
tm_id,
category3_id,
limit_num,
operate_time,
expire_time
from ${APP}.ods_coupon_info
where dt=\'$do_date\';


insert overwrite table ${APP}.dwd_dim_activity_info partition(dt=\'$do_date\')
select
info.id,
info.activity_name,
info.activity_type,
rule.condition_amount,
rule.condition_num,
rule.benefit_amount,
rule.benefit_discount,
rule.benefit_level,
info.start_time,
info.end_time,
info.create_time
from
(
select * from ${APP}.ods_activity_info where dt=\'$do_date\'
)info
left join
(
select * from ${APP}.ods_activity_rule where dt=\'$do_date\'
)rule on info.id = rule.activity_id;


insert overwrite table ${APP}.dwd_fact_order_detail partition(dt=\'$do_date\')
select
od.id,
od.order_id,
od.user_id,
od.sku_id,
od.sku_name,
od.order_price,
od.sku_num,
od.create_time,
oi.province_id,
od.order_price*od.sku_num
from
(
select * from ${APP}.ods_order_detail where dt=\'$do_date\'
) od
join
(
select * from ${APP}.ods_order_info where dt=\'$do_date\'
) oi
on od.order_id=oi.id;


insert overwrite table ${APP}.dwd_fact_payment_info partition(dt=\'$do_date\')
select
pi.id,
pi.out_trade_no,
pi.order_id,
pi.user_id,
pi.alipay_trade_no,
pi.total_amount,
pi.subject,
pi.payment_type,
pi.payment_time,
oi.province_id
from
(
select * from ${APP}.ods_payment_info where dt=\'$do_date\'
)pi
join
(
select id, province_id from ${APP}.ods_order_info where dt=\'$do_date\'
)oi
on pi.order_id = oi.id;


insert overwrite table ${APP}.dwd_fact_order_refund_info partition(dt=\'$do_date\')
select
id,
user_id,
order_id,
sku_id,
refund_type,
refund_num,
refund_amount,
refund_reason_type,
create_time
from ${APP}.ods_order_refund_info
where dt=\'$do_date\';


insert overwrite table ${APP}.dwd_fact_comment_info partition(dt=\'$do_date\')
select
id,
user_id,
sku_id,
spu_id,
order_id,
appraise,
create_time
from ${APP}.ods_comment_info
where dt=\'$do_date\';


insert overwrite table ${APP}.dwd_fact_cart_info partition(dt=\'$do_date\')
select
id,
user_id,
sku_id,
cart_price,
sku_num,
sku_name,
create_time,
operate_time,
is_ordered,
order_time
from ${APP}.ods_cart_info
where dt=\'$do_date\';


insert overwrite table ${APP}.dwd_fact_favor_info partition(dt=\'$do_date\')
select
id,
user_id,
sku_id,
spu_id,
is_cancel,
create_time,
cancel_time
from ${APP}.ods_favor_info
where dt=\'$do_date\';


insert overwrite table ${APP}.dwd_fact_coupon_use partition(dt)
select
if(new.id is null,old.id,new.id),
if(new.coupon_id is null,old.coupon_id,new.coupon_id),
if(new.user_id is null,old.user_id,new.user_id),
if(new.order_id is null,old.order_id,new.order_id),
if(new.coupon_status is null,old.coupon_status,new.coupon_status),
if(new.get_time is null,old.get_time,new.get_time),
if(new.using_time is null,old.using_time,new.using_time),
if(new.used_time is null,old.used_time,new.used_time),
date_format(if(new.get_time is null,old.get_time,new.get_time),\'yyyy-MM-dd\')
from
(
select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from ${APP}.dwd_fact_coupon_use
where dt in
(
select
date_format(get_time,\'yyyy-MM-dd\')
from ${APP}.ods_coupon_use
where dt=\'$do_date\'
)
)old
full outer join
(
select
id,
coupon_id,
user_id,
order_id,
coupon_status,
get_time,
using_time,
used_time
from ${APP}.ods_coupon_use
where dt=\'$do_date\'
)new
on old.id=new.id;


insert overwrite table ${APP}.dwd_fact_order_info partition(dt)
select
if(new.id is null,old.id,new.id),
if(new.order_status is null,old.order_status,new.order_status),
if(new.user_id is null,old.user_id,new.user_id),
if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),
if(new.tms[\'1001\'] is null,old.create_time,new.tms[\'1001\']),--1001 对应未支付状态
if(new.tms[\'1002\'] is null,old.payment_time,new.tms[\'1002\']),
if(new.tms[\'1003\'] is null,old.cancel_time,new.tms[\'1003\']),
if(new.tms[\'1004\'] is null,old.finish_time,new.tms[\'1004\']),
if(new.tms[\'1005\'] is null,old.refund_time,new.tms[\'1005\']),
if(new.tms[\'1006\'] is null,old.refund_finish_time,new.tms[\'1006\']),
if(new.province_id is null,old.province_id,new.province_id),
if(new.activity_id is null,old.activity_id,new.activity_id),
if(new.original_total_amount is
null,old.original_total_amount,new.original_total_amount),
if(new.benefit_reduce_amount is
null,old.benefit_reduce_amount,new.benefit_reduce_amount),
if(new.feight_fee is null,old.feight_fee,new.feight_fee),
if(new.final_total_amount is
null,old.final_total_amount,new.final_total_amount),
date_format(if(new.tms[\'1001\'] is
null,old.create_time,new.tms[\'1001\']),\'yyyy-MM-dd\')
from
(
select
id,
order_status,
user_id,
out_trade_no,
create_time,
payment_time,
cancel_time,
finish_time,
refund_time,
refund_finish_time,
province_id,
activity_id,
original_total_amount,
benefit_reduce_amount,
feight_fee,
final_total_amount
from ${APP}.dwd_fact_order_info
where dt
in
(
select
date_format(create_time,\'yyyy-MM-dd\')
from ${APP}.ods_order_info
where dt=\'$do_date\'
)
)old
full outer join
(
select
info.id,
info.order_status,
info.user_id,
info.out_trade_no,
info.province_id,
act.activity_id,
log.tms,
info.original_total_amount,
info.benefit_reduce_amount,
info.feight_fee,
info.final_total_amount
from
(
select
order_id,
str_to_map(concat_ws(\',\',collect_set(concat(order_status,\'=\',operate_time))),\',\',\'
=\') tms
from ${APP}.ods_order_status_log
where dt=\'$do_date\'
group by order_id
)log
join
(
select * from ${APP}.ods_order_info where dt=\'$do_date\'
)info
on log.order_id=info.id
left join
(
select * from ${APP}.ods_activity_order where dt=\'$do_date\'
)act
on log.order_id=act.order_id
)new
on old.id=new.id;


insert overwrite table ${APP}.dwd_dim_user_info_his_tmp
select * from
(
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
\'$do_date\' start_date,
\'9999-99-99\' end_date
from ${APP}.ods_user_info where dt=\'$do_date\'
union all
select
uh.id,
uh.name,
uh.birthday,
uh.gender,
uh.email,
uh.user_level,
uh.create_time,
uh.operate_time,
uh.start_date,
if(ui.id is not null and uh.end_date=\'9999-99-99\', date_add(ui.dt,-1),
uh.end_date) end_date
from ${APP}.dwd_dim_user_info_his uh left join
(
select
*
from ${APP}.ods_user_info
where dt=\'$do_date\'
) ui on uh.id=ui.id
)his
order by his.id, start_date;


insert overwrite table ${APP}.dwd_dim_user_info_his select * from
${APP}.dwd_dim_user_info_his_tmp;
"

sql2="
insert overwrite table ${APP}.dwd_dim_base_province
select
bp.id,
bp.name,
bp.area_code,
bp.iso_code,
bp.region_id,
br.region_name
from ${APP}.ods_base_province bp
join ${APP}.ods_base_region br
on bp.region_id=br.id;
"

case $1 in
"first"){
$hive -e "$sql1"
$hive -e "$sql2"
};;
"all"){
$hive -e "$sql1"
};;
esac

2)增加脚本执行权限

chmod 770 ods_to_dwd_db.sh

3)执行脚本导入数据

ods_to_dwd_db.sh all 2020-03-11

4)查看导入数据

select * from dwd_fact_order_info where dt=\'2020-03-11\';
select * from dwd_fact_order_detail where dt=\'2020-03-11\';
select * from dwd_fact_comment_info where dt=\'2020-03-11\';
select * from dwd_fact_order_refund_info where dt=\'2020-03-11\';

 

分类:

技术点:

相关文章:

  • 2021-09-25
  • 2021-11-12
  • 2021-10-05
  • 2021-11-01
  • 2021-10-01
  • 2021-09-25
  • 2021-11-24
  • 2021-12-10
猜你喜欢
  • 2021-09-25
  • 2021-09-25
  • 2021-10-26
  • 2021-10-26
  • 2021-09-25
  • 2021-09-25
  • 2021-09-25
相关资源
相似解决方案