doudou1

 数据仓库中经常会遇到事实表以及维度表需求变更,最常见的就是增加一列数据。本篇先讨论如何需要增加一列数据,模式会发生怎样的改变。然后就销售事实表以及客户维度表分别添加一列,并在该列上运用SCD2的缓慢变化维度。

 

 修改数据库模式

        图(五)- 1-1 显示了修改后的模式,在它的customer_dim表和sales_order_fact表上增加了新列。customer_dim表增加的新列是shipping_address、shipping_zip_code、shipping_city和shipping_state。sales_order_fact表增加的新列是order_quantity。使用清单(五)-1-1里的SQL脚本修改数据库模式。

(此图片引用自blog)

图(五)- 1-1
use dw;
alter table dbo.Customer_dim add  shipping_address varchar(50);
alter table dbo.Customer_dim add  shipping_zip_code int;
alter table dbo.Customer_dim add  shipping_city varchar(30);
alter table dbo.Customer_dim add  shipping_state varchar(2);
alter table dbo.customer_stg add  shipping_address varchar(50);
alter table dbo.customer_stg add  shipping_zip_code int;
alter table dbo.customer_stg add  shipping_city varchar(30);
alter table dbo.customer_stg add  shipping_state varchar(2);
alter table dbo.Sales_order_fact add  order_quantity int;
go
use source;
alter table dbo.Customer add  shipping_address varchar(50);
alter table dbo.Customer add  shipping_zip_code int;
alter table  dbo.Customer add  shipping_city varchar(30);
alter table dbo.Customer add  shipping_state varchar(2);
alter table dbo.Sales_Order add  order_quantity int;
go

 

 修改表模式之后,需修改初始化加载脚本以及定期加载脚本。但由于初始化数据已经加载进数据库,因此此处只修改定期加载脚本。
set ANSI_NULLS ON
set QUOTED_IDENTIFIER ON
go

ALTER  procedure [dbo].[test] as 
--定期加载数据
UPDATE cdc_time SET current_load = getdate() ; 
declare @eff_date datetime;
select @eff_date=current_load from dbo.cdc_time;

begin

truncate table  dbo.customer_stg;
truncate table dbo.product_stg;
--客户以及产品表整体拉取(客户表address实现scd2,name实现scd1
insert into dbo.customer_stg
(customer_number
,customer_name
,customer_street_address
,customer_zip_code
,customer_city
,customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state)
select Customer_number
,Customer_name
,Customer_street_address
,Customer_zip_code
,Customer_city
,Customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state
from source.dbo.Customer;


--新增数据
insert into dbo.Customer_dim
(Customer_number
,Customer_name
,Customer_street_address
,Customer_zip_code
,Customer_city
,Customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state
,effective_date
,expiry_date
,version)
select Customer_number
,Customer_name
,Customer_street_address
,Customer_zip_code
,Customer_city
,Customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state
,@eff_date
,convert(datetime,\'2099-12-31\',120)
,1 from dbo.customer_stg where customer_number not in (  
SELECT customer_number  
FROM customer_dim x);
--更新数据处理
update customer_dim 
set expiry_date=@eff_date
where exists (select 1 from customer_dim a,customer_stg b where a.customer_number=b.customer_number
and (a.Customer_street_address<>b.Customer_street_address
or a.shipping_address<>b.shipping_address
or a.shipping_zip_code<>b.shipping_zip_code
or a.shipping_city<>b.shipping_city
or a.shipping_state<>b.shipping_state)
AND expiry_date = convert(datetime,\'2099-12-31\',120)) ;

insert into customer_dim
(Customer_number
,Customer_name
,Customer_street_address
,Customer_zip_code
,Customer_city
,Customer_state
,shipping_address
,shipping_zip_code
,shipping_city
,shipping_state
,effective_date
,expiry_date
,version)
select b.Customer_number
,b.Customer_name
,b.Customer_street_address
,b.Customer_zip_code
,b.Customer_city
,b.Customer_state
,b.shipping_address
,b.shipping_zip_code
,b.shipping_city
,b.shipping_state
,@eff_date
,convert(datetime,\'2099-12-31\',120)
,a.version+1
from customer_dim a ,customer_stg b
where a.customer_number=b.customer_number
and (a.Customer_street_address<>b.Customer_street_address
or a.shipping_address<>b.shipping_address
or a.shipping_zip_code<>b.shipping_zip_code
or a.shipping_city<>b.shipping_city
or a.shipping_state<>b.shipping_state)
AND EXISTS(  
SELECT *  
FROM customer_dim x
WHERE  
    b.customer_number=x.customer_number  
AND a.expiry_date = @eff_date)  
AND NOT EXISTS (  
SELECT *  
FROM customer_dim y  
WHERE  
    b.customer_number = y.customer_number  
AND y.expiry_date = convert(datetime,\'2099-12-31\',120)) ;  


insert into dbo.product_stg
(Product_code
,product_name
,product_category)
select Product_code,Product_name,Product_category from source.dbo.Product;

--设置scd的截止时间以及生效时间

--新产品
insert into dbo.Product_dim
(Product_code
,product_name
,product_category
,effective_date
,expire_date
,version) 
select Product_code,Product_name,Product_category
,@eff_date
,convert(datetime,\'2099-12-31\',120)
,1 from dbo.product_stg where Product_code not in (select 
Product_code from dbo.Product_dim);

--产品表更新
update dbo.Product_dim  
set expire_date=@eff_date
where exists(
select 1 from dbo.Product_dim a,dbo.product_stg b where
a.Product_code=b.Product_code
and (a.product_name<>b.product_name
or a.product_category<>b.product_category)
and a.expire_date=convert(datetime,\'2099-12-31\',120));

insert into dbo.Product_dim 
(Product_code
,product_name
,product_category
,effective_date
,expire_date
,version) 
select  b.product_code  
, b.product_name  
, b.product_category  
, @eff_date
,\'2099-12-31\'
, a.version + 1 
FROM  
  product_dim a  
, product_stg b
WHERE  
    a.product_code = b.product_code  
AND (   a.product_name <> b.product_name  
     OR a.product_category <> b.product_category)  
AND EXISTS(  
SELECT *  
FROM product_dim x  
WHERE     b.product_code = x.product_code  
      AND a.expire_date = @eff_date)  
AND NOT EXISTS (  
SELECT *  
FROM product_dim y  
WHERE     b.product_code = y.product_code  
      AND y.expire_date = \'2099-12-31\') ; 
--新数据


--新增的订单
-- 装载订单维度,新增前一天的订单号  
INSERT INTO order_dim (  
  order_number  
, effective_date  
, expiry_date)  
SELECT  
  order_number  
, order_date  
, \'2200-01-01\'  
FROM source.sales_order, cdc_time  
WHERE entry_date >= last_load AND entry_date < current_load ;  
  
-- 装载事实表,新增前一天的订单  
INSERT INTO sales_order_fact  
SELECT  
  order_sk  
, customer_sk  
, product_sk  
, date_sk  
, order_amount 
, order_quantity
FROM  
  source.sales_order a  
, order_dim b  
, customer_dim c  
, product_dim d  
, date_dim e  
, cdc_time f  
WHERE  
    a.order_number = b.order_number  
AND a.customer_number = c.customer_number  
AND a.order_date >= c.effective_date  
AND a.order_date < c.expiry_date  
AND a.product_code = d.product_code  
AND a.order_date >= d.effective_date  
AND a.order_date < d.expiry_date  
AND a.order_date = e.date  
AND a.entry_date >= f.last_load AND a.entry_date < f.current_load ;  
  
-- 更新时间戳表的last_load字段  
UPDATE cdc_time SET last_load = current_load ;  

end;

Kettle解决方案:

 

Kettle 需要更改的步骤:

   装载中间表

  装载维度表(客户维度修改)

 装载事实表

 

修改后的配置如下:

 
 
 
 
修改后数据:
 order 维度表加载数据:

37 NULL NULL NULL 1
38 22 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
39 23 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
40 24 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
41 25 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
42 26 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
43 27 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
44 28 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
45 30 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
46 31 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
47 32 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
48 33 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
49 34 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
50 35 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
51 36 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1

产品维度表:

62 NULL NULL NULL NULL NULL 1
63 1 Hard Disk Drive Storage 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
64 2 Floppy Drive Storage 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
65 3 LCD Panel Monitor 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
66 4 Keyboard Peripheral 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1

 

客户维度表:

249 NULL NULL NULL NULL NULL NULL NULL NULL 1 NULL NULL NULL NULL
250 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL 10001 NULL NULL
251 2 Small Stores 2500 Woodland St. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
252 3 Medium Retailers 1111 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
253 4 Good Companies 9500 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
254 5 Wonderful Shops 3333 Rossmoyne Rd. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
255 6 Loyal Clients 7070 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
256 7 Distinguished Partners 9999 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL

 

事实表数据:

250 63 5564 38 1000 1000
251 64 5564 39 2000 2000
252 65 5564 40 3000 3000
253 66 5564 41 4000 4000
254 64 5564 42 1000 1000
255 64 5564 43 3000 3000
256 65 5564 44 5000 5000
250 63 5564 45 1000 1000
251 64 5564 46 2000 2000
252 65 5564 47 4000 4000
253 66 5564 48 6000 6000
254 63 5564 49 2500 2500
255 64 5564 50 5000 5000
256 65 5564 51 7500 7500

 

 执行如下脚本测试:
use source;
update dbo.Customer set shipping_city=\'shanghai\' where Customer_number=1;

客户维度表数据变为:

249 NULL NULL NULL NULL NULL NULL NULL NULL 1 NULL NULL NULL NULL
250 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 1900-01-01 00:00:00.000 2016-03-27 00:00:00.000 1 NULL 10001 NULL NULL
251 2 Small Stores 2500 Woodland St. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
252 3 Medium Retailers 1111 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
253 4 Good Companies 9500 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
254 5 Wonderful Shops 3333 Rossmoyne Rd. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
255 6 Loyal Clients 7070 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
256 7 Distinguished Partners 9999 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
257 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 2016-03-27 00:00:00.000 2200-01-01 00:00:00.000 2 NULL 10001 shanghai NULL

分类:

技术点:

相关文章: