数据仓库中经常会遇到事实表以及维度表需求变更,最常见的就是增加一列数据。本篇先讨论如何需要增加一列数据,模式会发生怎样的改变。然后就销售事实表以及客户维度表分别添加一列,并在该列上运用SCD2的缓慢变化维度。
修改数据库模式
图(五)- 1-1 显示了修改后的模式,在它的customer_dim表和sales_order_fact表上增加了新列。customer_dim表增加的新列是shipping_address、shipping_zip_code、shipping_city和shipping_state。sales_order_fact表增加的新列是order_quantity。使用清单(五)-1-1里的SQL脚本修改数据库模式。
(此图片引用自blog)
use dw; alter table dbo.Customer_dim add shipping_address varchar(50); alter table dbo.Customer_dim add shipping_zip_code int; alter table dbo.Customer_dim add shipping_city varchar(30); alter table dbo.Customer_dim add shipping_state varchar(2);
alter table dbo.customer_stg add shipping_address varchar(50); alter table dbo.customer_stg add shipping_zip_code int; alter table dbo.customer_stg add shipping_city varchar(30); alter table dbo.customer_stg add shipping_state varchar(2);
alter table dbo.Sales_order_fact add order_quantity int; go use source; alter table dbo.Customer add shipping_address varchar(50); alter table dbo.Customer add shipping_zip_code int; alter table dbo.Customer add shipping_city varchar(30); alter table dbo.Customer add shipping_state varchar(2); alter table dbo.Sales_Order add order_quantity int; go
set ANSI_NULLS ON set QUOTED_IDENTIFIER ON go ALTER procedure [dbo].[test] as --定期加载数据 UPDATE cdc_time SET current_load = getdate() ; declare @eff_date datetime; select @eff_date=current_load from dbo.cdc_time; begin truncate table dbo.customer_stg; truncate table dbo.product_stg; --客户以及产品表整体拉取(客户表address实现scd2,name实现scd1 insert into dbo.customer_stg (customer_number ,customer_name ,customer_street_address ,customer_zip_code ,customer_city ,customer_state ,shipping_address ,shipping_zip_code ,shipping_city ,shipping_state) select Customer_number ,Customer_name ,Customer_street_address ,Customer_zip_code ,Customer_city ,Customer_state ,shipping_address ,shipping_zip_code ,shipping_city ,shipping_state from source.dbo.Customer; --新增数据 insert into dbo.Customer_dim (Customer_number ,Customer_name ,Customer_street_address ,Customer_zip_code ,Customer_city ,Customer_state ,shipping_address ,shipping_zip_code ,shipping_city ,shipping_state ,effective_date ,expiry_date ,version) select Customer_number ,Customer_name ,Customer_street_address ,Customer_zip_code ,Customer_city ,Customer_state ,shipping_address ,shipping_zip_code ,shipping_city ,shipping_state ,@eff_date ,convert(datetime,\'2099-12-31\',120) ,1 from dbo.customer_stg where customer_number not in ( SELECT customer_number FROM customer_dim x); --更新数据处理 update customer_dim set expiry_date=@eff_date where exists (select 1 from customer_dim a,customer_stg b where a.customer_number=b.customer_number and (a.Customer_street_address<>b.Customer_street_address or a.shipping_address<>b.shipping_address or a.shipping_zip_code<>b.shipping_zip_code or a.shipping_city<>b.shipping_city or a.shipping_state<>b.shipping_state) AND expiry_date = convert(datetime,\'2099-12-31\',120)) ; insert into customer_dim (Customer_number ,Customer_name ,Customer_street_address ,Customer_zip_code ,Customer_city ,Customer_state ,shipping_address ,shipping_zip_code ,shipping_city ,shipping_state ,effective_date ,expiry_date ,version) select b.Customer_number ,b.Customer_name ,b.Customer_street_address ,b.Customer_zip_code ,b.Customer_city ,b.Customer_state ,b.shipping_address ,b.shipping_zip_code ,b.shipping_city ,b.shipping_state ,@eff_date ,convert(datetime,\'2099-12-31\',120) ,a.version+1 from customer_dim a ,customer_stg b where a.customer_number=b.customer_number and (a.Customer_street_address<>b.Customer_street_address or a.shipping_address<>b.shipping_address or a.shipping_zip_code<>b.shipping_zip_code or a.shipping_city<>b.shipping_city or a.shipping_state<>b.shipping_state) AND EXISTS( SELECT * FROM customer_dim x WHERE b.customer_number=x.customer_number AND a.expiry_date = @eff_date) AND NOT EXISTS ( SELECT * FROM customer_dim y WHERE b.customer_number = y.customer_number AND y.expiry_date = convert(datetime,\'2099-12-31\',120)) ; insert into dbo.product_stg (Product_code ,product_name ,product_category) select Product_code,Product_name,Product_category from source.dbo.Product; --设置scd的截止时间以及生效时间 --新产品 insert into dbo.Product_dim (Product_code ,product_name ,product_category ,effective_date ,expire_date ,version) select Product_code,Product_name,Product_category ,@eff_date ,convert(datetime,\'2099-12-31\',120) ,1 from dbo.product_stg where Product_code not in (select Product_code from dbo.Product_dim); --产品表更新 update dbo.Product_dim set expire_date=@eff_date where exists( select 1 from dbo.Product_dim a,dbo.product_stg b where a.Product_code=b.Product_code and (a.product_name<>b.product_name or a.product_category<>b.product_category) and a.expire_date=convert(datetime,\'2099-12-31\',120)); insert into dbo.Product_dim (Product_code ,product_name ,product_category ,effective_date ,expire_date ,version) select b.product_code , b.product_name , b.product_category , @eff_date ,\'2099-12-31\' , a.version + 1 FROM product_dim a , product_stg b WHERE a.product_code = b.product_code AND ( a.product_name <> b.product_name OR a.product_category <> b.product_category) AND EXISTS( SELECT * FROM product_dim x WHERE b.product_code = x.product_code AND a.expire_date = @eff_date) AND NOT EXISTS ( SELECT * FROM product_dim y WHERE b.product_code = y.product_code AND y.expire_date = \'2099-12-31\') ; --新数据 --新增的订单 -- 装载订单维度,新增前一天的订单号 INSERT INTO order_dim ( order_number , effective_date , expiry_date) SELECT order_number , order_date , \'2200-01-01\' FROM source.sales_order, cdc_time WHERE entry_date >= last_load AND entry_date < current_load ; -- 装载事实表,新增前一天的订单 INSERT INTO sales_order_fact SELECT order_sk , customer_sk , product_sk , date_sk , order_amount , order_quantity FROM source.sales_order a , order_dim b , customer_dim c , product_dim d , date_dim e , cdc_time f WHERE a.order_number = b.order_number AND a.customer_number = c.customer_number AND a.order_date >= c.effective_date AND a.order_date < c.expiry_date AND a.product_code = d.product_code AND a.order_date >= d.effective_date AND a.order_date < d.expiry_date AND a.order_date = e.date AND a.entry_date >= f.last_load AND a.entry_date < f.current_load ; -- 更新时间戳表的last_load字段 UPDATE cdc_time SET last_load = current_load ; end;
Kettle解决方案:
Kettle 需要更改的步骤:
装载中间表
装载维度表(客户维度修改)
装载事实表
修改后的配置如下:
37 NULL NULL NULL 1
38 22 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
39 23 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
40 24 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
41 25 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
42 26 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
43 27 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
44 28 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
45 30 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
46 31 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
47 32 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
48 33 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
49 34 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
50 35 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
51 36 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
62 NULL NULL NULL NULL NULL 1
63 1 Hard Disk Drive Storage 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
64 2 Floppy Drive Storage 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
65 3 LCD Panel Monitor 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
66 4 Keyboard Peripheral 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1
客户维度表:
249 NULL NULL NULL NULL NULL NULL NULL NULL 1 NULL NULL NULL NULL
250 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL 10001 NULL NULL
251 2 Small Stores 2500 Woodland St. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
252 3 Medium Retailers 1111 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
253 4 Good Companies 9500 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
254 5 Wonderful Shops 3333 Rossmoyne Rd. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
255 6 Loyal Clients 7070 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
256 7 Distinguished Partners 9999 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
事实表数据:
250 63 5564 38 1000 1000
251 64 5564 39 2000 2000
252 65 5564 40 3000 3000
253 66 5564 41 4000 4000
254 64 5564 42 1000 1000
255 64 5564 43 3000 3000
256 65 5564 44 5000 5000
250 63 5564 45 1000 1000
251 64 5564 46 2000 2000
252 65 5564 47 4000 4000
253 66 5564 48 6000 6000
254 63 5564 49 2500 2500
255 64 5564 50 5000 5000
256 65 5564 51 7500 7500
use source; update dbo.Customer set shipping_city=\'shanghai\' where Customer_number=1;
客户维度表数据变为:
249 NULL NULL NULL NULL NULL NULL NULL NULL 1 NULL NULL NULL NULL
250 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 1900-01-01 00:00:00.000 2016-03-27 00:00:00.000 1 NULL 10001 NULL NULL
251 2 Small Stores 2500 Woodland St. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
252 3 Medium Retailers 1111 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
253 4 Good Companies 9500 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
254 5 Wonderful Shops 3333 Rossmoyne Rd. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
255 6 Loyal Clients 7070 Ritter Rd. 17055 Pittsburgh PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
256 7 Distinguished Partners 9999 Scott St. 17050 Mechanicsburg PA 1900-01-01 00:00:00.000 2200-01-01 00:00:00.000 1 NULL NULL NULL NULL
257 1 Really Large 7500 Louise Dr. 1 Mechanicsburg PA 2016-03-27 00:00:00.000 2200-01-01 00:00:00.000 2 NULL 10001 shanghai NULL