xiongmozhou

设计一张日期维度表

表名称:DIM_PUB_DATE

 

具体属性值:

字段名称
字段类型
字段描述
示例
ds
string
Id(主键)
20210624
d_date
string
日期
2021-06-24
d_datetime
string
日期(包含时分秒)
2021-06-24 00:00:00
y_ds
string
去年id
20200624(减12月)
y_date
string
去年日期
2020-06-24
y_datetime
string
去年日期(包含时分秒)
2020-06-24 00:00:00
d_year
string
年份
2021
d_yearstartday
string
年初日
2021-01-01
y_year
string
去年份
2020
y_yearstartday
string
去年初日
2020-01-01
d_month
string
月份
2021-06
d_monthstartday
string
月初日
2021-06-01
d_monthendday
string
月尾日
2021-06-30
y_month
string
去年月份
2020-06
y_monthstartday
string
去年月初日
2020-06-01
y_monthendday
string
去年月尾日
2020-06-30
d_quarter
string
季度
Q2
d_yearquarter
string
季度(带年份)
2021-2
week
string
星期(数字表示)
1,2,3,4,5,6,7:1代表星期一
d_week
string
星期几
星期一
d_dayofyear
string
当年第几天
D100
d_dayofmonth
string
当月第几天
D24
d_dayofweek
string
当周第几天
D4
d_weekofmonth
string
当月第几周
W4
d_weekofyear
string
当年第几周
W24
is_weekend
string
是否周末
1:是,2:否
day_type
string
日历天类型
0:节假日,1:工作日,2:休息日
(调休也属于工作日)
hol_name
string
节假日名称
元旦,调休,春节,清明节,劳动节,端午节,中秋节,国庆节

 

生成方法: 

使用python生成一张维度表,考虑到通用性,可以先写到Excel或者CSV里面,然后自定义写入到hive或者odps里面去。

Py3语法:

# -*- coding:utf-8 -*-
import csv
import json
from datetime import *
import requests
from dateutil.relativedelta import *
import calendar
import pandas as pd
from odps import ODPS


def getDate(ddate, holiday_list):
    # 日期
    d_date = datetime.strptime(ddate, "%Y-%m-%d").date()
    # id(主键)
    ds = str(d_date).replace(\'-\', \'\')
    # 日期时间(包含时分秒)
    d_datetime = datetime(d_date.year,d_date.month,d_date.day)
    # 去年今日
    y_date = d_date + relativedelta(months=-12)
    # 去年ds
    y_ds = str(y_date).replace(\'-\', \'\')
    # 去年日期
    y_datetime = datetime(y_date.year, y_date.month, y_date.day)
    # 年份
    d_year = str(d_date.year)
    # 年初日
    d_yearstartday = d_year+\'-01-01\'
    # 年份
    y_year = str(y_date.year)
    # 去年初日
    y_yearstartday = y_year + \'-01-01\'
    # 月份
    d_month = d_year+\'-\'+str(d_date).split(\'-\')[1]
    # 月初日
    d_monthstartday = d_month + \'-01\'
    # 月尾日
    d_monthendday = d_month + \'-\' + str(calendar.monthrange(int(d_year), int(d_date.month))[1])
    # 去年月份
    y_month = y_year+\'-\'+str(y_date).split(\'-\')[1]
    # 去年月初日
    y_monthstartday = y_month + \'-01\'
    # 去年月尾日
    y_monthendday = y_month + \'-\' + str(calendar.monthrange(int(y_year), int(y_date.month))[1])
    # 季度
    d_quarter = "Q"+str((int(d_date.month)-1) // 3 + 1)
    # 星期几
    week = d_date.strftime(\'%u\')
    # 星期
    d_week = \'\'
    if week == \'1\':
        d_week = "星期一"
    elif week == \'2\':
        d_week = "星期二"
    elif week == \'3\':
        d_week = "星期三"
    elif week == \'4\':
        d_week = "星期四"
    elif week == \'5\':
        d_week = "星期五"
    elif week == \'6\':
        d_week = "星期六"
    else:
        d_week = "星期日"
    # 季度带年份
    d_yearquarter = y_year+\'-\'+str((int(d_date.month)-1) // 3 + 1)
    # 当年第几天
    d_dayofyear = d_date.strftime(\'%j\')
    # 当月第几天
    d_dayofmonth = d_date.strftime(\'%d\')
    # 当周第几天:星期一为第一天 (值从1到7,星期一为1)
    d_dayofweek = d_date.strftime(\'%u\')
    # 当月第几周:第一天只要不是星期一,都算一周
    d_weekofmonth = (int(d_date.strftime(\'%W\'))-int(date(d_date.year,d_date.month,1).strftime(\'%W\')))+1
    # 当年第几周:把星期一做为第一天(值从0到53)
    d_weekofyear = int(d_date.strftime(\'%W\'))+1
    # 是否周末
    is_weekend = \'\'
    # 日期类型:0:节假日,1:工作日,2:休息日
    day_type = \'\'
    # 具体节假日名称(包含调休)
    hol_name = \'\'
    if d_dayofweek in (\'6\', \'7\'):
        is_weekend = \'1\'
        day_type = \'2\'
    else:
        is_weekend = \'0\'
        day_type = \'1\'

    # 法定节假日:0,工作日:1,休息日:2
    for hol_json in holiday_list:
        if str(d_date) == json.loads(hol_json)[\'date\']:
            # 是否节假日
            if json.loads(hol_json)[\'flag\'] != "调休":
                day_type = \'0\'
            else:
                day_type = \'1\'
            # 具体节假日(包含调休)
            hol_name = json.loads(hol_json)[\'flag\']

    # 把数据添加到列表中
    date_list = [ds, str(d_date), str(d_datetime), y_ds, str(y_date), str(y_datetime),
                 d_year, d_yearstartday, y_year, y_yearstartday,
                 d_month, d_monthstartday, d_monthendday, y_month,
                 y_monthstartday, y_monthendday, d_quarter, d_yearquarter, week,
                 d_week, \'D\'+str(d_dayofyear), \'D\'+str(d_dayofmonth), \'D\'+str(d_dayofweek),
                 \'W\'+str(d_weekofmonth), \'W\'+str(d_weekofyear), is_weekend, day_type, hol_name]

    return date_list

# 获取节假日,转为需要的格式
def get_holiday_dict(start_year, y_cnt):
    y_list = [start_year]
    for i in range(1, y_cnt+1):
        y_list.append(start_year+i)

    hol_list = []
    dict_res = {}
    for year in y_list:
        headers = {\'User-Agent\':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        response = requests.get(\'https://timor.tech/api/holiday/year/\'+str(year)+\'/\', headers=headers)
        res = response.json()[\'holiday\'].items()
        for i in res:
            # 日期
            dict_res[\'date\'] = i[1][\'date\']
            # 标识节假日(详细)
            dict_res[\'name\'] = i[1][\'name\']
            # 薪资倍数,3表示是3倍工资
            dict_res[\'wage\'] = i[1][\'wage\']
            dict_res[\'holiday\'] = i[1][\'holiday\']
            # 标识节假日
            if ("" in str(i[1][\'name\'])) or ("除夕" in str(i[1][\'name\'])):
                dict_res[\'flag\'] = \'春节\'
            elif "调休" in str(i[1][\'name\']):
                dict_res[\'flag\'] = \'调休\'
            else:
                dict_res[\'flag\'] = i[1][\'name\']
            hol_list.append(str(json.dumps(dict_res, ensure_ascii=False)))
    print("节假日数据已经获取完毕!!!")
    return hol_list

#获取生成指定日期范围的数据
def get_ste_day(time_start, time_end, holiday_list):
    date_l = [datetime.strftime(x, \'%Y-%m-%d\') for x in list(pd.date_range(start=time_start, end=time_end))]

    res_list = []
    # 遍历日期,生成时间数据
    for ddate in date_l:
        date_list = getDate(ddate, holiday_list)
        res_list.append(date_list)

    # 返回结果数据
    return res_list

# 把数据写入csv里面
def write_csv(res_list):
    # 1. 创建文件对象
    f = open(\'E:\\dim_pub_date.txt\', \'w\', encoding=\'utf-8\', newline="")

    # 2. 基于文件对象构建 csv写入对象
    csv_writer = csv.writer(f)

    # 3. 构建列表头
    csv_writer.writerow(["ds", "d_date", "d_datetime", "y_ds", "y_date", "y_datetime",
                         "d_year", "d_yearstartday", "y_year", "y_yearstartday",
                         "d_month", "d_monthstartday", "d_monthendday", "y_month",
                         "y_monthstartday", "y_monthendday", "d_quarter", "d_yearquarter", "week",
                         "d_week", "d_dayofyear", "d_dayofmonth", "d_dayofweek",
                         "d_weekofmonth", "d_weekofyear", "is_weekend", "day_type", "hol_name"])

    # 4. 写入csv文件内容
    for ls in res_list:
        if len(ls) != 0:
            csv_writer.writerow(ls)

    # 5. 关闭文件
    print("数据写入csv完成!!!")
    f.close()

# 把数据写入到odps里面
def write_odps(res_list):
    # 获取odps环境
    o = ODPS(\'xxx\', \'xxx\', \'DILI_FRESH\',
             endpoint=\'http://service.cn-beijing.maxcompute.aliyun.com/api\')


    # 写表
    t = o.get_table("dim_pub_date")
    with t.open_writer(partition="year=2018", create_partition=True) as writer:
        writer.write(res_list)

    print("数据写入odps完成!!!")


# 把数据写入到hive里面
def write_hive(res_list):
    pass

if __name__ == \'__main__\':
    # 初始时间和结束时间
    time_start = date(2018, 1, 1)
    time_end = date(2018, 12, 31)
    # 节假日年份
    y_cnt = time_end.year-time_start.year
    # 获取节假日信息
    holiday_list = get_holiday_dict(time_start.year, y_cnt)
    # 生成指定的时间范围数据
    res_list = get_ste_day(time_start, time_end, holiday_list)
    # 把数据写入到csv文件中
    write_csv(res_list)
    # 把数据写入到odps中
    write_odps(res_list)
    # 把数据写入到hive中
    write_hive(res_list)

 

 

节假日获取参考:http://timor.tech/api/holiday

 

分类:

技术点:

相关文章: