RandomForest 模型融合

# RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Random-forest hyper-parameter grid:
#   n_estimators - how many trees to grow
#   max_depth    - how deep each tree may grow (usually kept at 10 or below)
#   max_features - fraction of the features sampled when growing the trees
param_grid = {
    'n_estimators': [5, 8, 10, 15, 20, 50, 100, 200],
    'max_depth': [3, 5, 7, 9],
    'max_features': [0.6, 0.7, 0.8, 0.9],
}

rf = RandomForestRegressor()

# Exhaustive 3-fold cross-validated search over the grid; refit=True retrains
# the best parameter combination on the full training set afterwards.
grid = GridSearchCV(rf, param_grid=param_grid, cv=3, n_jobs=8, refit=True)
grid.fit(train_X, train_y)

breg = grid.best_estimator_
print(breg)
# Scored on the same data the model was fitted on, so this is an optimistic
# figure rather than an estimate of out-of-sample quality.
print(breg.score(train_X, train_y))

用模型进行预测

from datetime import datetime 

def dataprocess(t):
    """Convert a date-like value to a compact ``YYYYMMDD`` string.

    Only the first ten characters of ``str(t)`` are parsed, so any value
    whose string form starts with ``YYYY-MM-DD`` is accepted (a plain string
    or a ``datetime``, for example).

    Raises:
        ValueError: if those ten characters are not a ``YYYY-MM-DD`` date.
    """
    date_text = str(t)[:10]
    parsed = datetime.strptime(date_text, '%Y-%m-%d')
    return parsed.strftime('%Y%m%d')


# Predict with the tuned random forest.
# NOTE(review): user_id is overwritten with the day of the month here,
# presumably so the appended October rows carry no missing user_id when they
# reach predict() - confirm this placeholder is really what is wanted.
test_X['user_id'] = test_X['day_of_month']

# One row per day of October 2016.
commit_df = pd.DataFrame(
    {'predict_date': pd.date_range('2016/10/1', periods=31, freq='D')}
)
prediction = breg.predict(test_X)
# Assign the array directly: a length mismatch then raises instead of being
# silently index-aligned.
commit_df['predict_power_consumption'] = prediction.astype('int')
commit_df['predict_date'] = commit_df['predict_date'].apply(dataprocess)

commit_df.head()

总结：

通过上面这个通过用电量分析来预测未来用电量的例子，我们可以发现，在建模前对业务数据的分析和特征提取很重要，它直接决定了预测准确度的高低，所以好的特征提取很重要。只有尽可能全面、准确地了解业务场景，才能比较好地做特征提取，再加上合适的算法模型，才能做出好的效果。

完整版代码

import numpy as np
import pandas as pd

# Load the raw records and parse the date column.
_df = pd.read_csv("tianchi_powerdata/zhenjiang_power.csv")
_df.head()
_df['record_date'] = pd.to_datetime(_df['record_date'])
_df.head()

# Total power consumption per calendar day.
train_df = (
    _df[['record_date', 'power_consumption']]
    .groupby(by='record_date')
    .agg('sum')
    .reset_index()
)
train_df.head()

# Placeholder rows for 2016-10-01 .. 2016-10-31 (the days to be predicted).
test_df = pd.DataFrame(
    {'record_date': pd.date_range('2016-10-1', periods=31, freq='D')}
)
test_df['power_consumption'] = 0.0
test_df

# NOTE(review): the raw frame (_df) is concatenated here, not the per-day
# aggregate (train_df) computed above, so train_df is never used afterwards.
# Confirm which of the two is intended before changing it: later cells read
# the user_id column that only _df provides.
total_df = pd.concat([_df, test_df])
# A bare `total_df.dropna()` used to follow; its result was discarded, so it
# had no effect. Do not assign it back blindly: the placeholder rows have no
# user_id and would be dropped.
total_df.tail()

# Calendar features derived from record_date (which must be datetime-typed).
total_df['day_of_week'] = total_df['record_date'].dt.dayofweek
total_df['day_of_month'] = total_df['record_date'].dt.day
total_df['day_of_year'] = total_df['record_date'].dt.dayofyear
total_df['month_of_year'] = total_df['record_date'].dt.month
total_df['year'] = total_df['record_date'].dt.year

# Weekend flags: consumption on Saturdays/Sundays differs from working days.
# day_of_week is 0 for Monday, so 5 is Saturday and 6 is Sunday.
total_df['holiday'] = total_df['day_of_week'].isin([5, 6]).astype(int)
total_df['holiday_sat'] = (total_df['day_of_week'] == 5).astype(int)
total_df['holiday_sun'] = (total_df['day_of_week'] == 6).astype(int)
# Which of the four "weeks" of the month a day belongs to.
def week_of_month(day):
    """Map a day of the month to its week bucket.

    Days 1-7 give 1, 8-14 give 2, 15-21 give 3 and 22-31 give 4.
    Any other value gives ``None``.
    """
    buckets = (
        (range(1, 8), 1),
        (range(8, 15), 2),
        (range(15, 22), 3),
        (range(22, 32), 4),
    )
    for span, week in buckets:
        if day in span:
            return week
    return None

# Week-of-month bucket for every row; the function is passed directly, no
# wrapping lambda is needed.
total_df['week_of_month'] = total_df['day_of_month'].apply(week_of_month)
total_df.head()

# Which ten-day period (early / middle / late) of the month a day belongs to.
def period_of_month(day):
    """Map a day of the month to its ten-day period.

    Days 1-10 give 1, 11-20 give 2 and 21-31 give 3.
    Any other value gives ``None``.
    """
    spans = {1: range(1, 11), 2: range(11, 21), 3: range(21, 32)}
    return next((label for label, span in spans.items() if day in span), None)
    
# Ten-day period for every row; the function is passed directly, no wrapping
# lambda is needed.
total_df['period_of_month'] = total_df['day_of_month'].apply(period_of_month)
total_df.head()

# First half or second half of the month.
def period2_of_month(day):
    """Map a day of the month to its half of the month.

    Days 1-15 give 1 and days 16-31 give 2. Any other value gives ``None``.
    """
    if day in range(1, 16):
        half = 1
    elif day in range(16, 32):
        half = 2
    else:
        half = None
    return half
# Half-of-month flag for every row; the function is passed directly, no
# wrapping lambda is needed.
total_df['period2_of_month'] = total_df['day_of_month'].apply(period2_of_month)
total_df.head()

# Hand-filled festival information. Public holidays strongly affect power
# consumption: most companies close on statutory holidays and usage drops
# sharply, so the dates are looked up in a calendar and listed by hand.
_FESTIVAL_DAYS = frozenset({
    '2016-10-01',
    '2016-10-02',
    '2016-10-03',
    '2016-10-04',
    '2016-10-05',
    '2016-10-06',
    '2016-10-07',
})


def day_of_festival(day):
    """Return 1 if *day* is one of the hand-listed festival dates, else 0.

    *day* may be a ``'YYYY-MM-DD'`` string or any value whose ``str()`` starts
    with that form (a ``datetime``/``date``, for example); only the first ten
    characters of ``str(day)`` are compared.
    """
    return 1 if str(day)[:10] in _FESTIVAL_DAYS else 0
    
total_df['festival_pc'] = 0

# Derive the flag from the calendar date. The date is formatted as
# 'YYYY-MM-DD' text first because day_of_festival compares against date
# strings; feeding it anything other than the date leaves the flag at 0.
total_df['festival'] = (
    total_df['record_date'].dt.strftime('%Y-%m-%d').apply(day_of_festival)
)

total_df.head(20)

# Feature columns available at this point, as listed by the author:
#   date, power consumption, day of week, day of month, day of year,
#   month of year, year, holiday (weekend) flag, week of month,
#   ten-day period of the month, half of the month, festival flag.
col_names = total_df.columns.values
col_names

# Check for missing values: build a one-row table holding the number of
# nulls found in each column.
counts = {name: [total_df[name].isnull().sum()] for name in col_names}

is_null_filds = pd.DataFrame(counts)
is_null_filds

#添加独热向量编码/one-hot encoding  ;针对星期几这个特征，初始化一个长度为7的向量[0,0,0,0,0,0,0]
    #对于类别型特征，我们经常在特征工程的时候会对他们做一些特殊的处理
    # 星期一会被填充成[1,0,0,0,0,0,0]
    # 星期二会被填充成[0,1,0,0,0,0,0]
    # 星期三会被填充成[0,0,1,0,0,0,0]
    # 星期四会被填充成[0,0,0,1,0,0,0]
    # 以此类推...


# Tree-based modelling. Tree models are among the most commonly used machine
# learning algorithms in industry: the best decision paths are learned on the
# training set, and the leaf at the end of each path holds the prediction.
# 1. Split into training and test sets:
#    October 2016 is the test set, every other row is training data.
is_oct_2016 = (total_df.year == 2016) & (total_df.month_of_year == 10)
train_X = total_df[~is_oct_2016]
test_X = total_df[is_oct_2016]

train_y = train_X.power_consumption
# Drop the target, the raw date and the year from the feature matrices.
train_X = train_X.drop(columns=['power_consumption', 'record_date', 'year'])
test_X = test_X.drop(columns=['power_consumption', 'record_date', 'year'])

train_X.head()

# Modelling and tuning: grid search with cross-validation picks the best
# decision-tree parameters.
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# max_features values are fractions of the feature count. The last entry is
# written as 1.0 on purpose: an integer 1 would mean "exactly one feature per
# split", not "all features".
param_grid = {
    'max_features': [0.7, 0.8, 0.9, 1.0],
    'max_depth': [3, 5, 7, 9, 12],
}

dt = DecisionTreeRegressor()

# 5-fold cross-validated search; refit=True retrains the best combination on
# the full training set afterwards.
grid = GridSearchCV(dt, param_grid=param_grid, cv=5, n_jobs=8, refit=True)
grid.fit(train_X, train_y)
best_dt_reg = grid.best_estimator_
print(best_dt_reg)
# Scored on the same data the model was fitted on, so this is an optimistic
# figure rather than an estimate of out-of-sample quality.
print(best_dt_reg.score(train_X, train_y))

from datetime import datetime 

# Convert dates to the format required by the submission file.
def dataprocess(t):
    """Convert a date-like value to a compact ``YYYYMMDD`` string.

    Only the first ten characters of ``str(t)`` are parsed, so any value
    whose string form starts with ``YYYY-MM-DD`` is accepted (a plain string
    or a ``datetime``, for example).

    Raises:
        ValueError: if those ten characters are not a ``YYYY-MM-DD`` date.
    """
    date_text = str(t)[:10]
    parsed = datetime.strptime(date_text, '%Y-%m-%d')
    return parsed.strftime('%Y%m%d')

# Build the submission frame: one row per day of October 2016.
commit_df = pd.DataFrame(
    {'predict_date': pd.date_range('2016/10/1', periods=31, freq='D')}
)

# Predict with the tuned decision tree. The DataFrame itself is passed (not
# .values) so the input has the same column labels the model was fitted with.
prediction = best_dt_reg.predict(test_X)
# Assign the array directly: a length mismatch then raises instead of being
# silently index-aligned.
commit_df['predict_power_consumption'] = prediction.astype('int')
commit_df['predict_date'] = commit_df['predict_date'].apply(dataprocess)
commit_df.head()

	record_date	user_id	power_consumption
0	2015/1/1	1	1135
1	2015/1/2	1	570
2	2015/1/3	1	3418
3	2015/1/4	1	3968
4	2015/1/5	1	3986

	record_date	user_id	power_consumption
0	2015-01-01	1	1135
1	2015-01-02	1	570
2	2015-01-03	1	3418
3	2015-01-04	1	3968
4	2015-01-05	1	3986

	power_consumption	record_date	user_id	day_of_week	day_of_month	day_of_year	month_of_year	year	holiday	holiday_sat	holiday_sun	week_of_month	period_of_month	period2_of_month
0	1135.0	2015-01-01	1.0	3	1	1	1	2015	0	0	0	1	1	1
1	570.0	2015-01-02	1.0	4	2	2	1	2015	0	0	0	1	1	1
2	3418.0	2015-01-03	1.0	5	3	3	1	2015	1	1	0	1	1	1
3	3968.0	2015-01-04	1.0	6	4	4	1	2015	1	0	1	1	1	1
4	3986.0	2015-01-05	1.0	0	5	5	1	2015	0	0	0	1	1	1
5	4082.0	2015-01-06	1.0	1	6	6	1	2015	0	0	0	1	1	1
6	4172.0	2015-01-07	1.0	2	7	7	1	2015	0	0	0	1	1	1
7	4022.0	2015-01-08	1.0	3	8	8	1	2015	0	0	0	2	1	1
8	4025.0	2015-01-09	1.0	4	9	9	1	2015	0	0	0	2	1	1
9	4047.0	2015-01-10	1.0	5	10	10	1	2015	1	1	0	2	1	1
10	4135.0	2015-01-11	1.0	6	11	11	1	2015	1	0	1	2	2	1
11	4111.0	2015-01-12	1.0	0	12	12	1	2015	0	0	0	2	2	1
12	3926.0	2015-01-13	1.0	1	13	13	1	2015	0	0	0	2	2	1
13	4244.0	2015-01-14	1.0	2	14	14	1	2015	0	0	0	2	2	1
14	4144.0	2015-01-15	1.0	3	15	15	1	2015	0	0	0	3	2	1
15	4269.0	2015-01-16	1.0	4	16	16	1	2015	0	0	0	3	2	2
16	4262.0	2015-01-17	1.0	5	17	17	1	2015	1	1	0	3	2	2
17	2782.0	2015-01-18	1.0	6	18	18	1	2015	1	0	1	3	2	2
18	3327.0	2015-01-19	1.0	0	19	19	1	2015	0	0	0	3	2	2
19	4002.0	2015-01-20	1.0	1	20	20	1	2015	0	0	0	3	2	2

	predict_date	predict_power_consumption
0	20161001	3820886
1	20161002	3845830
2	20161003	3845830
3	20161004	3845830
4	20161005	3845830

机器学习处理流程、特征工程，模型设计实例

构造和时间相关的强特征

完整版代码

	user_id	day_of_week	day_of_month	day_of_year	month_of_year	holiday	holiday_sat	holiday_sun	week_of_month	period_of_month	period2_of_month
0	1.0	3	1	1	1	0	0	0	1	1	1
1	1.0	4	2	2	1	0	0	0	1	1	1
2	1.0	5	3	3	1	1	1	0	1	1	1
3	1.0	6	4	4	1	1	0	1	1	1	1
4	1.0	0	5	5	1	0	0	0	1	1	1

	user_id	day_of_week	day_of_month	day_of_year	month_of_year	holiday	holiday_sat	holiday_sun	week_of_month	period_of_month	period2_of_month
0	1.0	3	1	1	1	0	0	0	1	1	1
1	1.0	4	2	2	1	0	0	0	1	1	1
2	1.0	5	3	3	1	1	1	0	1	1	1
3	1.0	6	4	4	1	1	0	1	1	1	1
4	1.0	0	5	5	1	0	0	0	1	1	1

	user_id	day_of_week	day_of_month	day_of_year	month_of_year	holiday	holiday_sat	holiday_sun	week_of_month	period_of_month	period2_of_month
0	1.0	3	1	1	1	0	0	0	1	1	1
1	1.0	4	2	2	1	0	0	0	1	1	1
2	1.0	5	3	3	1	1	1	0	1	1	1
3	1.0	6	4	4	1	1	0	1	1	1	1
4	1.0	0	5	5	1	0	0	0	1	1	1