1. 载入数据
# Load the example dataset (10-dimensional Hastie data) and split it
# into equal-sized training and test halves.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

# Generate the 10-feature Hastie binary-classification dataset.
X, y = make_hastie_10_2(random_state=0)
# test_size is the fraction of samples held out for the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
默认GBDT参数
# Baseline: GBDT with default hyper-parameters, evaluated on the test split.
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

# Hard class predictions and positive-class probabilities.
y_pre = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)[:, 1]

auc_now = metrics.roc_auc_score(y_test, y_pro)
acc_now = metrics.accuracy_score(y_test, y_pre)
print ("AUC Score : %f" % auc_now)
print ("Accuracy : %.4g" % acc_now)
AUC Score : 0.974248
Accuracy : 0.8995
默认XGBoost参数
# Baseline: XGBoost with default hyper-parameters. Also start collecting
# the AUC/accuracy history so the improvement across tuning steps can be
# plotted at the end.
auc_Score = []
accuracy = []

clf = XGBClassifier()
clf.fit(X_train, y_train)
y_pre = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)[:, 1]

# Compute each metric once, print it, and record it in the history.
auc_now = metrics.roc_auc_score(y_test, y_pro)
acc_now = metrics.accuracy_score(y_test, y_pre)
print ("AUC Score : %f" % auc_now)
print ("Accuracy : %.4g" % acc_now)
auc_Score.append(auc_now)
accuracy.append(acc_now)
AUC Score : 0.972424
Accuracy : 0.8993
分步调整XGBoost参数
# A hand-picked starting configuration before the step-by-step grid
# search below; metrics are appended to the running history.
clf = XGBClassifier(
    learning_rate=0.1,            # default is 0.3
    n_estimators=100,             # number of boosted trees
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',  # logistic loss for binary classification
    nthread=4,                    # number of CPU threads
    scale_pos_weight=1,
    seed=27,                      # random seed
)
clf.fit(X_train, y_train)

y_pre = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)[:, 1]

auc_now = metrics.roc_auc_score(y_test, y_pro)
acc_now = metrics.accuracy_score(y_test, y_pre)
print ("AUC Score : %f" % auc_now)
print ("Accuracy : %.4g" % acc_now)
auc_Score.append(auc_now)
accuracy.append(acc_now)
AUC Score : 0.979435
Accuracy : 0.9177
第一步
第一步:初始学习速率0.1和tree_based参数调优的估计器数目100 给其他参数一个初始值。
- max_depth = 5 :默认6树的最大深度,这个参数的取值最好在3-10之间。
- min_child_weight = 1:默认是1决定最小叶子节点样本权重和。如果是一个极不平衡的分类问题,某些叶子节点下的值会比较小,这个值取小点。
- gamma = 0: 默认0,在0.1到0.2之间就可以。树的叶子节点上作进一步分裂所需的最小损失减少。这个参数后继也是要调整的。
- subsample, colsample_bytree = 0.8: 样本采样、列采样。典型值的范围在0.5-0.9之间。
- scale_pos_weight = 1:默认1,如果类别十分不平衡取较大正值。
# Step 1: with learning_rate fixed at 0.1, grid-search the number of trees.
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'n_estimators': [100, 200, 500, 1000]}]

clf = GridSearchCV(
    XGBClassifier(
        learning_rate=0.1,            # default is 0.3
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',  # logistic loss for binary classification
        nthread=-1,                   # -1: use all CPU threads
        scale_pos_weight=1,
        seed=27),                     # random seed
    param_grid=tuned_parameters,
    scoring='roc_auc',
    n_jobs=-1,
    # NOTE: the `iid` argument was deprecated in scikit-learn 0.22 and
    # removed in 0.24, so it is no longer passed here.
    cv=5)
clf.fit(X_train, y_train)

y_true, y_pred = y_test, clf.predict(X_test)
print ("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
y_proba = clf.predict_proba(X_test)[:, 1]
# This AUC is computed on the held-out test set; the old label
# incorrectly said "Train".
print ("AUC Score (Test): %f" % metrics.roc_auc_score(y_true, y_proba))
clf.cv_results_, clf.best_params_, clf.best_score_
Accuracy : 0.9418
AUC Score (Train): 0.989438
({'mean_fit_time': array([0.66427655, 1.32495356, 3.27263536, 6.16040602]),
'mean_score_time': array([0.01157823, 0.01986165, 0.04536343, 0.0873847 ]),
'mean_test_score': array([0.97632507, 0.98298858, 0.98678414, 0.98780433]),
'mean_train_score': array([0.99961959, 0.99999927, 1. , 1. ]),
'param_n_estimators': masked_array(data=[100, 200, 500, 1000],
mask=[False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'n_estimators': 100},
{'n_estimators': 200},
{'n_estimators': 500},
{'n_estimators': 1000}],
'rank_test_score': array([4, 3, 2, 1], dtype=int32),
'split0_test_score': array([0.97672398, 0.9852551 , 0.9880359 , 0.98898781]),
'split0_train_score': array([0.99961777, 0.99999791, 1. , 1. ]),
'split1_test_score': array([0.97553036, 0.98398993, 0.98780132, 0.98902452]),
'split1_train_score': array([0.99957971, 0.99999896, 1. , 1. ]),
'split2_test_score': array([0.97800178, 0.98263884, 0.98601099, 0.98623339]),
'split2_train_score': array([0.99957119, 0.99999948, 1. , 1. ]),
'split3_test_score': array([0.97199974, 0.97861895, 0.98379532, 0.98571909]),
'split3_train_score': array([0.9996572, 1. , 1. , 1. ]),
'split4_test_score': array([0.97936947, 0.98444009, 0.98827716, 0.98905683]),
'split4_train_score': array([0.9996721, 1. , 1. , 1. ]),
'std_fit_time': array([0.00671019, 0.00677931, 0.01163194, 0.61743938]),
'std_score_time': array([0.00057199, 0.00014971, 0.00039251, 0.00617809]),
'std_test_score': array([0.00251312, 0.00234362, 0.00169478, 0.00150162]),
'std_train_score': array([4.02685744e-05, 7.80430588e-07, 0.00000000e+00, 0.00000000e+00])},
{'n_estimators': 1000},
0.9878043288364371)
得到结论:
‘n_estimators’:[100,200,500,1000]
取1000最好
第二步
第二步: max_depth 和 min_child_weight 它们对最终结果有很大的影响
max_depth range(3,10,2)=[3, 5, 7, 9]
min_child_weight range(1,6,2)=[1, 3, 5]
max_depth=3 min_child_weight=1 最好
参数注释:
min_child_weight [default=1]
Defines the minimum sum of weights of all observations required in a child.
This is similar to min_child_leaf in GBM but not exactly. This refers to min “sum of weights” of observations while GBM has min “number of observations”.
Used to control over-fitting. Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree.
Too high values can lead to under-fitting hence, it should be tuned using CV.
max_depth [default=6]
The maximum depth of a tree, same as GBM.
Used to control over-fitting as higher depth will allow model to learn relations very specific to a particular sample.
Should be tuned using CV.
Typical values: 3-10
关于max_depth
XGBoost 的分裂方式是先分裂到指定的 max_depth 层再回头剪枝;而 GBDT 采用贪婪策略,当分裂增益小于 0 时即停止分裂。
Tree Pruning:
- A GBM would stop splitting a node when it encounters a negative loss in the split. Thus it is more of a greedy algorithm.
- XGBoost on the other hand make splits upto the ‘max_depth’ specified and then start pruning the tree backwards and remove splits beyond which there is no positive gain.
- Another advantage is that sometimes a split of negative loss say -2 may be followed by a split of positive loss +10. GBM would stop as it encounters -2. But XGBoost will go deeper and it will see a combined effect of +8 of the split and keep both.
# Step 2: tune max_depth and min_child_weight with n_estimators fixed at
# 1000 (best value from step 1). These two parameters have the largest
# impact on the result.
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'max_depth': range(3, 10, 2),
                     'min_child_weight': range(1, 6, 2)}]

clf = GridSearchCV(
    XGBClassifier(
        learning_rate=0.1,            # default is 0.3
        n_estimators=1000,            # best value found in step 1
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',  # logistic loss for binary classification
        nthread=-1,                   # -1: use all CPU threads
        scale_pos_weight=1,
        seed=27),                     # random seed
    param_grid=tuned_parameters,
    scoring='roc_auc',
    n_jobs=-1,
    # `iid` was removed from GridSearchCV in scikit-learn 0.24.
    cv=5)
clf.fit(X_train, y_train)

print (clf.best_score_)
y_true, y_pred = y_test, clf.predict(X_test)
print ("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
y_proba = clf.predict_proba(X_test)[:, 1]
# AUC on the held-out test set (label previously said "Train" incorrectly).
print ("AUC Score (Test): %f" % metrics.roc_auc_score(y_true, y_proba))
clf.cv_results_, clf.best_params_, clf.best_score_
0.9902421329970684
Accuracy : 0.947
AUC Score (Train): 0.991516
({'mean_fit_time': array([4.10024233, 4.06720438, 4.04949479, 6.56306758, 6.49959416,
6.23295298, 8.4661417 , 7.95352764, 7.71053686, 9.55492063,
8.61783104, 8.03259201]),
'mean_score_time': array([0.05688643, 0.05727029, 0.05646458, 0.09328356, 0.08852201,
0.08528223, 0.12413554, 0.11121621, 0.10461249, 0.14139628,
0.11906195, 0.1062572 ]),
'mean_test_score': array([0.99024213, 0.98919559, 0.98771132, 0.98780433, 0.98647996,
0.98359491, 0.98496889, 0.98328829, 0.98064576, 0.98405483,
0.98209103, 0.98019748]),
'mean_train_score': array([1. , 0.99999496, 0.99993488, 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. ]),
'param_max_depth': masked_array(data=[3, 3, 3, 5, 5, 5, 7, 7, 7, 9, 9, 9],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object),
'param_min_child_weight': masked_array(data=[1, 3, 5, 1, 3, 5, 1, 3, 5, 1, 3, 5],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'max_depth': 3, 'min_child_weight': 1},
{'max_depth': 3, 'min_child_weight': 3},
{'max_depth': 3, 'min_child_weight': 5},
{'max_depth': 5, 'min_child_weight': 1},
{'max_depth': 5, 'min_child_weight': 3},
{'max_depth': 5, 'min_child_weight': 5},
{'max_depth': 7, 'min_child_weight': 1},
{'max_depth': 7, 'min_child_weight': 3},
{'max_depth': 7, 'min_child_weight': 5},
{'max_depth': 9, 'min_child_weight': 1},
{'max_depth': 9, 'min_child_weight': 3},
{'max_depth': 9, 'min_child_weight': 5}],
'rank_test_score': array([ 1, 2, 4, 3, 5, 8, 6, 9, 11, 7, 10, 12], dtype=int32),
'split0_test_score': array([0.99074454, 0.98885182, 0.98746697, 0.98898781, 0.98708399,
0.98411447, 0.98669823, 0.98532171, 0.98176106, 0.9858268 ,
0.98367598, 0.98142248]),
'split0_train_score': array([1. , 0.99999444, 0.99994994, 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. ]),
'split1_test_score': array([0.99130135, 0.99077871, 0.98932754, 0.98902452, 0.98822944,
0.98553561, 0.98566905, 0.98522147, 0.9816992 , 0.98577191,
0.98380088, 0.98100142]),
'split1_train_score': array([1. , 0.99999548, 0.99993224, 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. ]),
'split2_test_score': array([0.98984741, 0.98898838, 0.98705906, 0.98623339, 0.98556619,
0.9820745 , 0.98445419, 0.98278062, 0.97925835, 0.9838815 ,
0.98177704, 0.97901927]),
'split2_train_score': array([1. , 0.99999218, 0.99992616, 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. ]),
'split3_test_score': array([0.98750108, 0.98627231, 0.98603323, 0.98571909, 0.98437913,
0.9821162 , 0.98257212, 0.98025081, 0.97922777, 0.98195496,
0.97892753, 0.97906097]),
'split3_train_score': array([1. , 1. , 0.99998245, 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. ]),
'split4_test_score': array([0.99181629, 0.99108674, 0.98866978, 0.98905683, 0.98714108,
0.98413379, 0.98545087, 0.98286683, 0.98128244, 0.98283899,
0.98227373, 0.98048328]),
'split4_train_score': array([1. , 0.99999271, 0.99988364, 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. ]),
'std_fit_time': array([0.01405666, 0.01141309, 0.01793645, 0.11919028, 0.15385108,
0.017553 , 0.00900605, 0.01608282, 0.05402459, 0.05556089,
0.04126622, 0.42052395]),
'std_score_time': array([0.00072866, 0.00059386, 0.00050668, 0.00325519, 0.00256326,
0.00031589, 0.00093075, 0.00050676, 0.00083621, 0.00106848,
0.00128343, 0.00827042]),
'std_test_score': array([0.00151799, 0.00171985, 0.00116893, 0.00150162, 0.00134989,
0.00132853, 0.00139446, 0.00187262, 0.00115712, 0.00154958,
0.00176496, 0.0009908 ]),
'std_train_score': array([0.00000000e+00, 2.78389460e-06, 3.22451414e-05, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])},
{'max_depth': 3, 'min_child_weight': 1},
0.9902421329970684)
结果表明:
max_depth=3 min_child_weight=1 最好
第三步
第三步:gamma参数调优
‘gamma’:[i/10.0 for i in range(0,7)]=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
gamma [default=0, alias: min_split_loss]
Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
range: [0,∞]
# Step 3: tune gamma (min_split_loss) with max_depth=3 and
# min_child_weight=1 fixed from step 2.
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'gamma': [i / 10.0 for i in range(0, 7)]}]

clf = GridSearchCV(
    XGBClassifier(
        learning_rate=0.1,            # default is 0.3
        n_estimators=1000,            # best value found in step 1
        max_depth=3,                  # best value found in step 2
        min_child_weight=1,           # best value found in step 2
        gamma=0,                      # overridden by the grid below
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',  # logistic loss for binary classification
        nthread=-1,                   # -1: use all CPU threads
        scale_pos_weight=1,
        seed=27),                     # random seed
    param_grid=tuned_parameters,
    scoring='roc_auc',
    n_jobs=-1,
    # `iid` was removed from GridSearchCV in scikit-learn 0.24.
    cv=5)
clf.fit(X_train, y_train)

y_true, y_pred = y_test, clf.predict(X_test)
print ("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
y_proba = clf.predict_proba(X_test)[:, 1]
# AUC on the held-out test set (label previously said "Train" incorrectly).
print ("AUC Score (Test): %f" % metrics.roc_auc_score(y_true, y_proba))
clf.cv_results_, clf.best_params_, clf.best_score_
Accuracy : 0.9523
AUC Score (Train): 0.992072
({'mean_fit_time': array([4.08784242, 4.13580532, 4.15678062, 4.104004 , 4.10653367,
4.11219459, 3.84211726]),
'mean_score_time': array([0.05971794, 0.05829625, 0.05696807, 0.05853791, 0.05707612,
0.05631042, 0.05412021]),
'mean_test_score': array([0.99024213, 0.99031935, 0.99040982, 0.99055717, 0.99063991,
0.9907346 , 0.99117328]),
'mean_train_score': array([1. , 0.99999997, 1. , 1. , 0.99999986,
0.99999983, 0.99999986]),
'param_gamma': masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
mask=[False, False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'gamma': 0.0},
{'gamma': 0.1},
{'gamma': 0.2},
{'gamma': 0.3},
{'gamma': 0.4},
{'gamma': 0.5},
{'gamma': 0.6}],
'rank_test_score': array([7, 6, 5, 4, 3, 2, 1], dtype=int32),
'split0_test_score': array([0.99074454, 0.99088053, 0.99110255, 0.99115806, 0.99075564,
0.99153827, 0.99167425]),
'split0_train_score': array([1. , 1. , 1. , 1. , 0.99999965,
0.99999965, 0.99999983]),
'split1_test_score': array([0.99130135, 0.99144591, 0.99225489, 0.99169611, 0.99125965,
0.99162661, 0.99260795]),
'split1_train_score': array([1., 1., 1., 1., 1., 1., 1.]),
'split2_test_score': array([0.98984741, 0.99023383, 0.98963056, 0.99024773, 0.99030889,
0.99002255, 0.99027553]),
'split2_train_score': array([1. , 0.99999983, 1. , 1. , 0.99999965,
0.99999948, 0.99999948]),
'split3_test_score': array([0.98750108, 0.98727034, 0.98756502, 0.98808488, 0.98832118,
0.98830728, 0.98900506]),
'split3_train_score': array([1., 1., 1., 1., 1., 1., 1.]),
'split4_test_score': array([0.99181629, 0.99176617, 0.99149607, 0.9915991 , 0.99255419,
0.99217828, 0.99230358]),
'split4_train_score': array([1., 1., 1., 1., 1., 1., 1.]),
'std_fit_time': array([0.01378078, 0.04815563, 0.04390025, 0.00805336, 0.01261062,
0.01242839, 0.53625721]),
'std_score_time': array([0.00285409, 0.00224898, 0.00153852, 0.00176098, 0.00165367,
0.0002984 , 0.00455792]),
'std_test_score': array([0.00151799, 0.00161128, 0.00165917, 0.00133805, 0.00138188,
0.00140888, 0.00134863]),
'std_train_score': array([0.00000000e+00, 6.94986075e-08, 0.00000000e+00, 0.00000000e+00,
1.70272633e-07, 2.19797461e-07, 2.02624076e-07])},
{'gamma': 0.6},
0.991173275197764)
结果显示:
gamma=0.6 最好
第四步
第四步:调整subsample 和 colsample_bytree 参数
‘subsample’:[i/10.0 for i in range(6,10)]=[0.6, 0.7, 0.8, 0.9]
‘colsample_bytree’:[i/10.0 for i in range(6,10)]=[0.6, 0.7, 0.8, 0.9]
‘subsample’: 0.6, ‘colsample_bytree’: 0.6 最好
# Step 4: tune subsample and colsample_bytree.
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'subsample': [i / 10.0 for i in range(6, 10)],
                     'colsample_bytree': [i / 10.0 for i in range(6, 10)]}]

clf = GridSearchCV(
    XGBClassifier(
        learning_rate=0.1,            # default is 0.3
        n_estimators=1000,            # best value found in step 1
        max_depth=3,                  # best value found in step 2
        min_child_weight=1,           # best value found in step 2
        # BUGFIX: step 3 concluded gamma=0.6 (and step 5 already uses it),
        # but this step previously reverted to gamma=0 — carry it forward.
        gamma=0.6,
        subsample=0.8,                # overridden by the grid below
        colsample_bytree=0.8,         # overridden by the grid below
        objective='binary:logistic',  # logistic loss for binary classification
        nthread=-1,                   # -1: use all CPU threads
        scale_pos_weight=1,
        seed=27),                     # random seed
    param_grid=tuned_parameters,
    scoring='roc_auc',
    n_jobs=-1,
    # `iid` was removed from GridSearchCV in scikit-learn 0.24.
    cv=5)
clf.fit(X_train, y_train)

y_true, y_pred = y_test, clf.predict(X_test)
print ("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
y_proba = clf.predict_proba(X_test)[:, 1]
# AUC on the held-out test set (label previously said "Train" incorrectly).
print ("AUC Score (Test): %f" % metrics.roc_auc_score(y_true, y_proba))
clf.cv_results_, clf.best_params_, clf.best_score_
Accuracy : 0.9522
AUC Score (Train): 0.992397
({'mean_fit_time': array([3.34788318, 3.38439374, 3.43055534, 3.47929316, 3.63930459,
3.88183041, 4.04542418, 3.82146721, 3.95433326, 4.02437754,
4.08732462, 4.19291725, 4.27289486, 4.355165 , 4.43081102,
4.49650884]),
'mean_score_time': array([0.05811257, 0.05756421, 0.05767741, 0.05814404, 0.05732098,
0.06595297, 0.05695639, 0.0562304 , 0.05646 , 0.05626922,
0.05605416, 0.05591116, 0.05628901, 0.05641556, 0.05595517,
0.05382738]),
'mean_test_score': array([0.99136929, 0.99111036, 0.9908039 , 0.98996486, 0.99109902,
0.99112067, 0.99051957, 0.9901156 , 0.99119806, 0.99071939,
0.99024213, 0.98978984, 0.9909985 , 0.99081676, 0.99009991,
0.98948751]),
'mean_train_score': array([0.99999969, 0.99999993, 0.99999997, 1. , 0.99999927,
0.99999993, 0.99999997, 1. , 0.9999999 , 0.99999972,
1. , 1. , 0.99999976, 0.9999999 , 0.99999997,
1. ]),
'param_colsample_bytree': masked_array(data=[0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8,
0.8, 0.9, 0.9, 0.9, 0.9],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object),
'param_subsample': masked_array(data=[0.6, 0.7, 0.8, 0.9, 0.6, 0.7, 0.8, 0.9, 0.6, 0.7, 0.8,
0.9, 0.6, 0.7, 0.8, 0.9],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'colsample_bytree': 0.6, 'subsample': 0.6},
{'colsample_bytree': 0.6, 'subsample': 0.7},
{'colsample_bytree': 0.6, 'subsample': 0.8},
{'colsample_bytree': 0.6, 'subsample': 0.9},
{'colsample_bytree': 0.7, 'subsample': 0.6},
{'colsample_bytree': 0.7, 'subsample': 0.7},
{'colsample_bytree': 0.7, 'subsample': 0.8},
{'colsample_bytree': 0.7, 'subsample': 0.9},
{'colsample_bytree': 0.8, 'subsample': 0.6},
{'colsample_bytree': 0.8, 'subsample': 0.7},
{'colsample_bytree': 0.8, 'subsample': 0.8},
{'colsample_bytree': 0.8, 'subsample': 0.9},
{'colsample_bytree': 0.9, 'subsample': 0.6},
{'colsample_bytree': 0.9, 'subsample': 0.7},
{'colsample_bytree': 0.9, 'subsample': 0.8},
{'colsample_bytree': 0.9, 'subsample': 0.9}],
'rank_test_score': array([ 1, 4, 8, 14, 5, 3, 10, 12, 2, 9, 11, 15, 6, 7, 13, 16],
dtype=int32),
'split0_test_score': array([0.99129959, 0.99074732, 0.99070847, 0.99041151, 0.99070569,
0.99139395, 0.99065019, 0.99108035, 0.99209331, 0.99106925,
0.99074454, 0.99054195, 0.99181857, 0.99077507, 0.99045869,
0.99015897]),
'split0_train_score': array([0.99999965, 0.99999983, 1. , 1. , 0.99999791,
0.99999983, 0.99999983, 1. , 0.99999983, 0.99999965,
1. , 1. , 0.99999965, 1. , 1. ,
1. ]),
'split1_test_score': array([0.9934086 , 0.99278866, 0.99211589, 0.99141255, 0.99296936,
0.99212979, 0.99183233, 0.99145703, 0.9932001 , 0.99155433,
0.99130135, 0.99156545, 0.99230493, 0.99214369, 0.99101223,
0.99114289]),
'split1_train_score': array([1. , 1. , 1. , 1. , 0.99999965,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. ]),
'split2_test_score': array([0.99075647, 0.99052017, 0.98993359, 0.98851022, 0.99112621,
0.99057021, 0.98995305, 0.98937758, 0.99011151, 0.99037283,
0.98984741, 0.9892775 , 0.99077871, 0.99054519, 0.98934422,
0.98898282]),
'split2_train_score': array([0.99999913, 0.99999983, 0.99999983, 1. , 0.99999983,
0.99999983, 1. , 1. , 0.99999983, 0.99999948,
1. , 1. , 0.99999913, 0.99999965, 0.99999983,
1. ]),
'split3_test_score': array([0.98852968, 0.98872984, 0.9888605 , 0.98749552, 0.98865756,
0.98883548, 0.98847964, 0.98747328, 0.9885547 , 0.98779576,
0.98750108, 0.98678939, 0.98822944, 0.9879153 , 0.98782078,
0.98633903]),
'split3_train_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
'split4_test_score': array([0.99285213, 0.99276581, 0.99240104, 0.9919945 , 0.99203627,
0.99267392, 0.99168263, 0.99118977, 0.9920307 , 0.99280479,
0.99181629, 0.99077488, 0.99186084, 0.99270455, 0.99186363,
0.99081386]),
'split4_train_score': array([0.99999965, 1. , 1. , 1. , 0.99999896,
1. , 1. , 1. , 0.99999983, 0.99999948,
1. , 1. , 1. , 0.99999983, 1. ,
1. ]),
'std_fit_time': array([0.01738095, 0.00967988, 0.0106058 , 0.00466171, 0.00650665,
0.12905311, 0.35069167, 0.00833264, 0.01511322, 0.01169423,
0.01006915, 0.055133 , 0.0247075 , 0.00591088, 0.00529854,
0.05425587]),
'std_score_time': array([0.00089941, 0.00099629, 0.00192596, 0.00216851, 0.00125375,
0.00643969, 0.00091569, 0.00029168, 0.00032408, 0.00030256,
0.00036829, 0.00025315, 0.00021594, 0.00022433, 0.00010953,
0.00453534]),
'std_test_score': array([0.00172055, 0.00152998, 0.00132782, 0.00171046, 0.00144864,
0.00134393, 0.00123088, 0.00151067, 0.00165353, 0.0016635 ,
0.00151799, 0.00167056, 0.00147234, 0.00166244, 0.00140266,
0.00173885]),
'std_train_score': array([3.18482822e-07, 8.51363164e-08, 6.94986075e-08, 0.00000000e+00,
7.64767179e-07, 8.51363164e-08, 6.95284097e-08, 0.00000000e+00,
8.51187459e-08, 2.35646463e-07, 0.00000000e+00, 0.00000000e+00,
3.40481383e-07, 1.38990178e-07, 6.94986075e-08, 0.00000000e+00])},
{'colsample_bytree': 0.6, 'subsample': 0.6},
0.9913692942072346)
第五步
第五步:正则化参数调优
‘reg_alpha’:[1e-5, 1e-2, 0.1, 1, 100]=[1e-05, 0.01, 0.1, 1, 100] 默认0 L1正则项参数,参数值越大,模型越不容易过拟合
‘reg_lambda’:[1,5,10,50] 默认1,L2正则项参数,参数值越大,模型越不容易过拟合
{‘reg_alpha’: 1e-05, ‘reg_lambda’: 1} 正则变化不大
lambda [default=1, alias: reg_lambda]
L2 regularization term on weights. Increasing this value will make model more conservative.
alpha [default=0, alias: reg_alpha]
L1 regularization term on weights. Increasing this value will make model more conservative.
# Step 5: tune the L1 (reg_alpha) and L2 (reg_lambda) regularization
# terms, with all parameters found in steps 1-4 fixed.
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
                     'reg_lambda': [1, 5, 10, 50]}]

clf = GridSearchCV(
    XGBClassifier(
        learning_rate=0.1,            # default is 0.3
        n_estimators=1000,            # best value found in step 1
        max_depth=3,                  # best value found in step 2
        min_child_weight=1,           # best value found in step 2
        gamma=0.6,                    # best value found in step 3
        subsample=0.6,                # best value found in step 4
        colsample_bytree=0.6,         # best value found in step 4
        objective='binary:logistic',  # logistic loss for binary classification
        nthread=-1,                   # -1: use all CPU threads
        scale_pos_weight=1,
        seed=27),                     # random seed
    param_grid=tuned_parameters,
    scoring='roc_auc',
    n_jobs=-1,
    # `iid` was removed from GridSearchCV in scikit-learn 0.24.
    cv=5)
clf.fit(X_train, y_train)

y_true, y_pred = y_test, clf.predict(X_test)
print ("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
y_proba = clf.predict_proba(X_test)[:, 1]
# AUC on the held-out test set (label previously said "Train" incorrectly).
print ("AUC Score (Test): %f" % metrics.roc_auc_score(y_true, y_proba))
clf.cv_results_, clf.best_params_, clf.best_score_
Accuracy : 0.9555
AUC Score (Train): 0.992868
({'mean_fit_time': array([3.57523336, 3.59076009, 3.60481343, 3.62930899, 3.58253379,
3.60277076, 3.62260051, 3.64259949, 3.60844054, 3.62169762,
3.63268323, 3.64931197, 3.65102282, 3.65047541, 3.66717477,
3.66272306, 1.77348022, 1.75979638, 1.75771828, 1.75320315]),
'mean_score_time': array([0.05806079, 0.05884867, 0.05991797, 0.05784712, 0.05824041,
0.05812812, 0.05788779, 0.05805917, 0.05803089, 0.05853901,
0.0578908 , 0.05916243, 0.0581624 , 0.06054859, 0.06007977,
0.06048918, 0.00946679, 0.0094245 , 0.00934477, 0.00884004]),
'mean_test_score': array([0.9922044 , 0.9903245 , 0.98835779, 0.97808582, 0.99241992,
0.99042686, 0.98836973, 0.97806353, 0.99218883, 0.99016637,
0.98819365, 0.97793884, 0.9912895 , 0.98909203, 0.98694434,
0.97694493, 0.77610178, 0.77795469, 0.77577765, 0.771155 ]),
'mean_train_score': array([0.9999984 , 0.99984488, 0.99936388, 0.99399755, 0.99999805,
0.99984829, 0.99934877, 0.99398011, 0.9999968 , 0.99981886,
0.99931413, 0.99392284, 0.9999297 , 0.99954614, 0.99884219,
0.99307182, 0.8136743 , 0.81547369, 0.81336072, 0.80796742]),
'param_reg_alpha': masked_array(data=[1e-05, 1e-05, 1e-05, 1e-05, 0.01, 0.01, 0.01, 0.01,
0.1, 0.1, 0.1, 0.1, 1, 1, 1, 1, 100, 100, 100, 100],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object),
'param_reg_lambda': masked_array(data=[1, 5, 10, 50, 1, 5, 10, 50, 1, 5, 10, 50, 1, 5, 10, 50,
1, 5, 10, 50],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'reg_alpha': 1e-05, 'reg_lambda': 1},
{'reg_alpha': 1e-05, 'reg_lambda': 5},
{'reg_alpha': 1e-05, 'reg_lambda': 10},
{'reg_alpha': 1e-05, 'reg_lambda': 50},
{'reg_alpha': 0.01, 'reg_lambda': 1},
{'reg_alpha': 0.01, 'reg_lambda': 5},
{'reg_alpha': 0.01, 'reg_lambda': 10},
{'reg_alpha': 0.01, 'reg_lambda': 50},
{'reg_alpha': 0.1, 'reg_lambda': 1},
{'reg_alpha': 0.1, 'reg_lambda': 5},
{'reg_alpha': 0.1, 'reg_lambda': 10},
{'reg_alpha': 0.1, 'reg_lambda': 50},
{'reg_alpha': 1, 'reg_lambda': 1},
{'reg_alpha': 1, 'reg_lambda': 5},
{'reg_alpha': 1, 'reg_lambda': 10},
{'reg_alpha': 1, 'reg_lambda': 50},
{'reg_alpha': 100, 'reg_lambda': 1},
{'reg_alpha': 100, 'reg_lambda': 5},
{'reg_alpha': 100, 'reg_lambda': 10},
{'reg_alpha': 100, 'reg_lambda': 50}],
'rank_test_score': array([ 2, 6, 10, 13, 1, 5, 9, 14, 3, 7, 11, 15, 4, 8, 12, 16, 18,
17, 19, 20], dtype=int32),
'split0_test_score': array([0.99215715, 0.98956784, 0.98751693, 0.9765991 , 0.9927122 ,
0.98962889, 0.98758354, 0.97685442, 0.99207111, 0.98978708,
0.98710897, 0.97685442, 0.9910526 , 0.9880248 , 0.98550487,
0.97539464, 0.77952865, 0.77970072, 0.77240181, 0.77170384]),
'split0_train_score': array([0.9999993 , 0.99983748, 0.99931393, 0.99406453, 0.99999652,
0.99982044, 0.99930367, 0.99414953, 0.99999774, 0.99982183,
0.9992743 , 0.99398405, 0.99991848, 0.99953381, 0.99881385,
0.99310522, 0.80560326, 0.80571511, 0.8003616 , 0.80160216]),
'split1_test_score': array([0.9937144 , 0.99240223, 0.99045067, 0.97955303, 0.99358096,
0.99241335, 0.98976957, 0.97915271, 0.99382004, 0.99209921,
0.99020881, 0.97911657, 0.99311948, 0.99100945, 0.98882992,
0.97847717, 0.7595959 , 0.75975436, 0.75550789, 0.75297948]),
'split1_train_score': array([0.99999913, 0.99984398, 0.99925324, 0.99367511, 0.99999826,
0.99985127, 0.99930571, 0.993607 , 0.99999792, 0.9998332 ,
0.99927652, 0.99360005, 0.9999477 , 0.99955417, 0.9986896 ,
0.9927327 , 0.80173568, 0.80186468, 0.79715016, 0.79497146]),
'split2_test_score': array([0.99179063, 0.9891246 , 0.98675325, 0.97687866, 0.99204917,
0.98942762, 0.98674491, 0.97722616, 0.99178229, 0.98881602,
0.98702848, 0.97757922, 0.99019769, 0.9876651 , 0.98522703,
0.97614752, 0.79911373, 0.79913597, 0.79869534, 0.79015376]),
'split2_train_score': array([0.99999809, 0.99979724, 0.99940683, 0.99422901, 0.99999844,
0.99982556, 0.99935679, 0.99423266, 0.99999583, 0.99978108,
0.99931457, 0.99425994, 0.9998961 , 0.99944088, 0.99893563,
0.99339207, 0.82645989, 0.82642097, 0.82371174, 0.81447825]),
'split3_test_score': array([0.98964168, 0.98971118, 0.98882158, 0.98019243, 0.99006703,
0.98960832, 0.98863532, 0.97977543, 0.98964446, 0.98978903,
0.98834064, 0.97928337, 0.99037561, 0.9890134 , 0.98762896,
0.97871625, 0.79577494, 0.7958361 , 0.79782659, 0.79339247]),
'split3_train_score': array([1. , 0.99994457, 0.99959222, 0.99433847, 1. ,
0.99994979, 0.99958353, 0.99424882, 1. , 0.99990461,
0.99956233, 0.99419235, 0.99997203, 0.99970255, 0.99910329,
0.9933495 , 0.83876636, 0.83876314, 0.84353457, 0.83296383]),
'split4_test_score': array([0.99371812, 0.99081664, 0.98824653, 0.9772059 , 0.99369027,
0.99105611, 0.9891153 , 0.97730893, 0.99362623, 0.99034049,
0.98828134, 0.97686062, 0.99170212, 0.98974739, 0.98753091,
0.97598906, 0.74649568, 0.75534628, 0.75445663, 0.74754544]),
'split4_train_score': array([0.99999548, 0.99980114, 0.99925319, 0.99368062, 0.99999705,
0.99979437, 0.99919414, 0.99366256, 0.99999253, 0.99975355,
0.99914291, 0.9935778 , 0.9999142 , 0.99949929, 0.9986686 ,
0.99277959, 0.79580629, 0.80460455, 0.80204552, 0.7958214 ]),
'std_fit_time': array([0.01191243, 0.01325173, 0.00659292, 0.00438262, 0.01042653,
0.01166532, 0.02686689, 0.0092751 , 0.00576707, 0.00575164,
0.00630358, 0.00739558, 0.00739137, 0.00888037, 0.01325656,
0.01174046, 0.01253897, 0.01083901, 0.01408579, 0.03089126]),
'std_score_time': array([0.00063335, 0.00045964, 0.00207908, 0.00077044, 0.00090627,
0.00138421, 0.00118615, 0.000901 , 0.00116144, 0.00164232,
0.00092595, 0.000837 , 0.00206178, 0.00349466, 0.0042768 ,
0.00541115, 0.00036815, 0.0002026 , 0.00013465, 0.00142456]),
'std_test_score': array([0.00150415, 0.00117878, 0.00125612, 0.00148542, 0.00132067,
0.00115303, 0.00108112, 0.00117042, 0.00150863, 0.00108406,
0.00115103, 0.0010642 , 0.00105853, 0.00120725, 0.00137036,
0.00137392, 0.0203828 , 0.0179657 , 0.01943307, 0.01867362]),
'std_train_score': array([1.58181542e-06, 5.32495526e-05, 1.27256443e-04, 2.75209409e-04,
1.21093013e-06, 5.38709812e-05, 1.28820776e-04, 2.84513597e-04,
2.51096229e-06, 5.14985044e-05, 1.37056730e-04, 2.87492944e-04,
2.68726271e-05, 8.71239048e-05, 1.61942019e-04, 2.76110081e-04,
1.62482048e-02, 1.45658140e-02, 1.77617460e-02, 1.43118733e-02])},
{'reg_alpha': 0.01, 'reg_lambda': 1},
0.9924199248863191)
第六步
第6步:进一步 降低学习速率 增加更多的树
‘learning_rate’:[0.01,0.1,0.3]
‘learning_rate’: 0.1 不变
‘n_estimators’:[1000,1200,1500,2000,2500]
‘n_estimators’: 1500 较好
# Step 6: jointly re-tune the learning rate and the number of trees
# (lower rate generally needs more trees), everything else fixed.
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'learning_rate': [0.01, 0.1, 0.3],
                     'n_estimators': [1000, 1200, 1500, 2000, 2500]}]

clf = GridSearchCV(
    XGBClassifier(
        learning_rate=0.1,            # overridden by the grid below
        n_estimators=1000,            # overridden by the grid below
        max_depth=3,                  # best value found in step 2
        min_child_weight=1,           # best value found in step 2
        gamma=0.6,                    # best value found in step 3
        subsample=0.6,                # best value found in step 4
        colsample_bytree=0.6,         # best value found in step 4
        objective='binary:logistic',  # logistic loss for binary classification
        nthread=-1,                   # -1: use all CPU threads
        scale_pos_weight=1,
        reg_alpha=1e-05,              # best value found in step 5
        reg_lambda=1,                 # best value found in step 5
        seed=27),                     # random seed
    param_grid=tuned_parameters,
    scoring='roc_auc',
    n_jobs=-1,
    # `iid` was removed from GridSearchCV in scikit-learn 0.24.
    cv=5)
clf.fit(X_train, y_train)

y_true, y_pred = y_test, clf.predict(X_test)
print ("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
y_proba = clf.predict_proba(X_test)[:, 1]
# AUC on the held-out test set (label previously said "Train" incorrectly).
print ("AUC Score (Test): %f" % metrics.roc_auc_score(y_true, y_proba))
clf.cv_results_, clf.best_params_, clf.best_score_
Accuracy : 0.9538
AUC Score (Train): 0.993026
({'mean_fit_time': array([3.57008743, 4.77174163, 5.36605616, 7.1552011 , 8.94790907,
3.59686456, 4.31058722, 5.36648741, 7.13045301, 8.86648717,
3.56040711, 4.24546318, 5.31230602, 7.04101019, 8.50523062]),
'mean_score_time': array([0.05733538, 0.07500162, 0.08352704, 0.11490374, 0.15462112,
0.05688457, 0.0685442 , 0.08289218, 0.10480108, 0.12060022,
0.04361629, 0.04652081, 0.0500608 , 0.05485482, 0.056601 ]),
'mean_test_score': array([0.97739301, 0.98025168, 0.98320005, 0.98611754, 0.987942 ,
0.9922044 , 0.9925075 , 0.99244789, 0.99224707, 0.99213971,
0.99118297, 0.99093487, 0.99091936, 0.99061469, 0.99058739]),
'mean_train_score': array([0.99421428, 0.99584249, 0.99740946, 0.99872319, 0.9993335 ,
0.9999984 , 0.99999948, 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ]),
'param_learning_rate': masked_array(data=[0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1,
0.3, 0.3, 0.3, 0.3, 0.3],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False],
fill_value='?',
dtype=object),
'param_n_estimators': masked_array(data=[1000, 1200, 1500, 2000, 2500, 1000, 1200, 1500, 2000,
2500, 1000, 1200, 1500, 2000, 2500],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'learning_rate': 0.01, 'n_estimators': 1000},
{'learning_rate': 0.01, 'n_estimators': 1200},
{'learning_rate': 0.01, 'n_estimators': 1500},
{'learning_rate': 0.01, 'n_estimators': 2000},
{'learning_rate': 0.01, 'n_estimators': 2500},
{'learning_rate': 0.1, 'n_estimators': 1000},
{'learning_rate': 0.1, 'n_estimators': 1200},
{'learning_rate': 0.1, 'n_estimators': 1500},
{'learning_rate': 0.1, 'n_estimators': 2000},
{'learning_rate': 0.1, 'n_estimators': 2500},
{'learning_rate': 0.3, 'n_estimators': 1000},
{'learning_rate': 0.3, 'n_estimators': 1200},
{'learning_rate': 0.3, 'n_estimators': 1500},
{'learning_rate': 0.3, 'n_estimators': 2000},
{'learning_rate': 0.3, 'n_estimators': 2500}],
'rank_test_score': array([15, 14, 13, 12, 11, 4, 1, 2, 3, 5, 6, 7, 8, 9, 10],
dtype=int32),
'split0_test_score': array([0.9771264 , 0.97916065, 0.98176661, 0.9849415 , 0.98700351,
0.99215715, 0.99230146, 0.99243745, 0.99235141, 0.99238194,
0.99104705, 0.99111643, 0.99103039, 0.99066684, 0.99062521]),
'split0_train_score': array([0.99420811, 0.99570905, 0.99736487, 0.99865723, 0.9993254 ,
0.9999993 , 0.99999965, 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ]),
'split1_test_score': array([0.97502161, 0.97882745, 0.98241088, 0.98607771, 0.98859084,
0.9937144 , 0.9939229 , 0.993909 , 0.99381726, 0.99369772,
0.99299994, 0.99264409, 0.99277754, 0.99248285, 0.99235219]),
'split1_train_score': array([0.99364731, 0.99556529, 0.99717784, 0.9986585 , 0.99931266,
0.99999913, 0.99999948, 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ]),
'split2_test_score': array([0.98010069, 0.9825749 , 0.98531321, 0.9875539 , 0.98890498,
0.99179063, 0.99205195, 0.99213257, 0.99159325, 0.99136807,
0.9897501 , 0.98914406, 0.98908012, 0.98874096, 0.98881324]),
'split2_train_score': array([0.99436401, 0.99585667, 0.99737764, 0.9987485 , 0.99934324,
0.99999809, 0.99999965, 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ]),
'split3_test_score': array([0.97363717, 0.97665626, 0.98002007, 0.98343948, 0.98548557,
0.98964168, 0.99011985, 0.98986687, 0.98997807, 0.98988355,
0.98973898, 0.98967226, 0.98953882, 0.98934422, 0.98953048]),
'split3_train_score': array([0.9945772 , 0.99614804, 0.99769074, 0.99880306, 0.99934741,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ]),
'split4_test_score': array([0.98107917, 0.98403912, 0.9864895 , 0.9885751 , 0.98972511,
0.99371812, 0.99414136, 0.99389354, 0.99349536, 0.99336727,
0.99237876, 0.99209753, 0.99216992, 0.99183856, 0.9916158 ]),
'split4_train_score': array([0.99427477, 0.99593337, 0.99743619, 0.99874866, 0.99933882,
0.99999548, 0.99999861, 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ]),
'std_fit_time': array([0.0058678 , 0.29934599, 0.00770195, 0.00409295, 0.02017516,
0.01213609, 0.00927474, 0.0122758 , 0.01989517, 0.02055084,
0.01329442, 0.00936171, 0.0452048 , 0.01752624, 0.51945848]),
'std_score_time': array([0.00072668, 0.00700603, 0.00047122, 0.00046071, 0.00244143,
0.00050282, 0.00184463, 0.00043078, 0.00072381, 0.00114743,
0.00072074, 0.00056753, 0.00059179, 0.00088418, 0.00557539]),
'std_test_score': array([0.00285378, 0.0026791 , 0.00236963, 0.00182534, 0.00151279,
0.00150415, 0.00145765, 0.00148229, 0.0013864 , 0.00139131,
0.00133326, 0.00134962, 0.0014365 , 0.00142236, 0.0012991 ]),
'std_train_score': array([3.09607635e-04, 1.98224239e-04, 1.65215343e-04, 5.69275286e-05,
1.27786744e-05, 1.58181542e-06, 4.65989847e-07, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00])},
{'learning_rate': 0.1, 'n_estimators': 1200},
0.9925075043107296)
绘图查看auc与准确率的变化情况
# Plot how AUC and accuracy evolved across the recorded model versions.
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(15, 5))

# Left panel: AUC history.
ax_auc = fig.add_subplot(1, 2, 1)
ax_auc.plot(auc_Score)
ax_auc.set_ylabel("AUC Score")
ax_auc.set_title("AUC Score")

# Right panel: accuracy history.
ax_acc = fig.add_subplot(1, 2, 2)
ax_acc.plot(accuracy)
ax_acc.set_ylabel("Accuracy")
ax_acc.set_title("Accuracy")

plt.show()