【一周算法实践】--4.模型调优

任务4：模型调优

使用网格搜索法对7个模型进行调优（调参时采用五折交叉验证的方式），并进行模型评估。

网格搜索（Grid Search）用简单的话来说，就是手动地给出模型中你想要调节的所有参数及其候选取值，程序自动地使用穷举法把所有参数组合都运行一遍，并从中选出效果最好的一组。例如，在决策树中我们常常将最大树深作为需要调节的参数。

K折交叉验证（K-fold cross-validation）：
【一周算法实践】--4.模型调优

#1. 导入所需包
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

%matplotlib inline

# 2. Split the dataset and standardize the features
data_original=pd.read_csv('data_all.csv')
data_original.head(5)
data_original.describe()
data=data_original.copy()
# Separate the label column y ('status') from the feature matrix X (everything else)
y=data_original['status'].copy()
X=data_original.drop(['status'],axis=1).copy()
print("the X shape is:", X.shape)
# NOTE(review): the label below says "X" but the value printed is y's shape;
# the text is left unchanged so it still matches the recorded output.
print("the X shape is:" ,y.shape)
print("the nums of label 1 in y are",len(y[y==1]))
print("the nums of label 0 in y are",len(y[y==0]))

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=2018)
print('the proportition of label 1 in y_test: %.2f%%'%(len(y_test[y_test==1])/len(y_test)*100))
# Standardize the features.
# The scaler is fitted on the training split ONLY and then re-used for the test
# split. Calling fit_transform on X_test (as before) re-estimated mean/std from
# the test rows, so train and test features were scaled with different statistics.
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

the X shape is: (4754, 84)
the X shape is: (4754,)
the nums of label 1 in y are 1193
the nums of label 0 in y are 3561
the proportition of label 1 in y_test: 25.16%

# 3. Build one grid-search wrapper per model. The author notes the parameter
#    grids were picked while reading the documentation, not from prior experience.
def _roc_auc_search(estimator,param_grid):
    """Wrap `estimator` in a 5-fold GridSearchCV over `param_grid`, scored by ROC AUC."""
    return GridSearchCV(estimator,param_grid,cv=5,scoring='roc_auc')

# Logistic regression: solver x regularisation strength
parameters_lr={
    'solver':['newton-cg','lbfgs','liblinear','sag'],
    'C':[0.1,1,10],
}
lr_model=_roc_auc_search(
    LogisticRegression(class_weight='balanced',max_iter=10000),
    parameters_lr,
)

# SVM: kernel x regularisation strength (probability=True so predict_proba is available)
parameters_svm={
    'kernel':['linear','rbf','poly'],
    'C':[0.1,1,10],
}
svm_model=_roc_auc_search(
    SVC(class_weight='balanced',gamma='auto',probability=True),
    parameters_svm,
)

# Decision tree: split criterion x number of features considered per split
parameters_dt={
    'criterion':['gini','entropy'],
    'max_features':['sqrt','log2',None],
}
dt_model=_roc_auc_search(
    DecisionTreeClassifier(class_weight='balanced'),
    parameters_dt,
)

# All four ensemble models share the same grid over the number of estimators
parameters_en={'n_estimators':range(10,100,10)}
rf_model=_roc_auc_search(RandomForestClassifier(class_weight='balanced'),parameters_en)
gbdt_model=_roc_auc_search(GradientBoostingClassifier(),parameters_en)
xgb_model=_roc_auc_search(XGBClassifier(),parameters_en)
lgbm_model=_roc_auc_search(LGBMClassifier(),parameters_en)

# Display name -> search object, in the order the models are trained and reported
models={
    'LR':lr_model,
    'SVM':svm_model,
    'DT':dt_model,
    'RF':rf_model,
    'GBDT':gbdt_model,
    'XGBoost':xgb_model,
    'LGBM':lgbm_model,
}

# 4. Define the model-evaluation helpers
# Empty results table; the training loop writes one row per (model, dataset) pair.
_RESULT_COLUMNS=['model','dataset','accuracy','precision','recall','f1_score','auc']
df_result=pd.DataFrame(columns=_RESULT_COLUMNS)
# Index of the next row to be written into df_result
row=0
def evaluate(y_pre,y,y_proba):
    """Compute classification metrics and ROC data for one set of predictions.

    Args:
        y_pre: predicted class labels.
        y: true class labels.
        y_proba: 2-D array of class scores; column 1 is used as the score of
            the positive class when building the ROC curve.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, fpr, tpr, auc)`` where
        ``fpr``/``tpr`` are the ROC curve points and ``auc`` is the area
        under that curve.
    """
    # Label-based metrics, computed in the same order they are returned
    label_metrics=(accuracy_score,precision_score,recall_score,f1_score)
    acc,p,r,f1=[metric(y,y_pre) for metric in label_metrics]
    # Score-based metric: ROC curve from the positive-class column, then its area
    positive_scores=y_proba[:,1]
    fpr,tpr,_=roc_curve(y,positive_scores)
    return acc,p,r,f1,fpr,tpr,auc(fpr,tpr)

def plot_roc_curve(fpr,tpr,label=None):
    """Draw one ROC curve on the current matplotlib axes.

    Args:
        fpr: false-positive rates (x values of the curve).
        tpr: true-positive rates (y values of the curve).
        label: legend entry for this curve; ``None`` leaves it unlabelled.

    The function only draws; the caller decides when to call ``plt.show()``.
    Successive calls add curves to the same axes.
    """
    plt.plot(fpr,tpr,label=label)
    # Dashed diagonal = chance-level reference line
    plt.plot([0,1],[0,1],'k--')
    plt.axis([0,1,0,1])
    plt.xlabel('False Positive Rate')
    # Fixed typo in the axis label (was 'True Poisitive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()

# 5. Train every model and evaluate it
# For each grid-search object in `models`: fit it on the training split, print the
# best parameter combination it found, then record metrics on both the test and
# the training split in `df_result` and add the test ROC curve to the shared plot.
#plt.figure(figsize=(8,6))
for name,model in models.items():
    print(name,'start training...')
    model.fit(X_train,y_train)
    print(model.best_params_)
    # Test split: hard labels feed accuracy/precision/recall/F1, probabilities feed ROC/AUC
    y_pred_test=model.predict(X_test)
    y_proba_test=model.predict_proba(X_test)
    acc,p,r,f1,fpr_test,tpr_test,auc_test=evaluate(y_pred_test,y_test,y_proba_test)
    # `row` is the module-level counter holding the next free index in df_result
    df_result.loc[row]=[name,'test',acc,p,r,f1,auc_test]
    row+=1
    
    # Training split: same metrics, so train vs. test rows can be compared per model
    y_pred_train=model.predict(X_train)
    y_proba_train=model.predict_proba(X_train)
    acc,p,r,f1,fpr_train,tpr_train,auc_train=evaluate(y_pred_train,y_train,y_proba_train)
    df_result.loc[row]=[name,'train',acc,p,r,f1,auc_train]
    row+=1
    # Only the test-set ROC curve is drawn; the training curve is left disabled below
    plot_roc_curve(fpr_test,tpr_test,label=name)
    #plot_roc_curve(fpr_train,tpr_train,label=name)
# Print the full metrics table and render all ROC curves in one figure
print(df_result)
plt.show()

LR start training...
{'C': 1, 'solver': 'newton-cg'}
SVM start training...
{'C': 1, 'kernel': 'linear'}
DT start training...
{'criterion': 'entropy', 'max_features': 'sqrt'}
RF start training...
{'n_estimators': 80}
GBDT start training...
{'n_estimators': 50}
XGBoost start training...
{'n_estimators': 50}
LGBM start training...



{'n_estimators': 30}
      model dataset  accuracy  precision    recall  f1_score       auc
0        LR    test  0.701472   0.437616  0.654596  0.524554  0.751703
1        LR   train  0.764653   0.522467  0.711031  0.602336  0.823539
2       SVM    test  0.689559   0.423913  0.651811  0.513721  0.743370
3       SVM   train  0.763150   0.519896  0.720624  0.604020  0.821762
4        DT    test  0.700771   0.395706  0.359331  0.376642  0.587437
5        DT   train  1.000000   1.000000  1.000000  1.000000  1.000000
6        RF    test  0.766643   0.654762  0.153203  0.248307  0.762133
7        RF   train  1.000000   1.000000  1.000000  1.000000  1.000000
8      GBDT    test  0.780659   0.638554  0.295265  0.403810  0.763197
9      GBDT   train  0.835287   0.837264  0.425659  0.564388  0.880471
10  XGBoost    test  0.789068   0.662921  0.328691  0.439479  0.768588
11  XGBoost   train  0.831981   0.832930  0.412470  0.551724  0.881231
12     LGBM    test  0.780659   0.622340  0.325905  0.427788  0.758161
13     LGBM   train  0.908626   0.964912  0.659472  0.783476  0.968270

【一周算法实践】--4.模型调优

将每个模型的最优参数打印出来，可以发现：每个模型的最优参数各不相同，这就要求我们在训练模型时耐心地找准参数。即使是相同的数据集，在不同的模型下也会有不同的最优参数。