数据挖掘(五):参数调优
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,ParameterGrid
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import warnings
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
warnings.filterwarnings('ignore')
org_data = pd.read_csv("org_data.csv", encoding = 'gbk')
print(org_data.shape)
(4754, 58)
var_total = org_data.columns
var_y = ['status']
var_x = list(set(var_total) - set(var_y))
y = org_data[var_y]
x = org_data[var_x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
print(x.shape)
print(y.shape)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(4754, 57)
(4754, 1)
(3327, 57)
(3327, 1)
(1427, 57)
(1427, 1)
tf = RandomForestClassifier(criterion='gini')
tf_model = tf.fit(x_train, y_train)
tf_model
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
importance_dict = pd.DataFrame(tf_model.feature_importances_, list(x_train.columns))
importance_dict = pd.DataFrame()
importance_dict["features"] = list(x_train.columns)
importance_dict["importance"] = list(tf_model.feature_importances_)
importance_dict=importance_dict.set_index("features",drop=True)
var_sort = importance_dict.sort_values(by="importance",ascending=False)
print(var_sort)
importance
features
trans_fail_top_count_enum_last_1_month_woe 0.059223
history_fail_fee 0.056541
loans_score 0.032077
latest_one_month_fail 0.031654
apply_score 0.031323
loans_overdue_count 0.026386
first_transaction_day 0.024519
trans_amount_3_month 0.023320
trans_day_last_12_month 0.022150
historical_trans_amount 0.021684
loans_latest_day 0.020472
consfin_avg_limit 0.019320
max_cumulative_consume_later_1_month 0.019216
avg_price_last_12_month 0.019063
number_of_trans_from_2011 0.018446
historical_trans_day 0.018180
pawns_auctions_trusts_consume_last_6_month 0.017757
consume_top_time_last_6_month 0.017473
consfin_credit_limit 0.017428
trans_activity_day 0.017384
trans_fail_top_count_enum_last_6_month 0.017367
query_sum_count 0.017095
loans_latest_time_days 0.016930
rank_trad_1_month_woe 0.016894
trans_top_time_last_6_month 0.016527
latest_query_day 0.016235
latest_query_time_days 0.016090
apply_credibility 0.016050
consfin_max_limit 0.015665
consfin_credibility 0.015618
consume_top_time_last_1_month 0.015464
pawns_auctions_trusts_consume_last_1_month 0.015397
loans_max_limit 0.015029
loans_credit_limit 0.014474
history_suc_fee 0.014274
latest_three_month_loan 0.013838
loans_settle_count 0.013797
avg_price_top_last_12_valid_month_woe 0.013660
query_org_count 0.012944
latest_six_month_loan 0.012813
middle_volume_percent 0.012598
consume_mini_time_last_1_month 0.012405
latest_one_month_suc_woe 0.012299
loans_org_count_behavior 0.012265
loans_cash_count 0.012263
loans_count 0.012134
latest_one_month_apply 0.011842
trans_fail_top_count_enum_last_12_month 0.011343
trans_top_time_last_1_month 0.010996
loans_org_count_current 0.010605
consfin_product_count_woe 0.010191
consfin_org_count_current_woe 0.009683
loans_product_count 0.009406
query_cash_count_woe 0.008782
top_trans_count_last_1_month_woe 0.008133
consfin_org_count_behavior_woe 0.007659
low_volume_percent 0.007617
var_x = list(var_sort.importance[var_sort.importance > 0.02].index)
var_x
['trans_fail_top_count_enum_last_1_month_woe',
'history_fail_fee',
'loans_score',
'latest_one_month_fail',
'apply_score',
'loans_overdue_count',
'first_transaction_day',
'trans_amount_3_month',
'trans_day_last_12_month',
'historical_trans_amount',
'loans_latest_day']
y = org_data[var_y]
x = org_data[var_x]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)
Lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=200)
xgb = XGBClassifier(n_jobs=2)
C = np.logspace(-3,0,20,base=10)
lr_param_grid = {'C': C, 'penalty':['l1', 'l2']}
Lr_cv = GridSearchCV(estimator=Lr,
param_grid=lr_param_grid,
cv=5,
scoring='f1')
svc_param_grid = {'C': C}
svc_cv = GridSearchCV(estimator=svc,
param_grid=svc_param_grid,
cv=5,
scoring='f1')
dt_param_grid = {'min_samples_leaf':range(5, 10), 'criterion': ['gini', 'entropy'], 'max_depth':range(2, 5)}
dt_cv = GridSearchCV(estimator=dt,
param_grid=dt_param_grid,
cv=5,
scoring='f1')
rf_param_grid = {'min_samples_leaf':range(5, 10), 'criterion': ['gini', 'entropy'], 'max_depth':range(2, 5)}
rf_cv = GridSearchCV(estimator=rf,
param_grid=rf_param_grid,
cv=5,
scoring='f1')
xgb_param_grid = {'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.5], 'n_estimators':[100, 200, 300, 500]}
xgb_cv = GridSearchCV(estimator=xgb,
param_grid=xgb_param_grid,
cv=5,
scoring='f1')
model_dict = {"逻辑回归":Lr_cv, "SVM":svc_cv, "决策树":dt_cv, "随机森林":rf_cv, "XGBoost":xgb_cv}
results = pd.DataFrame()
def model_est(model_dict, x_train, x_test, y_train, y_test):
for name, model in model_dict.items():
model_trian = model.fit(x_train, y_train)
y_pred_train = model_trian.predict(x_train)
y_pred_test = model_trian.predict(x_test)
acc_score_train = metrics.accuracy_score(y_pred_train, y_train)
precision_score_train = metrics.precision_score(y_pred_train, y_train)
recall_score_train = metrics.recall_score(y_pred_train, y_train)
f1_score_train = metrics.f1_score(y_pred_train, y_train)
roc_auc_score_train = metrics.roc_auc_score(y_pred_train, y_train)
acc_score_test = metrics.accuracy_score(y_pred_test, y_test)
precision_score_test = metrics.precision_score(y_pred_test, y_test)
recall_score_test = metrics.recall_score(y_pred_test, y_test)
f1_score_test = metrics.f1_score(y_pred_test, y_test)
roc_auc_score_test = metrics.roc_auc_score(y_pred_test, y_test)
print('{} 训练集准确率:{}'.format(name,acc_score_train))
print('{} 测试集准确率:{}\n'.format(name,acc_score_test))
print('{} 训练集精确率:{}'.format(name,precision_score_train))
print('{} 测试集精确率:{}\n'.format(name,precision_score_test))
print('{} 训练集召回率:{}'.format(name,recall_score_train))
print('{} 测试集召回率:{}\n'.format(name,recall_score_test))
print('{} 训练集f1评分:{}'.format(name,f1_score_train))
print('{} 测试集f1评分:{}\n'.format(name,f1_score_test))
print('{} 训练集AUC值:{}'.format(name,roc_auc_score_train))
print('{} 测试集AUC值:{}\n'.format(name,roc_auc_score_test))
fpr, tpr, th = metrics.roc_curve(y_train, y_pred_train)
fpr_t, tpr_t, th_t = metrics.roc_curve(y_test, y_pred_test)
plt.figure(figsize=[10, 8])
plt.plot(fpr, tpr, 'b--')
plt.plot(fpr_t, tpr_t, 'r--')
plt.title(label='{} ROC curve'.format(name))
plt.xlabel("fpr", fontsize=13)
plt.ylabel("tpr", fontsize=13)
plt.show()
model_est(model_dict, x_train, x_test, y_train, y_test)
逻辑回归 训练集准确率:0.7938082356477307
逻辑回归 测试集准确率:0.7778556412053259
逻辑回归 训练集精确率:0.31894484412470026
逻辑回归 测试集精确率:0.27019498607242337
逻辑回归 训练集召回率:0.6927083333333334
逻辑回归 测试集召回率:0.6381578947368421
逻辑回归 训练集f1评分:0.4367816091954024
逻辑回归 测试集f1评分:0.37964774951076313
逻辑回归 训练集AUC值:0.7498539967720014
逻辑回归 测试集AUC值:0.7163338493292053

SVM 训练集准确率:0.7682596934174932
SVM 测试集准确率:0.7659425367904695
SVM 训练集精确率:0.08752997601918465
SVM 测试集精确率:0.08356545961002786
SVM 训练集召回率:0.8795180722891566
SVM 测试集召回率:0.8571428571428571
SVM 训练集f1评分:0.15921483097055614
SVM 测试集f1评分:0.15228426395939088
SVM 训练集AUC值:0.8224655712863784
SVM 测试集AUC值:0.8103961412151068

决策树 训练集准确率:0.7992185151788398
决策树 测试集准确率:0.7631394533987386
决策树 训练集精确率:0.2973621103117506
决策树 测试集精确率:0.23676880222841226
决策树 训练集召回率:0.7515151515151515
决策树 测试集召回率:0.5704697986577181
决策树 训练集f1评分:0.42611683848797255
决策树 测试集f1评分:0.3346456692913386
决策树 训练集AUC值:0.7779931446598113
决策树 测试集AUC值:0.6780361512850406

随机森林 训练集准确率:0.8028253681995792
随机森林 测试集准确率:0.7778556412053259
随机森林 训练集精确率:0.27817745803357313
随机森林 测试集精确率:0.22284122562674094
随机森林 训练集召回率:0.8111888111888111
随机森林 测试集召回率:0.6779661016949152
随机森林 训练集f1评分:0.4142857142857143
随机森林 测试集f1评分:0.33542976939203356
随机森林 训练集AUC值:0.8066138071070659
随机森林 测试集AUC值:0.7324131501599099

XGBoost 训练集准确率:0.8557258791704238
XGBoost 测试集准确率:0.7876664330763841
XGBoost 训练集精确率:0.511990407673861
XGBoost 测试集精确率:0.3593314763231198
XGBoost 训练集召回率:0.854
XGBoost 测试集召回率:0.6386138613861386
XGBoost 训练集f1评分:0.6401799100449775
XGBoost 测试集f1评分:0.45989304812834225
XGBoost 训练集AUC值:0.8550155642023348
XGBoost 测试集AUC值:0.7254293796726612
