算法实践第三天
数据
和day01中的数据一样data_all.csv
任务:模型评估
记录7个模型(在Task1的基础上)关于accuracy、precision,recall和F1-score、auc值的评分表格,并画出Roc曲线。
代码实现
导入包
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
加载数据
# Load the raw dataset (same data_all.csv as in day01).
# BUG FIX: use a raw string for the Windows path so backslashes are never
# interpreted as escape sequences ('\D' triggers an invalid-escape warning
# and sequences like '\t' would silently corrupt the path).
file_path = r'G:\DatawhaleWeek01\Data\data_all.csv'
row_data = pd.read_csv(file_path)
划分数据集
# Separate the label column ('status') from the feature matrix.
y = row_data['status'].values
X = row_data.drop(columns=['status']).values
# Hold out 30% of the samples for evaluation; fixed seed => reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018)
定义得分和ROC曲线函数
def get_scores(y_true, y_predict, y_predict_pro):
    """Print classification metrics and plot the ROC curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_predict : array-like
        Hard (0/1) label predictions from the classifier.
    y_predict_pro : array-like
        Positive-class scores (probabilities or decision-function values);
        used for the AUC and the ROC curve.
    """
    # Renamed locals so they no longer shadow the sklearn.metrics functions
    # of the same name (accuracy_score, precision_score, ...).
    acc = metrics.accuracy_score(y_true, y_predict)
    precision = metrics.precision_score(y_true, y_predict)
    recall = metrics.recall_score(y_true, y_predict)
    f1 = metrics.f1_score(y_true, y_predict)
    auc = metrics.roc_auc_score(y_true, y_predict_pro)
    # BUG FIX: compute the ROC curve against the labels passed in (y_true),
    # not the global y_test — the original silently depended on module state.
    fprs, tprs, _thresholds = metrics.roc_curve(y_true, y_predict_pro)
    plt.plot(fprs, tprs)
    plt.title("ROCCurve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.show()
    print('准确率:', acc)
    print('精准率:', precision)
    print('召回率:', recall)
    print('F1-score:', f1)
    print('AUC:', auc)
1.xgboost
# 1. XGBoost: fit with a fixed seed, then evaluate on the held-out set.
from xgboost.sklearn import XGBClassifier

xgbs = XGBClassifier(random_state=2018)
xgbs.fit(X_train, y_train)
# Hard labels drive accuracy/precision/recall/F1; the positive-class
# probability column (index 1) drives AUC and the ROC curve.
xgbs_predict = xgbs.predict(X_test)
xgbs_predict_proba = xgbs.predict_proba(X_test)[:, 1]
get_scores(y_test, xgbs_predict, xgbs_predict_proba)
结果
准确率: 0.7855641205325858
精准率: 0.6305418719211823
召回率: 0.3565459610027855
F1-score: 0.4555160142348754
AUC: 0.7713634419371329
ROC曲线
2.Lightgbm
# 2. LightGBM.
# BUG FIX: the original used `lgb` without ever importing it, which raises
# NameError at runtime — import the package here.
import lightgbm as lgb

lgb_clf = lgb.LGBMClassifier()
lgb_clf.fit(X_train, y_train)
lgb_predict = lgb_clf.predict(X_test)
# Probability of the positive class (column 1) for AUC / ROC.
lgb_predict_proba = lgb_clf.predict_proba(X_test)[:, 1]
get_scores(y_test, lgb_predict, lgb_predict_proba)
结果
准确率: 0.7701471618780659
精准率: 0.5701357466063348
召回率: 0.35097493036211697
F1-score: 0.43448275862068964
AUC: 0.7574019592501017
ROC曲线
3.GBDT
# 3. GBDT (GradientBoostingClassifier comes from the top-of-file
# sklearn.ensemble import).
gbdt = GradientBoostingClassifier(random_state=2018)
gbdt.fit(X_train, y_train)
gbdt_y_predict = gbdt.predict(X_test)
gbdt_y_predict_proba = gbdt.predict_proba(X_test)[:, 1]
# BUG FIX: the original passed gbdt_y_predict (hard 0/1 labels) as the
# probability argument, so the AUC was computed on labels and came out
# artificially low (the recorded 0.638). Pass the probabilities instead.
get_scores(y_test, gbdt_y_predict, gbdt_y_predict_proba)
结果
准确率: 0.7806587245970568
精准率: 0.6116504854368932
召回率: 0.35097493036211697
F1-score: 0.44601769911504424
AUC: 0.6380342816604593(注:此值偏低,因为上面误将预测标签而非预测概率传给了 get_scores 的第三个参数,AUC 被按 0/1 标签计算了)
ROC曲线
4.随机森林
# 4. Random forest (RandomForestClassifier comes from the top-of-file
# sklearn.ensemble import).
# CONSISTENCY FIX: seed with random_state=2018 like the other models so the
# recorded scores are reproducible; the original was unseeded.
rfc = RandomForestClassifier(random_state=2018)
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)
rfc_y_predict_proba = rfc.predict_proba(X_test)[:, 1]
get_scores(y_test, rfc_y_predict, rfc_y_predict_proba)
结果
准确率: 0.7708479327259986
精准率: 0.6066666666666667
召回率: 0.25348189415041783
F1-score: 0.35756385068762286
AUC: 0.7134466318216436
ROC曲线
5.决策树
# 5. Decision tree baseline (unseeded, as in the original run).
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
# Hard labels for the threshold metrics; positive-class probabilities
# (column 1) for AUC / ROC.
clf_predict = clf.predict(X_test)
clf_predict_proba = clf.predict_proba(X_test)[:, 1]
get_scores(y_test, clf_predict, clf_predict_proba)
结果
准确率: 0.6664330763840224
精准率: 0.3548387096774194
召回率: 0.3983286908077994
F1-score: 0.3753280839895013
AUC: 0.5774414989619521
ROC曲线
6.线性SVM
# 6. Linear SVM. SVMs are scale-sensitive, so standardize the features
# first (StandardScaler comes from the top-of-file sklearn import).
from sklearn.svm import LinearSVC

scaler = StandardScaler()
scaler.fit(X_train)
X_train_standard = scaler.transform(X_train)
X_test_standard = scaler.transform(X_test)

svc = LinearSVC(C=5)
svc.fit(X_train_standard, y_train)
svc_predict = svc.predict(X_test_standard)
# NAMING FIX: LinearSVC has no predict_proba; decision_function returns
# signed margins, not probabilities, so don't call them "proba". Margins
# rank samples identically, which is all roc_auc_score/roc_curve need.
svc_scores = svc.decision_function(X_test_standard)
get_scores(y_test, svc_predict, svc_scores)
结果
准确率: 0.25227750525578135
精准率: 0.2517531556802244
召回率: 1.0
F1-score: 0.4022408963585434
AUC: 0.5655065569152765
ROC曲线
7.逻辑回归
# 7. Logistic regression, also trained on standardized features
# (StandardScaler must be in scope, as the original assumed).
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()
# fit_transform == fit followed by transform on the same data.
X_train_standard = scaler.fit_transform(X_train)
X_test_standard = scaler.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_standard, y_train)
# Hard label predictions for the threshold metrics.
lr_predict = lr.predict(X_test_standard)
# predict_proba returns (P(label=0), P(label=1)); keep the positive column.
lr_predict_proba = lr.predict_proba(X_test_standard)[:, 1]
get_scores(y_test, lr_predict, lr_predict_proba)
结果
准确率: 0.7876664330763841
精准率: 0.6609195402298851
召回率: 0.3203342618384401
F1-score: 0.4315196998123827
AUC: 0.7657428562486307
ROC曲线
遇到的问题:
1.使用逻辑回归和线性SVM的时候发现y_predict都是0,导致Precision等分数都是0,函数是没有问题的,最后发现将数据标准化(StandardScaler)之后就没有这个问题了。
2.matplotlib之前没有使用过,需要多多学习