【发布时间】:2021-06-29 11:53:58
【问题描述】:
我一直在尝试使用 5 折交叉验证对高度不平衡的数据进行分类。我的样本量是:
样本总数:12237899
阳性样本:1064 个(占总数的 0.01%)
我也想避免数据泄露。但是,我的平均精度分数和 F-1 分数相当低。我使用加权逻辑回归来帮助我处理不平衡的数据,因为 SMOTE 在存在极度不平衡的数据时效果不佳。另外,我在 sklearn 库中看到了 F-1 分数的几个选项。例如:f1 score 有一个参数,如:average{‘micro’, ‘macro’, ‘samples’, ‘weighted’, ‘binary’}。不确定我应该使用哪一个?还有,和cross_val_score(clf, X, y, cv=5,scoring='f1')的scoring='f1'参数有什么区别?
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc
Balanced_Acc = []
F1 = []
G=[]
AP=[]
aucs = []
tprs = []
#fi = []
#rf_pi_train = []
#rf_pi_test = []
mean_fpr = np.linspace(0, 1, 100)
acc = []
cm = []
i=0
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for trainIndex, textIndex in tqdm(skf.split(X, y)):
xTrain, xTest = X.iloc[trainIndex], X.iloc[textIndex]
yTrain, yTest = y[trainIndex], y[textIndex]
clf = LogisticRegression(class_weight='balanced',max_iter=100000)
clf.fit(xTrain, yTrain)
yPred = clf.predict(xTest)
Balanced_Acc.append(balanced_accuracy_score(yTest, yPred))
AP.append(average_precision_score(yTest, yPred))
F1.append(f1_score(yTest,yPred))
G.append(geometric_mean_score(yTest,yPred))
#fi.append(clf.feature_importances_)
#result_train = permutation_importance(clf, xTrain, yTrain, n_repeats=1)
#result_test = permutation_importance(clf, xTest, yTest, n_repeats=1)
#rf_pi_train.append(result_train.importances)
#rf_pi_test.append(result_test.importances)
acc.append(accuracy_score(yTest, yPred))
cm.append(confusion_matrix(yTest,yPred))
# ROC Curve
fpr, tpr, thresholds = roc_curve(yTest, yPred)
tprs.append(interp(mean_fpr, fpr, tpr))
tprs[-1][0] = 0.0
roc_auc = auc(fpr, tpr)
aucs.append(roc_auc)
plt.plot(fpr, tpr, lw=1, alpha=0.3,
label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
i = i+1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc),
lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
label=r'$\pm$ 1 std. dev.')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
# print(cm[0])
tp = fp = fn = tn = 0
for m in cm:
tp += m[0][0]
fp += m[0][1]
fn += m[1][0]
tn += m[1][1]
# print(tp, fp, fn, tn)
finalCM = [[tp, fp], [fn, tn]]
print(finalCM)
ax = sns.heatmap(finalCM, annot=True, cbar=False, fmt='g')
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
print("Balanced Accuracy: ", np.mean(Balanced_Acc))
print("AP score: ", np.mean(AP))
print("G-mean: ", np.mean(G))
print("F1: ", np.mean(F1))
print('AUC: ', np.mean(aucs))
#AUC_rf = aucs
我不确定为什么我看到平衡的准确性和 AUC 分数相同!感谢您的想法!谢谢!
【问题讨论】:
标签: python machine-learning scikit-learn classification