【问题标题】:How to compute false positive rate of an imbalanced dataset for Stratified K fold cross validation?如何计算分层 K 折交叉验证的不平衡数据集的误报率?
【发布时间】:2021-12-26 00:59:39
【问题描述】:

下面是我用来计算准确率(accuracy)、精确率(precision)、召回率(recall)和 F1 分数的示例代码。在分层 K 折交叉验证中,应如何计算假阳性率(误报率,FPR)?

# Parenthesised import: the original broke the line after a trailing comma,
# which is a SyntaxError.
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
# DecisionTreeClassifier is used below, so it has to be imported here.
from sklearn.tree import DecisionTreeClassifier

# One scorer per metric; cross_validate reports each as "test_<key>".
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
}
skfold = StratifiedKFold(n_splits=10)
dt_clf = DecisionTreeClassifier()

# NOTE: data_train_X and target_train_Y are not defined in this snippet;
# they are the caller's training features and labels.
results = cross_validate(
    estimator=dt_clf,
    X=data_train_X,
    y=target_train_Y,
    cv=skfold,
    scoring=scoring,
)
print("Results", results)

【问题讨论】:

    标签: python scikit-learn cross-validation k-fold false-positive


    【解决方案1】:

    我使用逻辑回归编写了这段代码。您可以将其替换为任何其他您喜欢的二分类算法。

    # Estimate the per-fold true/false positive rate and accuracy of a binary
    # classifier with 10-fold cross validation, then report the fold averages.

    # Importing required libraries
    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix

    data = load_breast_cancer(as_frame=True)

    # Shuffle whole rows in one go so features and labels stay aligned.
    # (Shuffling X and y separately pairs samples with the wrong labels.)
    df = data.frame.sample(frac=1)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Implementing cross validation
    kf = KFold(n_splits=10)
    model = LogisticRegression(max_iter=1000000)

    acc_score = []
    res_tpr = []
    res_fpr = []
    for train_index, test_index in kf.split(X):
        # Use the indices produced by the splitter: every sample is tested
        # exactly once and never appears in its own training fold.
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        # labels=[0, 1] forces a 2x2 matrix even if a fold misses one class,
        # so the four-way unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_test, pred_values, labels=[0, 1]).ravel()
        print(f'True Positives: {tp}')
        print(f'False Positives: {fp}')
        print(f'True Negatives: {tn}')
        print(f'False Negatives: {fn}')

        # Guard the denominators: a fold without positives (or negatives)
        # yields a rate of 0 instead of a division-by-zero NaN.
        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

        print('tpr=%.4f fpr=%.3f' % (tpr, fpr))
        res_tpr.append(tpr)
        res_fpr.append(fpr)
        print('---------------------')

        acc = accuracy_score(y_test, pred_values)
        acc_score.append(acc)

    # Average over the folds actually run rather than a hard-coded 10.
    avg_acc_score = np.mean(acc_score)
    total_tpr = np.mean(res_tpr)
    total_fpr = np.mean(res_fpr)
    print('\n\n', ' total_tpr=%.4f total_fpr=%.3f' % (total_tpr, total_fpr))
    print('\n\n', 'Avg accuracy : {}'.format(avg_acc_score))
    

    【讨论】:

      【解决方案2】:

      您可以按如下方式定义自定义记分器:

      from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
      from sklearn.model_selection import StratifiedKFold, cross_validate
      from sklearn.tree import DecisionTreeClassifier
      from sklearn.datasets import make_classification
      
      def false_positive_rate(y_true, y_pred):
          """Return the false positive rate FP / (FP + TN) for binary labels.

          Label 1 is treated as the positive class and label 0 as the
          negative class.

          Args:
              y_true: Array-like of ground-truth labels.
              y_pred: Array-like of predicted labels, same length as y_true.

          Returns:
              The fraction of true negatives (y_true == 0) that were predicted
              as 1, as a float. Returns 0.0 when no sample with y_true == 0
              is counted, instead of dividing by zero.
          """
          # Local import keeps the snippet self-contained.
          import numpy as np

          # Convert first: comparing a plain list with == yields a single
          # bool, not an element-wise mask.
          y_true = np.asarray(y_true)
          y_pred = np.asarray(y_pred)

          is_negative = y_true == 0

          # false positive
          fp = int(np.count_nonzero(is_negative & (y_pred == 1)))

          # true negative
          tn = int(np.count_nonzero(is_negative & (y_pred == 0)))

          # false positive rate
          if fp + tn == 0:
              return 0.0
          return fp / (fp + tn)
      
      # Plain metric callables keyed by the name they are reported under;
      # each one is wrapped into a scorer below.
      metric_functions = {
          'accuracy': accuracy_score,
          'precision': precision_score,
          'recall': recall_score,
          'f1_score': f1_score,
          'false_positive_rate': false_positive_rate,
      }
      scoring = {
          label: make_scorer(metric)
          for label, metric in metric_functions.items()
      }

      # Generated data set and a seeded tree so the run is reproducible.
      X, y = make_classification(random_state=42)
      clf = DecisionTreeClassifier(random_state=42)

      # Three stratified folds.
      skf = StratifiedKFold(n_splits=3)

      results = cross_validate(
          estimator=clf,
          X=X,
          y=y,
          cv=skf,
          scoring=scoring,
      )

      # One false positive rate per fold.
      fpr_per_fold = results['test_false_positive_rate']
      print(fpr_per_fold)
      # [0.11764706 0.11764706 0.0625]
      

      【讨论】:

        猜你喜欢
        • 2018-03-17
        • 2015-12-13
        • 2020-08-03
        • 1970-01-01
        • 2018-05-03
        • 2017-02-07
        • 1970-01-01
        • 2013-05-03
        • 2017-01-11
        相关资源
        最近更新 更多