错误：`data` 和 `reference` 应该是具有相同水平（levels）的因子（factor）。Logistic 回归的混淆矩阵答案

【问题标题】：Error: `data` and `reference` should be factors with the same levels. Confusion matrix for Logistic Regression（错误：`data` 和 `reference` 应该是具有相同水平的因子。Logistic 回归的混淆矩阵）
【发布时间】：2020-09-17 03:57:44
【问题描述】：

我已经看到很多关于这个特定错误的答案，但针对我的具体问题，我还没有找到任何答案，因此在此提问。

这就是我的工作：

    # Keep only the outcome (shortness_breath) and the candidate predictor
    # columns. Each column is listed once; the original listed `obesity` twice,
    # which select() silently collapses to a single column.
    shortness_breath_data <- data_categ_nosev %>%
      dplyr::select(
        shortness_breath,
        obesity,
        asthma,
        diabetes_type_one,
        diabetes_type_two,
        hypertension,
        heart_disease,
        lung_condition,
        liver_disease,
        kidney_disease,
        Covid_tested,
        Gender
      )

这是 dput(head(shortness_breath_data)) 的输出：

structure(list(shortness_breath = structure(c(1L, 2L, 1L, 1L, 
1L, 2L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L, 
1L, 2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diabetes_type_one = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), hypertension = structure(c(1L, 
    1L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), lung_condition = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), kidney_disease = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative", 
    "positive"), class = "factor"), Gender = structure(c(2L, 
    1L, 2L, 1L, 1L, 2L), .Label = c("Female", "Male", "Other"
    ), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"), problems = structure(list(row = c(2910L, 
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double", 
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'", 
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))

我将其分为训练和测试数据集。

# Make sure the outcome is stored as a factor before modelling.
shortness_breath_data$shortness_breath <- as.factor(
  shortness_breath_data$shortness_breath
)

# Reproducible 70/30 split of the row indices.
n <- nrow(shortness_breath_data)
set.seed(22)
# seq_len() avoids the 1:0 pitfall; floor() makes the truncation that sample()
# applies to a fractional size explicit.
trainingdx <- sample(seq_len(n), size = floor(0.7 * n))

train <- shortness_breath_data[trainingdx, ]
# Take the complement with setdiff() rather than a negative index: when
# `trainingdx` is empty, `df[-trainingdx, ]` would return zero rows instead of
# every row.
validate <- shortness_breath_data[setdiff(seq_len(n), trainingdx), ]

# Print the outcome levels present in each partition.
train %>% distinct(shortness_breath)
validate %>% distinct(shortness_breath)

为了方便您查找问题，我提供了dput(head(train)) 和dput(head(validate))

训练数据集：

structure(list(shortness_breath = structure(c(1L, 1L, 1L, 1L, 
1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(2L, 
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    asthma = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diabetes_type_one = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    diabetes_type_two = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), hypertension = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), lung_condition = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), kidney_disease = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative", 
    "positive"), class = "factor"), Gender = structure(c(1L, 
    1L, 1L, 2L, 1L, 2L), .Label = c("Female", "Male", "Other"
    ), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"), problems = structure(list(row = c(2910L, 
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double", 
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'", 
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))

验证数据集：

structure(list(shortness_breath = structure(c(1L, 2L, 2L, 1L, 
1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L, 
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diabetes_type_one = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), hypertension = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), lung_condition = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), kidney_disease = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative", 
    "positive"), class = "factor"), Gender = structure(c(2L, 
    1L, 2L, 2L, 1L, 1L), .Label = c("Female", "Male", "Other"
    ), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"), problems = structure(list(row = c(2910L, 
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double", 
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'", 
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))

然后，我使用逐步向前的方法构建我的逻辑回归模型。

# Intercept-only logistic model: the starting point and lower bound of the
# stepwise search.
null_model <- glm(
  shortness_breath ~ 1,
  data = train,
  family = "binomial"
)

# Logistic model using every other column of `train` as a predictor: the upper
# bound of the search.
fm_shortness_breath <- glm(
  shortness_breath ~ .,
  data = train,
  family = "binomial"
)

# Forward selection from the null model towards the full model.
stepmodel <- step(
  null_model,
  scope = list(
    lower = null_model,
    upper = fm_shortness_breath
  ),
  direction = "forward"
)

然后我得到我的摘要模型并将预测存储在源数据框中。

# Print the summary of the selected model.
summary(stepmodel)

# Validation set: response-scale predictions next to the observed outcome.
validate[["pred"]] <- predict(
  stepmodel,
  newdata = validate,
  type = "response"
)
validate[["real"]] <- validate[["shortness_breath"]]

# Training set: same two columns.
train[["pred"]] <- predict(
  stepmodel,
  newdata = train,
  type = "response"
)
train[["real"]] <- train[["shortness_breath"]]

然后我可以毫无问题地绘制我的 ROC 曲线：

# ROC curve for the validation set: observed outcome against the stored
# predictions.
plot.roc(
  validate$real,
  validate$pred,
  col = "red",
  main = "ROC Validation Set",
  percent = TRUE,
  print.auc = TRUE
)

然而，当我试图得到我的混淆矩阵时，这就是我得到错误的地方。但这是我的代码：

# NOTE(review): this is the call that raises the error reported below. It
# passes the fitted model and the whole validation data frame to
# confusionMatrix(), while the error message asks for two factors with the
# same levels.
cm_stepmodel <- confusionMatrix(stepmodel, validate)

然后，错误来了：

Error: `data` and `reference` should be factors with the same levels.

显示回溯：

3.
stop("`data` and `reference` should be factors with the same levels.", call. = FALSE)
2.
confusionMatrix.default(stepmodel, validate)
1.
confusionMatrix(stepmodel, validate)

我根本看不出问题所在，也尝试了其他几个方案，但都没有奏效。我已经逐步复现了我所采用的确切方法，但仍没有得到答案。另外，除了 caret 和 R，我还给这个问题加了 RMarkdown 标签，以防万一。

另外，使用的库是：

library(tidyverse)
library(conflicted)
library(tidymodels)
library(ggrepel)
library(corrplot)
library(dplyr)
library(corrr) 
library(themis)
library(rsample)
library(caret)
library(forcats)
library(rcompanion)
library(MASS)
library(pROC)
library(ROCR)
library(data.table)

【问题讨论】：

为什么要将逐步模型stepmodel 和整个验证集validate 放在confusionMatrix 中？你不应该只放预测和真实标签的两个向量吗？例如confusionMatrix(validate$pred, validate$real)
我认为您应该将向量提供给confusionMatrix()，而不是数据集或模型。你试过confusionMatrix(validate$real, validate$pred)
你能通过库语句在顶部列出你正在使用的所有包吗？
@bzki 做到了。谢谢。
@RicS 我得到了同样的错误。

标签： r logistic-regression r-caret

【解决方案1】：

尝试将您的预测概率转换为标签，然后在此运行您的confusionMatrix：

# Predicted probabilities on the validation set (response scale).
validate$pred <- predict(stepmodel, validate, type = "response")

# Turn probabilities into class labels at a 0.5 cut-off. The levels are taken
# from the observed outcome so both factors always share the same levels, even
# when every prediction falls into a single class (as.factor() alone would
# then yield a one-level factor and trigger the same error again).
validate$pred_label <- factor(
  ifelse(validate$pred >= 0.5, "Yes", "No"),
  levels = levels(validate$real)
)

confusionMatrix(validate$real, validate$pred) # Error

# caret expects the predictions as `data` and the observed classes as
# `reference`; swapping them transposes the table and the derived statistics.
confusionMatrix(
  data = validate$pred_label,
  reference = validate$real
) # This will work

请检查 validate$pred_label 语句中分配的标签是否与原始数据集中的标签一致。

我对 `caret` 不是特别熟悉，但总体思路是您对标签进行预测并与数据的实际标签进行比较。它抛出了一个错误，因为您正在将标签与概率进行比较——您需要分配标签。如果我在上面犯了概念错误或编码错误，请纠正我。

【讨论】：

谢谢。这是直截了当的答案。有趣的是，对于我前几年的模型，我所做的完全一样。然而，在这种情况下，这里的错误是显而易见的。谢谢！