R caret 包的 predict() 函数报错：“newdata”行数不同，“type”不被接受 答案

【问题标题】：predict() R function caret package errors: "newdata" rows different, "type" not accepted（R caret 包的 predict() 函数报错：“newdata”行数不同，“type”不被接受）
【发布时间】：2016-09-25 09:52:24
【问题描述】：

我正在使用 caret 包运行逻辑回归分析。
数据以 18x6 矩阵形式输入
到目前为止一切都很好，除了predict() 函数。
R 告诉我 type 参数应该是 raw 或 prob 但 raw 只是吐出最后一列的精确副本（二项式变量的值）。 prob 给我以下错误：

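“Error in dimnames(out)[[2]] <- modelFit$obsLevels : length of 'dimnames' [2] not equal to array extent”（'dimnames' [2] 的长度不等于数组范围）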

# Reproduction script from the question: builds an 18 x 6 data set, splits it
# with caret, fits a logistic regression through train(), and calls predict().
# The code is left exactly as posted; the notes below only describe it.

# Package installation runs unconditionally each time the script is executed.
install.packages("pbkrtest")
install.packages("caret")
install.packages('e1071', dependencies=TRUE)
#install.packages('caret', dependencies = TRUE)
# Both calls attach caret; library() alone would be enough.
require(caret)
library(caret)

# 18 x 6 numeric matrix. The 108 values are consumed column by column
# (byrow = FALSE): five numeric columns followed by a 0/1 column.
# No dimnames are supplied, so the matrix columns are unnamed.
A=matrix(
  c(
    64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946,66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627,68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755,69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500,73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1
  ),

  nrow = 18,
  ncol = 6,
  byrow = FALSE)  # fill column-wise; matrix() has no "bycol" argument
################### The same six columns again, as free-standing vectors
# The names a..f exist only as these global vectors of length 18; they are
# not column names of A.
a<-c(64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946)
b<-c(66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627)
c<-c(68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755)
d<-c(69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500)
e<-c(73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500)
# 0/1 outcome, stored as a numeric vector (not a factor).
f<-c(1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1)
######################

# Row count and number of non-outcome columns; neither value is used again
# in the visible script.
n<-nrow(A);
K<-ncol(A)-1;

# Partition on the global vector f; list=FALSE asks for row indices rather
# than a list.
Train <- createDataPartition(f, p=0.6, list=FALSE)  # 60% of the data set is used for training.
# Row subsets of the unnamed matrix A.
training <- A[ Train, ]
testing <- A[ -Train, ]
nrow(training)

# This is the logistic formula:
# estimates from logistic regression characterize the relationship between the predictor and response variable on a log-odds scale
# NOTE(review): `training` has no columns named a..f, so the formula variables
# are presumably resolved from the 18-element global vectors instead of the
# training rows; this would explain a "'newdata' had 11 rows but variables
# found have 18 rows" warning from predict(). Confirm.
# NOTE(review): f is numeric, so caret presumably fits this as a regression
# rather than a classification model; confirm with warnings().
mod_fit <- train(f ~ a + b + c + d +e,  data=training, method="glm", family="binomial")
mod_fit

# Exponentiate the fitted coefficients to get the odds ratio for each predictor:
exp(coef(mod_fit$finalModel))

# The two predict() calls the question is about: the first uses the default
# type, the second requests class probabilities.
predict(mod_fit, newdata=training)
predict(mod_fit, newdata=testing, type="prob")

【问题讨论】：

只是好奇，为什么需要同时定义A数据框和a~f向量？
@zyurnaidi 我这样做是因为我将二项式变量与其他列（f vs a-e）进行比较。我不知道任何其他方式来运行逻辑公式。有没有更好的办法？
当然，我们只需要将数据设置为数据框，然后通过名称访问每一列。查看答案。

标签： r machine-learning logistic-regression r-caret training-data

【解决方案1】：

我不太清楚，但 A 是 (a,b,c,d,e,f) 的矩阵。所以你不需要创建两个对象。

# Install each required package only when it is missing, so that re-running
# the script does not download and reinstall everything again. The arguments
# of each install.packages() call are the same as in the original script.
if (!requireNamespace("pbkrtest", quietly = TRUE)) {
  install.packages("pbkrtest")
}
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071", dependencies = TRUE)
}
# install.packages("caret", dependencies = TRUE)

# Attach caret once. library() stops with an error when the package is
# unavailable, so the additional require(caret) call was redundant.
library(caret)

# Build the 18 x 6 data set as a data frame. The values are listed one column
# per line; matrix() fills column-wise by default, so the six lines become the
# columns a, b, c, d, e and the 0/1 outcome f.
A <- data.frame(
  matrix(
    c(
      64830, 18213, 4677, 24761, 9845, 17504, 22137, 12531, 5842, 28827, 51840, 4079, 1000, 2069, 969, 9173, 11646, 946,
      66161, 18852, 5581, 27219, 10159, 17527, 23402, 11409, 8115, 31425, 55993, 0, 0, 1890, 1430, 7873, 12779, 627,
      68426, 18274, 5513, 25687, 10971, 14104, 19604, 13438, 6011, 30055, 57242, 0, 0, 2190, 1509, 8434, 10492, 755,
      69716, 18366, 5735, 26556, 11733, 16605, 20644, 15516, 5750, 31116, 64330, 0, 0, 1850, 1679, 9233, 12000, 500,
      73128, 18906, 5759, 28555, 11951, 19810, 22086, 17425, 6152, 28469, 72020, 0, 0, 1400, 1750, 8599, 12000, 500,
      1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1
    ),
    nrow = 18,
    ncol = 6
  )
)

# Name the columns so they can be referenced from a model formula.
names(A) <- c("a", "b", "c", "d", "e", "f")

# Store the outcome as a factor with levels "0" and "1".
A[["f"]] <- as.factor(A[["f"]])

# Draw 60% of the rows for training, partitioning on the outcome column;
# list = FALSE asks for the selected row indices rather than a list.
Train <- createDataPartition(y = A[["f"]], p = 0.6, list = FALSE)

# Rows picked by the partition form the training set, the rest the test set.
training <- A[Train, ]
testing <- A[-Train, ]

# Show how many rows ended up in the training set.
nrow(training)

进行预测时，newdata 中应只包含解释变量，而不应包含要预测的变量：

# Fit a logistic regression through caret (method = "glm" with a binomial
# family): outcome f against the predictors a to e.
mod_fit <- train(
  f ~ a + b + c + d + e,
  data = training,
  method = "glm",
  family = "binomial"
)
mod_fit

# Exponentiating the fitted coefficients gives the odds ratio of each predictor.
exp(coef(mod_fit$finalModel))

# Predict on both subsets, passing every column except the outcome f.
predict(mod_fit, newdata = training[, colnames(training) != "f"])
predict(mod_fit, newdata = testing[, colnames(testing) != "f"])

【讨论】：

f 需要是一个因子（factor），因为您要做的是逻辑回归。请查看生成的 warnings()。train(as.factor(f) ~ ., data = training, method = "glm", family = "binomial")
是的，这是真的，但这只是一个警告，预测是一样的。我纠正了。

【解决方案2】：

简短的回答：你不应该在传给 predict 的 newdata 中包含响应变量（即要预测的变量 f）。所以你应该这样做：

# Drop the last column (the outcome f) from newdata before predicting.
# The closing parenthesis of ncol() has to come before the closing bracket;
# the original `-ncol(training])` did not parse.
predict(mod_fit, newdata = training[, -ncol(training)])
predict(mod_fit, newdata = testing[, -ncol(testing)])

警告消息'newdata' had 11 rows but variables found have 18 rows 的问题是因为您使用整个数据集（18 个观察值）运行回归，但只使用其中的一部分（11 或 7 个）进行预测。

编辑：为了简化数据创建和glm 流程，我们可以这样做：

library(caret)

# Keep all six columns in one data frame and refer to them by name; no
# separate global vectors a..f are needed.
A <- data.frame(
  a = c(64830, 18213, 4677, 24761, 9845, 17504, 22137, 12531, 5842, 28827, 51840, 4079, 1000, 2069, 969, 9173, 11646, 946),
  b = c(66161, 18852, 5581, 27219, 10159, 17527, 23402, 11409, 8115, 31425, 55993, 0, 0, 1890, 1430, 7873, 12779, 627),
  c = c(68426, 18274, 5513, 25687, 10971, 14104, 19604, 13438, 6011, 30055, 57242, 0, 0, 2190, 1509, 8434, 10492, 755),
  d = c(69716, 18366, 5735, 26556, 11733, 16605, 20644, 15516, 5750, 31116, 64330, 0, 0, 1850, 1679, 9233, 12000, 500),
  e = c(73128, 18906, 5759, 28555, 11951, 19810, 22086, 17425, 6152, 28469, 72020, 0, 0, 1400, 1750, 8599, 12000, 500),
  f = c(1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1)
)

# Store the 0/1 outcome as a factor so that train() treats the task as
# classification rather than regression on a numeric response.
A$f <- as.factor(A$f)

# The outcome only exists as a column of A in this script, so it must be
# referenced as A$f; a bare `f` is undefined here.
Train <- createDataPartition(A$f, p = 0.6, list = FALSE)  # 60% of the rows go to training
training <- A[Train, ]
testing <- A[-Train, ]

# Logistic regression of f on a..e, fitted on the training rows only.
mod_fit <- train(
  f ~ a + b + c + d + e,
  data = training,
  method = "glm",
  family = "binomial"
)

【讨论】：

【解决方案3】：

我尝试运行逻辑回归模型。我写了这段代码：

# Script from this post: reads a CSV file, splits it 70/30 with caret, fits a
# logistic regression through train(), and prints the exponentiated
# coefficients. The code is left exactly as posted.

# Installs caret unconditionally on every run, then attaches it.
install.packages('caret')
library(caret)
# Hard-coded, machine-specific working directory; the CSV paths below are
# relative to it.
setwd('C:\\Users\\BAHOZ\\Documents\\')
# NOTE(review): the result of this first read is overwritten by the next
# line, so only DataSet.csv is actually used afterwards.
D<-read.csv(file = "D.csv",header = T)
# `T` is the reassignable shorthand for TRUE.
D<-read.csv(file = 'DataSet.csv',header=T)
# Print the column names of the loaded data.
names(D)
# Fix the random seed before the random partition below.
set.seed(111134)
# Partition on the outcome column X, taking 70% of the rows for training.
Train<-createDataPartition(D$X, p=0.7,list = FALSE)
training<-D[Train,]
# Number of training rows (length of the age column).
length(training$age)
testing<-D[-Train,]
# Number of test rows.
length(testing$age)
# NOTE(review): the type of D$X is not visible here. If it is numeric (e.g.
# 0/1) rather than a factor, caret presumably fits a regression model, which
# would explain a failure of predict(..., type = "prob"); check str(D$X).
mod_fit<-train(X~age + gender  + total.Bilirubin + direct.Bilirubin + total.proteins + albumin + A.G.ratio+SGPT + SGOT + Alkphos,data=training,method="glm", family="binomial")
summary(mod_fit)
# Exponentiate the fitted coefficients of the final glm (odds ratios).
exp(coef(mod_fit$finalModel))

我收到了最后一条命令的这条消息：

     (Intercept)              age           gender  total.Bilirubin direct.Bilirubin   total.proteins          albumin        A.G.ratio 
  0.01475027       1.01596886       1.03857883       1.00022899       1.78188072       1.00065332       1.01380334       1.00115742 
        SGPT             SGOT          Alkphos 
  3.93498241       0.05616662      38.29760014

通过运行这个命令，我可以预测我的数据，

# Predict for the held-out rows without specifying a type argument.
predict(mod_fit , newdata=testing)

但如果我设置type="prob" 或type="raw"

# Same prediction, but requesting class probabilities.
# NOTE(review): this presumably requires the model to have been trained on a
# factor outcome; confirm that X was a factor when train() was called.
predict(mod_fit , newdata=testing, type = "prob")

出错了：

Error in dimnames(out) <- `*vtmp*` :

'dimnames' [2] 的长度不等于数组范围（length of 'dimnames' [2] not equal to array extent）

【讨论】：