【问题标题】:predict() R function caret package errors: "newdata" rows different, "type" not accepted(R 中 caret 包的 predict() 函数报错:“newdata”行数不一致,“type”参数不被接受)
【发布时间】:2016-09-25 09:52:24
【问题描述】:
  • 我正在使用 caret 包运行逻辑回归分析。

  • 数据以 18x6 矩阵形式输入

  • 到目前为止一切都很好,除了predict() 函数。

  • R 告诉我 type 参数应该是 raw 或 prob。raw 只是原样输出最后一列(二项变量的值)的精确副本;prob 则给出以下错误:

“dimnames(out)[[2]] 中的错误”

# One-time setup: install pbkrtest, caret and e1071.
install.packages("pbkrtest")
install.packages("caret")
install.packages('e1071', dependencies=TRUE)
# Alternative (disabled): install caret together with its dependencies.
#install.packages('caret', dependencies = TRUE)
# Attach caret. require() and library() both load the same package here,
# so one of the two calls is redundant.
require(caret)
library(caret)

# 18 x 6 numeric matrix holding the whole data set. No dimnames are supplied,
# so the columns of A are unnamed.
A=matrix(
  c(
    64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946,66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627,68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755,69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500,73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1
  ),

  nrow = 18,
  ncol = 6,
  byrow = FALSE)  # column-major fill; matrix() has no "bycol" argument
################### data set as vectors
# The same values again, one global vector per column of A: a-e are the
# predictors and f is the 0/1 outcome used on the left of the model formula.
a<-c(64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946)
b<-c(66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627)
c<-c(68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755)
d<-c(69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500)
e<-c(73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500)
f<-c(1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1)
######################

# Row count of A and number of predictor columns (all columns but the last).
# n and K are not used again in this snippet.
n<-nrow(A);
K<-ncol(A)-1;

# Draw the training rows with caret, based on the global outcome vector f.
Train <- createDataPartition(f, p=0.6, list=FALSE)  #60% of data set is used as training.
training <- A[ Train, ]
testing <- A[ -Train, ]
nrow(training)

#this is the logistic formula:
#estimates from logistic regression characterize the relationship between the predictor and response variable on a log-odds scale
# NOTE(review): training is a row subset of the unnamed matrix A, so it has no
# columns called a-f; the formula presumably picks up the full-length global
# vectors instead, which would explain the row-count warning -- confirm.
mod_fit <- train(f ~ a + b + c + d +e,  data=training, method="glm", family="binomial")
mod_fit

# Exponentiate the fitted coefficients to get the odds ratio for each predictor.
exp(coef(mod_fit$finalModel))

# Predictions on the training rows, then the type="prob" call that the
# question reports as failing with the dimnames error.
predict(mod_fit, newdata=training)
predict(mod_fit, newdata=testing, type="prob")

【问题讨论】:

  • 只是好奇,为什么需要同时定义A数据框和a~f向量?
  • @zyurnaidi 我这样做是因为我将二项式变量与其他列(f vs a-e)进行比较。我不知道任何其他方式来运行逻辑公式。有没有更好的办法?
  • 当然,我们只需要将数据设置为数据框,然后通过名称访问每一列。查看答案。

标签: r machine-learning logistic-regression r-caret training-data


【解决方案1】:

我不太清楚,但 A 是 (a,b,c,d,e,f) 的矩阵。所以你不需要创建两个对象。

# One-time setup: install pbkrtest, caret and e1071. These lines only need to
# be run when the packages are not installed yet.
install.packages("pbkrtest")
install.packages("caret")
install.packages("e1071", dependencies = TRUE)
# install.packages("caret", dependencies = TRUE)

# library() alone is enough: it attaches caret and stops with an error if the
# package is missing, whereas require() only returns FALSE with a warning.
library(caret)

# Build the data set as a data frame. The values are listed column by column
# (18 per column) and matrix() fills column-major by default, so each run of
# 18 numbers becomes one column.
A <- as.data.frame(
  matrix(
    c(
      # a
      64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946,
      # b
      66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627,
      # c
      68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755,
      # d
      69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500,
      # e
      73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500,
      # f (0/1 outcome)
      1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1
    ),
    nrow = 18,
    ncol = 6
  )
)

# Name the columns so the model formula can refer to them.
names(A) <- c("a", "b", "c", "d", "e", "f")

# Store the outcome as a factor with levels "0" and "1".
A$f <- factor(A$f)

# Pick about 60% of the rows for training with caret, based on the outcome
# column; list = FALSE returns the row indices as a matrix instead of a list.
Train <- createDataPartition(y = A[["f"]], p = 0.6, list = FALSE)

# The selected rows form the training set; all remaining rows are held out.
training <- A[Train, ]
testing <- A[-Train, ]

# Show how many rows ended up in the training set.
nrow(training)

要预测变量,您必须输入解释变量而不是要预测的变量

# Fit the logistic regression through caret (method = "glm" with a binomial
# family): f is the outcome, a-e are the predictors.
mod_fit <- train(
  f ~ a + b + c + d + e,
  data = training, method = "glm", family = "binomial"
)
mod_fit

# Exponentiate the coefficients of the underlying fit to express them as
# odds ratios, one per predictor.
exp(coef(mod_fit$finalModel))

# Predict from the explanatory variables only. The outcome column is dropped
# by name with a logical mask: unlike -which(...), this keeps every column
# (instead of selecting none) when no column is called "f".
predict(mod_fit, newdata = training[, colnames(training) != "f", drop = FALSE])
predict(mod_fit, newdata = testing[, colnames(testing) != "f", drop = FALSE])

【讨论】:

  • f 需要是一个因子(factor),因为您正在尝试进行逻辑回归。请查看产生的 warnings()。可以改用:train(as.factor(f) ~ ., data = training, method = "glm", family = "binomial")
  • 是的,这是真的,但这只是一个警告,预测是一样的。我纠正了。
【解决方案2】:

简短的回答:你不应该在传给 predict 的数据中包含被解释变量(响应变量),即 f。所以你应该这样做:

# Drop the last column (the outcome f) before predicting, so that newdata
# contains only the explanatory variables.
predict(mod_fit, newdata = training[, -ncol(training)])
predict(mod_fit, newdata = testing[, -ncol(testing)])

警告消息'newdata' had 11 rows but variables found have 18 rows 的问题是因为您使用整个数据集(18 个观察值)运行回归,但只使用其中的一部分(11 或 7 个)进行预测。

编辑:为了简化数据创建和glm 流程,我们可以这样做:

library(caret)
# Build the data set directly as a data frame: a-e are the numeric predictors
# and f is the binary outcome. f is stored as a factor (levels "0" and "1") so
# that caret treats the problem as classification rather than regression.
A <- data.frame(a = c(64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946),
                b = c(66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627),
                c = c(68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755),
                d = c(69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500),
                e = c(73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500),
                f = factor(c(1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1)))

# Partition on the outcome column of A. A bare `f` does not exist here because
# the data lives only inside the data frame.
Train <- createDataPartition(A$f, p = 0.6, list = FALSE)  # 60% of the rows are used for training
training <- A[Train, ]
testing <- A[-Train, ]

# Logistic regression through caret: f is the outcome, a-e are the predictors.
mod_fit <- train(
  f ~ a + b + c + d + e,
  data = training, method = "glm", family = "binomial"
)

【讨论】:

    【解决方案3】:

    我尝试运行逻辑回归模型。我写了这段代码:

    # Install and attach caret.
    install.packages("caret")
    library(caret)

    # Read the data set from the working directory.
    # NOTE(review): the hard-coded setwd() path is specific to one machine.
    setwd("C:\\Users\\BAHOZ\\Documents\\")
    D <- read.csv(file = "D.csv", header = TRUE)
    # NOTE(review): this second read replaces the D loaded on the previous line.
    D <- read.csv(file = "DataSet.csv", header = TRUE)
    names(D)

    # Fix the RNG seed before partitioning; 70% of the rows go to training and
    # the remaining rows to testing.
    set.seed(111134)
    Train <- createDataPartition(D$X, p = 0.7, list = FALSE)
    training <- D[Train, ]
    length(training$age)
    testing <- D[-Train, ]
    length(testing$age)

    # Logistic regression of X on the listed predictors through caret.
    # NOTE(review): if X is read in as a number, caret presumably fits a
    # regression; converting X to a factor first may be needed before
    # predict(..., type = "prob") can work -- confirm against the data.
    mod_fit <- train(
      X ~ age + gender + total.Bilirubin + direct.Bilirubin + total.proteins +
        albumin + A.G.ratio + SGPT + SGOT + Alkphos,
      data = training, method = "glm", family = "binomial"
    )
    summary(mod_fit)

    # Exponentiated coefficients of the underlying fit (odds ratios).
    exp(coef(mod_fit$finalModel))
    

    我收到了最后一条命令的这条消息:

         (Intercept)              age           gender  total.Bilirubin direct.Bilirubin   total.proteins          albumin        A.G.ratio 
      0.01475027       1.01596886       1.03857883       1.00022899       1.78188072       1.00065332       1.01380334       1.00115742 
            SGPT             SGOT          Alkphos 
      3.93498241       0.05616662      38.29760014 
    

    通过运行这个命令,我可以预测我的数据,

    # Default prediction on the held-out rows.
    predict(mod_fit , newdata=testing)
    

    但如果我设置 type="prob" 或 type="raw":

    # Same prediction with type = "prob"; this is the call reported to fail
    # with the dimnames error.
    predict(mod_fit , newdata=testing, type = "prob")
    

    出错了:

    dimnames(out) <- *vtmp* 中的错误:

    'dimnames' [2] 的长度不等于数组范围

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2018-04-09
      • 1970-01-01
      • 2016-03-24
      • 2020-02-24
      • 2020-10-06
      • 1970-01-01
      • 2016-06-18
      • 1970-01-01
      相关资源
      最近更新 更多