【问题标题】:Different results with varying order of nested ifelse conditions(嵌套 ifelse 条件的顺序不同导致结果不同)
【发布时间】:2016-09-11 10:27:26
【问题描述】:

我正在尝试在数据框中创建一个新列,该列将包含取决于同一数据框中多个其他列中的条件的信息。我的研究涉及量化冠状动脉(心脏动脉)闭塞的严重程度。

示例数据框x 是:

structure(list(Study_number = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 
3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 
9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 
13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 
17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 
21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 
25, 25, 26, 26, 26, 26, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 
29, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 33, 
34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 
38, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 
42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 
46, 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 
50, 51, 51, 51, 51, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 
55, 55, 55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 
59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 63, 63, 63, 
63, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66), Vessel = c(1, 2, 
3, 4, 1, 2, 3, 4, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 
1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 
4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 
1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 
2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 1, 2, 3, 
4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 1, 
2, 3, 4, 2, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 
2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 
3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 
4, 1, 2, 3, 4, 1, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 3, 4, 1, 2, 
3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 
4, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3), Segment = c(3, 
9, 7, 8, 2, 9, 7, 8, 9, 7, 8, 3, 9, 6, 11, 3, 9, 6, 8, 2, 9, 
9, 15, 2, 9, 7, 8, 2, 9, 6, 8, 2, 9, 2, 9, 7, 8, 3, 9, 9, 11, 
1, 9, 7, 8, 2, 9, 6, 8, 2, 9, 7, 11, 1, 9, 6, 12, 2, 9, 7, 11, 
2, 9, 6, 15, 2, 9, 6, 8, 2, 9, 7, 8, 3, 9, 7, 11, 2, 9, 6, 11, 
2, 9, 7, 8, 1, 9, 6, 11, 2, 9, 8, 11, 2, 9, 7, 8, 2, 9, 7, 11, 
9, 7, 11, 2, 9, 6, 11, 3, 9, 7, 11, 2, 9, 6, 11, 2, 9, 7, 8, 
1, 9, 6, 11, 4, 9, 7, 3, 9, 7, 8, 9, 2, 9, 7, 8, 2, 9, 7, 11, 
1, 9, 7, 14, 2, 9, 7, 11, 2, 9, 6, 12, 2, 9, 6, 11, 2, 9, 7, 
8, 2, 9, 9, 8, 2, 9, 7, 12, 2, 9, 7, 11, 1, 9, 7, 8, 2, 9, 7, 
15, 2, 9, 6, 11, 2, 9, 6, 8, 3, 9, 10, 14, 2, 9, 6, 11, 1, 6, 
11, 1, 9, 6, 8, 1, 9, 7, 11, 2, 8, 12, 2, 9, 7, 8, 1, 9, 7, 11, 
0, 9, 6, 12, 1, 9, 7, 8, 0, 9, 6, 11, 0, 9, 7, 8, 9, 7, 3, 9, 
7, 8, 2, 9, 7, 11, 21, 9, 6, 11, 9, 7), Severity = c(0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 
1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 
0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("Study_number", 
"Vessel", "Segment", "Severity"), row.names = c(NA, -250L), class = c("tbl_df", 
"tbl", "data.frame"))

实际的数据框是这样的:

    Study_number Vessel Segment Severity
          <dbl>  <dbl>   <dbl>    <dbl>
1             1      1       3        0
2             1      2       9        0
3             1      3       7        0
4             1      4       8        0
5             2      1       2        0
6             2      2       9        0
7             2      3       7        0
8             2      4       8        0
9             3      2       9        0
10            3      3       7        1
  • Study_number = 参与者 ID
  • Vessel = 血管 ID(1 到 4)
  • Segment = 该血管的节段 ID
  • Severity = 该血管的疾病是否严重(0 = 否,1 = 是)

每个参与者通常有 4 条血管(1-4),不过有些参与者可能只有 3 条。我想要创建一个名为“Overall_severe_disease”的新列,当满足以下任一条件时,它应标记该参与者患有严重疾病。

  1. 当血管 2 患有严重疾病时(即同一行中 Vessel == 2 且 Severity == 1);或

  2. 当血管 3 的第 6 或第 7 节段患有严重疾病时(即同一行中 Vessel == 3、Segment 为 6 或 7 且 Severity == 1),并且至少还有一条其他血管患有严重疾病(即该参与者 Severity 列的总和 == 2);或

  3. 当 3 条或更多血管出现严重疾病时(即该参与者 Severity 列的总和 >= 3)。

这就是我试图解决这个问题的方式。首先通过将它们粘贴在一起来创建一个 Vessel-Severity 列。

x$Vessel_Severity <- paste(x$Vessel, x$Severity, sep = '-')

新的数据框将如下所示:

  Study_number Vessel Segment Severity Vessel_Severity
         <dbl>  <dbl>   <dbl>    <dbl>           <chr>
1            1      1       3        0             1-0
2            1      2       9        0             2-0
3            1      3       7        0             3-0
4            1      4       8        0             4-0
5            2      1       2        0             1-0
6            2      2       9        0             2-0

然后我使用 plyr 包中的以下 ddply 函数将嵌套的 ifelse 条件应用于每个参与者。

# plyr provides ddply(), used here to run transform() once per Study_number group.
library(plyr)
# Flag each row with 1 when one of the nested conditions holds, otherwise 0.
# The outermost test is built from Vessel_Severity and Segment, so it has one
# element per row of the group; ifelse() returns a result shaped like its test,
# which means every row receives its own value. The group-level scalar
# sum(Severity) is simply recycled across the rows of the group.
x <- ddply(x, 'Study_number', transform,
Overall_severe_disease = ifelse(Vessel_Severity == '3-1' &  Segment %in% c(6,7) & sum(Severity) == 2 , 1,
       ifelse(Vessel_Severity == '2-1', 1,
       ifelse(sum(Severity) >= 3, 1, 0))))

之后,我使用以下函数将“是”或“否”分配给“Overall_severe_disease”列(如果任何行至少有一个“1”,则在参与者级别将其分配为“是”)

x <- ddply(x, 'Study_number', transform, Overall_severe_disease = ifelse(sum(Overall_severe_disease) >= 1, 'Yes', 'No'))

此方法有效,它得出共有 9 名不同的参与者患有“Overall_severe_disease”

length(unique(x$Study_number[x$Overall_severe_disease=='Yes']))

#9

但是,如果我更改 ifelse 的顺序,把最后一个条件 ifelse(sum(Severity) >= 3, ...) 放在嵌套 ifelse 语句的最前面,那么 ddply 似乎不会再应用其后的其余条件,我得到的结果被严重低估(只有 5 名不同的参与者,而不是 9 名)

# Same three conditions, but with the group-level test moved outermost.
# NOTE: within a group, sum(Severity) >= 3 has length 1, and ifelse() returns
# a value with the same shape as its test. The whole expression therefore
# produces a single value per group: when the test is FALSE, only the first
# element of the nested (per-row) result is kept, and transform() recycles
# that one value to every row of the group. The Vessel_Severity conditions on
# rows after the first are never consulted, so fewer participants are flagged
# than with the row-level test outermost.
x <- ddply(x, 'Study_number', transform,
           Overall_severe_disease = ifelse(sum(Severity) >= 3, 1,
                                    ifelse(Vessel_Severity == '2-1', 1,
                                   ifelse(Vessel_Severity == '3-1' &  Segment %in% c(6,7) & sum(Severity) == 2 , 1 , 0))))

x <- ddply(x, 'Study_number', transform, Overall_severe_disease = ifelse(sum(Overall_severe_disease) >= 1, 'Yes', 'No'))

length(unique(x$Study_number[x$Overall_severe_disease=='Yes']))

#5

我对这种行为感到困惑。我将不胜感激一些建议和澄清。

【问题讨论】:

  • 谢谢,塞德里克。我已经使用正确的赋值运算符进行了编辑。

标签: r if-statement


【解决方案1】:

在你的例子中,你应该把下面的第一行(误用了 `->`)替换为第二行(使用 `<-`):

    x$Vessel_Severity -> paste(x$Vessel, x$Severity, sep = '-')

    x$Vessel_Severity <- paste(x$Vessel, x$Severity, sep = '-')

我尝试重现了你的例子,你得到的答案难道不是 9 和 5 吗?

    # first example
    x$Overall_severe_disease<-0
    x <- ddply(x, 'Study_number', transform,
            Overall_severe_disease = ifelse(Vessel_Severity == '3-1' &  Segment %in% c(6,7) & sum(Severity) == 2 , 1, 0))
    sum(x$Overall_severe_disease) #4

    x <- ddply(x, 'Study_number', transform,
    Overall_severe_disease = ifelse(Vessel_Severity == '3-1' &  Segment %in% c(6,7) & sum(Severity) == 2 , 1, ifelse(Vessel_Severity == '2-1', 1,0)))
    sum(x$Overall_severe_disease) #4

    x <- ddply(x, 'Study_number', transform,
            Overall_severe_disease = ifelse(Vessel_Severity == '3-1' &  Segment %in% c(6,7) & sum(Severity) == 2 , 1,
                    ifelse(Vessel_Severity == '2-1', 1,
                            ifelse(sum(Severity) >= 3, 1, 0))))
    sum(x$Overall_severe_disease) #24
    res<-tapply(x$Overall_severe_disease,x$Study_number,sum)
    length(res[res>0])#9
    x <- ddply(x, 'Study_number', transform, Overall_severe_disease = ifelse(sum(Overall_severe_disease) >= 1, 'Yes', 'No'))
    length(unique(x$Study_number[x$Overall_severe_disease=='Yes'])) #9

    # second example

    x <- ddply(x, 'Study_number', transform,
        Overall_severe_disease = ifelse(sum(Severity) >= 3, 1, 0))
    sum(x$Overall_severe_disease) #20 

    x <- ddply(x, 'Study_number', transform,
            Overall_severe_disease = ifelse(sum(Severity) >= 3, 1, ifelse(Vessel_Severity == '2-1', 1,0)))
    sum(x$Overall_severe_disease) #20 

    x <- ddply(x, 'Study_number', transform,
            Overall_severe_disease = ifelse(sum(Severity) >= 3, 1,
                    ifelse(Vessel_Severity == '2-1', 1,
                            ifelse(Vessel_Severity == '3-1' &  Segment %in% c(6,7) & sum(Severity) == 2 , 1 , 0))))
    sum(x$Overall_severe_disease) #20 
    res<-tapply(x$Overall_severe_disease,x$Study_number,sum)
    length(res[res>0])#5
    x <- ddply(x, 'Study_number', transform, Overall_severe_disease = ifelse(sum(Overall_severe_disease) >= 1, 'Yes', 'No'))
    length(unique(x$Study_number[x$Overall_severe_disease=='Yes'])) #5

因此,在第二个示例中,与条件 ifelse(Vessel_Severity == '3-1' & Segment %in% c(6,7) & sum(Severity) == 2, 1, 0) 对应的 4 个结果被丢掉了。这是个好问题。

【讨论】:

  • 你是绝对正确的。应该是 9 和 5。我包含的可重现数据集是从更大的数据集中截断的,这就是我最初提到的数字不同的原因。
  • 你的问题很好。你可以换一种做法:先用 tapply 按参与者对 Severity 求和,然后把结果合并回原始数据框。这样就能知道每位参与者的 sum(Severity) 是等于 2,还是大于或等于 3。不过关键是,这并没有回答你的问题。
猜你喜欢
  • 2016-03-17
  • 2021-12-15
  • 2016-10-13
  • 1970-01-01
  • 1970-01-01
  • 2020-01-06
  • 1970-01-01
  • 2022-01-26
  • 1970-01-01
相关资源
最近更新 更多