【问题标题】:Change the value of a column in R based on values of two other columns根据其他两列的值更改 R 中列的值
【发布时间】:2020-08-07 11:37:19
【问题描述】:

我正在尝试根据另外两列的值更改一列的值。 到目前为止,这让我有点头疼,我不确定这是否可能。

我的数据集如下所示。一列是时间,另外两列反映后代与亲本的关系。在某些特殊情况下(例如时间点 1),后代“D”第一次出现在数据集中:它在上一个时间点并不存在,却在当前时间点同时作为后代和亲本出现。

数据

# Sample data in dput() form: a 9-row data.frame with an integer `time`
# column and character `offspring` / `parent` columns. `parent` is NA for
# the three rows at time 0.
structure(list(time = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), 
    offspring = c("A", "B", "C", "A", "D", "E", "A", "F", "G"
    ), parent = c(NA, NA, NA, "A", "B", "D", "A", "A", "F")), class = "data.frame", row.names = c(NA, 
-9L))

我想要帮助的是

  1. 查找在某个时间点出现、但在前一个时间点不存在(不考虑时间点 0),并且在该时间点同时作为后代和亲本的所有个体,例如 D 和 F

  2. 找到它们之后,我想把它们作为后代出现的那个时间点减去 0.5

time  offspring  parent
 0       A        NA
 0       B        NA
 0       C        NA
 1       A        A
 0.5     D        B 
 1       E        D
 2       A        A
 1.5     F        A
 2       G        F

非常感谢您在此问题上的任何帮助或指导。

【问题讨论】:

  • 能否向我们展示您为达到预期结果所做的(未成功的)尝试,以便我们从那里着手?
  • 为什么 A 有时有亲本,有时没有?为什么 A 是它自己的亲本——这是一个无性繁殖的物种吗?这是数据的真实情况,还是因为个体缺少唯一标识符?

标签: r dataframe dplyr tidyverse


【解决方案1】:

创建 2 个数据框,找出每只动物作为父母和后代的第一次出现。
将这两个数据框合并,找出时间和动物都相同的记录,然后更新原始数据框中对应行的时间。

# Sample data: one row per (time, offspring) observation and its parent.
df <- structure(
  list(
    time = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L),
    offspring = c("A", "B", "C", "A", "D", "E", "A", "F", "G"),
    parent = c(NA, NA, NA, "A", "B", "D", "A", "A", "F")
  ),
  class = "data.frame",
  row.names = c(NA, -9L)
)


library(dplyr)

# First row (in data order) in which each individual is recorded as a parent.
# Rows with any missing value are dropped first, so NA parents never match.
parents <- df %>%
  filter(complete.cases(.)) %>%
  distinct(parent, .keep_all = TRUE) %>%
  select(time, parent)

# First row (in data order) in which each individual is recorded as offspring.
offsprings <- df %>%
  distinct(offspring, .keep_all = TRUE) %>%
  select(time, offspring)

# Individuals whose first appearance as offspring and first appearance as
# parent fall on the same time point. Joining on both the time and the
# individual's name avoids building the per-time cross product (and the
# many-to-many join warning) that a join on `time` alone produces.
combined <- inner_join(
  offsprings,
  parents,
  by = c("time", "offspring" = "parent")
)

# Update the times: every df row matching one of those (time, offspring)
# pairs is moved back by half a time step; all other rows get a shift of 0.
df <- df %>%
  left_join(mutate(combined, shift = 0.5), by = c("time", "offspring")) %>%
  mutate(time = time - coalesce(shift, 0)) %>%
  select(-shift)
df

【讨论】:

  • 亲爱的 Dave,看起来棒极了。我原本没指望能在周五下班之前有任何进展。谢谢你的帮忙,我非常感激。
【解决方案2】:

如果需要,可以在 data.table 中:

library(data.table)

# Sample data: one row per (time, offspring) observation and its parent.
DT <- data.table(
  time = c(0, 0, 0, 1, 1, 1, 2, 2, 2),
  offspring = c("A", "B", "C", "A", "D", "E", "A", "F", "G"),
  parent = c(NA, NA, NA, "A", "B", "D", "A", "A", "F")
)

# Move a row back by half a time step when, all at once:
#   * it is not at time 0,
#   * its offspring also occurs somewhere in the `parent` column, and
#   * this is the first row in which that offspring appears
#     (`!duplicated()` is TRUE only for first occurrences).
# A single vectorised `:=` updates the matching rows in place, instead of
# looping over rows and rescanning all earlier rows on every iteration.
DT[
  time != 0 & offspring %chin% parent & !duplicated(offspring),
  time := time - 0.5
]

> DT
   time offspring parent
1:  0.0         A   <NA>
2:  0.0         B   <NA>
3:  0.0         C   <NA>
4:  1.0         A      A
5:  0.5         D      B
6:  1.0         E      D
7:  2.0         A      A
8:  1.5         F      A
9:  2.0         G      F

使用 dplyr:

library(dplyr)
library(tibble)

# Sample data: one row per (time, offspring) observation and its parent.
tbl <- tibble(
  time = c(0, 0, 0, 1, 1, 1, 2, 2, 2),
  offspring = c("A", "B", "C", "A", "D", "E", "A", "F", "G"),
  parent = c(NA, NA, NA, "A", "B", "D", "A", "A", "F")
)

# Move a row back by half a time step when it is not at time 0, its
# offspring also occurs somewhere in the `parent` column, and this is the
# first row in which that offspring appears (`!duplicated()` is TRUE only
# for first occurrences). One vectorised mutate() replaces the row-by-row
# loop that rescanned all earlier rows on every iteration.
tbl <- tbl %>%
  mutate(
    time = if_else(
      time != 0 & offspring %in% parent & !duplicated(offspring),
      time - 0.5,
      time
    )
  )

> tbl
# A tibble: 9 x 3
   time offspring parent
  <dbl> <chr>     <chr> 
1   0   A         NA    
2   0   B         NA    
3   0   C         NA    
4   1   A         A     
5   0.5 D         B     
6   1   E         D     
7   2   A         A     
8   1.5 F         A     
9   2   G         F   

【讨论】:

    【解决方案3】:

    我的解决方案可能不是最简洁的,但我能够使其工作,并且它可以推广到更大的数据集。我确信有办法改进这一点,所以我很想看看其他人想出什么。首先,我遇到了下标为 0 的问题,所以我先给时间列加上 2,最后再把它减掉。

    我的想法是遍历各行,找到在当年(第 0 年之后)是后代、但在前一年不是后代的个体。然后我检查其中哪些个体在当年同时也是亲本。我把这些个体在该时间段内作为后代的行号汇总到一个向量中,因为稍后要删除这些行。接着,我用 time-.5、该后代及其亲本创建一个新行,并把这些新行汇总到一个新的数据框中,用来替换被删除的行。

    因为每个时间戳都有重复,所以我使要删除的行向量和要添加的行的 df 唯一。然后我对原始数据框进行删除和添加,并让数据类型一致。

        # Sample data: one row per (time, offspring) observation and its parent.
        # stringsAsFactors = FALSE keeps the name columns as character vectors
        # on every R version, so the set operations below compare plain strings.
        parent_offspring <- data.frame(
          time = c(rep(0, 3), rep(1, 3), rep(2, 3)),
          offspring = c("A", "B", "C", "A", "D", "E", "A", "F", "G"),
          parent = c(NA, NA, NA, "A", "B", "D", "A", "A", "F"),
          stringsAsFactors = FALSE
        )

        po <- parent_offspring
        # Work on a time column shifted by 2; the shift is undone at the end.
        # The comparisons below only use differences between time values, so
        # the shift does not change which rows are selected.
        po$time <- po$time + 2
        # Row indices of offspring rows that must be moved back by half a step.
        delete_vec <- integer(0)

        for (i in seq_along(po$time)) {
          q <- po$time[[i]] # time value of this row
          rows_curr <- which(po$time == q) # rows sharing that time value
          offspring_curr <- po$offspring[rows_curr] # offspring at that time
          offspring_prev <- po$offspring[which(po$time == (q - 1))] # at time - 1

          # Offspring at this time that were not offspring at time - 1.
          f <- offspring_curr[!(offspring_curr %in% offspring_prev)]
          if (length(f) == 0) {
            next
          }

          # Of those, the individuals that are also parents at this time.
          parents_curr <- po$parent[rows_curr]
          parent_and_offspring_curr <- intersect(f, parents_curr)
          if (length(parent_and_offspring_curr) == 0) {
            next
          }

          # Offspring rows at this time occupied by such an individual.
          # %in% (not ==) so that several qualifying individuals at the same
          # time point are all matched instead of being recycled against
          # each other.
          g <- rows_curr[offspring_curr %in% parent_and_offspring_curr]
          delete_vec <- c(delete_vec, g)
        }

        # Every row of a time point reports the same rows, so de-duplicate.
        delete_vec <- unique(delete_vec)

        # Replacement rows: the same offspring/parent pairs at time - 0.5.
        # Subsetting the data frame keeps `time` numeric and the column names
        # intact, so no type or name repair is needed afterwards.
        df_to_add <- po[delete_vec, , drop = FALSE]
        df_to_add$time <- df_to_add$time - 0.5

        # Drop the original rows and append the replacements. A logical mask is
        # used because negative indexing with an empty vector would drop every
        # row when nothing qualifies.
        keep <- !(seq_len(nrow(po)) %in% delete_vec)
        po <- rbind(po[keep, , drop = FALSE], df_to_add)
        rownames(po) <- NULL # renumber rows 1..n
        po$time <- po$time - 2 # back to the original time values

        po
    
            time offspring parent
          1  0.0         A   <NA>
          2  0.0         B   <NA>
          3  0.0         C   <NA>
          4  1.0         A      A
          5  1.0         E      D
          6  2.0         A      A
          7  2.0         G      F
          8  0.5         D      B
          9  1.5         F      A
    

    然后您可以使用 dplyr::arrange 按时间升序排列行

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2022-07-05
      • 2022-07-28
      • 2020-07-03
      • 1970-01-01
      • 2018-10-15
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多