【问题标题】:Splitting a column into two before and after the first appearance of a delimiter "/" [duplicate]在第一次出现分隔符“/”之前和之后将一列拆分为两列[重复]
【发布时间】:2020-04-15 20:27:41
【问题描述】:

我有附加的数据框。

数据

# Example data: a data.frame literal with six rows (s3a-s3f). The `allele`
# column holds "/"-separated strings; the last row uses the placeholder
# string "NOT FOUND" in its character columns.
structure(list(associated_gene = c(NA, NA, "A4GALT", NA, NA, 
   "NOT FOUND"), chr_name = c("22", "22", "22", "22", "22", "NOT FOUND"
   ), chrom_start = c(42693910L, 42693843L, 42693321L, 42693665L, 
   42693653L, 0L), allele = c("G/A/T", "T/C", "G/C", "C/T", "G/A/T", 
   "NOT FOUND"), refsnp_id = c("rs778598915", "rs11541159", "rs397514502", 
   "rs762949801", "rs776304817", "NOT FOUND")), row.names = c("s3a", 
   "s3b", "s3c", "s3d", "s3e", "s3f"), class = "data.frame")
    associated_gene  chr_name chrom_start    allele   refsnp_id
s3a            <NA>        22    42693910     G/A/T rs778598915
s3b            <NA>        22    42693843       T/C  rs11541159
s3c          A4GALT        22    42693321       G/C rs397514502
s3d            <NA>        22    42693665       C/T rs762949801
s3e            <NA>        22    42693653     G/A/T rs776304817
s3f       NOT FOUND NOT FOUND           0 NOT FOUND   NOT FOUND

我想在第一个“/”处将 allele 列拆分成两部分(Ref 和 Var),并将这两列插入到 chrom_start 和 refsnp_id 两列之间

理想的输出是:

     associated_gene  chr_name chrom_start   Ref   Var   refsnp_id
s3a            <NA>         22    42693910     G   A/T rs778598915
s3b            <NA>         22    42693843     T     C  rs11541159

我不知道我是否可以加载 awk,但在 bash 中我会这样做:

cat allele | awk -F"/" '{print $1 "\t" $2}'

【问题讨论】:

    标签: r string dataframe dplyr stringr


    【解决方案1】:

    我们可以使用 tidyr 中的 extract:从字符串开头 (^) 捕获一个或多个不是 / 的字符 ([^/]+),其后紧跟一个 /,然后捕获其余所有字符 ((.*))

    library(tidyr)
    library(dplyr)
    # Split `allele` at its first "/": group 1 captures the leading run of
    # non-"/" characters, group 2 captures everything after that slash.
    # Rows without a "/" do not match the regex and get NA in both columns.
    extract(
      df1,
      col = allele,
      into = c("Ref", "Var"),
      regex = "^([^/]+)/(.*)"
    )
    #   associated_gene  chr_name chrom_start  Ref  Var   refsnp_id
    #s3a            <NA>        22    42693910    G  A/T rs778598915
    #s3b            <NA>        22    42693843    T    C  rs11541159
    #s3c          A4GALT        22    42693321    G    C rs397514502
    #s3d            <NA>        22    42693665    C    T rs762949801
    #s3e            <NA>        22    42693653    G  A/T rs776304817
    #s3f       NOT FOUND NOT FOUND           0 <NA> <NA>   NOT FOUND
    

    或者另一个选项是str_split

    library(stringr)
    # Split each allele at the first "/" only (n = 2) into a two-column
    # character matrix. str_split_fixed() pads values that contain no "/"
    # with "" instead of letting rbind() silently recycle the single piece
    # into both columns.
    str_split_fixed(df1$allele, "/", 2)
    

    或者使用sub 创建一个分隔符并在base R 中使用read.table/read.csv 读取

    # Turn only the first "/" into ";" so read.table() sees at most two fields.
    # colClasses = "character" disables type guessing (a column holding only
    # "T" would otherwise be read as logical TRUE); col.names fixes the column
    # count at two even if none of the first five rows contains a "/".
    # "NOT FOUND" and empty (filled) fields become NA.
    df1[c("Ref", "Var")] <- read.table(
      text = sub("/", ";", df1$allele, fixed = TRUE),
      header = FALSE, sep = ";", fill = TRUE,
      col.names = c("Ref", "Var"), colClasses = "character",
      na.strings = c("NOT FOUND", ""),
      stringsAsFactors = FALSE
    )
    

    【讨论】:

      【解决方案2】:

      根据您希望最后一行输出的方式,您可以使用以下选项之一:

      使用tidyr::separate

      library(tidyr)
      # Break `allele` on "/" into Ref and Var. extra = "merge" limits the split
      # to two pieces, so anything after the first "/" stays together in Var;
      # fill = "right" pads Var with NA when a value has no "/".
      separate(
        data = df,
        col = allele,
        into = c("Ref", "Var"),
        sep = "/",
        extra = "merge",
        fill = "right"
      )
      
      #   associated_gene  chr_name chrom_start        Ref  Var   refsnp_id
      #s3a            <NA>        22    42693910         G  A/T rs778598915
      #s3b            <NA>        22    42693843         T    C  rs11541159
      #s3c          A4GALT        22    42693321         G    C rs397514502
      #s3d            <NA>        22    42693665         C    T rs762949801
      #s3e            <NA>        22    42693653         G  A/T rs776304817
      #s3f       NOT FOUND NOT FOUND           0 NOT FOUND <NA>   NOT FOUND
      

      或者使用 stringr::str_match

      # Column 1 of the str_match() result is the full match; columns 2 and 3
      # are the two capture groups (text before / after the first "/").
      stringr::str_match(string = df$allele, pattern = "(.*?)/(.*)")[, 2:3]
      
      #     [,1] [,2] 
      #[1,] "G"  "A/T"
      #[2,] "T"  "C"  
      #[3,] "G"  "C"  
      #[4,] "C"  "T"  
      #[5,] "G"  "A/T"
      #[6,] NA   NA   
      

      【讨论】:

        【解决方案3】:

        另一种解决方案是使用“stringr”包:

        # Install stringr only when it is missing, so re-running the script does
        # not reinstall the package every time.
        if (!requireNamespace("stringr", quietly = TRUE)) {
          install.packages("stringr")
        }
        library(stringr)
        

        数据:

        # Example data: six rows named s3a-s3f; the last row carries the
        # "NOT FOUND" placeholder in its character columns.
        df <- data.frame(
          associated_gene = c(NA, NA, "A4GALT", NA, NA, "NOT FOUND"),
          chr_name = c("22", "22", "22", "22", "22", "NOT FOUND"),
          chrom_start = c(
            42693910L, 42693843L, 42693321L, 42693665L, 42693653L, 0L
          ),
          allele = c("G/A/T", "T/C", "G/C", "C/T", "G/A/T", "NOT FOUND"),
          refsnp_id = c(
            "rs778598915", "rs11541159", "rs397514502",
            "rs762949801", "rs776304817", "NOT FOUND"
          ),
          row.names = c("s3a", "s3b", "s3c", "s3d", "s3e", "s3f"),
          stringsAsFactors = FALSE
        )
        

        创建一个包含两个新变量的新 df:

        # Ref: everything before the first "/"; Var: everything after it.
        # Anchoring on the delimiter rather than on \\w keeps multi-character
        # and non-word alleles (e.g. "AT/G", "-/A") intact; values without a
        # "/" yield NA in both columns.
        new_df <- data.frame(
          Ref = str_extract(df$allele, "^[^/]+(?=/)"),
          Var = str_extract(df$allele, "(?<=/).+"),
          stringsAsFactors = FALSE
        )
        new_df
           Ref  Var
        1    G  A/T
        2    T    C
        3    G    C
        4    C    T
        5    G  A/T
        6 <NA> <NA>
        

        将 new_df 与 df 按列合并(去掉现已不再需要的 allele 列):

        # Put Ref/Var where `allele` used to be (between chrom_start and
        # refsnp_id), locating the column by name instead of by the index 4.
        allele_pos <- match("allele", names(df))
        cbind(df[seq_len(allele_pos - 1L)], new_df, df[-seq_len(allele_pos)])
            associated_gene  chr_name chrom_start  Ref  Var   refsnp_id
        s3a            <NA>        22    42693910    G  A/T rs778598915
        s3b            <NA>        22    42693843    T    C  rs11541159
        s3c          A4GALT        22    42693321    G    C rs397514502
        s3d            <NA>        22    42693665    C    T rs762949801
        s3e            <NA>        22    42693653    G  A/T rs776304817
        s3f       NOT FOUND NOT FOUND           0 <NA> <NA>   NOT FOUND
        

        【讨论】:

          猜你喜欢
          • 2018-06-14
          • 1970-01-01
          • 2015-05-14
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          相关资源
          最近更新 更多