【问题标题】:Arabidopsis Gene ID Conversion (BioMart, CLC Genomics Workbench Output)拟南芥基因 ID 转换(BioMart,CLC Genomics Workbench 输出)
【发布时间】:2019-05-12 13:49:09
【问题描述】:

我有一份由 CLC Genomics Workbench 生成的拟南芥 RNA-seq 读数(read count)输出。基因列表中混有基因名称(如“TRY”、“TMM”、“SVP”、“FLC”)和基因 ID(如“AT1G01390”、“AT1G01310”、“AT1G01240”)。我想把它们全部转换为基因名称,以便用一个做 GO 术语分析的 R 包来处理(该包似乎无法识别 AT1G01390 这样的 ID)。

当我使用 biomaRt 的 getBM() 函数时,它返回的基因比我正在读入的基因列表要少得多。 CLC 的原始列表包含所有拟南芥基因 (27,655),而 getBM() 的输出通常包含 12,085 个或更少的基因名称。

以前有人成功完成过这种类型的转换吗?

提前致谢!

我尝试了各种类型的属性,但都没有奏效。

# Data loading and conversions; creation of the meta matrix / design.
    # The reads file was created in CLC Genomics Workbench, then the reads
      # column was copied and pasted for each sample.

  # Read the tab-separated table into a matrix; the first column supplies the row names.
  reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
  # Read the sample metadata table; the file is declared as UTF-16LE encoded.
  meta <- read.table("metatest4.txt", header = TRUE, fileEncoding= "UTF-16LE")


# Connect to the "plants_mart" BioMart hosted at plants.ensembl.org.
mart = useMart(biomart="plants_mart",host="plants.ensembl.org")
  # List the datasets of that mart (this builds a second mart object just for the listing).
  listDatasets(useMart(biomart="plants_mart",host="plants.ensembl.org"))  
  # Select the "athaliana_eg_gene" dataset on the mart created above.
  ensembl = useDataset("athaliana_eg_gene",mart= mart)

  # The gene identifiers are the row names of the reads matrix.
  genes <- row.names(reads)

  # Request the external_gene_name attribute for the identifiers in `genes`.
  # NOTE(review): no `filters` argument is supplied, so it is unclear which
  # field `values` would be matched against - confirm against the getBM() docs.
  test1 <- getBM(attributes='external_gene_name', 
        values = genes, 
        mart = ensembl)

【问题讨论】:

    标签: r biomart


    【解决方案1】:

    好的,我找到了一个解决这个问题的方法,至少在我的场景中是这样。

    我使用的 gmt 文件和 fgsea 包只能识别基因符号(例如“TRY”)或 Entrez ID,所以我编写了一个函数,把手头所有的基因标识符转换为基因符号或 Entrez ID。代码如下:

      # Load the tab-separated read table as a matrix; the first column holds
      # the gene identifiers and becomes the row names.
      reads <- as.matrix(
        read.csv(
          "genereads_ONLY4.txt",
          header = TRUE,
          row.names = 1,
          sep = "\t"
        )
      )

    # Gene identifiers, in row order.
    genes <- rownames(reads)

    # Number of pattern matches across all identifiers before conversion
    # (printed at top level).
    sum(
      lengths(
        regmatches(
          genes,
          gregexpr("\\AT[0-9]", genes, ignore.case = TRUE)
        )
      )
    )

    #genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")
    
    # Convert identifier-like entries of a gene vector to gene symbols,
    # falling back to Entrez IDs.
    #
    # Every element of `genes` that contains "AT" followed by a digit is looked
    # up with getSYMBOL(); if no symbol is found, getEG() is tried; if neither
    # yields a value the element is left unchanged. Elements that do not look
    # like an identifier are returned untouched.
    #
    # Args:
    #   genes: character vector of gene names / identifiers.
    #
    # Returns:
    #   A character vector of the same length and order as `genes`.
    #
    # Changes relative to the earlier loop-based version:
    #   * replacement is by exact element match, not sub() with the identifier
    #     as a regex, so an entry that merely contains another identifier
    #     (e.g. "AT2G46410.1") is no longer partially rewritten;
    #   * each distinct identifier is looked up once instead of twice, and the
    #     whole vector is no longer rescanned per gene;
    #   * one annotation name is used for every lookup.
    IDconvert <- function(genes) {
      if (length(genes) == 0L) {
        return(genes)
      }
      if (!is.character(genes)) {
        genes <- as.character(genes)
      }

      annotation <- "org.At.tair"

      # First value returned by an annotation lookup for one id, or NA.
      first_hit <- function(lookup, id) {
        hit <- lookup(id, data = annotation)
        if (length(hit) == 0L) {
          return(NA_character_)
        }
        as.character(as.list(hit)[[1L]])[1L]
      }

      # Symbol if available, otherwise Entrez ID, otherwise the id itself.
      convert_one <- function(id) {
        symbol <- first_hit(getSYMBOL, id)
        if (!is.na(symbol)) {
          return(symbol)
        }
        entrez <- first_hit(getEG, id)
        if (!is.na(entrez)) {
          return(entrez)
        }
        id
      }

      # Same candidate test as before: "AT" followed by a digit, anywhere.
      looks_like_id <- grepl("AT[0-9]", genes)
      ids <- unique(genes[looks_like_id])
      if (length(ids) == 0L) {
        return(genes)
      }

      converted <- vapply(ids, convert_one, character(1), USE.NAMES = FALSE)
      genes[looks_like_id] <- converted[match(genes[looks_like_id], ids)]
      genes
    }
    
    
    # Convert the identifiers, report how many pattern matches remain
    # (printed at top level), then relabel the rows of the reads matrix.
    genes2 <- IDconvert(genes = genes)

    sum(
      lengths(
        regmatches(
          genes2,
          gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE)
        )
      )
    )

    rownames(reads) <- genes2
    
    
    # Read two gene-set files in GMT format and concatenate the results into
    # `gmt.combo`, which is passed as `pathways` to the fgsea() calls below.
    gmt <- read.gmt("GSEA_BIO.gmt")
    gmt.ids <- read.gmt("IB_BIO_GMT.gmt")                  
    gmt.combo <- c(gmt, gmt.ids)
    
    # Stage 3 GO terms ----
    # Rank genes by the `stat` column of `sub.break3`, run fgsea() against the
    # combined gene sets, and keep the pathways with pval < 0.05.

    names3 <- row.names(sub.break3)
    sub.break3$names <- names3
    ranks <- sub.break3$stat
    names(ranks) <- sub.break3$names
    # TRUE instead of T: `T` is an ordinary variable and can be reassigned.
    sub.break3.rank <- sort(ranks, decreasing = TRUE)

    fgseaRes3 <- fgsea(
      pathways = gmt.combo,
      stats = sub.break3.rank,
      minSize = 5,
      maxSize = 500,
      nperm = 100000
    )
    # Threshold on the `pval` column; the bare column name inside `[` assumes
    # fgsea() returned a data.table - TODO confirm.
    fgsea3.sig <- fgseaRes3[pval < 0.05]
    pathways.stg3 <- fgsea3.sig$pathway
    
    
    
    # Stage 1 GO terms ----
    # Rank genes by the `stat` column of `sub.break1`, run fgsea() against the
    # combined gene sets, and keep the pathways with pval < 0.05.

    names1 <- row.names(sub.break1)
    sub.break1$names <- names1
    ranks <- sub.break1$stat
    names(ranks) <- sub.break1$names
    # TRUE instead of T: `T` is an ordinary variable and can be reassigned.
    sub.break1.rank <- sort(ranks, decreasing = TRUE)

    fgseaRes1 <- fgsea(
      pathways = gmt.combo,
      stats = sub.break1.rank,
      minSize = 5,
      maxSize = 500,
      nperm = 100000
    )
    # Threshold on the `pval` column; the bare column name inside `[` assumes
    # fgsea() returned a data.table - TODO confirm.
    fgsea1.sig <- fgseaRes1[pval < 0.05]
    pathways.stg1 <- fgsea1.sig$pathway
    
    
    # Stage 2 GO terms ----
    # Rank genes by the `stat` column of `sub.break2`, run fgsea() against the
    # combined gene sets, and keep the pathways with pval < 0.05.

    names2 <- row.names(sub.break2)
    sub.break2$names <- names2
    ranks <- sub.break2$stat
    names(ranks) <- sub.break2$names
    # TRUE instead of T: `T` is an ordinary variable and can be reassigned.
    sub.break2.rank <- sort(ranks, decreasing = TRUE)

    fgseaRes2 <- fgsea(
      pathways = gmt.combo,
      stats = sub.break2.rank,
      minSize = 5,
      maxSize = 500,
      nperm = 100000
    )
    # Threshold on the `pval` column; the bare column name inside `[` assumes
    # fgsea() returned a data.table - TODO confirm.
    fgsea2.sig <- fgseaRes2[pval < 0.05]
    pathways.stg2 <- fgsea2.sig$pathway
    
    
    
    # Stage 4 GO terms ----
    # Rank genes by the `stat` column of `sub.break4`, run fgsea() against the
    # combined gene sets, and keep the pathways with pval < 0.05.

    names4 <- row.names(sub.break4)
    sub.break4$names <- names4
    ranks <- sub.break4$stat
    names(ranks) <- sub.break4$names
    # TRUE instead of T: `T` is an ordinary variable and can be reassigned.
    sub.break4.rank <- sort(ranks, decreasing = TRUE)

    fgseaRes4 <- fgsea(
      pathways = gmt.combo,
      stats = sub.break4.rank,
      minSize = 5,
      maxSize = 500,
      nperm = 100000
    )
    # Threshold on the `pval` column; the bare column name inside `[` assumes
    # fgsea() returned a data.table - TODO confirm.
    fgsea4.sig <- fgseaRes4[pval < 0.05]
    pathways.stg4 <- fgsea4.sig$pathway
    # Optional export of the significant stage 4 results (disabled):
    #openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")
    
    
    # GO Venn ----
    # Pass the significant pathway names of the four stages to group.venn() as
    # a named list (One..Four), with four fill colors.
    # NOTE(review): presumably the colors map to the sets in list order - confirm.
    
    group.venn(list(One = pathways.stg1, 
                    Two = pathways.stg2, 
                    Three = pathways.stg3, 
                    Four = pathways.stg4), 
               fill = c("orange", "green", "red", "blue"))
    
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2016-11-10
      • 1970-01-01
      • 1970-01-01
      • 2020-09-25
      • 1970-01-01
      相关资源
      最近更新 更多