【问题标题】:Running a loop on multiple .txt files in one folder in R(在 R 中对同一文件夹内的多个 .txt 文件循环运行脚本)
【发布时间】:2021-05-31 12:25:48
【问题描述】:

我有一个脚本用于计算拷贝数变异,并按第一列的信息把结果合并保存到一个名为“genesforcomp1”的现有文件中。输入文件名为 BRCA1.txt、BRCA2.txt、BRCA3.txt……BRCA4376.txt。另一个输入文件“genes.txt”用于注释,在每次循环中都相同;而“genesforcomp1”则在每次循环中被更新,作为输出。由于文件很多,我想知道能否用 R 中的循环来完成。这是我的脚本:

setwd("/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/aa")
library(GenomicRanges)
library(dplyr)
library(scales)
# library() stops at once if the package is missing, whereas require() only
# warns and lets the script fail later at the first tidyverse call.
library(tidyverse)

# Create the annotation (reference) table ----
genes <- read.table(
  "/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/genes.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
# Replace the letters X and Y in the chromosome names with 23 and 24.
genes$chromosome_name <- gsub("X", "23", genes$chromosome_name)
genes$chromosome_name <- gsub("Y", "24", genes$chromosome_name)
colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# File to be analysed ----
# Three steps: pre-processing, comparison with the annotation table, and
# post-processing.
df <- read.table("BRCA1.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
df$Chromosome <- gsub("X", "23", df$Chromosome)
df$Chromosome <- gsub("Y", "24", df$Chromosome)
colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)

# Keep the (gene, segment) pairs where the gene lies within the segment, then
# put the matching segment row and gene row side by side.
hits <- findOverlaps(genes_GR, cnv, type = "within")
df_ann <- cbind(df[subjectHits(hits), ], genes[queryHits(hits), ])
df_ann <- unique(df_ann)
# Both inputs carry a "Chr" column; selecting by name picks the first one,
# i.e. the one that came from the segment table.
df_ann <- df_ann[, c("GeneSymbol", "Chr", "extra2")]
colnames(df_ann) <- c("Ensembl_ID", "Chr", "Seg_value")

# NOTE(review): Seg_value2 is computed here but dropped by the column
# selection that follows, so it never reaches the output; the rescaling below
# uses the raw Seg_value. Confirm which of the two columns was meant to be kept.
df_ann$Seg_value2 <- with(
  df_ann,
  sign(Seg_value) * ((2^abs(Seg_value) - 1) * 2)
)
df_ann <- df_ann[, c("Ensembl_ID", "Seg_value")]
# Linearly rescale the values of this sample onto [-1, 1].
df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))

# Append the rows of genesbase.txt and keep the first row per Ensembl_ID, so
# values from this sample take precedence over the genesbase rows.
# (assumes genesbase.txt has the columns Ensembl_ID and Seg_value -- confirm)
df_ann1 <- read.table(
  "/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
df <- rbind.data.frame(df_ann, df_ann1)
df <- df[!duplicated(df$Ensembl_ID), ]

# Save the result into the existing file, matching rows on Ensembl_ID ----
df1 <- read.delim("genesforcomp1", check.names = FALSE, stringsAsFactors = FALSE)
lst <- list(data.frame(df1), data.frame(df))
# Join the two tables directly instead of calling reduce(): GenomicRanges also
# provides a reduce() generic, so the bare name only resolves to purrr's
# version when the tidyverse happens to be attached last.
df2 <- dplyr::full_join(lst[[1]], lst[[2]], by = "Ensembl_ID") %>%
  replace(., is.na(.), 0)
write.table(df2, file = "genesforcomp1", quote = FALSE, sep = "\t", row.names = FALSE)

任何关于如何循环脚本的建议或想法将不胜感激。提前致谢!

【问题讨论】:

    标签: r loops file


    【解决方案1】:

    由于您的文件名遵循良好的模式,您可以执行从 1 到 4376 的循环,并将代码中的 "BRCA1.txt" 替换为 paste0("BRCA", i, ".txt")。可能有一些方法可以在不对模式进行硬编码的情况下进行循环,但在您的情况下,您似乎不需要它。

    setwd("/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/aa")
    library(GenomicRanges)
    library(dplyr)
    library(scales)
    # library() stops at once if the package is missing, whereas require() only
    # warns and lets the script fail later at the first tidyverse call.
    library(tidyverse)

    # Create the annotation (reference) table ----
    genes <- read.table(
      "/home/sumit/Test/1_Lung/AllCNV/0a0bd4cc-3da8-44ff-ba08-a7e86b95f2f6/genes.txt",
      sep = "\t", stringsAsFactors = FALSE, header = TRUE
    )
    # Replace the letters X and Y in the chromosome names with 23 and 24.
    genes$chromosome_name <- gsub("X", "23", genes$chromosome_name)
    genes$chromosome_name <- gsub("Y", "24", genes$chromosome_name)
    colnames(genes) <- c("GeneSymbol", "Chr", "Start", "End")
    genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

    # Read once, before the loop: this table is the same for every sample, so
    # re-reading it in each of the 4376 iterations would only repeat the work.
    # (assumes genesbase.txt has the columns Ensembl_ID and Seg_value -- confirm)
    df_ann1 <- read.table(
      "/home/sumit/Academic/DHR/TCGA/Gene List/Final1/genesbase.txt",
      sep = "\t", stringsAsFactors = FALSE, header = TRUE
    )

    # Files to be analysed ----
    # Three steps per file: pre-processing, comparison with the annotation
    # table, and post-processing.
    for (i in 1:4376) {
      df <- read.table(
        paste0("BRCA", i, ".txt"),
        sep = "\t", stringsAsFactors = FALSE, header = TRUE
      )
      df$Chromosome <- gsub("X", "23", df$Chromosome)
      df$Chromosome <- gsub("Y", "24", df$Chromosome)
      colnames(df) <- c("Barcode", "Chr", "Start", "End", "extra1", "extra2")
      cnv <- makeGRangesFromDataFrame(df, keep.extra.columns = TRUE)

      # Keep the (gene, segment) pairs where the gene lies within the segment,
      # then put the matching segment row and gene row side by side.
      hits <- findOverlaps(genes_GR, cnv, type = "within")
      df_ann <- cbind(df[subjectHits(hits), ], genes[queryHits(hits), ])
      df_ann <- unique(df_ann)
      # Both inputs carry a "Chr" column; selecting by name picks the first
      # one, i.e. the one that came from the segment table.
      df_ann <- df_ann[, c("GeneSymbol", "Chr", "extra2")]
      colnames(df_ann) <- c("Ensembl_ID", "Chr", "Seg_value")

      # NOTE(review): Seg_value2 is computed here but dropped by the column
      # selection that follows, so it never reaches the output; the rescaling
      # below uses the raw Seg_value. Confirm which column was meant to be kept.
      df_ann$Seg_value2 <- with(
        df_ann,
        sign(Seg_value) * ((2^abs(Seg_value) - 1) * 2)
      )
      df_ann <- df_ann[, c("Ensembl_ID", "Seg_value")]
      # Linearly rescale the values of this sample onto [-1, 1].
      df_ann$Seg_value <- rescale(df_ann$Seg_value, to = c(-1, 1))

      # Append the genesbase rows and keep the first row per Ensembl_ID, so
      # values from this sample take precedence over the genesbase rows.
      df <- rbind.data.frame(df_ann, df_ann1)
      df <- df[!duplicated(df$Ensembl_ID), ]

      # Save the result into the existing file, matching rows on Ensembl_ID.
      # The file is re-read and rewritten on every iteration, so the results
      # of all completed samples are on disk if a later file fails.
      df1 <- read.delim("genesforcomp1", check.names = FALSE, stringsAsFactors = FALSE)
      lst <- list(data.frame(df1), data.frame(df))
      # Join the two tables directly instead of calling reduce(): GenomicRanges
      # also provides a reduce() generic, so the bare name only resolves to
      # purrr's version when the tidyverse happens to be attached last.
      df2 <- dplyr::full_join(lst[[1]], lst[[2]], by = "Ensembl_ID") %>%
        replace(., is.na(.), 0)
      write.table(df2, file = "genesforcomp1", quote = FALSE, sep = "\t", row.names = FALSE)
    }
    

    【讨论】:

    • 感谢您的评论。是的,它运行良好。我浪费了将近 6 个小时,但无济于事。将来,我会遵循同样的模式。非常感谢。祝你有美好的一天。
    猜你喜欢
    • 2020-07-23
    • 2020-08-23
    • 1970-01-01
    • 1970-01-01
    • 2019-06-18
    • 2021-12-09
    • 2020-07-04
    • 2017-04-21
    • 1970-01-01
    相关资源
    最近更新 更多