给定参考字符串中的坐标，从长参考字符串中拼接字符串答案

【问题标题】：Splicing strings out of a long reference string given their coordinates in that reference string给定参考字符串中的坐标，从长参考字符串中拼接字符串
【发布时间】：2019-06-19 20:05:20
【问题描述】：

虽然这是一个genomics 的问题，但由于它处理的是字符串的拼接（获取子集），我认为它与此受众相关，而不是与Bioconductor 单独相关。

很简单，我有一个长字符串列表（基因组的染色体）。例如，我使用 Bioconductor Biostrings 包创建并存储了 10 条染色体：

# Build a toy genome: 10 random "chromosomes" of 10,000 letters each, drawn
# from the first four letters of Biostrings::DNA_ALPHABET.
set.seed(1)
# vapply() draws the chromosomes in the same order as a for loop would, so the
# seeded result is unchanged, but the vector is not regrown on every iteration.
set <- vapply(seq_len(10), function(i) {
  paste(sample(Biostrings::DNA_ALPHABET[1:4], 10000, replace = TRUE), collapse = "")
}, character(1))

# Wrap the strings in a DNAStringSet and name them chr1 ... chr10.
genome.set <- Biostrings::DNAStringSet(set)
names(genome.set) <- paste0("chr", 1:10)

然后我有一个记录坐标的data.frame（来自GTF 文件），其中每个记录可以有多行：

library(dplyr)
# Simulated GTF-like table: 100 intervals on randomly chosen chromosomes and
# strands. TRUE/FALSE are spelled out because T and F can be reassigned.
gtf.df <- data.frame(seqnames = sample(names(genome.set), 100, replace = TRUE),
                     strand = sample(c("+", "-"), 100, replace = TRUE),
                     start = sample(1:9000, 100, replace = FALSE)) %>%
  # end = start plus an interval length drawn from 1..1000
  dplyr::mutate(end = start + sample(1:1000, 100, replace = FALSE))

# Within each chromosome, assign every interval to one of up to eight
# transcripts; the id is "<seqnames>_<k>", so a transcript never spans
# more than one chromosome.
gtf.df <- gtf.df %>% dplyr::group_by(seqnames) %>%
  dplyr::arrange(start, end) %>%
  dplyr::mutate(transcript_id = paste0(seqnames, "_", sample(1:8, length(seqnames), replace = TRUE))) %>%
  dplyr::ungroup()

我想要做的是从 genome.set 中截取每个转录本的各个片段并拼接起来，从而得到每个转录本的序列。

再次使用Biostrings，我可以这样实现：

# Baseline implementation: build one spliced sequence per transcript id.
transcript_ids <- unique(gtf.df$transcript_id)
transcript.seqs <- sapply(1:length(transcript_ids),function(t){
  # rows (intervals) of gtf.df that belong to the t-th transcript
  transcript.gtf.df <- gtf.df %>% dplyr::filter(gtf.df$transcript_id == transcript_ids[t])
  # cut each interval out of the transcript's chromosome (looked up from the
  # first row's seqnames) and concatenate the pieces in row order
  transcript.seq <- paste(sapply(1:nrow(transcript.gtf.df),function(e)
    unname(as.character(Biostrings::subseq(genome.set[which(names(genome.set) == transcript.gtf.df$seqnames[1])],start=transcript.gtf.df$start[e],end=transcript.gtf.df$end[e])))
  ),collapse="")
  # minus-strand transcripts (strand taken from the first row) are reverse-complemented
  if(transcript.gtf.df$strand[1] == "-") transcript.seq <- unname(as.character(Biostrings::reverseComplement(Biostrings::DNAString(transcript.seq))))
  return(transcript.seq)
})

我的问题是我的真实数据中有 4520919 个转录本，最后这一步需要很长时间。所以我想问，是否可以以及如何使用 Biostrings 或其他任何方式更快地完成此操作。

【问题讨论】：

transcript.gtf.df$seqnames[1] 是否应该在第二个 sapply 中被索引 1 或 e ？
transcript_id 在gtf.df 的所有行中都有相同的seqnames，所以我只是使用transcript.gtf.df$seqnames[1] 抓住第一个

标签： r string subset splice

【解决方案1】：

我已经重写了您的 sapply 方法，并进行了两项重大更改：

首先，我使用了vapply，一般来说比较快
其次，我使用了很多.subset2 来对数据帧进行子集化

编辑

我已经设法摆脱了内部循环 (vapply)
替换函数Biostrings::reverseComplement

这里是代码

# Splice each transcript's sequence out of genome.set using base string
# operations. gtf.df columns are addressed by position:
# 1 = seqnames, 2 = strand, 3 = start, 4 = end, 5 = transcript_id.
names_genome.set <- names(genome.set)
transcript_ids <- unique(gtf.df$transcript_id)
transcript_seqs <- vapply(seq_along(transcript_ids), function (t) {
  # rows of gtf.df that belong to the t-th transcript
  ind_id <- which(.subset2(gtf.df, 5L) == transcript_ids[t])
  # the transcript's chromosome (taken from its first row) as a plain string
  x <- unname(as.character(genome.set[names_genome.set == .subset2(gtf.df, 1L)[ind_id[1L]]]))
  # substring() is vectorised over first/last, so all intervals are cut at once
  out <- paste0(substring(text = x, first = .subset2(gtf.df, 3L)[ind_id], last = .subset2(gtf.df, 4L)[ind_id]), collapse = '')
  if (.subset2(gtf.df, 2L)[ind_id[1L]] == '-') {
    # Reverse complement for minus-strand transcripts: complement every base
    # (A<->T, C<->G, and the IUPAC ambiguity codes M<->K, R<->Y, V<->B, H<->D;
    # W, S and N are their own complement), then reverse the string.
    out <- chartr('ACGTMRWSYKVHDBN', 'TGCAKYWSRMBDHVN', out)
    out <- paste(rev(strsplit(out, '')[[1L]]), collapse = '')
  }
  out
}, character(1))

以下是一些基准数据以及提供的示例数据

# NOTE: the timings below were recorded when 'vapply_new' still used an
# incorrect base mapping without reversal; re-run the benchmark to obtain
# figures for the corrected reverse complement.
# Unit: milliseconds
#       expr       min        lq      mean    median        uq      max neval cld
#     sapply 160.94296 169.97698 180.13836 175.20474 182.58224 400.3273   100   c
# vapply_old  66.20113  69.59185  72.96804  71.45861  74.56051 120.0023   100  b 
# vapply_new  47.45224  49.51573  52.87001  50.97023  54.52104 109.3320   100 a  

microbenchmark::microbenchmark(
  # Baseline: dplyr::filter per transcript plus Biostrings::subseq per interval.
  'sapply' = {
    transcript.seqs <- sapply(1:length(transcript_ids),function(t){
      transcript.gtf.df <- gtf.df %>% dplyr::filter(gtf.df$transcript_id == transcript_ids[t])
      transcript.seq <- paste(sapply(1:nrow(transcript.gtf.df),function(e)
        unname(as.character(Biostrings::subseq(genome.set[which(names(genome.set) == transcript.gtf.df$seqnames[1])],start=transcript.gtf.df$start[e],end=transcript.gtf.df$end[e])))
      ),collapse="")
      if(transcript.gtf.df$strand[1] == "-") transcript.seq <- unname(as.character(Biostrings::reverseComplement(Biostrings::DNAString(transcript.seq))))
      return(transcript.seq)
    })
  },
  # First rewrite: .subset2 column access, inner vapply over intervals,
  # Biostrings::reverseComplement for the minus strand.
  'vapply_old' = {
    transcript_seqs <- vapply(seq_along(transcript_ids), function (t) {
      ind_id <- which(.subset2(gtf.df, 5L) == transcript_ids[t])
      x <- unname(as.character(genome.set[names_genome.set == .subset2(gtf.df, 1L)[ind_id[1L]]]))
      out <- vapply(ind_id, 
                    function (e) substr(x = x, start = .subset2(gtf.df, 3L)[e], stop = .subset2(gtf.df, 4L)[e]),
                    character(1))
      out <- paste(out, collapse = '')
      if (.subset2(gtf.df, 2L)[ind_id[1L]] == '-') {
        out <- unname(as.character(Biostrings::reverseComplement(Biostrings::DNAString(out))))
      }
      out
    }, character(1))
  },
  # Second rewrite: vectorised substring() and a base-R reverse complement.
  'vapply_new' = {
    transcript_seqs <- vapply(seq_along(transcript_ids), function (t) {
      ind_id <- which(.subset2(gtf.df, 5L) == transcript_ids[t])
      x <- unname(as.character(genome.set[names_genome.set == .subset2(gtf.df, 1L)[ind_id[1L]]]))
      out <- paste0(substring(text = x, first = .subset2(gtf.df, 3L)[ind_id], last = .subset2(gtf.df, 4L)[ind_id]), collapse = '')
      if (.subset2(gtf.df, 2L)[ind_id[1L]] == '-') {
        # complement every base (A<->T, C<->G, plus IUPAC ambiguity codes),
        # then reverse, so the result matches the other two entries
        out <- chartr('ACGTMRWSYKVHDBN', 'TGCAKYWSRMBDHVN', out)
        out <- paste(rev(strsplit(out, '')[[1L]]), collapse = '')
      }
      out
    }, character(1))
  }
)

我仍然会尝试找到增强它的方法（可能有矢量化）。 ~~例如，我还没有掌握 reverseComplement 函数的作用 - 也许可以更有效地执行。~~

您可以尝试使用更大的数据集，看看是否有改进。此外，如果效率真的受到威胁，Rcpp 可能是一个想法。

【讨论】：

非常感谢@nate。 reverseComplement 是 DNA 特有的操作。我之前没有提到，genome.set 中的每条染色体实际上都有一条与之配对的链，即它的反向互补序列：字符串的顺序被颠倒，所有 C 被 G 替换（反之亦然），所有 A 被 T 替换（反之亦然）。正如代码所示，gtf.df 中的 strand 列决定了是否需要这样做。因此，我可以在拼接后的序列上执行此操作，而不必为每条染色体保存反向互补序列。
@dan 好的，我已经编辑了我的代码，替换掉了 Biostrings::reverseComplement。它现在的表现要好得多。（不过您无论如何都应该检查一下这个替换——我对生物学一窍不通，哈哈）