【发布时间】:2020-09-14 19:00:15
【问题描述】:
我编写了一个自定义函数来读取 JSON 文件并提取我需要的所有相关信息,目标是在目录中的所有文件上运行它。我已经创建了所有文件的字符向量并使用 sapply/lapply,我已经能够运行如下函数。
setwd("/directory/")
file.list=dir()
sapply(file.list,function)
在执行过程中由于某种原因,它到达了输出以下内容并无缘无故停止的地步,如下所示。我有警告被禁止,我得到的唯一警告是我所期望的。在所有失败的文件上单独运行解析器,给我我想查看的表。
#expected output
#expected output
$'filename'
[1]FALSE
$'filename'
[1]NULL
如果有帮助,我已经在此处附加了我的解析器,我确信它没有经过优化并且有更好的方法可以做到这一点,但速度不是这里的主要问题。提前致谢!
library(jsonlite)
library(data.table)
library(dplyr)
library(plyr)
library(stringr)
library(tidyr)
trialParse=function(filename){
options(warn=-1)
options(max.print=99999)
parsefile=read_json(filename)
title=str_remove(basename(filename),".json")
#cat(sprintf("Schema Version is %s\n", parsefile$metadata$schemaVersion))
if (parsefile$report$workflow$reportType!="DNA"){
#cat(sprintf("%s report is not DNA, moved\n",title))
from=paste("~/JSON_parsing/workingFiles/",title,".json",sep='')
to=paste("~/JSON_parsing/failedFiles/",title,".json",sep='')
file.rename(from=from,to=to)
}else{
#cat(sprintf("%s\n",title))
#Extract report info
if (exists("report",parsefile)&&length(parsefile$report)!=0){
reportData=rbindlist(list(parsefile$report,parsefile$report$workflow),fill=TRUE)
reportData$workflow=NULL
reportData[is.na(reportData)]=""
reportData=reportData%>%
unique()%>%
summarize_all(funs(trimws(paste(.,collapse=''))))
#assign(paste("reportData_",title,sep=''),reportData,envir=.GlobalEnv)
}else{
print("No report info")
}
#Extract patient info
if(exists("patient",parsefile)&&length(parsefile$patient)!=0){
patientData=as.data.frame(t(unlist(parsefile$patient)))
#assign(paste("patientData_",title,sep=''),patientData,envir=.GlobalEnv)
}else{
print("No patient info")
}
#Extract order info
if(exists("report",parsefile)&&length(parsefile$report)!=0){
orderData=rbindlist(list(parsefile$order,parsefile$order$test),fill=TRUE)
orderData$test=NULL
orderData[is.na(orderData)]=""
orderData=orderData%>%
unique()%>%
summarize_all(funs(trimws(paste(.,collapse=''))))
#assign(paste("orderData_",title,sep=''),orderData,envir=.GlobalEnv)
}else{
print("No order info")
}
#Extract specimens info
if(exists("specimens",parsefile)&&length(parsefile$specimens)!=0){
specimens=list()
for(i in 1:length(parsefile$specimens)){
specimens[[i]]=as.data.frame(t(unlist(parsefile$specimens[[i]])))
}
specimensData=do.call(rbind.fill,specimens)%>%
unique()
#assign(paste("specimensData_",title,sep=''),specimensData,envir=.GlobalEnv)
}else{
print("No specimens info")
}
#Extract mutations info
if (exists("results",parsefile)&&length(parsefile$results)!=0){
#Tumor Mutational Burden
if (length(parsefile$results$tumorMutationalBurden!=0)){
tmbdata=as.data.frame(t(unlist(c(parsefile$results[1],parsefile$results[2],parsefile$results[3]))))
#assign(paste("tmbData_",title,sep=''),tmbdata,envir=.GlobalEnv)
}
#Somatic Potentially Actionable Mutations
if (exists("somaticPotentiallyActionableMutations",parsefile$results)&&length(parsefile$results$somaticPotentiallyActionableMutations)!=0){
#Remove Therapies
for (i in 1:length(parsefile$results$somaticPotentiallyActionableMutations)){
for (j in 1:length(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants)){
parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[[j]]$therapies=NULL
}
}
mutations=list()
variants=list()
#Extract Somatic Potentially Actionable Mutations data
#Per Entry
for(i in 1:length(parsefile$results$somaticPotentiallyActionableMutations)){
#Per Variants in Entry
for(j in 1:length(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants)){
variants[[j]]=rbindlist(list(parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[j],parsefile$results$somaticPotentiallyActionableMutations[[i]]$variants[[j]]),fill=TRUE)
}
#Per Variants in Entry
for (j in 1:length(variants)){
variantsData=do.call(rbind,variants[j])
mutationsList=parsefile$results$somaticPotentiallyActionableMutations[[i]]
mutationsList$variants=NULL
mutationTable=rbindlist(list(mutationsList,variantsData),fill=TRUE)
mutations=append(mutations,list(rbindlist(list(mutationsList,variantsData),fill=TRUE)))
}
}
#Build SPAMS table
SPAMsData=do.call(rbind,mutations)
SPAMsData$mutationEffect=NULL
SPAMsData=SPAMsData%>%
unique()%>%
fill(c(1:4),.direction=c("down"))%>%
fill(c(5:ncol(SPAMsData)),.direction=c("up"))%>%
unique()%>%
mutate(mutation_type="Somatic Potentially Actionable Mutation")
}else{
print("No SPAMS")
}
#Somatic Potentially Actionable Copy Number Variants
if (exists("somaticPotentiallyActionableCopyNumberVariants",parsefile$results)&&length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)!=0){
#Remove Therapies
for (i in 1:length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)){
parsefile$results$somaticPotentiallyActionableCopyNumberVariants[[i]]$therapies=NULL
}
#Extract Somatic Potentially Actionable Copy Number Variants Data
variants=list()
for(i in 1:length(parsefile$results$somaticPotentiallyActionableCopyNumberVariants)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticPotentiallyActionableCopyNumberVariants[[i]])))
}
SPACNVsData=do.call(rbind,variants)%>%
mutate(mutation_type="Somatic Potentially Actionable Copy Number Variants")
rm(variants)
}else{
print("No SPACNVs")
}
#Somatic Biologically Relevant Variants
if (exists("somaticBiologicallyRelevantVariants",parsefile$results)&&length(parsefile$results$somaticBiologicallyRelevantVariants)!=0){
#Extract Data
variants=list()
for(i in 1:length(parsefile$results$somaticBiologicallyRelevantVariants)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticBiologicallyRelevantVariants[[i]])))
}
SBRVsData=do.call(rbind,variants)%>%
mutate(mutation_type="Somatic Biologically Relevant Variants")
rm(variants)
}else{
print("No SBRVs")
}
#Somatic Variants of Unknown Significance
if (exists("somaticVariantsOfUnknownSignificance",parsefile$results)&&length(parsefile$results$somaticVariantsOfUnknownSignificance)!=0){
#Extract Data
variants=list()
for(i in 1:length(parsefile$results$somaticVariantsOfUnknownSignificance)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$somaticVariantsOfUnknownSignificance[[i]])))
}
SVUSsData=do.call(rbind,variants)%>%
mutate(mutation_type="Somatic Variants of Unknown Significance")
rm(variants)
}else{
print("No SVUSs")
}
#Fusion Variants
if (exists("fusionVariants",parsefile$results)&&length(parsefile$results$fusionVariants)!=0){
#Remove Therapies
for (i in 1:length(parsefile$results$fusionVariants)){
parsefile$results$fusionVariants[[i]]$therapies=NULL
}
#Extract Data
variants=list()
for(i in 1:length(parsefile$results$fusionVariants)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$fusionVariants[[i]])))
}
FVsData=do.call(rbind,variants)%>%
mutate(mutation_type="Fusion Variants")%>%
dplyr::rename(gene5display=gene5Display)%>%
dplyr::rename(gene3display=gene3Display)
rm(variants)
}else{
#print("No FVs")
}
#Inherited Relevant Variants
if (exists("inheritedRelevantVariants",parsefile$results)&&length(parsefile$results$inheritedRelevantVariants)!=0){
#Extract Data
if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
variants=list()
for(i in 1:length(parsefile$results$inheritedRelevantVariants)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedRelevantVariants[[i]])))
}
IRVsData=do.call(rbind,variants)%>%
mutate(mutation_type="Inherited Relevant Variants")
rm(variants)
}else{
print("No IRVs")
}
}else{
print("No IRVs")
}
#Inherited Incidental Findings
if (exists("inheritedIncidentalFindings",parsefile$results)&&length(parsefile$results$inheritedIncidentalFindings)!=0){
#Extract Data
if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
variants=list()
for(i in 1:length(parsefile$results$inheritedIncidentalFindings)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedIncidentalFindings[[i]])))
}
IIFsData=do.call(rbind,variants)%>%
mutate(mutation_type="Inherited Incidental Findings")
rm(variants)
}else{
print("No IIFs")
}
}else{
print("No IIFs")
}
#Inherited Variants of Unknown Significance
if (exists("inheritedVariantsOfUnknownSignificance",parsefile$results)&&length(parsefile$results$inheritedVariantsOfUnknownSignificance)!=0){
#Extract Data
if (length(parsefile$results$inheritiedIncidentalFindings)!=0){
variants=list()
for(i in 1:length(parsefile$results$inheritedVariantsOfUnknownSignificance)){
variants[[i]]=as.data.frame(t(unlist(parsefile$results$inheritedVariantsOfUnknownSignificance[[i]])))
}
IVUSsData=do.call(rbind,variants)%>%
mutate(mutation_type="Inherited Variants of Unknown Significance")
rm(variants)
}else{
print("No IVUSs")
}
}else{
print("No IVUSs")
}
#Merge and Output data tables
if (exists(c("SPAMsData","SBRVsData","SVUSsData","FVsData","IRVsData","IIFsData","IVUSsData"))){
mergedMutations=rbind.fill(get0("SPAMsData"),get0("SPACNVsData"),get0("SBRVsData"),get0("SVUSsData"),get0("FVsData"),get0("IRVsData"),get0("IIFsData"),get0("IVUSsData"))%>%
select(mutation_type,everything())
outMutations=merge(patientData,mergedMutations)
write.csv(outMutations,"~/JSON/mutations.csv",append=TRUE)
#assign(paste("mergedMutations_",title,sep=''),mergedMutations,envir=.GlobalEnv)
#assign(paste("patientMutations_",title,sep=''),merge(patientData,mergedMutations),envir=.GlobalEnv)
}else{
print("No mutations info")
}
}else{
print("No mutations info")
}
if (exists(c("orderData","reportData","specimensData","tmbData"))){ outPatients=rbind.fill(get0("patientData"),get0("orderData"),get0("reportData"),get0("specimensData"),get0("tmbData"))
write.csv(outPatients,"~/JSON/patients.csv",append=TRUE)
}else{
print("Missing patient info")
}
}
}
【问题讨论】:
-
这段代码太庞大了,任何人都无法编写一个合理的答案。在你的位置上,我会尝试调试代码。仔细阅读哈德利的书中Advanced R 中的rstudios guide to debugging 和chapter 22。使用这些知识来找到您的问题并将其隔离,可能通过跟踪整个函数的值。一旦你发现问题,如果你不能解决它。制作一个较小的示例来复制问题,然后发布一个新问题。
-
调试对于任何想要编程的人、数据科学家、分析师等来说都是一项基本技能。这是在实践中解决绝大多数编程问题的方法。
-
附加说明:即使使用缩减的功能,这也是不可重现的,因为我们不知道发生此故障时它正在操作的文件。您对 "it prints
#expected output" 的总结并不能帮助我们缩小问题的范围,因为该字符串不会出现在您的函数中。我理解(并且普遍赞赏)减少您的问题大小的尝试,但是(1)该功能已经通过该建议; (2) 我建议您提供明确的(不一定是全部)上下文。祝你好运!