这是一个解决方案,它没有得到很好的优化,但我希望它能完成工作:
ff <- function(df=data.frame(A=c(1,NA,1),B=c(NA,1,1),C=c(1,1,1))) {
if(NCOL(df)==1) return(df)
#create all possible combinations of names
combinations <- unlist(lapply(2:NCOL(df), function(k) {
kCombs <- utils::combn(names(df),k)
apply(kCombs,2,paste0,collapse=",")
}))
#go through all combinations and calculate the product of the associated columns
allColumnProducts <- lapply(combinations, function(combi) {
colNamesInCombi <- strsplit(combi,",")[[1]]
dfColList <- as.list(df[,colNamesInCombi])
poductOfCols <- Reduce(f = "*",
x = as.list(df[,colNamesInCombi]),
init = numeric(NROW(df))+1)
setNames(data.frame(poductOfCols),combi)
})
#put everything in one dataframe including the initial dataframe
allColumnProducts <- do.call("cbind",allColumnProducts)
allColumnProducts <- cbind(df,allColumnProducts)
#clear all the subsets and make them NA
for(columnIndex in NCOL(allColumnProducts):(NCOL(df)+1)) {
rowsWithOne <- allColumnProducts[,columnIndex]==1
rowsWithOne[is.na(rowsWithOne)] <- FALSE
if(any(rowsWithOne)) {
#we found a 1 in the column, so we have to make all columns whose
#names are part of the name of the current column NA at these rows with 1
nameParts <- strsplit(names(allColumnProducts)[columnIndex],",")[[1]]
namesToFillWithNa <- unlist(lapply(1:(length(nameParts)-1), function(k) {
kCombs <- utils::combn(nameParts,k)
apply(kCombs,2,paste0,collapse=",")
}))
allColumnProducts[rowsWithOne,namesToFillWithNa] <- NA
}
}
allColumnProducts
}
您可以使用数据框调用它。如果您的数据框中的名称还包含“,”,那么对于粘贴和拆分,您将需要另一个唯一字符。
我希望 cmets 能够很好地描述它的作用以及所选择的变量名称。
更新:
我只是对这项任务进行了更多思考,并在使用 NA 清理列之前在创建大数据框的部分进行了一些优化。
此外,我添加了参数 uniqueString 和 removeUniqueStringInResult。
uniqueString 是一个不应出现在数据框名称中的字符串。否则一些 strsplits 将失败。
removeUniqueStringInResult 清除生成的数据帧名称中的 uniqueString。我认为这很好,因为现在你得到了你想要的结果。如果列的名称是单个字符,如果只是将名称放在一起而不用分隔字符,则不会造成混淆。
ff2 <- function(df,
uniqueString = ",",
removeUniqueStringInResult = TRUE) {
if(NCOL(df)==1) return(df)
#go through all combinations of size k, k=2,...,NCOL(df), and calculate the product of the associated columns
allColumnProducts <- lapply(2:NCOL(df), function(k) {
kCombs <- utils::combn(names(df),k)
#the columns are all possible combinations of size k of the names
kComdDataframe <- lapply(1:NCOL(kCombs), function(i) {
colNamesInCombi <- kCombs[,i] #columns in the origianl dataframe df with these names have to be multiplied together
Reduce(f = "*",
x = as.list(df[,colNamesInCombi]),
init = numeric(NROW(df))+1)
})
kComdDataframe <- data.frame(do.call("cbind",kComdDataframe))
kCombNames <- apply(kCombs,2,paste0,collapse=uniqueString)
names(kComdDataframe) <- kCombNames
kComdDataframe
})
#put everything in one dataframe including the initial dataframe
allColumnProducts <- do.call("cbind",allColumnProducts)
allColumnProducts <- cbind(df,allColumnProducts)
#clear all the subsets and make them NA
for(columnIndex in NCOL(allColumnProducts):(NCOL(df)+1)) {
rowsWithOne <- allColumnProducts[,columnIndex]==1
rowsWithOne[is.na(rowsWithOne)] <- FALSE
if(any(rowsWithOne)) {
#we found a 1 in the column, so we have to make all columns whose
#names are part of the name of the current column NA at these rows with 1
nameParts <- strsplit(names(allColumnProducts)[columnIndex],",")[[1]]
namesToFillWithNa <- unlist(lapply(1:(length(nameParts)-1), function(k) {
kCombs <- utils::combn(nameParts,k)
apply(kCombs,2,paste0,collapse=uniqueString)
}))
allColumnProducts[rowsWithOne,namesToFillWithNa] <- NA
}
}
if(removeUniqueStringInResult) {
names(allColumnProducts) <- gsub(uniqueString,"",names(allColumnProducts))
}
allColumnProducts
}
执行时间得到了很好的改善,请参阅以下基准:
testdf <- data.frame(A=c(1,NA,1),B=c(NA,1,1),C=c(1,1,1),D=c(1,1,1),E=c(1,NA,1))
microbenchmark::microbenchmark(ff(testdf),ff2(testdf))
#Unit: milliseconds
# expr min lq mean median uq max neval
#ff(testdf) 8.6415 8.87095 10.238998 9.00815 11.38315 23.0477 100
#ff2(testdf) 3.7638 3.86935 4.905192 4.00970 5.36295 14.2669 100