在 R 中批量处理和导出 CSV 文件列表答案

【问题标题】：Batch processing and export of a list of CSV files in R（在 R 中批量处理和导出 CSV 文件列表）
【发布时间】：2020-10-18 03:21:14
【问题描述】：

我在一个基于不同村庄名称的文件夹中有 300 个结构相同的 CSV 文件。我需要单独读取每个文件，处理这些文件，并将输出文件导出到另一个文件夹中，并使用各自的村庄名称（例如，'村庄名称'_score）。

这是一个示例村庄文件的数据...

structure(list(ID_GC = structure(1:51, .Label = c("492K", "494K", 
"497K", "498K", "499K", "500K", "501K", "502K", "503K", "504K", 
"506K", "507K", "508K", "509K", "510K", "511K", "512K", "513K", 
"514K", "516K", "517K", "518K", "519K", "522K", "523K", "524K", 
"526K", "527K", "528K", "530K", "531K", "532K", "533K", "534K", 
"535K", "536K", "537K", "538K", "539K", "540K", "541K", "542K", 
"543K", "544K", "545K", "546K", "547K", "548K", "550K", "551K", 
"552K"), class = "factor"), Lat = c(23.78107, 23.78115, 23.78122, 
23.78123, 23.78125, 23.78081, 23.78096, 23.78062, 23.78068, 23.78071, 
23.78075, 23.78043, 23.78021, 23.77937, 23.77985, 23.77981, 23.77995, 
23.77987, 23.7799, 23.7796, 23.77944, 23.77934, 23.77937, 23.77906, 
23.77899, 23.77907, 23.77889, 23.77898, 23.77863, 23.77865, 23.77855, 
23.77852, 23.77843, 23.77806, 23.77824, 23.77809, 23.7781, 23.77797, 
23.77788, 23.77786, 23.77809, 23.77815, 23.77771, 23.77757, 23.77772, 
23.77752, 23.7774, 23.7772, 23.77869, 23.78084, 23.78178), Long = c(90.65016, 
90.64968, 90.6497, 90.64969, 90.64972, 90.64996, 90.64987, 90.64989, 
90.64924, 90.64921, 90.65, 90.64998, 90.6494, 90.64989, 90.64978, 
90.64973, 90.64952, 90.64958, 90.64925, 90.64935, 90.6492, 90.64922, 
90.64919, 90.64928, 90.64937, 90.64887, 90.64919, 90.64891, 90.64914, 
90.64903, 90.64907, 90.6491, 90.64868, 90.6491, 90.64853, 90.64862, 
90.64851, 90.64852, 90.64865, 90.64865, 90.64878, 90.64878, 90.64866, 
90.64859, 90.64844, 90.64839, 90.64858, 90.64861, 90.64922, 90.64994, 
90.64925), Village = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Abdullapur", class = "factor"), 
    Depth_m = c(18, 18, 18, 210, 18, 31.5, 13.5, 15, 13.5, 21, 
    13.5, 18, 15, 240, 24, 13.5, 19.5, 33, 156, 14.4, 18, 21, 
    13.5, 18, 18, 51, 48, 54, 67.5, 69, 69, 66, 66, 21, 60, 66, 
    54, 31.5, 21, 210, 66, 12, 54, 27, 219, 18, 18, 18, 18, 18, 
    21), As_ug_L = c(68L, 68L, 68L, 2L, 68L, 306L, 129L, 129L, 
    20L, 68L, 188L, 129L, 68L, 2L, 68L, 68L, 129L, 188L, 2L, 
    2L, 68L, 37L, 20L, 306L, 306L, 20L, 306L, 20L, 2L, 2L, 2L, 
    2L, 2L, 306L, 2L, 2L, 2L, 306L, 306L, 2L, 2L, 306L, 2L, 306L, 
    20L, 306L, 68L, 68L, 306L, 68L, 20L)), class = "data.frame", row.names = c(NA, 
-51L))

以及计算所有村庄所需的另一个数据集（“dtw_BG”）......

structure(list(ID_GC = structure(c(10L, 11L, 12L, 13L, 14L, 8L, 
9L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L), .Label = c("1002F", "1008F", "1016F", "1029F", "1051F", 
"1053F", "1058F", "1548D", "1561D", "498K", "509K", "514K", "540K", 
"545K", "559K", "560K", "569K", "571K", "597K", "601K", "614K", 
"819F", "829F", "933F", "934F", "951F", "957F", "958F", "959F", 
"960F", "964F", "973F", "982F", "998F"), class = "factor"), Lat = c(23.78123, 
23.77937, 23.7799, 23.77786, 23.77772, 23.77439336, 23.77204886, 
23.77484, 23.775, 23.77528, 23.77492, 23.77521, 23.77593, 23.7757, 
23.78494, 23.78473, 23.78385611, 23.78395451, 23.78426992, 23.78374538, 
23.78377154, 23.78360725, 23.78340944, 23.78362259, 23.78272036, 
23.78307399, 23.78269739, 23.78252464, 23.78279102, 23.78131262, 
23.78149057, 23.77867098, 23.77828323, 23.78592929), Long = c(90.64969, 
90.64989, 90.64925, 90.64865, 90.64844, 90.65543457, 90.65292302, 
90.65158, 90.65192, 90.65219, 90.65232, 90.65363, 90.65356, 90.65483, 
90.65025, 90.65238, 90.64900976, 90.64933908, 90.65082989, 90.64891814, 
90.64902199, 90.64910447, 90.64933699, 90.6488857, 90.64921562, 
90.64848103, 90.64799873, 90.64826494, 90.64738669, 90.64781684, 
90.64612672, 90.64499055, 90.64476985, 90.6499865), Village = structure(c(1L, 
1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L), .Label = c("Abdullapur", "Chauthar Kanda", "Nagra Para Faitadi", 
"Nowa Para"), class = "factor"), Depth_m = c(210, 240, 156, 210, 
219, 225, 195, 299.7, 299.7, 240, 240, 234, 240, 105, 165, 180, 
180, 225, 180, 210, 195, 201, 180, 195, 210, 210, 195, 180, 225, 
180, 108, 210, 225, 240), As_ug_L = c(2L, 2L, 2L, 2L, 20L, 2L, 
2L, 2L, 20L, 2L, 2L, 7L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA, 
-34L))

我需要处理所有村庄，但我不确定如何循环它们。到目前为止，我能够使用“readr”包读取所有单独的村庄文件。

library(readr)
# List every village CSV in the input folder, returning full paths.
# `pattern` is a regular expression, not a shell glob: "*.csv" would also match
# names that merely contain "csv" after any character, so anchor on the
# literal ".csv" extension instead. TRUE is spelled out because T can be
# reassigned.
a <- list.files(path = "/Users/......",
                pattern = "\\.csv$", full.names = TRUE)

这是我用于单个村庄的代码：

# Read the reference table used for every village and the wells of one village.
dtw_BG <- read.csv('/Users/...../dtw_BG.csv', header = TRUE)
gw <- read.csv('/Users/....../Abdullapur.csv', header = TRUE)

# Row subsets of the village table by Depth_m / As_ug_L thresholds, each one
# followed by its coordinate columns. Columns 3 and 2 are Long and Lat in the
# sample data shown earlier in the question.

# Wells with Depth_m <= 90.
stw <- gw[gw$Depth_m <= 90, ]
p <- stw[, c(3, 2)]

# Wells with Depth_m <= 90 and As_ug_L > 50.
stw_R <- gw[gw$Depth_m <= 90 & gw$As_ug_L > 50, ]
R <- stw_R[, c(3, 2)]

# Wells with 45 <= Depth_m <= 90.
itw <- gw[gw$Depth_m >= 45 & gw$Depth_m <= 90, ]
ITW <- itw[, c(3, 2)]

# Wells with 45 <= Depth_m <= 90 and As_ug_L <= 10.
itw_10 <- gw[gw$Depth_m >= 45 & gw$Depth_m <= 90 & gw$As_ug_L <= 10, ]
ITW_10 <- itw_10[, c(3, 2)]

# Coordinates of the reference wells.
BG <- dtw_BG[, c(3, 2)]

# For every row i of p: distances from all R rows to p[i, ], the R rows paired
# with those distances (third column), the rows whose distance is <= 100, and
# finally how many such rows there are (maxscore).
# NOTE(review): distGeo() is not defined in this snippet - presumably
# geosphere::distGeo, which must be attached before running this.
# seq_len(nrow(p)) replaces 1:length(p[[1]]): with an empty p the latter
# iterates over c(1, 0) instead of not iterating at all.
dist_R <- lapply(seq_len(nrow(p)), function(i) distGeo(R, p[i, ]))
dist_R <- lapply(seq_len(nrow(p)), function(i) data.frame(R, dist_R[[i]]))
dist_R100 <- lapply(seq_len(nrow(p)), function(i) dist_R[[i]][dist_R[[i]][, 3] <= 100, ])
# vapply() guarantees an integer vector, one count per row of p.
maxscore <- vapply(dist_R100, nrow, integer(1))

# Same computation for the ITW rows: count_itw[i] is the number of ITW rows
# whose distance to p[i, ] is <= 100.
dist_ITW <- lapply(seq_len(nrow(p)), function(i) distGeo(ITW, p[i, ]))
dist_ITW <- lapply(seq_len(nrow(p)), function(i) data.frame(ITW, dist_ITW[[i]]))
dist_ITW100 <- lapply(seq_len(nrow(p)), function(i) dist_ITW[[i]][dist_ITW[[i]][, 3] <= 100, ])
count_itw <- vapply(dist_ITW100, nrow, integer(1))

# count_itw10[i]: number of ITW_10 rows whose distance to p[i, ] is <= 100.
# When ITW_10 has no rows the counts are all zero (one per element of
# maxscore) and distGeo() is never called on the empty table.
if (nrow(ITW_10) == 0) {
  count_itw10 <- rep(0, length(maxscore))
} else {
  # seq_len(nrow(p)) replaces 1:length(p[[1]]) so that an empty p produces no
  # iterations rather than the indices c(1, 0).
  dist_ITW10 <- lapply(seq_len(nrow(p)), function(i) distGeo(ITW_10, p[i, ]))
  dist_ITW10 <- lapply(seq_len(nrow(p)), function(i) data.frame(ITW_10, dist_ITW10[[i]]))
  dist_ITW10_100 <- lapply(seq_len(nrow(p)), function(i) dist_ITW10[[i]][dist_ITW10[[i]][, 3] <= 100, ])
  # vapply() guarantees an integer vector, one count per row of p.
  count_itw10 <- vapply(dist_ITW10_100, nrow, integer(1))
}

# dist_BG[[i]]: the BG rows with an extra third column holding the distance
# from each BG row to p[i, ].
dist_BG<- lapply(1:length(p[[1]]), function (i) distGeo (BG, p[i,]))
dist_BG<-lapply(1:length(p[[1]]), function (i) data.frame(BG, dist_BG[[i]]))
# dtw[[i]][[j]]: smallest distance between the j-th row of dist_R100[[i]]
# (its first two columns) and the rows of dist_BG[[i]].
# NOTE(review): j runs over 1..length(maxscore), which can exceed
# nrow(dist_R100[[i]]); such rows index as NA, so the result is presumably NA
# for them - confirm. dist_BG[[i]] has three columns; presumably distGeo()
# only reads the first two - verify.
dtw<-lapply(1:length(p[[1]]), function(i) {
  lapply(1: length(maxscore), function(j) {
    min(distGeo( c(dist_R100[[i]][j,1], dist_R100[[i]][j,2]), dist_BG[[i]]))
  }
  )
}
)

# Flatten, then regroup into consecutive chunks of length(p[[1]]) values, one
# chunk per row of p.
dtw<-unlist(dtw)
# The grouping vector 0:length(dtw) is one element longer than dtw, which
# creates one extra trailing group; the next line removes that last group.
dtw<-split(dtw, (0:length(dtw) %/% length(p[[1]])))
dtw <- dtw[-length (dtw)]
# count[i]: number of values <= 100 in chunk i (subset() drops NA entries).
count<-lapply(1:length(dtw), function(i) length(subset(dtw[[i]], dtw[[i]]<=100)))
count<-unlist(count)
# score: maxscore minus count, element by element.
score<-maxscore-count

# Append the computed columns to the stw rows and export the result for this
# village. FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
abc <- cbind(stw, maxscore, count, score, count_itw, count_itw10)
abc <- data.frame(abc)
write.csv(abc, "/Users/..../Output/Abdullapur_score.csv", row.names = FALSE)

所提供村庄的输出应如下所示

structure(list(ID_GC = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L, 
9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L, 21L, 22L, 23L, 
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 
37L, 38L, 39L, 41L, 42L, 43L, 44L, 46L, 47L, 48L, 49L, 50L, 51L
), .Label = c("492K", "494K", "497K", "498K", "499K", "500K", 
"501K", "502K", "503K", "504K", "506K", "507K", "508K", "509K", 
"510K", "511K", "512K", "513K", "514K", "516K", "517K", "518K", 
"519K", "522K", "523K", "524K", "526K", "527K", "528K", "530K", 
"531K", "532K", "533K", "534K", "535K", "536K", "537K", "538K", 
"539K", "540K", "541K", "542K", "543K", "544K", "545K", "546K", 
"547K", "548K", "550K", "551K", "552K"), class = "factor"), Lat = c(23.78107, 
23.78115, 23.78122, 23.78125, 23.78081, 23.78096, 23.78062, 23.78068, 
23.78071, 23.78075, 23.78043, 23.78021, 23.77985, 23.77981, 23.77995, 
23.77987, 23.7796, 23.77944, 23.77934, 23.77937, 23.77906, 23.77899, 
23.77907, 23.77889, 23.77898, 23.77863, 23.77865, 23.77855, 23.77852, 
23.77843, 23.77806, 23.77824, 23.77809, 23.7781, 23.77797, 23.77788, 
23.77809, 23.77815, 23.77771, 23.77757, 23.77752, 23.7774, 23.7772, 
23.77869, 23.78084, 23.78178), Long = c(90.65016, 90.64968, 90.6497, 
90.64972, 90.64996, 90.64987, 90.64989, 90.64924, 90.64921, 90.65, 
90.64998, 90.6494, 90.64978, 90.64973, 90.64952, 90.64958, 90.64935, 
90.6492, 90.64922, 90.64919, 90.64928, 90.64937, 90.64887, 90.64919, 
90.64891, 90.64914, 90.64903, 90.64907, 90.6491, 90.64868, 90.6491, 
90.64853, 90.64862, 90.64851, 90.64852, 90.64865, 90.64878, 90.64878, 
90.64866, 90.64859, 90.64839, 90.64858, 90.64861, 90.64922, 90.64994, 
90.64925), Village = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Abdullapur", class = "factor"), 
    Depth_m = c(18, 18, 18, 18, 31.5, 13.5, 15, 13.5, 21, 13.5, 
    18, 15, 24, 13.5, 19.5, 33, 14.4, 18, 21, 13.5, 18, 18, 51, 
    48, 54, 67.5, 69, 69, 66, 66, 21, 60, 66, 54, 31.5, 21, 66, 
    12, 54, 27, 18, 18, 18, 18, 18, 21), As_ug_L = c(68L, 68L, 
    68L, 68L, 306L, 129L, 129L, 20L, 68L, 188L, 129L, 68L, 68L, 
    68L, 129L, 188L, 2L, 68L, 37L, 20L, 306L, 306L, 20L, 306L, 
    20L, 2L, 2L, 2L, 2L, 2L, 306L, 2L, 2L, 2L, 306L, 306L, 2L, 
    306L, 2L, 306L, 306L, 68L, 68L, 306L, 68L, 20L), maxscore = c(10L, 
    11L, 11L, 11L, 12L, 12L, 16L, 13L, 12L, 12L, 16L, 13L, 8L, 
    10L, 9L, 10L, 9L, 10L, 10L, 10L, 7L, 7L, 5L, 7L, 6L, 9L, 
    9L, 9L, 8L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 9L, 9L, 8L, 8L, 8L, 
    8L, 6L, 7L, 12L, 3L), count = c(10L, 11L, 11L, 11L, 12L, 
    12L, 16L, 13L, 12L, 12L, 16L, 13L, 8L, 10L, 9L, 10L, 9L, 
    9L, 9L, 9L, 6L, 6L, 4L, 6L, 5L, 8L, 8L, 8L, 7L, 8L, 8L, 8L, 
    8L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 6L, 6L, 12L, 3L), 
    score = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 
    0L, 1L, 0L, 0L), count_itw = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 6L, 7L, 7L, 8L, 8L, 
    9L, 10L, 10L, 12L, 12L, 12L, 12L, 13L, 11L, 13L, 10L, 10L, 
    10L, 10L, 12L, 12L, 6L, 6L, 5L, 5L, 2L, 12L, 0L, 0L), count_itw10 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 3L, 4L, 4L, 5L, 5L, 6L, 7L, 7L, 9L, 9L, 9L, 9L, 10L, 
    10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 6L, 6L, 5L, 5L, 2L, 
    9L, 0L, 0L)), class = "data.frame", row.names = c(1L, 2L, 
3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 
33L, 34L, 35L, 36L, 37L, 38L, 39L, 41L, 42L, 43L, 44L, 46L, 47L, 
48L, 49L, 50L, 51L))

如何以各自的村庄名称导出每个村庄文件的处理结果？

提前谢谢你:)

【问题讨论】：

您可以列出目录中的 .csv 文件并使用您为单个村庄制作的代码循环浏览它们

标签： r loops batch-processing

【解决方案1】：

只需将您的处理流程封装成一个函数，让它接收村庄文件名作为参数；然后遍历所有文件名并逐个调用该函数，即可构建出一个数据框列表：

# COMMON VARIABLES
# Output folder (with trailing slash, because file names are appended to it
# with paste0) and the reference table shared by every village.
output_path <- "/Users/..../Output/"
dtw_BG <- read.csv('/Users/...../dtw_BG.csv', header = TRUE)
# Select the coordinates by name rather than by position (3, 2) so the result
# does not depend on the column order of the CSV file.
BG <- dtw_BG[, c("Long", "Lat")]

# OUTPUT CSV AND RETURN DATA FRAME
# Processes one village file: reads it, runs the single-village scoring code
# (elided below; it is expected to create `stw` and `abc`), writes the result
# to "<Village>_score.csv" inside `output_path`, and returns the data frame.
#   village_file: path to one village CSV file.
# Reads the global `output_path` defined with the common variables.
calc_score <- function(village_file) {
  gw <- read.csv(village_file, header = TRUE)

  #... REST OF CODE

  # The variable is `output_path` (all lower case); `output_Path` does not
  # exist. The "_score" suffix is the file naming requested in the question.
  write.csv(abc, paste0(output_path, stw$Village[[1]], "_score.csv"), row.names = FALSE)
  return(abc)
}

# PASS FILE NAMES ITERATIVELY TO BUILD LIST OF DFs (WITH EACH CSV)
# `pattern` is a regular expression, not a glob: "\\.csv$" matches only names
# that end in the literal extension ".csv".
v_files <- list.files(path = "/path/to/inputs", pattern = "\\.csv$",
                      full.names = TRUE)
df_list <- lapply(v_files, calc_score)

顺便说一句，您的大部分代码可能会被收紧，因为distGeo 可以接收Lon 和Lat 坐标的矩阵。此外，考虑在每对数据帧和 p 之间进行交叉连接合并（即所有成对匹配），以减少重复的 lapply 调用。为了代码的可维护性，尽量使用列名而不是数字。

注意：以下代码尚需在完整数据上测试和验证，这里仅作为示例给出。

# Logical matrix telling which rows of `pts` lie within `radius` of each row
# of `centres`.
#   pts, centres: data frames with numeric "Long" and "Lat" columns.
#   radius:       distance threshold, in the units returned by distGeo().
# Returns an nrow(pts) x nrow(centres) logical matrix; entry [k, i] is TRUE
# when pts[k, ] is at most `radius` away from centres[i, ]. Pairs whose
# distance is NA count as FALSE. Empty inputs give a matrix with a zero
# dimension and distGeo() is not called.
# NOTE(review): distGeo() is assumed to be geosphere::distGeo (metres); the
# package must be attached by the caller - it is not loaded in this file.
.within_radius <- function(pts, centres, radius) {
  n_pts <- nrow(pts)
  n_ctr <- nrow(centres)
  near <- matrix(FALSE, nrow = n_pts, ncol = n_ctr)
  if (n_pts == 0L || n_ctr == 0L) {
    return(near)
  }

  # Cross join by row index: every pts row is paired with every centres row.
  # The pts index varies fastest, i.e. column-major order, so the distances
  # fill `near` directly.
  k <- rep(seq_len(n_pts), times = n_ctr)
  i <- rep(seq_len(n_ctr), each = n_pts)
  dist <- distGeo(
    as.matrix(pts[k, c("Long", "Lat")]),
    as.matrix(centres[i, c("Long", "Lat")])
  )
  near[] <- !is.na(dist) & dist <= radius
  near
}

# Scores one village file and exports the result.
#   village_file: path to a village CSV with the columns Long, Lat, Village,
#                 Depth_m and As_ug_L.
#   radius:       neighbourhood radius passed to the distance test
#                 (default 100, the value hard-coded in the original code).
# Reads the globals `BG` (data frame with Long/Lat columns of the reference
# wells) and `output_path` (output folder ending in a path separator).
# Returns the rows with Depth_m <= 90 plus, for each of them:
#   maxscore    - wells with Depth_m <= 90 and As_ug_L > 50 within `radius`
#   count       - those of the above that have a BG well within `radius`
#   score       - maxscore - count
#   count_itw   - wells with 45 <= Depth_m <= 90 within `radius`
#   count_itw10 - wells with 45 <= Depth_m <= 90 and As_ug_L <= 10 within `radius`
# Side effect: writes the data frame to "<output_path><Village>_score.csv".
calc_score <- function(village_file, radius = 100) {
  gw <- read.csv(village_file, header = TRUE)

  required <- c("Long", "Lat", "Village", "Depth_m", "As_ug_L")
  missing_cols <- setdiff(required, names(gw))
  if (length(missing_cols) > 0) {
    stop("Missing column(s) in ", village_file, ": ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  ### DATA FRAME SUBSETS
  # Rows with a missing depth or arsenic value fail every threshold instead of
  # turning into all-NA rows.
  coords <- c("Long", "Lat")
  depth <- gw[["Depth_m"]]
  arsenic <- gw[["As_ug_L"]]
  is_shallow <- !is.na(depth) & depth <= 90
  is_mid <- !is.na(depth) & depth >= 45 & depth <= 90
  is_high_as <- !is.na(arsenic) & arsenic > 50
  is_low_as <- !is.na(arsenic) & arsenic <= 10

  stw <- gw[is_shallow, , drop = FALSE]
  p <- stw[, coords, drop = FALSE]
  R <- gw[is_shallow & is_high_as, coords, drop = FALSE]
  ITW <- gw[is_mid, coords, drop = FALSE]
  ITW_10 <- gw[is_mid & is_low_as, coords, drop = FALSE]

  ### NEIGHBOUR COUNTS, ONE VALUE PER ROW OF p (AND THEREFORE OF stw)
  # Column sums keep the zeros and the row order of p, so the vectors always
  # line up with stw; grouping with aggregate() dropped points without any
  # neighbour and reordered the rest.
  near_R <- .within_radius(R, p, radius)
  maxscore <- as.integer(colSums(near_R))
  count_itw <- as.integer(colSums(.within_radius(ITW, p, radius)))
  count_itw10 <- as.integer(colSums(.within_radius(ITW_10, p, radius)))

  ### R WELLS THAT HAVE A BG WELL WITHIN radius
  # "Minimum distance to any BG well <= radius" is the same as "at least one
  # BG well within radius"; it depends only on the R well, not on p.
  has_bg <- rowSums(.within_radius(R, BG, radius)) > 0
  count <- as.integer(colSums(near_R[has_bg, , drop = FALSE]))

  ### SCORE CALCULATION
  score <- maxscore - count

  ### FINAL DATA FRAME
  village_df <- cbind.data.frame(
    stw,
    maxscore = maxscore,
    count = count,
    score = score,
    count_itw = count_itw,
    count_itw10 = count_itw10
  )

  # Name the output after the Village column; fall back to the input file name
  # when no row passed the depth filter.
  village <- if (nrow(village_df) > 0) {
    as.character(village_df$Village[[1]])
  } else {
    tools::file_path_sans_ext(basename(village_file))
  }
  write.csv(village_df, paste0(output_path, village, "_score.csv"),
            row.names = FALSE)

  village_df
}

【讨论】：

非常感谢@Parfait！我可以根据需要使用公共变量导出村庄文件，并按照您提供的方式传递文件名。但是，直到我使用列名而不是数字，分数才正确。我还尝试了使用交叉连接合并的代码，但它没有给我正确的分数（合并数据长度不相等）。我收到一个错误......mdf
是的，再一次，底部重构的代码示例需要更多的测试和评估，仅作为演示显示。这个想法是避免在每个数据帧中逐行迭代，而是将整个集合合并在一起以进行成对距离计算。我修复了一些语法问题。错误应该出现在transform 行而不是merge。但是，如果第一个代码块对您有用（使用您现有的代码），很高兴我能提供帮助！