根据表中最近的列分组的平均值替换 NA 的值答案

【问题标题】：Replace value of NA based on the mean of nearest groupings of columns in a table根据表中最近的列分组的平均值替换 NA 的值
【发布时间】：2017-01-21 08:00:15
【问题描述】：

我的目标是用最近的变量分组替换 NA。例如，假设有四个变量A、B、C 和Num。 Num 是数值变量，而 A、B 和 C 是分类变量。现在，如果Num 的值缺失，比如A = Alpha、B = Beta 和C = Theta，那么我想寻找这个组合的其他观察值，计算它们的平均值并替换 NA .

如果不存在这样的组合，我会寻找 A = Alpha 和 B = Beta 组合的观察结果（因此，术语“最近分组”），计算它们的平均值并替换它。

如果不存在这样的组合，我会查找所有归类为A = Alpha 的观察值，计算它们的平均值并替换它。

如果这是唯一的观察结果，那么我们将其替换为 0。我已在随此发布的测试文件中创建了此类场景。

虽然我的代码运行良好，但它非常程序化。我已经从 C/C++ 编程过渡，但我仍然不习惯 R 的向量化方法。因此，我正在寻找一种方法：

a) 更干净（请不要使用for 循环；内存更少，速度更快）。在编写代码时，我意识到我没有充分利用 R 编程的力量。

b) 易于理解。

我在下面的示例输出中添加了 cmets 仅供参考。

输入数据：

dput(DFile)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G2", 
"G3", "G3", "G3", "G3", "G5", "G5", "G5", "G5", "G5", "G6"), 
    Country_SV = c("United States", "United States", "United States", 
    "United States", "United States", "United States", "United States", 
    NA, NA, NA, "Europe", "UK", "France", "Europe", "Europe", 
    "Australia"), Product_BU = c("Laptop", "Laptop", "Laptop", 
    "Laptop", "Laptop", "Laptop", "Laptop", NA, NA, NA, "Pencil", 
    "Power Cord", "Laptop", "Keyboard", "Mouse", "Motherboard"
    ), Prob_model3 = c(0, 79647405.9878251, 282615405.328728, 
    NA, NA, 363419594.065383, 0, 72870592.8458704, 260045174.088548, 
    369512727.253779, NA, 234, NA, 5, 10, NA)), .Names = c("Region_SL", 
"Country_SV", "Product_BU", "Prob_model3"), row.names = c(NA, 
16L), class = "data.frame")

预期输出： 请注意，cmets 仅供参考。该列不是必需的。

dput(Output)
structure(list(Region_SL = c("G1", "G1", "G1", "G1", "G2", "G2", 
"G3", "G3", "G3", "G3", "G5", "G5", "G5", "G5", "G5", "G6"), 
    Country_SV = c("United States", "United States", "United States", 
    "United States", "United States", "United States", "United States", 
    "United States", "United States", "United States", "Europe", 
    "UK", "France", "Europe", "Europe", "Australia"), Product_BU = c("Laptop", 
    "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", "Laptop", 
    "Laptop", "Laptop", "Laptop", "Pencil", "Power Cord", "Laptop", 
    "Keyboard", "Mouse", "Motherboard"), Prob_model3 = c(0, 79647405.9878251, 
    282615405.328728, 120754270.438851, 363419594.065383, 363419594.065383, 
    0, 72870592.8458704, 260045174.088548, 369512727.253779, 
    7.5, 234, 83, 5, 10, 0), Comment = c(NA, NA, NA, "Grouped on G1, Laptop, US; Average of rows 1 to 3", 
    "Grouped on G2, US, Laptop; Average is the only value in row 6", 
    NA, NA, NA, NA, NA, "Group of G5, Europe and Pencil are unique; G5 and Europe exist. Average of row 14 and 15", 
    NA, "Group of G5, France and Laptop is unique; Group of G5 and France is unique as well; Use group of G5 and take average of row 12, 14, 15", 
    NA, NA, "Unique. Substitute 0")), .Names = c("Region_SL", 
"Country_SV", "Product_BU", "Prob_model3", "Comment"), row.names = c(NA, 
16L), class = "data.frame")

这是我的代码：（代码运行良好，预期的输出是b。b 只不过是上面发布的Output，没有 cmets。

DFile_New <-DFile
DFile_New<-DFile_New %>% 
  arrange(Region_SL, Country_SV,Product_BU) 

#replace categorical variable with the combination above or below the row to complete cases.
DFile_New[,1:3]<-  zoo::na.locf(DFile_New[,1:3])

#Create look-up table for means, for each type of combination.
Lookup1<- DFile_New %>%
  dplyr::group_by(Region_SL, Country_SV, Product_BU) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

Lookup2<-DFile_New %>%
  dplyr::group_by(Region_SL, Country_SV) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

Lookup3<-DFile_New %>%
  dplyr::group_by(Region_SL) %>%
  dplyr::summarise(count=n(),Mean_prob = mean(Prob_model3,na.rm = TRUE)) 

Lookup_Table<-dplyr::bind_rows(Lookup1,Lookup2,Lookup3)

#Get rid of those rows with count = 1
Lookup_Table<-Lookup_Table[!Lookup_Table$count==1,]
colnames(Lookup_Table)[5]<-"Prob_model3"

#Look for combinations based on Region, Country and Product
b<-DFile_New %>%
  dplyr::left_join(Lookup_Table,by=c("Region_SL", "Country_SV", "Product_BU"))
b$Prob_model3 <- coalesce(b$Prob_model3.x,b$Prob_model3.y)
#Drop the two columns
b$Prob_model3.x<-NULL
b$Prob_model3.y<-NULL
b$count<-NULL
b<-b[!(is.na(b$Country_SV)&is.na(b$Product_BU)),]


c<-b[is.na(b$Prob_model3),] %>%
  dplyr::left_join(Lookup_Table[is.na(Lookup_Table$Product_BU) & !is.na(Lookup_Table$Country_SV),],by=c("Region_SL", "Country_SV")) %>%
    dplyr::mutate(Prob_model3 = coalesce(Prob_model3.x,Prob_model3.y)) %>%
    dplyr::select(Region_SL:Product_BU.x, Prob_model3)
colnames(c)[3]<-"Product_BU"
colnames(c)[4]<-"Prob_model3"

b<-rbind(b,c)
b%>% unite(Col,Region_SL:Product_BU,sep=".")
c<-b
b<-b[complete.cases(b[4]),]

#Look for combinations based on Region, and Country     
c<-c[is.na(c$Prob_model3),] %>%
  dplyr::left_join(Lookup_Table[is.na(Lookup_Table$Product_BU) & is.na(Lookup_Table$Country_SV),],by=c("Region_SL")) %>%
  dplyr::filter(!is.na(Prob_model3.y)) %>%
  dplyr::mutate(Prob_model3.1 = coalesce(Prob_model3.x,Prob_model3.y)) %>%
  dplyr::select(Region_SL:Product_BU.x, Prob_model3.1) %>%
  unique(.)
colnames(c)[3]<-"Product_BU"
colnames(c)[2]<-"Country_SV"

#Look for combinations based on Region     
b<-b%>% 
  full_join(c) %>% 
  dplyr::mutate(Prob_model3.2 = coalesce(Prob_model3,Prob_model3.1)) %>%
  dplyr::select(Region_SL:Product_BU,Prob_model3.2)
colnames(b)[4]<-"Prob_model3"

#Are there any unique observations left?
b<-rbind(b,anti_join(DFile_New,b,by=c("Region_SL", "Country_SV", "Product_BU")))
b[is.na(b$Prob_model3),"Prob_model3"]<-0

我对 R 编程世界比较陌生。我真诚地感谢任何帮助。我最好寻找高级解决方案——lapply/dplyr/tidyr，只要它不像我的那么复杂，什么都可以。

我的会话信息：

R version 3.3.2 (2016-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] grDevices datasets  stats     graphics  grid      tcltk     utils     methods   base    

$otherPkgs
 [1] "bit"               "bit64"             "boot"              "car"               "compare"          
 [6] "corrgram"          "corrplot"          "cowplot"           "debug"             "directlabels"     
[11] "dplyr"             "foreign"           "Formula"           "ggplot2"           "ggthemes"         
[16] "gmodels"           "hexbin"            "Hmisc"             "installr"          "knitr"            
[21] "lattice"           "lubridate"         "magrittr"          "maps"              "openxlsx"         
[26] "pastecs"           "plotly"            "plyr"              "psych"             "purrr"            
[31] "R2HTML"            "readr"             "readstata13"       "reshape2"          "ResourceSelection"
[36] "rJava"             "rmarkdown"         "sm"                "stringr"           "survival"         
[41] "tables"            "tibble"            "tidyr"             "tidyverse"         "tufte"            
[46] "tufterhandout"     "vcd"               "xlsxjars"          "xts"               "zoo"              

$loadedOnly
 [1] "acepack"      "assertthat"   "backports"    "base64enc"    "bitops"       "broom"        "caTools"     
 [8] "checkmate"    "class"        "cluster"      "codetools"    "colorspace"   "data.table"   "DBI"         
[15] "dendextend"   "DEoptimR"     "digest"       "diptest"      "evaluate"     "flexmix"      "foreach"     
[22] "fpc"          "gclus"        "gdata"        "gplots"       "gridExtra"    "gtable"       "gtools"      
[29] "haven"        "hms"          "htmlTable"    "htmltools"    "htmlwidgets"  "httr"         "iterators"   
[36] "jsonlite"     "kernlab"      "KernSmooth"   "latticeExtra" "lazyeval"     "lme4"         "lmtest"      
[43] "MASS"         "Matrix"       "MatrixModels" "mclust"       "mgcv"         "minqa"        "mnormt"      
[50] "modelr"       "modeltools"   "munsell"      "mvbutils"     "mvtnorm"      "nlme"         "nloptr"      
[57] "nnet"         "parallel"     "pbkrtest"     "prabclus"     "quadprog"     "quantreg"     "R6"          
[64] "RColorBrewer" "Rcpp"         "readxl"       "registry"     "robustbase"   "rpart"        "rprojroot"   
[71] "rvest"        "scales"       "seriation"    "SparseM"      "splines"      "stats4"       "stringi"     
[78] "tools"        "trimcluster"  "TSP"          "viridisLite"  "whisker"      "xml2"

【问题讨论】：

标签： r dplyr tidyr

【解决方案1】：

有一个使用dplyr工具的两步解决方案：

为特定类型的平均创建代表“查找”的列；
连续替换NAs。

代码如下：

library(dplyr)

df_1 <- df %>%
  group_by(Region_SL) %>%
  summarise(lookup_1 = mean(Prob_model3, na.rm=TRUE))
df_2 <- df %>%
  group_by(Region_SL, Country_SV) %>%
  summarise(lookup_2 = mean(Prob_model3, na.rm=TRUE))
df_3 <- df %>%
  group_by(Region_SL, Country_SV, Product_BU) %>%
  summarise(lookup_3 = mean(Prob_model3, na.rm=TRUE))

df_new <- df %>%
  left_join(df_3, by = c("Region_SL", "Country_SV", "Product_BU")) %>%
  left_join(df_2, by = c("Region_SL", "Country_SV")) %>%
  left_join(df_1, by = c("Region_SL")) %>%
  mutate(modProb_model3 = coalesce(x=Prob_model3,
                                   lookup_3, lookup_2, lookup_1,
                                   0)) %>%
  select(Region_SL, Country_SV, Product_BU, Prob_model3=modProb_model3)

这里df是输入数据框。 df_1、df_2和df_3是具有一定平均信息的数据框（索引代表分组的分类变量的数量）。

在连续的左连接之后，使用coalesce函数创建新变量modProb_model3：它在每个位置找到第一个非缺失值。

编辑

我认为上述解决方案对特定问题最有效。例如，如果在 NA 替换中至少有 10 个可能的分组需要考虑，那么一些自动化会更好。这种自动化可以如下（使用包tidyverse和lazyeval）：

library(tidyverse)

value_name <- "Prob_model3"
max_group_vars <- c("Region_SL", "Country_SV", "Product_BU")
n_group_vars <- length(max_group_vars)
lookup_vars_list <- c(x = value_name, paste0("lookup_", n_group_vars:1)) %>%
  as.list()

get_lookup_table <- function(.data,
                             group_vars,
                             value_name = "Prob_model3",
                             lookup_index = 1) {
  summarise_data <- (~ mean(val, na.rm = TRUE)) %>%
    lazyeval::interp(val = as.name(value_name)) %>%
    list() %>%
    setNames(paste0("lookup_", lookup_index))
  .data %>%
    group_by_(.dots = as.list(group_vars)) %>%
    summarise_(.dots = summarise_data)
}

df_new_1 <- c(
  list(df),
  map(n_group_vars:1, function(lookup_index) {
    get_lookup_table(.data = df,
                     group_vars = max_group_vars[1:lookup_index],
                     value_name = value_name,
                     lookup_index = lookup_index)
  })
) %>%
  reduce(left_join) %>%
  mutate(modValue = select_(., .dots = lookup_vars_list) %>%
           as.list() %>%
           c(0) %>%
           do.call(what = coalesce)) %>%
  select(-matches(match = paste0("^lookup_[0-9]+$|", value_name))) %>%
  rename_(.dots = setNames(list("modValue"), value_name))

基本上算法是相同的，但代码更通用：它将列中的 NA 值替换为名称 value_name，基于由减少的列名称集定义的组中的平均值（从存储在 max_group_vars 中的集合开始）。代码大量使用dplyr 的标准评估（请参阅https://cran.r-project.org/web/packages/dplyr/vignettes/nse.html）。以下是一些解释：

get_lookup_table 为指定的分组变量创建查找表。查找列具有唯一名称lookup_i 其中i - 查找表的指定索引；
使用reduce 函数完成连续左连接，该函数连续将具有两个参数的函数（此处为left_join 与x 和y）应用于其先前的调用和一个新值。例如：f1=f(x1, x2) --> f2=f(f1, x3) 等等。注意无需指定by 参数即可正确完成左连接，因为查找表是以“自然连接”正确的方式创建的。还会有dplyr关于它猜测加入列的警告；
modValue 是使用函数 do.call 创建的，并为 coalesce 创建了初步参数列表；
管道中的最后两个元素：使用正则表达式选择适当的列，然后将modValue 重命名为所需的value_name。

【讨论】：

很好的反应。您认为我们可以缩短查找表的生成时间吗？我相信我们的代码有一些冗余，因为我们正在编写很多重复的东西。你怎么看？
编辑了答案。但我认为这种复杂方法有效的情况并不多（除了所涉及的学习技术:)）。
非常感谢。我收到以下错误：Error in UseMethod("group_by_") : no applicable method for 'group_by_' applied to an object of class "function"。我进行了回溯，发现11.dplyr::group_by_ 存在问题。回溯：11. dplyr::group_by_... 10. function_list[[i]](value) 9. freduce(value, _function_list) 8. _fseq(_lhs) 7. eval(expr, envir, ...) 6. eval(quote(_fseq(_lhs)), env, env) 5. withVisible(eval(quote... 4. .data %>% dplyr::group_by_....3. get_lookup_table 2..f(.x[[i]], ...) 1. purrr::map 非常感谢您的帮助。
很遗憾，我无法重现您的错误。这种行为的可能问题是： 1. 未安装包lazyeval； 2. 没有将代码正确复制到您的控制台中。
再次感谢您的帮助。我正在发布 sessionInfo()。你认为它可能有帮助吗？我也尝试过多次复制粘贴您的代码，但问题仍然存在。