【问题标题】:Nested Tabular Output in RR中的嵌套表格输出
【发布时间】:2018-02-02 00:20:45
【问题描述】:

我正在努力确定如何生成表格输出,如下所示。 (我希望能够利用条件逻辑对单元格进行着色,如 Excel 中生成的附加输出所示,但我很高兴能够简单地了解如何在没有着色的情况下生成输出。)


所需的输出(在 Excel 中生成)


采用的总体流程:

  1. 使用“gains”包提供的 MineThatData 数据集。
  2. 完整的数据集包括四个模型分数。只需保留“logistic.score”作为说明。
  3. 根据模型分数(即logistic.score)将训练样本(train = 1)中的每条记录分配给一个十分位数。
  4. 使用训练样本中的分数范围将测试样本中的记录分配给十分位数
  5. 按十分位数报告每个抽样组的各种统计数据(包括转化率和每条记录的支出)。

加载所需的包。

library(gains)
library(plyr)
library(StatMeasures)
library(sqldf)
library(tables)

full_dataset <- MineThatData

reduced_dataset <- full_dataset[ , 
                           c("conversion","spend","train","logistic.score")]

reduced_dataset <- rename(reduced_dataset,c("logistic.score"="score"))

reduced_dataset$score <- round(reduced_dataset$score, 8)

summary(reduced_dataset)

trainDF <- reduced_dataset[reduced_dataset$train == 1, ]
testDF  <- reduced_dataset[reduced_dataset$train == 0, ]

trainDF$Decile <- decile(trainDF$score, decreasing = TRUE)

summarize_results_by_decile <- function(Input_DF, Output_DF) {
  Output_DF <- sqldf("
     select
      case when train = 1 then 'Train' else 'Test' end as Sample
     ,Decile
     ,count(*) as Num_Records
     ,sum(conversion) as Num_Converters
     ,sum(spend) as Sum_Spend
     ,min(score) as Min_Score
     ,max(score) as Max_Score
     ,round(avg(conversion),4) as Pct_Response
     ,round(avg(spend),2) as Spend_per_Record
     from Input_DF
     group by Decile
     order by Decile
     ")

  temp_df <- sqldf("
     select
      case when train = 1 then 'Train' else 'Test' end as Sample
     ,11 as Decile
     ,count(*) as Num_Records
     ,sum(conversion) as Num_Converters
     ,sum(Spend) as Sum_Spend
     ,min(score) as Min_Score
     ,max(score) as Max_Score
     ,round(avg(conversion),4) as Pct_Response
     ,round(avg(spend),2) as Spend_per_Record
     from Input_DF
     ")

  Output_DF <- rbind(Output_DF , temp_df)

  Output_DF$Decile <- factor(Output_DF$Decile, 
                labels =c("1","2","3","4","5","6","7","8","9","10","Total"))

  Output_DF$Pct_of_Records <- paste(format(round(Output_DF$Num_Records    
    / temp_df$Num_Records * 100, 1), nsmall=1), "%", sep="")

  Output_DF$Pct_of_Converters <- paste(format(round(Output_DF$Num_Converters 
    / temp_df$Num_Converters * 100, 1), nsmall=1), "%", sep="")

  Output_DF$Pct_of_Spend <- paste(format(round(Output_DF$Sum_Spend      
    / temp_df$Sum_Spend * 100, 1), nsmall=1), '%', sep="")

  Output_DF$Num_Records <- format(Output_DF$Num_Records, big.mark = ",")

  Output_DF$Num_Converters <- format(Output_DF$Num_Converters, 
                                     big.mark = ",")

  Output_DF$Sum_Spend <- paste("$" , sep="", format(Output_DF$Sum_Spend,      
                                                    big.mark = ","))

  Output_DF$Pct_Response <- paste(format(round(Output_DF$Pct_Response * 100, 
                                        2), nsmall=2), "%", sep="")

  Output_DF$Spend_per_Record <- paste("$", sep="", 
                               format(Output_DF$Spend_per_Record, nsmall=2))

return(Output_DF)
}

summary_results_train <- summarize_results_by_decile(trainDF, 
                                                     summary_results_train)

Min_Decile_Scores <- t(subset(summary_results_train, select = Min_Score))

assign_decile <- function(score_var, decile_var) {
  decile_var <- ifelse(score_var >= Min_Decile_Scores[1], 1,
                ifelse(score_var >= Min_Decile_Scores[2], 2,
                ifelse(score_var >= Min_Decile_Scores[3], 3,
                ifelse(score_var >= Min_Decile_Scores[4], 4,
                ifelse(score_var >= Min_Decile_Scores[5], 5,
                ifelse(score_var >= Min_Decile_Scores[6], 6,
                ifelse(score_var >= Min_Decile_Scores[7], 7,
                ifelse(score_var >= Min_Decile_Scores[8], 8,
                ifelse(score_var >= Min_Decile_Scores[9], 9, 10)))))))))

  return(decile_var)
}

验证用于在训练数据上分配十分位分配的逻辑:

trainDF$Replicate_Decile <- assign_decile(trainDF$score, 
                                                   trainDF$Replicate_Decile)
table(trainDF$Decile, trainDF$Replicate_Decile)
trainDF$Replicate_Decile <- NULL

testDF$Decile <- assign_decile(testDF$score, testDF$Decile)

summary_results_test <- summarize_results_by_decile(testDF, 
                                                    summary_results_test)

summary_results <- rbind(summary_results_train, summary_results_test)

summary_results <- subset(summary_results, select = -c(Min_Score,Max_Score))

这样做是为了重新排列要显示的列:

summary_results <- summary_results[ ,c("Sample", "Decile", "Num_Records", 
                         "Num_Converters", "Sum_Spend", "Pct_of_Records", 
                         "Pct_of_Converters","Pct_of_Spend","Pct_Response", 
                         "Spend_per_Record")]

这样做是为了影响列名的显示方式:

summary_results <- rename(summary_results,
   c("Num_Records"       = "# Records",
     "Num_Converters"    = "# Converters",
     "Sum_Spend"         = "Total Spend",
     "Pct_of_Records"    = "% of Records",
     "Pct_of_Converters" = "% of Converters",
     "Pct_of_Spend"      = "% of Spend",
     "Pct_Response"      = "% Conversion",
     "Spend_per_Record"  = "$ per Record"))

print(summary_results[summary_results$Sample == 'Train', -1], row.names = FALSE)
print(summary_results[summary_results$Sample == 'Test' , -1], row.names = FALSE)

我能够在 R 中生成的输出


除了这是我第一次在 Stack Overflow 上发帖外,我还是一个相对较新的 R 用户。我希望我的代码是可以理解的!提前感谢您的任何帮助。

【问题讨论】:

标签: r


【解决方案1】:

不太好和简洁的解决方案:

library(gains)
library(expss)

full_dataset = MineThatData

reduced_dataset = full_dataset[ , 
                                 c("conversion","spend","train","logistic.score")]
reduced_dataset$score = round(reduced_dataset$logistic.score, 8)

summary_fun = function(data){
    calc(data, 
         list(
             "# Records" = NROW(data),
             "# Converters" = sum(conversion), 
             "$ Spend" = sum(spend), 
             "% Conversion" = round(mean(conversion),4)*100, 
             "$ per Record" = round(mean(spend),2)    
         )
    )
}

reduced_dataset %>% 
    compute({
        decile_points =  quantile(score[train==1],
                                  probs = seq(0,1,by = 0.1)
        )
        decile_points[length(decile_points)] = Inf
        # '11 - ' is needed make reverse order
        decile = 11 - as.integer(cut(score, decile_points, include.lowest = TRUE)) 
        rm(decile_points) # we don't need this in our dataset

    }) %>% 
    # "|" to suppress variable label
    tab_rows("|" = decile, total(label = "Total")) %>% 
    tab_cols(total(label = "|")) %>% 
    tab_cells(sheet(conversion, 
                    spend)) %>% 
    tab_subgroup(train==1) %>% 
    tab_stat_fun_df(summary_fun, label = "Train") %>% 
    tab_subgroup(train==0) %>% 
    tab_stat_fun_df(summary_fun, label = "Test") %>% 
    tab_pivot(stat_position = "inside_columns") %>% 
    # calculate  percent
    do_repeat(i = perl("Records|Converters|Spend"), {
        ..[gsub("#|\\$", "% of", .item_value, perl = TRUE)] = i/i[.N]*100
    }) %>%
    # move some columns to the end
    keep(!fixed("% Conversion") & !fixed("$ per Record"), other()) %>% 
    # formating
    do_repeat(i = fixed("#"), {
        i = format(i, big.mark = ",")
    }) %>% 
    do_repeat(i = fixed("$"), {
        i = paste0("$", format(i, big.mark = ","))
    }) %>%
    do_repeat(i = fixed("%"), {
        i = paste0(format(round(i, 1), nsmall=1), "%")
    }) %>% 
    htmlTable()

这给出了以下结果:

免责声明:我是“expss”包的作者。

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2022-01-24
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2013-03-06
    相关资源
    最近更新 更多