【发布时间】:2018-02-02 00:20:45
【问题描述】:
我正在努力确定如何生成表格输出,如下所示。 (我希望能够利用条件逻辑对单元格进行着色,如 Excel 中生成的附加输出所示,但我很高兴能够简单地了解如何在没有着色的情况下生成输出。)
采用的总体流程:
- 使用“gains”包提供的 MineThatData 数据集。
- 完整的数据集包括四个模型分数。只需保留“logistic.score”作为说明。
- 根据模型分数(即logistic.score)将训练样本(train = 1)中的每条记录分配给一个十分位数。
- 使用训练样本中的分数范围将测试样本中的记录分配给十分位数
- 按十分位数报告每个抽样组的各种统计数据(包括转化率和每条记录的支出)。
加载所需的包。
library(gains)
library(plyr)
library(StatMeasures)
library(sqldf)
library(tables)
full_dataset <- MineThatData
reduced_dataset <- full_dataset[ ,
c("conversion","spend","train","logistic.score")]
reduced_dataset <- rename(reduced_dataset,c("logistic.score"="score"))
reduced_dataset$score <- round(reduced_dataset$score, 8)
summary(reduced_dataset)
trainDF <- reduced_dataset[reduced_dataset$train == 1, ]
testDF <- reduced_dataset[reduced_dataset$train == 0, ]
trainDF$Decile <- decile(trainDF$score, decreasing = TRUE)
summarize_results_by_decile <- function(Input_DF, Output_DF) {
Output_DF <- sqldf("
select
case when train = 1 then 'Train' else 'Test' end as Sample
,Decile
,count(*) as Num_Records
,sum(conversion) as Num_Converters
,sum(spend) as Sum_Spend
,min(score) as Min_Score
,max(score) as Max_Score
,round(avg(conversion),4) as Pct_Response
,round(avg(spend),2) as Spend_per_Record
from Input_DF
group by Decile
order by Decile
")
temp_df <- sqldf("
select
case when train = 1 then 'Train' else 'Test' end as Sample
,11 as Decile
,count(*) as Num_Records
,sum(conversion) as Num_Converters
,sum(Spend) as Sum_Spend
,min(score) as Min_Score
,max(score) as Max_Score
,round(avg(conversion),4) as Pct_Response
,round(avg(spend),2) as Spend_per_Record
from Input_DF
")
Output_DF <- rbind(Output_DF , temp_df)
Output_DF$Decile <- factor(Output_DF$Decile,
labels =c("1","2","3","4","5","6","7","8","9","10","Total"))
Output_DF$Pct_of_Records <- paste(format(round(Output_DF$Num_Records
/ temp_df$Num_Records * 100, 1), nsmall=1), "%", sep="")
Output_DF$Pct_of_Converters <- paste(format(round(Output_DF$Num_Converters
/ temp_df$Num_Converters * 100, 1), nsmall=1), "%", sep="")
Output_DF$Pct_of_Spend <- paste(format(round(Output_DF$Sum_Spend
/ temp_df$Sum_Spend * 100, 1), nsmall=1), '%', sep="")
Output_DF$Num_Records <- format(Output_DF$Num_Records, big.mark = ",")
Output_DF$Num_Converters <- format(Output_DF$Num_Converters,
big.mark = ",")
Output_DF$Sum_Spend <- paste("$" , sep="", format(Output_DF$Sum_Spend,
big.mark = ","))
Output_DF$Pct_Response <- paste(format(round(Output_DF$Pct_Response * 100,
2), nsmall=2), "%", sep="")
Output_DF$Spend_per_Record <- paste("$", sep="",
format(Output_DF$Spend_per_Record, nsmall=2))
return(Output_DF)
}
summary_results_train <- summarize_results_by_decile(trainDF,
summary_results_train)
Min_Decile_Scores <- t(subset(summary_results_train, select = Min_Score))
assign_decile <- function(score_var, decile_var) {
decile_var <- ifelse(score_var >= Min_Decile_Scores[1], 1,
ifelse(score_var >= Min_Decile_Scores[2], 2,
ifelse(score_var >= Min_Decile_Scores[3], 3,
ifelse(score_var >= Min_Decile_Scores[4], 4,
ifelse(score_var >= Min_Decile_Scores[5], 5,
ifelse(score_var >= Min_Decile_Scores[6], 6,
ifelse(score_var >= Min_Decile_Scores[7], 7,
ifelse(score_var >= Min_Decile_Scores[8], 8,
ifelse(score_var >= Min_Decile_Scores[9], 9, 10)))))))))
return(decile_var)
}
验证用于在训练数据上分配十分位分配的逻辑:
trainDF$Replicate_Decile <- assign_decile(trainDF$score,
trainDF$Replicate_Decile)
table(trainDF$Decile, trainDF$Replicate_Decile)
trainDF$Replicate_Decile <- NULL
testDF$Decile <- assign_decile(testDF$score, testDF$Decile)
summary_results_test <- summarize_results_by_decile(testDF,
summary_results_test)
summary_results <- rbind(summary_results_train, summary_results_test)
summary_results <- subset(summary_results, select = -c(Min_Score,Max_Score))
这样做是为了重新排列要显示的列:
summary_results <- summary_results[ ,c("Sample", "Decile", "Num_Records",
"Num_Converters", "Sum_Spend", "Pct_of_Records",
"Pct_of_Converters","Pct_of_Spend","Pct_Response",
"Spend_per_Record")]
这样做是为了影响列名的显示方式:
summary_results <- rename(summary_results,
c("Num_Records" = "# Records",
"Num_Converters" = "# Converters",
"Sum_Spend" = "Total Spend",
"Pct_of_Records" = "% of Records",
"Pct_of_Converters" = "% of Converters",
"Pct_of_Spend" = "% of Spend",
"Pct_Response" = "% Conversion",
"Spend_per_Record" = "$ per Record"))
print(summary_results[summary_results$Sample == 'Train', -1], row.names = FALSE)
print(summary_results[summary_results$Sample == 'Test' , -1], row.names = FALSE)
除了这是我第一次在 Stack Overflow 上发帖外,我还是一个相对较新的 R 用户。我希望我的代码是可以理解的!提前感谢您的任何帮助。
【问题讨论】:
-
我认为
htmlTable包可以用来生成这样的嵌套表格,包括阴影。见cran.r-project.org/web/packages/htmlTable/vignettes/tables.html
标签: r