【问题标题】:Wanting to summarize multiple columns in R but not summarizing correctly(想在 R 中汇总多个列,但汇总结果不正确)
【发布时间】:2018-06-29 18:42:03
【问题描述】:

我想最终按季度汇总我的数据。我可以用下面的代码来做,但由于下面代码中的 INV_ITEM_ID 不是数字类型,它不允许我将它添加到列表中。

# Sums the listed columns per quarter of INVOICE_DT. This call reproduces the
# reported error: INV_ITEM_ID is a character column, so sum() cannot
# aggregate it when it is listed in .SDcols.
temptable[ ,lapply(.SD, sum), by = quarter(INVOICE_DT), 
    .SDcols = c("INV_ITEM_ID","Ext Sale", "Ext Total Cost", "CE100", "CE110","CE120","QTY_SOLD","PACKSLIP_WHSL")]

运行脚本时出现以下错误

Error in gsum(INV_ITEM_ID) : 
  Type 'character' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)

下面我提供了我的数据子集(前 30 行)。

library(data.table)
temptable = setDT(structure(list(INVOICE_DT = c("2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29"), BRANCH_CODE = c("AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001"
), INVOICE_NO = c("AK0015522", "AK0015529", "AK0015515", "AK0015529", 
"AK0015514", "AK0015513", "AK0015519", "AK0015519", "AK0015520", 
"AK0015520", "AK0015520", "AK0015522", "AK0015520", "AK0015520", 
"AK0015520", "AK0015518", "AK0015525", "AK0015520", "AK0015519", 
"AK0015520", "AK0015525", "AK0015518", "AK0015529", "AK0015519", 
"AK0015525", "AK0015519", "AK0015519", "AK0015522", "AK0015522", 
"AK0015529"), INV_SEQ_NO = c(6L, 6L, 3L, 3L, 3L, 3L, 148L, 149L, 
91L, 45L, 162L, 34L, 125L, 39L, 159L, 56L, 64L, 127L, 160L, 93L, 
45L, 34L, 5L, 138L, 46L, 38L, 161L, 35L, 28L, 4L), INV_ITEM_ID = c("11239190", 
"11239190", "55692", "400050-131280", "0700535", "0800758", "69970", 
"0445050", "0819092", "0115160", "0115160", "0703316", "55692", 
"0115157", "0115157", "0115157", "440704-131280", "55618", "0703318", 
"0205428", "0702119", "0115157", "465278", "465278", "0702121", 
"15166", "0703315", "0703315", "0715718", "0243945"), ITEM_DESCR = c("Au-Ve-Co# 20898", 
"Au-Ve-Co# 20898", "MM HSS T TAP 16X1.5", "3/16\"\"\"\"H05703-100R Hs", 
"18/2 BLK SOOW", "3x1x1/4 180G FW", "BF SPEC SER HV1-6C", "1/4MP External Seat", 
"1-1/2xNHConditonDisc", "HCS 7/16-14x1 3/4 YZ8", "HCS 7/16-14x1 3/4 YZ8", 
"25AMP MINI AUTO FUSE", "MM HSS T TAP 16X1.5", "HCS 7/16-14x1.25 YZ8", 
"HCS 7/16-14x1.25 YZ8", "HCS 7/16-14x1.25 YZ8", "BPOLY MELBW 1/4TX1/8", 
"MM HSS P TAP 5 X 0.8", "MAXI-BLADE 20 AMP", "1-1/2\"\"\"\" T3 VF SURF", 
"16ga Red Wire/PPF", "HCS 7/16-14x1.25 YZ8", "80I 45 ELBOW 1/4 BLK", 
"80I 45 ELBOW 1/4 BLK", "16ga Blk Wire/PPF", "HCS 7/16-14x3.25 YZ8", 
"20AMP MINI AUTO FUSE", "20AMP MINI AUTO FUSE", "3/0-3/8\"\"\"\"HvyWalCprLug", 
"97053 Pink Marker"), STD_ITEM = c("Y", "Y", "Y", "Y", "Y", "Y", 
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", 
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), PRIVATE_LABEL = c("N", 
"N", "Y", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N", 
"N", "N", "N", "Y", "N", "N", "N", "N", "Y", "Y", "N", "N", "N", 
"N", "N", "N"), CATEGORY_PATH1 = c("08.5-Fleet & Automotive", 
"08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking", 
"08-Hydraulics & Pneumatics", "09-Electrical", "05-Abrasives", 
"08-Hydraulics & Pneumatics", "07.3-Plumbing", "05-Abrasives", 
"01-Fasteners", "01-Fasteners", "08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking", 
"01-Fasteners", "01-Fasteners", "01-Fasteners", "08-Hydraulics & Pneumatics", 
"07-Cutting Tools & Metalworking", "08.5-Fleet & Automotive", 
"05-Abrasives", "08.5-Fleet & Automotive", "01-Fasteners", "07.3-Plumbing", 
"07.3-Plumbing", "08.5-Fleet & Automotive", "01-Fasteners", "08.5-Fleet & Automotive", 
"08.5-Fleet & Automotive", "08.5-Fleet & Automotive", "11-Chemicals & Paints"
), CATEGORY_PATH2 = c("03-Batteries & Accessories", "03-Batteries & Accessories", 
"04-Threading and Tapping", "02-Hose and Hose Reels", "05-Electrical Wire and Accessories", 
"03-Coated and Non-Woven Abrasives", "07-Valves & Accessories", 
"04-Valves & Valve Accessories", "03-Coated and Non-Woven Abrasives", 
"01-Bolts", "01-Bolts", "02-Fleet Electrical Products", "04-Threading and Tapping", 
"01-Bolts", "01-Bolts", "01-Bolts", "01-Fittings", "04-Threading and Tapping", 
"02-Fleet Electrical Products", "03-Coated and Non-Woven Abrasives", 
"02-Fleet Electrical Products", "01-Bolts", "01-Pipe Fittings", 
"01-Pipe Fittings", "02-Fleet Electrical Products", "01-Bolts", 
"02-Fleet Electrical Products", "02-Fleet Electrical Products", 
"02-Fleet Electrical Products", "11-Paints, Marking, and Accessories"
), CUST_ID = c("AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001"), CUSTOMER_TIER = c("Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3"), 
    IS_VENDING = c("No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No"), SALE_PRICE = c(10.53, 10.53, 23.0175, 
    2.0251, 0.4755, 6.4725, 15.8235, 1.468, 1.0275, 0.3425, 0.3425, 
    0.3726, 23.0175, 0.2513, 0.2513, 0.2513, 2.7412, 5.7675, 
    2.226, 1.5579, 0.1418, 0.2513, 12.0835, 12.0835, 0.1297, 
    0.625, 0.3726, 0.3726, 3.514, 4.1112), TOTAL_COST = c(6.877449, 
    6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787, 
    0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562, 
    0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503, 
    0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628, 
    0.164998, 0.125893, 0.125893, 2.183983, 1.469989), POS_COST = c(6.877449, 
    6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787, 
    0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562, 
    0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503, 
    0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628, 
    0.164998, 0.125893, 0.125893, 2.183983, 1.469989), CE100 = c(6.55, 
    6.55, 9.25, 0.95, 0.23785, 2.81, 5.93127, 1.17, 0.508, 0.074946, 
    0.074946, 0.12, 9.25, 0.058626, 0.058626, 0.058626, 1.31, 
    2.35, 0.61, 0.5, 0.05172, 0.058626, 4.2844, 4.2844, 0.05172, 
    0.12255, 0.12, 0.12, 2.08, 1.43), CE110 = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), CE120 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CE200 = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0), CORP_PRICE = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), QTY_SOLD = c(15L, 
    10L, 6L, 50L, 250L, 10L, 10L, 20L, 50L, 25L, 25L, 50L, 6L, 
    25L, 25L, 25L, 10L, 6L, 10L, 30L, 100L, 25L, 15L, 10L, 100L, 
    20L, 50L, 50L, 4L, 36L), PACKSLIP_WHSL = c(NA, NA, 30.69, 
    2.63, 0.7315, 8.63, 20.55, 1.96, 1.37, 0.685, 0.685, 0.5323, 
    30.69, 0.5025, 0.5025, 0.5025, 3.56, 7.69, 3.18, 1.82, 0.2145, 
    0.5025, 18.59, 18.59, 0.1961, 1.25, 0.5323, 0.5323, 5.02, 
    5.71), PRICING_GROUP = c("PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGG", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB"), PGG_MIN_PRICE = c(257.9043375, 
    171.936225, 184.14, 131.5, 175.56, 86.3, 154.125, 39.2, 68.5, 
    15.07, 15.07, 26.615, 184.14, 11.055, 11.055, 11.055, 35.6, 
    46.14, 31.8, 54.6, 21.45, 11.055, 253.7535, 185.9, 19.61, 
    22, 26.615, 26.615, 20.08, 205.56), PGY_MIN_PRICE = c(206.32347, 
    137.54898, 171.2502, 128.87, 170.07375, 80.259, 147.96, 36.456, 
    63.705, 13.3575, 13.3575, 24.75195, 171.2502, 9.79875, 9.79875, 
    9.79875, 33.108, 45.2172, 29.574, 50.778, 19.9485, 9.79875, 
    245.388, 163.592, 18.2373, 19.5, 24.75195, 24.75195, 18.6744, 
    191.1708), PGR_MIN_PRICE = c(171.936225, 114.62415, 162.0432, 
    124.925, 166.41625, 75.944, 141.795, 35.672, 60.28, 9.9325, 
    9.9325, 24.21965, 162.0432, 7.28625, 7.28625, 7.28625, 32.396, 
    43.833, 28.938, 48.048, 19.5195, 7.28625, 239.811, 159.874, 
    17.8451, 14.5, 24.21965, 24.21965, 18.2728, 180.8928), `Ext Sale` = c(157.95, 
    105.3, 138.105, 101.255, 118.875, 64.725, 158.235, 29.36, 
    51.375, 8.5625, 8.5625, 18.63, 138.105, 6.2825, 6.2825, 6.2825, 
    27.412, 34.605, 22.26, 46.737, 14.18, 6.2825, 181.2525, 120.835, 
    12.97, 12.5, 18.63, 18.63, 14.056, 148.0032), `Ext Total Cost` = c(103.161735, 
    68.77449, 56.397372, 46.24365, 61.282, 29.53287, 62.27787, 
    14.06988, 26.0175, 2.08635, 2.08635, 6.71425, 56.397372, 
    1.56715, 1.56715, 1.56715, 13.493, 14.317644, 7.7503, 15.74988, 
    5.1595, 1.56715, 67.1673, 44.7782, 5.1628, 3.29996, 6.29465, 
    6.29465, 8.735932, 52.919604), cust_part = c("AK001000111239190", 
    "AK001000111239190", "AK001000155692", "AK0010001400050-131280", 
    "AK00100010700535", "AK00100010800758", "AK001000169970", 
    "AK00100010445050", "AK00100010819092", "AK00100010115160", 
    "AK00100010115160", "AK00100010703316", "AK001000155692", 
    "AK00100010115157", "AK00100010115157", "AK00100010115157", 
    "AK0010001440704-131280", "AK001000155618", "AK00100010703318", 
    "AK00100010205428", "AK00100010702119", "AK00100010115157", 
    "AK0010001465278", "AK0010001465278", "AK00100010702121", 
    "AK001000115166", "AK00100010703315", "AK00100010703315", 
    "AK00100010715718", "AK00100010243945")), .Names = c("INVOICE_DT", 
"BRANCH_CODE", "INVOICE_NO", "INV_SEQ_NO", "INV_ITEM_ID", "ITEM_DESCR", 
"STD_ITEM", "PRIVATE_LABEL", "CATEGORY_PATH1", "CATEGORY_PATH2", 
"CUST_ID", "CUSTOMER_TIER", "IS_VENDING", "SALE_PRICE", "TOTAL_COST", 
"POS_COST", "CE100", "CE110", "CE120", "CE200", "CORP_PRICE", 
"QTY_SOLD", "PACKSLIP_WHSL", "PRICING_GROUP", "PGG_MIN_PRICE", 
"PGY_MIN_PRICE", "PGR_MIN_PRICE", "Ext Sale", "Ext Total Cost", 
"cust_part"), row.names = c(NA, 
30L), class = c("data.table", "data.frame")))

我也尝试过按 INV_ITEM_ID 分组运行脚本,但结果并不是每个不同的值只返回一行。

我无法按 INV_ITEM_ID 进行汇总。

【问题讨论】:

  • 下一次,请提供一个最小的示例——我们可能不需要查看您的真实脚本或数据;如果您构造一个新的可重现示例,我们可能会更好地了解问题。无论如何,错误说它不能对 INV_ITEM_ID 求和,这是有道理的。要按多个变量分组汇总,可以使用 by = .(var1, var2),例如 temptable[ , lapply(.SD, sum), by = .(INV_ITEM_ID, quarter(INVOICE_DT)), .SDcols = c("Ext Sale", "Ext Total Cost", "CE100", "CE110","CE120","QTY_SOLD","PACKSLIP_WHSL")]。不确定这是否回答了您的问题。
  • Frank,这实际上解决了我的第一个问题。但我遇到的最大问题是,当它返回 INV_ITEM_ID 值时,它实际上并没有被汇总。它给了我 5,135,153 行和 933,049 个不同的值。我要做的是将其汇总到 933,049 行,然后按季度汇总
  • 好的,如果你想在一行上每个不同的值和在列上的 Quarters + vars (?),你可以把最后一个命令的结果传递给 dcast(res, INV_ITEM_ID ~ quarter, value.var=c("Ext Sale", "Ext Total Cost", "CE100", "CE110","CE120","QTY_SOLD","PACKSLIP_WHSL")) 虽然我猜这可能非常难以阅读。更短的版本:dcast(temptable, INV_ITEM_ID ~ quarter(INVOICE_DT), fun = sum, value.var = val_cols)
  • Frank,我想我还没有完全理解这一点。我想知道为什么按 INV_ITEM_ID 汇总时,它没有把销售信息合并到每个分组中。我不知道为什么它显示多行,而不是将它们加在一起
  • 好的,您可能需要澄清您期望的输出,对应于提供的 30 行示例。一些指导:stackoverflow.com/questions/5963269/…

标签: r data.table lapply lubridate


【解决方案1】:

我建议您首先在数据中创建另一列来表示INVOICE_DT 的季度:

# Add an INVOICE_Q column holding the quarter of INVOICE_DT; `:=` modifies
# temptable in place rather than returning a copy.
# NOTE(review): quarter() presumably yields only the quarter number, so the
# same quarter of different years would share one value -- confirm that this
# is intended if the full data spans more than one year.
temptable[,INVOICE_Q := quarter(INVOICE_DT)]

接下来按 INVOICE_Q 和 INV_ITEM_ID 分组运行求和运算:

# Total every measure column within each (INV_ITEM_ID, INVOICE_Q) group.
# The result has one row per item/quarter combination: the two grouping
# columns first, followed by the summed measures.
temptable[, lapply(.SD, sum),
  by = .(INV_ITEM_ID, INVOICE_Q),
  .SDcols = c(
    "Ext Sale", "Ext Total Cost", "CE100", "CE110", "CE120",
    "QTY_SOLD", "PACKSLIP_WHSL"
  )
]

然后,如果需要,您可以删除创建的 INVOICE_Q 列:

# Remove the helper INVOICE_Q column again; assigning NULL with `:=` deletes
# the column from temptable in place.
temptable[,INVOICE_Q := NULL]

希望有帮助!

【讨论】:

  • rbindlist(list(mastertable, temptable), use.names = TRUE) 中的错误:第 2 项有 36 列,与有 30 列的第 1 项不一致。如果您需要填充缺失的列,请将参数 fill 设置为 TRUE。
  • 好的,让我重新检查一下。
  • 我为您的数据运行了相同的代码并且没有错误。您是否使用您在问题中发布的相同数据集?我认为您正在尝试将另一个数据集绑定到导致此问题的 temptable。我没有看到您的帖子中提到任何 mastertable。
  • 是的,temptable 是从另一个已创建的表中提取数据的
  • 有没有办法解决这个问题?如果有帮助,我可以向您展示我的完整代码
猜你喜欢
  • 2013-12-31
  • 1970-01-01
  • 2020-08-02
  • 1970-01-01
  • 2016-06-29
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
相关资源
最近更新 更多