以下是 BigQuery 标准 SQL
来自 OP 的评论:“我会把这部分当作窗口函数来使用,每个窗口将有 30-200 行……”
示例数据仅代表一个分区/窗口的示例 - 所以我添加了 id 列来扮演分区的角色,因此代码可以应用于实际用例(只需将 id 替换为实际的相应列名)
#standardSQL
-- partitionBySum: greedily distributes the input values into groups whose sums
-- stay at or below `size` (a single value larger than `size` still gets a
-- group of its own).
--
-- Parameters:
--   arr  - array of (val, pos) pairs; val is the number being summed and pos is
--          an opaque row marker that is carried through to the output unchanged.
--          The first N elements seed the N groups, so the caller controls the
--          packing by the order in which it builds the array.
--   size - upper limit for the sum of one group.
--
-- Returns: one string per group, formatted as 'pos,val;pos,val;...'.
--          An empty (or NULL) input yields an empty array.
CREATE TEMP FUNCTION partitionBySum(arr ARRAY<STRUCT<val FLOAT64, pos FLOAT64>>, size FLOAT64)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  // Nothing to distribute; without this guard arr[0] below would throw.
  if (!arr || arr.length === 0) {
    return [];
  }

  // Initial number of groups: total of the values (each capped at size)
  // divided by size, rounded towards zero.
  const cappedTotal = arr.reduce(
    (acc, item) => acc + (item.val > size ? size : item.val), 0);
  let count = Math.trunc(cappedTotal / size);

  // Keep count inside [1, arr.length]. This also covers NaN / Infinity /
  // negative results produced by a zero or negative size.
  if (!(count >= 1)) {
    count = 1;
  }
  if (count > arr.length) {
    count = arr.length;
  }

  let output = [];
  let repeat = true;
  while (repeat) {
    repeat = false;
    output = [];
    const sum = [];

    // Seed every group with one of the first count elements.
    for (let i = 0; i < count; i++) {
      output.push(arr[i].pos + ',' + arr[i].val);
      sum.push(arr[i].val);
    }

    // Put each remaining element into the group with the smallest sum
    // (the lowest index wins on ties).
    for (let i = count; i < arr.length; i++) {
      let minIndex = 0;
      for (let j = 1; j < count; j++) {
        if (sum[j] < sum[minIndex]) {
          minIndex = j;
        }
      }
      output[minIndex] += ';' + arr[i].pos + ',' + arr[i].val;
      sum[minIndex] += arr[i].val;

      // The group now holds at least two elements, so exceeding the limit
      // means the current number of groups is too small: add one group and
      // start over. Once count equals arr.length this loop body is never
      // entered, which guarantees termination.
      if (sum[minIndex] > size) {
        count++;
        repeat = true;
        break;
      }
    }
  }
  return output;
""";
--
WITH `project.dataset.table` AS (
  -- Sample data for a single partition; id plays the role of the partition key.
  SELECT 1 id, 6854 value1, 10 value2, 83 value3 UNION ALL
  SELECT 1, 6723, 9, 82 UNION ALL
  SELECT 1, 2234, 203, 49 UNION ALL
  SELECT 1, 456, 1888, 48 UNION ALL
  SELECT 1, 434, 679, 33 UNION ALL
  SELECT 1, 789, 234, 32 UNION ALL
  SELECT 1, 678, 11, 26 UNION ALL
  SELECT 1, 345, 33, 19 UNION ALL
  SELECT 1, 22, 345, 19 UNION ALL
  SELECT 1, 232, 45, 17 UNION ALL
  SELECT 1, 234, 4, 15 UNION ALL
  SELECT 1, 45, 123, 13 UNION ALL
  SELECT 1, 4, 123, 11 UNION ALL
  SELECT 1, 123, 2, 11 UNION ALL
  SELECT 1, 23, 76, 10 UNION ALL
  SELECT 1, 34, 23, 8 UNION ALL
  SELECT 1, 12, 45, 8 UNION ALL
  SELECT 1, 23, 30, 7 UNION ALL
  SELECT 1, 23, 2, 5 UNION ALL
  SELECT 1, 12, 4, 4
),
data_with_positions AS (
  -- Adds a position number to distinguish equal value3 values in different rows
  -- (for example the two 19s and the two 11s in the sample data).
  -- The ORDER BY is required: this CTE is referenced twice below and BigQuery
  -- may evaluate it once per reference, so pos has to be reproducible for the
  -- join on (id, pos, value3) to match. Rows that tie on every ordering column
  -- are identical, so either numbering gives the same final result.
  SELECT
    id,
    value1,
    value2,
    value3,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY value3 DESC, value1, value2) AS pos
  FROM `project.dataset.table`
),
packed_groups AS (
  -- One row per id holding the groups produced by partitionBySum; each array
  -- element is a 'pos,value3;pos,value3;...' string. 60 is the sum limit.
  SELECT
    id,
    partitionBySum(
      ARRAY_AGG(
        STRUCT(CAST(value3 AS FLOAT64) AS val, CAST(pos AS FLOAT64) AS pos)
        ORDER BY value3 DESC, pos
      ),
      60
    ) AS arr
  FROM data_with_positions
  GROUP BY id
),
grouped_by_value3 AS (
  -- Unpacks the group strings into (id, pos, value3, group_id) rows.
  -- group_id comes from the array offset, so numbering follows the order in
  -- which the UDF returned the groups instead of an unordered ROW_NUMBER().
  SELECT
    packed.id,
    CAST(SPLIT(line)[OFFSET(0)] AS INT64) AS pos,
    CAST(SPLIT(line)[OFFSET(1)] AS INT64) AS value3,
    grp_offset + 1 AS group_id
  FROM packed_groups AS packed
  CROSS JOIN UNNEST(packed.arr) AS grp WITH OFFSET AS grp_offset
  CROSS JOIN UNNEST(SPLIT(grp, ';')) AS line
),
all_values_with_groups AS (
  -- Joins the grouping info back to the data.
  SELECT id, pos, value1, value2, value3, group_id
  FROM data_with_positions
  INNER JOIN grouped_by_value3 USING (id, pos, value3)
)
-- Final aggregation per (id, group_id). pos breaks ties on value3 so the three
-- list columns stay aligned with each other.
SELECT
  id,
  group_id,
  STRING_AGG(CAST(value1 AS STRING) ORDER BY value3 DESC, pos) AS list_values1,
  STRING_AGG(CAST(value2 AS STRING) ORDER BY value3 DESC, pos) AS list_values2,
  STRING_AGG(CAST(value3 AS STRING) ORDER BY value3 DESC, pos) AS list_values3,
  SUM(value1) AS sum_values1,
  SUM(value2) AS sum_values2,
  SUM(value3) AS sum_values3
FROM all_values_with_groups
GROUP BY id, group_id
ORDER BY id, group_id
结果
简要说明
实际上,上述解决方案由几个不同的逻辑部分组成
第 1 部分:[最复杂的部分] 按值的总和分组 - 使其小于或等于某个值(本例中为 60)
主要逻辑在JS UDF中实现,逻辑如下(只是主要步骤):
1. 计算分区/组的初始数量 N
2. 用前 N 行的 value3(共 N 个元素)初始化各组
3. 循环遍历其余的 value3,每次迭代都把当前值加入总和最小的那一组,直到某组的总和超过限制(组中只有一个 value3 的情况除外)
4. 如果在上述第 3 步中超过了总和限制 - 将组数递增为 N + 1,并用新的组数重复上述第 2、3 步
5. 在执行第 1-4 步的过程中,每个 value3 对应的位置(pos)都会随之一起保留,以便在第 2 部分中连接回初始数据
结果在grouped_by_value3 CTE 中捕获,如下所示
第 2 部分:将分组信息(来自第 1 部分)加入主数据(注意添加到主数据的位置,因此此处使用 data_with_positions CTE)
结果在all_values_with_groups CTE 中捕获,如下所示
第 3 部分:最终聚合 - 结果已显示在答案顶部:o)
测试
作为轻量级测试 - 我只用了几组虚拟数据来运行这个解决方案 - 下面就是其中之一。
WITH `project.dataset.table` AS (
  -- Lightweight test fixture covering four ids; the first struct names the
  -- columns and the remaining tuples follow the same column order.
  SELECT id, value1, value2, value3
  FROM UNNEST([
    STRUCT(1 AS id, 6854 AS value1, 10 AS value2, 83 AS value3),
    (1, 6723, 9, 82),
    (1, 2234, 203, 49),
    (1, 456, 1888, 48),
    (1, 434, 679, 33),
    (1, 789, 234, 32),
    (1, 678, 11, 26),
    (1, 345, 33, 19),
    (1, 22, 345, 19),
    (1, 232, 45, 17),
    (1, 234, 4, 15),
    (1, 45, 123, 13),
    (1, 4, 123, 11),
    (1, 123, 2, 11),
    (1, 23, 76, 10),
    (1, 34, 23, 8),
    (1, 12, 45, 8),
    (1, 23, 30, 7),
    (1, 23, 2, 5),
    (1, 12, 4, 4),
    (2, 2, 50, 45),
    (2, 2, 50, 45),
    (2, 3, 60, 44),
    (2, 3, 60, 44),
    (2, 3, 60, 44),
    (2, 3, 60, 44),
    (2, 3, 60, 44),
    (3, 2, 50, 5),
    (3, 2, 50, 5),
    (3, 3, 60, 5),
    (3, 3, 60, 85),
    (3, 3, 60, 45),
    (4, 2, 50, 25),
    (4, 2, 50, 25),
    (4, 3, 60, 24),
    (4, 3, 60, 24),
    (4, 3, 60, 24),
    (4, 3, 60, 24),
    (4, 3, 60, 24)
  ])
)
下面是输出