【发布时间】:2021-06-30 14:26:37
【问题描述】:
我正在为一个大学项目试用 Couchbase(CB)。我已经加载了一个名为 tweets 的存储桶,其中包含 5000000 个文档,以及一个名为 users 的存储桶,其中包含 2000000 个文档。
每个推文文档都具有相同的结构,包含一些属性,例如“is_retweet”(“true”或“false”)、“text”(推文的文本)、“timestamp”和“user_id”(即发布该推文的用户的 id)。
每个用户都有 id 并且可能有一个关注者列表(其他用户的 id)。
我想创建一个查询来检索 20 条最受欢迎的推文。也就是说,我要查找那些“is_retweet”字段为“true”、并且发布者拥有大量关注者的推文。
我已经在关注者的 ARRAY_LENGTH、属性“is_retweet”和“user_id”字段上创建了索引:
/* Partial index on the follower count, limited to users with more than 100 followers. */
CREATE INDEX `idx_followers_length`
    ON `users` (ARRAY_LENGTH(`followers`))
    WHERE (100 < ARRAY_LENGTH(`followers`));

/* Partial index containing only tweets whose is_retweet field equals the string "true". */
CREATE INDEX `idx_retweet`
    ON `tweets` (`is_retweet`)
    WHERE (`is_retweet` = "true");

/* Index on the user_id field of the tweets bucket. */
CREATE INDEX `idx_users_on_tweets`
    ON `tweets` (`user_id`);
多亏了这些索引,部分查询的执行速度非常快。 “部分查询”是:
-
根据“followers”数组的长度,获取 20 个最受关注的用户
/* Users with more than 100 followers, largest follower count first, first 20 rows only. */
SELECT
    id,
    ARRAY_LENGTH(followers) AS followers_num
FROM users
WHERE ARRAY_LENGTH(followers) > 100
ORDER BY ARRAY_LENGTH(followers) DESC
LIMIT 20
-
选择已转发推文的文本和时间戳(月/日)
/* Month, day and text of every tweet whose is_retweet field equals "true". */
SELECT
    DATE_PART_STR(MILLIS_TO_STR(TONUMBER(timestamp)), 'month') AS month,
    DATE_PART_STR(MILLIS_TO_STR(TONUMBER(timestamp)), 'day') AS day,
    text
FROM tweets
WHERE is_retweet = "true"
问题是,当我尝试把它们联接(JOIN)起来时,查询会一直运行下去(超过 30 分钟)。下面是该查询(可能有错误)及其 EXPLAIN 结果(查询使用了上面提到的所有索引):
/*
 * Retweets written by users with more than 100 followers, ordered by the
 * author's follower count (descending), first 20 rows.
 *
 * Fix: the join predicate compares t.user_id with u.id, the field projected
 * by the subquery. The earlier predicate used META(u).id, but u is a subquery
 * alias rather than a keyspace alias, so META(u) carries no document
 * metadata and the predicate does not resolve to the user's id.
 * NOTE: the EXPLAIN output shown after this query was captured with the
 * earlier META(u).id predicate.
 */
SELECT u.id,
       u.followers_num,
       DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'month') AS month,
       DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'day') AS day,
       t.text
FROM tweets AS t
INNER JOIN (
    /* Only users whose follower list has more than 100 entries. */
    SELECT id,
           ARRAY_LENGTH(followers) AS followers_num
    FROM users
    WHERE ARRAY_LENGTH(followers) > 100
) AS u
    ON t.user_id = u.id
WHERE t.is_retweet = "true"
ORDER BY u.followers_num DESC
LIMIT 20
{
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IntersectScan",
"scans": [
{
"#operator": "IndexScan3",
"as": "t",
"index": "idx_retweet",
"index_id": "437d590a2e220ed4",
"index_projection": {
"primary_key": true
},
"keyspace": "tweets",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "\"true\"",
"inclusion": 3,
"low": "\"true\""
}
]
}
],
"using": "gsi"
},
{
"#operator": "IndexScan3",
"as": "t",
"index": "idx_users_on_tweets",
"index_id": "c93f6f0be887553",
"index_projection": {
"primary_key": true
},
"keyspace": "tweets",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"inclusion": 0,
"low": "null"
}
]
}
],
"using": "gsi"
}
]
},
{
"#operator": "Fetch",
"as": "t",
"keyspace": "tweets",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "NestedLoopJoin",
"alias": "u",
"on_clause": "((`t`.`user_id`) = (meta(`u`).`id`))",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan3",
"index": "idx_followers_length_2",
"index_id": "b5cc45b51847b40f",
"index_projection": {
"primary_key": true
},
"keyspace": "users",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"inclusion": 0,
"low": "100"
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"keyspace": "users",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "(100 < array_length((`users`.`followers`)))"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`users`.`id`)"
},
{
"as": "followers_num",
"expr": "array_length((`users`.`followers`))"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
{
"#operator": "Alias",
"as": "u"
}
]
}
},
{
"#operator": "Filter",
"condition": "((`t`.`is_retweet`) = \"true\")"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`u`.`id`)"
},
{
"expr": "(`u`.`followers_num`)"
},
{
"as": "month",
"expr": "date_part_str(millis_to_str(to_number((`t`.`timestamp`))), \"month\")"
},
{
"as": "day",
"expr": "date_part_str(millis_to_str(to_number((`t`.`timestamp`))), \"day\")"
},
{
"expr": "(`t`.`text`)"
}
]
}
]
}
}
]
},
{
"#operator": "Order",
"limit": "20",
"sort_terms": [
{
"desc": true,
"expr": "(`u`.`followers_num`)"
}
]
},
{
"#operator": "Limit",
"expr": "20"
},
{
"#operator": "FinalProject"
}
]
}
【问题讨论】: