一位同事曾经告诉我 regexp_extract 比解析 JSON 更快,我一直对此深信不疑……直到今天我决定做一些计时实验,把它与这里发布的另外两种解决方案(使用 get_json_object 和 from_json)进行比较。
简而言之:即使我们通过添加数千个额外的键值对(K:V)让 JSON 变得更复杂,这三种方法的速度仍然相差无几。在这些测试中,regexp_extract 方法实际上始终稍慢一些。
设置:证明每种方法都有效
import json
import time

import pandas as pd
import pyspark.sql.functions as fun
import pyspark.sql.types as t
# Five sample JSON payloads; the fields of interest ("schemeCode" and "CCZ")
# are nested under the "accountRequest" key.
data = [
    '{"dummyAcc":"12346","accountRequest":{"schemeCode":"ZEROQ1", "CCZ":"SGD"}}',
    '{"dummyAcc":"12347","accountRequest":{"schemeCode":"ZEROQ2", "CCZ":"SGD"}}',
    '{"dummyAcc":"12348","accountRequest":{"schemeCode":"ZEROQ5", "CCZ":"SGD"}}',
    '{"dummyAcc":"12349","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}',
    '{"dummyAcc":"12350","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}',
]
# One case id per payload, numbered from 1 (i.e. range(1, 6)).
case_ids = range(1, len(data) + 1)
# Build the Spark DataFrame through pandas: one row per (caseid, raw JSON string).
# `spark` is the active SparkSession supplied by the notebook/shell environment.
df = spark.createDataFrame(
    pd.DataFrame(dict(caseid=case_ids, object_value=data))
)
##
# fun.from_json
##
# Schema handed to from_json: a top-level "dummyAcc" string plus an
# "accountRequest" struct holding the "schemeCode" and "CCZ" strings.
schm = (
    t.StructType()
    .add("dummyAcc", t.StringType())
    .add(
        "accountRequest",
        t.StructType()
        .add("schemeCode", t.StringType())
        .add("CCZ", t.StringType()),
    )
)
def run_from_json(df):
    """Extract ``schemeCode`` and ``CCZ`` by parsing the JSON with ``from_json``.

    The ``object_value`` string column is replaced by a struct parsed against
    the module-level ``schm`` schema, then the nested fields are selected.

    Args:
        df: DataFrame with ``caseid`` and ``object_value`` (JSON text) columns.

    Returns:
        The result of selecting ``caseid`` and the two nested
        ``accountRequest`` fields from the parsed struct.
    """
    parsed_struct = fun.from_json(
        "object_value", schm, options={"allowSingleQuotes": "true"}
    )
    parsed_df = df.withColumn("object_value", parsed_struct)
    return parsed_df.select(
        "caseid",
        "object_value.accountRequest.schemeCode",
        "object_value.accountRequest.CCZ",
    )
##
# get_json
##
def run_get_json(df):
    """Extract ``schemeCode`` and ``CCZ`` with ``get_json_object`` path lookups.

    Args:
        df: DataFrame with ``caseid`` and ``object_value`` (JSON text) columns.

    Returns:
        The result of selecting ``caseid`` plus one aliased column per
        JSON path (``schemeCode``, ``CCZ``).
    """
    # Output column name -> JSON path evaluated against ``object_value``.
    json_paths = {
        'schemeCode': '$.accountRequest.schemeCode',
        'CCZ': '$.accountRequest.CCZ',
    }
    extracted = [
        fun.get_json_object('object_value', path).alias(name)
        for name, path in json_paths.items()
    ]
    return df.select('caseid', *extracted)
##
# regexp_extract
##
def run_regexp_extract(df):
    """Extract ``schemeCode`` and ``CCZ`` from the raw JSON text with regexes.

    No JSON parsing is performed: each pattern looks for the quoted key
    followed by ``:"`` and captures, as group 3, the run of word characters
    (``\\w+``) that follows. Values containing non-word characters would
    therefore be cut short at the first such character.

    Args:
        df: DataFrame with ``caseid`` and ``object_value`` (JSON text) columns.

    Returns:
        The result of selecting ``caseid``, ``schemeCode`` and ``CCZ`` after
        adding the two extracted columns.
    """
    raw_json = fun.col("object_value")
    # Raw string literals: ``\w`` must reach the regex engine verbatim. In a
    # normal string literal it is an invalid Python escape sequence, which
    # newer Python versions warn about. The pattern text itself is unchanged.
    scheme_code = fun.regexp_extract(raw_json, r'(.)("schemeCode":")(\w+)', 3)
    ccz = fun.regexp_extract(raw_json, r'(.)("CCZ":")(\w+)', 3)
    return (
        df.withColumn("schemeCode", scheme_code)
        .withColumn("CCZ", ccz)
        .select("caseid", "schemeCode", "CCZ")
    )
##
# Test them out
##
# Print each method's label followed by its result table so the three
# outputs can be compared side by side.
for label, method in (
    ("from_json", run_from_json),
    ("get_json", run_get_json),
    ("regexp_extract", run_regexp_extract),
):
    print(label)
    method(df).show(truncate=False)
from_json
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
get_json
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
regexp_extract
+------+----------+---+
|caseid|schemeCode|CCZ|
+------+----------+---+
|1 |ZEROQ1 |SGD|
|2 |ZEROQ2 |SGD|
|3 |ZEROQ5 |SGD|
|4 |ZEROQ |SGD|
|5 |ZEROQ |SGD|
+------+----------+---+
计时第 1 部分 -- 使用短 JSON
我使用上面定义的默认紧凑 JSON,测量了每种方法运行多次迭代所花费的挂钟时间(wall-clock time)。
def time_run_method(df, n_it, meth, meth_name):
    """Time ``n_it`` runs of ``meth(df).count()`` and print the total.

    Two lines are printed: the method label, then the elapsed wall-clock
    seconds for all iterations combined.

    Args:
        df: Input passed unchanged to ``meth`` on every iteration.
        n_it: Number of iterations to run.
        meth: Callable taking ``df`` and returning an object with ``count()``.
        meth_name: Label printed before the timing line.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    for _ in range(n_it):
        # The count() call is the work being timed (presumably a Spark action
        # that forces evaluation, when meth returns a DataFrame).
        meth(df).count()
    elapsed = time.perf_counter() - start
    # Print the label passed in as a parameter. The previous code printed a
    # global named ``n``, which only worked when the caller's loop variable
    # happened to have that name.
    print(meth_name)
    print(f"Time to count {n_it:d} iterations: {elapsed:,} [sec]")
# Time every extraction method against the short-JSON DataFrame, 200 runs each.
for m, n in (
    (run_from_json, "from_json"),
    (run_get_json, "get_json"),
    (run_regexp_extract, "regexp_extract"),
):
    time_run_method(df, 200, m, n)
from_json
Time to count 200 iterations: 15.918861389160156 [sec]
get_json
Time to count 200 iterations: 15.668830871582031 [sec]
regexp_extract
Time to count 200 iterations: 17.539576292037964 [sec]
计时第 2 部分 -- 使用长 JSON
我向 JSON 添加了 2000 个键值对,以查看反序列化它们的额外开销是否会改变结果。结果并没有。也许是因为这个结构太简单,内部解析器可以直接跳过多余的键;也可能是因为结构很扁平,这些键不会带来太多开销。我不确定。
# 2000 throwaway key/value pairs ("0": 0 ... "1999": 1999), serialized once
# and spliced into every payload to inflate the JSON.
cruft = json.dumps(dict(enumerate(range(2000))))
# Same five payloads as the short-JSON case, each prefixed with the "cruft" object.
data = [
    template % cruft
    for template in (
        '{ "cruft": %s, "dummyAcc":"12346","accountRequest":{"schemeCode":"ZEROQ1", "CCZ":"SGD"}}',
        '{ "cruft": %s, "dummyAcc":"12347","accountRequest":{"schemeCode":"ZEROQ2", "CCZ":"SGD"}}',
        '{ "cruft": %s, "dummyAcc":"12348","accountRequest":{"schemeCode":"ZEROQ5", "CCZ":"SGD"}}',
        '{ "cruft": %s, "dummyAcc":"12349","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}',
        '{ "cruft": %s, "dummyAcc":"12350","accountRequest":{"schemeCode":"ZEROQ", "CCZ":"SGD"}}',
    )
]
# Spark DataFrame over the inflated payloads, built the same way as the short one.
df2 = spark.createDataFrame(
    pd.DataFrame(dict(caseid=case_ids, object_value=data))
)
# Repeat the timing run, 200 iterations per method, against the long-JSON data.
for m, n in (
    (run_from_json, "from_json"),
    (run_get_json, "get_json"),
    (run_regexp_extract, "regexp_extract"),
):
    time_run_method(df2, 200, m, n)
from_json
Time to count 200 iterations: 16.005220413208008 [sec]
get_json
Time to count 200 iterations: 15.788024187088013 [sec]
regexp_extract
Time to count 200 iterations: 16.81353187561035 [sec]