如果您只想保留 whoswho 判定为 True 的结果,
可以先对两个数据框做交叉连接(cross join),再用 who.match 创建一个 UDF,最后只筛选出结果为 True 的行。
数据准备
# Sample data: two single-column DataFrames of names to compare.
df1 = sql.createDataFrame([
    ["Luc Krier"],
    ["Jeanny Thorn"],
    ["Teddy E Beecher"],
    ["Philippe Schauss"],
    ["Meindert I Tholen"],
    ["John I Muller"],
]).toDF("name")

df2 = sql.createDataFrame([
    ["J. Thorn"],
    ["Ben Weller"],
    ["L. Krier"],
    ["J.M. Thorn"],
    ["Liam Muller"],
    ["Meindert Tholen"],
]).toDF("name")

# Pair every name in df1 with every name in df2. crossJoin states the intent
# directly, so neither side needs a constant helper key column. The
# right-hand column is renamed before the join so the result has two
# distinctly named columns: `name` and `name_proxy`.
combined_df = df1.crossJoin(df2.withColumnRenamed("name", "name_proxy"))

# 6 x 6 = 36 candidate pairs in total; show() prints only the first 20 rows.
combined_df.show()
+----------------+---------------+
| name| name_proxy|
+----------------+---------------+
| Luc Krier| J. Thorn|
| Luc Krier| Ben Weller|
| Luc Krier| L. Krier|
| Jeanny Thorn| J. Thorn|
| Jeanny Thorn| Ben Weller|
| Jeanny Thorn| L. Krier|
| Teddy E Beecher| J. Thorn|
| Teddy E Beecher| Ben Weller|
| Teddy E Beecher| L. Krier|
| Luc Krier| J.M. Thorn|
| Luc Krier| Liam Muller|
| Luc Krier|Meindert Tholen|
| Jeanny Thorn| J.M. Thorn|
| Jeanny Thorn| Liam Muller|
| Jeanny Thorn|Meindert Tholen|
| Teddy E Beecher| J.M. Thorn|
| Teddy E Beecher| Liam Muller|
| Teddy E Beecher|Meindert Tholen|
|Philippe Schauss| J. Thorn|
|Philippe Schauss| Ben Weller|
+----------------+---------------+
Pandas UDF
from whoswho import who
from functools import partial,reduce
# Result schema: the two name columns plus the boolean `match` column.
# Every field is declared nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('name', StringType()),
        ('name_proxy', StringType()),
        ('match', BooleanType()),
    )
])
def whos_who_match(inp_df, match_columns):
    """Flag each row of a pandas DataFrame with the result of ``who.match``.

    Args:
        inp_df: pandas DataFrame holding the two columns to compare.
        match_columns: Column labels; the first two entries name the columns
            whose values are passed to ``who.match``, in that order.

    Returns:
        A copy of ``inp_df`` with a ``match`` column holding the value
        returned by ``who.match`` for each row. The input frame itself is
        left unmodified.
    """
    first_col, second_col = match_columns[0], match_columns[1]
    # Walk the two columns in lockstep instead of indexing each row Series
    # with x[0] / x[1]: the row Series is labelled by column name, and using
    # integer keys as positions on such a Series is deprecated in pandas.
    flags = [
        who.match(first, second)
        for first, second in zip(inp_df[first_col], inp_df[second_col])
    ]
    return inp_df.assign(match=flags)
# Pre-bind the pair of columns to compare, so the function handed to
# applyInPandas only needs the per-group pandas DataFrame.
partial_func = partial(
    whos_who_match,
    match_columns=['name', 'name_proxy'],
)

# Apply the matcher to each `name` group; the output follows `schema`.
combined_df = (
    combined_df
    .groupby('name')
    .applyInPandas(partial_func, schema)
)

# Print only the rows whose `match` flag is true.
combined_df.filter(F.col('match')).show()
+-----------------+---------------+-----+
| name| name_proxy|match|
+-----------------+---------------+-----+
|Meindert I Tholen|Meindert Tholen| true|
| Jeanny Thorn| J. Thorn| true|
| Luc Krier| L. Krier| true|
+-----------------+---------------+-----+
UDF
from whoswho import who
@F.udf(BooleanType())
def whos_who_match(x, y):
    """Row-wise Spark UDF wrapping ``who.match`` for a pair of values.

    Args:
        x: First value of the pair.
        y: Second value of the pair.

    Returns:
        The value of ``who.match(x, y)``, or ``None`` (SQL NULL) when either
        input is ``None``.
    """
    # A NULL column value reaches a Python UDF as None; propagate the NULL
    # instead of passing a missing value on to who.match.
    if x is None or y is None:
        return None
    return who.match(x, y)
# Add the boolean `match` column by applying the UDF to each name pair.
combined_df = combined_df.withColumn(
    'match',
    whos_who_match(F.col('name'), F.col('name_proxy')),
)
# show() prints the first 20 rows by default; the count is spelled out here.
combined_df.show(n=20)
+----------------+---------------+-----+
| name| name_proxy|match|
+----------------+---------------+-----+
| Luc Krier| J. Thorn|false|
| Luc Krier| Ben Weller|false|
| Luc Krier| L. Krier| true|
| Jeanny Thorn| J. Thorn| true|
| Jeanny Thorn| Ben Weller|false|
| Jeanny Thorn| L. Krier|false|
| Teddy E Beecher| J. Thorn|false|
| Teddy E Beecher| Ben Weller|false|
| Teddy E Beecher| L. Krier|false|
| Luc Krier| J.M. Thorn|false|
| Luc Krier| Liam Muller|false|
| Luc Krier|Meindert Tholen|false|
| Jeanny Thorn| J.M. Thorn|false|
| Jeanny Thorn| Liam Muller|false|
| Jeanny Thorn|Meindert Tholen|false|
| Teddy E Beecher| J.M. Thorn|false|
| Teddy E Beecher| Liam Muller|false|
| Teddy E Beecher|Meindert Tholen|false|
|Philippe Schauss| J. Thorn|false|
|Philippe Schauss| Ben Weller|false|
+----------------+---------------+-----+
过滤器
# Print only the rows whose `match` flag is true.
combined_df.filter(F.col('match')).show()
+-----------------+---------------+-----+
| name| name_proxy|match|
+-----------------+---------------+-----+
| Luc Krier| L. Krier| true|
| Jeanny Thorn| J. Thorn| true|
|Meindert I Tholen|Meindert Tholen| true|
+-----------------+---------------+-----+