在这里尝试使用window:
- 构建数据框
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import mean, pandas_udf, PandasUDFType
from pyspark.sql.types import *
df = spark.createDataFrame(pd.DataFrame([['A','B','A','C','A'],[1,1,2,1,3],['Eggs','Salad','Peaches','Bread','Water']],index=['User','Order','Food']).T)
df.show()
+----+-----+-------+
|User|Order| Food|
+----+-----+-------+
| A| 1| Eggs|
| B| 1| Salad|
| A| 2|Peaches|
| C| 1| Bread|
| A| 3| Water|
+----+-----+-------+
- 创建窗口并应用
udf 来加入字符串:
w = Window.partitionBy('User').orderBy('Order').rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
@pandas_udf(StringType(), PandasUDFType.GROUPED_AGG)
def _udf(v):
return ' $ '.join(v)
df = df.withColumn('Food List', _udf(df['Food']).over(w)).dropDuplicates(['User', 'Food List']).drop(*['Order', 'Food'])
df.show(truncate=False)
+----+----------------------+
|User|Food List |
+----+----------------------+
|B |Salad |
|C |Bread |
|A |Eggs $ Peaches $ Water|
+----+----------------------+