在 PySpark 中可以试试下面的做法：一种方法是使用窗口函数。
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Get (or create) the Spark session for this example and keep a handle on
# its SparkContext, which is used below to build the sample data.
spark = (
    SparkSession.builder
    .appName('SO')
    .getOrCreate()
)
sc = spark.sparkContext

# Sample input: one row per province, with two key columns followed by
# three integer columns whose names look like dates.
header = ["province_state", "country_region", "2/1/2020", "2/10/2020", "2/11/2020"]
records = [
    ("new south wales", "aus", 4, 4, 4),
    ("victoria", "aus", 4, 4, 4),
    ("queensland", "aus", 3, 5, 5),
    ("south australia", "aus", 1, 2, 2),
]
df = sc.parallelize(records).toDF(header)
df.show()
#
# +---------------+--------------+--------+---------+---------+
# | province_state|country_region|2/1/2020|2/10/2020|2/11/2020|
# +---------------+--------------+--------+---------+---------+
# |new south wales|           aus|       4|        4|        4|
# |       victoria|           aus|       4|        4|        4|
# |     queensland|           aus|       3|        5|        5|
# |south australia|           aus|       1|        2|        2|
# +---------------+--------------+--------+---------+---------+
# Columns that identify a row; every other column is treated as a value
# column and replaced by its per-country total.
key_columns = ['country_region', 'province_state']

# w  : all rows sharing a country_region, used for the per-country sums.
# w1 : same partitioning plus an ordering, which row_number() requires.
w = Window.partitionBy('country_region')
w1 = Window.partitionBy('country_region').orderBy('country_region')

# Compute every per-country total in a single projection rather than one
# withColumn() call per column, which grows the query plan with each call.
df = df.select(*[
    F.col(column) if column in key_columns else F.sum(column).over(w).alias(column)
    for column in df.columns
])

# After the step above every row of a country carries the same totals, so
# keeping the row numbered 1 in each partition leaves one summary row per
# country; which original row that is does not affect the totals.
df1 = df.withColumn("r_no", F.row_number().over(w1)).where(F.col('r_no') == 1)

# Display the summary: province_state is replaced by the placeholder '_'
# and the helper column r_no is left out.
df1.select(
    F.lit('_').alias('province_state'),
    *[column for column in df1.columns if column not in ('province_state', 'r_no')]
).show()
# +--------------+--------------+--------+---------+---------+
# |province_state|country_region|2/1/2020|2/10/2020|2/11/2020|
# +--------------+--------------+--------+---------+---------+
# |             _|           aus|      12|       15|       15|
# +--------------+--------------+--------+---------+---------+