无论有多少列或混合的数据类型如何,代码都可以正常工作。
注意: OP 在她的 cmets 中建议,对于字符串列,在分组时取第一个 non-Null 值。
# Import relevant functions
from pyspark.sql.functions import max, first, col
# Take an example DataFrame
values = [('Alice',10,5,None,50),('Bob',15,15,'Simon',10),('Jack',5,1,'Timo',3)]
df = sqlContext.createDataFrame(values,['col1','col2','col3','col4','col5'])
df.show()
+-----+----+----+-----+----+
| col1|col2|col3| col4|col5|
+-----+----+----+-----+----+
|Alice| 10| 5| null| 50|
| Bob| 15| 15|Simon| 10|
| Jack| 5| 1| Timo| 3|
+-----+----+----+-----+----+
# Lists all columns in the DataFrame
seq_of_columns = df.columns
print(seq_of_columns)
['col1', 'col2', 'col3', 'col4', 'col5']
# Using List comprehensions to create a list of columns of String DataType
string_columns = [i[0] for i in df.dtypes if i[1]=='string']
print(string_columns)
['col1', 'col4']
# Using Set function to get non-string columns by subtracting one list from another.
non_string_columns = list(set(seq_of_columns) - set(string_columns))
print(non_string_columns)
['col2', 'col3', 'col5']
了解first() 和ignorenulls here
# Aggregating both string and non-string columns
df = df.select(*[max(col(c)).alias(c) for c in non_string_columns],*[first(col(c),ignorenulls = True).alias(c) for c in string_columns])
df = df[[seq_of_columns]]
df.show()
+-----+----+----+-----+----+
| col1|col2|col3| col4|col5|
+-----+----+----+-----+----+
|Alice| 15| 15|Simon| 50|
+-----+----+----+-----+----+