您需要记住聚合函数会减少行数,因此您需要使用减少函数指定您想要哪些行年龄。如果您想保留组的所有行(警告!这可能导致爆炸或分区倾斜),您可以将它们收集为一个列表。然后,您可以使用 UDF(用户定义的函数)按照您的标准减少它们,在这个例子中是 funniness_of_requisite。然后使用另一个 UDF 从单个缩减行扩展属于缩减行的列。
出于此答案的目的,我假设您希望保留具有最大 funniness_of_requisite 的人的年龄。
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType}
import scala.collection.mutable
object TestJob4 {
def main (args: Array[String]): Unit = {
val sparkSession = SparkSession
.builder()
.appName(this.getClass.getName.replace("$", ""))
.master("local")
.getOrCreate()
val sc = sparkSession.sparkContext
import sparkSession.sqlContext.implicits._
val rawDf = Seq(
(1, "Moe", "Slap", 7.9, 118),
(2, "Larry", "Spank", 8.0, 115),
(3, "Curly", "Twist", 6.0, 113),
(4, "Laurel", "Whimper", 7.53, 119),
(5, "Hardy", "Laugh", 6.0, 18),
(6, "Charley", "Ignore", 9.7, 115),
(2, "Moe", "Spank", 6.8, 118),
(3, "Larry", "Twist", 6.0, 115),
(3, "Charley", "fall", 9.0, 115)
).toDF("id", "name", "requisite", "funniness_of_requisite", "age")
rawDf.show(false)
rawDf.printSchema
val rawSchema = rawDf.schema
val fUdf = udf(reduceByFunniness, rawSchema)
val nameUdf = udf(extractAge, IntegerType)
val aggDf = rawDf
.groupBy("name")
.agg(
count(struct("*")).as("count"),
max(col("funniness_of_requisite")),
collect_list(struct("*")).as("horizontal")
)
.withColumn("short", fUdf($"horizontal"))
.withColumn("age", nameUdf($"short"))
.drop("horizontal")
aggDf.printSchema
aggDf.show(false)
}
def reduceByFunniness= (x: Any) => {
val d = x.asInstanceOf[mutable.WrappedArray[GenericRowWithSchema]]
val red = d.reduce((r1, r2) => {
val funniness1 = r1.getAs[Double]("funniness_of_requisite")
val funniness2 = r2.getAs[Double]("funniness_of_requisite")
val r3 = funniness1 match {
case a if a >= funniness2 =>
r1
case _ =>
r2
}
r3
})
red
}
def extractAge = (x: Any) => {
val d = x.asInstanceOf[GenericRowWithSchema]
d.getAs[Int]("age")
}
}
d.getAs[String]("name")
}
}
这是输出
+-------+-----+---------------------------+-------------------------------+---+
|name |count|max(funniness_of_requisite)|short
|age|
+-------+-----+---------------------------+-------------------------------+---+
|Hardy |1 |6.0 |[5, Hardy, Laugh, 6.0, 18]
|18 |
|Moe |2 |7.9 |[1, Moe, Slap, 7.9, 118]
|118|
|Curly |1 |6.0 |[3, Curly, Twist, 6.0, 113]
|113|
|Larry |2 |8.0 |[2, Larry, Spank, 8.0, 115]
|115|
|Laurel |1 |7.53 |[4, Laurel, Whimper, 7.53, 119]|119|
|Charley|2 |9.7 |[6, Charley, Ignore, 9.7, 115] |115|
+-------+-----+---------------------------+-------------------------------+---+