groupByKey

  • 调用 combineByKeyWithClassTag 时 mapSideCombine = false,即没有map端combine(预聚合)操作,所有键值对都要经过shuffle,增加网络传输的数据量
  • 生成CompactBuffer对象,占用资源
  • 可重新指定分区

groupBy

  • 底层先把元素映射为 (key, 元素) 再调用 groupByKey,同样没有map端预聚合,增加网络传输的数据量
  • 生成CompactBuffer对象,并且除原始元素外还要额外保存分组key,占用更多资源

reduceByKey

  • 有 mergeValue操作,即map端有预聚合,减少网络传输的数据量
  • 聚合结果的类型必须与v的类型相同,不能改变返回值类型
  • 可重新指定分区

combineByKeyWithClassTag  //最根本方法

def combineByKeyWithClassTag[C](
      createCombiner: V => C,  //map端,改变 v 的返回值类型
      mergeValue: (C, V) => C,  //map端,预聚合
      mergeCombiners: (C, C) => C,  //reduce端,聚合
      partitioner: Partitioner,  //分区对象
      mapSideCombine: Boolean = true,  //是否开启map端聚合,默认开启
      serializer: Serializer = null)
      
def combineByKeyWithClassTag[C](
      createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C,
      numPartitions: Int)

def combineByKeyWithClassTag[C](
      createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C)

aggregateByKey

def aggregateByKey[U: ClassTag](zeroValue: U, partitioner: Partitioner)(seqOp: (U, V) => U,
      combOp: (U, U) => U)
      
def aggregateByKey[U: ClassTag](zeroValue: U, numPartitions: Int)(seqOp: (U, V) => U,
      combOp: (U, U) => U)

def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U,
      combOp: (U, U) => U)

 

相关文章: