// Build a pair RDD of four (Int, String) elements, explicitly split into 2 partitions.
val rdd1 = sc.makeRDD(Array((1, "A"), (2, "B"), (3, "C"), (4, "D")), 2)

// Number of partitions of rdd1.
// Shell output recorded in the original transcript: res20: Int = 2
rdd1.partitions.size

// Show which elements live in which partition of rdd1.
// For every non-empty partition this emits one pair ("part_<index>", elements);
// an empty partition emits nothing. Elements are prepended while iterating, so
// each list is in reverse encounter order.
rdd1.mapPartitionsWithIndex { (partIdx, iter) =>
  val partName = "part_" + partIdx
  // Prepending keeps the accumulation O(1) per element.
  val elems = iter.foldLeft(List.empty[(Int, String)])((acc, elem) => elem :: acc)
  if (elems.isEmpty) Iterator.empty else Iterator(partName -> elems)
}.collect

 -----------------------------------------------------------

import org.apache.spark.HashPartitioner

// Read the input as lines of text, requesting at least 3 partitions.
val three = sc.textFile("/tmp/spark/three", 3)

// Drop blank lines and key each record by its integer value so it can be sorted by key.
// NOTE(review): toInt throws NumberFormatException on a non-numeric line — confirm the input is clean.
val numbers = three.filter(_.trim.nonEmpty).map(line => (line.trim.toInt, ""))

// Move everything into a single partition, then sort ascending by key.
val sortedNums = numbers.partitionBy(new HashPartitioner(1)).sortByKey()

// Pair each number with its 1-based rank. zipWithIndex assigns indices following the
// RDD's element order, which avoids mutating a driver-side counter inside a closure.
val res = sortedNums.zipWithIndex().map { case ((num, _), i) => (i + 1, num) }.collect

// Print "<rank>\t<number>" for every element, in sorted order.
res.foreach { case (rank, num) => println(s"$rank\t$num") }

------------------------------------------------------------------

spark算子:partitionBy对数据进行分区
https://www.cnblogs.com/yy3b2007com/p/7800793.html

Hadoop经典案例Spark实现(三)——数据排序

https://blog.csdn.net/kwu_ganymede/article/details/50475788

相关文章:

  • 2022-02-02
  • 2022-12-23
  • 2021-09-18
  • 2021-08-18
  • 2021-11-22
  • 2021-04-01
猜你喜欢
  • 2021-08-12
  • 2021-08-22
  • 2022-01-22
  • 2022-01-13
  • 2022-12-23
  • 2021-10-28
  • 2021-06-20
相关资源
相似解决方案