Spark Streaming整合Spark SQL操作
代码如下:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
def getSparkSessionInstance(sparkConf):
    """Return a process-wide singleton SparkSession built from *sparkConf*.

    Stored in ``globals()`` so that every ``foreachRDD`` invocation on the
    driver reuses one SparkSession instead of creating a new one per batch.
    (Fixes from the pasted original: curly quotes replaced with ASCII quotes,
    and the builder chain wrapped in parentheses — the bare continuation
    lines were a SyntaxError.)
    """
    if 'sparkSessionSingletonInstance' not in globals():
        globals()['sparkSessionSingletonInstance'] = (
            SparkSession
            .builder
            .config(conf=sparkConf)
            .getOrCreate()
        )
    return globals()['sparkSessionSingletonInstance']
# Build the DStream pipeline: 5-second batches from a socket source.
# NOTE(review): `sc` is the SparkContext the pyspark shell pre-creates —
# this listing is meant to be pasted into an interactive session.
ssc = StreamingContext(sc, 5)
# ASCII quotes restored here — the pasted original used curly quotes,
# which are a SyntaxError.
lines = ssc.socketTextStream("localhost", 6789)
words = lines.flatMap(lambda line: line.split(" "))
# Convert each batch RDD of the DStream into a DataFrame and query it with Spark SQL
def process(time, rdd):
    """Per-batch callback: register the batch's words as a temp view and
    print the word counts computed via Spark SQL.

    Parameters
    ----------
    time : batch timestamp supplied by Spark Streaming.
    rdd : the batch's RDD of words (one string element per word).
    """
    print("========= %s =========" % str(time))
    try:
        # One shared SparkSession across batches (singleton helper above).
        spark = getSparkSessionInstance(rdd.context.getConf())
        rowRdd = rdd.map(lambda w: Row(word=w))
        wordsDataFrame = spark.createDataFrame(rowRdd)
        wordsDataFrame.createOrReplaceTempView("words")
        wordCountsDataFrame = spark.sql(
            "select word, count(*) as total from words group by word")
        wordCountsDataFrame.show()
    except Exception as e:
        # Empty batches make createDataFrame raise; skip them but log the
        # reason instead of the original bare `except: pass`, which silently
        # swallowed every error (including real bugs).
        print("process() skipped batch %s: %s" % (time, e))
# Apply `process` to every batch RDD, then start the streaming job.
words.foreachRDD(process)
ssc.start()
# Block the driver until the streaming context is stopped (e.g. Ctrl-C).
ssc.awaitTermination()
启动pyspark
启动nc,将代码输入到命令行
结果如下