我给出了从文本文件创建 DataFrame 的不同方法
val conf = new SparkConf().setAppName(appName).setMaster("local")
val sc = SparkContext(conf)
原始文本文件
val file = sc.textFile("C:\\vikas\\spark\\Interview\\text.txt")
val fileToDf = file.map(_.split(",")).map{case Array(a,b,c) =>
(a,b.toInt,c)}.toDF("name","age","city")
fileToDf.foreach(println(_))
没有架构的 Spark 会话
import org.apache.spark.sql.SparkSession
val sparkSess =
SparkSession.builder().appName("SparkSessionZipsExample")
.config(conf).getOrCreate()
val df = sparkSess.read.option("header",
"false").csv("C:\\vikas\\spark\\Interview\\text.txt")
df.show()
使用架构激发会话
import org.apache.spark.sql.types._
val schemaString = "name age city"
val fields = schemaString.split(" ").map(fieldName => StructField(fieldName,
StringType, nullable=true))
val schema = StructType(fields)
val dfWithSchema = sparkSess.read.option("header",
"false").schema(schema).csv("C:\\vikas\\spark\\Interview\\text.txt")
dfWithSchema.show()
使用 sql 上下文
import org.apache.spark.sql.SQLContext
val fileRdd =
sc.textFile("C:\\vikas\\spark\\Interview\\text.txt").map(_.split(",")).map{x
=> org.apache.spark.sql.Row(x:_*)}
val sqlDf = sqlCtx.createDataFrame(fileRdd,schema)
sqlDf.show()