使用 org.json 将 XML 转换为 JSON。
示例 XML 数据
// Sample payload: a <visitors> root element holding three self-closing <visitor>
// elements, each carrying `id`, `age` and `sex` attributes.
val xmlData = """<?xml version="1.0" encoding="utf-8"?> <visitors> <visitor id="9615" age="68" sex="F" /> <visitor id="1882" age="34" sex="M" /> <visitor id="5987" age="23" sex="M" /> </visitors>"""
UDF 函数
/**
 * Spark UDF that converts an XML document, given as a string, into its JSON
 * string form via org.json's `XML.toJSONObject`.
 *
 * A null input produces a null output, so rows with a missing XML value flow
 * through the pipeline instead of failing the task.
 */
val parse = udf((value: String) => {
  import org.json._
  // A SQL NULL arrives here as a null reference, and XML.toJSONObject(null)
  // throws a NullPointerException; map null to null instead.
  Option(value).map(xml => XML.toJSONObject(xml).toString).orNull
})
转换后 JSON 数据的结构(schema)。
import org.apache.spark.sql.types._
// Schema of the JSON produced from the XML:
//   visitors: struct { visitor: array<struct { age: long, id: long, sex: string }> }
// Every field is nullable and the array may contain null elements.
val schema: StructType = {
  // One <visitor> element: its three attributes become struct fields.
  val visitorType = StructType(Seq(
    StructField("age", LongType, nullable = true),
    StructField("id", LongType, nullable = true),
    StructField("sex", StringType, nullable = true)
  ))
  // The <visitors> root wraps the repeated <visitor> elements as an array.
  val visitorsType = StructType(Seq(
    StructField("visitor", ArrayType(visitorType, containsNull = true), nullable = true)
  ))
  StructType(Seq(StructField("visitors", visitorsType, nullable = true)))
}
scala> schema.printTreeString
root
|-- visitors: struct (nullable = true)
| |-- visitor: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- age: long (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- sex: string (nullable = true)
// Convert each row's XML to JSON, decode it against `schema`, then emit one
// output row per <visitor> element with its attributes as top-level columns.
// NOTE(review): `visitors.*` also expands an `id` field, so the result has two
// columns named `id` (the row id and the visitor id); alias one of them if
// downstream code needs to select `id` by name.
df
  .withColumn("parsed_xml", from_json(parse(col("xml")), schema))
  // explode_outer keeps rows whose visitor array is null or empty.
  .select(col("id"), col("xml"), explode_outer(col("parsed_xml.visitors.visitor")).as("visitors"))
  // Flatten the exploded struct into individual columns.
  .select(col("id"), col("xml"), col("visitors.*"))
  .show(false)
最终输出
+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---+----+---+
|id |xml |age|id |sex|
+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---+----+---+
|1 |<?xml version="1.0" encoding="utf-8"?> <visitors> <visitor id="9615" age="68" sex="F" /> <visitor id="1882" age="34" sex="M" /> <visitor id="5987" age="23" sex="M" /> </visitors>|68 |9615|F |
|1 |<?xml version="1.0" encoding="utf-8"?> <visitors> <visitor id="9615" age="68" sex="F" /> <visitor id="1882" age="34" sex="M" /> <visitor id="5987" age="23" sex="M" /> </visitors>|34 |1882|M |
|1 |<?xml version="1.0" encoding="utf-8"?> <visitors> <visitor id="9615" age="68" sex="F" /> <visitor id="1882" age="34" sex="M" /> <visitor id="5987" age="23" sex="M" /> </visitors>|23 |5987|M |
+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---+----+---+