{
"顏色標簽": {
"顏色": ["黑色", "白色", "紅色", "藍色", "綠色", "紫色"],
"尺寸": ["超大", "大"]
},
"顏色標簽2": {
"顏色2": ["黑色", "白色", "紅色", "藍色", "綠色", "紫色", "黃色"],
"尺寸2": ["超大", "大"]
}
}
問題
使用SparkSQL將上述的數(shù)據(jù)根據(jù)嵌套提取成3列
解答
with base_tb as (
select '{
"顏色標簽": {
"顏色": ["黑色", "白色", "紅色", "藍色", "綠色", "紫色"],
"尺寸": ["超大", "大"]
},
"顏色標簽2": {
"顏色2": ["黑色", "白色", "紅色", "藍色", "綠色", "紫色", "黃色"],
"尺寸2": ["超大", "大"]
}
}' as col
)
select key, inner_key, json_tuple(inner_json, inner_key) js
from (
select key, json_object_keys(inner_json) inner_keys, inner_json
from (
select key, json_tuple(col, key) inner_json
from (
select json_object_keys(col) keys, col
from base_tb
) as a
LATERAL view explode(keys) as key
) as a
) as a
LATERAL view explode(inner_keys) as inner_key
主要采用Spark 3.1.0新增的json_object_keys
函數(shù)拿到key值锡凝,然后依次提取,最后輸出如下:
key | inner_key | js |
---|---|---|
顏色標簽 | 顏色 | ["黑色", "白色", "紅色", "藍色", "綠色", "紫色"] |
顏色標簽 | 尺寸 | ["超大", "大"] |
顏色標簽2 | 顏色2 | ["黑色", "白色", "紅色", "藍色", "綠色", "紫色"] |
顏色標簽2 | 尺寸2 | ["超大", "大"] |