需求:解析XML文件之後,寫入到hbase(xml文件格式為GBK,spark讀進(jìn)來會亂碼)
痛點:普通的寫入太慢太耗費時間
1.spark解決讀取GBK亂碼問題
/**
 * Step 1: read GBK-encoded XML source files without mojibake and dump them
 * to CSV for the next stage.
 *
 * `sc.textFile` always decodes as UTF-8, which garbles GBK input; reading via
 * `hadoopFile` with `TextInputFormat` gives us the raw bytes of each line so
 * we can decode them with the GBK charset ourselves.
 */
object ParseXml {
  def main(args: Array[String]): Unit = {
    // Build the SparkSession (local mode for development).
    val spark = SparkSession.builder.master("local[*]").appName("Parse_xml").getOrCreate()
    // Decode each line's raw bytes as GBK. NOTE: `Text.getBytes` returns the
    // backing array, which may be longer than the payload, so the explicit
    // (offset, length) form of the String constructor is required.
    val data_DS: RDD[String] = spark.sparkContext
      .hadoopFile("/Users/Desktop/2017003_2010-2019/2018-2019",
        classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
      .map(pair => new String(pair._2.getBytes, 0, pair._2.getLength, "GBK")) // was `pair.2` — compile error
    // Required for the RDD -> DataFrame conversion below (was truncated to
    // `import spark.implicits.`, which does not compile).
    import spark.implicits._
    data_DS.toDF().createOrReplaceTempView("categ_entry")
    // Columns of interest downstream:
    // SHENQINGH,FEIYONGZLMC,JIAOFEIJE,JIAOFEISJ,JIAOFEIRXM,PIAOJUDM,SHOUJUH
    spark.sql("select * from categ_entry").write.csv("data/data_csv_2")
    spark.close()
  }
}
2.解析XML輸出為csv文件
/**
 * Step 2: parse the XML records with spark-xml and write the seven business
 * columns out as CSV.
 */
object ParseXml2 {
  def main(args: Array[String]): Unit = {
    // Build the SparkSession (local mode for development).
    // val spark: SparkSession = SparkSession.builder().appName("ConfigFictoryDemo").master("local[2]").getOrCreate()
    val spark = SparkSession.builder.master("local[*]")
      //.config("spark.debug.maxToStringFields", "100")
      .appName("Parse_xml").getOrCreate()
    // NOTE(review): "SHENQINGH" is not a spark-xml option name — spark-xml
    // selects the row element via .option("rowTag", "..."); unknown options
    // are silently ignored, so this line likely does nothing. Also confirm
    // the input path: "data/data_csv_2" is the CSV output of step 1, not XML.
    val df = spark.read
      .format("com.databricks.spark.xml")
      .option("SHENQINGH", "FEIYONGZLMC")
      .load("data/data_csv_2")
    // Register a temp view. `df` is already a DataFrame, so the original
    // redundant `.toDF()` call is dropped; the unused `sc` local is removed.
    df.createOrReplaceTempView("categ_entry")
    // Project the seven business columns and write them out as CSV.
    spark.sql("select SHENQINGH,FEIYONGZLMC,JIAOFEIJE,JIAOFEISJ,JIAOFEIRXM,PIAOJUDM,SHOUJUH from categ_entry")
      .write.csv("data/result_2")
    spark.close()
  }
}
數(shù)據(jù)格式
2014208081375,實用,180.0,20150630,蕪湖,,47526269
2014208081375,新型,150.0,20141231,蕪湖,,41375489
2014208081375,實用,180.0,20151224,蕪湖,,49007979
3.load到hive表(hive支持load整個文件夾下的數(shù)據(jù))
-- Bulk-load every file under the local result directory into the Hive table
-- (Hive accepts a directory path and loads all files inside it).
load data local inpath "/na/20200513/hive/result" into table hive_info_paid_20200513;
4.Hive表反映射為Hbase表
-- Hive table backed by an HBase table ("process_fee_20200513") through the
-- HBase storage handler: each Hive column maps 1:1 to an HBase cf: qualifier,
-- with `key` bound to the HBase rowkey. This is an internal (managed) table,
-- so dropping it in Hive also drops the HBase table.
-- Fix: the SHENQINGH comment literal was mojibake ("申請?zhí)?" -> "申請號").
CREATE TABLE ods_hive_patent_info_paid_20200513(
key string comment "hbase rowkey",
SHENQINGH string comment "申請號",
JIAOFEISJ string comment "繳費時間",
JIAOFEIJE string comment "繳費金額",
FEIYONGZLMC string comment "費用ZLMC",
JIAOFEIRXM string comment "繳費RXM",
unit string ,
recNum string ,
currency string,
num string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:SHENQINGH,cf:JIAOFEISJ,cf:JIAOFEIJE,cf:FEIYONGZLMC,cf:JIAOFEIRXM,cf:unit,cf:recNum,cf:currency,cf:num")
TBLPROPERTIES("hbase.table.name" = "process_fee_20200513");
(此處我創(chuàng)建的是hive內(nèi)部表,在hive drop掉此表,hbase中的表也會被刪除,根據(jù)個人情況可創(chuàng)建外部表,此處不再贅述)
至此完美將一億條數(shù)據(jù)存到Hbase表