背景:公司有需求將 hive 數據導入 es,之前是通過 datax 進行導數的;但是 datax 多線程很耗內存,並且經常性會出現一些程序上的問題。故想要使用 hive 外部表映射 es 的方式將數據導入 es。
添加 jar 包,創建環境
- 將相關 jar 包添加至 hive 環境:進入 es 官網,下載對應版本 jar 包,網址:https://artifacts.elastic.co
- 提取壓縮包中的 elasticsearch-hadoop-hive-6.3.2.jar。hive 添加 jar 包參考:http://note.youdao.com/noteshare?id=061ad30a8eee86362bb154cf6f923c25&sub=F68ADFE343044A8193EA06026FEAC0B3
- 個人推薦,最有效且長期的方式:將 elasticsearch-hadoop-hive-6.3.2.jar 包放到 /opt/cloudera/parcels/CDH/lib/hive/auxlib 路徑下,然後在 CDH 管理界面重啟 hive。
構建 es 索引
- 注意:es 索引不要加上 "dynamic": "strict",否則極有可能會報錯
- 創建 hive 外部表映射 es
-- Hive external table backed by the elasticsearch-hadoop storage handler.
-- The table holds no data of its own: it is mapped onto the Elasticsearch
-- resource named in es.resource below.
CREATE EXTERNAL TABLE demo (
    -- Document key (referenced by es.mapping.id below).
    -- NOTE(review): no es.mapping.exclude is set, so presumably es_id is also
    -- stored as a regular field of the document - confirm.
    `es_id`              STRING  COMMENT 'ES唯一鍵',
    `oid`                STRING  COMMENT '新聞id',
    -- NOTE(review): the column comment and the nested mapping on the ES side
    -- both suggest a list, while STRUCT carries a single object - confirm
    -- whether ARRAY<STRUCT<...>> is what is actually needed here.
    `enterprises`        STRUCT<eid:STRING, name:STRING>
                                 COMMENT '相關(guān)企業(yè)列表',
    `title`              STRING  COMMENT '新聞標(biāo)題',
    `names`              STRING  COMMENT '相關(guān)人員名列表',
    `companies`          STRING  COMMENT '相關(guān)公司名列表',
    `url`                STRING  COMMENT '新聞鏈接',
    `neg_index`          STRING  COMMENT '情感正負面打分',
    `sentiment`          STRING  COMMENT '情感正負面',
    `brief`              STRING  COMMENT '摘要',
    `keywords`           STRING  COMMENT '關(guān)鍵詞',
    `keywords_desc`      STRING  COMMENT '關(guān)鍵字前后文字',
    `class_type`         STRING  COMMENT '大類',
    `pub_time`           STRING  COMMENT '發(fā)布時間',
    `pub_time_date_long` STRING  COMMENT '發(fā)布時間',
    `pub_time_date`      STRING  COMMENT '發(fā)布時間',
    `pub_time_year`      STRING  COMMENT '發(fā)布年份',
    `pub_time_month`     STRING  COMMENT '發(fā)布年月',
    `tag_list`           STRING  COMMENT '標(biāo)簽列表',
    `source`             STRING  COMMENT '來源',
    `theme_keywords`     STRUCT<count:INT, name:STRING>
                                 COMMENT '主題詞',
    `deprecated`         STRING  COMMENT 'I 新增 U更新 D 刪除'
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES (
    -- Address (ip:port) of the Elasticsearch cluster node to connect to.
    'es.nodes' = 'ip:9200',
    -- Target index/type that rows are written to.
    'es.resource' = 'demo/content',
    -- Create the index automatically on write when it does not exist yet.
    'es.index.auto.create' = 'true',
    -- Column whose value becomes the document _id, i.e. the unique key.
    'es.mapping.id' = 'es_id',
    -- Comma-separated list of hive_column:es_field pairs.
    'es.mapping.names' = 'oid:oid,enterprises:enterprises,title:title,names:names,companies:companies,url:url,neg_index:neg_index,sentiment:sentiment,brief:brief,keywords:keywords,keywords_desc:keywords_desc,class_type:class_type,pub_time:pub_time,pub_time_date_long:pub_time_date_long,pub_time_date:pub_time_date,pub_time_year:pub_time_year,pub_time_month:pub_time_month,tag_list:tag_list,source:source,theme_keywords:theme_keywords,deprecated:deprecated'
);
- 插入數據:
- es 索引:
# Create the demo index up front with an explicit mapping for the content type.
# The index/type pair matches es.resource = demo/content on the Hive external table.
# The HTTP method is written in upper case: method tokens are case-sensitive,
# so this form also works when the request is replayed outside Kibana Console.
PUT /demo
{
  "mappings": {
    "content": {
      "properties": {
        "brief": {
          "type": "text",
          "index": false
        },
        "class_type": { "type": "keyword" },
        "companies": {
          "type": "keyword",
          "ignore_above": 256
        },
        "keywords": {
          "type": "keyword",
          "ignore_above": 256
        },
        "keywords_desc": {
          "type": "text",
          "index": false
        },
        "names": {
          "type": "keyword",
          "ignore_above": 256
        },
        "neg_index": { "type": "half_float" },
        "oid": { "type": "keyword" },
        "pub_time": { "type": "long" },
        "sentiment": { "type": "keyword" },
        "pub_time_date_long": { "type": "keyword" },
        "pub_time_date": { "type": "keyword" },
        "pub_time_year": { "type": "keyword" },
        "pub_time_month": { "type": "keyword" },
        "source": {
          "type": "keyword",
          "ignore_above": 50
        },
        "tag_list": { "type": "keyword" },
        "enterprises": {
          "type": "nested",
          "properties": {
            "eid": { "type": "keyword" },
            "name": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "theme_keywords": {
          "type": "nested",
          "properties": {
            "count": { "type": "short" },
            "name": {
              "type": "keyword",
              "ignore_above": 50
            }
          }
        },
        "title": {
          "type": "text",
          "analyzer": "ik_smart"
        },
        "url": {
          "type": "keyword",
          "ignore_above": 256
        },
        "deprecated": {
          "type": "keyword",
          "doc_values": true
        }
      }
    }
  },
  "settings": {
    "index": {
      "max_result_window": 30000,
      "indexing.slowlog.level": "info",
      "indexing.slowlog.source": "1000",
      "indexing.slowlog.threshold.index.info": "5s",
      "indexing.slowlog.threshold.index.warn": "10s",
      "search.slowlog.level": "info",
      "search.slowlog.threshold.query.info": "1s",
      "search.slowlog.threshold.query.warn": "4s",
      "routing.rebalance.enable": "replicas",
      "refresh_interval": "120s",
      "store.type": "niofs",
      "number_of_shards": "3",
      "number_of_replicas": "0"
    }
  }
}