elasticsearch分析器三功能自定義
- 字符過濾器(char_filter)
首先,字符串按順序通過每個字符過濾器,它們的任務是在分詞前整理字符串
一個字符過濾器可以用來去掉HTML,或者將&轉化成and
- 分詞器(tokenizer)
其次,字符串被分詞器分爲單個的詞條,一個簡單的分詞器遇到空格和標點的時候,可能會將文本拆分成詞條
Hello how are you?會被ES預設的分詞器standard分成hello、how、are、you
- Token 過濾器 (filter)
最后,詞條按順序通過每個 token 過濾器,這個過程可能會改變詞條(Quick -> quick)、
刪除詞條(a、an、and、the...)、增加詞條(jump和leap這種同義詞)
- 過濾器(filter)解析
edge_ngram_filter:將每個詞都進行進一步的切分,用于即時搜索(instant search)。
`min_gram`是生成詞條的最小長度;設爲1表示只要用戶搜索了一個字符我們就去進行匹配。
`max_gram`表示匹配的最大長度,最大長度越長越占用空間
pinyin_simple_filter:拼音首字母的過濾器
pinyin_full_filter:拼音全拼的過濾器
自定義分析器
- 自定義setting格式
PUT 127.0.0.1:9200/mytest
{
"settings": {
"analysis": {
"char_filter": { 自定義的字符過濾器 },
"tokenizer": { 自定義的分詞器 },
"filter": { 自定義的token過濾器 },
"analyzer": { 自定義的分析器,可以將上面的char_filter、tokenizer、filter用不同的組合拼起來,形成不同的分析器 }
}
}
}
實例
#設置setting
PUT /enterpriseextend
{
"settings": {
"analysis": {
"filter": {
"edge_ngram_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 50
},
"pinyin_simple_filter": {
"type": "pinyin",
"keep_first_letter": true,
"keep_separate_first_letter": false,
"keep_full_pinyin": false,
"keep_original": false,
"limit_first_letter_length": 50,
"lowercase": true
},
"pinyin_full_filter": {
"type": "pinyin",
"keep_first_letter": false,
"keep_separate_first_letter": false,
"keep_full_pinyin": true,
"none_chinese_pinyin_tokenize": true,
"keep_original": false,
"limit_first_letter_length": 50,
"lowercase": true
}
},
"tokenizer": {
"ik_max_word": {
"type": "ik_max_word",
"use_smart": true
}
},
"analyzer": {
"ngramIndexAnalyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"edge_ngram_filter",
"lowercase"
]
},
"ikIndexAnalyzer": {
"type": "custom",
"tokenizer": "ik_max_word"
},
"pinyiSimpleIndexAnalyzer": {
"tokenizer": "keyword",
"filter": [
"pinyin_simple_filter",
"edge_ngram_filter",
"lowercase"
]
},
"pinyiFullIndexAnalyzer": {
"tokenizer": "keyword",
"filter": [
"pinyin_full_filter",
"lowercase"
]
}
}
}
}
}
#設置mapping
PUT enterpriseextend/_mapping
{
"properties": {
"id": {
"type": "long"
},
"entName": {
"type": "text",
"analyzer": "ikIndexAnalyzer",
"fields": {
"ngram": {
"type": "text",
"analyzer": "ngramIndexAnalyzer"
},
"SPY": {
"type": "text",
"analyzer": "pinyiSimpleIndexAnalyzer"
},
"FPY": {
"type": "text",
"analyzer": "pinyiFullIndexAnalyzer"
}
}
}
}
}
#插入語句
PUT enterpriseextend/_doc/1
{
"entName":"確實不是啥好東西"
}
#三種查詢
GET enterpriseextend/_search
{
"query": {
"match": {
"entName.ngram": "確實不是啥好東西"
}
}
}
GET enterpriseextend/_search
{
"query": {
"match": {
"entName.SPY": "qsbsshdx"
}
}
}
GET enterpriseextend/_search
{
"query": {
"match": {
"entName.FPY": "queshibushishahaodongxi"
}
}
}
綜合查詢
GET enterpriseextend/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"entName.ngram": {
"query": "確實不是啥好東西",
"boost": 5
}
}
},
{
"match": {
"entName.SPY": {
"query": "qsbsshdx",
"boost": 1
}
}
},
{
"match": {
"entName.FPY": {
"query": "queshibushishahaodongxi",
"boost": 0.8
}
}
}
]
}
}
}