整體效果
背景描述
在電商領域,如果僅僅依賴傳統詞匹配方式進行商品檢索,會導致一些商品召回錯誤,影響用戶體驗;如搜索【蛋糕奶油】結果查詢出“奶油蛋糕”相關商品,搜索【車手】相關的宣傳海報,結果找到了“汽車手機版”宣傳海報。
架構設計
- 特征提取部分采用傳統的DSSM模型進行相似度訓練,正樣本數據集來自七日內用戶搜索詞和點擊商品的標題及標簽,負樣本來自正樣本隨機組合生成的數據
- 提取后的特征采用ES的dense_vector來存儲,目前最新版本支持knn檢索,較傳統script查詢性能提升不少;索引結構如下:
PUT my-index
{
"mappings": {
"properties": {
"doc_vector": {
"type": "dense_vector",
"dims": 128,
"index": true,
"similarity": "cosine"
},
"title": {
"type": "text",
"analyzer": "ik_max_word"
},
"big_tags": {
"type": "text",
"analyzer": "ik_max_word"
},
"id": {
"type": "keyword"
}
}
}
}
工程實現
準備數據
- 從日志里獲取搜索詞和商品id
import requests
# Fetch (search keyword, item id) pairs from the tracking table over HTTP.
# Per the SQL below, the keyword is the URL-decoded `keywords` parameter of the
# referrer and the id is the last path segment of the viewed page URL; rows are
# limited to PC page views of detail/create pages whose id is longer than 4.
# The response is read as tab-separated text, one row per line.
url = "http://****:8123/?user=&password=&database=app"
rep = requests.post(url, data="SELECT decodeURLComponent(extractURLParameter(ref, 'keywords')) kw, splitByChar('/', path(url))[-1] id from app.scene_tracker where e_t='page_view' and p_l='PC' and ref like '%keywords%' and (url like '%detail%' or url like '%create%' ) and length(id)>4 ").text
kw_array = []
id_array = []
for line in rep.split('\n'):
    if not line:
        # The response normally ends with a newline; ignore the empty tail.
        continue
    fields = line.split('\t')
    if len(fields) < 2:
        # Malformed row: report it and keep both arrays aligned.
        print(line)
        continue
    kw_array.append(fields[0])
    id_array.append(fields[1])
- 補全標題、標簽等信息
import json
# Look up title and tags in Elasticsearch for every distinct clicked item id
# and build id_map_tit: item id -> "<title> <big_tags>".
headers = {
    "Content-Type": "application/json"
}
# Query template; the `{}` after "values" is replaced by a JSON array of ids.
temp = '''
{
"_source": ["big_tags", "id", "title"],
"size":10000,
"query": {
"ids": {
"values": {}
}
}
}'''
id_set = set(id_array)
id_map_tit = {}
# Materialize once so every slice sees the same ordering and the set is not
# converted to a list again on each iteration.
id_list = list(id_set)
for i in range(0, len(id_list), 10000):
    # json.dumps yields a correctly quoted/escaped JSON array, unlike
    # str(list) with the quotes swapped.
    query = temp.replace('{}', json.dumps(id_list[i:i+10000]))
    response = requests.post("http://****:9200/index/_search", data=query.encode(), auth=('guoyanchao',''), headers=headers)
    # Fail loudly on an HTTP error instead of on a missing 'hits' key below.
    response.raise_for_status()
    json_data = json.loads(response.text)
    for item in json_data.get('hits', {}).get('hits', []):
        doc = item.get('_source') or {}
        # Either field may be absent on a document; treat it as empty text.
        title = doc.get('title') or ''
        big_tags = doc.get('big_tags') or ''
        id_map_tit[item.get('_id')] = title + ' ' + big_tags
- 基于正樣本構建負樣本
import random
import numpy as np
from collections import Counter
# Build the labelled pair lists:
#   query_total[i], id_total[i] -> one (query, item id) pair
#   lab_total[i]                -> 1 for a logged click, 0 for a sampled negative
query_total = []
id_total = []
for query, doc_id in zip(kw_array, id_array):
    # Keep only clicks with a non-empty query whose item text was found.
    if query == '' or doc_id not in id_map_tit:
        continue
    id_total.append(doc_id)
    query_total.append(query)
lab_total = np.ones(len(query_total), dtype=int)
cn = len(lab_total)
ids = list(id_map_tit.keys())

# Draw one random item per positive query as a negative. A draw that is itself
# a logged (query, item) click is rejected so a true positive is never
# labelled 0. Retries are bounded; if no valid item is found (e.g. the query
# clicked every known item) that query simply gets no negative.
positive_pairs = set(zip(query_total, id_total))
max_tries = 10
neg_queries = []
neg_ids = []
for k in query_total[:cn]:
    for _ in range(max_tries):
        candidate = random.choice(ids)
        if (k, candidate) not in positive_pairs:
            neg_queries.append(k)
            neg_ids.append(candidate)
            break
query_total.extend(neg_queries)
id_total.extend(neg_ids)
lab_total = np.append(lab_total, np.zeros(len(neg_ids), dtype=int))
特征轉換
- 中文分詞,這里采用的是百度的lac,大家也可以使用其它分詞器,或者單字分詞(考慮到中文詞組合無窮盡,大家線上可采用單字分詞,約1.5w維;個人也推薦使用這種方式,雖然詞義表達上有所降低,但更便于線上維護)
from tensorflow.keras.preprocessing.text import Tokenizer
from LAC import LAC
# Segment every query and item text with LAC, then fit a Keras Tokenizer on
# the resulting token lists to obtain the word -> index vocabulary.
lac = LAC(mode='seg')
query_terms = []
title_terms = []
# The same item appears in many pairs; segment each item's text only once.
segmented_titles = {}
for query, doc_id in zip(query_total, id_total):
    if doc_id not in segmented_titles:
        segmented_titles[doc_id] = lac.run(id_map_tit[doc_id])
    query_terms.append(lac.run(query))
    title_terms.append(segmented_titles[doc_id])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(query_terms+title_terms)
vocab = tokenizer.word_index
# Bare expression kept so a notebook cell displays the vocabulary.
vocab
- 生成詞序列:通過詞袋將中文映射為數字,并拓展為固定序列,便于模型輸入
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fixed input lengths of the two towers (in tokens).
query_max_len = 16
doc_max_len = 128

# Map token lists to vocabulary indices first, then pad/truncate each side to
# its fixed length with pad_sequences' default settings.
query_seq = tokenizer.texts_to_sequences(query_terms)
doc_seq = tokenizer.texts_to_sequences(title_terms)
query_vec = pad_sequences(query_seq, maxlen=query_max_len)
doc_vec = pad_sequences(doc_seq, maxlen=doc_max_len)
構建雙塔模型并訓練
考慮到輸入都是短文本,故表示層采用了經典的長短期記憶模型LSTM來構建雙塔用于特征提取,輸出層采用的是余弦相似度,大家也可以求解向量點積,模型每日更新
import tensorflow as tf
from sklearn.model_selection import train_test_split
# Width of the shared token embedding.
embed_dim = 64
# Number of shuffled negative doc batches CosineLayer appends per step.
NEG = 20
# Declared batch size.
batch_size = 128
class CosineLayer():
    """Cosine-similarity softmax loss with in-batch negative sampling.

    Calling the instance on ``[query_encoding, doc_encoding]`` wraps the
    computation in a Keras ``Lambda`` layer and returns its output: the mean
    negative log-probability of the paired document among ``neg + 1``
    candidates (the paired document plus ``neg`` row-shuffled copies of the
    batch's documents).

    Args:
        neg: number of shuffled negative batches; ``None`` (default) uses the
            module-level ``NEG`` at call time.
        gamma: factor the cosine similarities are multiplied by before the
            softmax (default 20, as before).
        eps: lower bound for the norm product, guarding the division against
            zero-length encodings.
    """

    def __init__(self, neg=None, gamma=20, eps=1e-8):
        self.neg = neg
        self.gamma = gamma
        self.eps = eps

    def __call__(self, inputs):
        neg = NEG if self.neg is None else self.neg
        gamma = self.gamma
        eps = self.eps

        def _cosine(x):
            query_encoder, doc_encoder = x
            # Stack the real docs followed by `neg` row-shuffled copies along
            # the batch axis: block 0 is the positive, blocks 1..neg negatives.
            doc_encoder_fd = doc_encoder
            for _ in range(neg):
                ss = tf.gather(doc_encoder, tf.random.shuffle(tf.range(tf.shape(doc_encoder)[0])))
                doc_encoder_fd = tf.concat([doc_encoder_fd, ss], axis=0)
            # Repeat the queries so row i of every block pairs with query i.
            query_norm = tf.tile(tf.sqrt(tf.reduce_sum(tf.square(query_encoder), axis=1, keepdims=True)), [neg + 1, 1])
            doc_norm = tf.sqrt(tf.reduce_sum(tf.square(doc_encoder_fd), axis=1, keepdims=True))
            query_encoder_fd = tf.tile(query_encoder, [neg + 1, 1])
            prod = tf.reduce_sum(tf.multiply(query_encoder_fd, doc_encoder_fd, name="sim-multiply"), axis=1, keepdims=True)
            norm_prod = tf.multiply(query_norm, doc_norm)
            # Floor the denominator so an all-zero encoding cannot yield NaN.
            cos_sim_raw = tf.truediv(prod, tf.maximum(norm_prod, eps))
            # Rearrange to one row per query with neg + 1 columns; column 0 is
            # the paired document.
            cos_sim = tf.transpose(tf.reshape(tf.transpose(cos_sim_raw), [neg + 1, -1])) * gamma
            # log_softmax avoids log(0) when the softmax probability underflows.
            log_prob = tf.nn.log_softmax(cos_sim, name="sim-softmax")
            hit_log_prob = tf.slice(log_prob, [0, 0], [-1, 1], name="sim-slice")
            loss = -tf.reduce_mean(hit_log_prob, name="sim-mean")
            return loss

        output_shape = (1,)
        value = tf.keras.layers.Lambda(_cosine, output_shape=output_shape)([inputs[0], inputs[1]])
        return value
keras_layers = tf.keras.layers

# Integer token-id inputs for the two towers.
query_input = keras_layers.Input(shape=(query_max_len, ), name="query_input")
doc_input = keras_layers.Input(shape=(doc_max_len, ), name="doc_input")

# A single embedding table is shared by both towers.
embedding = keras_layers.Embedding(len(vocab)+1, embed_dim)
query_embed = embedding(query_input)
doc_embed = embedding(doc_input)

# Each tower is one LSTM producing a 128-dimensional encoding.
query_encoder = keras_layers.LSTM(128, name="query_tower")(query_embed)
doc_encoder = keras_layers.LSTM(128, name="doc_tower")(doc_embed)

# The training model outputs the loss computed by CosineLayer, so the compiled
# loss function just passes the prediction through and ignores y_true.
cos_sim = CosineLayer()([query_encoder, doc_encoder])
model = tf.keras.models.Model(inputs=[query_input, doc_input], outputs=cos_sim)
model.compile(optimizer="adam", loss=lambda y_true, y_pred: y_pred)

# Stand-alone encoders sharing the trained layers: query tower and doc tower.
query_model = tf.keras.models.Model(inputs=query_input, outputs=query_encoder)
doc_model = tf.keras.models.Model(inputs=doc_input, outputs=doc_encoder)
# The model's loss ignores y_true and always treats the doc paired with each
# query as the positive (negatives are sampled inside the batch). Feeding the
# label-0 random pairs would therefore train them as matches, so only the
# logged positive pairs are used here.
positive_mask = lab_total == 1
pos_query_vec = query_vec[positive_mask]
pos_doc_vec = doc_vec[positive_mask]
pos_labels = lab_total[positive_mask]

train_q, test_q, train_p, test_p, train_y, test_y = train_test_split(
    pos_query_vec, pos_doc_vec, pos_labels, test_size=0.3)
# One numpy array per model input.
train_inputs = [np.array(train_q), np.array(train_p)]
test_inputs = [np.array(test_q), np.array(test_p)]
print(train_q[:1], train_p[:1])
# train the model
print("[INFO] training model...")
# NOTE(review): the module-level batch_size (128) is not used here; confirm
# which value is intended.
model.fit(
    train_inputs, train_y,
    validation_data=(test_inputs, test_y),
    epochs=40, batch_size=32, verbose=1)
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True)
導出特征
- 通過獲取訓練好的doc_tower將商品特征導入到ES,另外也可以保存 doc_model 用于實時索引商品特征
import requests
import json

# Encode every distinct item with the doc tower and index id, text and vector
# into Elasticsearch, one document per item.
doc_vec_by_id = dict(zip(id_total, doc_vec))
headers = {
    "Content-Type": "application/json"
}
if doc_vec_by_id:
    doc_ids = list(doc_vec_by_id)
    # Encode all items in batches instead of one model call per item.
    doc_embeddings = doc_model.predict(np.array([doc_vec_by_id[doc_id] for doc_id in doc_ids]))
    for doc_id, doc_embedding in zip(doc_ids, doc_embeddings):
        # json.dumps escapes quotes/backslashes in the text and serializes the
        # vector as plain JSON numbers (tolist() converts numpy scalars).
        data = json.dumps({
            "id": str(doc_id),
            "title": id_map_tit[doc_id],
            "doc_vector": doc_embedding.flatten().tolist(),
        }, ensure_ascii=False)
        response = requests.post("http://****:9200/my-index/_doc/"+str(doc_id), data=data.encode(), auth=('guoyanchao',''), headers=headers)
        if not response.ok:
            # Report rejected documents but keep exporting the rest.
            print(doc_id, response.status_code, response.text)
線上服務
- 獲取query特征
import requests
from tensorflow.keras.preprocessing.sequence import pad_sequences
from LAC import LAC
# Encode one search query with the trained query tower; the resulting vector
# is what goes into the `query_vector` field of the ES requests.
lac = LAC(mode='seg')
query_layer_model = tf.keras.models.Model(
    inputs=[query_input],
    outputs=model.get_layer(name='query_tower').output
)
# Sample query; the stray "(qǐng)" annotation that had leaked into the literal
# is removed so the intended text is embedded.
query = "婚禮邀請函"
# Same preprocessing as training: segment, map to indices, pad to fixed length.
query_seq = tokenizer.texts_to_sequences([lac.run(query)])
qvec = pad_sequences(query_seq, maxlen=query_max_len).tolist()
query_embedding = query_layer_model(np.reshape(qvec, [1,query_max_len]))
# Bare expression kept so a notebook cell displays the flat vector.
query_embedding.numpy().flatten()
- ES查詢語句,這里使用knn檢索,其中filter部分可搭配其它DSL語句做商品的進一步篩選以提升匹配精度
GET my-index/_knn_search
{
"knn": {
"field": "doc_vector",
"query_vector":[],
"k": 20,
"num_candidates": 1000
} ,
"filter": {
"match": {
"title": ""
}
},
"_source": ["title" ]
}
傳統的script檢索
GET my-index/_search
{
"_source": ["title"],
"query": {
"script_score": {
"query" : {
"match_all": {}
},
"script": {
"source": """
double value = dotProduct(params.query_vector, 'doc_vector');
return sigmoid(1, Math.E, -value);
""",
"params": {
"query_vector": []
}
}
}
}
}
后期優化
目前模型數據只是考慮了query和商品文本描述,后期會增加更多維度的訓練特征,如用戶短期興趣偏好特征,商品屬性、價格以及封面圖特征等
匹配層采用歐式距離要優于點積和余弦
# -- Euclidean distance
# Per-row L2 distance between the two encodings, kept as a (batch, 1) column
# and passed through a single sigmoid unit.
squared_diff = tf.square(query_encoder - doc_encoder)
output = tf.sqrt(tf.reduce_sum(squared_diff, axis=1, keepdims=True))
output = tf.keras.layers.Dense(1, activation='sigmoid')(output)
- 表示層優化模型結構,將LSTM換成雙向結構
def _bilstm_tower(embedded, name):
    """Build one bidirectional-LSTM tower and return its 128-d encoding.

    Args:
        embedded: embedded token sequence for this tower.
        name: name given to the tower's final Dense layer.

    Fresh forward/backward LSTM layers are created on every call so the two
    towers do not end up sharing a backward LSTM instance (and its weights).
    """
    forward_layer = tf.keras.layers.LSTM(128, return_sequences=True)
    backward_layer = tf.keras.layers.LSTM(128, activation='tanh', return_sequences=True, go_backwards=True)
    encoded = tf.keras.layers.Bidirectional(forward_layer, backward_layer=backward_layer)(embedded)
    # Flatten the per-step outputs into one vector before the dense head.
    encoded = tf.keras.layers.Flatten()(encoded)
    merged = tf.keras.layers.Dropout(0.1)(encoded)
    merged = tf.keras.layers.BatchNormalization()(merged)
    merged = tf.keras.layers.Dense(128, activation='tanh')(merged)
    return tf.keras.layers.Dense(128, activation='tanh', name=name)(merged)


query_encoder = _bilstm_tower(query_embed, "query_tower")
doc_encoder = _bilstm_tower(doc_embed, "doc_tower")
# Dot product of the two encodings, scaled by one bias-free sigmoid unit.
output = tf.reduce_sum(query_encoder*doc_encoder, axis = 1)
output = tf.expand_dims(output, 1)
output = tf.keras.layers.Dense(1, activation='sigmoid', use_bias=False)(output)