My classmate Xiao Kong recommended two algorithms that tackle NLP problems with recurrent neural networks plus a CRF: Chinese word segmentation and named entity recognition, both open-source projects on GitHub. I had used some open-source and commercial segmentation tools before, such as BosonNLP, and found that these two GitHub projects, [BiLSTM+CRF] and [NeuroNER], both work remarkably well; thanks to koth and Franck Dernoncourt. This post does not analyze the models themselves; it only describes freezing the two models and wrapping them together into an NLP-related interface, which Flask can then publish very conveniently for the other modules of a project to use~
Segmentation model
Train the model following the steps described in the README. Once training is done, I wanted to publish segmentation as an interface for the other modules of the project, so here I simply load the model frozen by the author in Python; the final script is shown below. It only handles single-sentence segmentation; for long text, first split it into short pieces, following the splitting function called in kcws/cc/seg_backend_api.cc in the author's GitHub repo.
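The C++ splitter is not reproduced here; a rough Python equivalent of that idea (my own sketch, not a port of seg_backend_api.cc) could look like this, with max_len matching the seq_max_len of 80 used below:

# -*- coding:utf-8 -*-
import re

def split_long_text(text, max_len=80):
    """Split a long text into chunks of at most max_len characters,
    preferring to break after sentence-final punctuation."""
    pieces = re.split(u"([。?!;,])", text)  # capture group keeps the delimiters
    chunks, current = [], u""
    for piece in pieces:
        if len(current) + len(piece) <= max_len:
            current += piece
        else:
            if current:
                chunks.append(current)
            while len(piece) > max_len:  # hard cut for a single oversized piece
                chunks.append(piece[:max_len])
                piece = piece[max_len:]
            current = piece
    if current:
        chunks.append(current)
    return chunks

The single-sentence segmentation script itself follows.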
"""
分詞主類分俯,加載預先訓練好并固化好的模型文件肾筐,加載訓練生成的詞-id映射表并進行分詞
"""
import numpy as np
import tensorflow as tf
import math
class Node:
"""
trie樹中的節(jié)點, weight=0 表示該節(jié)點不是葉子節(jié)點
"""
def __init__(self):
self.weight = 0.0
self.next = {}
class Trie:
    """
    Trie implementation used to hold the user-defined dictionary.
    Each node carries a weight plus links to its child nodes.
    The weight starts at 0 and is set to the user-defined weight
    on the node that ends a dictionary word.
    """
    def __init__(self):
        self.root = Node()

    def push_node(self, word, weight):
        """
        Insert a word and its weight, creating trie nodes along the path.
        :param word: the word
        :param weight: its weight
        :return: None
        """
        word_len = len(word)
        if word_len == 0:
            return
        temp = self.root
        for index, char in enumerate(word):
            if char not in temp.next:
                temp.next[char] = Node()
            temp = temp.next[char]
            if index + 1 == word_len:
                temp.weight = weight

    def search(self, sentence):
        """
        Scan the input text with the trie, locate user-defined words and
        return each word's position and weight.
        :param sentence: the text
        :return: e.g. [([0, 1], 4.0), ([9, 10, 11], 4.0)]
            a list of tuples, where [0, 1] means the word occupies
            indices 0 and 1 of the sentence and 4.0 is its user-defined weight
        """
        fake_list = []
        word_len = len(sentence)
        if word_len == 0:
            return fake_list
        point = self.root
        index = 0
        pre_match = -1
        word_range = None
        while index < word_len:
            word = sentence[index]
            if word in point.next:
                if pre_match < 0:
                    pre_match = index
                point = point.next[word]
                index += 1
                if point.weight > 0:
                    word_range = ([i for i in range(pre_match, index)], point.weight)
                if index == word_len and word_range:
                    fake_list.append(word_range)
                continue
            else:
                if word_range:
                    fake_list.append(word_range)
                word_range = None
                point = self.root
                index = pre_match + 1 if pre_match >= 0 else index + 1
                pre_match = -1
        return fake_list
class Segment:
    def __init__(self, frozen_model_path, vocab_path, user_dict_path=None):
        self.seq_max_len = 80
        self.frozen_model_path = frozen_model_path
        self.base_vocab_path = vocab_path
        self.user_dict_path = user_dict_path
        self.user_dict = None
        self.sess = None
        self.graph = None
        self.input = None
        self.transitions = None
        self.unary_score = None
        self.word_2_index = dict()
        self.load_model()

    def load_model(self):
        """
        Load the frozen network, look up its input and output nodes,
        and fetch the trained CRF transition matrix.
        Returns: None
        """
        self.load_basic_vocab()
        self.load_user_dict()
        with tf.gfile.GFile(self.frozen_model_path, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def, input_map=None, return_elements=None,
                                name="prefix", op_dict=None, producer_op_list=None)
        self.graph = graph
        self.sess = tf.Session(graph=self.graph)
        transitions_ = self.graph.get_tensor_by_name("prefix/transitions:0")
        self.unary_score = self.graph.get_tensor_by_name("prefix/Reshape_7:0")
        self.input = self.graph.get_tensor_by_name("prefix/input_placeholder:0")
        self.transitions = self.sess.run([transitions_])
        self.transitions = np.squeeze(np.array(self.transitions), axis=0)
    def load_basic_vocab(self):
        """
        Load the character-to-id mapping, one "character id" pair per line.
        Returns: None
        """
        with open(self.base_vocab_path, "r") as file_rd:
            for line in file_rd.readlines():
                line = line.strip()
                try:
                    word, index = line.split(" ")
                    self.word_2_index[word] = int(index)
                except Exception:
                    pass

    def load_user_dict(self):
        """
        Load the user-defined dictionary into a trie; it is used to adjust
        the unary scores before viterbi decoding.
        :return: None
        """
        if not self.user_dict_path:
            self.user_dict = None
            return
        self.user_dict = Trie()
        with open(self.user_dict_path, "r") as file_rd:
            for line in file_rd.readlines():
                line = line.strip()
                if not line:
                    continue
                try:
                    word, weight = line.split(" ")
                    self.user_dict.push_node(word, float(weight))
                except Exception:
                    pass
    def fake_predication(self, unary_scores, sentence):
        """
        Search the sentence with the user dictionary and overwrite the
        probability vectors at the matched positions, so that the
        subsequent viterbi decoding favours the user-defined words.
        :param unary_scores: unary scores of the sentence, shape=(sentence_len, state_len)
        :param sentence: the original sentence
        :return: None
        """
        fake_list = self.user_dict.search(sentence)
        sentence_len = unary_scores.shape[0]
        for item in fake_list:
            if max(item[0]) >= sentence_len:
                continue
            word_range = item[0]
            word_weight = item[1]
            word_len = len(word_range)
            for index in range(word_len):
                weight_total = 4.0 + word_weight
                # states: 0 = single, 1 = begin, 2 = middle, 3 = end
                if index == 0:
                    weights = [1.0, 1.0 + word_weight, 1.0, 1.0]
                elif index + 1 == word_len:
                    weights = [1.0, 1.0, 1.0, 1.0 + word_weight]
                else:
                    weights = [1.0, 1.0, 1.0 + word_weight, 1.0]
                weights = [math.log(i / weight_total) for i in weights]
                unary_scores[word_range[index]] = weights
    def predict(self, content):
        """
        Main segmentation entry.
        Args:
            content: the text to segment
                e.g. 趙雅淇灑淚道歉和林丹沒有任何經(jīng)濟關(guān)系
        Returns: the segmented tokens
            e.g. [{"tok": "趙雅淇"}, {"tok": "灑淚"}, {"tok": "道歉"}, {"tok": "和"},
                  {"tok": "林丹"}, {"tok": "沒有"}, {"tok": "任何"}, {"tok": "經(jīng)濟"}, {"tok": "關(guān)系"}]
        """
        result = []
        word_index_seq = []
        for word in content:
            if word in self.word_2_index:
                word_index_seq.append(self.word_2_index[word])
            else:
                word_index_seq.append(1)  # id 1 is reserved for unknown characters
        word_index_seq.extend([0] * (self.seq_max_len - len(word_index_seq)))
        feed_input = np.expand_dims(np.array(word_index_seq), axis=0)
        unary_score_val = self.sess.run([self.unary_score], {self.input: feed_input})
        seq_len = sum([1 for item in word_index_seq if item > 0])
        tf_unary_scores_ = np.squeeze(unary_score_val[0], axis=0)
        tf_unary_scores_ = tf_unary_scores_[:seq_len]
        if self.user_dict:
            # adjust the unary scores with the user dictionary before decoding
            self.fake_predication(tf_unary_scores_, content)
        tag_sequence, _ = tf.contrib.crf.viterbi_decode(tf_unary_scores_, self.transitions)
        # tags: 0 = single, 1 = begin, 2 = middle, 3 = end
        pre_word = ""
        for tag, word in zip(tag_sequence, content):
            if tag == 0:
                pre_word = ""
                result.append({"tok": word})
            elif tag == 1 or tag == 2:
                pre_word += word
            elif tag == 3:
                pre_word += word
                result.append({"tok": pre_word})
                pre_word = ""
        return result
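As a quick sanity check of the Trie above (run in the same module as the classes just defined):

trie = Trie()
trie.push_node(u"林丹", 4.0)
print(trie.search(u"趙雅淇灑淚道歉和林丹沒有任何經(jīng)濟關(guān)系"))
# -> [([8, 9], 4.0)]  i.e. "林丹" occupies indices 8-9 with weight 4.0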
Here frozen_model_path is the pb file generated by the freezing step, vocab_path is the character-ID mapping file, and user_dict_path is the user-defined dictionary.
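The original screenshot of these files is not reproduced here; judging from load_basic_vocab and load_user_dict, both are plain text with one space-separated entry per line, roughly like this (the ids and weights below are made up for illustration):

# basic vocab: one "character id" pair per line
的 2
一 3
是 4

# user dict: one "word weight" pair per line
林丹 4.0
趙雅淇 4.0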
Following koth's approach, the unary score matrix the network outputs for a sentence is adjusted with the user dictionary and its weights before viterbi decoding, and only then decoded. Had I designed this myself, I would probably have taken the decoded state sequence and forcibly split and merged it around the user-defined words instead.
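For completeness, a minimal usage sketch of the class above (the file paths are placeholders):

seg = Segment("seg_model.pb", "basic_vocab.txt", user_dict_path="user_dict.txt")
print(seg.predict(u"趙雅淇灑淚道歉和林丹沒有任何經(jīng)濟關(guān)系"))
# -> [{"tok": "趙雅淇"}, {"tok": "灑淚"}, {"tok": "道歉"}, ...]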
Named entity recognition model
The NER project is used in a similar way, and is worth a look if you are interested; in the end the model is frozen in much the same fashion and wrapped into the interface.
# -*- coding:utf-8 -*-
"""
Main NER class: loads the pre-trained frozen model file
and the word-embedding file.
"""
import numpy as np
import tensorflow as tf
import dataset_predict as ds


class NERPredict:
    def __init__(self, frozen_model_path, word_2_vec_path, trained_model_path):
        self.graph = None
        self.frozen_model_path = frozen_model_path
        self.word_vec_path = word_2_vec_path
        self.pre_trained_model_path = trained_model_path
        with tf.gfile.GFile(self.frozen_model_path, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def, input_map=None, return_elements=None,
                                name="prefix", op_dict=None, producer_op_list=None)
        self.graph = graph
        self.transition_params_trained = self.graph.get_tensor_by_name('prefix/crf/transitions:0')
        self.input_token_indices = self.graph.get_tensor_by_name('prefix/input_token_indices:0')
        self.dropout_keep_prob = self.graph.get_tensor_by_name('prefix/dropout_keep_prob:0')
        self.unary_scores = self.graph.get_tensor_by_name('prefix/feedforward_before_crf/scores:0')
        self.predictions = self.graph.get_tensor_by_name('prefix/feedforward_before_crf/predictions:0')
        self.sess = tf.Session(graph=self.graph)
        self.dataset = ds.Dataset(verbose=False, debug=False)
        self.parameters = {
            'token_pretrained_embedding_filepath': self.word_vec_path,
            'pretrained_model_folder': self.pre_trained_model_path,
            'remap_unknown_tokens_to_unk': True
        }
        self.dataset.load_dataset(self.parameters)
        transition_temp = self.sess.run([self.transition_params_trained])
        transition_temp = np.array(transition_temp)
        self.transition = np.squeeze(transition_temp, axis=0)
    def predict(self, tokens_of_sentence):
        """
        Main NER entry.
        Args:
            tokens_of_sentence: a token sequence (a list of tokens,
                or a space-separated string)
        Returns: the predicted label sequence
        """
        small_score = -1000.0
        large_score = 0.0
        if isinstance(tokens_of_sentence, str):
            tokens_of_sentence = tokens_of_sentence.split(" ")
        token_indices, label_indices, character_indices_padded, character_indices, token_lengths, \
            characters, label_vector_indices = self.dataset.convert_to_indices_for_sentence(
                tokens_of_sentence)
        feed_dict = {
            self.input_token_indices: token_indices["deploy"][0],
            self.dropout_keep_prob: 1.
        }
        unary_scores, predictions = self.sess.run([self.unary_scores, self.predictions], feed_dict)
        unary_scores = np.squeeze(np.array(unary_scores))
        # pad two extra columns for the artificial start/end labels
        concate_array = np.ones(shape=(unary_scores.shape[0], 2), dtype=np.float32) * small_score
        unary_scores = np.concatenate((unary_scores, concate_array), axis=1)
        start_unary_scores = np.array([[small_score] * self.dataset.number_of_classes + [large_score, small_score]])
        end_unary_scores = np.array([[small_score] * self.dataset.number_of_classes + [small_score, large_score]])
        unary_scores = np.concatenate((start_unary_scores, unary_scores, end_unary_scores), axis=0)
        predictions, _ = tf.contrib.crf.viterbi_decode(unary_scores, self.transition)
        # strip the artificial start/end positions
        predictions = predictions[1:-1]
        assert len(predictions) == len(tokens_of_sentence)
        predictions = [self.dataset.index_to_label[index] for index in predictions]
        return predictions
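A usage sketch (the paths are placeholders of my own; with a CoNLL-2003-style model the labels would come out in BIO form):

ner = NERPredict("ner_frozen_model.pb", "glove.6B.100d.txt", "trained_model/")
print(ner.predict("Steve Jobs founded Apple"))
# e.g. ['B-PER', 'I-PER', 'O', 'B-ORG']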
dataset_predict is a class that ships with the project; I was lazy and copied it straight over. How to freeze the original model into a pb file is not covered here. As the code shows, the RNN-plus-CRF model structures for segmentation and for named entity recognition are almost identical, and both end with viterbi decoding; take a look if you are interested.
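Both predict methods end with the same call. A toy example of tf.contrib.crf.viterbi_decode with made-up numbers shows what it does; it is a pure numpy routine, so no session is needed:

import numpy as np
import tensorflow as tf

unary = np.array([[4.0, 1.0, 0.5],
                  [1.0, 3.0, 1.0],
                  [0.5, 1.0, 4.0]])  # (seq_len, num_tags) per-position scores
trans = np.array([[1.0, 2.0, 0.1],
                  [0.1, 1.0, 2.0],
                  [2.0, 0.1, 1.0]])  # (num_tags, num_tags) transition scores
tags, score = tf.contrib.crf.viterbi_decode(unary, trans)
print(tags, score)  # -> [0, 1, 2] 15.0, the best tag path and its unnormalized score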
Model freezing (for the NER project)
"""
frozen the model test
"""
graph = tf.get_default_graph()
input_graph_def = graph.as_graph_def()
output_nodes = ['crf/transitions', 'feedforward_before_crf/scores', 'feedforward_before_crf/predictions']
output_graph_def = graph_util.convert_variables_to_constants(sess, input_graph_def, output_nodes)
with tf.gfile.GFile("frozen_model.pb", "wb") as f:
f.write(output_graph_def.SerializeToString())
print("frozen the ner model")
"""
frozen model test finish
"""
You are welcome to discuss; if you spot mistakes in this post, please point them out, thanks~
email: bidai541@foxmail.com