1. HMM
1.1 Model Principle
An HMM has five basic elements, {N, M, A, B, π}. For the sequence labeling task (NER), they can be defined as:
- N: the finite set of states. Here, the tag behind each token.
- M: the finite set of observations. Here, each token itself.
- A: the state transition probability matrix. Here, the probability that one tag is followed by another tag.
- B: the observation probability matrix, also called the emission probability matrix. Here, the probability of generating a given token under a given tag.
- π: the initial state probability distribution. Here, the probability of each tag appearing at the first position.
All of these elements can be estimated by counting over the training corpus. Given these statistics, the Viterbi algorithm then recovers the tag sequence behind a sequence of tokens. NER is essentially sequence labeling: once the tag set and the label patterns are defined, the entities can be read off the predicted tag sequence, as the sketch below illustrates.
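As a concrete illustration of that last step, here is a minimal sketch of span extraction from a BIOES-style tag sequence (the helper name and most tag names are illustrative assumptions; only the B-TITLE/E-TITLE example reappears in the training docstring further down):

def extract_entities(chars, tags):
    """Collect (entity_text, entity_type) pairs from a BIOES-style tag sequence."""
    entities, buffer, ent_type = [], [], None
    for char, tag in zip(chars, tags):
        if tag.startswith('B-'):            # beginning of a multi-character entity
            buffer, ent_type = [char], tag[2:]
        elif tag.startswith(('I-', 'M-')) and buffer:
            buffer.append(char)             # inside the entity
        elif tag.startswith('E-') and buffer:
            buffer.append(char)             # end of the entity: flush it
            entities.append((''.join(buffer), ent_type))
            buffer, ent_type = [], None
        elif tag.startswith('S-'):          # single-character entity
            entities.append((char, tag[2:]))
        else:                               # 'O' or a malformed sequence: reset
            buffer, ent_type = [], None
    return entities

# extract_entities(['擔(dān)', '任', '科', '員'], ['O', 'O', 'B-TITLE', 'E-TITLE'])
# -> [('科員', 'TITLE')]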
The model rests on two standard assumptions; for the word at time step $t$ they can be written as follows.

Homogeneous Markov assumption: the state of the hidden Markov chain at any time $t$ depends only on its state at time $t-1$, and is independent of the other states and observations and of $t$ itself:

$$P(i_t \mid i_{t-1}, o_{t-1}, \ldots, i_1, o_1) = P(i_t \mid i_{t-1})$$

Observation independence assumption: the observation at any time depends only on the state of the Markov chain at that time and is independent of all other observations and states:

$$P(o_t \mid i_T, o_T, \ldots, i_{t+1}, o_{t+1}, i_t, i_{t-1}, o_{t-1}, \ldots, i_1, o_1) = P(o_t \mid i_t)$$

Combining the emission and transition probabilities gives the probability of the whole sentence:

$$P(O, I) = \pi_{i_1}\, b_{i_1}(o_1) \prod_{t=2}^{T} a_{i_{t-1} i_t}\, b_{i_t}(o_t)$$
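To make the formula concrete, here is a toy calculation with made-up numbers (a hypothetical two-tag, two-word setting; none of these values come from the article):

# Toy inventory: tags 0 = 'O', 1 = 'B-TITLE'; words 0 = '任', 1 = '科'
pi = [0.8, 0.2]                    # initial tag probabilities
A = [[0.7, 0.3], [0.6, 0.4]]       # A[i][j]: P(next tag is j | current tag is i)
B = [[0.9, 0.1], [0.2, 0.8]]       # B[i][k]: P(word k | tag i)

# P(O, I) for the word sequence (任, 科) with tag sequence (O, B-TITLE):
p = pi[0] * B[0][0] * A[0][1] * B[1][1]
print(p)  # 0.8 * 0.9 * 0.3 * 0.8 = 0.1728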
1.2 Model Implementation
import logging
import time

import torch
from tqdm import tqdm

logger = logging.getLogger(__name__)  # logger for progress messages


class Model(object):
    def __init__(self, hidden_status):
        # param hidden_status: int, number of hidden states
        self.hmm_N = hidden_status
        # State transition probability matrix: A[i][j] is the probability of moving from state i to state j
        self.hmm_A = torch.zeros(self.hmm_N, self.hmm_N)
        # Initial state probabilities: Pi[i] is the probability of being in state i at the first time step
        self.hmm_pi = torch.zeros(self.hmm_N)

    def _build_corpus_map(self, sentences_list):
        char2id = {}
        for sentence in sentences_list:
            for word in sentence:
                if word not in char2id:
                    char2id[word] = len(char2id)
        return char2id

    def _init_emission(self):
        self.hmm_M = len(self.word2id)
        # Observation probability matrix: B[i][j] is the probability of state i emitting observation j
        self.hmm_B = torch.zeros(self.hmm_N, self.hmm_M)

    def train(self, sentences_list, tags_list):
        """
        Args:
            sentences_list: list whose elements are lists of characters, e.g. ['擔(dān)','任','科','員']
            tags_list: list whose elements are the corresponding tag lists, e.g. ['O','O','B-TITLE', 'E-TITLE']
        """
        start_time = time.time()
        assert len(sentences_list) == len(tags_list), "the lens of tag_lists is not eq to word_lists"
        logger.info('Building token dictionaries...')
        self.word2id = self._build_corpus_map(sentences_list)
        self.tag2id = self._build_corpus_map(tags_list)
        self.id2tag = dict((id_, tag) for tag, id_ in self.tag2id.items())
        logger.info('Total training sentences: {}'.format(len(sentences_list)))
        logger.info('Vocabulary size: {}'.format(len(self.word2id)))
        logger.info('Total tags: {}'.format(len(self.tag2id)))
        assert self.hmm_N == len(self.tag2id), "hidden_status is {}, but total tag is {}".\
            format(self.hmm_N, len(self.tag2id))
        self._init_emission()
        logger.info('Dictionaries built in {:>.4f}s'.format(time.time()-start_time))
        logger.info('Estimating the transition probability matrix...')
        # Estimate the transition probability matrix
        for tags in tqdm(tags_list):
            seq_len = len(tags)
            for i in range(seq_len - 1):
                current_tagid = self.tag2id[tags[i]]
                next_tagid = self.tag2id[tags[i+1]]
                self.hmm_A[current_tagid][next_tagid] += 1.
        # Problem: if a transition never occurs, its cell stays 0, which is not allowed in the later (log) computations
        # Solution: replace zero probabilities with a very small number
        self.hmm_A[self.hmm_A == 0.] = 1e-10
        self.hmm_A = self.hmm_A / self.hmm_A.sum(dim=1, keepdim=True)
        logger.info('Transition probability matrix done. {:>.4f}s'.format(time.time() - start_time))
        logger.info('Estimating the emission probability matrix...')
        # Estimate the observation (emission) probability matrix
        for tags, sentence in tqdm(zip(tags_list, sentences_list)):
            assert len(tags) == len(sentence), \
                "the lens of tag_list is not eq to word_list"
            for tag, word in zip(tags, sentence):
                tag_id = self.tag2id[tag]
                word_id = self.word2id[word]
                self.hmm_B[tag_id][word_id] += 1.
        self.hmm_B[self.hmm_B == 0.] = 1e-10
        self.hmm_B = self.hmm_B / self.hmm_B.sum(dim=1, keepdim=True)
        logger.info('Emission probability matrix done. {:>.4f}s'.format(time.time() - start_time))
        logger.info('Estimating the initial state probabilities...')
        # Estimate the initial state probabilities
        for tags in tqdm(tags_list):
            init_tagid = self.tag2id[tags[0]]
            self.hmm_pi[init_tagid] += 1.
        self.hmm_pi[self.hmm_pi == 0.] = 1e-10
        self.hmm_pi = self.hmm_pi / self.hmm_pi.sum()
        logger.info('Initial state probabilities done. {:>.4f}s'.format(time.time() - start_time))

    def predict(self, sentences_list):
        pred_tag_lists = []
        for sentence in tqdm(sentences_list):
            pred_tag_list = self.decoding(sentence)
            pred_tag_lists.append(pred_tag_list)
        return pred_tag_lists

    def decoding(self, word_list):
        """
        Use the Viterbi algorithm to find the state sequence for a given observation sequence;
        here that means finding the tag sequence for a sequence of characters.
        Viterbi is dynamic programming applied to the HMM prediction problem: it finds the
        maximum-probability (optimal) path, where one path corresponds to one state sequence.
        """
        A = torch.log(self.hmm_A)
        B = torch.log(self.hmm_B)
        Pi = torch.log(self.hmm_pi)
        # Initialize the Viterbi matrix; its shape is [number of states, sequence length]
        seq_len = len(word_list)
        viterbi = torch.zeros(self.hmm_N, seq_len)
        # During decoding, backpointer is used to backtrack and recover the optimal path
        backpointer = torch.zeros(self.hmm_N, seq_len).long()
        start_wordid = self.word2id.get(word_list[0], None)
        Bt = B.t()
        if start_wordid is None:
            # If the character is not in the dictionary, assume a uniform distribution over states
            bt = torch.log(torch.ones(self.hmm_N) / self.hmm_N)
        else:
            bt = Bt[start_wordid]
        viterbi[:, 0] = Pi + bt
        backpointer[:, 0] = -1
        for step in range(1, seq_len):
            wordid = self.word2id.get(word_list[step], None)
            # Handle characters that are not in the dictionary
            # bt is the distribution over states when the character at time t is wordid
            if wordid is None:
                # If the character is not in the dictionary, assume a uniform distribution over states
                bt = torch.log(torch.ones(self.hmm_N) / self.hmm_N)
            else:
                bt = Bt[wordid]  # otherwise take bt from the emission probability matrix
            for tag_id in range(len(self.tag2id)):
                max_prob, max_id = torch.max(
                    viterbi[:, step - 1] + A[:, tag_id],
                    dim=0
                )
                viterbi[tag_id, step] = max_prob + bt[tag_id]
                backpointer[tag_id, step] = max_id
        # Termination: the maximum in viterbi[:, seq_len - 1] is the probability of the optimal path
        best_path_prob, best_path_pointer = torch.max(
            viterbi[:, seq_len - 1], dim=0
        )
        # Backtrack to recover the optimal path
        best_path_pointer = best_path_pointer.item()
        best_path = [best_path_pointer]
        for back_step in range(seq_len - 1, 0, -1):
            best_path_pointer = backpointer[best_path_pointer, back_step]
            best_path_pointer = best_path_pointer.item()
            best_path.append(best_path_pointer)
        # Convert the sequence of tag ids back to tags
        assert len(best_path) == len(word_list)
        tag_list = [self.id2tag[id_] for id_ in reversed(best_path)]
        return tag_list
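A minimal usage sketch of the class above (the toy sentences, tags, and expected output are made up for illustration):

train_sents = [['擔(dān)', '任', '科', '員'], ['兼', '任', '科', '長(cháng)']]
train_tags = [['O', 'O', 'B-TITLE', 'E-TITLE'], ['O', 'O', 'B-TITLE', 'E-TITLE']]

hmm = Model(hidden_status=3)                     # three distinct tags: O, B-TITLE, E-TITLE
hmm.train(train_sents, train_tags)
print(hmm.predict([['擔(dān)', '任', '科', '長(cháng)']]))  # e.g. [['O', 'O', 'B-TITLE', 'E-TITLE']]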
2. CRF
2.1 Model Principle
Compared with HMM, CRF has two advantages:
- CRF can exploit not only the dependency between adjacent words, but also a wider context window
- CRF makes it easy to incorporate multiple features
Let $x$ be the observation sequence, $y$ the state sequence, and $\theta$ the parameters of the CRF model. The conditional probability is

$$P_\theta(y \mid x) = \frac{1}{Z(x)} \exp\Big(\sum_{i}\sum_{k} \theta_k f_k(y_{i-1}, y_i, x, i)\Big)$$

where $\{f_k\}$ is the set of CRF feature functions and $Z(x)$ is the normalization factor. Adding a regularization term and taking the logarithm gives the training objective

$$L(\theta) = \sum_{j} \log P_\theta(y^{(j)} \mid x^{(j)}) - \frac{\lambda}{2}\,\lVert\theta\rVert^2$$

The goal of CRF training is to find the $\theta$ that maximizes $L(\theta)$.
2.2 Model Implementation (using the third-party TorchCRF library)
$ pip install TorchCRF
import torch
from TorchCRF import CRF
device = "cuda"
batch_size = 2
sequence_size = 3
num_labels = 5
mask = torch.ByteTensor([[1, 1, 1], [1, 1, 0]]).to(device)  # (batch_size, sequence_size)
labels = torch.LongTensor([[0, 2, 3], [1, 4, 1]]).to(device) # (batch_size, sequence_size)
hidden = torch.randn((batch_size, sequence_size, num_labels), requires_grad=True).to(device)
crf = CRF(num_labels)
# Computing the log-likelihood (used as the forward function)
crf.forward(hidden, labels, mask)
>>> tensor([-7.6204, -3.6124], device='cuda:0', grad_fn=<ThSubBackward>)
crf.viterbi_decode(hidden, mask)
>>> [[0, 2, 2], [4, 0]]
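For training, the log-likelihood returned above is turned into a loss; a minimal sketch under the same setup (the loss construction is an illustrative addition, not from the article):

# forward() returns one log-likelihood per batch element;
# the training loss is its negative mean.
log_likelihood = crf.forward(hidden, labels, mask)
loss = -log_likelihood.mean()
loss.backward()  # gradients flow back into `hidden` (in a real model, into the encoder that produced it)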
3. BiLSTM+CRF
3.1 Model Principle
The BiLSTM-CRF model used for NER consists of an Embedding layer (mainly word vectors, character vectors, and possibly some extra features), a bidirectional LSTM layer, and a final CRF layer. Experimental results show that BiLSTM-CRF has reached or surpassed CRF models built on rich hand-crafted features, making it the most mainstream deep-learning-based NER model today. On the feature side, it inherits the advantage of deep learning methods: no feature engineering is needed, and word vectors plus character vectors already give good results; high-quality dictionary features, when available, can improve it further.
3.2 Model Implementation
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.tagset_size = config.num_label + 2
        self.hidden_dim = config.hidden_size
        self.start_tag_id = config.num_label
        self.end_tag_id = config.num_label + 1
        self.device = config.device
        self.embedding = nn.Embedding(config.vocab_size, config.emb_size, padding_idx=config.vocab_size - 1)
        torch.nn.init.uniform_(self.embedding.weight, -0.10, 0.10)
        self.encoder = nn.LSTM(config.emb_size, config.hidden_size, batch_first=True, bidirectional=True)
        self.decoder = nn.LSTM(config.hidden_size * 2, config.hidden_size, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(2 * config.hidden_size, self.tagset_size)
        self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
        self.transitions.data[self.start_tag_id, :] = -10000.
        self.transitions.data[:, self.end_tag_id] = -10000.
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim).to(self.device),
                torch.randn(2, 1, self.hidden_dim).to(self.device))

    def _get_lstm_features(self, input_ids):
        embeds = self.embedding(input_ids).view(1, input_ids.shape[1], -1)
        self.encoder.flatten_parameters()
        self.decoder.flatten_parameters()
        encoder_out, _ = self.encoder(embeds, self.hidden)
        decoder_out, _ = self.decoder(encoder_out, self.hidden)
        decoder_out = decoder_out.view(input_ids.shape[1], -1)
        lstm_logits = self.linear(decoder_out)
        return lstm_logits

    def log_sum_exp(self, smat):
        # maximum of each column
        vmax = smat.max(dim=0, keepdim=True).values
        return torch.log(torch.sum(torch.exp(smat - vmax), dim=0, keepdim=True)) + vmax

    def _forward_alg(self, feats):
        alphas = torch.full((1, self.tagset_size), -10000.).to(self.device)
        # Initialize the score distribution: START_TAG is log(1) = 0, everything else is a very small value (-10000)
        alphas[0][self.start_tag_id] = 0.
        # Iterate through the sentence
        for feat in feats:
            # The three terms inside log_sum_exp() broadcast: current score distribution (column vector)
            # + emission scores (row vector) + transition matrix (square matrix);
            # taking log_sum_exp over each column then yields a row vector
            alphas = self.log_sum_exp(alphas.T + self.transitions + feat.unsqueeze(0))
        # Finally transition to the END tag: the emission score is 0 and the transition score is the column vector self.transitions[:, self.end_tag_id]
        score = self.log_sum_exp(alphas.T + 0 + self.transitions[:, self.end_tag_id].view(-1, 1))
        return score.flatten()

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(self.device)
        tags = torch.cat([torch.tensor([self.start_tag_id], dtype=torch.long).to(self.device), tags])
        for i, feat in enumerate(feats):
            # emission score of the gold tag at each step plus the transition score between consecutive gold tags
            score += self.transitions[tags[i], tags[i + 1]] + feat[tags[i + 1]]
        # add the transition to END_TAG
        score += self.transitions[tags[-1], self.end_tag_id]
        return score

    def _viterbi_decode(self, feats):
        # backtrace[i][j] := among all paths reaching state j at frame i, the state at frame i-1 of the highest-scoring one
        backtrace = []
        alpha = torch.full((1, self.tagset_size), -10000.).to(self.device)
        alpha[0][self.start_tag_id] = 0
        for frame in feats:
            smat = alpha.T + frame.unsqueeze(0) + self.transitions
            backtrace.append(smat.argmax(0))  # best "source" state for each state at the current frame
            alpha = smat.max(dim=0, keepdim=True).values
        # Transition to STOP_TAG
        smat = alpha.T + 0 + self.transitions[:, self.end_tag_id].view(-1, 1)
        best_tag_id = smat.flatten().argmax().item()
        best_score = smat.max(dim=0, keepdim=True).values.item()
        best_path = [best_tag_id]
        for bptrs_t in reversed(backtrace[1:]):  # start from [1:] to drop the leading START_TAG frame
            best_tag_id = bptrs_t[best_tag_id].item()
            best_path.append(best_tag_id)
        best_path.reverse()
        return best_score, best_path  # return the score of the best path and the best path itself

    def forward(self, sentence_ids, tags_ids):
        tags_ids = tags_ids.view(-1)
        feats = self._get_lstm_features(sentence_ids)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags_ids)
        outputs = (forward_score - gold_score,)
        _, tag_seq = self._viterbi_decode(feats)
        outputs = (tag_seq,) + outputs
        return outputs

    def predict(self, sentence_ids):
        lstm_feats = self._get_lstm_features(sentence_ids)
        _, tag_seq = self._viterbi_decode(lstm_feats)
        return tag_seq
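A minimal usage sketch; the config fields mirror the attributes read in __init__ above, but the concrete values and the SimpleNamespace wrapper are illustrative assumptions:

from types import SimpleNamespace

config = SimpleNamespace(num_label=4, hidden_size=128, emb_size=100,
                         vocab_size=3000, device=torch.device('cpu'))
model = Model(config)

sentence_ids = torch.randint(0, config.vocab_size - 1, (1, 6))  # one sentence of 6 characters
tags_ids = torch.randint(0, config.num_label, (1, 6))           # its gold tag ids

tag_seq, loss = model(sentence_ids, tags_ids)  # predicted tag ids and the CRF negative log-likelihood
print(tag_seq, loss.item())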
4. IDCNN+CRF
4.1 Model Principle
A normal CNN filter acts on a contiguous patch of the input matrix and slides over it to compute convolutions. A dilated CNN adds a dilation width to the filter: when applied to the input, the filter skips the input positions that fall inside the dilation width, while its own size stays unchanged, so it covers a wider stretch of the input matrix, as if it had been dilated. In practice the dilation width grows exponentially with the layer depth, so the number of parameters grows only linearly with depth while the receptive field grows exponentially and quickly covers the entire input.
As the figure shows, the receptive field expands at an exponential rate. The original receptive field is the 1x1 region at the center point:
- Expanding outward from the original receptive field with stride 1 adds eight 1x1 regions, giving a new 3x3 receptive field
- After an expansion with stride 2, the previous 3x3 receptive field grows to 7x7
- After an expansion with stride 4, the 7x7 receptive field grows to 15x15. The parameters of each layer are independent of one another; the receptive field grows exponentially while the parameter count grows only linearly
Applied to text, the input is a one-dimensional sequence in which each element is a character embedding.
IDCNN produces logits for every character of the input sentence, exactly like the logits produced by the BiLSTM model. A CRF layer is then added on top and the Viterbi algorithm decodes the tag sequence. Attaching a CRF layer to the end of a network such as BiLSTM or IDCNN is a very common recipe for sequence labeling: BiLSTM or IDCNN computes the per-token label probabilities, while the CRF layer introduces the transition probabilities between labels, and the resulting loss is fed back into the network.
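A minimal sketch of the dilated-convolution idea with plain PyTorch Conv1d layers (the channel sizes and kernel size are illustrative assumptions, not the article's IDCNN implementation):

import torch
import torch.nn as nn

class DilatedBlock(nn.Module):
    """Three stacked 1-D convolutions with dilations 1, 2, 4.

    With kernel size 3, the receptive field grows 3 -> 7 -> 15,
    while the parameter count only grows linearly with depth.
    """
    def __init__(self, channels=128):
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=3, dilation=d, padding=d)
            for d in (1, 2, 4)
        ])

    def forward(self, x):          # x: (batch, channels, seq_len), e.g. character embeddings
        for conv in self.convs:
            x = torch.relu(conv(x))
        return x                   # same length; each position now "sees" 15 neighbouring positions

block = DilatedBlock(channels=128)
emb = torch.randn(2, 128, 20)      # a batch of 2 sentences, 20 characters each
print(block(emb).shape)            # torch.Size([2, 128, 20])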
4.2 Model Implementation
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.tagset_size = config.num_label + 2
        self.hidden_dim = config.hidden_size
        self.start_tag_id = config.num_label
        self.end_tag_id = config.num_label + 1
        self.device = config.device
        self.embedding = nn.Embedding(config.vocab_size, config.emb_size, padding_idx=config.vocab_size - 1)
        torch.nn.init.uniform_(self.embedding.weight, -0.10, 0.10)
        self.encoder = nn.LSTM(config.emb_size, config.hidden_size, batch_first=True, bidirectional=True)
        self.decoder = nn.LSTM(config.hidden_size * 2, config.hidden_size, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(2 * config.hidden_size, self.tagset_size)
        self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
        self.transitions.data[self.start_tag_id, :] = -10000.
        self.transitions.data[:, self.end_tag_id] = -10000.
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim).to(self.device),
                torch.randn(2, 1, self.hidden_dim).to(self.device))

    def _get_lstm_features(self, input_ids):
        embeds = self.embedding(input_ids).view(1, input_ids.shape[1], -1)
        self.encoder.flatten_parameters()
        self.decoder.flatten_parameters()
        encoder_out, _ = self.encoder(embeds, self.hidden)
        decoder_out, _ = self.decoder(encoder_out, self.hidden)
        decoder_out = decoder_out.view(input_ids.shape[1], -1)
        lstm_logits = self.linear(decoder_out)
        return lstm_logits

    def log_sum_exp(self, smat):
        # maximum of each column
        vmax = smat.max(dim=0, keepdim=True).values
        # return (smat - vmax).exp().sum(dim=0, keepdim=True).log() + vmax
        return torch.log(torch.sum(torch.exp(smat - vmax), dim=0, keepdim=True)) + vmax

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        alphas = torch.full((1, self.tagset_size), -10000.).to(self.device)
        # Initialize the score distribution: START_TAG is log(1) = 0, everything else is a very small value (-10000)
        alphas[0][self.start_tag_id] = 0.
        # Iterate through the sentence
        for feat in feats:
            # The three terms inside log_sum_exp() broadcast: current score distribution (column vector)
            # + emission scores (row vector) + transition matrix (square matrix);
            # taking log_sum_exp over each column then yields a row vector
            alphas = self.log_sum_exp(alphas.T + self.transitions + feat.unsqueeze(0))
        # Finally transition to the END tag: the emission score is 0 and the transition score is the column vector self.transitions[:, self.end_tag_id]
        score = self.log_sum_exp(alphas.T + 0 + self.transitions[:, self.end_tag_id].view(-1, 1))
        return score.flatten()

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(self.device)
        tags = torch.cat([torch.tensor([self.start_tag_id], dtype=torch.long).to(self.device), tags])
        for i, feat in enumerate(feats):
            # emission score of the gold tag at each step plus the transition score between consecutive gold tags
            score += self.transitions[tags[i], tags[i + 1]] + feat[tags[i + 1]]
        # add the transition to END_TAG
        score += self.transitions[tags[-1], self.end_tag_id]
        return score

    def _viterbi_decode(self, feats):
        # backtrace[i][j] := among all paths reaching state j at frame i, the state at frame i-1 of the highest-scoring one
        backtrace = []
        alpha = torch.full((1, self.tagset_size), -10000.).to(self.device)
        alpha[0][self.start_tag_id] = 0
        for frame in feats:
            smat = alpha.T + frame.unsqueeze(0) + self.transitions
            backtrace.append(smat.argmax(0))  # best "source" state for each state at the current frame
            alpha = smat.max(dim=0, keepdim=True).values
        # Transition to STOP_TAG
        smat = alpha.T + 0 + self.transitions[:, self.end_tag_id].view(-1, 1)
        best_tag_id = smat.flatten().argmax().item()
        best_score = smat.max(dim=0, keepdim=True).values.item()
        best_path = [best_tag_id]
        for bptrs_t in reversed(backtrace[1:]):  # start from [1:] to drop the leading START_TAG frame
            best_tag_id = bptrs_t[best_tag_id].item()
            best_path.append(best_tag_id)
        best_path.reverse()
        return best_score, best_path  # return the score of the best path and the best path itself

    def forward(self, sentence_ids, tags_ids):
        tags_ids = tags_ids.view(-1)
        feats = self._get_lstm_features(sentence_ids)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags_ids)
        outputs = (forward_score - gold_score,)
        _, tag_seq = self._viterbi_decode(feats)
        outputs = (tag_seq,) + outputs
        return outputs

    def predict(self, sentence_ids):
        lstm_feats = self._get_lstm_features(sentence_ids)
        _, tag_seq = self._viterbi_decode(lstm_feats)
        return tag_seq
5. BERT+CRF
5.1 Model Principle
BERT model + fully connected layer: BERT's encoding vectors are mapped onto the label set by an FC layer; after a Softmax over each token's output vector, every dimension gives the probability that the token carries the corresponding label. A loss can be computed from this and used to train the model. Inspired by the BiLSTM+CRF model, however, a CRF layer is added on top of BERT + FC layer to impose constraints that keep the final predictions valid. These constraints are learned automatically by the CRF layer from the training data, which reduces the probability of invalid predictions.
5.2 Model Implementation
import torch.nn as nn
from torchcrf import CRF  # assuming the pytorch-crf package, which provides CRF(num_tags=..., batch_first=...)
from transformers import BertModel, BertPreTrainedModel


class BertCrfForNer(BertPreTrainedModel):
    def __init__(self, config):
        super(BertCrfForNer, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        self.init_weights()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        outputs = (logits,)
        if labels is not None:
            loss = self.crf(emissions=logits, tags=labels, mask=attention_mask)
            outputs = (-1 * loss,) + outputs
        return outputs  # (loss), scores
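A minimal usage sketch; the checkpoint name, tokenizer call, and dummy labels are illustrative assumptions built on the standard transformers API, not part of the article:

import torch
from transformers import BertConfig, BertTokenizer

config = BertConfig.from_pretrained('bert-base-chinese', num_labels=9)  # e.g. 9 BIO tags
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertCrfForNer.from_pretrained('bert-base-chinese', config=config)

encoding = tokenizer('擔(dān)任科員', return_tensors='pt')
labels = torch.zeros_like(encoding['input_ids'])   # dummy all-'O' labels, just to exercise the loss
loss, logits = model(input_ids=encoding['input_ids'],
                     token_type_ids=encoding['token_type_ids'],
                     attention_mask=encoding['attention_mask'],
                     labels=labels)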
6. BERT+BiLSTM+CRF
6.1 Model Principle
The strength of BiLSTM+CRF is good generalization; its weakness is that it needs a large amount of labeled data, and with very few samples the results are poor. To build an entity extractor more quickly and make the system easier to use, transfer learning can be applied: train on top of prior knowledge, i.e. use BERT+BiLSTM+CRF.
As before, the input is the token ids produced by the WordPiece tokenizer. They are fed into the pretrained BERT model, which extracts rich text features; its output vectors then pass through a BiLSTM that extracts the features needed for entity recognition, and the resulting vectors finally enter the CRF layer, which decodes them into the optimal tag sequence.
6.2 Model Implementation
import torch
import torch.nn as nn
from transformers import BertConfig, BertModel


class BertBiLstmCrf(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, out_size, drop_out=0.1, use_pretrained_w2v=False):
        super(BertBiLstmCrf, self).__init__()
        # get_chinese_wwm_ext_pytorch_path() is a project-local helper returning the path of the chinese-wwm-ext weights
        self.bert_path = get_chinese_wwm_ext_pytorch_path()
        self.bert_config = BertConfig.from_pretrained(self.bert_path)
        self.bert = BertModel.from_pretrained(self.bert_path)
        emb_size = 768
        for param in self.bert.parameters():
            param.requires_grad = True
        self.bilstm = nn.LSTM(emb_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, out_size)
        self.dropout = nn.Dropout(drop_out)
        self.transition = nn.Parameter(torch.ones(out_size, out_size) * 1 / out_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, x, lengths):
        emb = self.bert(x)[0]
        emb = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True)
        emb, _ = self.bilstm(emb)
        output, _ = nn.utils.rnn.pad_packed_sequence(emb, batch_first=True, padding_value=0., total_length=x.shape[1])
        output = self.dropout(output)
        emission = self.fc(output)
        batch_size, max_len, out_size = emission.size()
        crf_scores = emission.unsqueeze(2).expand(-1, -1, out_size, -1) + self.transition.unsqueeze(0)
        return crf_scores
I am new to NLP; comments and discussion are welcome. Let's learn from each other and grow together~~