s代表句子吱雏,w代表句子中的單詞,z代表單詞的詞性筛峭。推導(dǎo)式中用了一些基本的概率公式铐刘,有些地方簡(jiǎn)寫了。對(duì)于維特比與HMM的概念就不再贅述影晓。
python代碼主要分為三步來實(shí)現(xiàn):
1镰吵、對(duì)數(shù)據(jù)進(jìn)行處理;
2挂签、計(jì)算HMM三元組的PI疤祭、A、B饵婆;
3勺馆、維特比算法的實(shí)現(xiàn)。
數(shù)據(jù)集可聯(lián)系私發(fā),代碼如下:
#整理數(shù)據(jù)集
tag2id,id2tag = {},{}
word2id,id2word = {},{}
res = []
for line in open('dataset.txt','r', encoding='UTF-8'):
temp = line.rstrip().split("/")
if len(temp)==2:
word, tag = temp[0], temp[1]
if word not in word2id:
word2id[word] = len(word2id)
id2word[len(id2word)] = word
if tag not in tag2id:
tag2id[tag] = len(tag2id)
id2tag[len(id2tag)] = tag
M = len(word2id)
N = len(tag2id)
#計(jì)算PI 草穆、A灌灾、B
import numpy as np
PI = np.zeros(N) #初始矩陣;每個(gè)詞性出現(xiàn)在句首的概率
A = np.zeros((N,M)) #發(fā)射矩陣悲柱;A[i][j]:給定tag[i] 出現(xiàn)單詞j的概率
B = np.zeros((N,N)) #狀態(tài)轉(zhuǎn)移矩陣紧卒;B[i][j]:從前一狀態(tài)i轉(zhuǎn)移到當(dāng)前狀態(tài)j的概率
pre_tag = ""
for line in open("dataset.txt","r",encoding='UTF-8'):
items = line.rstrip().split("/")
wordid,tagid= word2id[items[0]],tag2id[items[1]]
if pre_tag=="": #意味著句子的開始
PI[tagid] += 1 #先計(jì)算出現(xiàn)的次數(shù)
A[tagid][wordid] += 1
else: #如果不是句子的開頭
A[tagid][wordid] += 1
B[tag2id[pre_tag]][tagid] += 1
if items[0] in ["。","诗祸!","跑芳?"]: #下一個(gè)單詞是新句子的開始
pre_tag = ""
else:
pre_tag = items[1]
# 把頻次轉(zhuǎn)成概率
PI = PI/sum(PI)
for i in range(N):
A[i] /= sum(A[i])
B[i] /= sum(B[i]) #到此為止計(jì)算完了各個(gè)參數(shù)
#計(jì)算log值,防止出現(xiàn)log0
def log(ele):
if ele == 0:
return np.log(ele + 0.000001)
return np.log(ele)
#利用前向最大匹配進(jìn)行分詞
def forwardMaxMatch(strng):
dict = [x for x in word2id.keys()]
result = []
while len(strng):
maxlength = 5 if len(strng)>=5 else len(strng)
sub = strng[:maxlength]
while sub not in dict and len(sub)>1:
sub = sub[:len(sub)-1]
result.append(sub)
strng = strng[len(sub):]
return "/".join(result)
#重頭戲:利用維特比進(jìn)行詞性標(biāo)注
def vertebi(x):
x = forwardMaxMatch(x).split("/")
res = [word2id[word] for word in x]
T = len(res)
ptr = np.zeros((T,N),dtype=int) #定義一個(gè)維特比籬笆網(wǎng)直颅,存放下標(biāo)博个,故指定為整型
dp = np.zeros((T,N))
for j in range(N): #從前往后計(jì)算,所以要首先填充第一列
dp[0][j] = log(PI[j])+log(A[j][res[0]]) #第一個(gè)tag是PI[j]的概率 + 詞性是A[j]的情況下是單詞x[0]的概率
for i in range(1,T): #每個(gè)單詞
for j in range(N): #每個(gè)詞性
""""
dp[i][j],ptr[i][j] = max(dp[i-1][k] + log(B[k][j]) + log(A[j][res[i]],k) for k in range(N))
這一行可代替下面六行
"""
dp[i][j] = -9999999999
for k in range(N): #從前一項(xiàng)的每一個(gè)k可達(dá)到當(dāng)前的詞性j
scores = dp[i-1][k] + log(B[k][j]) + log(A[j][res[i]]) #前一列的第k個(gè)轉(zhuǎn)移過來的概率+狀態(tài)轉(zhuǎn)移概率(k->j)+詞性為A[j]時(shí)單詞正好為x[i]的概率
if scores > dp[i][j]:
dp[i][j] = scores
ptr[i][j] = k #填充籬笆網(wǎng)絡(luò) 當(dāng)前最好路徑是從k過來的
# 把最好的tag sequence打印出來(從后到前)
best_seq= [0]*T #找出籬笆網(wǎng)絡(luò)最后一列中最大的值得下標(biāo)
best_seq[T-1] = np.argmax(dp[T-1])
for i in range(T-2,-1,-1): #從后往前其次求出每個(gè)單詞的詞性
best_seq[i]=ptr[i+1][best_seq[i+1]]
result = []
for i in range(len(best_seq)):
result.append(x[i]+'/'+id2tag[best_seq[i]]+" ")
return "".join(result)
sentence = input("請(qǐng)輸入您想要進(jìn)行詞性標(biāo)注的句子:")
print(vertebi(sentence))