評估方法:
人工從文章中提取1-5個關鍵詞,和機器提取的關鍵詞做比較
召回 = |機器提詞 ∩ 人工提詞| / |人工提詞|
準確 = |機器提詞 ∩ 人工提詞| / |機器提詞|
TF-IDF
原理參考:http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
實現參考:tf-idf-keyword
其他參考:使用不同的方法計算TF-IDF值
第一版 標題和正文加權計算tf-idf
主要策略
- 使用nlpc切詞服務(可用jieba切詞代替)+TF-IDF提取關鍵詞。
- 去除停用詞
- 按照體裁+年級分成若干類型,來訓練模型,示例用高中+敘事類,取了20000條數據訓練
- 對標題進行加權,標題的每個詞彙頻率+6,再合一起計算tf-idf
- 按照權重取前4個關鍵詞,在這4個關鍵詞中對於權重小於 頻率(5)*平均IDF/總詞數 的進行過濾
注:以上數據均為調節後最優解
代碼實現
config.py
# Logging setup shared by the keyword-extraction scripts.
program = 'composition_term_weight'

# basicConfig is asked to write "<timestamp>: <LEVEL>: <message>" to stderr.
_LOG_FORMAT = '%(asctime)s: %(levelname)s: %(message)s'
_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S'
logging.basicConfig(stream=sys.stderr, format=_LOG_FORMAT, datefmt=_DATE_FORMAT)

# Set the root level explicitly: basicConfig does nothing when the root
# logger already has handlers, but the INFO threshold must apply regardless.
logging.getLogger().setLevel(logging.INFO)

# Named logger used by the rest of the program.
logger = logging.getLogger(program)
IDFLoader.py
class IDFLoader(object):
    """Load a word -> IDF table from a text file.

    Each useful line of the file has the form ``<word> <value>`` with a single
    space between the two fields.  A line whose word is ``LEN_AVG`` carries the
    average document length and ends the table: nothing after it is read.

    Attributes:
        idf_path: path of the IDF file given to the constructor.
        idf_freq: dict mapping each word to its IDF value.
        mean_len: average document length from the ``LEN_AVG`` line
            (0 when the file has no usable ``LEN_AVG`` line).
        mean_idf: arithmetic mean of the loaded IDF values
            (0.0 when no word was loaded).
    """

    def __init__(self, idf_path):
        """Remember ``idf_path`` and load the table from it immediately."""
        self.idf_path = idf_path
        self.idf_freq = {}
        self.mean_len = 0
        self.mean_idf = 0.0
        self.load_idf()

    def load_idf(self):
        """Read ``self.idf_path`` and fill ``idf_freq``, ``mean_len``, ``mean_idf``.

        Lines that do not consist of exactly two space-separated fields, or
        whose value is not numeric, are skipped.

        Raises:
            IOError/OSError: if the file cannot be opened.
        """
        with open(self.idf_path, 'rb') as f:
            for raw in f:
                fields = raw.strip().decode('utf-8', errors='ignore').split(' ')
                if len(fields) != 2:
                    continue
                word, value = fields
                try:
                    if word == 'LEN_AVG':
                        # Go through float() so that "12.0" is accepted as
                        # well as "12"; the stored length stays an int.
                        self.mean_len = int(float(value))
                        break
                    self.idf_freq[word] = float(value)
                except ValueError:
                    # Non-numeric value: ignore this line, keep loading.
                    continue
        cnt = len(self.idf_freq)
        # An empty table keeps the 0.0 default instead of dividing by zero.
        self.mean_idf = sum(self.idf_freq.values()) / cnt if cnt else 0.0
        logger.info('Vocabularies %s loaded: %d mean_idf: %f',
                    self.idf_path, cnt, self.mean_idf)
class TfIdf(object):
    """TF-IDF keyword extraction with extra weight for title words.

    Text is segmented sentence by sentence.  Segmentation is done by
    ``self.cutter`` when one is set, otherwise by the module-level ``cut``
    function (an external word-segmentation service that is not defined in
    this class).  NOTE(review): ``cut`` is assumed to accept a text string and
    return an iterable of tokens or None -- confirm against the service.
    """

    # Strips ASCII letters and digits from text before segmentation.
    p_cut = re.compile(r'[a-zA-Z0-9]', re.VERBOSE)
    # Strips boilerplate from titles: "作文", "<digits>字", "<any char>年級", "_".
    p_title = re.compile(u'作文|\\d+字|.年級|_', re.VERBOSE)
    # Punctuation and very common tokens that are never kept as keywords
    # (they could equally live in the stop-word file).
    ignored = [u'', u' ', u'', u'。', u':', u',', u')', u'(', u'!', u'?',
               u'”', u'“', u'"', u'―', u'.', u'說', u'好', u'時']
    # Minimum number of occurrences used to derive the weight threshold
    # in get_term_weight.
    min_times = 5.0
    # Amount added to a word's count for each of its occurrences in the title.
    title_add_times = 6.0
    # Number of top-ranked words considered as keywords.
    words_num = 4
    # Optional segmentation callable; None means "use the module-level cut".
    # Set it through the constructor or on an instance.
    cutter = None

    def __init__(self, stop_words_path='stop_words.utf8.txt', cutter=None):
        """Load the stop-word list.

        Args:
            stop_words_path: UTF-8 file with one stop word per line.
            cutter: optional callable ``text -> iterable of tokens (or None)``
                used instead of the module-level ``cut``.
        """
        if cutter is not None:
            self.cutter = cutter
        # A set of text strings: O(1) membership tests, and the same type as
        # the tokens they are compared with.
        self.stop_words_dict = set()
        with open(stop_words_path, 'rb') as fr:
            for raw in fr:
                word = raw.strip().decode('utf-8', errors='ignore')
                if word:
                    self.stop_words_dict.add(word)

    @staticmethod
    def _to_text(value):
        """Return ``value`` as a text string (bytes are decoded as UTF-8)."""
        if value is None:
            return u''
        if isinstance(value, bytes):
            return value.decode('utf-8', errors='ignore')
        text_type = type(u'')
        return value if isinstance(value, text_type) else text_type(value)

    def my_cut(self, inTxt):
        """Segment ``inTxt`` and return the tokens that survive filtering.

        ASCII letters and digits are removed first.  The text is then split on
        the full stop "。" and segmented one sentence at a time (the original
        author did this for performance).  Tokens that are blank, listed in
        the stop words or listed in ``ignored`` are dropped.
        """
        text = self.p_cut.sub(u'', self._to_text(inTxt))
        segment = self.cutter if self.cutter is not None else cut
        words = []
        for sentence in text.split(u'。'):
            if not sentence.strip():
                # Nothing to segment; skip the segmentation call.
                continue
            pieces = segment(sentence)
            if pieces is not None:
                words.extend(self._to_text(piece) for piece in pieces)
        return [w for w in words
                if w.strip()
                and w not in self.stop_words_dict
                and w not in self.ignored]

    def get_tfidf(self, idf_loader, title, content):
        """Rank the words of one document by TF-IDF.

        Args:
            idf_loader: object exposing ``idf_freq`` (word -> IDF) and
                ``mean_idf`` (used for words missing from ``idf_freq``).
            title: document title; boilerplate matched by ``p_title`` is
                removed before segmentation.
            content: document body.

        Returns:
            A tuple ``(ranking, token_count, title_words)`` where ``ranking``
            is a list of ``(word, weight)`` pairs sorted by decreasing weight,
            ``token_count`` is the number of title plus body tokens, and
            ``title_words`` is the list of tokens found in the title.
        """
        title_words = self.my_cut(self.p_title.sub(u'', self._to_text(title)))
        tokens = title_words + self.my_cut(content)
        freq = {}
        for w in tokens:
            freq[w] = freq.get(w, 0.0) + 1.0
        # Boost title words: every occurrence in the title adds
        # title_add_times to the word's count.
        for w in title_words:
            freq[w] += self.title_add_times
        total = sum(freq.values())
        default_idf = idf_loader.mean_idf
        for w in freq:
            freq[w] *= idf_loader.idf_freq.get(w, default_idf) / total
        ranking = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
        return ranking, len(tokens), title_words

    def get_term_weight(self, idf_loader, title, content):
        """Return the keywords of one document as formatted strings.

        The top ``words_num`` words are kept when their weight exceeds
        ``min_times * mean_idf / token_count``.  When none qualifies, the
        words that occur in the title are returned instead.

        Returns:
            A list of ``"<word>:<offset>:<weight>"`` strings.  ``weight`` is
            rounded to 4 decimals; ``offset`` is the total character length of
            the keywords listed before this one.  An empty list is returned
            for a document without tokens.
        """
        result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
        if not words_number:
            # No tokens: nothing to rank, and the threshold is undefined.
            return []
        bound = self.min_times * idf_loader.mean_idf / words_number
        machine_words = [item for item in result[:self.words_num] if item[1] > bound]
        if not machine_words:
            # No word passed the threshold: fall back to the title words.
            in_title = set(title_words)
            machine_words = [item for item in result if item[0] in in_title]
        data = []
        offset = 0
        for word, weight in machine_words:
            data.append('%s:%d:%s' % (word, offset, str(round(weight, 4))))
            offset += len(word)
        return data

    def getCorpus(self, data_path):
        """Read a JSON-lines file and return one token list per document.

        Each non-blank line must be a JSON object; its ``title`` and
        ``@merge_text`` values are joined with "。" and segmented with
        ``my_cut``.  Documents that yield no tokens are left out.
        """
        corpus_list = []
        with open(data_path, 'rb') as f:
            for raw in f:
                if not raw.strip():
                    continue
                info = json.loads(raw.decode('utf-8', errors='ignore'))
                title = self.p_title.sub(u'', self._to_text(info.get('title')))
                body = self._to_text(info.get('@merge_text'))
                tokens = self.my_cut(title + u'。' + body)
                if not tokens:
                    continue
                corpus_list.append(tokens)
                count = len(corpus_list)
                if count % 1000 == 0:
                    logger.info("processd %d segment_sentence", count)
        return corpus_list

    def train(self, dir_name, data_path):
        """Compute IDF values from ``data_path`` and write them to disk.

        The output file is ``data/<dir_name>/idf.txt``: one ``<word> <idf>``
        line per word, with ``idf = log2(document_count / document_frequency)``,
        followed by a final ``LEN_AVG <n>`` line holding the integer average
        number of tokens per document.  Nothing is written when the corpus is
        empty.
        """
        idf_path = 'data/%s/idf.txt' % dir_name
        documents = self.getCorpus(data_path)
        doc_freq = {}
        doc_count = 0
        len_sum = 0
        for doc in documents:
            len_sum += len(doc)
            # Count each word once per document.
            for w in set(doc):
                doc_freq[w] = doc_freq.get(w, 0) + 1
            if doc_count % 1000 == 0:
                logger.info('Documents processed: %s, time: %s',
                            doc_count, datetime.datetime.now())
            doc_count += 1
        del documents
        if not doc_count:
            logger.warning('No documents found in %s; %s not written',
                           data_path, idf_path)
            return
        with open(idf_path, 'wb') as f:
            for word, df in doc_freq.items():
                # float() keeps the ratio fractional on every Python version.
                line = u'%s %s\n' % (word, math.log(float(doc_count) / df, 2))
                f.write(line.encode('utf-8'))
            logger.info('%s %s', doc_count, len_sum)
            # Floor division: the value must stay parseable as an integer.
            f.write(('LEN_AVG %d' % (len_sum // doc_count)).encode('utf-8'))

    def test_one(self, dir_name, method='tfidf'):
        """Score documents read from stdin and print the top 20 words of each.

        Every non-blank stdin line must be a JSON object with ``title`` and
        ``@merge_text`` keys.  ``method`` selects ``get_tfidf`` when equal to
        ``'tfidf'`` and ``get_bm25`` otherwise.
        """
        idf_loader = IDFLoader('data/%s/idf.txt' % dir_name)
        for line in sys.stdin:
            if not line.strip():
                continue
            info = json.loads(self._to_text(line))
            title = info['title']
            content = info['@merge_text']
            if method == 'tfidf':
                result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
            else:
                result, words_number, title_words = self.get_bm25(idf_loader, title, content)
            # A document without tokens has no meaningful threshold; show 0.0.
            if words_number:
                bound = self.min_times * idf_loader.mean_idf / words_number
            else:
                bound = 0.0
            print('_____words_number bound_____')
            print('%s %s' % (words_number, bound))
            print('_____tfidf_result_____')
            for word, weight in result[:20]:
                print('%s %s' % (word, weight))
經調優,最優解為:min_times=5 title_add_times=6.0 words_num=4
結果
人工抽樣評估了100個
TF-IDF召回率:0.2778
TF-IDF準確率:0.2778
BM25
第一版
TfIdf.py 增加方法:
def get_bm25(self, idf_loader, title, content):
    """Rank the words of one document with a BM25-style score.

    Args:
        idf_loader: object exposing ``idf_freq`` (word -> IDF), ``mean_idf``
            and ``mean_len`` (average document length).
        title: document title (text or UTF-8 bytes); boilerplate matched by
            ``self.p_title`` is removed before segmentation.
        content: document body, passed to ``self.my_cut`` unchanged.

    Returns:
        A tuple ``(ranking, token_count, title_words)`` where ``ranking`` is a
        list of ``(word, score)`` pairs sorted by decreasing score,
        ``token_count`` is the number of title plus body tokens, and
        ``title_words`` is the list of tokens found in the title.
    """
    k = 1.2  # caps how fast the score grows with term frequency
    b = 0.75  # how strongly the length ratio L influences the score
    # L is the ratio of this document's length to the average length.
    EPSILON = 0.25  # words missing from the IDF table get mean_idf * EPSILON
    if isinstance(title, bytes):
        title = title.decode('utf-8', errors='ignore')
    title_words = self.my_cut(self.p_title.sub(u'', title))
    tokens = title_words + self.my_cut(content)
    freq = {}
    for w in tokens:
        freq[w] = freq.get(w, 0.0) + 1.0
    # Boost title words: every occurrence in the title adds title_add_times.
    for w in title_words:
        freq[w] += self.title_add_times
    total = sum(freq.values())
    logger.debug('bm25 k=%s b=%s total=%s mean_len=%s',
                 k, b, total, idf_loader.mean_len)
    if idf_loader.mean_len:
        length_ratio = total / idf_loader.mean_len
    else:
        # Average length unknown: treat the document as average-sized
        # instead of dividing by zero.
        length_ratio = 1.0
    # Both values are the same for every word, so compute them once.
    norm = k * (1.0 - b + b * length_ratio)
    unseen_idf = idf_loader.mean_idf * EPSILON
    for w in freq:
        tf = freq[w] / total
        idf = idf_loader.idf_freq.get(w, unseen_idf)
        freq[w] = idf * ((k + 1) * tf) / (norm + tf)
    ranking = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
    return ranking, len(tokens), title_words
經調優,最優解為:min_times=2.5 title_add_times=6.0 words_num=4 k=1.2 b=0.75 EPSILON=0.25
結果
人工抽樣評估了100個
BM25召回率:0.2889
BM25準確率:0.3333