# Embedding layer: word and segmentation information
embedding = self.embedding_layer(self.word_inputs, self.seg_inputs, config)
# LSTM input layer: apply dropout to the embeddings
lstm_inputs = tf.nn.dropout(embedding, self.dropout)
# LSTM output layer
lstm_outputs = self.biLSTM_layer(lstm_inputs, self.lstm_dim, self.lengths)
# projection layer
self.logits = self.project_layer(lstm_outputs)
# loss
self.loss = self.crf_loss_layer(self.logits, self.lengths)
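The placeholders that feed this graph are defined elsewhere in the model class; a minimal sketch of what they typically look like (names, shapes and the padding-id assumption are illustrative, not taken from the original code):

```python
import tensorflow as tf

# Illustrative placeholders; the real model defines these as class attributes.
word_inputs = tf.placeholder(tf.int32, shape=[None, None], name="word_inputs")  # word ids, [batch_size, num_steps]
seg_inputs = tf.placeholder(tf.int32, shape=[None, None], name="seg_inputs")    # segmentation feature ids, same shape
targets = tf.placeholder(tf.int32, shape=[None, None], name="targets")          # gold tag ids, same shape
dropout = tf.placeholder(tf.float32, name="dropout")                            # keep probability passed to tf.nn.dropout
# number of non-padding tokens per sentence (assuming id 0 is the padding symbol)
lengths = tf.reduce_sum(tf.sign(tf.abs(word_inputs)), axis=-1)
```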
tf.nn.embedding_lookup simply looks up, for each id, the corresponding row vector in the embedding data.
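A tiny self-contained example of that lookup (toy numbers, TF 1.x style):

```python
import numpy as np
import tensorflow as tf

embedding_data = np.array([[0.0, 0.1],
                           [1.0, 1.1],
                           [2.0, 2.1],
                           [3.0, 3.1]], dtype=np.float32)   # 4 rows, dim 2
ids = tf.constant([[1, 3], [0, 2]])                          # shape [2, 2]
vectors = tf.nn.embedding_lookup(embedding_data, ids)        # shape [2, 2, 2]

with tf.Session() as sess:
    print(sess.run(vectors))
    # [[[1.  1.1]
    #   [3.  3.1]]
    #  [[0.  0.1]
    #   [2.  2.1]]]
```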
def embedding_layer(self, word_inputs, seg_inputs, config, name=None):
"""
:param word_inputs: one-hot編碼.其實(shí)所有字的one_hot編碼
:param seg_inputs: 分詞特征
:param config: 配置
:param name: 層的命名
:return: shape = [word_inputs,word_dim+seg_dim]
"""
embedding = []
with tf.variable_scope("word_embedding" if not name else name), tf.device('/cpu:0'):
self.word_lookup = tf.get_variable(
name="word_embedding",
shape=[self.num_words, self.word_dim],
initializer=self.initializer
)
embedding.append(tf.nn.embedding_lookup(self.word_lookup, word_inputs))
if config['seg_dim']:
with tf.variable_scope("seg_embedding"), tf.device('/cpu:0'):
self.seg_lookup = tf.get_variable(
name="seg_embedding",
shape=[self.num_sges, self.seg_dim],
initializer=self.initializer
)
embedding.append(tf.nn.embedding_lookup(self.seg_lookup, seg_inputs))
embed = tf.concat(embedding, axis=-1)
return embed
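To make the returned shape concrete, here is the same lookup-and-concat sketched with made-up sizes (100 words with word_dim=50, 4 segmentation tags with seg_dim=20):

```python
import tensorflow as tf

word_lookup = tf.get_variable("word_embedding_demo", shape=[100, 50])
seg_lookup = tf.get_variable("seg_embedding_demo", shape=[4, 20])

word_ids = tf.placeholder(tf.int32, shape=[None, None])      # [batch_size, num_steps]
seg_ids = tf.placeholder(tf.int32, shape=[None, None])

word_emb = tf.nn.embedding_lookup(word_lookup, word_ids)     # [batch_size, num_steps, 50]
seg_emb = tf.nn.embedding_lookup(seg_lookup, seg_ids)        # [batch_size, num_steps, 20]
embed = tf.concat([word_emb, seg_emb], axis=-1)              # [batch_size, num_steps, 70]
print(embed.get_shape().as_list())                           # [None, None, 70]
```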
def biLSTM_layer(self, lstm_inputs, lstm_dim, lengths, name=None):
"""
:param lstm_inputs: [batch_size, num_steps, emb_size]
:param lstm_dim:
:param name:
:return: [batch_size, num_steps, 2*lstm_dim]
為何返回是2*lstm_dim粉渠,因?yàn)槠涫请p向的lstm。每個(gè)方向的輸出為lstm_dim
"""
with tf.variable_scope("word_biLSTM" if not name else name):
lstm_cell = {}
for direction in ['forward', 'backward']:
with tf.variable_scope(direction):
lstm_cell[direction] = rnn.CoupledInputForgetGateLSTMCell(
lstm_dim,
use_peepholes=True,
initializer=self.initializer,
state_is_tuple=True
)
outputs, final_status = tf.nn.bidirectional_dynamic_rnn(
lstm_cell['forward'],
lstm_cell['backward'],
lstm_inputs,
dtype=tf.float32,
sequence_length=lengths
)
# Each direction's LSTM output has shape [batch_size, num_steps, lstm_dim].
# axis=2 means the concatenation happens along the lstm_dim dimension.
# outputs is a pair of tensors, one per direction, each of shape
# [batch_size, num_steps, lstm_dim].
return tf.concat(outputs, axis=2)
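A quick shape check of that concatenation, sketched with the built-in LSTMCell (CoupledInputForgetGateLSTMCell comes from tf.contrib.rnn; the output shapes are the same either way). All sizes below are made up:

```python
import tensorflow as tf

lstm_dim = 100
inputs = tf.placeholder(tf.float32, shape=[None, None, 120])  # [batch_size, num_steps, emb_size]
lengths = tf.placeholder(tf.int32, shape=[None])

fw_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim)
bw_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim)

# outputs is a pair (output_fw, output_bw), each [batch_size, num_steps, lstm_dim]
outputs, _ = tf.nn.bidirectional_dynamic_rnn(
    fw_cell, bw_cell, inputs, dtype=tf.float32, sequence_length=lengths)

concat = tf.concat(outputs, axis=2)           # [batch_size, num_steps, 2 * lstm_dim]
print(concat.get_shape().as_list())           # [None, None, 200]
```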
def project_layer(self, lstm_outputs, name=None):
"""
:param lstm_outputs: [batch_size, num_steps, emb_size]
個(gè)人覺(jué)得lstm_outputs: [batch_size, num_steps, lstm_dim * 2] num_steps表示每個(gè)句子里面字的數(shù)量歉备。即每個(gè)批次的句子長(zhǎng)度
:param name:
:return: [batch_size,num_steps, num_tags]
"""
with tf.variable_scope('project_layer' if not name else name):
with tf.variable_scope('hidden_layer'):
W = tf.get_variable(
"W",
shape=[self.lstm_dim * 2, self.lstm_dim],
dtype=tf.float32,
initializer=self.initializer
)
b = tf.get_variable(
"b",
shape=[self.lstm_dim],
dtype=tf.float32,
initializer=tf.zeros_initializer()
)
out_put = tf.reshape(lstm_outputs, shape=[-1, self.lstm_dim * 2])  # flatten to one row per character; each character is then re-encoded into lstm_dim units
hidden = tf.tanh(tf.nn.xw_plus_b(out_put, W, b))
with tf.variable_scope('logits'):
W = tf.get_variable(
"W",
shape=[self.lstm_dim, self.num_tags],
dtype=tf.float32,
initializer=self.initializer
)
b = tf.get_variable(
"b",
shape=[self.num_tags],
dtype=tf.float32,
initializer=tf.zeros_initializer()
)
# Finally encode every character into num_tags values, i.e. the score of each character for each tag
pred = tf.nn.xw_plus_b(hidden, W, b)
# Reshape back to the original layout: [batch_size, num_steps, num_tags]
return tf.reshape(pred, [-1, self.num_setps, self.num_tags])
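The reshape bookkeeping in this layer is easy to lose track of; here it is again as a sketch with concrete made-up sizes (batch_size=2, num_steps=5, lstm_dim=100, num_tags=7):

```python
import tensorflow as tf

lstm_outputs = tf.placeholder(tf.float32, shape=[2, 5, 200])   # [batch_size, num_steps, 2*lstm_dim]

flat = tf.reshape(lstm_outputs, [-1, 200])                     # [batch_size*num_steps, 2*lstm_dim] = [10, 200]
W_h = tf.get_variable("W_h_demo", shape=[200, 100])
b_h = tf.get_variable("b_h_demo", shape=[100], initializer=tf.zeros_initializer())
hidden = tf.tanh(tf.nn.xw_plus_b(flat, W_h, b_h))              # [10, 100]

W_o = tf.get_variable("W_o_demo", shape=[100, 7])
b_o = tf.get_variable("b_o_demo", shape=[7], initializer=tf.zeros_initializer())
pred = tf.nn.xw_plus_b(hidden, W_o, b_o)                       # [10, 7]

logits = tf.reshape(pred, [-1, 5, 7])                          # back to [batch_size, num_steps, num_tags]
print(logits.get_shape().as_list())                            # [2, 5, 7]
```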
Adding the CRF loss layer:
def crf_loss_layer(self, project_logits, lenghts, name=None):
"""
# 個(gè)人覺(jué)得是[-1, self.num_setps, self.num_tags]
:param project_logits: [1, num_steps, num_tages]
:param lenghts:
:param name:
:return: scalar loss
聽(tīng)說(shuō)下面是固定的寫法
"""
with tf.variable_scope('crf_loss' if not name else name):
small_value = -10000.0
# start_logits below covers a single (artificial) position per sentence, and its last
# dimension has one element more than the original number of tags.
start_logits = tf.concat(
[
small_value * tf.ones(shape=[self.batch_size, 1, self.num_tags]),
tf.zeros(shape=[self.batch_size, 1, 1])
],
axis=-1
)
pad_logits = tf.cast(
small_value *
tf.ones(shape=[self.batch_size, self.num_setps, 1]),
dtype=tf.float32
)
# This appends one extra element at the end of the tag dimension, so every character
# output by project_layer now has one more entry in its last dimension.
logits = tf.concat(
[project_logits, pad_logits],
axis=-1
)
# This is equivalent to adding one extra character at the beginning of every sentence in the batch.
logits = tf.concat(
[start_logits, logits],
axis=1
)
# self.targets has shape [batch_size, num_steps], so the concat below likewise prepends one
# (artificial) tag to every sentence, which lines up with the
# tf.concat([start_logits, logits], axis=1) padding done above.
targets = tf.concat(
[tf.cast(
self.num_tags * tf.ones([self.batch_size, 1]),
tf.int32
),
self.targets
]
,
axis=-1
)
# transition matrix between tag states
self.trans = tf.get_variable(
"transitions",
shape=[self.num_tags + 1, self.num_tags + 1],
initializer=self.initializer
)
log_likehood, self.trans = crf_log_likelihood(
inputs=logits,
tag_indices=targets,
transition_params=self.trans,
sequence_lengths=lenghts + 1  # +1 because an extra (start) token was prepended to every sentence
)
return tf.reduce_mean(-log_likehood)
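(crf_log_likelihood here comes from tf.contrib.crf.) To follow the two concat operations more easily, here is the same padding trick sketched with small concrete sizes (batch_size=2, num_steps=5, num_tags=7); the extra tag column and the extra start position grow the logits from [2, 5, 7] to [2, 6, 8]:

```python
import tensorflow as tf

batch_size, num_steps, num_tags = 2, 5, 7
small_value = -10000.0
project_logits = tf.placeholder(tf.float32, [batch_size, num_steps, num_tags])

# one artificial "start" position per sentence: the real tags get a large negative
# score, the extra (start) tag gets 0 -> shape [2, 1, 8]
start_logits = tf.concat(
    [small_value * tf.ones([batch_size, 1, num_tags]),
     tf.zeros([batch_size, 1, 1])],
    axis=-1)

# one extra tag column for every real position -> shape [2, 5, 1]
pad_logits = small_value * tf.ones([batch_size, num_steps, 1])

logits = tf.concat([project_logits, pad_logits], axis=-1)   # [2, 5, 8]
logits = tf.concat([start_logits, logits], axis=1)          # [2, 6, 8]
print(logits.get_shape().as_list())                         # [2, 6, 8]
```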
The model is evaluated with the F1 score:
def evaluate(sess, model, name, manager, id_to_tag, logger):
logger.info('evaluate:{}'.format(name))
ner_results = model.evaluate(sess, manager, id_to_tag)
eval_lines = model_utils.test_ner(ner_results, FLAGS.result_path)
for line in eval_lines:
logger.info(line)
f1 = float(eval_lines[1].strip().split()[-1])
if name == "dev":
best_test_f1 = model.best_dev_f1.eval()
if f1 > best_test_f1:
tf.assign(model.best_dev_f1, f1).eval()
logger.info('new best dev f1 score:{:>.3f}'.format(f1))
return f1 > best_test_f1
elif name == "test":
best_test_f1 = model.best_test_f1.eval()
if f1 > best_test_f1:
tf.assign(model.best_test_f1, f1).eval()
logger.info('new best test f1 score:{:>.3f}'.format(f1))
return f1 > best_test_f1
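A hedged illustration of the `f1 = float(eval_lines[1]...)` parse above, assuming test_ner returns a conlleval-style report whose second line ends with the overall FB1 value (the exact format is an assumption, not shown in this post):

```python
# Illustrative report lines; the real ones come from model_utils.test_ner.
eval_lines = [
    "processed 21484 tokens with 1092 phrases; found: 1129 phrases; correct: 897.",
    "accuracy:  97.93%; precision:  79.45%; recall:  82.14%; FB1:  80.77",
]
f1 = float(eval_lines[1].strip().split()[-1])
print(f1)  # 80.77
```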
About tuning:
Validation loss vs. training loss
If the validation loss is clearly higher than the training loss, the model is probably overfitting. In that case, try reducing the network size, or raising the dropout value (e.g. 0.5, then 0.6). Train with mini-batches by splitting the data set into a number of smaller sets, and tune parameters such as embedding_dim, lstm_dim and the learning rate (e.g. 3e-4); see the sketch after the next paragraph.
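A hypothetical starting point for those knobs (values are illustrative, not the author's):

```python
config = {
    "embedding_dim": 100,   # word embedding size (word_dim)
    "seg_dim": 20,          # segmentation-feature embedding size
    "lstm_dim": 100,        # hidden units per LSTM direction
    "dropout": 0.5,         # try 0.5, then 0.6, if the model overfits
    "batch_size": 20,       # mini-batch size
    "learning_rate": 3e-4,
}
```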
I first ran CRF++ on this task: it is very fast, with precision around 0.8, recall around 0.87, and F1 a bit over 0.87. With a BiLSTM followed by softmax, the loss drops very sharply at the start, which feels prone to local overfitting. With BiLSTM+CRF, the loss decreases steadily; by the time it reaches about 0.15 the accuracy barely changes any more, and the model converges faster than the version without CRF. Overall accuracy is also higher than without CRF. In addition, raising the number of training iterations improves accuracy a little further.