读取数据
- 比较好的方法是从
tf.data.Dataset.from_generator
中读取数据，因为这样允许从任意一个迭代器中读取数据，可以更灵活地对数据进行预处理等等。
def generator_fn():
    """Yield two toy examples as ``(words, length)`` pairs.

    For each digit in ``range(2)`` the sentence ``'I am digit <digit>'`` is
    split on whitespace.

    Yields:
        A tuple ``(words, n)`` where ``words`` is the list of tokens encoded
        to ``bytes`` and ``n`` is the number of tokens.
    """
    for digit in range(2):
        line = 'I am digit {}'.format(digit)
        words = line.split()
        # Tokens are encoded to bytes before being handed to TensorFlow.
        yield [w.encode() for w in words], len(words)
- 虽然有很多读取数据的方法，比如
tf.data.TextLineDataset
是从text文本中读取数据，比如tf.data.Dataset.from_tensor_slices
是从np array中读取数据的，tf.data.TFRecordDataset
是从TF records中读取数据的，但是作为一个NLP的研究人员，除非要使用上面三个读取方式中的一个特定函数来获得模型性能上的提升，否则为了灵活性起见还是使用tf.data.Dataset.from_generator
最好。
# Build a dataset from the Python generator `generator_fn`.
# Per-element shapes: a 1-D sequence of unknown length and a scalar.
shapes = ([None], ())
# Per-element dtypes: string tokens and an int32 token count.
types = (tf.string, tf.int32)
dataset = tf.data.Dataset.from_generator(generator_fn,
output_shapes=shapes, output_types=types)
- 测试是否正常
The tf.enable_eager_execution() must be called at program startup, just after your import tensorflow as tf
import tensorflow as tf
# Eager execution has to be switched on before any other TensorFlow call.
tf.enable_eager_execution()
# With eager execution enabled the dataset is iterated like a Python iterable.
for tf_words, tf_size in dataset:
    print(tf_words, tf_size)
>>> tf.Tensor([b'I' b'am' b'digit' b'0'], shape=(4,), dtype=string) tf.Tensor(4, shape=(), dtype=int32)
>>> tf.Tensor([b'I' b'am' b'digit' b'1'], shape=(4,), dtype=string) tf.Tensor(4, shape=(), dtype=int32)
- 使用一种old school的方式
tf.Session()
，但是这种方式需要先创建一个iterator。 - 然后创建一个取下一个节点的op，这样取出一个元素以后迭代器再向后移动一次。
# Graph-mode iteration: create an iterator and an op that fetches its next element.
iterator = dataset.make_one_shot_iterator()
node = iterator.get_next()
with tf.Session() as sess:
    print(sess.run(node))
    print(sess.run(node))  # Each call moves the iterator to its next position
>>> (array([b'I', b'am', b'digit', b'0'], dtype=object), 4)
>>> (array([b'I', b'am', b'digit', b'1'], dtype=object), 4)
读取文件和进行分词
- 使用
tf.data.Dataset.from_generator()
最大的好处就是可以使用python方式进行文本的预处理，而不用想方设法找tf中的对应函数。
def parse_fn(line_words, line_tags):
    """Turn one line of words and one line of tags into a training example.

    Both lines are stripped and split on whitespace, and every token is
    encoded to ``bytes``.

    Args:
        line_words: A line of whitespace-separated words.
        line_tags: A line of whitespace-separated tags, one per word.

    Returns:
        ``((words, n_words), tags)`` where ``words`` and ``tags`` are lists of
        ``bytes`` and ``n_words`` is ``len(words)``.

    Raises:
        AssertionError: If the number of words and tags differ.
    """
    # Encode in Bytes for TF
    words = [w.encode() for w in line_words.strip().split()]
    tags = [t.encode() for t in line_tags.strip().split()]
    # Raised explicitly instead of via `assert` so the check is not stripped
    # when Python runs with -O; exception type and message are unchanged.
    if len(words) != len(tags):
        raise AssertionError("Words and tags lengths don't match")
    return (words, len(words)), tags
def generator_fn(words, tags):
    """Yield parsed examples from a words file and a tags file read in lockstep.

    Args:
        words: Path to the file holding one sentence of words per line.
        tags: Path to the file holding the matching tags, one line per sentence.

    Yields:
        The result of ``parse_fn`` for each pair of lines.
    """
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        # zip stops at the shorter file, so surplus lines in the longer one
        # are silently ignored.
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)
- 然后使用input_fn构建dataset，并接下来将和
tf.estimator
配合进行使用。其中的函数在我的另外一篇博客中都有。
prefetch which ensures that a batch of data is pre-loaded on the computing device so that it does not suffer from data starvation
def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    """Build the padded, batched dataset of ``((words, nwords), tags)``.

    Args:
        words: Path to the words file, forwarded to ``generator_fn``.
        tags: Path to the tags file, forwarded to ``generator_fn``.
        params: Optional dict of settings. ``'batch_size'`` defaults to 20;
            ``'buffer'`` and ``'epochs'`` are read only when
            ``shuffle_and_repeat`` is true.
        shuffle_and_repeat: When true, shuffle with ``params['buffer']`` and
            repeat for ``params['epochs']`` epochs.

    Returns:
        A ``tf.data.Dataset`` of padded batches with a prefetch of 1.

    Raises:
        KeyError: If ``shuffle_and_repeat`` is true and ``params`` lacks
            ``'buffer'`` or ``'epochs'``.
    """
    params = params if params is not None else {}
    # Structure mirrors what the generator yields: ((words, nwords), tags).
    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    # Padding values: '<pad>' for words, 0 for the length, 'O' for tags.
    defaults = (('<pad>', 0), 'O')

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_shapes=shapes, output_types=types)

    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    dataset = (dataset
               .padded_batch(params.get('batch_size', 20), shapes, defaults)
               .prefetch(1))
    return dataset
- 运行结果，可以看到
Pad
起到了应有的效果。
tf.estimator
- 提供一个高级的用于训练、测试和预测的方法，在使用之前需要定义两个组件。
- 一个模型文件
model_fn(features, labels, mode, params) ->tf.estimator.EstimatorSpec
- 前面两个都是训练中需要的tensor。
- mode:是一个string，用于指定model_fn是用于预测，测试还是训练。
- params:是一个字典用于存放超参。
- input_fn:就是之前我们所定义的返回
tf.data.Dataset
的函数，返回训练的tensor features
和labels
被model_fn
用于训练。
def model_fn(features, labels, mode, params):
    """Skeleton of an Estimator model function, dispatching on ``mode``.

    The helpers called here (``some_tensorflow_applied_to``,
    ``some_dict_from``, ``compute_loss_from``, ``compute_metrics_from``,
    ``compute_train_op_from``) are not defined in this file; they appear to
    be illustrative placeholders for the real graph-building code.

    Args:
        features: Input tensors passed to the inference graph.
        labels: Target tensors, used for loss, metrics and the train op.
        mode: One of the ``tf.estimator.ModeKeys`` values.
        params: Hyper-parameters; not read by this skeleton.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.

    Raises:
        NotImplementedError: If ``mode`` is not PREDICT, EVAL or TRAIN.
    """
    # Define the inference graph
    graph_outputs = some_tensorflow_applied_to(features)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Extract the predictions
        predictions = some_dict_from(graph_outputs)
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Compute loss, metrics, tensorboard summaries
        loss = compute_loss_from(graph_outputs, labels)
        metrics = compute_metrics_from(graph_outputs, labels)

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)
        elif mode == tf.estimator.ModeKeys.TRAIN:
            # Get train operator
            train_op = compute_train_op_from(graph_outputs, labels)
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op)
        else:
            raise NotImplementedError('Unknown mode {}'.format(mode))
一个具体的例子说明
tf.contrib.lookup.index_table_from_file 用于在 tensorflow graph 中将 strings 映射为 ids。
Here, params['words'] is the path to a file containing one lexeme (= an element of my vocabulary) per line. I use Tensorflow built-in lookup tables to map token strings to lexeme ids. We also use the same convention to store the vocabulary of tags.
# Fragment of the model function body: read settings and vocabularies.
dropout = params['dropout']
words, nwords = features
training = (mode == tf.estimator.ModeKeys.TRAIN)
# Lookup table from token string to id, with one extra bucket for
# out-of-vocabulary tokens.
vocab_words = tf.contrib.lookup.index_table_from_file(
    params['words'], num_oov_buckets=1)
with Path(params['tags']).open() as f:
    # Line indices of every tag other than 'O'; passed later as the set of
    # classes for precision/recall/f1.
    indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
    # NOTE(review): the +1 presumably accounts for a single 'O' line in the
    # tags file - confirm against the vocabulary file.
    num_tags = len(indices) + 1
- 创建word embedding。
- 可以加载预训练的词向量。
# Map token strings to integer ids through the vocabulary lookup table.
word_ids = vocab_words.lookup(words)
# Load the pre-trained vectors stored under the 'embeddings' key of the .npz file.
glove = np.load(params['glove'])['embeddings'] # np.array
# Append one all-zero row of width params['dim'].
variable = np.vstack([glove, [[0.]*params['dim']]]) # For unknown words
# The embedding matrix is kept frozen (trainable=False).
variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
embeddings = tf.nn.embedding_lookup(variable, word_ids)
embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)
- 我们使用最为有效的lstm cell方式，它将所有的LSTM操作都放在一个CUDA kernel里面进行
# Bidirectional LSTM: one forward fused cell and one time-reversed backward
# fused cell run over the same input, with outputs concatenated on the last axis.
t = tf.transpose(embeddings, perm=[1, 0, 2]) # Make time-major
lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
# Wrap the backward cell so it runs over the sequence in reverse.
lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
output = tf.concat([output_fw, output_bw], axis=-1)
output = tf.transpose(output, perm=[1, 0, 2]) # Make batch-major
output = tf.layers.dropout(output, rate=dropout, training=training)
-
LSTMBlockCell
需要time在前所以要使用tf.transpose
进行翻转。
This is an extremely efficient LSTM implementation, that uses a single TF op for the entire LSTM. It should be both faster and more memory-efficient than LSTMBlockCell defined above.
- 加入CRF
# Dense projection of the recurrent outputs to num_tags scores per position.
logits = tf.layers.dense(output, num_tags)
# CRF transition parameters: a num_tags x num_tags float32 variable.
crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
# Decode the predicted tag ids; the second return value is discarded.
pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)
测度和使用tensorboard
import tf_metrics
# Metrics
# NOTE(review): `tags` is not defined in the snippets visible here -
# presumably the gold tag ids; confirm where it is built.
# Mask derived from the sentence lengths, used to weight the metrics.
weights = tf.sequence_mask(nwords)
metrics = {
    'acc': tf.metrics.accuracy(tags, pred_ids, weights),
    'precision': tf_metrics.precision(tags, pred_ids, num_tags, indices, weights),
    'recall': tf_metrics.recall(tags, pred_ids, num_tags, indices, weights),
    'f1': tf_metrics.f1(tags, pred_ids, num_tags, indices, weights),
}
# Tensorboard summaries
for metric_name, op in metrics.items():
    # Each metric is a pair; the second element is what gets logged.
    tf.summary.scalar(metric_name, op[1])
评估模型
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(
mode, loss=loss, eval_metric_ops=metrics)
elif mode == tf.estimator.ModeKeys.TRAIN:
train_op = tf.train.AdamOptimizer().minimize(
loss, global_step=tf.train.get_or_create_global_step())
return tf.estimator.EstimatorSpec(
mode, loss=loss, train_op=train_op)
实例化Estimator
# Hyper-parameters and resource paths handed to the Estimator.
# NOTE(review): 'num_oov_buckets' and 'chars' are not read by any code
# visible in this file - confirm whether they are still needed.
params = {
'dim': 300,
'dropout': 0.5,
'num_oov_buckets': 1,
'epochs': 25,
'batch_size': 20,
'buffer': 15000,
'lstm_size': 100,
'words': str(Path(DATADIR, 'vocab.words.txt')),
'chars': str(Path(DATADIR, 'vocab.chars.txt')),
'tags': str(Path(DATADIR, 'vocab.tags.txt')),
'glove': str(Path(DATADIR, 'glove.npz'))
}
# Save a checkpoint every 120 seconds.
cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
# Positional arguments: model function, model directory, run config, params.
estimator = tf.estimator.Estimator(model_fn, 'results/model', cfg, params)
Train an Estimator with early stopping
- 因为我们的函数中只有后面几个参数不同，没有必要再写一个函数，因此我们使用
functools.partial
对函数在不同数据集合上进行包装。 - 早停法训练，获得F1最高值的模型，使用
tf.contrib.estimator.stop_if_no_increase_hook
# 1. Define our input_fn
# Bind the file paths and params so each callable takes no arguments.
train_inpf = functools.partial(input_fn, 'words.train.txt', 'tags.train.txt',
                               params, shuffle_and_repeat=True)
eval_inpf = functools.partial(input_fn, 'words.testa.txt', 'tags.testa.txt',
                              params)

# 2. Create a hook
# Make sure the evaluation directory exists before the hook is created.
Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)
hook = tf.contrib.estimator.stop_if_no_increase_hook(
    estimator, 'f1', 500, min_steps=8000, run_every_secs=120)
# TrainSpec takes the bound partial: the bare input_fn requires `words` and
# `tags` arguments and cannot be called without them.
train_spec = tf.estimator.TrainSpec(input_fn=train_inpf, hooks=[hook])
eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)

# 3. Train with early stopping
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)