Models in This Article

First, define an attention layer that will be used later.
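In formulas: given RNN outputs $h_1, \dots, h_T$ (one vector per timestep), the layer below computes

$$e_t = \tanh(h_t^{\top} W + b_t), \qquad a_t = \frac{\exp(e_t)}{\sum_{s=1}^{T} \exp(e_s)}, \qquad c = \sum_{t=1}^{T} a_t\, h_t,$$

and returns the context vector $c$ rather than the weights $a_t$.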
# Custom attention layer
from tensorflow.keras import initializers, constraints, activations, regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer

class Attention(Layer):
    # Returns: not the attention weights, but the context vector, i.e. the sum over
    # timesteps of each timestep's output multiplied by its attention weight.
    # Input: step_dim is the number of RNN timesteps, i.e. the maximum input sequence length.
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)
    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True
    def compute_mask(self, input, input_mask=None):
        return None  # downstream layers no longer need the mask, so just return None
    def call(self, x, mask=None):
        features_dim = self.features_dim
        # step_dim is the parameter we pass in; it equals input_shape[1], i.e. the RNN timesteps
        step_dim = self.step_dim
        # Reshape the input and the weights, then take their dot product; the result has
        # shape (batch_size*timesteps, 1). Each sample must then be normalized separately,
        # hence the reshape back to (-1, timesteps):
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)  # RNNs default to tanh; the choice matters little here since softmax follows
        a = K.exp(eij)
        if mask is not None:
            # Timesteps masked by earlier layers must not contribute to the output,
            # so their attention weights are set to 0
            a *= K.cast(mask, K.floatx())  # cast converts the dtype; Keras checks dtypes when computing
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)  # same as K.expand_dims(a, axis=-1): appends a dimension, e.g. (3,) -> (3, 1)
        # Now a.shape = (batch_size, timesteps, 1) and x.shape = (batch_size, timesteps, units)
        weighted_input = x * a
        # weighted_input has shape (batch_size, timesteps, units); each timestep's output
        # vector has been multiplied by that timestep's weight.
        # Summing over axis=1 yields a tensor of shape (batch_size, units)
        return K.sum(weighted_input, axis=1)
    def compute_output_shape(self, input_shape):
        # the returned result is the context vector c, with shape (batch_size, units)
        return input_shape[0], self.features_dim
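A minimal usage sketch of the layer (the vocabulary size, sequence length, and unit counts here are illustrative placeholders, not values from this article):

# Minimal usage sketch of the Attention layer; all sizes are illustrative.
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model

demo_in = Input(shape=(100,))                       # 100 timesteps (step_dim)
emb = Embedding(5000, 32, mask_zero=True)(demo_in)  # mask_zero propagates a mask to Attention
rnn = LSTM(64, return_sequences=True)(emb)          # must return the full sequence: (batch, 100, 64)
ctx = Attention(step_dim=100)(rnn)                  # context vector: (batch, 64)
demo_out = Dense(10, activation='softmax')(ctx)
demo_model = Model(demo_in, demo_out)

Note that the layer only works on 3-D sequence output, so the RNN feeding it must set return_sequences=True.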
Common text-classification models
# Imports used by the models below (TensorFlow 2.x / tf.keras)
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Input, Embedding, Dense, Dropout, Flatten,
                                     SimpleRNN, LSTM, GRU, Bidirectional,
                                     Conv1D, MaxPooling1D, Multiply, concatenate)

def build_model(top_words=top_words, max_words=max_words, num_labels=num_labels, mode='LSTM', hidden_dim=[32]):
    if mode == 'RNN':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(SimpleRNN(32))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'MLP':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(256, activation="relu"))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'LSTM':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(LSTM(32))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'GRU':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(GRU(32))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'CNN':  # 1-D convolution
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(Conv1D(filters=32, kernel_size=3, padding="same", activation="relu"))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Flatten())
        model.add(Dense(256, activation="relu"))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'CNN+LSTM':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(Conv1D(filters=32, kernel_size=3, padding="same", activation="relu"))
        model.add(MaxPooling1D(pool_size=2))
        model.add(LSTM(64))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
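        # Note: MaxPooling1D(pool_size=2) halves the number of timesteps, so the
        # LSTM here reads a sequence of max_words/2 pooled feature vectors rather
        # than the raw embedded tokens.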
    elif mode == 'BiLSTM':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Bidirectional(LSTM(64)))
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation='softmax'))
    # The models below are built with the Keras Functional API
    elif mode == 'TextCNN':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        # The embedding is frozen (trainable=False), as when using pretrained word vectors
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        # Word-window sizes of 3, 4, and 5
        cnn1 = Conv1D(32, 3, padding='same', strides=1, activation='relu')(layer)
        cnn1 = MaxPooling1D(pool_size=2)(cnn1)
        cnn2 = Conv1D(32, 4, padding='same', strides=1, activation='relu')(layer)
        cnn2 = MaxPooling1D(pool_size=2)(cnn2)
        cnn3 = Conv1D(32, 5, padding='same', strides=1, activation='relu')(layer)
        cnn3 = MaxPooling1D(pool_size=2)(cnn3)
        # Concatenate the output vectors of the three branches
        cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(0.2)(flat)
        main_output = Dense(num_labels, activation='softmax')(drop)
        model = Model(inputs=inputs, outputs=main_output)
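        # Design note: the three kernel sizes (3, 4, 5) act as n-gram detectors of
        # different widths over the embedding sequence; their pooled feature maps
        # are concatenated along the channel axis before the classifier. To
        # genuinely use pretrained vectors with the frozen embedding, they can be
        # passed via Embedding's weights argument.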
    elif mode == 'Attention':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        attention_probs = Dense(32, activation='softmax', name='attention_vec')(layer)
        attention_mul = Multiply()([layer, attention_probs])
        mlp = Dense(64)(attention_mul)  # plain fully connected layer
        fla = Flatten()(mlp)
        output = Dense(num_labels, activation='softmax')(fla)
        model = Model(inputs=[inputs], outputs=output)
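        # Note: Dense applies its softmax over the last axis, so attention_probs is
        # normalized across the 32 feature dimensions of each timestep. This is
        # feature-wise gating rather than attention over timesteps (compare the
        # custom Attention layer above, which normalizes across timesteps).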
    elif mode == 'Attention*3':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        attention_probs = Dense(32, activation='softmax', name='attention_vec')(layer)
        attention_mul = Multiply()([layer, attention_probs])
        mlp = Dense(32, activation='relu')(attention_mul)
        attention_probs = Dense(32, activation='softmax', name='attention_vec1')(mlp)
        attention_mul = Multiply()([mlp, attention_probs])
        mlp2 = Dense(32, activation='relu')(attention_mul)
        attention_probs = Dense(32, activation='softmax', name='attention_vec2')(mlp2)
        attention_mul = Multiply()([mlp2, attention_probs])
        mlp3 = Dense(32, activation='relu')(attention_mul)
        fla = Flatten()(mlp3)
        output = Dense(num_labels, activation='softmax')(fla)
        model = Model(inputs=[inputs], outputs=output)
    elif mode == 'BiLSTM+Attention':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        bilstm = Bidirectional(LSTM(64, return_sequences=True))(layer)  # return_sequences=True keeps the 3-D (batch, timesteps, units) output
        bilstm = Bidirectional(LSTM(64, return_sequences=True))(bilstm)
        layer = Dense(256, activation='relu')(bilstm)
        layer = Dropout(0.2)(layer)
        # Attention mechanism: collapses the timestep axis into a single context vector
        attention = Attention(step_dim=max_words)(layer)
        layer = Dense(128, activation='relu')(attention)
        output = Dense(num_labels, activation='softmax')(layer)
        model = Model(inputs=inputs, outputs=output)
    elif mode == 'BiGRU+Attention':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        attention_probs = Dense(32, activation='softmax', name='attention_vec')(layer)
        attention_mul = Multiply()([layer, attention_probs])
        mlp = Dense(64, activation='relu')(attention_mul)  # plain fully connected layer
        # bat = BatchNormalization()(mlp)
        # act = Activation('relu')
        gru = Bidirectional(GRU(32))(mlp)
        mlp = Dense(16, activation='relu')(gru)
        output = Dense(num_labels, activation='softmax')(mlp)
        model = Model(inputs=[inputs], outputs=output)
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model
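For reference, a quick smoke test of build_model might look like the following, assuming the globals used as default arguments (top_words, max_words, num_labels) and training data such as X_train/y_train have been prepared earlier in the notebook; the numbers are placeholders:

top_words, max_words, num_labels = 5000, 100, 10  # placeholder values
model = build_model(mode='BiLSTM+Attention')
model.summary()
# model.fit(X_train, y_train, batch_size=64, epochs=5, validation_split=0.1)

Since the models end in a softmax over num_labels classes with categorical_crossentropy, the labels must be one-hot encoded (e.g. with to_categorical).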