对于 Match-LSTM:将文本的隐状态 h_i 与 match-lstm 的输出(由 s_i、h_i、q_i 计算得到)组合后,重新输入到 LSTM 网络中,整体遵循端到端(end-to-end)的设计理念。
参考的博客:https://blog.csdn.net/laddie132/article/details/79159895 #MATCH-LSTM原理
https://blog.csdn.net/jdbc/article/details/80755576 # 将SQUAD数据集转换为id
https://blog.csdn.net/xbinworld/article/details/54607525 # 注意机制模型
https://blog.csdn.net/appleml/article/details/76607980 #point-net模型
# !/usr/bin/env python3 # -*- coding: utf-8 -*- import tensorflow as tf import numpy as np import tensorflow.contrib as contrib # from app.decorator import exe_time class MatchLstm: # @exe_time def __init__(self, vocab_size, sentence_size, embedding_size, word_embedding, initializer=tf.truncated_normal_initializer(stddev=0.1), session=tf.Session(), num_class=3, window_size=4, name='MatchLstm', initial_lr=0.001): # 字典的大小 self._vocab_size = vocab_size # 句子的大小 self._sentence_size = sentence_size # 隐含层的大小 self._embedding_size = embedding_size # 用于构造向量 self._we = word_embedding # 初始化 self._initializer = initializer # 名字 self._name = name # 输出种类 self._num_class = num_class self._sess = session # 窗口的大小 self._window_size = window_size # 学习率 self._initial_lr = initial_lr # 编码原文和上下文的信息 self._build_inputs_and_vars() # 构造模型的结构 self._inference() # 初始化 self._initial_optimizer() def _build_inputs_and_vars(self): # 文章的内容 self.premises = tf.placeholder(shape=[None, self._sentence_size], dtype=tf.int32, name='premises') # 问题 self.hypotheses = tf.placeholder(shape=[None, self._sentence_size], dtype=tf.int32, name='hypotheses') # 标签 self.labels = tf.placeholder(shape=[None, self._num_class], dtype=tf.float32, name='labels') # 根据输入的大小来获得样本的大小 self._batch_size = tf.shape(self.premises)[0] # 初始化学习率 self.lr = tf.get_variable(shape=[], dtype=tf.float32, trainable=False, initializer=tf.constant_initializer(self._initial_lr), name='lr') # 初始化new_lr self.new_lr = tf.placeholder(shape=[], dtype=tf.float32, name='new_lr') # 将self.new_lr 赋值给self.lr self.lr_update_op = tf.assign(self.lr, self.new_lr) with tf.variable_scope(self._name): # self._word_embedding用于进行单词向量化操作 self._word_embedding = tf.get_variable(name='word_embedding', shape=[self._vocab_size, self._embedding_size], initializer=tf.constant_initializer(self._we), trainable=False) # 对原文进行向量化操作,同时提取答案上下文的向量矩阵作为答案的向量 self._embed_pre = self._embed_inputs(self.premises, self._word_embedding) # 对问题进行向量化操作 self._embed_hyp = 
self._embed_inputs(self.hypotheses, self._word_embedding) def _inference(self): with tf.variable_scope('{}_lstm_s'.format(self._name)): # 对原文进行了一次LSTM操作 lstm_s = contrib.rnn.BasicLSTMCell(num_units=self._embedding_size, forget_bias=0.0) pre_length = self._length(self.premises) h_s, _ = tf.nn.dynamic_rnn(lstm_s, self._embed_pre, sequence_length=pre_length, dtype=tf.float32) self.h_s = h_s with tf.variable_scope('{}_lstm_t'.format(self._name)): # 对问题进行了一次LSTM操作 lstm_t = contrib.rnn.BasicLSTMCell(num_units=self._embedding_size, forget_bias=0.0) hyp_length = self._length(self.hypotheses) h_t, _ = tf.nn.dynamic_rnn(lstm_t, self._embed_hyp, sequence_length=hyp_length, dtype=tf.float32) self.h_t = h_t # 构造一个lstm网络 self.lstm_m = contrib.rnn.BasicLSTMCell(num_units=self._embedding_size, forget_bias=0.0) # 构造一个可以变化的向量矩阵 h_m_arr = tf.TensorArray(dtype=tf.float32, size=self._batch_size) i = tf.constant(0) # while_loop,cond作为条件,body做为操作过程 c = lambda x, y: tf.less(x, self._batch_size) b = lambda x, y: self._match_sent(x, y) res = tf.while_loop(cond=c, body=b, loop_vars=(i, h_m_arr)) # LSTM的输出结果 self.h_m_tensor = tf.squeeze(res[-1].stack(), axis=[1]) # 进行一次全连接操作,使得最后的输出结果是一维的 with tf.variable_scope('{}_fully_connect'.format(self._name)): w_fc = tf.get_variable(shape=[self._embedding_size, self._num_class], initializer=self._initializer, name='w_fc') b_fc = tf.get_variable(shape=[self._num_class], initializer=self._initializer, name='b_fc') self.logits = tf.matmul(self.h_m_tensor, w_fc) + b_fc # softmax损失函数,直接使用交叉熵损失函数,输出的结果只是一个数 cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=self.logits, name='cross_entropy') # 把batch_size的样本的损失函数进行加和 cross_entropy_sum = tf.reduce_sum(cross_entropy, name='cross_entropy_sum') # 加和以后相除,求损失的平均值 self.loss_op = tf.div(cross_entropy_sum, tf.cast(self._batch_size, dtype=tf.float32)) # argmax,求出每个样本中最大的概率值 self.predict_op = tf.arg_max(self.logits, dimension=1) def _match_sent(self, i, h_m_arr): # 对每一个句子进行操作 h_s_i = 
self.h_s[i] h_t_i = self.h_t[i] # 输入句子的长度 length_s_i = self._length(self.premises[i]) length_t_i = self._length(self.hypotheses[i]) state = self.lstm_m.zero_state(batch_size=1, dtype=tf.float32) k = tf.constant(0) c = lambda a, x, y, z, s: tf.less(a, length_t_i) b = lambda a, x, y, z, s: self._match_attention(a, x, y, z, s) res = tf.while_loop(cond=c, body=b, loop_vars=(k, h_s_i, h_t_i, length_s_i, state)) # 只获取最后一次的输出结果 final_state_h = res[-1].h # 将其写入到h_m_arr文件中 h_m_arr = h_m_arr.write(i, final_state_h) i = tf.add(i, 1) return i, h_m_arr def _match_attention(self, k, h_s, h_t, length_s, state): h_t_k = tf.reshape(h_t[k], [1, -1]) h_s_j = tf.slice(h_s, begin=[0, 0], size=[length_s, self._embedding_size]) with tf.variable_scope('{}_attention_w'.format(self._name)): w_s = tf.get_variable(shape=[self._embedding_size, self._embedding_size], initializer=self._initializer, name='w_s') w_t = tf.get_variable(shape=[self._embedding_size, self._embedding_size], initializer=self._initializer, name='w_t') w_m = tf.get_variable(shape=[self._embedding_size, self._embedding_size], initializer=self._initializer, name='w_m') w_e = tf.get_variable(shape=[self._embedding_size, 1], initializer=self._initializer, name='w_e') last_m_h = state.h # sum_h进行全连接操作,通过对原文进行操作,输出一个权重参数 sum_h = tf.matmul(h_s_j, w_s) + tf.matmul(h_t_k, w_t) + tf.matmul(last_m_h, w_m) # 经过一个激活层然后再与w_e进行相乘 e_kj = tf.matmul(tf.tanh(sum_h), w_e) # 求得ai,j a_kj = tf.nn.softmax(e_kj) # 进行原文的参数加权 alpha_k = tf.matmul(a_kj, h_s_j, transpose_a=True) alpha_k.set_shape([1, self._embedding_size]) # 将context与即将输入的h_t_k组合输入到下一次的LSTM中 m_k = tf.concat([alpha_k, h_t_k], axis=1) # with tf.variable_scope('{}_lstm_m'.format(self._name)): # 输入到LSTM重新进行计算 # state表示的是si _, new_state = self.lstm_m(inputs=m_k, state=state) k = tf.add(k, 1) return k, h_s, h_t, length_s, new_state def _embed_inputs(self, inputs, embeddings): ndim0_tensor_arr = tf.TensorArray(dtype=tf.float32, size=self._batch_size) i = tf.constant(0) # tf.less 
当x大于self._batch_size时返回为假 c = lambda x, y, z, n: tf.less(x, self._batch_size) b = lambda x, y, z, n: self._embed_line(x, y, z, n) # cond为条件,body为内容 res = tf.while_loop(cond=c, body=b, loop_vars=(i, inputs, embeddings, ndim0_tensor_arr)) ndim0_tensor = res[-1].stack() ndim0_tensor = tf.reshape(ndim0_tensor, [-1, self._sentence_size, self._embedding_size]) return ndim0_tensor def _embed_line(self, i, inputs, embeddings, ndim0_tensor_arr): ndim1_list = [] # 对输入的每一句话进行操作 for j in range(self._sentence_size): # 输入的第一个字符串 word = inputs[i][j] unk_word = tf.constant(-1) # tf.squeeze删除所有大小为1的数组(6,1) 变成(6, ?),在构造的向量矩阵中根据word找出位置 f1 = lambda: tf.squeeze(tf.nn.embedding_lookup(params=embeddings, ids=word)) # 如果没有的话使用0向量代替 f2 = lambda: tf.zeros(shape=[self._embedding_size]) # 如果wordunk与word不相等,执行f1,否者执行f2 res_tensor = tf.case([(tf.not_equal(word, unk_word), f1)], default=f2) # 添加到ndim1_list 向量中 ndim1_list.append(res_tensor) for j in range(self._sentence_size): word = inputs[i][j] unk_word = tf.constant(-1) # 如果word等于-1代表了提取答案上下文的内容 f1 = lambda: self._ave_vec(ndim1_list, j) f2 = lambda: ndim1_list[j] ndim1_list[j] = tf.case([(tf.not_equal(word, unk_word), f2)], default=f1) # tf.stack是一个函数拼接 ndim1_tensor = tf.stack(ndim1_list) ndim0_tensor_arr = ndim0_tensor_arr.write(i, ndim1_tensor) i = tf.add(i, 1) return i, inputs, embeddings, ndim0_tensor_arr def _ave_vec(self, embed_list, cur_pos): """ 生词的词向量为词窗口的词向量平均值 :param embed_list: :param cur_pos: :return: """ # 根据句子的大小来获取当前词的上下文,self._window_size 表示提取词的大小 left_pos = max(0, cur_pos - self._window_size) right_pos = min(cur_pos + self._window_size, self._sentence_size) # 获得上下文的词向量 e_list = embed_list[left_pos:cur_pos] + embed_list[cur_pos + 1:right_pos + 1] # tf.stack合并词向量 e_tensor = tf.stack(e_list) # 对上下文的内容使用reduce_mean来替代原来的位置的信息 ave_tensor = tf.reduce_mean(e_tensor, axis=0) return ave_tensor @staticmethod def _length(sequence): mask = tf.sign(tf.abs(sequence)) length = tf.reduce_sum(mask, axis=-1) return length def 
_initial_optimizer(self): with tf.variable_scope('{}_step'.format(self._name)): # 进行学习率的衰减, 使用Ada,容易找出全局的最优解,且速度快. self.global_step = tf.get_variable(shape=[], initializer=tf.constant_initializer(0), dtype=tf.int32, name='global_step') # 根据动量平均跟新参数 self._optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.999) # 缩小loss self.train_op = self._optimizer.minimize(self.loss_op, global_step=self.global_step) if __name__ == '__main__': with tf.Session() as sess: # embedding需要翻译的句子 embedding = np.random.randn(4, 6) embedding[0] = 0.0 model = MatchLstm(vocab_size=7, sentence_size=5, embedding_size=6, word_embedding=embedding, session=sess) model.batch_size = 1 sent1 = [[3, -1, 2, 1, 0], [4, 5, 1, 0, 0], [2, 1, 0, 0, 0]] sent2 = [[2, 1, 0, 0, 0], [3, -1, 2, 1, 0], [4, 5, 1, 0, 0]] labels = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] sess.run(tf.global_variables_initializer()) # 迭代优化 for temp in range(300): loss, _, step = sess.run([model.loss_op, model.train_op, model.global_step], feed_dict={model.premises: sent1, model.hypotheses: sent2, model.labels: labels, model.lr: 0.001}) print(step, loss) sent1, sent2 = sent2, sent1
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
import tensorflow.contrib as contrib
# from app.decorator import exe_time
class MatchLstm:
    """Match-LSTM sentence-pair classifier built as a TensorFlow 1.x graph.

    The premise and the hypothesis are embedded, each encoded by its own
    LSTM, and then combined by a third "match" LSTM: at every hypothesis
    position an attention-weighted summary of the premise states is
    concatenated with the hypothesis state and fed to the match LSTM.  The
    final hidden state of the match LSTM goes through one fully connected
    layer to produce ``num_class`` logits.

    Token id conventions used by the graph:

    * ``0``  -- padding; ignored when sentence lengths are computed.
    * ``-1`` -- unknown word; its embedding is replaced by the mean of the
      embeddings found in a window around it.

    Public graph handles: ``premises``, ``hypotheses``, ``labels``, ``lr``,
    ``new_lr``, ``lr_update_op``, ``logits``, ``loss_op``, ``predict_op``,
    ``global_step`` and ``train_op``.

    :param vocab_size: number of rows of the embedding table.
    :param sentence_size: fixed (padded) length of every input sentence.
    :param embedding_size: embedding width; also used as the hidden size of
        all three LSTM cells.
    :param word_embedding: initial values for the non-trainable embedding
        table (passed to ``tf.constant_initializer``).
    :param initializer: initializer for the attention and output weights;
        ``None`` selects ``tf.truncated_normal_initializer(stddev=0.1)``.
    :param session: session stored on the instance; ``None`` creates a new
        ``tf.Session()``.
    :param num_class: number of output classes.
    :param window_size: number of neighbours on each side that are averaged
        to replace an unknown word.
    :param name: prefix for the variable scopes created by the model.
    :param initial_lr: initial value of the learning-rate variable.
    """

    def __init__(self, vocab_size, sentence_size, embedding_size,
                 word_embedding, initializer=None,
                 session=None, num_class=3,
                 window_size=4, name='MatchLstm', initial_lr=0.001):
        # Vocabulary size.
        self._vocab_size = vocab_size
        # Padded sentence length.
        self._sentence_size = sentence_size
        # Embedding width / LSTM hidden size.
        self._embedding_size = embedding_size
        # Initial values of the embedding table.
        self._we = word_embedding
        # Defaults are resolved here rather than in the signature so that
        # importing the module does not build TensorFlow objects (the old
        # ``session=tf.Session()`` default opened a session at import time).
        if initializer is None:
            initializer = tf.truncated_normal_initializer(stddev=0.1)
        self._initializer = initializer
        # Scope-name prefix.
        self._name = name
        # Number of output classes.
        self._num_class = num_class
        self._sess = session if session is not None else tf.Session()
        # Context window used for unknown words.
        self._window_size = window_size
        # Initial learning rate.
        self._initial_lr = initial_lr

        # Placeholders, learning-rate variable and embedded inputs.
        self._build_inputs_and_vars()
        # Encoders, match LSTM, classifier and loss.
        self._inference()
        # Global step and training op.
        self._initial_optimizer()

    def _build_inputs_and_vars(self):
        """Create placeholders, the learning-rate variable and embedded inputs."""
        # Premise token ids, one padded row per example.
        self.premises = tf.placeholder(shape=[None, self._sentence_size],
                                       dtype=tf.int32, name='premises')
        # Hypothesis token ids, one padded row per example.
        self.hypotheses = tf.placeholder(shape=[None, self._sentence_size],
                                         dtype=tf.int32, name='hypotheses')
        # One row of ``num_class`` label values per example.
        self.labels = tf.placeholder(shape=[None, self._num_class],
                                     dtype=tf.float32, name='labels')
        # Batch size is only known at run time.
        self._batch_size = tf.shape(self.premises)[0]

        # Learning rate lives in a non-trainable variable; run
        # ``lr_update_op`` with ``new_lr`` fed to change it.
        self.lr = tf.get_variable(shape=[], dtype=tf.float32, trainable=False,
                                  initializer=tf.constant_initializer(self._initial_lr),
                                  name='lr')
        self.new_lr = tf.placeholder(shape=[], dtype=tf.float32, name='new_lr')
        self.lr_update_op = tf.assign(self.lr, self.new_lr)

        with tf.variable_scope(self._name):
            # Frozen embedding table initialised from ``word_embedding``.
            self._word_embedding = tf.get_variable(
                name='word_embedding',
                shape=[self._vocab_size, self._embedding_size],
                initializer=tf.constant_initializer(self._we),
                trainable=False)

        # Embedded premise / hypothesis (unknown words already replaced by
        # their window average).
        self._embed_pre = self._embed_inputs(self.premises, self._word_embedding)
        self._embed_hyp = self._embed_inputs(self.hypotheses, self._word_embedding)

    def _inference(self):
        """Build the encoders, the match LSTM, the classifier and the loss."""
        with tf.variable_scope('{}_lstm_s'.format(self._name)):
            # LSTM over the premise.
            lstm_s = contrib.rnn.BasicLSTMCell(num_units=self._embedding_size,
                                               forget_bias=0.0)
            pre_length = self._length(self.premises)
            h_s, _ = tf.nn.dynamic_rnn(lstm_s, self._embed_pre,
                                       sequence_length=pre_length,
                                       dtype=tf.float32)
            self.h_s = h_s

        with tf.variable_scope('{}_lstm_t'.format(self._name)):
            # LSTM over the hypothesis.
            lstm_t = contrib.rnn.BasicLSTMCell(num_units=self._embedding_size,
                                               forget_bias=0.0)
            hyp_length = self._length(self.hypotheses)
            h_t, _ = tf.nn.dynamic_rnn(lstm_t, self._embed_hyp,
                                       sequence_length=hyp_length,
                                       dtype=tf.float32)
            self.h_t = h_t

        # Match LSTM cell, stepped manually inside ``_match_attention``.
        self.lstm_m = contrib.rnn.BasicLSTMCell(num_units=self._embedding_size,
                                                forget_bias=0.0)

        # Collect one final match-LSTM hidden state per example.
        h_m_arr = tf.TensorArray(dtype=tf.float32, size=self._batch_size)
        i = tf.constant(0)
        res = tf.while_loop(cond=lambda x, y: tf.less(x, self._batch_size),
                            body=self._match_sent,
                            loop_vars=(i, h_m_arr))
        # Each stored state is [1, embedding_size]; drop that axis of size 1.
        self.h_m_tensor = tf.squeeze(res[-1].stack(), axis=[1])

        # Fully connected layer mapping the final state to class logits.
        with tf.variable_scope('{}_fully_connect'.format(self._name)):
            w_fc = tf.get_variable(shape=[self._embedding_size, self._num_class],
                                   initializer=self._initializer, name='w_fc')
            b_fc = tf.get_variable(shape=[self._num_class],
                                   initializer=self._initializer, name='b_fc')
            self.logits = tf.matmul(self.h_m_tensor, w_fc) + b_fc

        # Per-example softmax cross entropy, averaged over the batch.
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=self.labels, logits=self.logits, name='cross_entropy')
        cross_entropy_sum = tf.reduce_sum(cross_entropy, name='cross_entropy_sum')
        self.loss_op = cross_entropy_sum / tf.cast(self._batch_size, dtype=tf.float32)
        # Index of the largest logit for every example.
        self.predict_op = tf.argmax(self.logits, axis=1)

    def _match_sent(self, i, h_m_arr):
        """Outer-loop body: run the match LSTM over example ``i``.

        :param i: scalar int tensor, index of the example in the batch.
        :param h_m_arr: ``TensorArray`` collecting the final hidden states.
        :return: ``(i + 1, updated h_m_arr)``.
        """
        h_s_i = self.h_s[i]
        h_t_i = self.h_t[i]
        # Unpadded lengths of this premise / hypothesis.
        length_s_i = self._length(self.premises[i])
        length_t_i = self._length(self.hypotheses[i])
        state = self.lstm_m.zero_state(batch_size=1, dtype=tf.float32)

        # Step through the hypothesis positions 0 .. length_t_i - 1.
        k = tf.constant(0)
        res = tf.while_loop(
            cond=lambda a, x, y, z, s: tf.less(a, length_t_i),
            body=self._match_attention,
            loop_vars=(k, h_s_i, h_t_i, length_s_i, state))

        # Only the hidden state after the last step is kept.
        final_state_h = res[-1].h
        h_m_arr = h_m_arr.write(i, final_state_h)
        return tf.add(i, 1), h_m_arr

    def _match_attention(self, k, h_s, h_t, length_s, state):
        """Inner-loop body: one attention + match-LSTM step.

        :param k: scalar int tensor, current hypothesis position.
        :param h_s: premise LSTM outputs of one example.
        :param h_t: hypothesis LSTM outputs of one example.
        :param length_s: scalar int tensor, unpadded premise length.
        :param state: current ``LSTMStateTuple`` of the match LSTM.
        :return: ``(k + 1, h_s, h_t, length_s, new_state)``.
        """
        # Hypothesis state at position k as a [1, embedding_size] row.
        h_t_k = tf.reshape(h_t[k], [1, -1])
        # Premise states without the padded tail: [length_s, embedding_size].
        h_s_j = tf.slice(h_s, begin=[0, 0], size=[length_s, self._embedding_size])

        with tf.variable_scope('{}_attention_w'.format(self._name)):
            w_s = tf.get_variable(shape=[self._embedding_size, self._embedding_size],
                                  initializer=self._initializer, name='w_s')
            w_t = tf.get_variable(shape=[self._embedding_size, self._embedding_size],
                                  initializer=self._initializer, name='w_t')
            w_m = tf.get_variable(shape=[self._embedding_size, self._embedding_size],
                                  initializer=self._initializer, name='w_m')
            w_e = tf.get_variable(shape=[self._embedding_size, 1],
                                  initializer=self._initializer, name='w_e')

        last_m_h = state.h
        # Score every premise position against the current hypothesis state
        # and the previous match state (the two [1, E] rows broadcast).
        sum_h = (tf.matmul(h_s_j, w_s)
                 + tf.matmul(h_t_k, w_t)
                 + tf.matmul(last_m_h, w_m))
        # Unnormalised scores, one per premise position: [length_s, 1].
        e_kj = tf.matmul(tf.tanh(sum_h), w_e)
        # Normalise ACROSS premise positions.  softmax works on the last
        # axis, so the scores are laid out as a [1, length_s] row first;
        # applying it to the [length_s, 1] column would normalise each
        # single score on its own and yield a weight of 1.0 everywhere.
        a_kj = tf.nn.softmax(tf.transpose(e_kj))
        # Attention-weighted premise summary: [1, embedding_size].
        alpha_k = tf.matmul(a_kj, h_s_j)
        alpha_k.set_shape([1, self._embedding_size])

        # Match-LSTM input: premise summary concatenated with h_t_k.
        m_k = tf.concat([alpha_k, h_t_k], axis=1)
        with tf.variable_scope('{}_lstm_m'.format(self._name)):
            _, new_state = self.lstm_m(inputs=m_k, state=state)

        return tf.add(k, 1), h_s, h_t, length_s, new_state

    def _embed_inputs(self, inputs, embeddings):
        """Embed a batch of token-id rows.

        :param inputs: int tensor of token ids, one padded row per example.
        :param embeddings: embedding table.
        :return: float tensor reshaped to
            ``[-1, sentence_size, embedding_size]``.
        """
        ndim0_tensor_arr = tf.TensorArray(dtype=tf.float32, size=self._batch_size)
        i = tf.constant(0)
        # Embed one example per iteration until the batch is exhausted.
        res = tf.while_loop(
            cond=lambda x, y, z, n: tf.less(x, self._batch_size),
            body=self._embed_line,
            loop_vars=(i, inputs, embeddings, ndim0_tensor_arr))
        ndim0_tensor = res[-1].stack()
        return tf.reshape(ndim0_tensor,
                          [-1, self._sentence_size, self._embedding_size])

    def _embed_line(self, i, inputs, embeddings, ndim0_tensor_arr):
        """Loop body of ``_embed_inputs``: embed example ``i``.

        :return: ``(i + 1, inputs, embeddings, updated ndim0_tensor_arr)``.
        """
        unk_word = tf.constant(-1)
        ndim1_list = []

        # Pass 1: look every token up; unknown tokens (-1) get a zero vector
        # so that the table is never indexed with -1.
        for j in range(self._sentence_size):
            word = inputs[i][j]
            # Loop variables are bound as defaults so each branch function
            # keeps its own value regardless of when it is invoked.
            lookup = lambda word=word: tf.squeeze(
                tf.nn.embedding_lookup(params=embeddings, ids=word))
            zeros = lambda: tf.zeros(shape=[self._embedding_size])
            ndim1_list.append(
                tf.case([(tf.not_equal(word, unk_word), lookup)], default=zeros))

        # Pass 2: replace every unknown token by the mean of its window.
        # Entries are replaced in place, so a later unknown word averages
        # over vectors that this pass has already rewritten.
        for j in range(self._sentence_size):
            word = inputs[i][j]
            averaged = lambda j=j: self._ave_vec(ndim1_list, j)
            unchanged = lambda j=j: ndim1_list[j]
            ndim1_list[j] = tf.case(
                [(tf.not_equal(word, unk_word), unchanged)], default=averaged)

        # Stack the per-token vectors into one sentence tensor.
        ndim1_tensor = tf.stack(ndim1_list)
        ndim0_tensor_arr = ndim0_tensor_arr.write(i, ndim1_tensor)
        return tf.add(i, 1), inputs, embeddings, ndim0_tensor_arr

    def _ave_vec(self, embed_list, cur_pos):
        """Return the mean embedding of the words around ``cur_pos``.

        Up to ``window_size`` neighbours on each side are used; the word at
        ``cur_pos`` itself is excluded.

        :param embed_list: list of per-token embedding tensors of a sentence.
        :param cur_pos: Python int, position of the unknown word.
        :return: tensor holding the element-wise mean of the neighbours, or
            a zero vector when the word has no neighbours at all.
        """
        left_pos = max(0, cur_pos - self._window_size)
        # Slicing clips at the end of the list, so no upper clamp is needed.
        right_end = cur_pos + self._window_size + 1
        e_list = embed_list[left_pos:cur_pos] + embed_list[cur_pos + 1:right_end]
        if not e_list:
            # Single-token sentence: nothing to average over.
            return tf.zeros(shape=[self._embedding_size])
        return tf.reduce_mean(tf.stack(e_list), axis=0)

    @staticmethod
    def _length(sequence):
        """Count the non-zero entries of ``sequence`` along its last axis.

        Zero is the padding id; every other id (including -1) counts.
        """
        mask = tf.sign(tf.abs(sequence))
        return tf.reduce_sum(mask, axis=-1)

    def _initial_optimizer(self):
        """Create the global step, the Adam optimizer and the training op."""
        with tf.variable_scope('{}_step'.format(self._name)):
            # A step counter is bookkeeping, not a model parameter, so it is
            # kept out of the trainable-variables collection.
            self.global_step = tf.get_variable(shape=[],
                                               initializer=tf.constant_initializer(0),
                                               dtype=tf.int32,
                                               trainable=False,
                                               name='global_step')
        # Adam reads the learning rate from the ``lr`` variable.
        self._optimizer = tf.train.AdamOptimizer(learning_rate=self.lr,
                                                 beta1=0.9, beta2=0.999)
        # Minimise the loss; each run also increments ``global_step``.
        self.train_op = self._optimizer.minimize(self.loss_op,
                                                 global_step=self.global_step)
if __name__ == '__main__':
    # Smoke test: train the model for 300 steps on three toy sentence pairs.
    with tf.Session() as sess:
        vocab_size, sentence_size, embedding_size = 7, 5, 6
        # The embedding table needs one row per vocabulary id (the sentences
        # below use ids up to 5), so it is sized from vocab_size rather than
        # hard-coded to fewer rows.
        embedding = np.random.randn(vocab_size, embedding_size)
        # Id 0 is used as padding in the sentences below: all-zero embedding.
        embedding[0] = 0.0
        model = MatchLstm(vocab_size=vocab_size, sentence_size=sentence_size,
                          embedding_size=embedding_size,
                          word_embedding=embedding, session=sess)

        # Token ids, zero-padded to sentence_size; -1 marks an unknown word.
        sent1 = [[3, -1, 2, 1, 0],
                 [4, 5, 1, 0, 0],
                 [2, 1, 0, 0, 0]]
        sent2 = [[2, 1, 0, 0, 0],
                 [3, -1, 2, 1, 0],
                 [4, 5, 1, 0, 0]]
        # One-hot labels, one row per sentence pair.
        labels = [[1, 0, 0],
                  [0, 1, 0],
                  [0, 0, 1]]

        sess.run(tf.global_variables_initializer())
        # Optimisation loop.
        for _ in range(300):
            loss, _, step = sess.run(
                [model.loss_op, model.train_op, model.global_step],
                feed_dict={model.premises: sent1, model.hypotheses: sent2,
                           model.labels: labels, model.lr: 0.001})
            print(step, loss)
            # Alternate which set plays premise and which plays hypothesis.
            sent1, sent2 = sent2, sent1