I. Preface
1. Skip-Thought Vectors: https://github.com/ryankiros/skip-thoughts
2. This article assumes the reader is already familiar with Skip-Gram word vectors and the basics of RNNs.
3. Quick-Thought paper: Lajanugen Logeswaran, Honglak Lee, An efficient framework for learning sentence representations. In ICLR, 2018.
II. Hands-On
1. Preprocess the data: split it into sentences (dropping sentences that are too short), remove sentences that occur too frequently, and tokenize.
import os
import re
from collections import Counter

import jieba


def fenju(data):
    """Split each document on '。', keeping only sentences longer than 10 characters."""
    sentence = []
    for i in range(len(data)):
        try:
            m = re.findall('。', data[i][0])
            if data[i][1] is not None and len(m) > 0:
                if len(m) > 1:
                    content = data[i][0].split('。')
                    for c in range(len(content)):
                        if len(content[c]) > 10:
                            sentence.append(content[c] + '。')
                elif len(data[i][0]) > 10:
                    sentence.append(data[i][0])
            else:
                continue
        except Exception:
            continue
    return sentence


def _process_sentence_list(sentence_list, threshold=0.01):
    """Drop sentences whose relative frequency in the corpus reaches `threshold`."""
    sentence_count = Counter(sentence_list)
    total_count = len(sentence_list)
    # Relative frequency of each sentence.
    sentence_freqs = {w: c / total_count for w, c in sentence_count.items()}
    # Discard sentences that appear too frequently (boilerplate, duplicates).
    sentence = []
    for w in range(len(sentence_list)):
        if sentence_freqs[sentence_list[w]] < threshold:
            sentence.append(sentence_list[w])
    return sentence


def fenci(alltext, writefile, filename):
    """Tokenize each sentence with jieba and write one space-separated sentence per line."""
    if not os.path.exists(writefile):
        os.makedirs(writefile)
    sentence = [' '.join(jieba.lcut(''.join(text.split()))) for text in alltext]
    with open(os.path.join(writefile, filename), "w") as fw:
        fw.write("\n".join(sentence))
2. Build the vocab and the TFRecord files (see the GitHub code for details).
3. Define the model inputs (three modes: train / eval / encode).
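A hedged sketch of the serialization step, assuming each tokenized sentence is written as a tf.Example carrying an int64 list of token ids. The feature key "features" is an assumption; it must match whatever key input_ops.parse_example_batch in the repo actually parses.

import tensorflow as tf

def sentence_to_example(token_ids):
    # token_ids: list of vocab ids for one sentence.
    return tf.train.Example(features=tf.train.Features(feature={
        "features": tf.train.Feature(int64_list=tf.train.Int64List(value=token_ids)),
    }))

with tf.python_io.TFRecordWriter("train-00000-of-00001.tfrecord") as writer:
    for ids in [[4, 12, 7, 2], [9, 3, 2]]:   # toy id sequences; real ids come from the vocab
        writer.write(sentence_to_example(ids).SerializeToString())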
def build_inputs(self):
    if self.mode == "encode":
        encode_ids = tf.placeholder(tf.int64, (None, None), name="encode_ids")
        encode_mask = tf.placeholder(tf.int8, (None, None), name="encode_mask")
    else:
        # Prefetch serialized tf.Example protos.
        input_queue = input_ops.prefetch_input_data(
            self.reader,
            FLAGS.input_file_pattern,
            shuffle=FLAGS.shuffle_input_data,
            capacity=FLAGS.input_queue_capacity,
            num_reader_threads=FLAGS.num_input_reader_threads)
        # Deserialize a batch.
        serialized = input_queue.dequeue_many(FLAGS.batch_size)
        encode = input_ops.parse_example_batch(serialized)
        encode_ids = encode.ids
        encode_mask = encode.mask
    self.encode_ids = encode_ids
    self.encode_mask = encode_mask
Because the sentences in each batch are padded to the same length, a mask (encode_mask) recording each sentence's original length is passed along to the RNN so that the padded positions do not affect training.
4. Embed the input sentences.
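A toy illustration (not from the repo) of how encode_ids and encode_mask line up, and how the mask recovers each sentence's true length, mirroring the length computation used later in rnn():

import tensorflow as tf

# Two sentences padded to length 4; 0 is the padding id.
encode_ids = tf.constant([[4, 12, 7, 2],
                          [9,  3, 0, 0]], dtype=tf.int64)
encode_mask = tf.constant([[1, 1, 1, 1],
                           [1, 1, 0, 0]], dtype=tf.int8)

# Summing the mask per row gives the true lengths [4, 2]; the encoder passes this
# to dynamic_rnn as sequence_length, so padded steps never influence the final state.
length = tf.reduce_sum(tf.cast(encode_mask, tf.int32), 1)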
def build_word_embeddings(self):
    rand_init = self.uniform_initializer
    self.word_embeddings = []
    self.encode_emb = []
    self.init = None
    for v in self.config.vocab_configs:
        if v.mode == 'fixed':
            if self.mode == "train":
                word_emb = tf.get_variable(
                    name=v.name,
                    shape=[v.size, v.dim],
                    trainable=False)
                embedding_placeholder = tf.placeholder(
                    tf.float32, [v.size, v.dim])
                embedding_init = word_emb.assign(embedding_placeholder)

                rand = np.random.rand(1, v.dim)
                word_vecs = np.load(v.embs_file)
                load_vocab_size = word_vecs.shape[0]
                assert(load_vocab_size == v.size - 1)
                word_init = np.concatenate((rand, word_vecs), axis=0)
                self.init = (embedding_init, embedding_placeholder, word_init)
            else:
                word_emb = tf.get_variable(
                    name=v.name,
                    shape=[v.size, v.dim])

            encode_emb = tf.nn.embedding_lookup(word_emb, self.encode_ids)
            self.word_emb = word_emb
            # The same fixed embedding feeds both the input and the output encoder.
            self.encode_emb.extend([encode_emb, encode_emb])

        if v.mode == 'trained':
            for inout in ["", "_out"]:
                word_emb = tf.get_variable(
                    name=v.name + inout,
                    shape=[v.size, v.dim],
                    initializer=rand_init)
                if self.mode == 'train':
                    self.word_embeddings.append(word_emb)

                encode_emb = tf.nn.embedding_lookup(word_emb, self.encode_ids)
                self.encode_emb.append(encode_emb)

        if v.mode == 'expand':
            for inout in ["", "_out"]:
                encode_emb = tf.placeholder(tf.float32, (
                    None, None, v.dim), v.name + inout)
                self.encode_emb.append(encode_emb)
                word_emb_dict = read_vocab_embs(v.vocab_file + inout + ".txt",
                                                v.embs_file + inout + ".npy")
                self.word_embeddings.append(word_emb_dict)

        if v.mode != 'expand' and self.mode == 'encode':
            word_emb_dict = read_vocab(v.vocab_file)
            self.word_embeddings.extend([word_emb_dict, word_emb_dict])
Each token id in a sentence is looked up in a [vocab_size, dim] embedding matrix and mapped to a dim-dimensional vector. v.mode has three settings: fixed (use pre-trained embeddings), trained (learn the embeddings from scratch), and expand (feed embeddings in through placeholders). The final output has the form [encode_emb, encode_emb], one copy for the input encoder and one for the output encoder, which is what lets the model relate a sentence to its neighbours.
5. Build the encoder
The encoder encodes each sentence and takes the final hidden state as its representation; a single-layer LSTM, a bidirectional LSTM, or a bidirectional GRU can be used here.
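A minimal sketch of the 'trained' branch with toy sizes: an id batch of shape [batch, time] becomes a float tensor of shape [batch, time, dim] after the lookup.

import tensorflow as tf

vocab_size, dim = 1000, 128   # toy values; the real sizes come from v.size and v.dim
word_emb = tf.get_variable("word_emb_demo", shape=[vocab_size, dim],
                           initializer=tf.random_uniform_initializer(-0.1, 0.1))
encode_ids = tf.constant([[4, 12, 7, 2],
                          [9,  3, 0, 0]], dtype=tf.int64)
encode_emb = tf.nn.embedding_lookup(word_emb, encode_ids)   # shape: [2, 4, 128]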
def _initialize_cell(self, num_units, cell_type="GRU"):
    if cell_type == "GRU":
        return tf.contrib.rnn.GRUCell(num_units=num_units)
    elif cell_type == "LSTM":
        return tf.contrib.rnn.LSTMCell(num_units=num_units)
    else:
        raise ValueError("Invalid cell type")

def rnn(self, word_embs, mask, scope, encoder_dim, cell_type="GRU"):
    length = tf.to_int32(tf.reduce_sum(mask, 1), name="length")
    if self.config.bidir:
        if encoder_dim % 2:
            raise ValueError(
                "encoder_dim must be even when using a bidirectional encoder.")
        num_units = encoder_dim // 2
        cell_fw = self._initialize_cell(num_units, cell_type=cell_type)
        cell_bw = self._initialize_cell(num_units, cell_type=cell_type)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=word_embs,
            sequence_length=length,
            dtype=tf.float32,
            scope=scope)
        if cell_type == "LSTM":
            states = [states[0][1], states[1][1]]
        state = tf.concat(states, 1)
    else:
        cell = self._initialize_cell(encoder_dim, cell_type=cell_type)
        outputs, state = tf.nn.dynamic_rnn(
            cell=cell,
            inputs=word_embs,
            sequence_length=length,
            dtype=tf.float32,
            scope=scope)
        if cell_type == "LSTM":
            state = state[1]
    return state
def build_encoder(self):
    """Builds the sentence encoder.

    Inputs:
      self.encode_emb
      self.encode_mask

    Outputs:
      self.thought_vectors

    Raises:
      ValueError: if config.bidirectional_encoder is True and config.encoder_dim
        is odd.
    """
    names = ["", "_out"]
    self.thought_vectors = []
    for i in range(2):
        with tf.variable_scope("encoder" + names[i]) as scope:
            if self.config.encoder == "gru":
                sent_rep = self.rnn(self.encode_emb[i], self.encode_mask, scope,
                                    self.config.encoder_dim, cell_type="GRU")
            elif self.config.encoder == "lstm":
                sent_rep = self.rnn(self.encode_emb[i], self.encode_mask, scope,
                                    self.config.encoder_dim, cell_type="LSTM")
            elif self.config.encoder == 'bow':
                sent_rep = self.bow(self.encode_emb[i], self.encode_mask)
            else:
                raise ValueError("Invalid encoder")
            thought_vectors = tf.identity(sent_rep, name="thought_vectors")
            self.thought_vectors.append(thought_vectors)
As shown, the two copies in [encode_emb, encode_emb] are encoded separately (under the "encoder" and "encoder_out" scopes), giving [thought_vectors, thought_vectors].
6. Build the loss function
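At inference time ("encode" mode) the sentence vectors can be fetched by feeding the placeholders defined in build_inputs. This is only a hedged sketch: the tensor names below are assumptions derived from the scopes in the code above, and building/restoring the trained graph is elided.

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    # ... construct the model in "encode" mode and restore the trained weights here ...
    vecs = sess.run("encoder/thought_vectors:0", feed_dict={
        "encode_ids:0":  np.array([[4, 12, 7, 2]], dtype=np.int64),
        "encode_mask:0": np.array([[1, 1, 1, 1]], dtype=np.int8),
    })
    # vecs has shape [1, encoder_dim]: one sentence vector per input sentence.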
def build_loss(self):
    """Builds the loss Tensor.

    Outputs:
      self.total_loss
    """
    all_sen_embs = self.thought_vectors

    if FLAGS.dropout:
        mask_shp = [1, self.config.encoder_dim]
        bin_mask = tf.random_uniform(mask_shp) > FLAGS.dropout_rate
        bin_mask = tf.where(bin_mask, tf.ones(mask_shp), tf.zeros(mask_shp))
        src = all_sen_embs[0] * bin_mask
        dst = all_sen_embs[1] * bin_mask
        scores = tf.matmul(src, dst, transpose_b=True)
    else:
        # Score every sentence against every other sentence in the batch (pre / current / post).
        scores = tf.matmul(all_sen_embs[0], all_sen_embs[1], transpose_b=True)

    # Ignore source sentence
    scores = tf.matrix_set_diag(scores, np.zeros(FLAGS.batch_size))

    # Targets
    targets_np = np.zeros((FLAGS.batch_size, FLAGS.batch_size))
    ctxt_sent_pos = list(range(-FLAGS.context_size, FLAGS.context_size + 1))
    ctxt_sent_pos.remove(0)
    for ctxt_pos in ctxt_sent_pos:
        targets_np += np.eye(FLAGS.batch_size, k=ctxt_pos)
    targets_np_sum = np.sum(targets_np, axis=1, keepdims=True)
    targets_np = targets_np / targets_np_sum
    targets = tf.constant(targets_np, dtype=tf.float32)

    # Forward and backward scores
    f_scores = scores[:-1]
    b_scores = scores[1:]

    losses = tf.nn.softmax_cross_entropy_with_logits(
        labels=targets, logits=scores)
    loss = tf.reduce_mean(losses)

    tf.summary.scalar("losses/ent_loss", loss)
    self.total_loss = loss

    if self.mode == "eval":
        f_max = tf.to_int64(tf.argmax(f_scores, axis=1))
        b_max = tf.to_int64(tf.argmax(b_scores, axis=1))

        targets = range(FLAGS.batch_size - 1)
        targets = tf.constant(list(targets), dtype=tf.int64)
        fwd_targets = targets + 1
        names_to_values, names_to_updates = tf.contrib.slim.metrics.aggregate_metric_map({
            "Acc/Fwd Acc": tf.contrib.slim.metrics.streaming_accuracy(f_max, fwd_targets),
            "Acc/Bwd Acc": tf.contrib.slim.metrics.streaming_accuracy(b_max, targets)
        })
        for name, value in names_to_values.items():
            tf.summary.scalar(name, value)
        self.eval_op = names_to_updates.values()
Figure: illustration of the loss function.
tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=scores) computes the cross-entropy against these targets. As the targets show, the quick_thought idea is to identify a sentence's context from pairwise similarity scores; in my view it does not really learn features of the target sentence itself. When I used sentence vectors trained with quick_thought for multi-class classification, the results were not very good (the quick_thought evaluation does include a movie-review sentiment classification example).
The code reproducing the paper (in English): https://github.com/lajanugen/S2V
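A small worked example of how the target matrix is built, for batch_size = 4 and context_size = 1: each row spreads its probability mass evenly over the neighbouring sentences, so the softmax cross-entropy only rewards scoring a sentence's true context highly.

import numpy as np

batch_size, context_size = 4, 1
targets_np = np.zeros((batch_size, batch_size))
for k in range(-context_size, context_size + 1):
    if k != 0:
        targets_np += np.eye(batch_size, k=k)
targets_np /= targets_np.sum(axis=1, keepdims=True)
print(targets_np)
# [[0.   1.   0.   0. ]
#  [0.5  0.   0.5  0. ]
#  [0.   0.5  0.   0.5]
#  [0.   0.   1.   0. ]]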