A Sentence2Vec Neural Network Implementation Based on Quick-Thought Vectors

I. Preface

1. Skip-Thought Vectors paper and code: https://github.com/ryankiros/skip-thoughts

2. This article assumes the reader is already familiar with Skip-Gram vectors and the basics of RNNs.

3. Quick-Thought paper: Lajanugen Logeswaran, Honglak Lee, An efficient framework for learning sentence representations. In ICLR, 2018.

II. Implementation

1. Split the raw data into sentences (discarding sentences that are too short), remove sentences that occur too frequently, and tokenize

import os
import re
from collections import Counter

import jieba

def fenju(data):
    """Split each document on '。' and keep sentences longer than 10 characters."""
    sentence = []
    for i in range(len(data)):
        try:
            m = re.findall('。', data[i][0])
            if data[i][1] is not None and len(m) > 0:
                if len(m) > 1:
                    content = data[i][0].split('。')
                    for c in range(len(content)):
                        if len(content[c]) > 10:
                            sentence.append(content[c] + '。')
                elif len(data[i][0]) > 10:
                    sentence.append(data[i][0])
            else:
                continue
        except:
            continue
    return sentence

def _process_sentence_list(sentence_list, threshold=0.01):
    """Drop sentences whose relative frequency in the corpus is above the threshold."""
    sentence_count = Counter(sentence_list)
    total_count = len(sentence_list)
    # Compute the frequency of each sentence.
    sentence_freqs = {w: c / total_count for w, c in sentence_count.items()}
    # Discard sentences that occur too frequently.
    sentence = []
    for w in range(len(sentence_list)):
        if sentence_freqs[sentence_list[w]] < threshold:
            sentence.append(sentence_list[w])
        else:
            continue
    return sentence

def fenci(alltext, writefile, filename):
    """Tokenize each sentence with jieba and write one space-separated sentence per line."""
    if not os.path.exists(writefile):
        os.makedirs(writefile)
    sentence = [' '.join(jieba.lcut(''.join(text.split()))) for text in alltext]
    with open(os.path.join(writefile, filename), "w") as fw:
        fw.write("\n".join(sentence))
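
A hypothetical usage sketch of the three helpers above (the toy data, the output path, and the explicit threshold are made up for illustration and are not from the original post; on a real corpus the default threshold of 0.01 would be used):

# Hypothetical end-to-end run of the preprocessing helpers.
raw_data = [
    ("今天天气很好,我们一起去公园散步,顺便看看春天的花。回来的路上买了一些新鲜的水果。", "news"),
    ("这一段文本只有一个长句子,不会被切分但仍然会被保留下来。", "news"),
]
sentences = fenju(raw_data)                                   # split on '。', keep sentences longer than 10 chars
sentences = _process_sentence_list(sentences, threshold=1.0)  # drop overly frequent sentences (toy threshold)
fenci(sentences, "data/processed", "corpus.txt")              # jieba-tokenize and write one sentence per line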

2. Build the vocabulary and the TFRecord files (see the GitHub code for details; a rough sketch is given below)
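The repo's build scripts handle this step. Just to give a rough idea, the sketch below shows one way tokenized sentences could be mapped to ids with a vocabulary and written as tf.Example records into a TFRecord file; the function name, the feature key "ids", and the unk_id default are illustrative assumptions, not necessarily the repo's actual serialization format.

import tensorflow as tf

def write_tfrecord(token_lines, vocab, out_path, unk_id=1):
    """Serialize whitespace-tokenized sentences as tf.Example protos into a TFRecord file."""
    writer = tf.python_io.TFRecordWriter(out_path)
    for line in token_lines:
        # Map each token to its vocabulary id, falling back to the unknown id.
        ids = [vocab.get(tok, unk_id) for tok in line.split()]
        feature = {"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=ids))}
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())
    writer.close()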

3. Defining the model inputs (three modes: train / eval / encode)

def build_inputs(self):
  if self.mode == "encode":
    # At encode time the ids and mask are fed in directly via placeholders.
    encode_ids = tf.placeholder(tf.int64, (None, None), name="encode_ids")
    encode_mask = tf.placeholder(tf.int8, (None, None), name="encode_mask")
  else:
    # Prefetch serialized tf.Example protos.
    input_queue = input_ops.prefetch_input_data(
        self.reader,
        FLAGS.input_file_pattern,
        shuffle=FLAGS.shuffle_input_data,
        capacity=FLAGS.input_queue_capacity,
        num_reader_threads=FLAGS.num_input_reader_threads)
    # Deserialize a batch.
    serialized = input_queue.dequeue_many(FLAGS.batch_size)
    encode = input_ops.parse_example_batch(serialized)
    encode_ids = encode.ids
    encode_mask = encode.mask
  self.encode_ids = encode_ids
  self.encode_mask = encode_mask

Since the sentences in each batch are padded to the same length, we pass a mask (encode_mask) to the RNN so that the padding does not affect training; summing each row of the mask recovers each sentence's original length.
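For example, a batch of two sentences of lengths 3 and 5, padded to length 5, would look roughly like this (a schematic with made-up ids, not code from the repo):

# encode_ids: padded token ids (0 used as padding here)
# encode_mask: 1 for real tokens, 0 for padding; the row sums give the true lengths
encode_ids  = [[12, 7, 45, 0, 0],
               [ 3, 9, 21, 8, 16]]
encode_mask = [[ 1, 1,  1, 0, 0],
               [ 1, 1,  1, 1, 1]]   # row sums: 3 and 5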

4. Embedding the input sentences

def build_word_embeddings(self):
  rand_init = self.uniform_initializer
  self.word_embeddings = []
  self.encode_emb = []
  self.init = None
  for v in self.config.vocab_configs:
    if v.mode == 'fixed':
      if self.mode == "train":
        word_emb = tf.get_variable(
            name=v.name,
            shape=[v.size, v.dim],
            trainable=False)
        embedding_placeholder = tf.placeholder(
            tf.float32, [v.size, v.dim])
        embedding_init = word_emb.assign(embedding_placeholder)

        # Row 0 is initialized randomly; the remaining rows come from the
        # pre-trained embedding file.
        rand = np.random.rand(1, v.dim)
        word_vecs = np.load(v.embs_file)
        load_vocab_size = word_vecs.shape[0]
        assert(load_vocab_size == v.size - 1)
        word_init = np.concatenate((rand, word_vecs), axis=0)
        self.init = (embedding_init, embedding_placeholder, word_init)

      else:
        word_emb = tf.get_variable(
            name=v.name,
            shape=[v.size, v.dim])

      encode_emb = tf.nn.embedding_lookup(word_emb, self.encode_ids)
      self.word_emb = word_emb
      # The same embedding is used for both the input and the output encoder.
      self.encode_emb.extend([encode_emb, encode_emb])

    if v.mode == 'trained':
      for inout in ["", "_out"]:
        word_emb = tf.get_variable(
            name=v.name + inout,
            shape=[v.size, v.dim],
            initializer=rand_init)
        if self.mode == 'train':
          self.word_embeddings.append(word_emb)

        encode_emb = tf.nn.embedding_lookup(word_emb, self.encode_ids)
        self.encode_emb.append(encode_emb)

    if v.mode == 'expand':
      for inout in ["", "_out"]:
        encode_emb = tf.placeholder(tf.float32, (
            None, None, v.dim), v.name + inout)
        self.encode_emb.append(encode_emb)
        word_emb_dict = read_vocab_embs(v.vocab_file + inout + ".txt",
            v.embs_file + inout + ".npy")
        self.word_embeddings.append(word_emb_dict)

    if v.mode != 'expand' and self.mode == 'encode':
      word_emb_dict = read_vocab(v.vocab_file)
      self.word_embeddings.extend([word_emb_dict, word_emb_dict])

Each token id in a sentence is mapped to a v.dim-dimensional vector by the embedding lookup. v.mode has three settings: fixed (use pre-trained embeddings), trained (learn the embeddings during training), and expand (expand the vocabulary with externally provided embeddings). The final output is the pair [encode_emb, encode_emb], which the two encoders use to relate each sentence to its neighbours.
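In 'fixed' mode the pre-trained vectors are not part of the checkpoint, so they have to be pushed into the graph once through the assign op stored in self.init. A minimal sketch of what that could look like at the start of training (sess and model are assumed to be an active tf.Session and the built model object; this is not code from the repo):

# Feed the pre-trained matrix (word_init) into the non-trainable embedding variable.
embedding_init, embedding_placeholder, word_init = model.init
sess.run(embedding_init, feed_dict={embedding_placeholder: word_init})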

5. Building the encoder

The encoder encodes each sentence and uses the final hidden state as the sentence representation; a single-layer GRU or LSTM, or a bidirectional GRU/LSTM, can be used here.

def _initialize_cell(self, num_units, cell_type="GRU"):
  if cell_type == "GRU":
    return tf.contrib.rnn.GRUCell(num_units=num_units)
  elif cell_type == "LSTM":
    return tf.contrib.rnn.LSTMCell(num_units=num_units)
  else:
    raise ValueError("Invalid cell type")

def rnn(self, word_embs, mask, scope, encoder_dim, cell_type="GRU"):
  # Each sentence's true length is the row sum of its padding mask.
  length = tf.to_int32(tf.reduce_sum(mask, 1), name="length")
  if self.config.bidir:
    if encoder_dim % 2:
      raise ValueError(
          "encoder_dim must be even when using a bidirectional encoder.")
    num_units = encoder_dim // 2
    cell_fw = self._initialize_cell(num_units, cell_type=cell_type)
    cell_bw = self._initialize_cell(num_units, cell_type=cell_type)
    outputs, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw,
        cell_bw=cell_bw,
        inputs=word_embs,
        sequence_length=length,
        dtype=tf.float32,
        scope=scope)
    if cell_type == "LSTM":
      # Keep only the hidden state h from each direction's (c, h) state tuple.
      states = [states[0][1], states[1][1]]
    state = tf.concat(states, 1)
  else:
    cell = self._initialize_cell(encoder_dim, cell_type=cell_type)
    outputs, state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=word_embs,
        sequence_length=length,
        dtype=tf.float32,
        scope=scope)
    if cell_type == "LSTM":
      state = state[1]
  return state

def build_encoder(self):
  """Builds the sentence encoder.

  Inputs:
    self.encode_emb
    self.encode_mask

  Outputs:
    self.thought_vectors

  Raises:
    ValueError: if config.bidirectional_encoder is True and config.encoder_dim
      is odd.
  """
  names = ["", "_out"]
  self.thought_vectors = []
  for i in range(2):
    with tf.variable_scope("encoder" + names[i]) as scope:
      if self.config.encoder == "gru":
        sent_rep = self.rnn(self.encode_emb[i], self.encode_mask, scope,
                            self.config.encoder_dim, cell_type="GRU")
      elif self.config.encoder == "lstm":
        sent_rep = self.rnn(self.encode_emb[i], self.encode_mask, scope,
                            self.config.encoder_dim, cell_type="LSTM")
      elif self.config.encoder == 'bow':
        sent_rep = self.bow(self.encode_emb[i], self.encode_mask)
      else:
        raise ValueError("Invalid encoder")

      thought_vectors = tf.identity(sent_rep, name="thought_vectors")
      self.thought_vectors.append(thought_vectors)

As shown above, the two copies in [encode_emb, encode_emb] are encoded separately, under the "encoder" and "encoder_out" variable scopes, yielding the pair [thought_vectors, thought_vectors].
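In 'encode' mode, new sentences can then be turned into vectors by feeding the two placeholders from build_inputs and fetching a thought vector. A rough sketch, assuming a 'trained' or 'fixed' vocabulary configuration (for 'expand' mode the embeddings themselves are fed instead); sess, model, and the padded ids/mask are assumptions, not code from the repo:

# encode_ids / encode_mask prepared as in the padding example in step 3.
feed = {model.encode_ids: encode_ids, model.encode_mask: encode_mask}
sentence_vectors = sess.run(model.thought_vectors[0], feed_dict=feed)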

6. Building the loss function

def build_loss(self):
  """Builds the loss Tensor.

  Outputs:
    self.total_loss
  """
  all_sen_embs = self.thought_vectors

  if FLAGS.dropout:
    # Apply the same random binary mask to both sets of sentence embeddings.
    mask_shp = [1, self.config.encoder_dim]
    bin_mask = tf.random_uniform(mask_shp) > FLAGS.dropout_rate
    bin_mask = tf.where(bin_mask, tf.ones(mask_shp), tf.zeros(mask_shp))
    src = all_sen_embs[0] * bin_mask
    dst = all_sen_embs[1] * bin_mask
    scores = tf.matmul(src, dst, transpose_b=True)
  else:
    # Pairwise similarity scores between every pair of sentences in the batch.
    scores = tf.matmul(all_sen_embs[0], all_sen_embs[1], transpose_b=True)

  # Ignore source sentence
  scores = tf.matrix_set_diag(scores, np.zeros(FLAGS.batch_size))

  # Targets: the neighbouring sentences within the context window are the positives.
  targets_np = np.zeros((FLAGS.batch_size, FLAGS.batch_size))
  ctxt_sent_pos = list(range(-FLAGS.context_size, FLAGS.context_size + 1))
  ctxt_sent_pos.remove(0)
  for ctxt_pos in ctxt_sent_pos:
    targets_np += np.eye(FLAGS.batch_size, k=ctxt_pos)
  targets_np_sum = np.sum(targets_np, axis=1, keepdims=True)
  targets_np = targets_np / targets_np_sum
  targets = tf.constant(targets_np, dtype=tf.float32)

  # Forward and backward scores
  f_scores = scores[:-1]
  b_scores = scores[1:]

  losses = tf.nn.softmax_cross_entropy_with_logits(
      labels=targets, logits=scores)

  loss = tf.reduce_mean(losses)

  tf.summary.scalar("losses/ent_loss", loss)
  self.total_loss = loss

  if self.mode == "eval":
    f_max = tf.to_int64(tf.argmax(f_scores, axis=1))
    b_max = tf.to_int64(tf.argmax(b_scores, axis=1))

    targets = range(FLAGS.batch_size - 1)
    targets = tf.constant(list(targets), dtype=tf.int64)
    fwd_targets = targets + 1

    names_to_values, names_to_updates = tf.contrib.slim.metrics.aggregate_metric_map({
        "Acc/Fwd Acc": tf.contrib.slim.metrics.streaming_accuracy(f_max, fwd_targets),
        "Acc/Bwd Acc": tf.contrib.slim.metrics.streaming_accuracy(b_max, targets)
    })

    for name, value in names_to_values.items():
      tf.summary.scalar(name, value)

    self.eval_op = names_to_updates.values()

[Figure: illustration of the loss function]

The cross-entropy is computed with tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=scores). The targets show the core idea of Quick-Thought: the model is trained to pick out, from all the other sentences in the batch, the ones that appear in the target sentence's context, i.e. it learns the similarity between a sentence and its neighbours. In my view it does not really learn features of the target sentence itself; the sentence vectors I trained with Quick-Thought did not perform well on multi-class classification (the Quick-Thought evaluation does include a movie sentiment classification example).
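To make the target construction concrete, here is the targets matrix for a toy case of batch_size = 4 and context_size = 1, computed the same way as in build_loss above (the concrete sizes are chosen only for illustration):

import numpy as np

batch_size, context_size = 4, 1
targets_np = np.zeros((batch_size, batch_size))
for k in [-1, 1]:                      # positions in the context window (0 excluded)
    targets_np += np.eye(batch_size, k=k)
targets_np /= targets_np.sum(axis=1, keepdims=True)
# targets_np:
# [[0.  1.  0.  0. ]
#  [0.5 0.  0.5 0. ]
#  [0.  0.5 0.  0.5]
#  [0.  0.  1.  0. ]]

Each row says which other sentences in the batch count as the correct "context" for that sentence, and the softmax over the score matrix is trained to match this distribution.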

Code reproducing the paper: https://github.com/lajanugen/S2V (English)

Modified version for Chinese: https://github.com/jinjiajia/Quick_Thought
