




# coding: utf-8

from __future__ import print_function

import os
import sys
import time
from datetime import timedelta
import keras

import numpy as np
import tensorflow as tf
from sklearn import metrics
def word2id(vocabulary_path='/content/drive/My Drive/NLP/dataset/Fudan/vocabulary.txt'):
    """Build a word -> integer id mapping from a vocabulary file.

    Every line of the file is one vocabulary entry (surrounding whitespace is
    stripped); the id of an entry is its zero-based line number.  If an entry
    occurs more than once, the last occurrence wins.

    Args:
        vocabulary_path: Path of the UTF-8 encoded vocabulary file.  Defaults
            to the location this script has always read from.

    Returns:
        dict mapping each stripped line to its line index.
    """
    # The context manager guarantees the handle is closed; iterating the file
    # object streams lines instead of materialising them with readlines().
    with open(vocabulary_path, 'r', encoding='utf-8') as fp:
        return {line.strip(): i for i, line in enumerate(fp)}

def get_content_label(path):
    """Read a tab-separated corpus file into parallel content / label lists.

    Each line is stripped and split on tab characters.  Only lines that split
    into exactly two fields are kept; every other line is skipped.

    Args:
        path: Path of the UTF-8 encoded data file, e.g.
            '/content/drive/My Drive/NLP/dataset/Fudan/data/train_clean_jieba.txt'.

    Returns:
        (content_list, label_list): two lists of equal length holding the
        first and second field of every accepted line.
    """
    content_list = []
    label_list = []
    # Context manager closes the handle even if a line fails to decode.
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            fields = line.strip().split('\t')
            if len(fields) == 2:
                # NOTE(review): the body of this branch was missing from the
                # pasted source.  Reconstructed assuming "content<TAB>label"
                # column order -- TODO confirm against the data files.
                content_list.append(fields[0])
                label_list.append(fields[1])
    return content_list, label_list
def get_label_id(label_path='/content/drive/My Drive/NLP/dataset/Fudan/label.txt'):
    """Load the label table from a tab-separated file.

    Each non-blank line is stripped and split on tabs; the first field becomes
    the key and the second field the value.  Values are kept as the strings
    read from the file (they are not converted to int).

    Args:
        label_path: Path of the UTF-8 encoded label file.  Defaults to the
            location this script has always read from.

    Returns:
        dict mapping first field -> second field, in file order.

    Raises:
        IndexError: if a non-blank line contains no tab-separated second field.
    """
    label2id_dict = {}
    # Context manager closes the handle that the original left open.
    with open(label_path, 'r', encoding='utf-8') as fp:
        for raw in fp:
            fields = raw.strip().split('\t')
            if fields == ['']:
                # A blank (e.g. trailing) line carries no mapping; skip it
                # rather than crash on fields[1].
                continue
            label2id_dict[fields[0]] = fields[1]
    return label2id_dict
def process(path, max_length):
    """Turn a corpus file into padded id sequences and one-hot labels.

    Reads contents and labels with get_content_label(), maps content elements
    to ids with the word2id() table (elements not in the table are dropped),
    maps labels through the get_label_id() table, then pads / encodes with
    Keras.

    Args:
        path: Path of the data file passed to get_content_label().
        max_length: Target sequence length passed to pad_sequences().

    Returns:
        (x_pad, y_pad): the padded id matrix and the one-hot label matrix.

    Raises:
        KeyError: if a label read from the file is absent from the label table.
    """
    contents, labels = get_content_label(path)
    word_to_id = word2id()
    cat_to_id = get_label_id()

    data_id = []
    label_id = []
    for content, label in zip(contents, labels):
        # NOTE(review): this iterates `content` element by element; if content
        # is a plain str that means per character, not per segmented word --
        # confirm this is intended for the jieba-segmented corpus.
        data_id.append([word_to_id[x] for x in content if x in word_to_id])
        # This append was missing from the pasted source: without it label_id
        # stays empty and y_pad has no rows to pair with x_pad.
        label_id.append(cat_to_id[label])

    # Pad every sequence to a fixed length with Keras' pad_sequences.
    x_pad = keras.preprocessing.sequence.pad_sequences(data_id, max_length)
    # Convert label ids to a one-hot representation.
    y_pad = keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad

def batch_iter(x, y, batch_size=64):
    """Yield shuffled mini-batches of (x, y).

    A single random permutation is drawn and applied to both arrays, so each
    x row stays paired with its y row.  The last batch may be smaller than
    batch_size.  Empty input yields no batches.

    Args:
        x: Array supporting integer-array indexing (e.g. numpy.ndarray).
        y: Array of the same length as x, indexed the same way.
        batch_size: Maximum number of rows per batch.

    Yields:
        (x_batch, y_batch) slices of the shuffled arrays.
    """
    data_len = len(x)
    # Integer ceil-division: equals int((data_len - 1) / batch_size) + 1 for
    # data_len >= 1, but gives 0 (not 1) batches when there is no data.
    num_batch = (data_len + batch_size - 1) // batch_size

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

def evaluate(sess, x_, y_):
    """Compute the sample-weighted mean loss and accuracy over a data set.

    The data is processed in batches of 128 via batch_iter(); dropout is
    disabled by feeding keep_prob = 1.0.  Per-batch results are weighted by
    batch length so a smaller final batch does not skew the mean.

    Args:
        sess: Session used to run model.loss and model.acc.
        x_: Input array.
        y_: Label array of the same length as x_.

    Returns:
        (mean_loss, mean_accuracy) over all len(x_) samples.
    """
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len

    return total_loss / data_len, total_acc / data_len

def get_time_dif(start_time):
    """Return the wall-clock time elapsed since start_time.

    Args:
        start_time: A timestamp previously obtained from time.time().

    Returns:
        datetime.timedelta rounded to whole seconds.
    """
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

def feed_data(x_batch, y_batch, keep_prob):
    """Build the feed dict for one batch.

    Args:
        x_batch: Values for the model's input_x placeholder.
        y_batch: Values for the model's input_y placeholder.
        keep_prob: Value for the model's keep_prob placeholder.

    Returns:
        dict keyed by the placeholders of the module-level `model`.
    """
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob,
    }
    return feed_dict

def get_training_word2vec_vectors(filename):
    """Load the array stored under the "embeddings" key of an .npz archive.

    Args:
        filename: Path of the .npz file.

    Returns:
        The array saved as "embeddings".

    Raises:
        KeyError: if the archive has no "embeddings" entry.
    """
    # The array is read before the archive is closed by the context manager.
    with np.load(filename) as data:
        return data["embeddings"]

class TCNNConfig(object):
    """Hyper-parameters and paths for the TextCNN model."""

    embedding_dim = 100  # word-vector dimension
    seq_length = 600  # sequence length
    num_classes = 20  # number of classes
    num_filters = 256  # number of convolution filters
    kernel_size = 5  # convolution kernel size
    vocab_size = 183664  # vocabulary size

    hidden_dim = 128  # units in the fully connected layer

    dropout_keep_prob = 0.5  # dropout keep probability
    learning_rate = 1e-3  # learning rate

    batch_size = 64  # training batch size
    num_epochs = 10  # total number of epochs

    print_per_batch = 20  # print results every this many batches
    save_per_batch = 10  # write to tensorboard every this many batches
    # Spelling ("trianing") is kept because other code assigns and reads this
    # attribute under exactly this name.
    pre_trianing = None
    vector_word_npz = '/content/drive/My Drive/NLP/dataset/Fudan/vector_word.npz'

class TextCNN(object):
    """Text classification model: embedding -> conv1d -> max-pool -> dense.

    Built with the TensorFlow 1.x graph API.  After construction the instance
    exposes the placeholders `input_x`, `input_y`, `keep_prob` and the graph
    nodes `logits`, `y_pred_cls`, `loss`, `optim` and `acc`.
    """

    def __init__(self, config):
        """Create the input placeholders and build the graph.

        Args:
            config: Object providing seq_length, num_classes, vocab_size,
                embedding_dim, num_filters, kernel_size, hidden_dim,
                learning_rate and (optionally) pre_trianing.
        """
        self.config = config

        # The three inputs fed at run time.
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        # Callers read model.loss / model.acc / model.optim right after
        # construction, so the graph has to be built here.
        self.cnn()

    def cnn(self):
        """Build the CNN graph and attach its output nodes to self."""
        # Word-embedding lookup.
        with tf.device('/cpu:0'):
            # NOTE(review): the tail of this get_variable call was cut off in
            # the pasted source.  Reconstructed on the assumption that the
            # embeddings loaded into config.pre_trianing initialise the table;
            # with no pre-trained matrix TensorFlow's default initializer is
            # used.  TODO confirm against the original script.
            pretrained = getattr(self.config, 'pre_trianing', None)
            initializer = None
            if pretrained is not None:
                initializer = tf.constant_initializer(pretrained)
            embedding = tf.get_variable(
                "embeddings",
                shape=[self.config.vocab_size, self.config.embedding_dim],
                initializer=initializer)
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("cnn"):
            # CNN layer
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters,
                                    self.config.kernel_size, name='conv')
            # Global max pooling: reduce over axis 1 of the conv output.
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

        with tf.name_scope("score"):
            # Fully connected layer followed by dropout and ReLU.
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # Classifier
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class

        with tf.name_scope("optimize"):
            # Loss: softmax cross entropy, averaged over the batch.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer
            self.optim = tf.train.AdamOptimizer(
                learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy: fraction of rows whose argmax label equals the prediction.
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

def train():
    """Train the module-level `model` and checkpoint the best validation result.

    Reads the module-level `config` and `model`.  Writes loss / accuracy
    summaries to 'tensorboard/textcnn' and saves the weights with the best
    validation accuracy to 'checkpoint/textcnn/best_validation'.  Training
    stops early when validation accuracy has not improved for more than 1000
    batches.
    """
    print("Configuring TensorBoard and Saver…")
    # TensorBoard setup.  Delete the tensorboard folder before re-training,
    # otherwise the curves of different runs overlap.
    tensorboard_dir = 'tensorboard/textcnn'
    # The body of the original existence check was missing in the pasted
    # source; creating the directory is the evident intent.
    os.makedirs(tensorboard_dir, exist_ok=True)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    save_dir = 'checkpoint/textcnn/'
    save_path = os.path.join(save_dir, 'best_validation')  # where the best validation result is saved
    # Saver setup
    saver = tf.train.Saver()
    os.makedirs(save_dir, exist_ok=True)

    print("Loading training and validation data...")
    # Load the training and validation sets.
    start_time = time.time()
    train_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/train_clean_jieba.txt'
    # NOTE(review): the validation file is the test file, so the "best
    # validation" checkpoint is selected on test data -- confirm intended.
    val_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/test_clean_jieba.txt'
    x_train, y_train = process(train_dir, config.seq_length)
    x_val, y_val = process(val_dir, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # Create the session; the context manager releases it when training ends.
    with tf.Session() as session:
        # Variables must be initialised before the first run call (this line
        # was missing from the pasted source).
        session.run(tf.global_variables_initializer())

        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0  # number of batches processed so far
        best_acc_val = 0.0  # best validation accuracy
        last_improved = 0  # batch index of the last improvement
        require_improvement = 1000  # stop early after this many batches without improvement

        flag = False
        for epoch in range(config.num_epochs):
            print('Epoch:', epoch + 1)
            batch_train = batch_iter(x_train, y_train, config.batch_size)
            for x_batch, y_batch in batch_train:
                feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

                if total_batch % config.save_per_batch == 0:
                    # Periodically write the training summaries to tensorboard.
                    s = session.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # Periodically report performance on the training batch
                    # and the validation set, with dropout disabled.
                    feed_dict[model.keep_prob] = 1.0
                    loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                    loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                    if acc_val > best_acc_val:
                        # Save the best result so far.
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=save_path)
                        improved_str = '*'
                    else:
                        improved_str = ''

                    time_dif = get_time_dif(start_time)
                    msg = ('Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'
                           ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}')
                    print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val,
                                     time_dif, improved_str))

                # Restore the training keep probability before the update step.
                feed_dict[model.keep_prob] = config.dropout_keep_prob
                session.run(model.optim, feed_dict=feed_dict)  # run the optimizer
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has not improved for a long time: stop early.
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break  # leave the batch loop
            if flag:  # leave the epoch loop as well
                break

    # Flush pending summaries and release the event file.
    writer.close()

def test():
    """Evaluate the saved best checkpoint on the test set and print a report.

    Reads the module-level `config` and `model`, restores
    'checkpoint/textcnn/best_validation', then prints test loss / accuracy,
    the per-class precision / recall / F1 report, the confusion matrix and
    the elapsed time.
    """
    print("Loading test data…")
    start_time = time.time()
    test_dir = '/content/drive/My Drive/NLP/dataset/Fudan/data/test_clean_jieba.txt'
    x_test, y_test = process(test_dir, config.seq_length)
    save_path = 'checkpoint/textcnn/best_validation'

    batch_size = 128
    data_len = len(x_test)
    # Integer ceil-division of data_len by batch_size.
    num_batch = (data_len + batch_size - 1) // batch_size

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predictions

    # The context manager releases the session once all predictions are made.
    with tf.Session() as session:
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # load the saved model

        loss_test, acc_test = evaluate(session, x_test, y_test)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))

        for i in range(num_batch):  # process batch by batch, in file order
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0,
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # A concrete list, rather than a dict view, is handed to sklearn.
    categories = list(get_label_id().values())
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    # The matrix was computed but never printed in the pasted source.
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

if __name__ == '__main__':
    # `config` and `model` are module-level names read by feed_data(),
    # evaluate(), train() and test().
    print('Configuring CNN model…')
    config = TCNNConfig()
    config.pre_trianing = get_training_word2vec_vectors(config.vector_word_npz)
    model = TextCNN(config)
    # NOTE(review): the pasted source stops after building the model.  The
    # recorded output shows a training run followed by the test report, so
    # both entry points are invoked here -- confirm against the original.
    train()
    test()


Epoch: 8
Iter: 1080, Train Loss: 0.13, Train Acc: 95.31%, Val Loss: 0.44, Val Acc: 87.19%, Time: 0:04:33
Iter: 1100, Train Loss: 0.24, Train Acc: 95.31%, Val Loss: 0.44, Val Acc: 87.03%, Time: 0:04:38
Iter: 1120, Train Loss: 0.19, Train Acc: 93.75%, Val Loss: 0.43, Val Acc: 87.38%, Time: 0:04:42
Iter: 1140, Train Loss: 0.17, Train Acc: 92.19%, Val Loss: 0.42, Val Acc: 87.80%, Time: 0:04:47 *
Iter: 1160, Train Loss: 0.21, Train Acc: 90.62%, Val Loss: 0.41, Val Acc: 87.89%, Time: 0:04:53 *
Iter: 1180, Train Loss: 0.34, Train Acc: 89.06%, Val Loss: 0.43, Val Acc: 87.57%, Time: 0:04:57
Iter: 1200, Train Loss: 0.22, Train Acc: 92.19%, Val Loss: 0.41, Val Acc: 87.62%, Time: 0:05:01
Iter: 1220, Train Loss: 0.24, Train Acc: 90.62%, Val Loss: 0.41, Val Acc: 87.87%, Time: 0:05:06
Epoch: 9
Iter: 1240, Train Loss: 0.096, Train Acc: 95.31%, Val Loss: 0.4, Val Acc: 88.34%, Time: 0:05:11 *
Iter: 1260, Train Loss: 0.21, Train Acc: 92.19%, Val Loss: 0.41, Val Acc: 87.98%, Time: 0:05:16
Iter: 1280, Train Loss: 0.13, Train Acc: 95.31%, Val Loss: 0.42, Val Acc: 88.14%, Time: 0:05:20
Iter: 1300, Train Loss: 0.1, Train Acc: 98.44%, Val Loss: 0.43, Val Acc: 87.76%, Time: 0:05:25
Iter: 1320, Train Loss: 0.27, Train Acc: 92.19%, Val Loss: 0.39, Val Acc: 87.93%, Time: 0:05:29
Iter: 1340, Train Loss: 0.19, Train Acc: 92.19%, Val Loss: 0.45, Val Acc: 87.67%, Time: 0:05:33
Iter: 1360, Train Loss: 0.27, Train Acc: 92.19%, Val Loss: 0.42, Val Acc: 87.57%, Time: 0:05:38
Iter: 1380, Train Loss: 0.17, Train Acc: 92.19%, Val Loss: 0.41, Val Acc: 88.07%, Time: 0:05:42
Epoch: 10
Iter: 1400, Train Loss: 0.1, Train Acc: 98.44%, Val Loss: 0.39, Val Acc: 88.64%, Time: 0:05:47 *
Iter: 1420, Train Loss: 0.069, Train Acc: 96.88%, Val Loss: 0.4, Val Acc: 88.46%, Time: 0:05:51
Iter: 1440, Train Loss: 0.15, Train Acc: 98.44%, Val Loss: 0.41, Val Acc: 88.16%, Time: 0:05:56
Iter: 1460, Train Loss: 0.073, Train Acc: 98.44%, Val Loss: 0.4, Val Acc: 88.38%, Time: 0:06:00
Iter: 1480, Train Loss: 0.16, Train Acc: 95.31%, Val Loss: 0.42, Val Acc: 88.12%, Time: 0:06:05
Iter: 1500, Train Loss: 0.21, Train Acc: 92.19%, Val Loss: 0.41, Val Acc: 87.79%, Time: 0:06:09
Iter: 1520, Train Loss: 0.16, Train Acc: 93.75%, Val Loss: 0.41, Val Acc: 88.03%, Time: 0:06:13


Test Loss: 0.39, Test Acc: 88.64%
Precision, Recall and F1-Score…
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support

       0       0.33      0.05      0.09        61  
       1       0.89      0.96      0.93      1022  
       2       0.39      0.15      0.22        59  
       3       0.89      0.95      0.92      1254  
       4       0.33      0.08      0.12        52  
       5       0.83      0.90      0.86      1026  
       6       0.95      0.98      0.96      1358  
       7       0.67      0.04      0.08        45  
       8       0.39      0.28      0.32        76  
       9       0.85      0.94      0.89       742  
      10       0.00      0.00      0.00        34  
      11       0.00      0.00      0.00        28  
      12       0.96      0.96      0.96      1218  
      13       0.87      0.92      0.89       642  
      14       0.50      0.15      0.23        33  
      15       0.67      0.07      0.13        27  
      16       0.91      0.91      0.91      1601  
      17       0.86      0.11      0.20        53  
      18       0.00      0.00      0.00        34  
      19       0.74      0.69      0.72       468

accuracy                           0.89      9833  

macro avg 0.60 0.46 0.47 9833
weighted avg 0.87 0.89 0.87 9833

Confusion Matrix…
[[ 3 1 0 42 0 5 0 0 4 3 0 0 0 2
0 0 1 0 0 0]
[ 0 983 0 5 0 1 0 0 0 0 0 0 8 3
0 0 14 1 0 7]
[ 1 2 9 3 0 4 2 0 3 1 0 0 2 15
3 0 13 0 0 1]
[ 0 3 0 1195 0 12 2 0 0 16 0 0 3 2
0 0 8 0 0 13]
[ 0 6 1 1 4 14 5 0 5 0 0 0 1 1
0 0 14 0 0 0]
[ 0 7 0 16 0 924 1 0 3 5 0 0 1 0
0 0 39 0 0 30]
[ 0 1 0 3 0 0 1328 1 1 0 0 0 1 17
0 0 5 0 0 1]
[ 0 0 0 13 0 12 0 2 0 8 0 0 1 2
0 0 0 0 0 7]
[ 2 1 1 7 0 39 0 0 21 0 0 0 0 4
0 0 0 0 0 1]
[ 0 1 0 10 0 10 1 0 1 696 0 0 0 0
0 0 3 0 0 20]
[ 0 0 0 4 0 0 0 0 0 15 0 0 0 1
0 0 1 0 0 13]
[ 0 0 0 2 1 0 5 0 2 0 0 0 0 10
1 0 7 0 0 0]
[ 0 11 0 1 1 1 8 0 3 0 0 0 1175 6
0 0 7 0 0 5]
[ 0 0 0 6 0 0 31 0 0 1 0 0 12 589
0 0 3 0 0 0]
[ 0 2 4 1 1 1 0 0 1 0 0 0 4 6
5 1 7 0 0 0]
[ 0 0 2 1 0 1 6 0 0 0 0 0 0 11
0 2 4 0 0 0]
[ 0 70 2 10 2 39 5 0 2 2 0 0 7 0
0 0 1451 0 0 11]
[ 3 4 0 10 3 12 0 0 6 3 0 0 0 0
0 0 5 6 0 1]
[ 0 7 4 0 0 1 0 0 1 1 0 0 6 5
1 0 7 0 0 1]
[ 0 4 0 7 0 43 5 0 1 72 0 0 1 1
0 0 11 0 0 323]]
Time usage: 0:00:13


