本节目录
常用函数一:词频统计
常用函数二:word2vec
常用函数三:doc2vec
常用函数四:LDA主题分析
# -*- coding: utf-8 -*-
"""
Datetime: 2020/06/25
Author: Zhang Yafei
Description: 统计词频
输入 文件名 列名 分割符
输出 词频统计结果-文件
"""
from collections import Counter
import pandas as pd
def count_word_freq(file_path, col_name, to_file, sep='; ', multi_table=False):
    """
    Count how often each term occurs in one column of an Excel workbook.

    Every non-null cell of ``col_name`` is split on ``sep``; empty fragments
    are dropped and the remaining terms are tallied. The result is written to
    ``to_file`` as a two-column table (``word``, ``freq``) sorted by
    descending frequency.

    :param file_path: path of the Excel file to read
    :param col_name: column holding the delimited terms
    :param to_file: path of the Excel file to write
    :param sep: delimiter between the terms inside one cell
    :param multi_table: if True, process every sheet of the workbook and write
        one result sheet per input sheet (same sheet names); otherwise only
        the default sheet is read
    :return: None
    """
    if multi_table:
        # The header row has to be parsed so that a column *name* can be looked
        # up; reading with header=None labels the columns 0..n-1 and a lookup
        # such as df['keyword'] fails. An integer col_name keeps the old
        # header-less, position-based behaviour.
        header = None if isinstance(col_name, int) else 0
        # sheet_name=None loads all sheets as {sheet name: DataFrame}.
        sheets = pd.read_excel(file_path, header=header, sheet_name=None)
        # The context manager saves and closes the workbook on exit, so no
        # explicit writer.save() call is needed.
        with pd.ExcelWriter(path=to_file) as writer:
            for sheet_name, df in sheets.items():
                _word_freq_frame(df, col_name, sep).to_excel(
                    excel_writer=writer, sheet_name=sheet_name, index=False)
    else:
        df = pd.read_excel(file_path)
        _word_freq_frame(df, col_name, sep).to_excel(to_file, index=False)


def _word_freq_frame(df, col_name, sep):
    """Return a ``word``/``freq`` DataFrame for ``df[col_name]``, most frequent first."""
    cells = df.loc[df[col_name].notna(), col_name].str.split(sep)
    counts = Counter(word for word_list in cells for word in word_list if word)
    words = list(counts)
    freqs = [counts[word] for word in words]
    freq_df = pd.DataFrame(data={'word': words, 'freq': freqs})
    freq_df.sort_values('freq', ascending=False, inplace=True)
    return freq_df
if __name__ == '__main__':
    # Tally the terms of the `keyword` column in every sheet of data.xlsx,
    # splitting cells on the default '; ' separator, and write the per-sheet
    # counts to res.xlsx.
    count_word_freq(
        file_path='data.xlsx',
        col_name='keyword',
        to_file='res.xlsx',
        multi_table=True,
    )
经验分享:注意输入格式为excel文件,这也是我学习生活中常用的处理方式,直接拿去用,非常方便
另外,在我之前的一篇博客中,我介绍了Python统计词频常用的几种方式,不同的场景可以满足你各自的需求。博客传送门:
https://www.cnblogs.com/zhangyafei/p/10653977.html
word2vec是一种词向量技术,核心思想是把单词转换成向量:意思越相近的单词,其向量间的距离越近,反之越远。实际使用的体验也非常好。
# -*- coding: utf-8 -*-
"""
Datetime: 2019/7/25
Author: Zhang Yafei
Description: word2vec
data.txt
word1 word2 word3 …
word1 word2 word3 …
word1 word2 word3 …
… … … …
"""
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
def word2vec_model_train(file, model_path):
    """
    Train a Word2Vec model on a text corpus and save it to disk.

    :param file: path of the corpus file, read through ``LineSentence``
        (space-separated words, one sentence per line)
    :param model_path: path the trained model is saved to
    :return: None
    """
    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; newer
    # gensim releases appear to call them `vector_size` and `epochs` —
    # confirm against the installed version.
    corpus = LineSentence(file)
    w2v = Word2Vec(corpus, size=100, window=5, iter=10, min_count=5)
    w2v.save(model_path)
def word2vec_load(self=None, model_path=None):
    """
    Load a saved Word2Vec model and print a few similarity queries.

    Prints the similarity between '生育意愿' and '主观幸福感', then every item
    returned by ``similar_by_word('新生代农民工', topn=50)``, one per line.

    :param self: unused. This is a module-level function, so the formerly
        required ``self`` made ``word2vec_load(model_path=...)`` fail; it is
        kept (now optional) only so existing positional calls keep working.
    :param model_path: path of the saved model file
    :raises TypeError: if no model path is supplied
    :return: None
    """
    if model_path is None:
        # A lone positional argument, word2vec_load('path'), lands in `self`.
        model_path = self
    if model_path is None:
        raise TypeError("word2vec_load() missing required argument: 'model_path'")
    model = Word2Vec.load(model_path)
    # Query through model.wv for both calls, as similar_by_word already did.
    print(model.wv.similarity('生育意愿', '主观幸福感'))
    for key in model.wv.similar_by_word('新生代农民工', topn=50):
        print(key)
if __name__ == "__main__":
    # Train on data.txt and persist the model; enable the second call to
    # query the saved model afterwards.
    word2vec_model_train(
        file='data.txt',
        model_path='word2vec_keywords.model',
    )
    # word2vec_load(model_path='word2vec_keywords.model')
doc2vec和word2vec类似:word2vec是词向量技术,doc2vec顾名思义就是文档向量技术,可以将一篇文档转换成一个向量。理论上,意思越相近的文档,其向量间的距离越近。
# -*- coding: utf-8 -*-
"""
Datetime: 2019/7/14
Author: Zhang Yafei
Description: doc2vec
docs format
TaggedDocument([word1, word2, …], [doc tag])
TaggedDocument([word1, word2, …], [doc tag])
TaggedDocument([word1, word2, …], [doc tag])
…
"""
import os
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Folders for result files and for saved models.
output_dir = 'res'
model_dir = 'model'
# Create whichever of the two folders is still missing.
for _folder in (model_dir, output_dir):
    if not os.path.exists(_folder):
        os.mkdir(_folder)
def data_preparetion():
    """
    Lazily yield the training corpus as tagged documents.

    Reads the ``text`` column of ``data/data.xlsx``, splits every entry on
    whitespace and tags it with its 0-based position, e.g.
    ``TaggedDocument(words=['evid', 'limit', 'human', 'lifespan'], tags=[1])``.

    :return: generator of TaggedDocument; it can be traversed only once
    """
    print('开始准备文档语料')
    frame = pd.read_excel('data/data.xlsx')
    position = 0
    for text in frame.text:
        yield TaggedDocument(text.split(), [position])
        position += 1
def get_datasest():
    """
    Load the whole corpus into a list of tagged documents.

    Reads the ``text`` column of ``data/data.xlsx``, splits every entry on
    whitespace and tags it with its 0-based position.

    :return: list of TaggedDocument, in file order
    """
    frame = pd.read_excel('data/data.xlsx')
    return [
        TaggedDocument(text.split(), [position])
        for position, text in enumerate(frame.text)
    ]
class Doc2VecModel(object):
    """
    Wrapper around gensim's Doc2Vec: training, a similarity smoke test and a
    KMeans sweep used to pick a cluster count.

    Models are stored under the module-level ``model_dir`` folder and plots
    under ``output_dir``.
    """

    def __init__(self, vector_size=100, dm=0, window=10, epochs=30, iter_num=10):
        """
        Create an untrained Doc2Vec model.

        :param vector_size: forwarded to Doc2Vec as ``vector_size``
        :param dm: forwarded to Doc2Vec as ``dm``
        :param window: forwarded to Doc2Vec as ``window``
        :param epochs: forwarded to Doc2Vec as ``epochs``
        :param iter_num: forwarded to Doc2Vec as ``iter``
        """
        # NOTE(review): `epochs` and `iter` look like two spellings of the same
        # gensim setting (`iter` being the older name, apparently rejected by
        # newer releases) — confirm against the installed gensim version and
        # drop one of them.
        self.model = Doc2Vec(vector_size=vector_size,
                             dm=dm,
                             window=window,
                             epochs=epochs,
                             iter=iter_num,
                             )

    def run(self, documents, model_path, epochs=30):
        """
        Build the vocabulary, train the model and save it under ``model_dir``.

        :param documents: iterable of TaggedDocument; a one-shot iterator such
            as a generator is accepted and buffered in memory
        :param model_path: file name of the saved model inside ``model_dir``
        :param epochs: value passed to ``train`` as ``epochs``
        :return: None
        """
        # The corpus is traversed by build_vocab and then again by train. A
        # generator would already be exhausted for the second traversal and
        # training would silently see no data, so buffer one-shot iterators.
        if iter(documents) is documents:
            documents = list(documents)
        print('开始构建词汇表')
        self.model.build_vocab(documents)
        print('开始训练')
        self.model.train(documents, total_examples=self.model.corpus_count, epochs=epochs)
        self.model.save(f'{model_dir}/{model_path}')
        print(f'{model_path}\t保存成功')

    @staticmethod
    def simlarity_cal(vector1, vector2):
        """
        Cosine similarity of two 1-D numpy vectors.

        :return: ``v1·v2 / (|v1| |v2|)``, or 0 if either vector has zero norm
        """
        vector1_mod = np.sqrt(vector1.dot(vector1))
        vector2_mod = np.sqrt(vector2.dot(vector2))
        if vector2_mod != 0 and vector1_mod != 0:
            return (vector1.dot(vector2)) / (vector1_mod * vector2_mod)
        return 0

    def model_test(self):
        """
        Print the ten training documents most similar to a fixed sample text.

        Loads ``{model_dir}/doc2vec.model``, infers a vector for the sample and
        prints each match's words, its similarity and its word count.
        """
        doc2vec_model = Doc2Vec.load(f'{model_dir}/doc2vec.model')
        datasets = get_datasest()
        sentence1 = '老年人 生活满意度 影响 全国 老年人口 健康状况 调查数据 以往 社会经济因素 健康 因素 人口因素 老年人 生活满意度 影响 基础 引入 变量 模型 分析 老年人 生活满意度 自评 影响 统计 控制 影响因素 基础 老年人 性格 情绪 孤独感 焦虑 程度 生活满意度 自评 影响 影响 原有 模型 变量 变化 生活满意度 老年人'
        # The training documents are token lists (text.split()), so the query
        # must be tokenised the same way; a raw string would be consumed
        # character by character.
        inferred_vector = doc2vec_model.infer_vector(sentence1.split())
        sims = doc2vec_model.docvecs.most_similar([inferred_vector], topn=10)
        for tag, sim in sims:
            # Tags are the 0-based corpus positions assigned by get_datasest.
            doc_words = datasets[tag][0]
            words = ''.join(f'{word} ' for word in doc_words)
            print(words, sim, len(doc_words))

    def get_topic_num(self, min_topic_num, max_topic_num):
        """
        Run KMeans on the document vectors for every cluster count in
        ``[min_topic_num, max_topic_num]`` and plot three quality scores.

        For each count the inertia, silhouette score and Calinski-Harabasz
        score are printed; afterwards one line chart per score is saved to
        ``output_dir`` and shown.

        :param min_topic_num: smallest number of clusters to try
        :param max_topic_num: largest number of clusters to try (inclusive)
        :return: None
        """
        doc2vec_model = Doc2Vec.load(f'{model_dir}/doc2vec.model')
        vectors_docs = doc2vec_model.docvecs.vectors_docs
        # scikit-learn renamed calinski_harabaz_score to
        # calinski_harabasz_score and later removed the old spelling; use
        # whichever the installed version provides.
        ch_score_func = (getattr(metrics, 'calinski_harabasz_score', None)
                         or metrics.calinski_harabaz_score)
        silhouette_score_dict = {}
        ch_score_dict = {}
        inertia_score = {}
        cluster_nums = range(min_topic_num, max_topic_num + 1)
        for n in cluster_nums:
            km = KMeans(n_clusters=n)
            km.fit(X=vectors_docs)
            pre_labels = km.labels_
            inertia = km.inertia_
            sil_score = metrics.silhouette_score(X=vectors_docs, labels=pre_labels)
            ch_score = ch_score_func(X=vectors_docs, labels=pre_labels)
            print(f'{n} inertia score: {inertia} silhouette_score: {sil_score} ch score: {ch_score}')
            inertia_score[n] = inertia
            silhouette_score_dict[n] = sil_score
            ch_score_dict[n] = ch_score
        # The titles double as file names, so they are kept unchanged.
        self.plot_image(data=silhouette_score_dict, xticks=cluster_nums,
                        title='不同聚类个数下silhouette_score对比', xlabel='cluster_num',
                        ylabel='silhouette_score')
        self.plot_image(data=ch_score_dict, xticks=cluster_nums,
                        title='不同聚类个数下calinski_harabaz_score对比', xlabel='cluster_num',
                        ylabel='calinski_harabaz_score')
        self.plot_image(data=inertia_score, xticks=cluster_nums,
                        title='不同聚类个数下inertia score对比',
                        xlabel='cluster_num', ylabel='inertia_score')

    @staticmethod
    def plot_image(data, title, xticks, xlabel, ylabel):
        """
        Draw ``data`` (x -> y mapping) as a line chart, save it as
        ``{output_dir}/{title}.png`` and show it.
        """
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.figure(figsize=(8, 4), dpi=500)
        # Pass real sequences rather than dict views to the plotting call.
        plt.plot(list(data.keys()), list(data.values()), '#007A99')
        plt.xticks(xticks)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.savefig(f'{output_dir}/{title}.png',
                    bbox_inches='tight', pad_inches=0.1)
        plt.show()
if __name__ == '__main__':
    # data_preparetion() is a generator, but the training routine traverses
    # the corpus more than once (vocabulary scan, then training). Materialise
    # it so the later passes do not run on an exhausted iterator.
    docs = list(data_preparetion())
    model = Doc2VecModel(vector_size=100, epochs=30, window=10, dm=0, iter_num=20)
    model.run(documents=docs, model_path='doc2vec.model')
    # model.model_test()
    # model.get_topic_num(min_topic_num=5, max_topic_num=40)
常用函数四:LDA主题分析
LDA(Latent Dirichlet Allocation)是文档主题生成模型中最有代表性的一种。LDA于2003年由David Blei等人提出,由于其应用简单且有效,在学术界被广泛应用于主题聚类、热点识别、演化分析等领域。
# -*- coding: utf-8 -*-
"""
Datetime: 2019/7/14
Author: Zhang Yafei
Description: LDA主题模型
安装依赖环境
pip install pandas numpy scipy matplotlib scikit-learn
使用说明:
调用示例见文件末尾的 `if __name__ == '__main__':` 部分
"""
import json
import os
import time
from functools import wraps

import numpy as np
import pandas as pd
import scipy
import scipy.stats
from matplotlib import pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Default folder for the result files of this script; created on first run.
output_dir = 'res'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
# Lift pandas' limit on the number of columns shown when printing DataFrames.
pd.set_option('display.max_columns', None)
def timeit(func):
    """
    Decorator that prints how long each call of ``func`` took.

    Durations below 60 seconds are printed in seconds (two decimals), longer
    ones as minutes and seconds. The wrapped function's return value is
    passed through unchanged; nothing is printed if ``func`` raises.

    :param func: callable to time
    :return: wrapper with the same metadata as ``func``
    """
    @wraps(func)
    def inner(*args, **kwargs):
        # perf_counter is a monotonic clock, so the measurement cannot be
        # skewed by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        ret = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        if elapsed < 60:
            print('共花费时间:', round(elapsed, 2), '秒')
        else:
            minute, sec = divmod(elapsed, 60)
            print(f'花费时间\t{round(minute)}分\t{round(sec, 2)}秒')
        return ret
    return inner
class Articles(object):
    """
    Iterable that turns lists of terms into space-joined document strings.

    Multi-word terms are first rewritten by ``word_replace`` so that each term
    contains no spaces or punctuation and therefore survives as a single
    token when the documents are split on whitespace again.
    """

    def __init__(self, data, stopwords=False):
        """
        :param data: iterable of term lists (one list per document)
        :param stopwords: if True, load stop words from ``data/stopwords.txt``
            (one per line) and drop terms whose lower-cased form is listed
        """
        self.data = data
        if stopwords:
            # Use a context manager so the file handle is closed right away.
            with open('data/stopwords.txt') as f:
                self.stopwords = {line.strip() for line in f}
        else:
            self.stopwords = None

    def __iter__(self):
        """Yield one space-joined string per term list in ``self.data``."""
        if self.stopwords:
            process = self.pro_words_with_stopwords
        else:
            process = self.pro_words
        for word_list in self.data:
            yield ' '.join(process(word_list))

    @staticmethod
    def word_replace(word):
        """
        Encode separators inside a term as runs of underscores.

        ' & ' -> 5, '/' -> 3, ', ' and ',' -> 2, ' ' -> 1, '-' -> 4 and both
        parentheses -> 6 underscores. The order of the replacements matters:
        ' & ' and ', ' must be handled before the plain space.
        """
        return word.replace(' & ', '_____').replace('/', '___').replace(', ', '__'). \
            replace(',', '__').replace(' ', '_').replace('-', '____'). \
            replace('(', '______').replace(')', '______')

    def pro_words_with_stopwords(self, word_list):
        """Encode the terms of ``word_list``, skipping stop words (case-insensitive)."""
        return (self.word_replace(word) for word in word_list if word.lower() not in self.stopwords)

    def pro_words(self, word_list):
        """Encode every term of ``word_list``."""
        return (self.word_replace(word) for word in word_list)
class SklearnLDA(object):
    """
    LDA topic modelling with scikit-learn: vectorises a corpus, trains a model
    with a fixed number of topics (``train``) or sweeps a range of topic
    counts and plots several selection criteria (``get_topic_num``).

    All result files are written below ``res_dir``.
    """

    def __init__(self, corpus, n_topics, tf_idf=True, max_iter=10, learning_method='online', learning_offset=50.,
                 random_state=0, res_dir='res', english_words_fixed=False):
        """
        :param corpus: iterable of document strings
        :param n_topics: number of topics of the model used by ``train``
        :param tf_idf: vectorise with TfidfVectorizer if True, else CountVectorizer
        :param max_iter: forwarded to LatentDirichletAllocation
        :param learning_method: forwarded to LatentDirichletAllocation
        :param learning_offset: forwarded to LatentDirichletAllocation
        :param random_state: forwarded to LatentDirichletAllocation
        :param res_dir: folder the results are written to
        :param english_words_fixed: if True, decode the underscore-encoded
            vocabulary back to readable terms (see ``fixed_vocab``)
        """
        self.tfidf = tf_idf
        self.lda_model = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                                   doc_topic_prior=0.001, topic_word_prior=0.02,
                                                   learning_method=learning_method,
                                                   learning_offset=learning_offset,
                                                   random_state=random_state)
        print('正在将语料转化为向量------------')
        self.vectorizer = TfidfVectorizer() if tf_idf else CountVectorizer()
        self.bow_corpus = self.vectorizer.fit_transform(corpus)
        if english_words_fixed:
            self.vocab = self.fixed_vocab()
        else:
            self.vocab = self._feature_names()
        self.res_dir = res_dir

    def _feature_names(self):
        """Return the fitted vectorizer's vocabulary as a plain list.

        Newer scikit-learn versions only offer ``get_feature_names_out``
        (returning an array), older ones only ``get_feature_names``.
        """
        getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return list(getter())

    def fixed_vocab(self):
        """Decode underscore runs in the vocabulary: 5 -> ' & ', 4 -> '-',
        3 -> '/', 2 -> ',', 1 -> ' ' (longest runs first)."""
        return [
            vocab.replace('_____', ' & ').replace('____', '-').replace('___', '/').replace('__', ',').replace('_', ' ')
            for vocab in self._feature_names()]

    def _save_topic_tables(self, index, doc_topic_matrix, topic_word_matrix, topic_word_num,
                           doc_topic_path, topic_word_path):
        """Write the ranked doc-topic table and the top-words-per-topic table as CSV.

        Each doc-topic row lists the 0-based topic indices ordered by
        decreasing weight; each topic-word row lists the ``topic_word_num``
        highest-weighted vocabulary terms of that topic.
        """
        n_topics = topic_word_matrix.shape[0]
        doc_topic_columns = [f'topic{num}' for num in range(1, n_topics + 1)]
        topic_word_columns = [f'word{num}' for num in range(1, topic_word_num + 1)]
        topic_word_index = pd.Index(data=doc_topic_columns, name='topic')
        doc_topic_data = np.argsort(-doc_topic_matrix, axis=1)
        topic_word_data = np.array(self.vocab)[np.argsort(-topic_word_matrix, axis=1)[:, :topic_word_num]]
        self.save_data(file_path=doc_topic_path, data=doc_topic_data,
                       columns=doc_topic_columns, index=index)
        self.save_data(file_path=topic_word_path, data=topic_word_data,
                       columns=topic_word_columns, index=topic_word_index)

    def get_topic_num(self, index, max_iter=10, min_topic=5, max_topic=30, learning_offset=50., random_state=0,
                      topic_word_num=30):
        """
        Sweep the topic count over ``[min_topic, max_topic]`` and plot
        selection criteria.

        For every count a model is trained (or its matrices are reloaded from
        ``{res_dir}/{n}/`` when the topic-word CSV already exists) and the
        weight score, average pairwise KL and JS divergence and average
        pairwise cosine similarity of the topics are computed. Perplexity is
        only available for models trained in this call.

        :param index: row labels of the documents, used for the doc-topic CSV
        :param max_iter: forwarded to LatentDirichletAllocation
        :param min_topic: smallest topic count to try
        :param max_topic: largest topic count to try (inclusive)
        :param learning_offset: forwarded to LatentDirichletAllocation
        :param random_state: forwarded to LatentDirichletAllocation
        :param topic_word_num: number of top words stored per topic
        :return: None
        """
        print('开始训练模型, 计算困惑度')
        perplexity_dict = {}
        kld_list = {}
        jsd_list = {}
        cos_sim_list = {}
        w_score_dict = {}
        x_ticks = list(range(min_topic, max_topic + 1))
        for n_topics in x_ticks:
            result_dir = f'{self.res_dir}/{n_topics}'
            os.makedirs(result_dir, exist_ok=True)
            if os.path.exists(f'{result_dir}/topic-word-{topic_word_num}.csv'):
                # Results of an earlier run exist: reuse the saved matrices.
                doc_topic_matrix = np.loadtxt(f'{result_dir}/doc_topic_matrix.txt')
                topic_word_matrix = np.loadtxt(f'{result_dir}/topic_word_matrix.txt')
            else:
                lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter, learning_method='online',
                                                doc_topic_prior=0.001, topic_word_prior=0.02,
                                                learning_offset=learning_offset,
                                                random_state=random_state)
                doc_topic_matrix = lda.fit_transform(self.bow_corpus)
                topic_word_matrix = lda.components_
                perplexity = lda.perplexity(self.bow_corpus)
                perplexity_dict[n_topics] = perplexity
                print(f'topic: {n_topics}\tsklearn preplexity: {perplexity:.3f}')
                np.savetxt(f'{result_dir}/doc_topic_matrix.txt', doc_topic_matrix)
                np.savetxt(f'{result_dir}/topic_word_matrix.txt', topic_word_matrix)
                self._save_topic_tables(index, doc_topic_matrix, topic_word_matrix, topic_word_num,
                                        doc_topic_path=f'{result_dir}/doc-topic.csv',
                                        topic_word_path=f"{result_dir}/topic-word-{topic_word_num}.csv")
            # Ratio of the mean maximum doc-topic weight to the mean topic similarity.
            w_score = self.weight_score(doc_topic_matrix, topic_word_matrix)
            w_score_dict[n_topics] = w_score
            # Average KL and JS divergence over all ordered topic pairs
            # (self-pairs contribute 0 and are included in the n^2 divisor).
            kld_sum = 0
            jsd_sum = 0
            for topic_vec1 in topic_word_matrix:
                for topic_vec2 in topic_word_matrix:
                    kld_sum += self.kl_divergence(topic_vec1, topic_vec2)
                    jsd_sum += self.js_divergence(topic_vec1, topic_vec2)
            avg_kld = kld_sum / (n_topics ** 2)
            kld_list[n_topics] = avg_kld
            avg_jsd = jsd_sum / (n_topics ** 2)
            jsd_list[n_topics] = avg_jsd
            # Average cosine similarity between *distinct* topics: the divisor
            # n * (n - 1) counts off-diagonal pairs only, so the diagonal
            # (each topic with itself) has to be removed from the sum as well.
            cos_sim_matrix = cosine_similarity(X=topic_word_matrix)
            cos_sim = (cos_sim_matrix.sum() - np.trace(cos_sim_matrix)) / (n_topics * (n_topics - 1))
            cos_sim_list[n_topics] = cos_sim
            print(f'topic: {n_topics}\tavg KLD: {avg_kld:.3f}')
            print(f'topic: {n_topics}\tavg JSD: {avg_jsd:.3f}')
            print(f'topic: {n_topics}\tcosine_similarity: {cos_sim:.3f}')
            print(f'topic: {n_topics}\tweight_score: {w_score:.3f}')
        # Perplexity exists only for the topic counts trained in this call.
        if perplexity_dict:
            self.plot_image(data=perplexity_dict, x_ticks=list(perplexity_dict.keys()), title='lda_topic_perplexity',
                            xlabel='topic num', ylabel='perplexity')
        self.plot_image(data=kld_list, x_ticks=x_ticks, title='lda_topic_KLD',
                        xlabel='topic num', ylabel='KLD')
        self.plot_image(data=jsd_list, title='lda_topic_JSD', x_ticks=x_ticks,
                        xlabel='topic num', ylabel='JSD')
        self.plot_image(data=cos_sim_list, title='lda_topic_cosine_simlarity', x_ticks=x_ticks,
                        xlabel='topic num', ylabel='cosine_simlarity')
        self.plot_image(data=w_score_dict, title='lda_topic_weight_score', x_ticks=x_ticks,
                        xlabel='topic num', ylabel='weight_score')

    def train(self, index, topic_word_num=10, save_matrix=True, save_data=True, print_doc_topic=False,
              print_topic_word=True, save_vocab=True):
        """
        Train the model configured in ``__init__`` and store / print results.

        All files are written below ``self.res_dir`` (the folder passed to the
        constructor; 'res' by default). File names carry a ``tfidf`` marker
        when TF-IDF vectorisation is used.

        :param index: row labels of the documents
        :param topic_word_num: number of top words stored per topic
        :param save_matrix: save the raw doc-topic and topic-word matrices
        :param save_data: save the ranked CSV tables
        :param print_doc_topic: print the five strongest topics per document
        :param print_topic_word: print the ten strongest words per topic
        :param save_vocab: dump the vocabulary as JSON to ``vocab.txt``
        :return: None
        """
        print('正在训练模型')
        doc_topic_matrix = self.lda_model.fit_transform(self.bow_corpus)
        topic_word_matrix = self.lda_model.components_
        out_dir = self.res_dir
        os.makedirs(out_dir, exist_ok=True)
        if save_vocab:
            with open(f'{out_dir}/vocab.txt', 'w') as f:
                json.dump(list(self.vocab), f)
        if save_matrix:
            print('正在保存矩阵')
            if self.tfidf:
                np.savetxt(f'{out_dir}/doc_topic_tfidf_matrix.txt', doc_topic_matrix)
                np.savetxt(f'{out_dir}/topic_word_tfidf_matrix.txt', topic_word_matrix)
            else:
                np.savetxt(f'{out_dir}/doc_topic_matrix.txt', doc_topic_matrix)
                np.savetxt(f'{out_dir}/topic_word_matrix.txt', topic_word_matrix)
        if save_data:
            print('正在保存数据')
            if self.tfidf:
                doc_topic_path = f'{out_dir}/doc-topic_tfidf.csv'
                topic_word_path = f"{out_dir}/topic-word-tfidf_{topic_word_num}.csv"
            else:
                doc_topic_path = f'{out_dir}/doc-topic.csv'
                topic_word_path = f"{out_dir}/topic-word-{topic_word_num}.csv"
            self._save_topic_tables(index, doc_topic_matrix, topic_word_matrix, topic_word_num,
                                    doc_topic_path=doc_topic_path, topic_word_path=topic_word_path)
        if print_doc_topic:
            print('正在输出文档-主题')
            for doc_num, ranked_topics in zip(index, np.argsort(-doc_topic_matrix, axis=1)):
                print(f'{doc_num}:\t{ranked_topics[:5]}')
        if print_topic_word:
            print('正在输出主题-词')
            for topic_num, ranked_words in enumerate(np.argsort(-topic_word_matrix, axis=1)):
                words_list = np.array(self.vocab)[ranked_words][: 10]
                print(f'主题{topic_num}:\t{words_list}')

    @staticmethod
    def save_data(file_path, data, columns, index):
        """Write ``data`` as a CSV file (UTF-8 with BOM) and report success."""
        df = pd.DataFrame(data=data, columns=columns, index=index)
        df.to_csv(file_path, encoding='utf_8_sig')
        print(f'{file_path}\t保存成功')

    @staticmethod
    def kl_divergence(p, q):
        """
        Kullback-Leibler divergence (relative entropy) of ``p`` from ``q``.

        Non-negative, 0 for identical distributions, and not symmetric:
        swapping ``p`` and ``q`` generally changes the result.

        :param p: first distribution (array-like)
        :param q: second distribution (array-like)
        :return: ``scipy.stats.entropy(p, q)``
        """
        return scipy.stats.entropy(p, q)

    @staticmethod
    def js_divergence(p, q):
        """
        Jensen-Shannon divergence of ``p`` and ``q``.

        Symmetric and 0 for identical distributions. It is computed with
        scipy's default (natural) logarithm, so the upper bound is ln 2
        rather than 1.

        :param p: first distribution (numpy array)
        :param q: second distribution (numpy array)
        :return: ``0.5 * KL(p, M) + 0.5 * KL(q, M)`` with ``M = (p + q) / 2``
        """
        M = (p + q) / 2
        return 0.5 * scipy.stats.entropy(p, M) + 0.5 * scipy.stats.entropy(q, M)

    @staticmethod
    def weight_score(doc_topic_matrix, topic_word_matrix):
        """
        Ratio of the mean maximum doc-topic weight to the mean topic similarity.

        :param doc_topic_matrix: documents x topics matrix
        :param topic_word_matrix: topics x words matrix
        :return: mean of the per-document maximum weight, divided by the mean
            pairwise cosine similarity of the topics
        """
        max_mean_topic_prob = np.mean(np.max(doc_topic_matrix, axis=1))
        topic_cos_sim_matrix = cosine_similarity(X=topic_word_matrix)
        topic_num = topic_cos_sim_matrix.shape[0]
        # Entries above 0.99 are zeroed before averaging: this removes the
        # diagonal (self-similarity) and also any near-identical topic pairs.
        mean_topic_sim = np.sum(np.where(topic_cos_sim_matrix > 0.99, 0, topic_cos_sim_matrix)) / (
                topic_num * (topic_num - 1))
        return max_mean_topic_prob / mean_topic_sim

    def plot_image(self, data, title, x_ticks, xlabel, ylabel):
        """Draw ``data`` (x -> y mapping) as a line chart, save it as
        ``{res_dir}/{title}.png`` and show it."""
        plt.figure(figsize=(12, 6), dpi=180)
        plt.plot(list(data.keys()), list(data.values()), '#007A99')
        plt.xticks(x_ticks)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.savefig(f'{self.res_dir}/{title}.png',
                    bbox_inches='tight', pad_inches=0.1)
        plt.show()
def data_preparetion(path, doc_col, index_col=None, sep=None, english_words_fixed=False, stopwords=False):
    """
    Load the documents and their row labels from an Excel file.

    Rows whose ``doc_col`` cell is empty are dropped first.

    :param path: path of the Excel file
    :param doc_col: name of the column holding the documents
    :param index_col: name of the column used as row labels; the DataFrame
        index is used when omitted
    :param sep: if given, every document is split on this separator
    :param english_words_fixed: if True, wrap the documents in ``Articles``
    :param stopwords: forwarded to ``Articles``
    :return: tuple ``(index_list, documents)``
    """
    frame = pd.read_excel(path)
    frame.dropna(subset=[doc_col], inplace=True)
    column = frame[doc_col]
    docs = iter(column.str.split(sep)) if sep else iter(column)
    documents = Articles(data=docs, stopwords=stopwords) if english_words_fixed else docs
    index_list = frame[index_col] if index_col else frame.index
    return index_list, documents
@timeit
def main(index, docs, n_topics=10, tfidf=False, max_iter=5, min_topic=5, max_topic=30, learning_offset=50.,
         random_state=0,
         test_topic_num=False, topic_word_num=30, res_dir='res', english_words_fixed=False):
    """
    Entry point: vectorise the corpus and either sweep a range of topic counts
    or train a single LDA model.

    :param index: row labels of the documents
    :param docs: iterable of documents
    :param n_topics: number of topics when training a single model
    :param tfidf: vectorise with TF-IDF instead of raw counts
    :param max_iter: maximum number of iterations
    :param min_topic: smallest topic count; only used if test_topic_num=True
    :param max_topic: largest topic count; only used if test_topic_num=True
    :param learning_offset: forwarded to the LDA model as ``learning_offset``
    :param random_state: random seed
    :param test_topic_num: if True, run the topic-count sweep instead of
        training a single model
    :param topic_word_num: number of top words stored per topic
    :param res_dir: result folder
    :param english_words_fixed: forwarded to SklearnLDA (decode the
        underscore-encoded vocabulary)
    :return: None
    """
    # makedirs also creates missing parent folders, which a nested res_dir
    # such as 'res/聚类结果' needs; os.mkdir would fail without the parent.
    os.makedirs(res_dir, exist_ok=True)
    lda = SklearnLDA(corpus=docs, n_topics=n_topics, max_iter=max_iter, tf_idf=tfidf, learning_offset=learning_offset,
                     random_state=random_state, res_dir=res_dir, english_words_fixed=english_words_fixed)
    if test_topic_num:
        lda.get_topic_num(index=index, max_iter=max_iter, min_topic=min_topic, max_topic=max_topic,
                          learning_offset=learning_offset, random_state=random_state, topic_word_num=topic_word_num)
    else:
        lda.train(index=index, save_matrix=True, save_data=True,
                  print_doc_topic=False, print_topic_word=True, topic_word_num=topic_word_num)
if __name__ == '__main__':
    # Load the corpus.
    # index, docs = data_preparetion(path='data/山西政策3.xlsx', doc_col='标题分词')
    index, docs = data_preparetion(
        path='data/COVID-19-2020.xlsx',
        doc_col='keywords',
        index_col='PMID',
        sep='; ',
        english_words_fixed=True,
        stopwords=False,
    )
    # Sweep a range of topic counts.
    main(
        index=index,
        docs=docs,
        test_topic_num=True,
        tfidf=False,
        max_iter=50,
        min_topic=5,
        max_topic=10,
        topic_word_num=20,
        res_dir='res/聚类结果',
        english_words_fixed=True,
    )
    # Train with a fixed number of topics instead.
    # main(index=index, docs=docs, n_topics=19, tfidf=False, max_iter=50)
topic_evolution.py
# -*- coding: utf-8 -*-
'''
Datetime: 2019/08/16
author: Zhang Yafei
description:
colormap https://blog.csdn.net/Mr_Cat123/article/details/78638491
'''
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim.matutils')
import pandas as pd
import numpy as np
import os
from gensim.models import Word2Vec
import seaborn as sns
import matplotlib.pyplot as plt
# Use the SimHei font for sans-serif text in all figures of this script.
plt.rcParams.update({'font.sans-serif': ['SimHei']})
class TopicEvolution(object):
    """
    Topic intensity over time (bar chart, heat map, line chart) and topic
    intimacy based on word2vec similarities of the topic words.
    """

    def __init__(self, data_path, doc_topic_matrix_path=None, topic_word_csv_path=None, keywords_txt=None):
        """
        :param data_path: Excel file with the documents
        :param doc_topic_matrix_path: text file with the doc-topic matrix; may
            contain a ``{}`` placeholder for the topic count
        :param topic_word_csv_path: CSV file with the top words per topic
        :param keywords_txt: output path used by ``extract_keywords_txt``
        """
        self.data_path = data_path
        self.topic_word_csv_path = topic_word_csv_path
        self.doc_topic_matrix_path = doc_topic_matrix_path
        self.keywords_txt = keywords_txt

    def topic_intensity_evolution(self, start_year, end_year, topic_num, res_dir='res', space=1):
        """
        Plot the mean topic weights overall and per year.

        Saves a bar chart, a heat map and a line chart to
        ``{res_dir}/{topic_num}/`` and the per-year table to
        ``res/topic_intensity.xlsx``.

        :param start_year: first year
        :param end_year: end of the year range (exclusive)
        :param topic_num: number of topics of the loaded matrix
        :param res_dir: folder containing one sub-folder per topic count
        :param space: step between the years that are evaluated
        :return: None
        """
        df = pd.read_excel(self.data_path)
        doc_topic_matrix = np.loadtxt(self.doc_topic_matrix_path.format(topic_num))
        # Bar chart: mean weight of every topic over all documents.
        x = [f'topic{num}' for num in range(1, topic_num + 1)]
        y = doc_topic_matrix.mean(axis=0)
        print(x, np.mean(y))
        self.plot_bar(x=x, y=y, path=f'{res_dir}/{topic_num}/柱状图.png')
        # Heat map: mean topic weights of the documents of each sampled year.
        doc_topic_df = pd.DataFrame(data=doc_topic_matrix)
        doc_topic_df.index = df['年']
        years = list(range(start_year, end_year, space))
        # A boolean mask always yields a (possibly empty) frame, so a year
        # with a single document still averages per topic and a year without
        # documents gives NaN instead of raising KeyError.
        topic_intensity_df = pd.DataFrame(
            {year: doc_topic_df[doc_topic_df.index == year].mean() for year in years},
            columns=years)
        topic_intensity_df.index = [f'Topic {num}' for num in range(1, topic_num + 1)]
        self.plot_heatmap(data=topic_intensity_df, cmap='Reds', xlabel='年份', ylabel='主题',
                          path=f'{res_dir}/{topic_num}/热力图.png')
        print(years, topic_intensity_df)
        # NOTE(review): this path ignores res_dir and topic_num, so every call
        # overwrites the same file — confirm whether that is intended.
        topic_intensity_df.to_excel('res/topic_intensity.xlsx')
        self.plot(x=years, data_list=topic_intensity_df, path=f'{res_dir}/{topic_num}/折线图.png')

    @staticmethod
    def plot(x, data_list, path=None):
        """Draw one line per row of ``data_list`` against ``x``; save to ``path`` if given."""
        for index in data_list.index.unique():
            y = list(data_list.loc[index, :])
            plt.plot(x, y, "x-", label=f'主题{index}')
        if path:
            plt.savefig(path)
        plt.show()

    @staticmethod
    def plot_bar(x, y, path=None):
        """Draw a bar chart with a dashed line at the mean of ``y``; save to ``path`` if given."""
        plt.bar(x, y, width=0.5)
        plt.xticks(range(len(x)), x, rotation=45)
        plt.axhline(y=np.mean(y), xmin=.05, xmax=.95, ls='--', color='black')
        if path:
            plt.savefig(path)
        plt.show()

    @staticmethod
    def plot_heatmap(data, cmap, xlabel, ylabel, path=None):
        """Draw ``data`` as a seaborn heat map; save to ``path`` if given."""
        if cmap:
            sns.heatmap(data, cmap=cmap)
        else:
            sns.heatmap(data)
        plt.xticks(rotation=45)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        if path:
            plt.savefig(path)
        plt.show()

    def extract_keywords_txt(self):
        """
        Write the ``keywords`` column of the data file to a text file, one
        entry per line.

        :raises ValueError: if no output path was configured
        """
        # Earlier versions read `self.data_file`, which __init__ never set;
        # honour it if a caller assigned it, otherwise use data_path.
        data_file = getattr(self, 'data_file', None) or self.data_path
        if self.keywords_txt is None:
            raise ValueError('keywords_txt is not set; pass it to TopicEvolution(...)')
        df = pd.read_excel(data_file)
        with open(self.keywords_txt, 'w', encoding='utf-8') as f:
            for text in df.keywords:
                f.write(f'{text}\n')

    @staticmethod
    def word_replace(word):
        """
        Encode separators inside a term as runs of underscores.

        ' & ' -> 5, '/' -> 3, ', ' and ',' -> 2, ' ' -> 1, '-' -> 4 and both
        parentheses -> 6 underscores.
        """
        return word.replace(' & ', '_____').replace('/', '___').replace(', ', '__').replace(',', '__').replace(
            ' ', '_').replace('-', '____').replace('(', '______').replace(')', '______')

    def _known_words(self, row, keywords):
        """Encode the words of ``row`` and keep those contained in ``keywords``."""
        encoded = (self.word_replace(word) for word in row)
        return [word for word in encoded if word in keywords]

    def clac_inter_intimate(self, row, model, keywords):
        """
        Mean word2vec similarity between the distinct words of one topic.

        :param row: words of the topic
        :param model: object exposing ``model.wv.similarity(word1, word2)``
        :param keywords: set of encoded words that may be used
        :return: mean similarity over all ordered pairs of different words;
            NaN if no pair could be scored
        """
        words = self._known_words(row, keywords)
        sims = []
        for word1 in words:
            for word2 in words:
                if word1 == word2:
                    continue
                try:
                    sims.append(model.wv.similarity(word1, word2))
                except KeyError:
                    # Word missing from the embedding model: skip the pair.
                    continue
        return np.mean(sims) if sims else np.nan

    def topic_intimate(self, model, topic_num=None):
        """
        Compute the topic intimacy of one topic-word table.

        intimacy = (within-topic similarity - between-topic similarity)
                   / within-topic similarity

        The allowed words are read from ``data/vocab.txt`` (one per line).

        :param model: object exposing ``model.wv.similarity(word1, word2)``
        :param topic_num: topic count, only echoed in the output
        :return: tuple ``(topic_num, topic_proximity)``
        """
        df = pd.read_csv(self.topic_word_csv_path, index_col=0)
        with open('data/vocab.txt', encoding='utf-8') as f:
            keywords = {word.strip() for word in f if word}
        topic_inter_intimate = np.mean(df.apply(self.clac_inter_intimate, axis=1, args=(model, keywords)))
        # Encode and filter every topic's words once instead of inside the
        # quadruple loop below.
        rows = df.values.tolist()
        prepared = [self._known_words(row, keywords) for row in rows]
        topic_exter_sim_sum = []
        for row1, words1 in zip(rows, prepared):
            for row2, words2 in zip(rows, prepared):
                # Topics with identical word lists are treated as the same topic.
                if row1 == row2:
                    continue
                topic_exter_sim = []
                for word1 in words1:
                    for word2 in words2:
                        try:
                            topic_exter_sim.append(model.wv.similarity(word1, word2))
                        except KeyError:
                            continue
                topic_exter_sim_sum.append(np.mean(topic_exter_sim) if topic_exter_sim else np.nan)
        # Between-topic similarity.
        topic_exter_intimate = np.mean(topic_exter_sim_sum)
        topic_proximity = (topic_inter_intimate - topic_exter_intimate) / topic_inter_intimate
        print(topic_num, topic_inter_intimate, topic_exter_intimate, topic_proximity)
        return topic_num, topic_proximity
def file_rename(dir_path, start, end):
    """
    Rename the Chinese-named doc-topic CSV of every result folder.

    For each ``num`` in ``range(start, end)`` the file
    ``res/2004-2018/{dir_path}/{num}/文档-主题.csv`` becomes ``doc-topic.csv``
    in the same folder.

    :param dir_path: sub-folder below ``res/2004-2018``
    :param start: first folder number
    :param end: end of the folder numbers (exclusive)
    """
    for num in range(start, end):
        folder = f'res/2004-2018/{dir_path}/{num}'
        os.rename(f'{folder}/文档-主题.csv', f'{folder}/doc-topic.csv')
        # The topic-word file can be renamed the same way if needed:
        # os.rename(f'{folder}/主题-词-30.csv', f'{folder}/topic-word-30.csv')
def plot_image(data, title, x_ticks, xlabel, ylabel, output_dir=None):
    """
    Draw ``data`` as a line chart and show it.

    :param data: mapping of x value -> y value
    :param title: chart title, also used as the PNG file name
    :param x_ticks: tick positions on the x axis
    :param xlabel: label of the x axis
    :param ylabel: label of the y axis
    :param output_dir: if given, the chart is saved as ``{output_dir}/{title}.png``
    """
    plt.figure(figsize=(12, 6), dpi=180)
    # Hand real sequences to the plotting call instead of dict views, the
    # same way SklearnLDA.plot_image does.
    plt.plot(list(data.keys()), list(data.values()), '#007A99')
    plt.xticks(x_ticks)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    if output_dir:
        plt.savefig(f'{output_dir}/{title}.png', bbox_inches='tight', pad_inches=0.1)
    plt.show()
def start_plot(start_year, end_year, data_path, doc_topic_matrix_path, res_dir, topic_num=None, min_topics=None,
               max_topics=None, space=1):
    """
    Draw the bar chart, line chart and heat map of the topic intensities.

    When both ``min_topics`` and ``max_topics`` are given, one set of charts
    is produced for every topic count in that inclusive range and
    ``doc_topic_matrix_path`` is formatted with the count. Otherwise a single
    set is produced for ``topic_num``; with neither given nothing happens.
    """
    if min_topics and max_topics:
        jobs = [(n, doc_topic_matrix_path.format(n)) for n in range(min_topics, max_topics + 1)]
    elif topic_num:
        jobs = [(topic_num, doc_topic_matrix_path)]
    else:
        jobs = []
    for n_topics, matrix_path in jobs:
        evolution = TopicEvolution(data_path=data_path, doc_topic_matrix_path=matrix_path)
        evolution.topic_intensity_evolution(start_year=start_year, end_year=end_year, topic_num=n_topics,
                                            res_dir=res_dir, space=space)
def start_run(model_path, data_path, topic_word_csv_path, min_topics, max_topics, res_dir=None):
    """
    Compute the topic intimacy for every topic count in
    ``[min_topics, max_topics]``.

    :param model_path: path of the saved Word2Vec model
    :param data_path: data file handed to TopicEvolution
    :param topic_word_csv_path: path template of the topic-word CSV; ``{}`` is
        replaced by the topic count
    :param min_topics: smallest topic count
    :param max_topics: largest topic count (inclusive)
    :param res_dir: currently unused
    :return: dict mapping topic count -> intimacy score
    """
    topic_proximity_dict = {}
    model = Word2Vec.load(model_path)
    for n_topics in range(min_topics, max_topics + 1):
        # Use the caller's data_path instead of a hard-coded file name.
        topic = TopicEvolution(data_path=data_path, topic_word_csv_path=topic_word_csv_path.format(n_topics))
        # topic_intimate returns (topic_num, proximity); keep only the score
        # so the dict can be plotted directly.
        _, proximity = topic.topic_intimate(topic_num=n_topics, model=model)
        topic_proximity_dict[n_topics] = proximity
    return topic_proximity_dict
if __name__ == "__main__":
    topic = TopicEvolution(data_path='data/data.xlsx')
    # Charts for 5..30 topics, sampling every 5th year from 1993 up to 2018.
    start_plot(
        min_topics=5,
        max_topics=30,
        start_year=1993,
        end_year=2018,
        data_path='GLP1.xlsx',
        doc_topic_matrix_path='res/{}/doc_topic_matrix.txt',
        res_dir='res',
        space=5,
    )
    # Topic intimacy for 5 and 6 topics.
    start_run(
        model_path='model/word2vec.model',
        data_path='data/GLP1.xlsx',
        topic_word_csv_path='res/{}/topic-word-30.csv',
        min_topics=5,
        max_topics=6,
    )
经验分享:我都写好了,直接拿去用吧!
手机扫一扫
移动阅读更方便
你可能感兴趣的文章