dachuang-shujufenxi – Data Processing: Finding the Topics of Text Segments

The data are the topic paragraphs produced by the simple categorization in the previous post (the full set is 55,000 entries; I only ran a quick test on about 200 of them):

['movie.douban.com', 'book.douban.com', '1905.com', 'mtime.com']


豆瓣电影提供新的电影介绍及评论包括上映影片的影讯查询及购票服务。你可以记录想看、在看和看过的电影电视剧,顺便打分、写影评。根据你的口味,豆瓣电影会推荐好电影给你…
记录你读过的、想读和正在读的书,顺便打分,添加标签及个人附注,写评论。根据你的口味,豆瓣会推荐适合的书给你。
国家新闻出版广电总局直属媒体-1905电影网(电影频道官方网站),涵盖新电影、好看的电影、经典电影、电影推荐、免费电影、高清电影在线观看及海量新电影图文视频资讯,看电…
Mtime时光网,中国专业的电影电视剧及影人资料库,这里有新专业的电影新闻,预告片,海报,写真和热门影评,同时提供电影院影讯查询,博客,相册和群组等服务,是电影粉丝的佳电影社…
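The scripts below read the corpus from a plain-text file named data_test.txt; this post does not show that file's exact layout, so purely as an assumption, snippets like the ones above could be dumped into it (one per line) for a quick local test:

# Hypothetical helper: write a few of the description snippets shown
# above into data_test.txt so the scripts below have input to read.
samples = [
    "豆瓣电影提供新的电影介绍及评论包括上映影片的影讯查询及购票服务。",
    "Mtime时光网,中国专业的电影电视剧及影人资料库。",
]
with open("data_test.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(samples))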

The code for this series is in my personal repo: https://github.com/nihaohello/dachuang-shujufenxi

1. Text Clustering with the K-Means Algorithm

To check the effect, I first cluster everything into a single topic, reusing earlier code with minor changes:

#coding=utf-8
import jieba                                                     # Chinese word segmentation
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer   # count vectors + TF-IDF
from sklearn.cluster import KMeans
def read_from_file(file_name):
    # Read the whole corpus file into one string
    with open(file_name) as fp:
        words = fp.read()
    return words


def cut_words(words):
    # Segment the text with jieba and return a list of tokens
    result = jieba.cut(words)
    words = []
    for r in result:
        words.append(r)
    return words


def stop_words(stop_word_file):
    # Load the stop-word list, one word per line
    with open(stop_word_file) as fp:
        words = fp.read()
        # print(words)
        result = words.split('\n')
    return result

read_file_name="data_test.txt"
stop_file_name="/root/temp/data/stopwordlist.txt"
listall=read_from_file(read_file_name)
stopwords = stop_words(stop_file_name)


def del_stop_words(words,stop_words_set):
    # Drop every token that appears in the stop-word set
    new_words = []
    for k in words:
        if k not in stop_words_set:
            new_words.append(k)
    return new_words


# Remove stop words from the segmented text
list0 = del_stop_words(cut_words(listall),stopwords)
# Deduplicate the tokens while keeping their first-occurrence order;
# each remaining token is then treated as one "document"
listall1 = list(set(list0))
listall1.sort(key = list0.index)
comment_list=listall1
# print(listall1)


# Build the term-count matrix and convert it to TF-IDF weights
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(listall1))


word = vectorizer.get_feature_names()
# print("word feature length: {}".format(len(word)))

tfidf_weight = tfidf.toarray()


def KMeans_function(tfidf_weight,word,clusters=1):
    clf_KMeans = KMeans(n_clusters=clusters)
    clf_KMeans.fit(tfidf_weight)

    clf_KMeans_label = clf_KMeans.labels_                        # cluster label of every sample
    clf_KMeans_cluster_centers_ = clf_KMeans.cluster_centers_    # cluster centroids
    clf_KMeans_inertia = clf_KMeans.inertia_                     # sum of squared distances to the centroids

    res0Series = pd.Series(clf_KMeans.labels_)
    # print(res0Series)

    quantity=pd.Series(clf_KMeans.labels_).value_counts()
    print("Samples per cluster:")
    print(quantity)

    # argsort is ascending, so reverse each row to put the
    # highest-weight terms of a centroid first
    order_centroids=clf_KMeans_cluster_centers_.argsort()[:, ::-1]
    for i in range(6):
        print(word[order_centroids[0][i]],end=' ')
    print("\n")

KMeans_function(tfidf_weight,word)



Output:

Samples per cluster:
0 83
dtype: int64
豆瓣 电影 粉丝 上映 提供 电影院
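With a single cluster, all 83 tokens fall into cluster 0 and the six printed words are the highest-weighted features of that one centroid. As a minimal sketch (not part of the original experiment), the same idea could be extended to several clusters, printing the top terms of every centroid; the cluster count of 4 and top_n of 6 below are illustrative assumptions:

# Hypothetical extension: list the top terms of every centroid.
# Reuses tfidf_weight and word from the script above.
from sklearn.cluster import KMeans

def top_terms_per_cluster(tfidf_weight, word, clusters=4, top_n=6):
    km = KMeans(n_clusters=clusters)
    km.fit(tfidf_weight)
    # argsort ascending, then reverse each row for descending weights
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for c in range(clusters):
        terms = [word[idx] for idx in order_centroids[c][:top_n]]
        print("cluster %d: %s" % (c, ' '.join(terms)))

# top_terms_per_cluster(tfidf_weight, word)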

2. The LDA Topic Model and a Python Implementation

References:

https://cloud.tencent.com/developer/article/1077495
https://blog.csdn.net/qq_40006058/article/details/85865695

As the articles above explain, LDA is fitted on raw term counts rather than TF-IDF features; the rest of the workflow is similar to the K-Means version.

#coding=utf-8
import jieba                                                     # Chinese word segmentation
from sklearn.feature_extraction.text import CountVectorizer      # term-count vectors for LDA
from sklearn.decomposition import LatentDirichletAllocation
def read_from_file(file_name):
    with open(file_name) as fp:
        words = fp.read()
    return words
def cut_words(words):
    result = jieba.cut(words)
    words = []
    for r in result:
        words.append(r)
    return words
def stop_words(stop_word_file):
    with open(stop_word_file) as fp:
        words = fp.read()
        # print(words)
        result = words.split('\n')
    return result
read_file_name="data_test.txt"
stop_file_name="/root/temp/data/stopwordlist.txt"
listall=read_from_file(read_file_name)
stopwords = stop_words(stop_file_name)
def del_stop_words(words,stop_words_set):
    new_words = []
    for k in words:
        if k not in stop_words_set:
            new_words.append(k)
    return new_words
# Remove stop words from the segmented text
list0 = del_stop_words(cut_words(listall),stopwords)
# Deduplicate the tokens while keeping their first-occurrence order
listall1 = list(set(list0))
listall1.sort(key = list0.index)
comment_list=listall1
# print(listall1)
vectorizer = CountVectorizer()
# LDA is fitted on raw term counts, so no TF-IDF transform is applied here
cntTf = vectorizer.fit_transform(comment_list)
terms = vectorizer.get_feature_names()
# print(terms)   # vocabulary
# print(cntTf)   # term-count matrix
# Number of topics
n_topics=1
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
docres = lda.fit_transform(cntTf)
# fit_transform returns the document-topic distribution (docres);
# the topic-word distribution is in lda.components_.
# print(docres)
# print(lda.components_)
top_num=5
for topic_idx,topic in enumerate(lda.components_):
    print("topic %d:" % topic_idx)
    # argsort is ascending, so take the last top_num indices in reverse
    # order to print the highest-weight words of this topic
    for i in topic.argsort()[:-top_num - 1:-1]:
        print(terms[i])
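With n_topics=1 the document-topic matrix is trivial, but once more topics are used, docres can also be read directly to assign every pseudo-document its dominant topic. A minimal sketch under that assumption, reusing docres from the script above (nothing here comes from the original run):

import numpy as np

# docres has shape (n_documents, n_topics); the column with the
# largest probability is the dominant topic of that document.
dominant_topic = np.argmax(docres, axis=1)
for doc_idx, topic_idx in enumerate(dominant_topic[:10]):
    print("document %d -> topic %d (p=%.3f)"
          % (doc_idx, topic_idx, docres[doc_idx, topic_idx]))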

2019.6.2