K-Means算法之文本聚类

参考:

https://blog.csdn.net/sinat_26917383/article/details/70240628
https://blog.csdn.net/gamer_gyt/article/details/51244850
https://blog.csdn.net/feng_zhiyu/article/details/81952697 *
https://www.cnblogs.com/mengnan/p/9307648.html

https://www.2cto.com/net/201805/747768.html *
https://www.jianshu.com/p/622222b96f76
https://blog.csdn.net/qq_43228162/article/details/85111049
http://blog.sina.com.cn/s/blog_9b03e9eb0102wpdz.html
https://www.cnblogs.com/mengnan/p/9307648.html
https://blog.csdn.net/feng_zhiyu/article/details/81952697 Python中的TfidfVectorizer参数解析
https://blog.csdn.net/gamer_gyt/article/details/51244850
https://blog.csdn.net/sinat_26917383/article/details/70240628#4_124

能找出一些明显的代表词:

代码(其中可视化等系统解决):

#coding=utf-8
import re
import jieba
import jieba.analyse # 提取关键内容
import jieba.posseg as pseg # 词性标注
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from snownlp import SnowNLP
from scipy.misc import imread
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer # 基于TF-IDF的词频转向量库
from sklearn.cluster import KMeans
import jieba.posseg as pseg
import newspaper
import requests # 导入爬虫库
from bs4 import BeautifulSoup
import datetime as dt
from sklearn.manifold import TSNE
from pandas import DataFrame
import pandas as pd
def read_from_file(file_name):
    """Read an entire text file and return its contents as one string.

    Args:
        file_name: path to a UTF-8 encoded text file.

    Returns:
        The file's full contents as a single string.
    """
    # Explicit encoding: the corpus is Chinese text, so relying on the
    # platform default (e.g. GBK on Windows) could corrupt it.
    with open(file_name, encoding='utf-8') as fp:
        return fp.read()


def cut_words(words):
    """Segment a raw text string into a list of tokens using jieba."""
    # jieba.cut returns a generator; materialise it so callers get a list.
    return [token for token in jieba.cut(words)]


def stop_words(stop_word_file):
    """Load a stop-word list, one word per line.

    Args:
        stop_word_file: path to a UTF-8 stop-word file, one word per line.

    Returns:
        A list of stop words.
    """
    # splitlines() (instead of split('\n')) also handles '\r\n' line
    # endings and does not yield a spurious empty entry for the file's
    # trailing newline.
    with open(stop_word_file, encoding='utf-8') as fp:
        return fp.read().splitlines()

# Hard-coded input paths for the raw corpus and the stop-word list.
# NOTE(review): absolute paths specific to the author's machine — parameterise
# before reuse.
read_file_name="/root/temp/data2/17k.txt"
stop_file_name="/root/temp/data/stopwordlist.txt"
listall=read_from_file(read_file_name)  # entire corpus as one string
stopwords = stop_words(stop_file_name)  # list of stop words
def del_stop_words(words, stop_words_set):
    """Return the tokens from *words* that are not in *stop_words_set*."""
    return [token for token in words if token not in stop_words_set]

# Earlier draft (per-document loop), kept for reference:
# for a in listall:
#     list0.extend(del_stop_words(cut_words(a), stopwords))
list0 = []
# Tokenise the whole corpus and drop stop words.
list0 = del_stop_words(cut_words(listall),stopwords)
# Deduplicate while preserving first-occurrence order.
# NOTE(review): sort(key=list0.index) is O(n^2); acceptable for small corpora.
listall1 = list(set(list0))
listall1.sort(key = list0.index)
comment_list=listall1
# print(listall1)


# Bag-of-words counts followed by TF-IDF re-weighting.
# NOTE(review): each element of listall1 is a single token, so every
# "document" fed to CountVectorizer is one word — confirm this is intended.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(listall1))


# Vocabulary learned by the vectorizer: word[j] is the term for column j.
word = vectorizer.get_feature_names()
# print("word feature length: {}".format(len(word)))

# Dense TF-IDF matrix: rows = documents, columns = vocabulary terms.
tfidf_weight = tfidf.toarray()


def KMeans_function(tfidf_weight, word, clusters=4):
    """Cluster the TF-IDF matrix with K-Means and print a summary.

    Prints the per-cluster document counts and the ten most representative
    vocabulary terms of each cluster centre.

    Args:
        tfidf_weight: dense TF-IDF matrix, shape (n_documents, n_terms).
        word: vocabulary list; word[j] is the term for feature column j.
        clusters: number of clusters to fit (default 4).

    Returns:
        The fitted KMeans estimator (new, backward-compatible: the
        original returned None implicitly).
    """
    clf_KMeans = KMeans(n_clusters=clusters)
    clf_KMeans.fit(tfidf_weight)

    # Per-cluster document counts.
    quantity = pd.Series(clf_KMeans.labels_).value_counts()
    print("cluster2聚类数量:\n")
    print(quantity)

    # argsort is ascending; reverse each row so the highest-weight
    # (most representative) terms come first.
    order_centroids = clf_KMeans.cluster_centers_.argsort()[:, ::-1]
    # BUGFIX: the original `for i in (clusters):` iterated over an int and
    # raised TypeError; the intent (per the commented draft) is to print
    # the top-10 terms of each cluster centre.
    for label in range(clusters):
        print("Cluster %d:" % label, end='')
        for ind in order_centroids[label, :10]:
            print(' %s' % word[ind], end='')
        print()
    return clf_KMeans



# Run the clustering over the corpus with the default 4 clusters.
KMeans_function(tfidf_weight,word)



# order_centroids = kmeans.cluster_centers_.argsort()#[:, ::-1]
# print(order_centroids)
# terms = vectorizer.get_feature_names()
# for i in range(3):
# print("\n Cluster %d:" % i, end='')
# for ind in order_centroids[i, :10]:
# # print(ind)
# print(' %s' % terms[ind], end='')
# print()

# 打印出各个族的中心点
# print(kmeans.cluster_centers_)
# for index, label in enumerate(kmeans.labels_, 1):
# print("index: {}, label: {}".format(index, label))

# 样本距其最近的聚类中心的平方距离之和,用来评判分类的准确度,值越小越好
# k-means的超参数n_clusters可以通过该值来评估
# print("inertia: {}".format(kmeans.inertia_))


'''
6、可视化
'''

# 使用T-SNE算法,对权重进行降维,准确度比PCA算法高,但是耗时长
# tsne = TSNE(n_components=2)
# decomposition_data = tsne.fit_transform(tfidf_weight)

# x = []
# y = []

# for i in decomposition_data:
# x.append(i[0])
# y.append(i[1])
#
# fig = plt.figure(figsize=(10, 10))
# ax = plt.axes()
# plt.scatter(x, y, c=kmeans.labels_, marker="x")
# plt.xticks(())
# plt.yticks(())
# # plt.show()
# plt.savefig('./sample.png', aspect=1)


2019.5.25