本文使用word2vec(100维)做聚类,训练文本中一行是一条数据(已分词),具体代码如下:
from sklearn.cluster import KMeans from sklearn import preprocessing from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer #from sklearn.decomposition import PCA from gensim.models import Word2Vec import nltk from nltk.corpus import stopwords #from sklearn.model_selection import train_test_split import random import matplotlib.pyplot as plt %matplotlib inline #from sklearn.datasets.samples_generator import make_blob
加载文本:
# Load the pre-tokenized corpus: one sentence per line, words separated by
# single spaces, stop words already removed upstream.
sents = []
with open('generate_data/sents_for_kmeans.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # BUG FIX: the original used line.replace(' ', ''), which deletes the
        # spaces BETWEEN words, so the later sent.split(' ') produces a single
        # giant token per sentence. Only the trailing newline must be removed.
        sents.append(line.rstrip('\n'))
文本去重:
# Deduplicate the corpus. Note: set() does not preserve the original
# line order, so the resulting list order is arbitrary.
unique_sents = set(sents)
sents = list(unique_sents)
print(len(sents))
print(sents[10])  # spot-check one deduplicated sentence
结果如下:
训练word2vec模型:
# Re-tokenize each sentence into a word list and train Word2Vec on the
# corpus (gensim's default vector size is 100 — TODO confirm this matches
# the 100-dim setup described in the text).
all_words = []
for sentence in sents:
    all_words.append(sentence.split(' '))
word2vec = Word2Vec(all_words)
查看词典:
# Inspect the learned vocabulary.
# NOTE(review): `wv.vocab` is the gensim pre-4.0 API; gensim 4.x renamed it
# to `wv.key_to_index` — confirm the installed gensim version.
vocabulary = word2vec.wv.vocab
print(vocabulary.keys())
len(vocabulary)
将所有的词向量汇合到一个list中:
# Gather one embedding vector per vocabulary word into a single list for
# clustering. (Comprehension replaces the original manual append loop.)
vectors = [word2vec.wv[word] for word in vocabulary]
训练kmeans模型:
# Cluster the word vectors into two groups with k-means++ initialization
# and 40 random restarts (best inertia wins).
num_clusters = 2
# FIX: the n_jobs argument was deprecated in scikit-learn 0.23 and removed
# in 1.0 — passing it raises TypeError there. Dropping it only removes the
# (former) parallelism; the fitted result is unchanged.
km_cluster = KMeans(n_clusters=num_clusters,
                    max_iter=300,
                    n_init=40,
                    init='k-means++')
km_cluster.fit(vectors)
图形化展示:
# Visualize the clustering: each word's vector is drawn as circle markers
# (x = component index, y = component value), colored by its cluster
# ('or' = red circles, 'ob' = blue circles).
cents = km_cluster.cluster_centers_
labels = km_cluster.labels_
inertia = km_cluster.inertia_
mark = ['or', 'ob']
# FIX: removed the unused `color` variable and the manual `j` counter —
# zip pairs each label with its vector directly.
for label, vec in zip(labels, vectors):
    plt.plot(vec, mark[label], markersize=5)
plt.show()