1.学会了Python的文本聚类:大致步骤是分词、获取词频矩阵和词袋、进行聚类
# Text-clustering script.
#
# Pipeline: read the documents from column 1 of scien1.csv, segment them with
# jieba, build a bag-of-words count matrix and its TF-IDF weights, dump the
# vocabulary and weights to a text file, cluster the TF-IDF rows with KMeans,
# and write the cluster label of every document back to result.csv.
import codecs

import jieba
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

data = pd.read_csv("scien1.csv")
col = data.iloc[:, 1]
arrs = col.values

# Tokens that are dropped before the text is vectorized.
stopwords = {}.fromkeys([',', '。', '!', '这', '我', '非常'])

# Accurate mode (cut_all=False) segmentation.
print(u" 中文分此后结果:")
curpus = []
for a in arrs:
    # Remove stopword tokens, glue the remaining tokens back together and
    # segment the cleaned text again; the space-joined tokens form one document.
    # NOTE(review): the second segmentation can merge tokens across the position
    # of a removed stopword; kept because it is the existing behaviour.
    final = ''.join(
        seg for seg in jieba.cut(a, cut_all=False) if seg not in stopwords
    )
    seg_list = jieba.cut(final, cut_all=False)
    output = ' '.join(seg_list)
    curpus.append(output)

# %%
# Convert the segmented documents into a term-count matrix.
# The corpus must NOT be passed to the constructor: its first parameter is
# `input`, not the data to fit.
vectorizer = CountVectorizer()
# Re-weights the raw counts as TF-IDF.
transformer = TfidfTransformer()
# Fit once and reuse the count matrix for both the raw counts and TF-IDF.
x = vectorizer.fit_transform(curpus)
tfidf = transformer.fit_transform(x)

# All terms of the bag-of-words vocabulary, in column order.
# get_feature_names() was removed from recent scikit-learn releases, so prefer
# the new accessor and fall back to the old one on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    word = list(vectorizer.get_feature_names_out())
else:
    word = list(vectorizer.get_feature_names())
print('')

# toarray() already returns a NumPy array (pd.np no longer exists in pandas).
docs_matrix = x.toarray()
print(docs_matrix)
weight = tfidf.toarray()

resName = "BaiduTfidf_Result.txt"
# The context manager guarantees the file is flushed and closed.
with codecs.open(resName, 'w', 'utf-8') as result:
    # First line: the vocabulary.
    for term in word:
        result.write(term + ' ')
        print(term + ' ')
    # Terminate each row with a newline so the header and the matrix rows do
    # not run together on a single line.
    result.write('\n')

    # %%
    # One line per document: its TF-IDF weight for every vocabulary term.
    for row in weight:
        for value in row:
            result.write(str(value) + ' ')
        result.write('\n')

# %%
clf = KMeans(n_clusters=20)
s = clf.fit(weight)
print(s)
print(clf.cluster_centers_)
print(clf.labels_)

# Print "<1-based document index> <cluster label>" and collect the labels.
number = []
for i, label in enumerate(clf.labels_, start=1):
    print(i, label)
    number.append(label)
print(number)

# Inertia can be used to judge whether the number of clusters is suitable:
# pick the cluster count at the elbow of the inertia curve.
# print(clf.inertia_)

# %%
lable = []  # placeholder for class labels (unused in this script)
content = []
first = pd.read_csv("scien1.csv")
# Attach the cluster label of each document as a new column.
first['julei'] = number

# %%
print(first.head())
asdf = first.to_csv("result.csv")
2. ECharts GL 关系图(graphGL)的使用
<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<!DOCTYPE html>
<!--
  Renders a relationship graph with the ECharts GL "graphGL" series.
  The graph data is loaded from data/b.json, which is read as an object with:
    nodes            - array of node names
    dependentsCount  - array of values, parallel to nodes
    edges            - flat array of node references: [source0, target0, source1, target1, ...]
  data/graph-modularity.js is loaded alongside the data before the chart is drawn.
-->
<html style="height: 100%">
<head>
<meta charset="UTF-8">
<title>Insert title here</title>
</head>
<body style="height: 100%; margin: 0">
<div id="container" style="height: 100%"></div>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/dist/echarts.min.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts-gl/dist/echarts-gl.min.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts-stat/dist/ecStat.min.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/dist/extension/dataTool.min.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/map/js/china.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/map/js/world.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/dist/extension/bmap.min.js"></script>
<script type="text/javascript" src="assets/js/jquery-3.3.1.js"></script>
<script type="text/javascript">
var dom = document.getElementById("container");
var myChart = echarts.init(dom);
var app = {};
// Declared explicitly so it is not created as an implicit global.
var option = null;

// Wait for both the graph data and the modularity script before drawing.
$.when(
    $.getJSON("data/b.json"),
    $.getScript("data/graph-modularity.js")
).done(function (res) {
    // res holds the arguments of the first request: [data, textStatus, jqXHR].
    var data = res[0];

    // One node per name; its value is the entry at the same index of dependentsCount.
    var nodes = data.nodes.map(function (nodeName, idx) {
        return {
            name: nodeName,
            value: data.dependentsCount[idx]
        };
    });

    // Edges are stored as a flat list of (source, target) pairs.
    // Stepping by two with an "i + 1" bound ignores a dangling last element
    // instead of producing an edge with an undefined target.
    var edges = [];
    for (var i = 0; i + 1 < data.edges.length; i += 2) {
        edges.push({
            source: data.edges[i],
            target: data.edges[i + 1]
        });
    }

    nodes.forEach(function (node) {
        // Every node shows its label when emphasized.
        node.emphasis = {
            label: {
                show: true
            }
        };
        // Only nodes with a large value show their label permanently.
        if (node.value > 5000) {
            node.label = {
                show: true
            };
        }
    });

    myChart.setOption({
        backgroundColor: '#000',
        series: [{
            color: [
                "rgb(203,239,15)", "rgb(73,15,239)", "rgb(15,217,239)",
                "rgb(30,15,239)", "rgb(15,174,239)", "rgb(116,239,15)",
                "rgb(239,15,58)", "rgb(15,239,174)", "rgb(239,102,15)",
                "rgb(239,15,15)", "rgb(15,44,239)", "rgb(239,145,15)",
                "rgb(30,239,15)", "rgb(239,188,15)", "rgb(159,239,15)",
                "rgb(159,15,239)", "rgb(15,239,44)", "rgb(15,239,87)",
                "rgb(15,239,217)", "rgb(203,15,239)", "rgb(239,15,188)",
                "rgb(239,15,102)", "rgb(239,58,15)", "rgb(239,15,145)",
                "rgb(116,15,239)", "rgb(15,131,239)", "rgb(73,239,15)",
                "rgb(15,239,131)", "rgb(15,87,239)", "rgb(239,15,231)"
            ],
            type: 'graphGL',
            nodes: nodes,
            edges: edges,
            modularity: {
                resolution: 2,
                sort: true
            },
            lineStyle: {
                color: 'rgba(255,255,255,1)',
                opacity: 0.05
            },
            itemStyle: {
                opacity: 1
            },
            focusNodeAdjacency: false,
            focusNodeAdjacencyOn: 'click',
            symbolSize: function (value) {
                return Math.sqrt(value / 10);
            },
            label: {
                textStyle: {
                    color: '#fff'
                }
            },
            emphasis: {
                label: {
                    show: false
                },
                lineStyle: {
                    opacity: 0.5,
                    // The property name was missing here, which made the
                    // whole script a syntax error.
                    width: 4
                }
            },
            forceAtlas2: {
                steps: 5,
                stopThreshold: 20,
                // NOTE: "jitterTolerence" is kept exactly as written in the original
                // option object; verify the spelling against the ECharts GL docs.
                jitterTolerence: 10,
                edgeWeight: [0.2, 1],
                gravity: 5,
                edgeWeightInfluence: 0
            }
        }]
    });
}).fail(function (jqXHR, textStatus, errorThrown) {
    // Surface load failures instead of leaving a silent blank page.
    console.error('Failed to load graph data:', textStatus, errorThrown);
});

// option stays null on this page, so this branch only runs if an option
// object is assigned above.
if (option && typeof option === "object") {
    myChart.setOption(option, true);
}
</script>
</body>
</html>