from datetime import datetime
from os import path
import random

import jieba
from elasticsearch import Elasticsearch

es = Elasticsearch()

filePath = path.dirname(__file__)


# index1: wordcount
# Stop words: one word per line in stopwords.txt, plus a few extra
# terms ('腾讯' / Tencent, '视频' / video) and the Chinese full stop.
stopWordFile = 'stopwords.txt'
stopWordList = []
with open(path.join(filePath, stopWordFile), encoding='utf-8') as f:
    for line in f:
        stopWordList.append(line.strip())
stopWordList.extend(['腾讯', '视频', '。'])
stopWordList = set(stopWordList)

# Information words: segment the raw text with jieba and count every
# token that is not a stop word.
new = 'words.txt'
with open(path.join(filePath, new), encoding='utf-8') as f:
    text = f.read().strip()
wordDict = {}
for w in jieba.cut(text):
    if w not in stopWordList:
        wordDict.setdefault(w, 0)
        wordDict[w] += 1

# Index one {word, count} document per distinct word
# (doc_type is only accepted by Elasticsearch versions that still
# support mapping types).
for word, count in wordDict.items():
    data = {'word': word, 'count': count}
    es.index(index='wordcount', doc_type='test', body=data)
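
# The loop above issues one HTTP request per distinct word. For larger
# vocabularies, elasticsearch-py ships a bulk helper that batches the
# documents into far fewer requests. A minimal sketch of that alternative,
# assuming the same `es` client and `wordDict` counts from above, and an
# Elasticsearch version that still accepts the 'test' mapping type:

from elasticsearch import helpers

actions = [
    {
        '_index': 'wordcount',
        '_type': 'test',
        '_source': {'word': word, 'count': count},
    }
    for word, count in wordDict.items()
]
helpers.bulk(es, actions)

# Quick sanity check: after indexing (and a refresh), the index should
# hold one document per distinct word.
# print(es.count(index='wordcount'))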