The code itself is the best explanation, so I won't elaborate further.
Text clustering output: cluster.py
#!/usr/bin/env python
# coding=utf-8
import jieba, re
from gensim import corpora, models
from sklearn.cluster import KMeans
import sys
reload(sys)
sys.setdefaultencoding('utf-8')


class MyCorpus(object):
    def __init__(self, fname):
        self.fname = fname

    def __iter__(self):
        for line in open(self.fname):
            yield jieba.cut(line, cut_all=False)


class MyCluster(object):
    def __init__(self):
        self.CLEAN = re.compile(ur"[^\u4e00-\u9fa5A-Za-z0-9]")
        self.dictionary = {}
        self.corpus = []

    def gen_dataset(self, documents):
        # Build the dictionary and TF-IDF model, then vectorize every document
        self.gen_corpus(documents)
        res = [self.doc2vec(doc) for doc in documents]
        return res

    def gen_corpus(self, documents):
        texts = [list(jieba.cut(doc)) for doc in documents]
        self.dictionary = corpora.Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.tfidf = models.TfidfModel(self.corpus)

    def doc2vec(self, doc):
        # Turn one document into a dense TF-IDF vector over the whole vocabulary
        vec = self.dictionary.doc2bow(jieba.cut(doc))
        vec = self.tfidf[vec]
        wordlist = [.0] * len(self.dictionary)
        for w in vec:
            wordlist[w[0]] = w[1]
        return wordlist

    def kcluster(self, texts, k=3):
        from random import shuffle
        data = self.gen_dataset(texts)
        data = [map(lambda x: round(x, 5), line) for line in data]
        km = KMeans(n_clusters=k, init='k-means++', max_iter=200, n_init=1, verbose=True)
        km.fit(data)
        labels = km.labels_
        # Shuffle, then keep the first sentence seen from each cluster
        flag = [0] * len(labels)
        randomtext = zip(labels, texts)
        shuffle(randomtext)
        res = []
        for d in randomtext:
            if flag[d[0]] == 0:
                res.append(d[1])
                flag[d[0]] = 1
        return res


if __name__ == "__main__":
    texts = [line for line in open('data/python.db')]
    test = MyCluster()
    res = test.kcluster(texts, k=4)
    print '\n'.join(res)
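MyCluster can also be tried on its own, independent of the JD data. Below is a minimal usage sketch; the three toy sentences and k=2 are made up for illustration, and it assumes jieba, gensim and scikit-learn are installed:

# -*- coding: utf-8 -*-
# Minimal sketch: pick one representative sentence per cluster
# from a few toy sentences (the sentences below are illustrative only).
from cluster import MyCluster

toy_texts = [
    u"熟悉python,有爬虫开发经验",
    u"精通java,熟悉spring框架",
    u"熟悉python,了解numpy和pandas",
]

cm = MyCluster()
picked = cm.kcluster(toy_texts, k=2)  # TF-IDF vectors + KMeans, one sentence kept per cluster
for sent in picked:
    print sent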
Main auto-generation file: auto_gen_jd.py
#!/usr/bin/env python
# coding=utf-8
import sys, os
import simplejson as json
import codecs
# from snownlp import SnowNLP
from simhash import Simhash
# from bosonnlp import BosonNLP
from cluster import MyCluster
from jd_parser import JdParser
import re

reload(sys)
sys.setdefaultencoding('utf-8')


class AutoGenJD(object):
    '''
    Automatically generate a JD: given a job title and a sentence count,
    output a job description / list of requirements.
    '''

    def __init__(self):
        self.CLEAR_NUM = re.compile(u"^\d+[.、::]|^[((]\d+[)).]?|\d\s*[))】]")
        self.CLEAR_COLO = re.compile(u"^[。.)(【】]\S+|[.;:;。]$")
        self.jd_database = json.load(codecs.open('data/lagou_jd_clean.json'))
        # self.jobname = [jobname[:-3] for jobname in os.listdir("data") if jobname.endswith(".db")]
        self.jobname = self.jd_database.keys()
        # self.bosonnlp = BosonNLP('UYTG1Csb.3652.5pZ2otkIncEn')
        self.jdparser = JdParser()
        self.km = MyCluster()

    def load_json_data(self, fname="../preprocess/data/mini_jd.json", arg1=None, arg2=None):
        for line in codecs.open(fname):
            try:
                data = json.loads(line)
            except Exception, e:
                print e
                continue
            if data.get(arg1, False) != False and data[arg1].has_key("job_title") and data[arg1].has_key("job_description"):
                if len(data[arg1]["job_title"]) < 2 or len(data[arg1]["job_title"]) > 16:
                    continue
                else:
                    fw = codecs.open('./data/' + data[arg1][arg2] + ".txt", 'w', 'utf-8')
                    fw.write(data[arg1]["job_description"].strip() + "\n")
                    print "writing...", data[arg1][arg2]

    # Clean the data: strip serial numbers and similar noise
    def clean_jd(self, fname="./data/java.txt"):
        clean_sents = set()
        with codecs.open(fname + ".txt", 'r', 'utf-8') as fr:
            for line in fr:
                line = self.CLEAR_NUM.sub("", line.strip())
                line = self.CLEAR_COLO.sub("", line.strip())
                if len(line) > 2:
                    clean_sents.add(line.strip())
        with codecs.open(fname[:-3] + "db", 'w', 'utf-8') as fw:
            for line in clean_sents:
                fw.write(line + '\n')
        return clean_sents

    def is_most_english(self, line):
        en_word = [uchar for uchar in line
                   if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a')]
        return float(len(en_word) * 1.0 / len(line)) > 0.7

    def clean_jd2(self, jdstr):
        """
        Clean the data: drop punctuation, serial numbers and other noise
        from the start and end of every sentence.
        """
        res = set()
        for line in jdstr.split("\n"):
            line = line.strip()
            if len(line) < 12:
                print "line", line
            if re.search(u"[;.;。]\d+|\d?[,,、::.]$|^\d\s{0,1}[\u4e00-\u9fa5]", line) or len(line) < 8 or len(line) > 32:
                continue
            if self.is_most_english(line):
                continue
            line = self.CLEAR_NUM.sub("", line)
            line = self.CLEAR_COLO.sub("", line)
            res.add(line)
        return res

    # Find the known job title closest to the user's input
    def get_closet_job(self, jobname="java"):
        dis = [(other, Simhash(jobname).distance(Simhash(other))) for other in self.jobname]
        sorteddis = sorted(dis, key=lambda x: x[1])
        for k, v in sorteddis[:5]:
            print k, v
        return sorteddis[0][0]

    # Normalize the requested number of JD sentences
    def norm_jd_num(self, num):
        if num < 1:
            num = 1
        elif num > 20:
            num = 20
        return num

    # Generate a JD from a job title and a sentence count
    def get_jd_with_snownlp(self, jobname="java", num=5):
        jobname = self.get_closet_job(jobname)
        # with open("./data/"+jobname+".db") as fr:
        #     s = SnowNLP(fr.read())
        #     return s.summary(num)
        jdstr = self.clean_jd2(self.jd_database[jobname])
        s = SnowNLP(u"\n".join(jdstr))  # SnowNLP expects a string, not a set
        return s.summary(num)

    def get_jd_with_bosonnlp(self, jobname="java", num=5):
        res = set()
        jobname = self.get_closet_job(jobname)
        jdstr = list(self.clean_jd2(self.jd_database[jobname]))[:80]
        all_cluster = self.bosonnlp.cluster(jdstr)
        sort_all_cluster = sorted(all_cluster, key=lambda x: x['num'], reverse=True)
        for idx, cluster in enumerate(sort_all_cluster):
            print idx + 1, cluster['_id']
            res.add(jdstr[cluster['_id']])
        return res

    def _get_sent_score(self, line):
        """
        Sentence score used to sort the final result:
        the lower the score, the earlier the sentence is ranked.
        """
        s = len(line) + 100
        if re.search(u"男|女|男女不限|性别|岁", line):
            s -= 60
        if re.search(u"学历|专业|\d+[kK元]", line):
            s -= 40
        if re.search(u"经验", line):
            s -= 20
        return s

    def get_jd_with_kmeans(self, jobname='python', num=6):
        """
        Cluster the sentences with k-means and keep only one sentence per cluster.
        """
        jobname = self.get_closet_job(jobname)
        jdstr = self.clean_jd2(self.jd_database[jobname])
        print "jdstr", len(jdstr)
        print self.jd_database[jobname]
        if len(jdstr) < int(num):
            num = len(jdstr)
        res = self.km.kcluster(jdstr, k=int(num))
        return sorted(res, cmp=lambda x, y: self._get_sent_score(x) - self._get_sent_score(y))

    def jd_parser(self, jdstr):
        result = self.jdparser.parser(jdstr)
        return result


if __name__ == "__main__":
    test = AutoGenJD()
    jobname = sys.argv[1]
    jdnum = int(sys.argv[2])
    print "job name:", jobname
    print "demand:"
    demand = test.get_jd_with_kmeans(jobname, jdnum)
    for i, jdstr in enumerate(demand):
        print "%d. %s" % (i + 1, jdstr)
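To run the whole pipeline, pass a job title and a sentence count on the command line, e.g. python auto_gen_jd.py python 6, which is exactly what the __main__ block above does. The same can be done programmatically; a minimal sketch, assuming data/lagou_jd_clean.json and the jd_parser module are in place:

# -*- coding: utf-8 -*-
# Minimal programmatic sketch; assumes data/lagou_jd_clean.json and the
# jd_parser module from this project are available on the path.
from auto_gen_jd import AutoGenJD

gen = AutoGenJD()
# The input job title need not match a database key exactly:
# get_closet_job() falls back to the closest known title by Simhash distance.
for i, sent in enumerate(gen.get_jd_with_kmeans("python", 6)):
    print "%d. %s" % (i + 1, sent)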