• 根据职位名,自动生成 JD(职位描述)


    代码本身就是最好的解释,不赘述。

    文本聚类输出: cluster.py

    #!/usr/bin/env python
    # coding=utf-8
    
    import jieba,re
    from gensim import corpora,models
    from sklearn.cluster import KMeans
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    
    class MyCorpus(object):
        """Stream a text file as a sequence of jieba-tokenised lines.

        Iterating over an instance yields, for every line of ``fname``, the
        token generator returned by ``jieba.cut(line, cut_all=False)``.
        """

        def __init__(self, fname):
            # Path of the text file that __iter__ reads line by line.
            self.fname = fname

        def __iter__(self):
            # ``with`` closes the handle when iteration ends instead of
            # leaving it open until the garbage collector gets to it.
            with open(self.fname) as handle:
                for line in handle:
                    yield jieba.cut(line, cut_all=False)
    
    
    class MyCluster(object):
        """Cluster short texts with TF-IDF vectors and k-means.

        Texts are tokenised with jieba, turned into dense TF-IDF vectors via
        a gensim dictionary/TfidfModel, clustered with scikit-learn's KMeans,
        and one randomly chosen text per cluster is returned.
        """

        def __init__(self):
            # Matches every character that is not in U+4E00..U+9F5A, an ASCII
            # letter or an ASCII digit.  Written as a plain unicode literal so
            # the \u escapes are valid on both Python 2 and Python 3.
            # (Not referenced by the methods of this class.)
            self.CLEAN = re.compile(u"[^\u4e00-\u9f5aA-Za-z0-9]")
            # Both are replaced by gen_corpus(); gen_corpus() also sets self.tfidf.
            self.dictionary = {}
            self.corpus = []

        def gen_dataset(self, documents):
            """Fit the TF-IDF model on ``documents`` and vectorise each of them.

            Returns a list with one dense TF-IDF vector (list of floats, length
            ``len(self.dictionary)``) per document.
            """
            # The documents are walked twice (fit, then transform), so pin
            # them down in a list in case a one-shot iterable was passed.
            documents = list(documents)
            self.gen_corpus(documents)
            return [self.doc2vec(doc) for doc in documents]

        def gen_corpus(self, documents):
            """Build ``self.dictionary``, ``self.corpus`` and ``self.tfidf``."""
            texts = [list(jieba.cut(doc)) for doc in documents]
            self.dictionary = corpora.Dictionary(texts)
            self.corpus = [self.dictionary.doc2bow(text) for text in texts]
            self.tfidf = models.TfidfModel(self.corpus)

        def doc2vec(self, doc):
            """Return the dense TF-IDF vector of one document.

            Requires gen_corpus() to have been called first.
            """
            # Materialise the tokens, as gen_corpus() does, before doc2bow().
            bow = self.dictionary.doc2bow(list(jieba.cut(doc)))
            dense = [.0] * len(self.dictionary)
            for token_id, weight in self.tfidf[bow]:
                dense[token_id] = weight
            return dense

        def kcluster(self, texts, k=3):
            """Cluster ``texts`` into ``k`` groups and pick one text per group.

            ``texts`` may be any iterable of strings (list, set, ...).  ``k`` is
            capped at the number of texts.  Returns a list holding one text for
            every cluster label that KMeans assigned; which member of a cluster
            is picked, and the order of the result, are random.  An empty input
            yields an empty list.
            """
            from random import shuffle

            # Materialise once: the texts are tokenised, vectorised and finally
            # zipped with their labels, so they must be re-iterable and ordered.
            texts = list(texts)
            if not texts:
                return []
            # KMeans cannot form more clusters than there are samples.
            k = min(k, len(texts))

            # Round the weights to 5 decimals, as the original did.  Lists are
            # built explicitly because map() is lazy on Python 3.
            data = [[round(x, 5) for x in row] for row in self.gen_dataset(texts)]
            km = KMeans(n_clusters=k, init='k-means++', max_iter=200, n_init=1, verbose=True)
            km.fit(data)

            # list(...) is required on Python 3, where zip() is not shuffleable.
            labelled = list(zip(km.labels_, texts))
            shuffle(labelled)

            # After shuffling, keep the first text seen for each cluster label.
            seen = set()
            res = []
            for label, text in labelled:
                if label not in seen:
                    seen.add(label)
                    res.append(text)
            return res
    
    
    if __name__ == "__main__":
        # Demo: cluster the lines of data/python.db into 4 groups and print
        # one representative line per group.
        with open('data/python.db') as source:
            # Lines keep their trailing newline, exactly as before.
            texts = list(source)
        test = MyCluster()
        res = test.kcluster(texts, k=4)

        # The scraped source had this '\n' literal split across two lines.
        print('\n'.join(res))

    自动生成主文件: auto_gen_jd.py

    #!/usr/bin/env python
    # coding=utf-8
    
    import sys,os
    import simplejson as json
    import codecs
    # from snownlp import SnowNLP
    from simhash import Simhash
    # from bosonnlp import BosonNLP
    from cluster import MyCluster
    from jd_parser import JdParser
    import re
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class AutoGenJD(object):
        """Generate a job description (JD) from a job title and a sentence count.

        Given a job title, the closest known title in the JD database is looked
        up, its description is cleaned sentence by sentence, and a subset of
        those sentences is returned as the generated description/requirements.
        """

        # NOTE(review): the published listing had lost every backslash (\d, \s,
        # \S, \uXXXX, \n); they are restored below.  It also contained
        # duplicated ASCII punctuation inside character classes ("::", "((",
        # "))", ",,", ";;"); the second of each pair is assumed to have been
        # the full-width form -- confirm against the original source.

        # Enumeration markers: "1." / "2、" at line start, "(3)" at line start,
        # or a digit followed by a closing bracket anywhere in the line.
        CLEAR_NUM = re.compile(u"^\\d+[.、::]|^[((]\\d+[)).]?|\\d\\s*[))】]")

        # A leading punctuation mark together with the run of non-whitespace
        # that follows it, or a single trailing punctuation mark.
        # NOTE(review): the first alternative removes text, not just the mark;
        # verify "\S+" is what the author intended.
        CLEAR_COLO = re.compile(u"^[。.)(【】]\\S+|[.;;:。]$")

        # Lines matching this are dropped by clean_jd2(): punctuation followed
        # by digits, a line ending in a comma/colon/period (optionally preceded
        # by a digit), or a leading digit directly followed by a CJK character.
        _SKIP_LINE = re.compile(
            u"[;.;。]\\d+|\\d?[,,、::.]$|^\\d\\s{0,1}[\u4e00-\u9f5e]")

        def __init__(self):
            # Mapping loaded from data/lagou_jd_clean.json: its keys are used
            # as the known job titles, its values are fed to clean_jd2().
            with codecs.open('data/lagou_jd_clean.json') as source:
                self.jd_database = json.load(source)
            self.jobname = list(self.jd_database.keys())
            # NOTE(review): no BosonNLP client is created here, so
            # get_jd_with_bosonnlp() fails on self.bosonnlp until one is
            # assigned.  The API token that used to sit in a commented-out
            # line was removed; load credentials from configuration instead.
            self.jdparser = JdParser()
            self.km = MyCluster()

        def load_json_data(self, fname="../preprocess/data/mini_jd.json", arg1=None, arg2=None):
            """Split a JSON-lines dump into one text file per record.

            Each line of ``fname`` is parsed as JSON.  When ``data[arg1]`` is a
            dict holding "job_title" (2 to 16 characters long) and
            "job_description", the stripped description plus two newlines is
            written to ``./data/<data[arg1][arg2]>.txt``.  Lines that are not
            valid JSON are reported and skipped.
            """
            with codecs.open(fname) as source:
                for line in source:
                    try:
                        data = json.loads(line)
                    except ValueError as e:
                        # Malformed line: report and carry on with the next one.
                        print(e)
                        continue
                    record = data.get(arg1, False)
                    if not isinstance(record, dict):
                        continue
                    if "job_title" not in record or "job_description" not in record:
                        continue
                    if len(record["job_title"]) < 2 or len(record["job_title"]) > 16:
                        continue
                    # ``with`` closes every output file; the original leaked
                    # one open handle per record.
                    with codecs.open('./data/' + record[arg2] + ".txt", 'w', 'utf-8') as fw:
                        fw.write(record["job_description"].strip() + "\n\n")
                    print("writing... %s" % record[arg2])

        # Strip enumeration markers and stray punctuation from a JD text file.
        def clean_jd(self, fname="./data/java.txt"):
            """Clean ``<stem>.txt`` line by line and write the result to ``<stem>.db``.

            ``fname`` may be given with or without the ".txt" extension.  The
            original read ``fname + ".txt"`` but wrote ``fname[:-3] + "db"``,
            so no single argument worked for both; here both paths are derived
            from the same stem.

            Returns the set of cleaned sentences (longer than 2 characters).
            """
            stem, ext = os.path.splitext(fname)
            if ext != ".txt":
                # A bare stem such as "./data/java" was passed.
                stem = fname
            clean_sents = set()
            with codecs.open(stem + ".txt", 'r', 'utf-8') as fr:
                for line in fr:
                    line = self.CLEAR_NUM.sub("", line.strip())
                    line = self.CLEAR_COLO.sub("", line.strip())
                    if len(line) > 2:
                        clean_sents.add(line.strip())
            with codecs.open(stem + ".db", 'w', 'utf-8') as fw:
                for line in clean_sents:
                    fw.write(line + '\n')
            return clean_sents

        def is_most_english(self, line):
            """Return True when more than 70% of ``line`` is ASCII letters.

            An empty line returns False (the original divided by zero).
            """
            if not line:
                return False
            letters = sum(1 for ch in line
                          if u'A' <= ch <= u'Z' or u'a' <= ch <= u'z')
            return float(letters) / len(line) > 0.7

        def clean_jd2(self, jdstr):
            """Clean a JD string and return the set of usable sentences.

            The text is split on newlines.  A line is dropped when it matches
            ``_SKIP_LINE``, is shorter than 8 or longer than 32 characters, or
            is mostly English.  Enumeration markers and stray punctuation are
            then removed from the remaining lines.
            """
            res = set()
            for line in jdstr.split("\n"):
                line = line.strip()
                if len(line) < 12:
                    # Diagnostic output kept from the original.
                    print("line %s" % line)
                if self._SKIP_LINE.search(line) or len(line) < 8 or len(line) > 32:
                    continue
                if self.is_most_english(line):
                    continue
                line = self.CLEAR_NUM.sub("", line)
                line = self.CLEAR_COLO.sub("", line)
                if line:
                    # Cleaning can consume the whole line; skip empty results.
                    res.add(line)
            return res

        # Find the known job title closest to the user's input.
        def get_closet_job(self, jobname="java"):
            """Return the entry of ``self.jobname`` with the smallest Simhash distance.

            The five closest candidates and their distances are printed.
            """
            # Hash the query once instead of once per candidate.
            target = Simhash(jobname)
            dis = [(other, target.distance(Simhash(other))) for other in self.jobname]
            sorteddis = sorted(dis, key=lambda pair: pair[1])
            for name, distance in sorteddis[:5]:
                print("%s %s" % (name, distance))
            return sorteddis[0][0]

        # Clamp the requested number of JD sentences.
        def norm_jd_num(self, num):
            """Clamp ``num`` to the range 1..20."""
            if num < 1:
                num = 1
            elif num > 20:
                num = 20
            return num

        # Build a JD from a job title and a sentence count.
        def get_jd_with_snownlp(self, jobname="java", num=5):
            """Summarise the cleaned JD of the closest job title with SnowNLP.

            NOTE(review): the SnowNLP import at the top of the file is commented
            out, so this raises NameError until it is restored.
            """
            jobname = self.get_closet_job(jobname)
            sentences = self.clean_jd2(self.jd_database[jobname])
            # clean_jd2() returns a set, while the variant that used to be
            # commented out here passed SnowNLP the text of a file; join the
            # sentences back into one newline-separated text.
            # NOTE(review): confirm against the SnowNLP API.
            s = SnowNLP(u"\n".join(sentences))
            return s.summary(num)

        def get_jd_with_bosonnlp(self, jobname="java", num=5):
            """Pick one sentence per BosonNLP cluster from the cleaned JD.

            At most 80 cleaned sentences are sent to ``self.bosonnlp.cluster``.
            Returns a set of sentences; ``num`` is not used.

            NOTE(review): ``self.bosonnlp`` is never assigned in __init__, so
            this raises AttributeError until a client is provided.
            """
            res = set()
            jobname = self.get_closet_job(jobname)
            # clean_jd2() returns a set, which cannot be sliced or indexed.
            jdstr = list(self.clean_jd2(self.jd_database[jobname]))[:80]
            all_cluster = self.bosonnlp.cluster(jdstr)
            sort_all_cluster = sorted(all_cluster, key=lambda x: x['num'], reverse=True)
            for idx, cluster in enumerate(sort_all_cluster):
                print("%s %s" % (idx + 1, cluster['_id']))
                res.add(jdstr[cluster['_id']])
            return res

        def _get_sent_score(self, line):
            """Score a sentence for the final ordering; lower scores sort first.

            The base score is ``len(line) + 100``.  Mentions of gender or age
            subtract 60, education/major/salary figures subtract 40, and
            experience subtracts 20.
            """
            s = len(line) + 100
            if re.search(u"男|女|男女不限|性别|岁", line):
                s -= 60
            if re.search(u"学历|专业|\\d+[kK元]", line):
                s -= 40
            if re.search(u"经验", line):
                s -= 20
            return s

        def get_jd_with_kmeans(self, jobname='python', num=6):
            """Build a JD by k-means clustering; each cluster contributes one sentence.

            ``num`` is the wanted number of sentences, capped at the number of
            cleaned sentences available.  Returns the chosen sentences sorted by
            ``_get_sent_score`` (ascending), or an empty list when nothing is
            left to cluster.
            """
            jobname = self.get_closet_job(jobname)
            jdstr = self.clean_jd2(self.jd_database[jobname])
            # Diagnostic output kept from the original.
            print("jdstr %d" % len(jdstr))
            print(self.jd_database[jobname])

            num = min(int(num), len(jdstr))
            if num < 1:
                # Nothing to cluster (or no sentences requested).
                return []
            res = self.km.kcluster(jdstr, k=num)
            # key= gives the same stable ordering as the Python-2-only cmp=.
            return sorted(res, key=self._get_sent_score)

        def jd_parser(self, jdstr):
            """Return ``self.jdparser.parser(jdstr)``."""
            result = self.jdparser.parser(jdstr)
            return result
    
    if __name__ == "__main__":
        # Usage: auto_gen_jd.py <job_name> <sentence_count>
        # Validate the arguments before the JD database is loaded, so a bad
        # invocation fails fast with a usage message instead of a traceback.
        if len(sys.argv) < 3 or not sys.argv[2].isdigit():
            sys.exit("usage: %s <job_name> <sentence_count>" % sys.argv[0])

        jobname = sys.argv[1]
        jdnum = int(sys.argv[2])
        test = AutoGenJD()
        print("job name: %s" % jobname)
        print("demand:")
        demand = test.get_jd_with_kmeans(jobname, jdnum)
        for i, jdstr in enumerate(demand):
            print("%d. %s" % (i + 1, jdstr))
    每天一小步,人生一大步!Good luck~
  • 相关阅读:
    初探Remoting双向通信(三)
    MySQL主从复制
    MySQL锁机制
    mySql索引
    连接池
    JDBC
    数据库建表、约束、索引
    Oracle和SQL简介
    Stream API 和 注解
    lambda表达式
  • 原文地址:https://www.cnblogs.com/jkmiao/p/4874803.html
Copyright © 2020-2023  润新知