• python3 doc2vec文本聚类实现


    import sys    #doc2vec
    import gensim
    import sklearn
    import numpy as np
     
    from gensim.models.doc2vec import Doc2Vec, LabeledSentence
     
    TaggededDocument = gensim.models.doc2vec.TaggedDocument
     
    def get_datasest(path="ttt.txt", encoding="utf-8"):
        """Load a pre-tokenised corpus and wrap every line as a tagged document.

        Each line of the file is treated as one document whose tokens are
        separated by whitespace. The number of lines read is printed.

        Args:
            path: Corpus file to read. Defaults to ``"ttt.txt"``.
            encoding: Text encoding of the file. It is stated explicitly so
                the result does not depend on the platform's default
                encoding; pass another value (e.g. ``"gbk"``) if the corpus
                is not UTF-8.

        Returns:
            A list with one ``TaggededDocument`` per line, tagged with the
            zero-based line index.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid in ``encoding``.
        """
        with open(path, 'r', encoding=encoding) as cf:
            docs = cf.readlines()
        print(len(docs))

        # Every line is kept, even a blank one, so that a document's tag is
        # also its index in the returned list (the script looks documents up
        # by tag). str.split() without an argument strips the newline and
        # never produces empty-string tokens.
        return [
            TaggededDocument(text.split(), tags=[i])
            for i, text in enumerate(docs)
        ]
     
    def getVecs(model, corpus, size):
        """Stack the stored vector of every document in *corpus* into a matrix.

        Args:
            model: Object exposing the document vectors as ``dv`` (gensim 4
                name) or ``docvecs`` (older name), indexable by document tag.
            corpus: Iterable of documents; the first entry of each
                document's ``tags`` is used as the lookup key.
            size: Length of one document vector.

        Returns:
            A NumPy array of shape ``(number of documents, size)``. An empty
            corpus yields an array of shape ``(0, size)``.
        """
        # Prefer the current attribute name and fall back to the legacy one.
        doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
        vecs = [np.array(doc_vectors[z.tags[0]]).reshape(1, size) for z in corpus]
        if not vecs:
            # np.concatenate rejects an empty list; return an empty matrix.
            return np.empty((0, size))
        return np.concatenate(vecs)
     
    def train(x_train, size=200, epoch_num=70):
        """Train a Doc2Vec model on *x_train* and save it to ``test/test``.

        Args:
            x_train: Re-iterable collection of tagged documents (it is
                scanned once to build the vocabulary and again on every
                epoch).
            size: Dimensionality of the learned document vectors.
            epoch_num: Number of training epochs. The default is 70, the
                value that was previously hard-coded while this argument was
                ignored, so ``train(x_train)`` still runs 70 epochs.

        Returns:
            The trained ``Doc2Vec`` model.
        """
        import os

        # ``vector_size`` replaces the ``size`` keyword, which gensim 4 no
        # longer accepts. The corpus is deliberately NOT passed to the
        # constructor: doing so already runs a full training pass, and the
        # explicit train() call below would then train a second time.
        model_dm = Doc2Vec(min_count=1, window=3, vector_size=size,
                           sample=1e-3, negative=5, workers=4)
        model_dm.build_vocab(x_train)
        model_dm.train(x_train, total_examples=model_dm.corpus_count,
                       epochs=epoch_num)

        # save() does not create missing directories.
        os.makedirs('test', exist_ok=True)
        model_dm.save('test/test')

        return model_dm
     
    def test():
        """Query the saved model for documents similar to a fixed sample.

        Loads the model from ``test/test``, infers a vector for a hard-coded
        token list, prints the model and the inferred vector, and looks up
        the 10 nearest stored document vectors.

        Returns:
            The result of ``most_similar`` for the inferred vector
            (``topn=10``).
        """
        model_dm = Doc2Vec.load("test/test")
        print(model_dm)
        # Every token must be its own list element: adjacent string literals
        # without a comma are silently concatenated by Python, which would
        # turn '十强' '出炉' into the single token '十强出炉'.
        test_text = ['', '舞林', '争霸', '', '十强', '出炉', '复活', '舞者', '澳门', '踢馆']
        inferred_vector_dm = model_dm.infer_vector(test_text)
        print(inferred_vector_dm)

        # Prefer the current attribute name and fall back to the legacy one.
        doc_vectors = model_dm.dv if hasattr(model_dm, 'dv') else model_dm.docvecs
        sims = doc_vectors.most_similar([inferred_vector_dm], topn=10)

        return sims
     
    if __name__ == '__main__':
        # Build the corpus, fit and save the model, then query it.
        x_train = get_datasest()
        model_dm = train(x_train)

        # Each result is unpacked as (document tag, similarity); the tag is
        # used directly as an index into x_train.
        for doc_index, score in test():
            tokens = x_train[doc_index][0]
            # Every token is followed by a single space, including the last.
            line = ''.join(token + ' ' for token in tokens)
            print(line, score, len(tokens))
    # Sits outside the __main__ guard, so it also prints on import.
    print('ok')
     
  • 相关阅读:
    Git删除不存在对应远程分支的本地分支
    Git删除远程分支
    将博客搬至CSDN
    HttpStatus
    Mysql 日期
    jekyll开发静态网站
    修改maven默认的jdk版本
    使用@Value进行静态常量的值注入
    妙笔生花处,惊艳四座
    Integer 和 int 值比较
  • 原文地址:https://www.cnblogs.com/oikoumene/p/9798265.html
Copyright © 2020-2023  润新知