• Notes on using doc2vec


    #!/usr/bin/env python
    # coding:utf-8
    # import dependencies
    # import sys
    # reload(sys)
    # sys.setdefaultencoding('utf-8')
    import chardet
    from gensim import utils
    from gensim.models.doc2vec import LabeledSentence
    from gensim.models import Doc2Vec
    import numpy
    from random import shuffle
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import confusion_matrix
    import sklearn.metrics as metrics
    # Doc2Vec takes LabeledSentence objects as input, so this class turns each text file into a stream of LabeledSentence objects
    class LabeledLineSentence(object):
    
        def __init__(self, sources):
            self.sources = sources
            flipped = {}
            # make sure that the label prefixes (the dict values) are unique
            for key, value in sources.items():
                if value not in flipped:
                    flipped[value] = [key]
                else:
                    raise Exception('Non-unique prefix encountered')
    
        def __iter__(self):
            for source, prefix in self.sources.items():
                with utils.smart_open(source) as fin:
                    for item_no, line in enumerate(fin):
                        yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
        def to_array(self):
            self.sentences = []
            for source, prefix in self.sources.items():
                with utils.smart_open(source) as fin:
                    for item_no, line in enumerate(fin):
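                        # the corpus files are GB2312-encoded, so re-encode
                        # each line as UTF-8 (the chardet prints are just
                        # sanity checks on the detected encoding)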
                        print chardet.detect(line)
                        line = line.decode("GB2312", 'ignore').encode("utf-8")
                        print chardet.detect(line)
                        self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
                        # self.sentences.append(LabeledSentence(utils.to_utf8(line).split(), [prefix + '_%s' % item_no]))
            return self.sentences
    
        def sentences_perm(self):
            shuffle(self.sentences)
            return self.sentences
    
    # Load the text files into Doc2Vec with a {path: label_prefix} mapping
    # sources = {u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/neg_train.txt':'TRAIN_NEG',
    # u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/pos_train.txt':'TRAIN_POS'
    # ,u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/uns_train.txt':'TRAIN_UNS',
    # u'/Volumes/Macintosh HD/Users/RayChou/Downloads/情感分析训练语料/uns_test.txt':'TEST_UNS'}
    sources = {
        './yuliao/fYuliao0.txt': 'TRAIN_0',
        './yuliao/fYuliao1.txt': 'TRAIN_1',
        './yuliao/fYuliao2.txt': 'TRAIN_2',
        './yuliao/fYuliao3.txt': 'TRAIN_3',
        './yuliao/fYuliao4.txt': 'TRAIN_4',
        './yuliao/fYuliao5.txt': 'TRAIN_5',
    }
    sentences = LabeledLineSentence(sources)
    
    # Build the Doc2Vec model
    
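    # min_count=1 keeps every token, window=15 is the context window,
    # size=100 is the vector dimensionality, sample=1e-4 downsamples very
    # frequent words, negative=5 enables negative sampling, and workers=8
    # sets the number of training threads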
    model = Doc2Vec(min_count=1, window=15, size=100, sample=1e-4, negative=5, workers=8)
    model.build_vocab(sentences.to_array())
    
    # Train the Doc2Vec model (only 2 passes here; train for more epochs if time allows)
    for epoch in range(2):
        model.train(sentences.sentences_perm())
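        # note: gensim >= 1.0 requires explicit arguments here, e.g.
        # model.train(docs, total_examples=model.corpus_count, epochs=1)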
    model.save("model.txt")
    # model=Doc2Vec.load("model.txt")
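    # (save writes gensim's native model format despite the .txt extension;
    # Doc2Vec.load reads it back)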
    
    # Pack the trained sentence vectors into arrays as input for the classifiers below
    train_arrays = numpy.zeros((5000, 100))
    train_labels = numpy.zeros(5000)
    test_arrays = []
    true_labels = []
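    # the 5000 training vectors are laid out by class: TRAIN_0 has 646
    # documents (indices 0-645), TRAIN_1 has 3604 (646-4249), TRAIN_2 has
    # 551 (4250-4800), TRAIN_3 has 165 (4801-4965), TRAIN_4 has 29
    # (4966-4994) and TRAIN_5 has 5 (4995-4999)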
    for i in range(5000):
        if i <= 645:
            prefix_train_0 = 'TRAIN_0_' + str(i)
            train_arrays[i] = model.docvecs[prefix_train_0]
            train_labels[i] = 0
        elif i <= 4249:
            j = i - 646
            prefix_train_1 = 'TRAIN_1_' + str(j)
            train_arrays[i] = model.docvecs[prefix_train_1]
            train_labels[i] = 1
        elif i <= 4800:
            j = i - 4250
            prefix_train_2 = 'TRAIN_2_' + str(j)
            train_arrays[i] = model.docvecs[prefix_train_2]
            train_labels[i] = 2
        elif i <= 4965:
            j = i - 4801
            prefix_train_3 = 'TRAIN_3_' + str(j)
            train_arrays[i] = model.docvecs[prefix_train_3]
            train_labels[i] = 3
        elif i <= 4994:
            j = i - 4966
            prefix_train_4 = 'TRAIN_4_' + str(j)
            train_arrays[i] = model.docvecs[prefix_train_4]
            train_labels[i] = 4
        else:
            j = i - 4995
            prefix_train_5 = 'TRAIN_5_' + str(j)
            train_arrays[i] = model.docvecs[prefix_train_5]
            train_labels[i] = 5
    # Load the test data
    a=open("./yuliao/fYuliao0_test.txt")
    b=open("./yuliao/fYuliao1_test.txt")
    c=open("./yuliao/fYuliao2_test.txt")
    d=open("./yuliao/fYuliao3_test.txt")
    e=open("./yuliao/fYuliao4_test.txt")
    f=open("./yuliao/fYuliao5_test.txt")
    
    test_content1=a.readlines()
    test_content2=b.readlines()
    test_content3=c.readlines()
    test_content4=d.readlines()
    test_content5=e.readlines()
    test_content6=f.readlines()
    
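    # sanity check: infer a vector for the first test sentence and look at
    # its nearest neighbours among the training documents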
    g=open("./yuliao/fYuliao0_test.txt")
    test_content7 = g.readline()
    # infer_vector expects a list of tokens, not a raw string
    inferred_docvec = model.infer_vector(test_content7.split())
    print model.docvecs.most_similar([inferred_docvec], topn=3)
    
    for i in test_content1:
        test_arrays.append(model.infer_vector(i.split()))
        true_labels.append(0)
    for i in test_content2:
        test_arrays.append(model.infer_vector(i.split()))
        true_labels.append(1)
    for i in test_content3:
        test_arrays.append(model.infer_vector(i.split()))
        true_labels.append(2)
    for i in test_content4:
        test_arrays.append(model.infer_vector(i.split()))
        true_labels.append(3)
    for i in test_content5:
        test_arrays.append(model.infer_vector(i.split()))
        true_labels.append(4)
    for i in test_content6:
        test_arrays.append(model.infer_vector(i.split()))
        true_labels.append(5)
    
    # Build a logistic regression classifier
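    # note: class_weight here only reweights classes 0 and 1 (apparently a
    # leftover from a binary sentiment setup); sklearn gives the remaining
    # classes weight 1 by default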
    classifier = LogisticRegression(class_weight={0:0.38,1:0.62})
    classifier.fit(train_arrays, train_labels)
    # Build a random forest classifier
    '''
    from sklearn.ensemble import RandomForestClassifier
    RF = RandomForestClassifier(n_estimators=1200,max_depth=14,class_weight={0:0.3,1:0.7})
    RF.fit(train_arrays, train_labels)
    '''
    # Build a GBDT classifier
    '''
    from sklearn.ensemble import GradientBoostingClassifier
    GBDT = GradientBoostingClassifier(n_estimators=1000,max_depth=14)
    GBDT.fit(train_arrays, train_labels)
    '''
    # Predict on the test data
    test_labels_LR=[]
    # test_labels_RF=[]
    # test_labels_GBDT=[]
    for i in range(len(test_arrays)):
        # predict expects a 2D array, hence the wrapping list
        test_labels_LR.append(classifier.predict([test_arrays[i]]))
        '''
        test_labels_RF.append(RF.predict([test_arrays[i]]))
        test_labels_GBDT.append(GBDT.predict([test_arrays[i]]))
        '''
    # Print how each model does on the test set (for LR, the raw count of
    # correct predictions)
    print("LR:")
    count = 0
    for i in range(len(test_labels_LR)):
        if test_labels_LR[i][0] == true_labels[i]:
            count += 1
    print count, "correct out of", len(true_labels)
    '''
    print("RF:")
    print(metrics.accuracy_score(test_labels_RF,true_labels))
    print(confusion_matrix(test_labels_RF,true_labels))
    print("GBDT:")
    print(metrics.accuracy_score(test_labels_GBDT,true_labels))
    print(confusion_matrix(test_labels_GBDT,true_labels))
    '''
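
The commented-out block above already sketches accuracy_score and confusion_matrix for the RF and GBDT models; the same report can be printed for the logistic regression predictions. A minimal sketch, reusing test_labels_LR and true_labels from the script above (it assumes each predict call returned a length-1 array, as in the loop above):

    # flatten the per-sample predictions into a plain label list
    pred_labels_LR = [p[0] for p in test_labels_LR]
    print("LR:")
    print(metrics.accuracy_score(true_labels, pred_labels_LR))
    print(confusion_matrix(true_labels, pred_labels_LR))

Note that this script targets the old gensim API; in gensim 4.x, LabeledSentence is replaced by gensim.models.doc2vec.TaggedDocument, the size argument by vector_size, and model.docvecs by model.dv.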