• Python text mining template


    import xlrd
    import jieba
    import sys  
    import importlib
    import os         # built-in module for file and directory operations (e.g. os.listdir)
    import pickle    # standard pickle module, used to persist objects (the original comment referred to Python 2's cPickle)
    import random
    import numpy as np
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    from pylab import mpl  
    from sklearn.naive_bayes import MultinomialNB # multinomial Naive Bayes classifier
    from sklearn import svm
    
    from sklearn import metrics 
    from sklearn.utils import Bunch   # on older scikit-learn versions this lived in sklearn.datasets.base
    from sklearn.feature_extraction.text import TfidfVectorizer
    importlib.reload(sys)   # legacy Python 2 encoding workaround; unnecessary (but harmless) in Python 3
    
    
    # containers for turning the texts and labels into vector form
    trainContentdatasave=[] # stores the segmented words of all training and test texts
    testContentdatasave=[]
    
    trainContentdata = []
    testContentdata = []
    trainlabeldata = []
    testlabeldata = []
    
    # load the training and test text descriptions
    def importTrainContentdata():
        file = '20180716_train.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            col = []
            for c in range(1):
                col.append(ws.cell(r, c).value)
            trainContentdata.append(col)
    
    def importTestContentdata():
        file = '20180716_test.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            col = []
            for c in range(1):
                col.append(ws.cell(r, c).value)
            testContentdata.append(col)   
    
    # load the training and test labels
    def importTrainlabeldata():
        file = '20180716_train_label.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            col = []
            for c in range(1):
                col.append(ws.cell(r, c).value)
            trainlabeldata.append(col)
            
    def importTestlabeldata():
        file = '20180716_test_label.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            col = []
            for c in range(1):
                col.append(ws.cell(r, c).value)
            testlabeldata.append(col)
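
    # Note: the four import functions above differ only in the file name and the target list.
    # A hedged sketch (hypothetical helper name read_first_column, not part of the original
    # script) that reads column 0 of Sheet1 into a list of single-element rows:
    def read_first_column(path):
        wb = xlrd.open_workbook(path)
        ws = wb.sheet_by_name("Sheet1")
        return [[ws.cell(r, 0).value] for r in range(ws.nrows)]
    # usage (assumed file names from above): trainContentdata = read_first_column('20180716_train.xls')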
    """
    def importClassSet():
        file = 'ClassSet.xls'
        wb = xlrd.open_workbook(file)
        ws = wb.sheet_by_name("Sheet1")
        for r in range(ws.nrows):
            col = []
            for c in range(ws.ncols):
                col.append(ws.cell(r, c).value)
            ClassSet.append(col)
    """
    def buildtrainbunch(bunch_path):
        bunch = Bunch(label=[],contents=[]) 
        for item1 in trainlabeldata:
            bunch.label.append(item1)
            
        for item2 in trainContentdata:
            item2=str(item2)
            item2 = item2.replace("
    ", "")
            item2 = item2.replace(" ", "")
            content_seg=jieba.cut(item2)
            save2=''
            for item3 in content_seg:
                if len(item3) > 1 and item3 != '\n':
                    trainContentdatasave.append(item3)
                    save2=save2+","+item3
            bunch.contents.append(save2)
        with open(bunch_path, "wb") as file_obj:  
            pickle.dump(bunch, file_obj)  
        print("构建训练数据文本对象结束!!!")
    
    def buildtestbunch(bunch_path):
        bunch = Bunch(label=[],contents=[]) 
        for item1 in testlabeldata:
            bunch.label.append(item1)
            
        for item2 in testContentdata:
            item2=str(item2)
            item2 = item2.replace("
    ", "")
            item2 = item2.replace(" ", "")
            content_seg=jieba.cut(item2)
            save2=''
            for item3 in content_seg:
                if len(item3) > 1 and item3 != '\n':
                    testContentdatasave.append(item3)
                    save2=save2+","+item3
            bunch.contents.append(save2)
        with open(bunch_path, "wb") as file_obj:  
            pickle.dump(bunch, file_obj)  
        print("构建测试数据文本对象结束!!!")
        
    
    # read the stop-word file
    def _readfile(path):  
        with open(path, "rb") as fp:  
            content = fp.read()  
        return content  
    
    # read a Bunch object
    def _readbunchobj(path):  
        with open(path, "rb") as file_obj:  
            bunch = pickle.load(file_obj)  
        return bunch  
     
    # write a Bunch object
    def _writebunchobj(path, bunchobj):  
        with open(path, "wb") as file_obj:  
            pickle.dump(bunchobj, file_obj) 
        
    def vector_space(stopword_path,bunch_path,space_path):
        
        stpwrdlst = _readfile(stopword_path).splitlines()  # read the stop words
        bunch = _readbunchobj(bunch_path)  # load the segmented Bunch object
        # build the TF-IDF vector-space object
        tfidfspace = Bunch(label=bunch.label,tdm=[], vocabulary={})  
        '''
        The weight matrix tdm is two-dimensional: tdm[i][j] is the TF-IDF weight of
        term j (its index in the vocabulary) in document i.
        '''
        # initialize the vector-space model with TfidfVectorizer
        # (note: use_idf=False here, so the weights are effectively sublinear TF only)
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001,use_idf=False,max_features=10000)
        #print(vectorizer)
        # convert the texts to a term-weight matrix; the vocabulary is kept separately
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)  
        tfidfspace.vocabulary = vectorizer.vocabulary_ 
        # persist the bag-of-words space
        _writebunchobj(space_path, tfidfspace)  
        print("if-idf词向量空间实例创建成功!!!")
    
    def testvector_space(stopword_path,bunch_path,space_path,train_tfidf_path):
        
        stpwrdlst = _readfile(stopword_path).splitlines()  # turn the stop words into a list
        bunch = _readbunchobj(bunch_path)  
        tfidfspace = Bunch(label=bunch.label,tdm=[], vocabulary={}) 
        '''
        tdm holds the computed TF-IDF weight matrix.
        vocabulary is the index of the vector space. For example, if the vector space is
        (我, 喜欢, 相国大人), then vocabulary is the index dict
        vocabulary={"我":0,"喜欢":1,"相国大人":2}. Think of vocabulary as the coordinate
        axes of the vector space: the index says which dimension a term occupies.
        '''
        # load the TF-IDF vector space of the training set  ★★
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        '''
        Only a few parameters need attention here:
        stop_words:
        the stop words passed in; when vocabulary_ is later built from the texts, these words are dropped.
        vocabulary:
        explained above.
        sublinear_tf:
        use a sublinear TF: instead of the raw term frequency, 1 + log(tf) is used.
        smooth_idf:
        smooths the IDF computation so the document-frequency denominator can never be zero,
        effectively adding one extra document that contains every term. It is on by default, so nothing to do.
        norm:
        normalization. TF-IDF is TF*IDF, and TF may or may not be normalized; normalizing is the
        usual choice and is on by default.
        max_df:
        some terms have too high a document frequency (a term that appears in every document is useless
        for separating classes), so a threshold can be set. A float such as 0.5 (range [0.0, 1.0]) means a
        term appearing in more than 50% of the documents is treated as a temporary stop word; an int such
        as max_df=10 means a term appearing in more than 10 documents is treated that way.
        min_df:
        the opposite of max_df. A very low document frequency may look discriminative, but if only 1 of
        10,000 documents contains a term, adding a whole dimension for that single document is not worth it.
        Note that max_df and min_df are ignored when the vocabulary parameter is given.
        '''
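        # Small illustration (hypothetical numbers, not from this dataset): with 1,000 documents,
        # max_df=0.7 would drop any term appearing in more than 700 documents, and min_df=0.001
        # (at least 1 document here) keeps almost everything; an integer such as min_df=5 would
        # require a term to appear in at least 5 documents. Since vocabulary is given below,
        # both thresholds are ignored in this call.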
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7, vocabulary=trainbunch.vocabulary, min_df=0.001)  
        
        #print(vectorizer)
        
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        _writebunchobj(space_path, tfidfspace)  
        print("if-idf词向量空间实例创建成功!!!")
    
    def metrics_result(actual, predict):  #  metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        print('Precision: {0:.3f}'.format(metrics.precision_score(actual, predict,average='weighted', labels=np.unique(predict)))) 
        print('Recall: {0:0.3f}'.format(metrics.recall_score(actual, predict,average='weighted', labels=np.unique(predict))))
        print('f1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted', labels=np.unique(predict))))
        # precision and recall trade off against each other: ideally both are high, but in
        # practice higher precision usually means lower recall and vice versa
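    # Worked toy example (illustrative numbers only, not from this dataset): with
    # actual=[1,1,2,2] and predict=[1,2,2,2], class 1 has precision 1/1 and recall 1/2,
    # class 2 has precision 2/3 and recall 2/2; 'weighted' averages these per-class
    # scores by class support (2 and 2), giving precision 0.833 and recall 0.750.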
      
    if __name__=="__main__":  
        
        importTrainContentdata()
        importTestContentdata()
        importTrainlabeldata()
        importTestlabeldata()
        
        # paths for the segmented Bunch objects and the TF-IDF spaces
        train_bunch_path ="F:/goverment/ArticleMining/trainbunch.bat"  # path where the training Bunch is saved
        test_bunch_path ="F:/goverment/ArticleMining/testbunch.bat"
        stopword_path ="F:/goverment/ArticleMining/hlt_stop_words.txt"
        train_space_path = "F:/goverment/ArticleMining/traintfdifspace.dat"
        test_space_path = "F:/goverment/ArticleMining/testtfdifspace.dat"
        
        # build the Bunch objects for the training and test sets
        buildtrainbunch(train_bunch_path)
        buildtestbunch(test_bunch_path)
        
        vector_space(stopword_path,train_bunch_path,train_space_path)  
        testvector_space(stopword_path,test_bunch_path,test_space_path,train_space_path)
        
        # load the training and test vector spaces
        train_set=_readbunchobj(train_space_path)
        test_set=_readbunchobj(test_space_path)
        
        print(train_set.tdm)
        '''
        mm=0
        ii=0
        jj=0
        for i in range(3142):
            for j in range(3142):
                if train_set.tdm[i][j] >mm:
                    mm=train_set.tdm[i][j]
                    ii=i
                    jj=j
        print(ii)
        print(jj)
        '''        
    
        #test_set.tdm
        #train_set.label
        # train the classifier on the bag-of-words vectors and class labels
        
        # low recall; F1: 0.75 rbf: 0.59    0.8 rbf 0.578
        # C=0.75, poly, gamma=10: precision 0.665, recall 0.330, f1-score 0.416
        # C=0.7, kernel='poly', gamma=10: recall 0.331, f1-score 0.417
        # alpha=0.001: alpha is MultinomialNB's additive (Laplace) smoothing parameter
        '''
        clf = MultinomialNB(alpha=0.052).fit(train_set.tdm, train_set.label)  
        #clf = svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
        clf.fit(train_set.tdm, train_set.label)  
        predicted=clf.predict(test_set.tdm)
    
        # the block below assumes raw texts X_train/X_test and
        # `from sklearn.linear_model import LogisticRegression`, neither of which
        # is defined in this script:
        tv = TfidfVectorizer()
        train_data = tv.fit_transform(X_train)
        test_data = tv.transform(X_test)
        
        lr = LogisticRegression(C=3)
        lr.fit(train_set.tdm, train_set.label)
        predicted=lr.predict(test_set.tdm)
        print(lr.score(test_set.tdm, test_set.label))
        #print(test_set.tdm)
        '''
        
        clf = svm.SVC(C=1500)   # use the svm module imported above; SVC alone is not imported
        clf.fit(train_set.tdm, train_set.label)
        predicted=clf.predict(test_set.tdm)
        print(clf.score(test_set.tdm, test_set.label))
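
        # A possible alternative (a sketch, not part of the original script): LinearSVC is
        # usually much faster than a kernel SVC on high-dimensional sparse TF-IDF features.
        # from sklearn.svm import LinearSVC
        # clf = LinearSVC(C=1.0)
        # clf.fit(train_set.tdm, train_set.label)
        # predicted = clf.predict(test_set.tdm)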
        
        '''
        from sklearn.neighbors import KNeighborsClassifier  
        knnclf = KNeighborsClassifier(n_neighbors=9)#default with k=5  
        knnclf.fit(train_set.tdm,train_set.label)  
        predicted = knnclf.predict(test_set.tdm)
        '''
        # convert predictions and true labels to plain int lists for the metrics
        a=[]
        b=[]
        for i in range(len(predicted)):
            b.append(int(float(predicted[i])))
            a.append(int(test_set.label[i][0]))
        
        # write the predictions to a text file, one label per line
        f=open('F:/goverment/ArticleMining/predict.txt', 'w')
        for i in range(len(predicted)):
            f.write(str(b[i]))
            f.write('\n')
        f.write("Done writing")
        f.close()
        #for i in range(len(predicted)):
            #print(b[i])
        
        metrics_result(a, b)
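
        # Optionally (a sketch, not in the original script), a confusion matrix gives a
        # per-class view of the errors:
        # print(metrics.confusion_matrix(a, b))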
  • Original post: https://www.cnblogs.com/caiyishuai/p/13270970.html