• LibSVM分类之文本预处理模块


    作者:finallyliuyu 转载使用等请注明出处

    功能:根据文档集合大小,特征词数目,交叉验证折数的不同需求,快速生成Libsvm格式数据

    数据Demo请见:新闻文本分类libsvm格式数据

    预处理部分的框架图

    此模块的各个子模块的功能我就不详细介绍了,与此系列博客K-means文本聚类系列(已经完成) 异曲同工。

    下面开始给出各个子模块的代码:

    # -*- coding: cp936 -*-
    ########################################################################3
    #
    #自动建立文件夹
    #
    ########################################################################
            
    def CreateFolders(path):
        """Create `path` and the 'class1'/'class2' subfolders inside it,
        restoring the original working directory afterwards."""
        import os
        previous=os.getcwd()
        os.mkdir(path)
        os.chdir(path)
        for subfolder in ('class1','class2'):
            os.mkdir(subfolder)
        os.chdir(previous)
    def CreateAssist(toCalInfoGain):
        import os
        if toCalInfoGain==0:
            folders=[r'D:\TextCategorization\corpus',r'D:\TextCategorization\testing',r'D:\TextCategorization\training',r'D:\TextCategorization\segmented',r'D:\TextCategorization\tsegmented']
            for myfolder in folders:
                CreateFolders(myfolder)
            os.mkdir(r'D:\TextCategorization\VITdata')
            os.mkdir(r'D:\TextCategorization\data')
            os.mkdir(r'D:\TextCategorization\VITdata\data')
            os.mkdir(r'D:\TextCategorization\data\data')
       
        print 'finish,congratulations'
    if __name__=="__main__":
        import os
        # Create the root working directory, then the sub-tree beneath it.
        os.mkdir(r'D:\TextCategorization')
        # BUG FIX: CreateAssist requires the toCalInfoGain flag; the original
        # call passed no argument and raised TypeError. Pass 0 so the full
        # folder tree is created on a fresh run.
        CreateAssist(0)
        

    文档集分割:多少篇文章归为训练集,多少篇文章归为测试集:注意参数N为整个文档集合(包括测试集和训练集)的每一类的文章数目。这里设置两个类别有相等的文章数目集整个文档集合的文章数目为2N

    # -*- coding: cp936 -*-
    #此模块用于对文本语料库进行预处理
    ###################################################
    #origidir:原语料库目录如E:\新闻语料\EntireTrainingSet\ClassFile\C000024
    #destidir:目标语料库目录D:\corpus\class1
    #N:需要移动的文本的个数
    #####################################################
    
    def CorpusFormation(origidir,destidir,N):
        """Copy 0.txt .. (N-1).txt from `origidir` into `destidir`.

        origidir -- source corpus folder (files are assumed to be named by
                    consecutive integers with a .txt suffix)
        destidir -- destination folder
        N        -- how many files to copy
        """
        import os,shutil
        sources=[origidir+'\\'+str(i)+'.txt' for i in range(0,N)]
        for src in sources:
            shutil.copy(src,destidir)
    
    #####################################################
            #移动语料库
    #####################################################
    def MoveCorpus(N,toCalInfoGain):
        if toCalInfoGain==0:
            originaldirs=[r'E:\新闻语料\EntireTrainingSet\ClassFile\C000024',r'E:\新闻语料\EntireTrainingSet\ClassFile\C000013']
            destinationdirs=[r'D:\TextCategorization\corpus\class1',r'D:\TextCategorization\corpus\class2']
            for i in range(0,2):
                CorpusFormation(originaldirs[i],destinationdirs[i],N)
        print 'finish'
    
    #####################################################################
    #origidir:原语料库目录,如D:\corpus\class1
    #destdir1:目标目录:如D:\training\class1
    #destidir2:目标目录:如D:\testing\class1
    #Vfold:几折交叉验证 count:已经进行了几次实验count=0,1,2,3
    #N语料库的总规模
    #####################################################################
    
    def CorpusPartition(origidir,destdir1,destdir2,count,N,vfold=5):
        """Split one class folder into the current fold and the remainder.

        origidir -- source class folder (files named 0.txt .. (N-1).txt)
        destdir1 -- receives the contiguous fold count*step .. count*step+step-1
        destdir2 -- receives every file still left in origidir afterwards
        count    -- cross-validation round index (0 .. vfold-1)
        N        -- total number of files per class in origidir
        vfold    -- number of cross-validation folds (default 5)

        NOTE(review): `N/vfold` relies on Python 2 integer division; files
        are moved (not copied), so origidir is emptied by this call.
        """
        import os,shutil
        step=N/vfold
        paths=[]
        # Select the contiguous window of file names belonging to fold `count`.
        for i in range(0+count*step,step+count*step):
             paths.append(origidir+'\\'+str(i)+'.txt')
        for mypath in paths:
            shutil.move(mypath,destdir1)
        paths=[]
        # Everything remaining in origidir goes to the second destination.
        pathstemp=os.listdir(origidir)
        for m in pathstemp:
            paths.append(origidir+os.sep+m)
        for mypath in paths:
            shutil.move(mypath,destdir2)
    
        
    def moveAccordingPartition(N,count,toCalInfoGain):
        if toCalInfoGain==0:
            originaldirs=[r'D:\TextCategorization\corpus\class1',r'D:\TextCategorization\corpus\class2']
            destidirs1=[r'D:\TextCategorization\training\class1',r'D:\TextCategorization\training\class2']
            destidirs2=[r'D:\TextCategorization\testing\class1',r'D:\TextCategorization\testing\class2']
            for i in range(0,2):
                CorpusPartition(originaldirs[i],destidirs1[i],destidirs2[i],count,N)
                print '第%s finish'%i
            
    
        
    
    
    if __name__=="__main__":
        #MoveCorpus(500)
        N=500
        count=0
        # BUG FIX: moveAccordingPartition takes three arguments; the original
        # call omitted toCalInfoGain and raised TypeError. Pass 0 so the
        # partition is actually performed.
        moveAccordingPartition(N,count,0)
        
    # -*- coding: cp936 -*-
    #此模块用于建立词袋子模型
    
    def BagOfWordsConstruction(root,toCalInfoGain):
        if toCalInfoGain==0:
            
            import cPickle as mypickle
            file_dest=file(r'D:\TextCategorization\VITdata\vocabularystatistics.dat','w')
            rawVSMMatrix=TrainingFileProcess(root)
            vocabularystatistics={}
            templist=[]
            for rawVSM in rawVSMMatrix:
                templist=templist+rawVSM
            wordscollection=list(set(templist))
        
            for word in wordscollection:
                index=0
                for rawVSM in rawVSMMatrix:
                    count=rawVSM.count(word)
                    if count>0 :
                        if vocabularystatistics.has_key(word)==False:
                            vocabularystatistics[word]=[]
                            vocabularystatistics[word].append((index,count))
                        else:
                            vocabularystatistics[word].append((index,count))
                    index=index+1
            mypickle.dump(vocabularystatistics,file_dest)
            print len(vocabularystatistics)
            file_dest.close()
        print 'BagOfWordsConstructionFinish'
           
    ############################################################################################
    #将文章内容变成词集合    
    def FilePreProcess(rawtext):
        """Split '|'-delimited segmented text into a cleaned token list:
        empty tokens, stop words and full-width punctuation are dropped."""
        import re
        tokens=rawtext.split("|")
        stopwordlist=FilterNoiseWord(r'C:\Python26\SVM\stopwords.txt')
        finalrawVSM=[]
        for token in tokens:
            if token=='' or token in stopwordlist:
                continue
            # Skip tokens containing GBK full-width punctuation bytes.
            if re.search('\xa3[\xa1-\xfe]',token)==None:
                finalrawVSM.append(token)
        return finalrawVSM
    
    #################################################################################################
    #训练集文档预处理
    def TrainingFileProcess(root):
        """Read every document under `root`'s subdirectories into token lists.

        Returns a list with one entry per document — the token list produced
        by FilePreProcess — in the order DataManager enumerates the files.
        (FIX: removed the `cPickle` import, which was never used here.)
        """
        from SVM import DataManager
        import os

        rawVSMMatrix=[]#token lists for the whole document collection
        dm=DataManager.DataManager(root)
        subdir=dm.GetSubDir()
        for sub in subdir:
            dm.SetFilePathsFromsubDir(root+os.sep+sub)
            filepaths=dm.GetFilePaths()
            for path in filepaths:
                myfile=file(root+os.sep+sub+os.sep+path)
                rawtext=myfile.read()
                myfile.close()
                rawVSM=FilePreProcess(rawtext)
                rawVSMMatrix.append(rawVSM)
        return rawVSMMatrix
    
    ####################################################################################
    #生成停用词列表
    def FilterNoiseWord(stopword_file_name):
        """Load the stop-word file and return one list entry per line.

        The list keeps the empty strings produced by blank lines (including
        the trailing '' when the file ends with a newline), matching the
        original re.split behaviour — callers only use it for membership.
        FIX: the file handle is now closed even if read() raises, and the
        needless regex split is replaced by str.split (identical result).
        """
        f=open(stopword_file_name)
        try:
            stopword=f.read()
        finally:
            f.close()
        return stopword.split('\n')
        
    
       
    if __name__=="__main__":
        # BUG FIX: BagOfWordsConstruction takes (root, toCalInfoGain); the
        # original call passed only the root and raised TypeError. Pass 0 to
        # force the statistics to be (re)built.
        BagOfWordsConstruction(r'D:\TextCategorization\segmented',0)
        
    
            
        
    
        
    class IG:
        '''
           此模块用于计算信息增益
        '''
        #######################################################################################
        def __init__(self,n_size):
            
            ''' 类的构造函数,初始化 类的数据成员变量
                keys保存Term的值
                labelOneNums保存对应的term出现在类1中的文章的篇数
                labelTwoNums保存对应的term出现在类2中的文字的篇数
            '''
            import cPickle as mypickle
            mydict=mypickle.load(file(r'D:\TextCategorization\VITData\vocabularystatistics.dat'))
            self.mykeys=[]
            self.labelOneNums=[]
            self.labelTwoNums=[]
            self.probs=[]
            self.conProbs=[]
            #self.informationgain=[]
           
            for key ,value in mydict.iteritems():
                self.mykeys.append(key)
                class1_count=0 #某个term属于类别一的次数
                class2_count=0#某个term 属于类别二的次数
                for val in value:
                    if val[0]<n_size/2:#该文章标号属于类别1
                        class1_count=class1_count+val[1]
                        #class1_count=class1_count+1
                    else:#该文章标号属于类别2
                        class2_count=class2_count+val[1]
                        #class2_count=class2_count+1
                self.labelOneNums.append(class1_count)
                self.labelTwoNums.append(class2_count)
            #测试代码
            #fid=file('1.txt','a')
            #for m in self.labelOneNums:
                #print>>fid,m
                #fid.flush()
            #fid.close()
            #print len([m for m in self.labelOneNums if m>0])
            #print len(self.labelTwoNums)
            #print len(self.mykeys)
        ###################################################################################3
        #def GetConditionProbabilityBaseC(self,n_size,termcount):  
            #conditionPtxC=float(termcount+1)/(n_size/2+len(self.mykeys))
            #return conditionPtxC
        def GetConditionProbabilityBaseC(self,index,termcount):
            '''
                计算P(t|C)
            '''
            if index==1:
                conditionPtxC=float(termcount+1)/(len(self.mykeys)+sum(self.labelOneNums))
            else:
                conditionPtxC=float(termcount+1)/(len(self.mykeys)+sum(self.labelTwoNums))
            return conditionPtxC
                                           
                
        def GetTermProbability(self,n_size):
            
            '''
                计算每个term的先验概率
            '''
            #sumtotal=sum(self.labelOneNums)+sum(self.labelTwoNums)
            for i in range(0,len(self.mykeys)):
                prob=0.5*self.GetConditionProbabilityBaseC(1,self.labelOneNums[i])+0.5*self.GetConditionProbabilityBaseC(2,self.labelTwoNums[i])
                self.probs.append(prob)
            #测试代码
            #fid=file('prob.txt','a')
            #for m in self.probs:
                #print>>fid,m
                #fid.flush()
            #fid.close()
                
            
    
        ###################################################################################
    
        def GetCategoryProbConditionTerm(self,n_size):
                           
            '''
               保存在一个词出现与否的前提下,文章属于某个类的概率
            '''
            
            for i in range(0,len(self.mykeys)):
                           # conprob1:出现term t 其属于类别1的概率 
                           conprob1=self.GetConditionProbabilityBaseC(1,self.labelOneNums[i])*0.5/self.probs[i]
                           # conprob2:出现term t 其属于类别2的概率 
                           conprob2=self.GetConditionProbabilityBaseC(2,self.labelTwoNums[i])*0.5/self.probs[i]
                           #nonconprob1:不出现term t的条件下,属于类别 1的概率
                           nonconprob1=1-conprob1
                           #nonconprob2:不出现term t的条件下,属于类别 2的概率
                           nonconprob2=1-conprob2
                           self.conProbs.append((conprob1,conprob2,nonconprob1,nonconprob2))
            #测试代码
            #fid=file('conprob.txt','a')
            #for m in self.conProbs:
                #print>>fid,m
                #fid.flush()
            #fid.close()
            
    
        ########################################################################################
        def CalInformationGain(self,n_size):
                           
            '''
                计算每个单词的信息增益
            '''
            import math
            import cPickle as mypickle
            
            temp=0#辅助计算变量
            nontemp=0#辅助计算变量
            self.GetTermProbability(n_size)
            self.GetCategoryProbConditionTerm(n_size)
            infoGain={}
           
            for  i in range(0,len(self.mykeys)):
                temp=0#辅助计算变量
                nontemp=0#辅助计算变量
                conprob1=self.conProbs[i][0]
                conprob2=self.conProbs[i][1]
                nonconprob1=self.conProbs[i][2]
                nonconprob2=self.conProbs[i][3]
                if conprob1!=0:
                    temp=temp+conprob1*math.log(conprob1,2)
                if conprob2!=0:
                    temp=temp+conprob2*math.log(conprob2,2)
                if nonconprob1!=0:
                    nontemp=nontemp+nonconprob1*math.log(nonconprob1,2)
                if nonconprob2!=0:
                    nontemp=nontemp+nonconprob2+math.log(nonconprob2,2)
                igval=2+self.probs[i]*temp+(1-self.probs[i])*nontemp
                infoGain[self.mykeys[i]]=igval
                
            #infoGain.sort(key=lambda d:d[1],reverse=True)
            infoGainResult=sorted(infoGain.iteritems(),key=lambda infoGain:infoGain[1],reverse=True)
            print '共计算了%s个词的IG值' %len(infoGainResult)
            #for m in infoGainResult:
                #print '%s,%s'%(m[0],m[1])
                                      
                                      
                                
            fid=file(r'D:\TextCategorization\VITData\infoGain.dat','w')
            mypickle.dump(infoGainResult,fid)
            fid.close()
    if __name__=="__main__":
        # Build the IG model for a 200-document corpus and compute the gains.
        ig_model=IG(200)
        ig_model.CalInformationGain(200)
        
        
    # -*- coding: cp936 -*-
    '''
    此模块根据信息增益选择特征词
    '''
    ###########################################################################
    def featureSelectionIG(N,flag,n_size):
        '''
            更新数据库,并返回特征词集合
            #flag=0表示infoGain没有被计算
        '''
        from SVM import InformationGain
        import cPickle as mypickle
        if flag==0:
            MyIG=InformationGain.IG(n_size)
            MyIG.CalInformationGain(n_size)
        featurewords=[]
        infoGainResult=mypickle.load(file(r'D:\TextCategorization\VITdata\infoGain.dat'))
        print 'infoGainResult的长度%s'%len(infoGainResult)
        #N=1000#确定特征维数。
        infoGainfinal=infoGainResult[0:N]
        print 'infoGainfinal的长度%s' %len(infoGainfinal)
        featurewords=[m[0] for m in infoGainfinal]
        print '共有%s个特征词'%len(featurewords)
        return featurewords
    #####################################################################
    if __name__=="__main__":
        # Select the top 1000 feature words and persist them for later stages.
        chosen=featureSelectionIG(1000,0,200)
        import cPickle as mypickle
        fid=file(r'D:\TextCategorization\VITData\data\keywords.dat','w')
        mypickle.dump(chosen,fid)
        fid.close()
        
     
    '''
        此模块用于形成文档向量模型
    '''
    ################################################################
    def FormatVSM(sub,root,keywordsaddress):
        '''
            Build the vector-space model for every document under root/sub.

            Returns a 2-D list: one row per document, each row a list of
            (feature_index, occurrence_count) pairs; indices are 1-based to
            match the LibSVM format.
        '''
        from SVM import DataManager
        import cPickle as mypickle
        import re
        import os
        # FIX: close the keywords pickle handle (the original leaked it).
        kfid=file(keywordsaddress)
        try:
            keywords=mypickle.load(kfid)
        finally:
            kfid.close()
        dm=DataManager.DataManager(root)#corpus reader helper
        VSMMatrix=[]
        dm.SetFilePathsFromsubDir(root+os.sep+sub)
        filepaths=dm.GetFilePaths()
        for path in filepaths:
            myfile=file(root+os.sep+sub+os.sep+path)
            rawtext=myfile.read()
            myfile.close()
            textwordslist=FilePreProcess(rawtext)
            VSM=[]
            for i in range(0,len(keywords)):
                count=textwordslist.count(keywords[i])
                VSM.append((i+1,count))
            VSMMatrix.append(VSM)
        return VSMMatrix
    ####################################################################
    
    
    def LibSVMFormat(dest,root,keywordsaddress):
        '''
            形成VSM
        '''
        fid=file(dest,'a')
        VSMMatrix=FormatVSM('class1',root,keywordsaddress)
        for VSM in VSMMatrix:
            s='1'
            for elem in VSM:
                if elem[1]!=0:
                    
                    s=s+' \t'+str(elem[0])+':'+str(elem[1])
            s=s+' \t\n'
            fid.write(s)
        VSMMatrix=FormatVSM('class2',root,keywordsaddress)
        for VSM in VSMMatrix:
            s='0'
            for elem in VSM:
                if elem[1]!=0:
                    s=s+' \t'+str(elem[0])+':'+str(elem[1])
            s=s+' \t\n'
            fid.write(s)
            #print 'finish'
        fid.close()
        print 'functionfinish'
    #############################################################33
    def FilePreProcess(rawtext):
        """Split segmented text on '|' into its raw token list."""
        return rawtext.split("|")
    
    ###################################################################
            
    if __name__=="__main__":
        root1=r'D:\TextCategorization\segmented'
        root2=r'D:\TextCategorization\tsegmented'
        print 'begin.....'
        LibSVMFormat(r'D:\TextCategorization\data\train.libsvm',root1,keywordsaddress)
        print '训练语料库转化完毕'
        LibSVMFormat(r'D:\TextCategorization\data\test.libsvm',root2,keywordsaddress)
        print '测试语料库转化完毕'
        
        
            
    文本预处理的主程序模块,该模块调用上面的各个子模块完成“根据文档集合大小,特征词数目,交叉验证折数的不同需求,快速生成Libsvm格式数据”的功能。
    # -*- coding: cp936 -*-
    #coding gb2312
    # Batch driver: for every (experiment round, corpus size, feature dimension)
    # combination, run the whole preprocessing pipeline — folder creation,
    # corpus split, word segmentation, bag-of-words, IG feature selection,
    # LibSVM conversion — and collect the results into a per-run folder.
    from SVM import FoldersCreation
    import os
    ##############################################################################################
    # Parameter setup
    corpus_size=[1500]
    #N: half of total corpus size
    vfold=5 #vfold: number of cross-validation rounds
    featureDimensions=[10,20,30,40,50,60,70,80,90,100,110,120,130,140,150] #featureDimension: dimensionality of the VSM feature space
    toCalInfoGain=0# 0 = rebuild folders/corpus/bag-of-words and recompute IG; 1 = reuse existing statistics
    times=[2]
    # count_done_research_times: index of the experiment round being run
    # N and count_done_research_times feed CorpusPartition.moveAccordingPartition
    # featureDimension, toCalInfoGain and 2*N/vfold feed FeatureSelectionModel.featureSelectionIG
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    for count_done_research_times in times:

        for N in corpus_size:
            print '目前文档集规模为%s'%N
            print '目前在该规模文档集上面已经进行了%s次实验'%count_done_research_times
            for  featureDimension in featureDimensions:
                # After the first (smallest) dimension, the expensive stages
                # are skipped and only feature selection + VSM are redone.
                if featureDimension>10:
                    toCalInfoGain=1
                print '目前处理的特征维数是%s'%featureDimension
        ############## Folder creation ################################################################
                if toCalInfoGain==0:
                    os.mkdir(r'D:\TextCategorization')
                FoldersCreation.CreateAssist(toCalInfoGain)
                print '创建文件夹模块运行结束'
                print '***************************************************************************'
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        ################ Corpus split: copy the raw corpus, then partition it into training/testing ###
                from SVM import CorpusPartition
                CorpusPartition.MoveCorpus(N,toCalInfoGain)
                CorpusPartition.moveAccordingPartition(N,count_done_research_times,toCalInfoGain)
                print '分割文本集模块运行结束'
                print '*******************************************************************'
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
        ######################### Word segmentation via the ICTCLAS DLL ##############################
                from SVM import DataManager
                from ctypes import *
                import os
                import cPickle as p
                import re
                if toCalInfoGain==0:
                    roots=[r'D:\TextCategorization\training',r'D:\TextCategorization\testing']
                    rootfinals=[r'D:\TextCategorization\segmented',r'D:\TextCategorization\tsegmented']
                    for i in range(0,2):
                        dm=DataManager.DataManager(roots[i])
                        subdir=dm.GetSubDir()
                        filepathstotalsrc=[]
                        for sub  in subdir:
                            dm.SetFilePathsFromsubDir(roots[i]+os.sep+sub)
                            filepaths=dm.GetFilePaths()
                            filepathsassist=[sub+os.sep+path for path in filepaths ]
                            filepathstotalsrc=filepathstotalsrc+filepathsassist
                        for path in filepathstotalsrc:
                            myfile=file(roots[i]+os.sep+path)
                            s=myfile.read()
                            myfile.close()
                            # NOTE(review): the DLL is loaded and initialised
                            # once per file — presumably wasteful; confirm
                            # whether ICTCLAS allows hoisting outside the loop.
                            dll=cdll.LoadLibrary("ICTCLAS30.dll")
                            dll.ICTCLAS_Init(c_char_p("."))
                            bSuccess = dll.ICTCLAS_ParagraphProcess(c_char_p(s),0)
                            segmented=c_char_p(bSuccess).value
                            # Collapse whitespace runs to '|' separators and
                            # strip full-width GBK spaces (0xA1A1).
                            segmentedtmp=re.sub("\s+",'|',segmented,0)
                            segmentedfinal=re.sub('\xa1\xa1','',segmentedtmp)
                            fid=file(rootfinals[i]+os.sep+path,'w')
                            fid.write(segmentedfinal)
                            fid.close()
                            dll.ICTCLAS_Exit()
                print '文档集分词模块运行结束'
                print '**********************************************************************'
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        ################## Bag-of-words model construction ###########################################
                from SVM import BagOfWordsConstruction
                BagOfWordsConstruction.BagOfWordsConstruction(r'D:\TextCategorization\segmented',toCalInfoGain)
                print '建立词袋子模型模块运行结束'
                print '***********************************************************************************'
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        ####################### Feature-word selection by information gain ###########################
                from SVM import FeatureSelectionModel
                featurewords=FeatureSelectionModel.featureSelectionIG(featureDimension,toCalInfoGain,2*N/vfold)#feature
                import cPickle as mypickle
                fid=file(r'D:\TextCategorization\VITData\data\keywords.dat','w')
                mypickle.dump(featurewords,fid)
                fid.close()
                print '特征词选择模块运行结束'
                print '*******************************************************************************************'
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

        ####################### Document vector model (LibSVM conversion) ############################
                from SVM import VSMformation
                import shutil
                root1=r'D:\TextCategorization\segmented'
                root2=r'D:\TextCategorization\tsegmented'
                keywordsaddress=r'D:\TextCategorization\VITData\data\keywords.dat'
                print 'begin.....'
                VSMformation.LibSVMFormat(r'D:\TextCategorization\data\data\train.libsvm',root1,keywordsaddress)
                print '训练语料库转化完毕'
                VSMformation.LibSVMFormat(r'D:\TextCategorization\data\data\test.libsvm',root2,keywordsaddress)
                print '测试语料库转化完毕'
                print '文档向量模型建立模块运行结束'
                print '批处理完毕,congratulations!'
                # Archive this run's outputs into a run-specific folder named
                # TextCategorization_<round>_<N>_<dimension>.
                os.chdir(r'C:\\Python26')
                os.chdir('D:\\TextCategorization')
                new_dir='TextCategorization_'+str(count_done_research_times)+'_'+str(N)+'_'+str(featureDimension)
                os.mkdir(new_dir)
                os.chdir(new_dir)
                os.mkdir('data')
                os.chdir(r'C:\\Python26')
                print os.getcwd()
                shutil.move(r'D:\TextCategorization\VITdata\data\keywords.dat','D:\\TextCategorization\\'+new_dir+'\\data')
                shutil.move(r'D:\TextCategorization\data\data\train.libsvm','D:\\TextCategorization\\'+new_dir+'\\data')
                shutil.move(r'D:\TextCategorization\data\data\test.libsvm','D:\\TextCategorization\\'+new_dir+'\\data')
                print'恭喜,文件夹重命名完毕'
                print '###########################finish##################################'
            # After sweeping all dimensions for this corpus size, rename the
            # working tree to mark the (round, N) combination as finished and
            # reset the flag so the next combination rebuilds everything.
            os.chdir('D:\\')
            print os.getcwd()
            if os.path.isdir('TextCategorization'):
                os.rename('TextCategorization',str(count_done_research_times)+'_'+str(N)+'_rfinish')
            os.chdir(r'C:\Python26')
            toCalInfoGain=0
            print str(count_done_research_times)+'_'+str(N)+'finish'
    
            
           
  • 相关阅读:
    C# 大小写转换(非金额)
    DataService系列教程 (一)
    C# 大小写转换(金额)
    sql注入杂谈(一)union select
    sql注入杂谈(二)报错注入
    python正则提取txt文本
    sql注入杂谈(三)盲注
    对指定网站渗透的一些总结
    MSF的利用
    SQLMAP怎么拿shell
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1817836.html
Copyright © 2020-2023  润新知