Author: finallyliuyu. Please credit the source when reposting or reusing this material.
Purpose: quickly generate LibSVM-format data for any desired document-set size, number of feature words, and number of cross-validation folds.
For a data demo, see: News text classification data in LibSVM format.
I will not describe each submodule in detail here; they work in much the same spirit as the corresponding parts of my (already completed) K-means text clustering series on this blog.
The code for each submodule follows:
# -*- coding: cp936 -*-
########################################################################
# Automatically create the folder hierarchy
########################################################################
def CreateFolders(path):
    import os
    curpath=os.getcwd()
    os.mkdir(path)
    os.chdir(path)
    os.mkdir('class1')
    os.mkdir('class2')
    os.chdir(curpath)

def CreateAssist(toCalInfoGain):
    import os
    if toCalInfoGain==0:
        folders=[r'D:\TextCategorization\corpus',r'D:\TextCategorization\testing',r'D:\TextCategorization\training',r'D:\TextCategorization\segmented',r'D:\TextCategorization\tsegmented']
        for myfolder in folders:
            CreateFolders(myfolder)
        os.mkdir(r'D:\TextCategorization\VITdata')
        os.mkdir(r'D:\TextCategorization\data')
        os.mkdir(r'D:\TextCategorization\VITdata\data')
        os.mkdir(r'D:\TextCategorization\data\data')
        print 'finish,congratulations'

if __name__=="__main__":
    import os
    os.mkdir(r'D:\TextCategorization')
    CreateAssist(0)  # bug fix: CreateAssist requires the toCalInfoGain argument
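For orientation, this is the directory layout that CreateAssist(0) builds under D:\TextCategorization; the role of each folder is taken from how the later modules use it:

D:\TextCategorization\
    corpus\class1, corpus\class2        raw per-class corpus copied from the source collection
    training\class1, training\class2    training split
    testing\class1, testing\class2      test split
    segmented\, tsegmented\             word-segmented training / test documents (each with class1 and class2)
    VITdata\                            vocabularystatistics.dat and infoGain.dat
    VITdata\data\                       keywords.dat (the selected feature words)
    data\data\                          the generated train.libsvm and test.libsvm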
Document-set partitioning: how many articles go into the training set and how many into the test set. Note that the parameter N is the number of articles per class over the whole document set (training plus test combined). The two classes are given equal numbers of articles here, so the whole document set contains 2N articles.
# -*- coding: cp936 -*-
# This module preprocesses the text corpus
###################################################
# origidir: source corpus directory, e.g. E:\新闻语料\EntireTrainingSet\ClassFile\C000024
# destidir: destination corpus directory, e.g. D:\corpus\class1
# N: number of files to move
#####################################################
def CorpusFormation(origidir,destidir,N):
    paths=[]
    import os,shutil
    for i in range(0,N):  # build the paths of the files to be moved
        paths.append(origidir+'\\'+str(i)+'.txt')
    for mypath in paths:
        shutil.copy(mypath,destidir)

#####################################################
# Move the corpus into the working folders
#####################################################
def MoveCorpus(N,toCalInfoGain):
    if toCalInfoGain==0:
        originaldirs=[r'E:\新闻语料\EntireTrainingSet\ClassFile\C000024',r'E:\新闻语料\EntireTrainingSet\ClassFile\C000013']
        destinationdirs=[r'D:\TextCategorization\corpus\class1',r'D:\TextCategorization\corpus\class2']
        for i in range(0,2):
            CorpusFormation(originaldirs[i],destinationdirs[i],N)
        print 'finish'

#####################################################################
# origidir: source corpus directory, e.g. D:\corpus\class1
# destdir1: destination directory, e.g. D:\training\class1
# destdir2: destination directory, e.g. D:\testing\class1
# vfold: number of cross-validation folds
# count: how many runs have already been done (count=0,1,2,3)
# N: total per-class size of the corpus
#####################################################################
def CorpusPartition(origidir,destdir1,destdir2,count,N,vfold=5):
    import os,shutil
    step=N/vfold  # Python 2 integer division
    paths=[]
    # the count-th slice of step files becomes the training set
    for i in range(0+count*step,step+count*step):
        paths.append(origidir+'\\'+str(i)+'.txt')
    for mypath in paths:
        shutil.move(mypath,destdir1)
    # everything left in origidir becomes the test set
    paths=[]
    pathstemp=os.listdir(origidir)
    for m in pathstemp:
        paths.append(origidir+os.sep+m)
    for mypath in paths:
        shutil.move(mypath,destdir2)

def moveAccordingPartition(N,count,toCalInfoGain):
    if toCalInfoGain==0:
        originaldirs=[r'D:\TextCategorization\corpus\class1',r'D:\TextCategorization\corpus\class2']
        destidirs1=[r'D:\TextCategorization\training\class1',r'D:\TextCategorization\training\class2']
        destidirs2=[r'D:\TextCategorization\testing\class1',r'D:\TextCategorization\testing\class2']
        for i in range(0,2):
            CorpusPartition(originaldirs[i],destidirs1[i],destidirs2[i],count,N)
            print 'class %s finished'%i

if __name__=="__main__":
    N=500
    count=0
    moveAccordingPartition(N,count,0)  # bug fix: the toCalInfoGain argument was missing
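To make the fold arithmetic concrete, here is a minimal standalone sketch (using the same N=500, vfold=5 values as the __main__ above) of which file indices each run sends to the training folder; everything outside that slice ends up in the test folder:

N, vfold = 500, 5
step = N / vfold  # 100 files per fold (Python 2 integer division)
for count in range(vfold):
    first, last = count * step, count * step + step - 1
    print 'run %s: files %s..%s -> training, the remaining %s files -> testing' \
          % (count, first, last, N - step)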
# -*- coding: cp936 -*-
# This module builds the bag-of-words model
def BagOfWordsConstruction(root,toCalInfoGain):
    if toCalInfoGain==0:
        import cPickle as mypickle
        file_dest=file(r'D:\TextCategorization\VITdata\vocabularystatistics.dat','w')
        rawVSMMatrix=TrainingFileProcess(root)
        vocabularystatistics={}
        templist=[]
        for rawVSM in rawVSMMatrix:
            templist=templist+rawVSM
        wordscollection=list(set(templist))
        for word in wordscollection:
            index=0
            for rawVSM in rawVSMMatrix:
                count=rawVSM.count(word)
                if count>0:
                    if vocabularystatistics.has_key(word)==False:
                        vocabularystatistics[word]=[]
                    vocabularystatistics[word].append((index,count))
                index=index+1
        mypickle.dump(vocabularystatistics,file_dest)
        print len(vocabularystatistics)
        file_dest.close()
        print 'BagOfWordsConstructionFinish'

############################################################################################
# Turn the raw text of an article into a list of words
def FilePreProcess(rawtext):
    import re
    listresult=rawtext.split("|")
    finalrawVSM=[]
    stopwordlist=FilterNoiseWord(r'C:\Python26\SVM\stopwords.txt')
    for m in listresult:
        # drop empty tokens, stop words and full-width punctuation (GBK bytes \xa3\xa1-\xa3\xfe)
        if m!='' and m not in stopwordlist and re.search('\xa3[\xa1-\xfe]',m)==None:
            finalrawVSM.append(m)
    return finalrawVSM

#################################################################################################
# Preprocess the training-set documents
def TrainingFileProcess(root):
    from SVM import DataManager
    import os
    rawVSMMatrix=[]  # holds the whole document set
    dm=DataManager.DataManager(root)
    subdir=dm.GetSubDir()
    for sub in subdir:
        dm.SetFilePathsFromsubDir(root+os.sep+sub)
        filepaths=dm.GetFilePaths()
        for path in filepaths:
            myfile=file(root+os.sep+sub+os.sep+path)
            rawtext=myfile.read()
            myfile.close()
            rawVSM=FilePreProcess(rawtext)
            rawVSMMatrix.append(rawVSM)
    return rawVSMMatrix

####################################################################################
# Build the stop-word list
def FilterNoiseWord(stopword_file_name):
    import re
    f=file(stopword_file_name)
    stopword=f.read()
    f.close()
    stopwordlist=re.split('\n',stopword)
    return stopwordlist

if __name__=="__main__":
    BagOfWordsConstruction(r'D:\TextCategorization\segmented',0)  # bug fix: pass toCalInfoGain
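The pickled vocabularystatistics dictionary is an inverted index: each term maps to a list of (document_index, term_frequency) pairs over the training set. A quick way to sanity-check it, as a sketch assuming the file has already been generated:

import cPickle as mypickle
fid = file(r'D:\TextCategorization\VITdata\vocabularystatistics.dat')
vocabularystatistics = mypickle.load(fid)
fid.close()
word = vocabularystatistics.keys()[0]
# a value such as [(0, 3), (7, 1)] means the term occurs 3 times in
# document 0 and once in document 7 (illustrative numbers)
print word, vocabularystatistics[word]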
# -*- coding: cp936 -*-
class IG:
    '''
    This module computes information gain
    '''
    #######################################################################################
    def __init__(self,n_size):
        '''
        Constructor: initialises the data members.
        mykeys stores the terms;
        labelOneNums stores each term's total frequency over the class-1 documents;
        labelTwoNums stores each term's total frequency over the class-2 documents.
        '''
        import cPickle as mypickle
        mydict=mypickle.load(file(r'D:\TextCategorization\VITdata\vocabularystatistics.dat'))
        self.mykeys=[]
        self.labelOneNums=[]
        self.labelTwoNums=[]
        self.probs=[]
        self.conProbs=[]
        for key,value in mydict.iteritems():
            self.mykeys.append(key)
            class1_count=0  # frequency of the term in class 1
            class2_count=0  # frequency of the term in class 2
            for val in value:
                if val[0]<n_size/2:  # document indices below n_size/2 belong to class 1
                    class1_count=class1_count+val[1]
                else:                # the rest belong to class 2
                    class2_count=class2_count+val[1]
            self.labelOneNums.append(class1_count)
            self.labelTwoNums.append(class2_count)
    #######################################################################################
    def GetConditionProbabilityBaseC(self,index,termcount):
        '''
        Compute the Laplace-smoothed conditional probability P(t|C)
        '''
        if index==1:
            conditionPtxC=float(termcount+1)/(len(self.mykeys)+sum(self.labelOneNums))
        else:
            conditionPtxC=float(termcount+1)/(len(self.mykeys)+sum(self.labelTwoNums))
        return conditionPtxC

    def GetTermProbability(self,n_size):
        '''
        Compute the prior probability of each term
        '''
        for i in range(0,len(self.mykeys)):
            prob=0.5*self.GetConditionProbabilityBaseC(1,self.labelOneNums[i])+0.5*self.GetConditionProbabilityBaseC(2,self.labelTwoNums[i])
            self.probs.append(prob)
    ###################################################################################
    def GetCategoryProbConditionTerm(self,n_size):
        '''
        Store, for each term, the probability that an article belongs to each class
        given that the term does or does not appear
        '''
        for i in range(0,len(self.mykeys)):
            # conprob1: probability of class 1 given that term t appears
            conprob1=self.GetConditionProbabilityBaseC(1,self.labelOneNums[i])*0.5/self.probs[i]
            # conprob2: probability of class 2 given that term t appears
            conprob2=self.GetConditionProbabilityBaseC(2,self.labelTwoNums[i])*0.5/self.probs[i]
            # nonconprob1 / nonconprob2: the corresponding probabilities when t is absent
            nonconprob1=1-conprob1
            nonconprob2=1-conprob2
            self.conProbs.append((conprob1,conprob2,nonconprob1,nonconprob2))
    ########################################################################################
    def CalInformationGain(self,n_size):
        '''
        Compute the information gain of every term and pickle the sorted result
        '''
        import math
        import cPickle as mypickle
        self.GetTermProbability(n_size)
        self.GetCategoryProbConditionTerm(n_size)
        infoGain={}
        for i in range(0,len(self.mykeys)):
            temp=0     # accumulates sum_c P(c|t)*log2 P(c|t)
            nontemp=0  # accumulates sum_c P(c|~t)*log2 P(c|~t)
            conprob1=self.conProbs[i][0]
            conprob2=self.conProbs[i][1]
            nonconprob1=self.conProbs[i][2]
            nonconprob2=self.conProbs[i][3]
            if conprob1!=0:
                temp=temp+conprob1*math.log(conprob1,2)
            if conprob2!=0:
                temp=temp+conprob2*math.log(conprob2,2)
            if nonconprob1!=0:
                nontemp=nontemp+nonconprob1*math.log(nonconprob1,2)
            if nonconprob2!=0:
                nontemp=nontemp+nonconprob2*math.log(nonconprob2,2)  # bug fix: the original had '+' instead of '*'
            igval=2+self.probs[i]*temp+(1-self.probs[i])*nontemp
            infoGain[self.mykeys[i]]=igval
        infoGainResult=sorted(infoGain.iteritems(),key=lambda item:item[1],reverse=True)
        print 'computed IG values for %s terms'%len(infoGainResult)
        fid=file(r'D:\TextCategorization\VITdata\infoGain.dat','w')
        mypickle.dump(infoGainResult,fid)
        fid.close()

if __name__=="__main__":
    MyIG=IG(200)
    MyIG.CalInformationGain(200)
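For reference, the quantity this class computes is the standard information gain of a term t over the two classes, with Laplace-smoothed probabilities:

IG(t) = H(C) - P(t)*H(C|t) - P(~t)*H(C|~t)
      = H(C) + P(t)*sum_c P(c|t)*log2 P(c|t) + (1-P(t))*sum_c P(c|~t)*log2 P(c|~t)

With two equiprobable classes H(C) = 1 bit, so the constant 2 in igval shifts every term's score by the same amount. Since the scores are only used to rank terms, this offset does not affect which feature words get selected.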
# -*- coding: cp936 -*-
'''
This module selects feature words by information gain
'''
###########################################################################
def featureSelectionIG(N,flag,n_size):
    '''
    Recompute the information gain if needed and return the feature-word set.
    flag=0 means the information gain has not been computed yet.
    '''
    from SVM import InformationGain
    import cPickle as mypickle
    if flag==0:
        MyIG=InformationGain.IG(n_size)
        MyIG.CalInformationGain(n_size)
    featurewords=[]
    infoGainResult=mypickle.load(file(r'D:\TextCategorization\VITdata\infoGain.dat'))
    print 'length of infoGainResult: %s'%len(infoGainResult)
    infoGainfinal=infoGainResult[0:N]  # N determines the feature dimension
    print 'length of infoGainfinal: %s'%len(infoGainfinal)
    featurewords=[m[0] for m in infoGainfinal]
    print '%s feature words in total'%len(featurewords)
    return featurewords

#####################################################################
if __name__=="__main__":
    featurewords=featureSelectionIG(1000,0,200)
    import cPickle as mypickle
    fid=file(r'D:\TextCategorization\VITdata\data\keywords.dat','w')
    mypickle.dump(featurewords,fid)
    fid.close()
'''
This module builds the document vector model
'''
################################################################
def FormatVSM(sub,root,keywordsaddress):
    '''
    Build a vector-space model for the document set, stored in a 2-D list
    '''
    from SVM import DataManager
    import cPickle as mypickle
    import os
    keywords=mypickle.load(file(keywordsaddress))
    dm=DataManager.DataManager(root)  # handles reading the data
    VSMMatrix=[]
    dm.SetFilePathsFromsubDir(root+os.sep+sub)
    filepaths=dm.GetFilePaths()
    for path in filepaths:
        myfile=file(root+os.sep+sub+os.sep+path)
        rawtext=myfile.read()
        myfile.close()
        textwordslist=FilePreProcess(rawtext)
        VSM=[]
        for i in range(0,len(keywords)):
            count=textwordslist.count(keywords[i])
            VSM.append((i+1,count))  # LibSVM feature indices start at 1
        VSMMatrix.append(VSM)
    return VSMMatrix

####################################################################
def LibSVMFormat(dest,root,keywordsaddress):
    '''
    Write the VSM to dest in LibSVM format
    '''
    fid=file(dest,'a')
    # class1 is written with label 1, class2 with label 0
    for label,sub in [('1','class1'),('0','class2')]:
        VSMMatrix=FormatVSM(sub,root,keywordsaddress)
        for VSM in VSMMatrix:
            s=label
            for elem in VSM:
                if elem[1]!=0:  # only nonzero features are written
                    s=s+' \t'+str(elem[0])+':'+str(elem[1])
            s=s+' \t\n'
            fid.write(s)
    fid.close()
    print 'functionfinish'

###############################################################
def FilePreProcess(rawtext):
    listresult=rawtext.split("|")
    return listresult

###################################################################
if __name__=="__main__":
    root1=r'D:\TextCategorization\segmented'
    root2=r'D:\TextCategorization\tsegmented'
    # bug fix: keywordsaddress was undefined in the original __main__
    keywordsaddress=r'D:\TextCategorization\VITdata\data\keywords.dat'
    print 'begin.....'
    LibSVMFormat(r'D:\TextCategorization\data\train.libsvm',root1,keywordsaddress)
    print 'training corpus converted'
    LibSVMFormat(r'D:\TextCategorization\data\test.libsvm',root2,keywordsaddress)
    print 'test corpus converted'
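LibSVMFormat writes one line per document: the class label (1 for class1, 0 for class2) followed by tab-separated index:count pairs for the nonzero feature dimensions, with indices starting at 1. Two illustrative output lines (hypothetical counts):

1 	3:2 	17:1 	42:5 
0 	1:1 	17:3 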
The main driver module for text preprocessing. It calls the submodules above to do the actual work of "quickly generating LibSVM-format data for any desired document-set size, feature-word count, and number of cross-validation folds".

# -*- coding: cp936 -*-
from SVM import FoldersCreation
import os
##############################################################################################
# Parameter setup
corpus_size=[1500]   # N: per-class size, i.e. half of the total corpus size
vfold=5              # vfold: number of cross-validation folds
featureDimensions=[10,20,30,40,50,60,70,80,90,100,110,120,130,140,150]  # featureDimension: dimensionality of the VSM
toCalInfoGain=0      # whether the information gain of the bag-of-words vocabulary must be (re)computed; 1 means skip
times=[2]            # count_done_research_times: how many runs have already been done
# N and count_done_research_times are the parameters of CorpusPartition.moveAccordingPartition;
# featureDimension, toCalInfoGain and 2*N/vfold are the parameters of FeatureSelectionModel.featureSelectionIG
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
for count_done_research_times in times:
    for N in corpus_size:
        print 'current document-set size: %s'%N
        print '%s experiments already run at this size'%count_done_research_times
        for featureDimension in featureDimensions:
            if featureDimension>10:
                # the expensive steps (folder creation, corpus moving, segmentation,
                # bag-of-words, IG) only run once per fold; larger feature dimensions
                # reuse the cached infoGain.dat
                toCalInfoGain=1
            print 'current feature dimension: %s'%featureDimension
            ############## Create the folders ########################################################
            if toCalInfoGain==0:
                os.mkdir(r'D:\TextCategorization')
                FoldersCreation.CreateAssist(toCalInfoGain)
            print 'folder-creation module done'
            print '*'*75
            ############## Partition the document set into training and test sets ###################
            from SVM import CorpusPartition
            CorpusPartition.MoveCorpus(N,toCalInfoGain)
            CorpusPartition.moveAccordingPartition(N,count_done_research_times,toCalInfoGain)
            print 'corpus-partition module done'
            print '*'*75
            ############## Segment the document set ##################################################
            from SVM import DataManager
            from ctypes import *
            import re
            if toCalInfoGain==0:
                roots=[r'D:\TextCategorization\training',r'D:\TextCategorization\testing']
                rootfinals=[r'D:\TextCategorization\segmented',r'D:\TextCategorization\tsegmented']
                # fix: load and initialise the ICTCLAS segmenter once, not once per file
                dll=cdll.LoadLibrary("ICTCLAS30.dll")
                dll.ICTCLAS_Init(c_char_p("."))
                for i in range(0,2):
                    dm=DataManager.DataManager(roots[i])
                    subdir=dm.GetSubDir()
                    filepathstotalsrc=[]
                    for sub in subdir:
                        dm.SetFilePathsFromsubDir(roots[i]+os.sep+sub)
                        filepaths=dm.GetFilePaths()
                        filepathsassist=[sub+os.sep+path for path in filepaths]
                        filepathstotalsrc=filepathstotalsrc+filepathsassist
                    for path in filepathstotalsrc:
                        myfile=file(roots[i]+os.sep+path)
                        s=myfile.read()
                        myfile.close()
                        bSuccess=dll.ICTCLAS_ParagraphProcess(c_char_p(s),0)
                        segmented=c_char_p(bSuccess).value
                        segmentedtmp=re.sub("\s+",'|',segmented,0)         # separate words with '|'
                        segmentedfinal=re.sub('\xa1\xa1','',segmentedtmp)  # strip full-width spaces
                        fid=file(rootfinals[i]+os.sep+path,'w')
                        fid.write(segmentedfinal)
                        fid.close()
                dll.ICTCLAS_Exit()
            print 'segmentation module done'
            print '*'*75
            ############## Build the bag-of-words model ##############################################
            from SVM import BagOfWordsConstruction
            BagOfWordsConstruction.BagOfWordsConstruction(r'D:\TextCategorization\segmented',toCalInfoGain)
            print 'bag-of-words module done'
            print '*'*75
            ############## Feature-word selection ####################################################
            from SVM import FeatureSelectionModel
            # the training set holds 2*N/vfold documents, which is the n_size the IG module expects
            featurewords=FeatureSelectionModel.featureSelectionIG(featureDimension,toCalInfoGain,2*N/vfold)
            import cPickle as mypickle
            fid=file(r'D:\TextCategorization\VITdata\data\keywords.dat','w')
            mypickle.dump(featurewords,fid)
            fid.close()
            print 'feature-selection module done'
            print '*'*75
            ############## Build the document vector model ###########################################
            from SVM import VSMformation
            import shutil
            root1=r'D:\TextCategorization\segmented'
            root2=r'D:\TextCategorization\tsegmented'
            keywordsaddress=r'D:\TextCategorization\VITdata\data\keywords.dat'
            print 'begin.....'
            VSMformation.LibSVMFormat(r'D:\TextCategorization\data\data\train.libsvm',root1,keywordsaddress)
            print 'training corpus converted'
            VSMformation.LibSVMFormat(r'D:\TextCategorization\data\data\test.libsvm',root2,keywordsaddress)
            print 'test corpus converted'
            print 'document-vector-model module done'
            print 'batch run complete, congratulations!'
            ############## Archive this combination's results ########################################
            os.chdir(r'D:\TextCategorization')
            new_dir='TextCategorization_'+str(count_done_research_times)+'_'+str(N)+'_'+str(featureDimension)
            os.mkdir(new_dir)
            os.chdir(new_dir)
            os.mkdir('data')
            os.chdir(r'C:\Python26')
            print os.getcwd()
            shutil.move(r'D:\TextCategorization\VITdata\data\keywords.dat','D:\\TextCategorization\\'+new_dir+'\\data')
            shutil.move(r'D:\TextCategorization\data\data\train.libsvm','D:\\TextCategorization\\'+new_dir+'\\data')
            shutil.move(r'D:\TextCategorization\data\data\test.libsvm','D:\\TextCategorization\\'+new_dir+'\\data')
            print 'congratulations, results archived'
            print '###########################finish##################################'
        os.chdir('D:\\')
        print os.getcwd()
        if os.path.isdir('TextCategorization'):
            os.rename('TextCategorization',str(count_done_research_times)+'_'+str(N)+'_rfinish')
        os.chdir(r'C:\Python26')
        toCalInfoGain=0
        print str(count_done_research_times)+'_'+str(N)+' finish'
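Each (count, N, featureDimension) combination ends by archiving its three output files so the next iteration starts from clean working folders. With the configuration above (count_done_research_times=2, N=1500), a finished run leaves behind directories of the following shape:

D:\TextCategorization\TextCategorization_2_1500_10\data\    keywords.dat, train.libsvm, test.libsvm
D:\TextCategorization\TextCategorization_2_1500_20\data\    (same three files, for dimension 20)
...

Once all feature dimensions are processed, the whole D:\TextCategorization tree, including these archive folders, is renamed to D:\2_1500_rfinish.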