Structure of a query (topic) in the XML query file:
<topic>
  <number>CIRB010TopicZH006</number>
  <title>科索沃難民潮</title>
  <question>
    查詢科索沃戰爭中的難民潮情況,以及國際間對其采取的援助。
  </question>
  <narrative>
    相關文件內容包含科省難民湧入的地點、人數。受安置的狀況,難民潮引發的問題,参與救援之國家與國際組織,其援助策略與行動內容之報導。
  </narrative>
  <concepts>
    科省、柯省、科索沃、柯索伏、難民、難民潮、難民營、援助、收容、救援、醫療、人道、避難、馬其頓、土耳其、外交部、國際、聯合國、紅十字會、阿爾巴尼亞裔難民。
  </concepts>
</topic>
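The script further down pulls these fields out with xml.dom.minidom. As a minimal standalone sketch (the file name query-train.xml is a placeholder, not from the original post):

import xml.dom.minidom

doc = xml.dom.minidom.parse('query-train.xml')   # placeholder file name
for topic in doc.documentElement.getElementsByTagName('topic'):
    number = topic.getElementsByTagName('number')[0].childNodes[0].data
    concepts = topic.getElementsByTagName('concepts')[0].childNodes[0].data
    print(number, concepts)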
What the document list (file-list) looks like:
CIRB010/cdn/loc/CDN_LOC_0001457
CIRB010/cdn/loc/CDN_LOC_0000294
CIRB010/cdn/loc/CDN_LOC_0000120
CIRB010/cdn/loc/CDN_LOC_0000661
CIRB010/cdn/loc/CDN_LOC_0001347
CIRB010/cdn/loc/CDN_LOC_0000439
What the vocabulary file (vocab.all) looks like (the first line is the encoding tag; after that, one token per line):
utf8
Copper
version
EGCG
432Kbps
RESERVECHARDONNAY
TommyHolloway
platts
Celeron266MHz
VOLKSWAGEN
INDEX
SmarTone
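Both model files are positional: line k of file-list is the document with internal id k, and line k of vocab.all is the term with id k. A minimal sketch of the lookup tables the main script builds, assuming the files sit under model-dir/ (the path is an assumption):

with open('model-dir/vocab.all', encoding='utf8') as f:   # path is an assumption
    vocab = f.read().splitlines()
with open('model-dir/file-list', encoding='utf8') as f:
    fileList = f.read().splitlines()

vocabDict = {w: k for k, w in enumerate(vocab)}   # term -> term id (its line number)
# fileList[docid] -> a path such as CIRB010/cdn/loc/CDN_LOC_0001457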
Representation of the inverted file (inverted-file):
Each record starts with a header line: the vocab line number of word 1, the vocab line number of word 2 (-1 means the entry is a single word, so only word 1 matters), and the number of documents the entry appears in.
The header is followed by that many posting lines, each giving the document's line number in file-list and the number of times the entry occurs in that document.
1 -1 2
33689 1
38365 1
2 -1 1
33256 1
2 12371 1
33256 1
3 -1 1
10849 2
3 6756 1
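So term 1 appears alone in two documents (33689 and 38365, once each), while "2 12371 1" is a bigram record. A minimal sketch of walking these records and keeping only the single-word postings:

def iter_unigram_postings(invList):
    # yields (term id, doc id, term frequency) for every single-word record
    count = 0
    while count < len(invList):
        w1, w2, df = invList[count].split(' ')
        if w2 == '-1':                       # single-word record
            for k in range(1, int(df) + 1):
                docid, tf = invList[count + k].split(' ')
                yield int(w1), int(docid), int(tf)
        count += int(df) + 1                 # skip the header plus its postings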
Code implementation (only single characters are considered):
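The script builds a document-term matrix from the inverted file, weights it with a BM25-style TF normalization times idf, L2-normalizes each row, and ranks documents by cosine similarity against the query vector; the -r flag adds one round of pseudo relevance feedback (Rocchio). Written out, the weight it computes for term t in document d is

\[ w_{t,d} = \frac{tf_{t,d}\,(k+1)}{tf_{t,d} + k\left(1 - b + b\,\frac{|d|}{avgdl}\right)} \cdot \log\frac{N}{df_t}, \qquad k = 7,\ b = 0.7 \]

where |d| is the document length, avgdl the average length, N the number of documents, and df_t the document frequency of t.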
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import getopt
import xml.dom.minidom
import numpy as np
import scipy.sparse as sp
from math import log
from sklearn.preprocessing import normalize

# parse the command-line arguments
def main(argv):
    ifFeedback = False
    queryFile = rankedList = modelDir = NTCIRDir = None
    try:
        opts, args = getopt.getopt(argv, 'ri:o:m:d:', [])
    except getopt.GetoptError:
        print('wrong input')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-r':
            ifFeedback = True
        elif opt == '-i':
            queryFile = arg
        elif opt == '-o':
            rankedList = arg
        elif opt == '-m':
            modelDir = arg
        elif opt == '-d':
            NTCIRDir = arg
    return ifFeedback, queryFile, rankedList, modelDir, NTCIRDir

ifFeedback, queryFile, rankedList, modelDir, NTCIRDir = main(sys.argv[1:])
# NTCIRDir is accepted for completeness but not used below

# file paths inside the model dir
vocabFile = modelDir + '/vocab.all'
fileListFile = modelDir + '/file-list'
invListFile = modelDir + '/inverted-file'

# read the three model files, one entry per line
with open(vocabFile, encoding='utf8') as pf:
    vocab = pf.read().splitlines()
with open(fileListFile, encoding='utf8') as pf:
    fileList = pf.read().splitlines()
with open(invListFile, encoding='utf8') as pf:
    invList = pf.read().splitlines()

# vocab dict: term -> its line number in vocab.all
vocabDict = {w: k for k, w in enumerate(vocab)}

# build the raw TF matrix (docs x vocab) and the document-frequency vector;
# only single-word records (second id == -1) are used
IDFVector = np.zeros(len(vocab))
totalDocs = len(fileList)
tempMatrix = np.zeros((len(fileList), len(vocab)))
count = 0
while count < len(invList):
    post = invList[count].split(' ')
    k = 1
    if len(post) > 2 and post[1] == '-1':
        IDFVector[int(post[0])] = int(post[2])   # document frequency for now
        while k <= int(post[2]):
            docid, tf = invList[count + k].split(' ')
            tempMatrix[int(docid), int(post[0])] = int(tf)
            k += 1
    count += k
tfMatrix = sp.csr_matrix(tempMatrix)

# BM25 TF normalization: tf' = tf*(k+1) / (tf + k*(1 - b + b*doclen/avglen))
doclens = tfMatrix.sum(1)            # document lengths, shape (ndocs, 1)
avglen = doclens.mean()
k = 7
b = 0.7
tp1 = tfMatrix * (k + 1)             # numerator, taken before tfMatrix.data changes
tp2 = k * (1 - b + b * doclens / avglen)
tfMatrix.data += np.array(tp2[tfMatrix.tocoo().row]).reshape(len(tfMatrix.data))
tfMatrix.data = tp1.data / tfMatrix.data

# turn document frequencies into idf = log(N / df)
for t in range(len(vocab)):
    if IDFVector[t] != 0:
        IDFVector[t] = log(float(totalDocs) / IDFVector[t])

# tf-idf: scale every stored value by the idf of its column
tfMatrix.data *= IDFVector[tfMatrix.indices]

# L2-normalize every document row
normalize(tfMatrix, norm='l2', axis=1, copy=False)

# deal with the queries
def getText(topic, tag):
    return topic.getElementsByTagName(tag)[0].childNodes[0].data

def rank(qVector):
    # cosine similarity of every document against the query, top 100
    sim = (tfMatrix * sp.csr_matrix(qVector).transpose()).toarray().ravel()
    pairs = sorted(((sim[d], d) for d in range(len(fileList))), reverse=True)
    return pairs[:100]

doc = xml.dom.minidom.parse(queryFile)
topics = doc.documentElement.getElementsByTagName('topic')
rankList = ''
for topic in topics:
    number = getText(topic, 'number')
    # query vector: character counts over all four text fields
    qVector = np.zeros(len(vocab))
    text = (getText(topic, 'title') + getText(topic, 'question')
            + getText(topic, 'narrative') + getText(topic, 'concepts'))
    for w in text:
        if w in vocabDict:
            qVector[vocabDict[w]] += 1
    # L2-normalize the query vector (reshape: sklearn expects 2-D input)
    qVector = normalize(qVector.reshape(1, -1), norm='l2').ravel()
    simCount = rank(qVector)

    if ifFeedback:
        # pseudo relevance feedback (Rocchio): add 0.8 * centroid of the
        # top 20 documents, then re-normalize and re-rank
        topk = [d for score, d in simCount[:20]]
        centroid = tfMatrix[topk, :].sum(0) / 20
        qVector += np.array(0.8 * centroid).ravel()
        qVector = normalize(qVector.reshape(1, -1), norm='l2').ravel()
        simCount = rank(qVector)

    # output: query number (digits after 'ZH') and the lower-cased document name
    num = number.split('ZH')[1]
    for score, d in simCount:
        name = fileList[d].split('/')[3].lower()
        rankList += num + ' ' + name + ' '

with open(rankedList, 'w') as pf:
    pf.write(rankList)
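Assuming the script is saved as vsm.py (the name and the file names below are placeholders, not from the original post), an invocation would look like:

python vsm.py -i query-train.xml -o ranked-list -m model-dir -d CIRB010
python vsm.py -r -i query-train.xml -o ranked-list -m model-dir -d CIRB010    # with Rocchio feedback

The -d directory is parsed but not used by the ranking code itself.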