• Document Retrieval with the Vector Space Model



    Structure of a query (topic) in the XML input file:

    <topic>
    <number>CIRB010TopicZH006</number>
    <title>科索沃難民潮</title>
    <question>
    查詢科索沃戰爭中的難民潮情況,以及國際間對其采取的援助。
    </question>
    <narrative>
    相關文件內容包含科省難民湧入的地點、人數。受安置的狀況,難民潮引發的問題,参與救援之國家與國際組織,其援助策略與行動內容之報導。
    </narrative>
    <concepts>
    科省、柯省、科索沃、柯索伏、難民、難民潮、難民營、援助、收容、救援、醫療、人道、避難、馬其頓、土耳其、外交部、國際、聯合國、紅十字會、阿爾巴尼亞裔難民。
    </concepts>
    </topic>
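
    Each <topic> carries a number, a title, a free-text question, a longer narrative, and a comma-separated concept list; the script below concatenates title, question, narrative and concepts into one bag of characters. As a minimal standalone sketch of extracting these fields with xml.dom.minidom (the file name query-train.xml is a placeholder):

    from xml.dom.minidom import parse

    def topic_text(topic,tag):
    	#text content of the first <tag> element under this <topic>
    	return topic.getElementsByTagName(tag)[0].childNodes[0].data

    doc=parse('query-train.xml')	#placeholder path
    for topic in doc.documentElement.getElementsByTagName('topic'):
    	number=topic_text(topic,'number')	#e.g. CIRB010TopicZH006
    	text=topic_text(topic,'title')+topic_text(topic,'question') \
    		+topic_text(topic,'narrative')+topic_text(topic,'concepts')
    	print('%s: %d query characters' % (number,len(text)))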

    What the document list (file-list) looks like:

    CIRB010/cdn/loc/CDN_LOC_0001457
    CIRB010/cdn/loc/CDN_LOC_0000294
    CIRB010/cdn/loc/CDN_LOC_0000120
    CIRB010/cdn/loc/CDN_LOC_0000661
    CIRB010/cdn/loc/CDN_LOC_0001347
    CIRB010/cdn/loc/CDN_LOC_0000439
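
    The ranked list that the script finally emits identifies each document by the last component of its file-list path, lowercased. A minimal sketch of that mapping, using the first path above:

    path='CIRB010/cdn/loc/CDN_LOC_0001457'
    print(path.split('/')[3].lower())	#-> cdn_loc_0001457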


    What the vocabulary (vocab.all) looks like; for Chinese, each entry is a single character per line:

    utf8
    Copper
    version
    EGCG
    432Kbps
    RESERVECHARDONNAY
    TommyHolloway
    platts
    Celeron266MHz
    VOLKSWAGEN
    INDEX
    SmarTone
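
    Since each Chinese vocabulary entry is a single character, a query string can be mapped to term ids character by character. A minimal sketch of the lookup the script performs below (Python 2, as in the script; the query characters are placeholders):

    vocab=open('vocab.all').read().splitlines()
    vocabDict=dict((w,i) for i,w in enumerate(vocab))
    for ch in u'難民':	#placeholder query characters
    	if ch.encode('utf8') in vocabDict:
    		print(vocabDict[ch.encode('utf8')])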


    Layout of the inverted file (inverted-file):

    line number of term 1 in vocab.all   line number of term 2 in vocab.all (-1 means a unigram; only term 1 is considered)   number of documents containing the term

    ...followed by that many lines, one per document: the document's line number in file-list and the term's occurrence count there. In the sample below, the header `1 -1 2` says vocab term 1, taken as a unigram, occurs in two documents: once in document 33689 and once in document 38365.

    1 -1 2
    33689 1
    38365 1
    2 -1 1
    33256 1
    2 12371 1
    33256 1
    3 -1 1
    10849 2
    3 6756 1
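
    A minimal standalone sketch that parses a file in this layout into a postings dict, assuming the same whitespace-separated format as the sample above (the main script below instead fills a dense document-term matrix):

    postings={}	#term id -> {document line: term frequency}
    lines=open('inverted-file').read().splitlines()
    i=0
    while i<len(lines):
    	term1,term2,df=lines[i].split(' ')
    	i+=1
    	if term2=='-1':	#keep unigram entries only
    		entry={}
    		for j in range(int(df)):
    			doc,tf=lines[i+j].split(' ')
    			entry[int(doc)]=int(tf)
    		postings[int(term1)]=entry
    	i+=int(df)	#skip this entry's postings lines either way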

    Code implementation (only unigrams, i.e. single characters, are considered):


    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    
    import sys
    import getopt
    from xml.dom.minidom import parse
    import xml.dom.minidom
    import scipy.sparse as sp
    from numpy import *
    from math import log
    from sklearn.preprocessing import normalize
    
    #deal with the argv
    def main(argv):
    	ifFeedback=False
    	try:
    		opts,args=getopt.getopt(argv,'ri:o:m:d:',[])
    	except getopt.GetoptError:
    		#unrecognized option: report and quit
    		print 'wrong input'
    		sys.exit(2)
    	for opt,arg in opts:
    		if opt=='-r' and ifFeedback==False:
    			ifFeedback=True
    		elif opt=='-i':
    			queryFile=arg
    		elif opt=='-o':
    			rankedList=arg
    		elif opt=='-m':
    			modelDir=arg
    		elif opt=='-d':
    			NTCIRDir=arg
    		else:
    			pass
    	return ifFeedback,queryFile,rankedList,modelDir,NTCIRDir		
    #if __name__=='__main__' :
    
    
    #get the path in the arguments
    ifFeedback,queryFile,rankedList,modelDir,NTCIRDir=main(sys.argv[1:])
    #print ifFeedback,queryFile,rankedList,modelDir,NTCIRDir
    
    #get the file path in the model-dir
    vocab=modelDir+'/vocab.all'
    fileList=modelDir+'/file-list'
    invList=modelDir+'/inverted-file'
    
    #read
    pf=open(vocab,'r')
    vocab=pf.read()
    pf.close()
    
    pf=open(fileList,'r')
    fileList=pf.read()
    pf.close()
    
    pf=open(invList,'r')
    invList=pf.read()
    pf.close()
    
    #splitlines
    vocab=vocab.splitlines()
    fileList=fileList.splitlines()
    invList=invList.splitlines()
    
    # vocab dict: map each vocabulary entry to its 0-based line number
    vocabDict={}
    for k,w in enumerate(vocab):
    	vocabDict[w]=k
    
    
    #get the TF and IDF matrix
    #dimension:
    #tfMatrix=sp.csr_matrix(len(fileList),len(vocab))
    
    IDFVector=zeros(len(vocab))
    totalDocs=len(fileList)
    count=0
    tempMatrix=zeros((len(fileList),len(vocab)))
    
    while count<len(invList):
    	postings=invList[count]
    	post=postings.split(' ')
    	k=1
    	#a header line has 3 fields (term1 term2 df); postings lines have 2,
    	#so only headers pass this test; term2 == -1 marks a unigram, and
    	#only unigrams are kept
    	if(len(post)>2 and post[1]=='-1'):
    		#store the document frequency for now; it is turned into idf later
    		IDFVector[int(post[0])]=int(post[2])
    		#the df following lines are (document line, term frequency) pairs
    		while k<=int(post[2]):
    			line=invList[count+k].split(' ')
    			tempMatrix[int(line[0])][int(post[0])]=int(line[1])
    			k+=1
    	count+=k
    
    tfMatrix=sp.csr_matrix(tempMatrix)
    
    #BM25
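    #BM25 document-side term-frequency weighting:
    #  tf' = tf*(k+1) / (tf + k*(1 - b + b*doclen/avglen))
    #k and b are free parameters; note k=7 here is far above the k1 of
    #about 1.2-2.0 that is common in the BM25 literature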
    doclens=tfMatrix.sum(1)
    avglen=doclens.mean()
    k=7
    b=0.7
    #
    tp1=tfMatrix*(k+1)
    tp2=k*(1-b+b*doclens/avglen)
    tfMatrix.data+=array(tp2[tfMatrix.tocoo().row]).reshape(len(tfMatrix.data))
    tfMatrix.data=tp1.data/tfMatrix.data
    
    #calculate the idf
    k=0
    while k<len(vocab):
    	if IDFVector[k]!=0:
    		IDFVector[k]=log(float(totalDocs)/IDFVector[k])
    	k+=1
    #tf-idf
    tfMatrix.data*=IDFVector[tfMatrix.indices]
    
    #row normalization for tf-idf matrix
    normalize(tfMatrix,norm='l2',axis=1,copy=False)
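    #rows are now unit length, so the sparse product tfMatrix * q below
    #computes cosine similarity between every document and the query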
    
    #deal with the query
    doc=xml.dom.minidom.parse(queryFile)
    root=doc.documentElement
    topics=root.getElementsByTagName('topic')
    rankList=''
    for topic in topics:
    	#query vector
    	qVector=zeros(len(vocab))
    
    	number=topic.getElementsByTagName('number')[0].childNodes[0].data
    	title=topic.getElementsByTagName('title')[0].childNodes[0].data
    
    	question=topic.getElementsByTagName('question')[0].childNodes[0].data
    	narrative=topic.getElementsByTagName('narrative')[0].childNodes[0].data
    	concepts=topic.getElementsByTagName('concepts')[0].childNodes[0].data
    	
    	#concatenate question and concepts onto the narrative; together with
    	#the title this forms the query's bag of characters
    	narrative+=question+concepts
    	for w in narrative:
    		if w.encode('utf8') in vocabDict:
    			qVector[vocabDict[w.encode('utf8')]]+=1
    	for w in title:
    		if w.encode('utf8') in vocabDict:
    			qVector[vocabDict[w.encode('utf8')]]+=1
    	#L2-normalize the query vector; sklearn's normalize expects a 2-D
    	#array, so reshape to a single row and flatten back afterwards
    	qVector=normalize(qVector.reshape(1,-1),norm='l2').ravel()
    	#similarity compute:
    	#a sparse matrix
    	sim=tfMatrix*(sp.csr_matrix(qVector).transpose())
    
    	sim=sim.toarray()
    	k=0
    	simCount=[]
    	while k<len(fileList):
    		tup=(sim[k][0],k)	#sim is an (N,1) array; take the scalar score
    		simCount.append(tup)
    		k+=1
    
    	#sort
    	simCount.sort(reverse=True)
    	simCount=simCount[:100]
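    	#pseudo-relevance feedback (Rocchio-style): take the current top 20
    	#documents as pseudo-relevant and move the query toward their
    	#centroid, q' = q + 0.8*centroid; the collection is re-ranked below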
    	if ifFeedback:
    		topk=[]
    		for score,k in simCount[:20]:
    			topk.append(k)
    		d=tfMatrix[topk,:].sum(0)/20
    		qVector+=array(0.8*d).reshape(len(qVector))
    	#re-normalize the (possibly updated) query vector and rank again
    	qVector=normalize(qVector.reshape(1,-1),norm='l2').ravel()
    	#similarity compute:
    	#a sparse matrix
    	sim=tfMatrix*(sp.csr_matrix(qVector).transpose())
    
    	sim=sim.toarray()
    	k=0
    	simCount=[]
    	while k<len(fileList):
    		tup=(sim[k][0],k)
    		simCount.append(tup)
    		k+=1
    
    	#sort
    	simCount.sort(reverse=True)
    	simCount=simCount[:100]
    	#.....
    
    	#topic id: the digits after 'ZH' in e.g. CIRB010TopicZH006
    	num=number.split('ZH')
    	num=num[1]
    	for sim in simCount:
    		name=fileList[sim[1]]
    		name=name.split('/')
    		#document name: last path component, lowercased
    		name=name[3].lower()
    		rank=num+' '+name
    		rankList+=rank+'\n'
    
    pf=open(rankedList,'w')
    pf.write(rankList)
    pf.close()
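
    Given the getopt spec 'ri:o:m:d:', the script expects the query file (-i), the output ranked list (-o), the model directory holding vocab.all, file-list and inverted-file (-m), and the NTCIR document directory (-d, parsed but not otherwise used here), plus an optional -r that switches on the feedback pass. A typical invocation (the script name vsm.py is a placeholder):

    python vsm.py -r -i query-train.xml -o ranked-list -m model-dir -d CIRB010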
    





  • Original post: https://www.cnblogs.com/yjbjingcha/p/7340590.html