The related-word recommendation problem is essentially the problem of computing the similarity between any two words.
1. Build a crawler and fetch Baike (encyclopedia) pages; the first batch of the corpus contains over 100,000 records.
2. Parse the Baike pages and build a Lucene index from them (fields: title, id, summary).
3. Related-word recommendation algorithm
In the end we want the similarity Sim(Wi, Wj) between any two words.
Basic idea: each record carries a title and a summary, and for a word Wi the summary is a detailed explanation of that word.
We use the summary to extract a feature vector fvi for Wi, so the word-similarity problem Sim(Wi, Wj) becomes a vector-similarity problem Sim(fvi, fvj).
Using TF-IDF, we compute a weight for every term in a word's summary, keep the top 300 terms as that word's feature vector fv, and store the result in a separate Lucene index.
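Concretely, the weighting implemented in the code below is

$$
w_i(t) = \log\bigl(1 + tf_{t,i}\bigr)\cdot\log\Bigl(1 + \frac{N}{df_t}\Bigr),\qquad
\hat{w}_i(t) = \frac{w_i(t)}{\sqrt{\sum_{t' \in \text{top-300}} w_i(t')^2}}
$$

where tf_{t,i} is the frequency of term t in the summary of Wi, df_t is the number of summaries containing t, and N is the total number of records (the code computes N/df_t with integer division). The top-300 terms together with their normalized weights form fvi.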
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.roboo.segmenter.twoWayAnalyzer.RobooSegmenterFactory;
import com.roboo.segmenter.twoWayAnalyzer.RobooSegmenterFactory.AnalyzerType;

public class FeatureVector {

    /**
     * For every document in inputIndex, compute TF-IDF weights over its summary,
     * keep the top 300 terms, L2-normalize their weights, and store the result
     * as a "word,weight;word,weight;..." string in outputIndex.
     */
    public void compute(String inputIndex, String outputIndex) throws IOException {
        // sort terms by weight in descending order
        Comparator<termSet> comparator = new Comparator<termSet>() {
            public int compare(termSet t1, termSet t2) {
                return -new Double(t1.weight).compareTo(new Double(t2.weight));
            }
        };

        // log terms with zero document frequency and other problematic records
        OutputStreamWriter output = new OutputStreamWriter(new FileOutputStream("Exception.txt", true), "UTF-8");
        BufferedWriter ExcepWriter = new BufferedWriter(output);

        // String index = "E:\data\word2vec-corpus\baike\baike_all_lucene";
        File indexDir = new File(inputIndex);
        Directory directory = FSDirectory.getDirectory(indexDir);
        IndexReader Inreader = IndexReader.open(directory);
        int maxdocument = Inreader.maxDoc();

        // File INDEX_DIR = new File("index");
        File INDEX_DIR = new File(outputIndex);
        IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true);
        IndexSearcher searcher = new IndexSearcher(Inreader);

        for (int i = 0; i < maxdocument; i++) {
            Document doc = Inreader.document(i);
            String keywordContent = doc.get("title");
            String summaryContent = doc.get("summary");
            String idStr = doc.get("id");

            // count term frequencies in the summary, skipping single-character tokens
            Map<String, Integer> fileTermCountMap = new HashMap<String, Integer>();
            Analyzer ana = RobooSegmenterFactory.createAnalyzer(AnalyzerType.NORMAL_WAP_CHANNEL);
            Reader reader = new StringReader(summaryContent);
            TokenStream stream = ana.tokenStream("", reader);
            for (Token t = stream.next(); t != null; t = stream.next()) {
                if (t.termText().length() <= 1)
                    continue;
                String word = t.termText();
                if (!fileTermCountMap.containsKey(word)) {
                    fileTermCountMap.put(word, new Integer(1));
                } else {
                    fileTermCountMap.put(word, fileTermCountMap.get(word) + 1);
                }
            }

            // compute a TF-IDF weight for every term: log(1+tf) * log(1+N/df)
            Iterator<Map.Entry<String, Integer>> it = fileTermCountMap.entrySet().iterator();
            ArrayList<termSet> termList = new ArrayList<termSet>();
            boolean flag = false;
            try {
                while (it.hasNext()) {
                    Map.Entry<String, Integer> entry = it.next();
                    String word = entry.getKey();
                    Term t = new Term("summary", word);
                    int docFreq = searcher.docFreq(t);
                    if (docFreq == 0) {
                        flag = true;
                        docFreq = 1;
                        ExcepWriter.write(idStr + ":" + word + ";");
                    }
                    int tf = entry.getValue();
                    double tfidf = Math.log(1 + tf) * Math.log(1 + maxdocument / docFreq);
                    termSet ts = new termSet(word, tfidf);
                    termList.add(ts);
                }
            } catch (Exception e) {
                flag = true;
                ExcepWriter.write(idStr);
                ExcepWriter.newLine();
                continue;
            }
            if (flag) {
                ExcepWriter.newLine();
            }

            Collections.sort(termList, comparator); // sort
            int topk = 300;                         // choose top 300 terms
            double sumFactor = 0;
            if (topk > termList.size())
                topk = termList.size();
            for (int j = 0; j < topk; j++) {
                sumFactor += Math.pow(termList.get(j).weight, 2);
            }

            // serialize the L2-normalized feature vector as "word,weight;word,weight;..."
            String fv = "";
            for (int j = 0; j < topk; j++) {
                fv += termList.get(j).word + "," + Double.toString(termList.get(j).weight / Math.sqrt(sumFactor)) + ";";
            }

            Document newdoc = new Document();
            newdoc.add(new Field("id", idStr, Field.Store.YES, Field.Index.UN_TOKENIZED));
            newdoc.add(new Field("title", keywordContent, Field.Store.YES, Field.Index.UN_TOKENIZED));
            newdoc.add(new Field("featureVector", fv, Field.Store.YES, Field.Index.UN_TOKENIZED));
            writer.addDocument(newdoc);
            stream.close();
            // System.out.println("read:" + i);
        }

        ExcepWriter.flush();
        ExcepWriter.close();
        writer.optimize();
        writer.close();
        searcher.close();
        Inreader.close();
    }
}
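The code above (and the query snippet later) relies on a small termSet helper class that is not shown in the post. A minimal sketch, assuming it only needs to pair a term with its weight so that term lists can be sorted, might look like this:

// Minimal sketch of the termSet helper referenced above (not part of the original post).
public class termSet {
    public String word;    // the term itself
    public double weight;  // its TF-IDF weight, or later its similarity score

    public termSet(String word, double weight) {
        this.word = word;
        this.weight = weight;
    }
}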
Then, for any query word, we can compute a cosine-similarity score between it and every other word, and return the highest-scoring ones as the recommended related words.
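Since the stored feature vectors are already L2-normalized, the cosine similarity reduces to a dot product over the terms the two vectors have in common, which is exactly the score accumulated in the snippet below:

$$
\mathrm{Sim}(fv_i, fv_j) = \frac{fv_i \cdot fv_j}{\lVert fv_i \rVert\,\lVert fv_j \rVert}
= \sum_{t \in fv_i \cap fv_j} \hat{w}_i(t)\,\hat{w}_j(t)
$$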
// 'keyword' is the query word supplied by the caller; "index" is the feature-vector index built above
Comparator<termSet> comparator = new Comparator<termSet>() {
    public int compare(termSet t1, termSet t2) {
        return -new Double(t1.weight).compareTo(new Double(t2.weight));
    }
};

String index = "index";
File indexDir = new File(index);
Directory directory = FSDirectory.getDirectory(indexDir);
IndexReader Inreader = IndexReader.open(directory);
int maxdocument = Inreader.maxDoc();
IndexSearcher search = new IndexSearcher(directory);

// look the query word up by title in the feature-vector index
Term term = new Term("title", keyword);
TermQuery query = null;
query = new TermQuery(term);
String relatedWords = "";
Hits hits = search.search(query);
if (hits.length() != 0) {
    ArrayList<termSet> simList = new ArrayList<termSet>();
    Map<String, Double> termMap = new HashMap<String, Double>();
    Map<String, Integer> uniquetermMap = new HashMap<String, Integer>();
    Document doc = hits.doc(0);
    // String keyword = doc.get("keyword");
    String fv = doc.get("featureVector");

    // parse the query word's stored "word,weight;" pairs
    String[] allUnit = fv.split(";");
    for (String s : allUnit) {
        if (s != null) {
            String[] oneUnit = s.split(",");
            String word = oneUnit[0];
            double value = Double.valueOf(oneUnit[1]);
            termMap.put(word, value);
        }
    }

    // scan every document and accumulate the dot product with the query vector
    for (int j = 0; j < maxdocument; j++) {
        Document testdoc = Inreader.document(j);
        String testkeyword = testdoc.get("title");
        if (testkeyword.equals(keyword) || testkeyword.contains(keyword) || keyword.contains(testkeyword))
            continue;
        if (uniquetermMap.containsKey(testkeyword))
            continue;
        String testfv = testdoc.get("featureVector");
        String[] allUnittest = testfv.split(";");
        double score = 0;
        for (String s : allUnittest) {
            if (s != null) {
                String[] oneUnit = s.split(",");
                String word = oneUnit[0];
                double testvalue = Double.valueOf(oneUnit[1]);
                if (termMap.containsKey(word)) {
                    score += termMap.get(word) * testvalue;
                }
            }
        }
        termSet tt = new termSet(testkeyword, score);
        simList.add(tt);
        uniquetermMap.put(testkeyword, new Integer(1));
    }
    uniquetermMap.clear();

    // keep the 200 highest-scoring candidates as related words
    Collections.sort(simList, comparator);
    int topk = 200;
    for (int k = 0; k < topk; k++) {
        relatedWords += simList.get(k).word + "--" + Double.toString(simList.get(k).weight) + " ";
    }
} else {
    relatedWords = "no matched";
}
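As a toy illustration of the stored featureVector format and the scoring above (the words and weights below are made up for the example, not real corpus values):

// Toy example of the "word,weight;" serialization and the dot-product scoring.
// Both vectors are already L2-normalized, so the result is their cosine similarity.
public class ToyScore {
    public static void main(String[] args) {
        String queryFv = "手机,0.8;游戏,0.6;";      // feature vector of the query word
        String candidateFv = "游戏,0.6;社交,0.8;";  // feature vector of a candidate word

        java.util.Map<String, Double> queryMap = new java.util.HashMap<String, Double>();
        for (String unit : queryFv.split(";")) {
            String[] pair = unit.split(",");
            queryMap.put(pair[0], Double.valueOf(pair[1]));
        }

        double score = 0;
        for (String unit : candidateFv.split(";")) {
            String[] pair = unit.split(",");
            if (queryMap.containsKey(pair[0])) {
                score += queryMap.get(pair[0]) * Double.valueOf(pair[1]);
            }
        }
        System.out.println(score); // 0.6 * 0.6 = 0.36, since only "游戏" is shared
    }
}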
Entering the query word "腾讯" (Tencent), we get the following recommended related words.
However, computing the related words for every word offline is more than a single machine can handle: for the 100,000+ records it would take an estimated several dozen days.
So the next step is to use MapReduce to tackle this problem; the MapReduce code will be published in a follow-up post...