• lucene3.6.0的高亮显示


    需要引入

    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-core</artifactId>
    			<version>3.6.0</version>
    		</dependency>
    		<dependency>
    			<groupId>org.apache.lucene</groupId>
    			<artifactId>lucene-highlighter</artifactId>
    			<version>3.6.0</version>
    		</dependency>

    示例代码:

    import java.io.File;
    import java.io.IOException;
    import java.io.StringReader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.highlight.Highlighter;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    /**
     * Demo: runs a query against an existing Lucene 3.6 index and prints, for
     * every hit, the best highlighted fragment of the stored "context" field.
     */
    public class DocSearch {

        /** Index location used when the caller does not supply one. */
        private static final String DEFAULT_INDEX_PATH = "E:\\output\\lucence\\index";

        /** Query used by {@link #main(String[])} when no argument is given. */
        private static final String DEFAULT_QUERY = "旧水泥袋";

        /** Name of the field that is both searched and highlighted. */
        private static final String FIELD = "context";

        /** Upper bound on the number of hits fetched and printed. */
        private static final int MAX_HITS = 1000;

        /**
         * Pinned instead of the deprecated {@code Version.LUCENE_CURRENT} so that
         * analysis behaviour does not silently change when the Lucene jar is upgraded.
         */
        private static final Version MATCH_VERSION = Version.LUCENE_36;

        /**
         * Searches the default index and prints the highlighted hits.
         *
         * @param key query string in Lucene query-parser syntax
         * @throws IOException if the index cannot be opened or read
         * @throws ParseException if {@code key} is not a valid query
         * @throws InvalidTokenOffsetsException if the highlighter meets a token
         *         whose offsets fall outside the text being highlighted
         */
        public static void search(String key) throws IOException, ParseException, InvalidTokenOffsetsException {
            search(key, new File(DEFAULT_INDEX_PATH));
        }

        /**
         * Searches the index stored in {@code indexDir} and prints the highlighted hits.
         * Directory, reader and searcher are all closed before this method returns,
         * whether it completes normally or throws.
         *
         * @param key query string in Lucene query-parser syntax
         * @param indexDir directory containing the Lucene index
         * @throws IOException if the index cannot be opened or read
         * @throws ParseException if {@code key} is not a valid query
         * @throws InvalidTokenOffsetsException if the highlighter meets a token
         *         whose offsets fall outside the text being highlighted
         */
        public static void search(String key, File indexDir)
                throws IOException, ParseException, InvalidTokenOffsetsException {
            Directory directory = FSDirectory.open(indexDir);
            try {
                IndexReader reader = IndexReader.open(directory);
                try {
                    IndexSearcher searcher = new IndexSearcher(reader);
                    try {
                        printHighlightedHits(searcher, key);
                    } finally {
                        // Closing the searcher does not close a reader that was
                        // handed to its constructor, hence the nested finally blocks.
                        searcher.close();
                    }
                } finally {
                    reader.close();
                }
            } finally {
                directory.close();
            }
        }

        /** Parses {@code key}, runs it on {@code searcher} and prints one line per hit. */
        private static void printHighlightedHits(IndexSearcher searcher, String key)
                throws IOException, ParseException, InvalidTokenOffsetsException {
            Analyzer analyzer = new StandardAnalyzer(MATCH_VERSION);
            try {
                Query query = new QueryParser(MATCH_VERSION, FIELD, analyzer).parse(key);
                System.out.println(query.toString());

                ScoreDoc[] hits = searcher.search(query, null, MAX_HITS).scoreDocs;
                Highlighter highlighter = new Highlighter(new QueryScorer(query));

                for (ScoreDoc hit : hits) {
                    Document hitDoc = searcher.doc(hit.doc);
                    String text = hitDoc.get(FIELD);
                    if (text == null) {
                        // Field is not stored for this document: nothing to highlight.
                        continue;
                    }
                    TokenStream tokens = analyzer.tokenStream(FIELD, new StringReader(text));
                    String fragment = highlighter.getBestFragment(tokens, text);
                    // getBestFragment yields null when no fragment scores; show the raw text instead.
                    System.out.println(fragment != null ? fragment : text);
                }
            } finally {
                analyzer.close();
            }
        }

        /**
         * Entry point. Uses the first command-line argument as the query, or a
         * built-in sample query when none is given.
         */
        public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
            search(args.length > 0 ? args[0] : DEFAULT_QUERY);
        }

    }

    索引建立和数据参考http://zhwj184.iteye.com/admin/blogs/1522709

     

    输出结果:

    context:旧 context:水 context:泥 context:袋
    采购<B>旧</B>编织<B>袋</B>、<B>旧</B><B>水</B><B>泥</B><B>袋</B>
    <B>水</B><B>泥</B>
    采购<B>水</B><B>泥</B>电阻
    求购<B>水</B><B>泥</B>输送链条和提升机
    1万5 潜<B>水</B>料啤酒手提包 手提<B>袋</B>
    大量采购包装用的编织<B>袋</B>(新的<B>旧</B>的,有无商标皆可)
    铁<B>泥</B> 铁灰
    废<B>旧</B>砂轮
    软陶<B>泥</B>,超轻粘土
    <B>水</B>泵
    手<B>袋</B>
    <B>水</B>锈石 上<B>水</B>石  吸<B>水</B>石
    足浴<B>袋</B>  泡脚<B>袋</B> 异形<B>袋</B>
    手提<B>袋</B>制<B>袋</B>机
    回收库存废<B>旧</B>油墨油漆
    回收库存<B>旧</B>油漆13463048572
    求购废<B>旧</B>油漆油墨13463048572
    求购库存<B>旧</B>化工树脂

    highlighter类的分析
    /**
     * Class used to markup highlighted terms found in the best sections of a
     * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
     * {@link Encoder} and tokenizers.
     */
    public class Highlighter
    {
      public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
    
      private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
    	private Formatter formatter;
    	private Encoder encoder;
    	private Fragmenter textFragmenter=new SimpleFragmenter();
    	private Scorer fragmentScorer=null;
    
    	/**
    	 * Creates a highlighter that formats matches with a new
    	 * {@link SimpleHTMLFormatter}; delegates to the two-argument constructor.
    	 *
    	 * @param fragmentScorer scorer passed through to the delegated constructor
    	 */
    	public Highlighter(Scorer fragmentScorer)
    	{
    		this(new SimpleHTMLFormatter(),fragmentScorer);
    	}
    
    
     	/**
     	 * Creates a highlighter with the given formatter and a new
     	 * {@link DefaultEncoder}; delegates to the three-argument constructor.
     	 *
     	 * @param formatter formatter passed through to the delegated constructor
     	 * @param fragmentScorer scorer passed through to the delegated constructor
     	 */
     	public Highlighter(Formatter formatter, Scorer fragmentScorer)
     	{
    		this(formatter,new DefaultEncoder(),fragmentScorer);
    	}
    
    
    	/**
    	 * Creates a highlighter from explicitly supplied collaborators; the
    	 * arguments are stored in the corresponding fields as given.
    	 *
    	 * @param formatter stored in the {@code formatter} field
    	 * @param encoder stored in the {@code encoder} field
    	 * @param fragmentScorer stored in the {@code fragmentScorer} field
    	 */
    	public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
    	{
     		this.formatter = formatter;
    		this.encoder = encoder;
     		this.fragmentScorer = fragmentScorer;
     	}

    这里有两个扩展点:formatter 和 encoder。formatter 其实就是对高亮部分的显示逻辑,比如默认是直接在匹配的词前后加上 <B></B>;encoder 默认不做任何处理,可以通过自定义 encoder 对输入的文本进行编码处理(例如 HTML 转义)。

     

    可以查看 highlighter 的 encoder 的一个实现 SimpleHTMLEncoder(注意:默认使用的是 DefaultEncoder,见上面的构造函数)

    package org.apache.lucene.search.highlight;
    /**
     * Copyright 2005 The Apache Software Foundation
     *
     * Licensed under the Apache License, Version 2.0 (the "License");
     * you may not use this file except in compliance with the License.
     * You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    /**
     * Simple {@link Encoder} implementation to escape text for HTML output
     *
     */
    public class SimpleHTMLEncoder implements Encoder
    {
    	public SimpleHTMLEncoder()
    	{
    	}
    
    	public String encodeText(String originalText)
    	{
    		return htmlEncode(originalText);
    	}
    	
    	/**
    	 * Encode string into HTML
    	 */
    	public final static String htmlEncode(String plainText) 
    	{
    		if (plainText == null || plainText.length() == 0)
    		{
    			return "";
    		}
    
    		StringBuilder result = new StringBuilder(plainText.length());
    
    		for (int index=0; index<plainText.length(); index++) 
    		{
    			char ch = plainText.charAt(index);
    
    			switch (ch) 
    			{
    			case '"':
    				result.append(""");
    				break;
    
    			case '&':
    				result.append("&");
    				break;
    
    			case '<':
    				result.append("<");
    				break;
    
    			case '>':
    				result.append(">");
    				break;
    
    			default:
    				   if (ch < 128) 
    				   {
    			           result.append(ch);
    			       } 
    				   else 
    			       {
    			           result.append("&#").append((int)ch).append(";");
    			       }
    			}
    		}
    
    		return result.toString();
    	}
    }

    formatter的默认实现

    package org.apache.lucene.search.highlight;
    
    /**
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    /**
     * {@link Formatter} that marks up a scoring term by placing a configurable
     * opening tag before it and a configurable closing tag after it.
     */
    public class SimpleHTMLFormatter implements Formatter {

        private static final String DEFAULT_PRE_TAG = "<B>";
        private static final String DEFAULT_POST_TAG = "</B>";

        private String preTag;
        private String postTag;

        /** Creates a formatter that wraps terms in HTML {@code <B>} tags. */
        public SimpleHTMLFormatter() {
            this(DEFAULT_PRE_TAG, DEFAULT_POST_TAG);
        }

        /**
         * Creates a formatter with custom markup.
         *
         * @param preTag text emitted before each highlighted term
         * @param postTag text emitted after each highlighted term
         */
        public SimpleHTMLFormatter(String preTag, String postTag) {
            this.preTag = preTag;
            this.postTag = postTag;
        }

        /**
         * Returns {@code originalText} unchanged when the token group's total
         * score is zero or negative; otherwise returns it surrounded by the
         * configured tags.
         *
         * @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup)
         */
        public String highlightTerm(String originalText, TokenGroup tokenGroup) {
            final boolean unscored = tokenGroup.getTotalScore() <= 0;
            if (!unscored) {
                // Size the builder exactly once so appending never has to grow it.
                final int capacity = preTag.length() + originalText.length() + postTag.length();
                return new StringBuilder(capacity)
                        .append(preTag)
                        .append(originalText)
                        .append(postTag)
                        .toString();
            }
            return originalText;
        }

    }
    


  • 相关阅读:
    基于docker swarm的搭建高可用web集群
    软链接和硬链接的区别
    PTA(Basic Level) Practice 刷题(部分) Python实现
    使用官方提供的方式在CentOS上安装docker
    mysql表分区的限制
    有个免费云服务器速度很快!
    PhpMyAdmin 配置文件现在需要一个短语密码的解决方法
    Django笔记:Memcached缓存系统
    Django笔记:文件上传
    Django笔记:表单验证
  • 原文地址:https://www.cnblogs.com/secbook/p/2655174.html
Copyright © 2020-2023  润新知