• Lucene autocomplete

The class below builds a dedicated suggestion index: every term of a chosen field in a source index is re-indexed through an edge n-gram analyzer together with its document frequency, and suggestions are then served by running a TermQuery against the n-gram field and sorting the hits by that count.

    package com.pera.suggestion;

    import java.io.IOException;
    import java.io.Reader;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.ISOLatin1AccentFilter;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    // from the contrib/analyzers jar
    import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
    import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
    import org.apache.lucene.analysis.standard.StandardFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    // from the contrib/spellchecker jar
    import org.apache.lucene.search.spell.LuceneDictionary;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;


     /**
      * Builds and queries an auto-complete index whose terms are the edge n-grams
      * of the words found in a field of a source Lucene index.
      */
     public class Suggestion {

         private static final String GRAMMED_WORDS_FIELD = "words";

         private static final String SOURCE_WORD_FIELD = "sourceWord";

         private static final String COUNT_FIELD = "count";

         private static final String[] ENGLISH_STOP_WORDS = {
         "a", "an", "and", "are", "as", "at", "be", "but", "by",
         "for", "i", "if", "in", "into", "is",
         "no", "not", "of", "on", "or", "s", "such",
         "t", "that", "the", "their", "then", "there", "these",
         "they", "this", "to", "was", "will", "with"
         };

         private final Directory autoCompleteDirectory;

         private IndexReader autoCompleteReader;

         private IndexSearcher autoCompleteSearcher;

         public Suggestion(String autoCompleteDir) throws IOException {
          this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir,
            null);

          reOpenReader();
         }

         public List<String> suggestTermsFor(String term) throws IOException {
          // look up the (prefix) term in the n-gram field and take the top 5 hits,
          // sorted by how often the full word occurs in the source index
          Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, term));
          Sort sort = new Sort(COUNT_FIELD, true); // reverse sort: highest count first

          TopDocs docs = autoCompleteSearcher.search(query, null, 5, sort);
          List<String> suggestions = new ArrayList<String>();
          for (ScoreDoc doc : docs.scoreDocs) {
           suggestions.add(autoCompleteReader.document(doc.doc).get(
             SOURCE_WORD_FIELD));
          }

          return suggestions;
         }

         @SuppressWarnings("unchecked")
         public void reIndex(Directory sourceDirectory, String fieldToAutocomplete)
           throws CorruptIndexException, IOException {
          // build a dictionary (from the spell package)
          IndexReader sourceReader = IndexReader.open(sourceDirectory);

          LuceneDictionary dict = new LuceneDictionary(sourceReader,
            fieldToAutocomplete);

          // code from
          // org.apache.lucene.search.spell.SpellChecker.indexDictionary(
          // Dictionary)
          IndexReader.unlock(autoCompleteDirectory);

          // use a custom analyzer so we can do EdgeNGramFiltering
          IndexWriter writer = new IndexWriter(autoCompleteDirectory,
          new Analyzer() {
           public TokenStream tokenStream(String fieldName,
             Reader reader) {
            TokenStream result = new StandardTokenizer(reader);

            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new ISOLatin1AccentFilter(result);
            result = new StopFilter(result,
             ENGLISH_STOP_WORDS);
            result = new EdgeNGramTokenFilter(
             result, Side.FRONT,1, 20);

            return result;
           }
          }, true);

          writer.setMergeFactor(300);
          writer.setMaxBufferedDocs(150);

          // go through every word, storing the original word (incl. n-grams)
          // and the number of times it occurs
          Map<String, Integer> wordsMap = new HashMap<String, Integer>();

          Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
          while (iter.hasNext()) {
           String word = iter.next();

           int len = word.length();
           if (len < 3) {
            continue; // skip terms shorter than three characters; longer terms are fine
           }

           if (wordsMap.containsKey(word)) {
            throw new IllegalStateException(
              "This should never happen in Lucene 2.3.2");
            // wordsMap.put(word, wordsMap.get(word) + 1);
           } else {
            // use the number of documents this word appears in
            wordsMap.put(word, sourceReader.docFreq(new Term(
              fieldToAutocomplete, word)));
           }
          }

          for (String word : wordsMap.keySet()) {
           // ok index the word
           Document doc = new Document();
           doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES,
             Field.Index.UN_TOKENIZED)); // orig term
           doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES,
             Field.Index.TOKENIZED)); // grammed
           doc.add(new Field(COUNT_FIELD,
             Integer.toString(wordsMap.get(word)), Field.Store.NO,
             Field.Index.UN_TOKENIZED)); // count

           writer.addDocument(doc);
          }

          sourceReader.close();

          // close writer
          writer.optimize();
          writer.close();

          // re-open our reader
          reOpenReader();
         }

         private void reOpenReader() throws CorruptIndexException, IOException {
          if (autoCompleteReader == null) {
           autoCompleteReader = IndexReader.open(autoCompleteDirectory);
          } else {
           // reopen() returns a (possibly new) reader; the result must be kept
           autoCompleteReader = autoCompleteReader.reopen();
          }

          autoCompleteSearcher = new IndexSearcher(autoCompleteReader);
         }

         public static void main(String[] args) throws Exception {
          Suggestion autocomplete = new Suggestion("/index/autocomplete");

          // run this to re-index from the current index, shouldn't need to do
          // this very often
          // autocomplete.reIndex(FSDirectory.getDirectory("/index/live", null),
          // "content");

          String term = "steve";

          System.out.println(autocomplete.suggestTermsFor(term));
          // prints [steve, steven, stevens, stevenson, stevenage]
         }

     }
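
To see why a plain TermQuery on the "words" field behaves like a prefix search, the standalone snippet below runs a single word through the core of the analyzer chain that reIndex() uses (tokenize, lowercase, edge n-gram) and prints the grams it produces. This is a minimal sketch assuming the same Lucene 2.3-era core and contrib-analyzers jars as the listing above; the class name NGramDemo is purely illustrative.

    package com.pera.suggestion;

    import java.io.StringReader;

    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
    import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
    import org.apache.lucene.analysis.standard.StandardTokenizer;

    public class NGramDemo {

     public static void main(String[] args) throws Exception {
      // abridged version of the anonymous Analyzer in reIndex():
      // StandardTokenizer -> LowerCaseFilter -> EdgeNGramTokenFilter(FRONT, 1, 20)
      TokenStream stream = new StandardTokenizer(new StringReader("Stevenson"));
      stream = new LowerCaseFilter(stream);
      stream = new EdgeNGramTokenFilter(stream, Side.FRONT, 1, 20);

      // Lucene 2.3-style consumption: next() returns null when the stream is exhausted
      Token token;
      while ((token = stream.next()) != null) {
       System.out.println(token.termText());
      }
      stream.close();
      // prints: s, st, ste, stev, steve, steven, stevens, stevenso, stevenson
     }
    }

Because each of those grams is indexed into GRAMMED_WORDS_FIELD, the exact TermQuery for "steve" in suggestTermsFor() matches the entries for "steve", "steven", "stevens" and so on, and the Sort on COUNT_FIELD then ranks them by how many source documents contain the full word.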

  • Original post: https://www.cnblogs.com/cl1024cl/p/6205321.html