• Lucene 中分词器的用法


    package com.ljq.analyzer;

    import java.io.StringReader;

    import jeasy.analysis.MMAnalyzer;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.junit.Test;

    /**
     * Demonstrates how different Lucene analyzers split the same kind of input
     * into tokens. Each sample text is paired with the analyzer it is meant to
     * be fed to; the expected console output is recorded in the comments inside
     * {@link #test()}.
     */
    public class AnalyzerTest {

        // Sample inputs, one per analyzer below.
        String ensaText = "IndexWriter addDocument's a javadoc.txt";
        String ensa2Text = "我们是中国人";
        String zhcjkaText = "小笑话_总统的房间 Room .txt";
        String zhmnText = "一位绅士到旅游胜地的一家饭店要开个房间";

        Analyzer ensa = new StandardAnalyzer(); // single-character tokenization
        Analyzer ensa2 = new SimpleAnalyzer();
        Analyzer zhcjka = new CJKAnalyzer(); // bigram (two-character) tokenization
        Analyzer zhmn = new MMAnalyzer(); // dictionary-based tokenization

        /**
         * Runs the dictionary-based analyzer over its sample text and prints the
         * resulting tokens. The calls for the other analyzers are kept, commented
         * out, next to the output they were observed to produce.
         *
         * @throws Exception if tokenizing fails
         */
        @Test
        public void test() throws Exception {
            // Single-character tokenization
            /*(indexwriter,0,11,type=<ALPHANUM>)
            (adddocument,12,25,type=<APOSTROPHE>)
            (javadoc.txt,28,39,type=<HOST>)
            */
            //analyze(ensa, ensaText);

            //(我们是中国人,0,6)
            //analyze(ensa2, ensa2Text);

            // Bigram (two-character) tokenization
            /*(小笑,0,2,type=double)
            (笑话,1,3,type=double)
            (_,3,4,type=single)
            (总统,4,6,type=double)
            (统的,5,7,type=double)
            (的房,6,8,type=double)
            (房间,7,9,type=double)
            (room,10,14,type=single)
            (txt,16,19,type=single)
            */
            //analyze(zhcjka, zhcjkaText);

            // Dictionary-based tokenization
            /*(一位,0,2)
            (绅士,2,4)
            (旅游胜地,5,9)
            (一家,10,12)
            (饭店,12,14)
            (要,14,15)
            (开个,15,17)
            (房间,17,19)
            */
            analyze(zhmn, zhmnText);
        }

        /**
         * Tokenizes {@code text} with {@code analyzer} and prints every token to
         * standard output, one per line.
         *
         * @param analyzer the analyzer used to build the token stream
         * @param text the text to tokenize
         * @throws Exception if reading from or closing the token stream fails
         */
        public void analyze(Analyzer analyzer, String text) throws Exception {
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
            try {
                // The same Token instance is handed back to next() on every
                // iteration; next() returns null once the stream is exhausted.
                for (Token token = new Token(); (token = tokenStream.next(token)) != null;) {
                    System.out.println(token);
                }
            } finally {
                // Release the stream even when tokenizing throws.
                tokenStream.close();
            }
        }
    }
  • 相关阅读:
    快捷JS PHP
    css userAgent (简易浏览器区分) PHP
    http://fw.qq.com/ipaddress PHP
    JS竖排文字 PHP
    奇怪的body PHP
    使用36进制,无损压缩GUID到26位 PHP
    链接<a>执行JS PHP
    纯JS省市区三级联动 PHP
    Table 样式 PHP
    Exceeded storage allocation. The server response was: 4.3.1 Message size exceeds fixed maximum message size
  • 原文地址:https://www.cnblogs.com/linjiqin/p/2001594.html
Copyright © 2020-2023  润新知