• Lucene 中分词器的用法


    package com.ljq.analyzer;

    import java.io.StringReader;

    import jeasy.analysis.MMAnalyzer;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.junit.Test;

    /**
     * Demonstrates how different Lucene analyzers split English and Chinese text
     * into tokens. Each analyzer is paired with a sample text; {@link #test()}
     * prints the tokens produced for one of the pairs.
     */
    public class AnalyzerTest {

        // Sample texts, one per analyzer declared below.
        String ensaText = "IndexWriter addDocument's a javadoc.txt";
        String ensa2Text = "我们是中国人";
        String zhcjkaText = "小笑话_总统的房间 Room .txt";
        String zhmnText = "一位绅士到旅游胜地的一家饭店要开个房间";

        // Single-character tokenization (author's label for StandardAnalyzer).
        Analyzer ensa = new StandardAnalyzer();
        Analyzer ensa2 = new SimpleAnalyzer();
        // Bigram (two-character overlapping) tokenization.
        Analyzer zhcjka = new CJKAnalyzer();
        // Dictionary-based tokenization.
        Analyzer zhmn = new MMAnalyzer();

        /**
         * Runs one analyzer over its sample text and prints the resulting tokens.
         * The commented-out calls exercise the other analyzers; the output the
         * original author recorded for each call is listed next to it.
         *
         * @throws Exception if tokenization fails
         */
        @Test
        public void test() throws Exception {
            // Single-character tokenization. Recorded output:
            //   (indexwriter,0,11,type=<ALPHANUM>)
            //   (adddocument,12,25,type=<APOSTROPHE>)
            //   (javadoc.txt,28,39,type=<HOST>)
            //analyze(ensa, ensaText);

            // Recorded output:
            //   (我们是中国人,0,6)
            //analyze(ensa2, ensa2Text);

            // Bigram tokenization. Recorded output:
            //   (小笑,0,2,type=double)
            //   (笑话,1,3,type=double)
            //   (_,3,4,type=single)
            //   (总统,4,6,type=double)
            //   (统的,5,7,type=double)
            //   (的房,6,8,type=double)
            //   (房间,7,9,type=double)
            //   (room,10,14,type=single)
            //   (txt,16,19,type=single)
            //analyze(zhcjka, zhcjkaText);

            // Dictionary-based tokenization. Recorded output:
            //   (一位,0,2)
            //   (绅士,2,4)
            //   (旅游胜地,5,9)
            //   (一家,10,12)
            //   (饭店,12,14)
            //   (要,14,15)
            //   (开个,15,17)
            //   (房间,17,19)
            analyze(zhmn, zhmnText);
        }

        /**
         * Tokenizes the given text with the given analyzer and prints every token
         * to standard output, one per line.
         *
         * @param analyzer the analyzer used to tokenize the text
         * @param text the text to tokenize
         * @throws Exception if reading tokens from, or closing, the stream fails
         */
        public void analyze(Analyzer analyzer, String text) throws Exception {
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
            try {
                // The same Token instance is handed back to next() on each call so the
                // stream may reuse it instead of allocating a new one per token.
                for (Token token = new Token(); (token = tokenStream.next(token)) != null;) {
                    System.out.println(token);
                }
            } finally {
                // Release the stream (and the reader behind it) even if iteration throws.
                tokenStream.close();
            }
        }
    }
  • 相关阅读:
    斯特林数
    JAVA substring截取报错java.lang.StringIndexOutOfBoundsException: String index out of range:
    大爽Python入门教程 2-2 序列: 字符串、元组与列表
    大爽Python入门教程 2-3 字符串,列表,字典
    大爽Python入门教程 2-4 练习
    大爽Python入门教程 2-1 认识容器
    JS 树形结构 根据子节点找到所有上级
    kafka扩容和分区重新分配
    Kafka 常用命令总结
    kafka的groupid
  • 原文地址:https://www.cnblogs.com/linjiqin/p/2001594.html
Copyright © 2020-2023  润新知