• Adding a custom Chinese word-segmentation Analyzer to DotLucene/Lucene.Net


    A very simple, though not especially optimized, approach: derive from Lucene.Net.Analysis.Analyzer and implement companion subclasses of Lucene.Net.Analysis.Tokenizer and Lucene.Net.Analysis.TokenFilter. The implementation follows Lucene.Net.Analysis.Cn, a project that segments Chinese one character at a time (unigram segmentation).
    
    The ChineseAnalyzer class, derived from Lucene.Net.Analysis.Analyzer:
    
    using System;
    using System.IO;
    using System.Text;
    using System.Collections;
    using ShootSeg; // namespace of the word-segmentation component, an open-source project from http://www.shootsoft.net -- thanks to its author
    using Lucene.Net.Analysis;

    namespace Lucene.Net.Analysis.CnByKing
    {
        public class ChineseAnalyzer : Analyzer
        {
            private Segment segment = new Segment(); // the Chinese word-segmentation engine

            public ChineseAnalyzer()
            {
                segment.InitWordDics();  // load the dictionaries in the constructor
                segment.Separator = "|"; // separator inserted between segmented words
            }

            public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
            {
                TokenStream result = new ChineseTokenizer(reader, segment); // pass the segmenter in by reference
                result = new ChineseFilter(result); // then filter the tokenized result
                return result;
            }
        }
    }
    
    The ChineseTokenizer class, derived from Lucene.Net.Analysis.Tokenizer (a small demo of its output follows the listing):
    
    using System;
    using System.IO;
    using System.Text;
    using System.Collections;
    using System.Globalization;
    using ShootSeg;
    using Lucene.Net.Analysis;

    namespace Lucene.Net.Analysis.CnByKing
    {
        public sealed class ChineseTokenizer : Tokenizer
        {
            private Segment segment;
            private string[] Wordlist; // the segmented words go into this array
            private string Allstr;     // the whole input stream, read into one string
            private int offset = 0; int start = 0; int step = 0; // offset: search offset; start: start position; step: index into Wordlist

            public ChineseTokenizer(TextReader _in, Segment segment)
            {
                input = _in;
                Allstr = input.ReadToEnd(); // read the entire stream into Allstr
                this.segment = segment;     // keep the reference (only while writing this did I realize the field is actually unnecessary)
                Wordlist = segment.SegmentText(Allstr).Split('|'); // store the segmented words in Wordlist
            }

            private Token Flush(string str)
            {
                if (str.Length > 0)
                {
                    // return a Token carrying the word plus its start and end positions in the stream
                    return new Token(str, start, start + str.Length);
                }
                else
                    return null;
            }

            public override Token Next() // override Next, which returns the next Token
            {
                Token token = null;
                if (step < Wordlist.Length) // must be <, not <=, or the last call indexes past the array
                {
                    start = Allstr.IndexOf(Wordlist[step], offset); // locate each segmented word in Allstr
                    offset = start + 1;            // advance the search offset
                    token = Flush(Wordlist[step]); // emit the segmented word
                    step = step + 1;               // move on to the next word in Wordlist
                }
                return token;
            }
        }
    }
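
    To see what the tokenizer emits, here is a minimal console sketch. The sample text and the expected segmentation are illustrative only, since the actual output depends on ShootSeg's dictionaries; the Token accessors TermText()/StartOffset()/EndOffset() are the Lucene.Net 1.9-era API.

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.CnByKing;
    using ShootSeg;

    class TokenizerDemo
    {
        static void Main()
        {
            Segment segment = new Segment();
            segment.InitWordDics();
            segment.Separator = "|";

            // feed a sample string straight into the tokenizer
            TokenStream stream = new ChineseTokenizer(new StringReader("我是中国人"), segment);

            // in the Lucene.Net 1.9-era API, Next() returns null at end of stream
            for (Token t = stream.Next(); t != null; t = stream.Next())
                Console.WriteLine("{0} [{1},{2}]", t.TermText(), t.StartOffset(), t.EndOffset());

            // assuming ShootSeg segments the text as 我|是|中国|人, this prints:
            //   我 [0,1]
            //   是 [1,2]
            //   中国 [2,4]
            //   人 [4,5]
        }
    }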
    
    This ChineseFilter class derives from Lucene.Net.Analysis.TokenFilter and is copied verbatim from the class of the same name in the Lucene.Net.Analysis.Cn project (it filters out digits and symbols as well as English stop words; to filter anything else, add the corresponding code — a sketch of one such extension follows the listing).
    using System;
    using System.IO;
    using System.Collections;
    using System.Globalization;
    using Lucene.Net.Analysis;

    namespace Lucene.Net.Analysis.CnByKing
    {
        /// <summary>
        /// Title: ChineseFilter
        /// Description: Filter with a stop word table
        /// Rule: No digital is allowed.
        /// English word/token should larger than 1 character.
        /// One Chinese character as one Chinese word.
        /// TO DO:
        /// 1. Add Chinese stop words, such as \ue400
        /// 2. Dictionary based Chinese word extraction
        /// 3. Intelligent Chinese word extraction
        ///
        /// Copyright: Copyright (c) 2001
        /// Company:
        /// @author Yiyi Sun
        /// @version $Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $
        /// </summary>
        public sealed class ChineseFilter : TokenFilter
        {
            // Only English now, Chinese to be added later.
            public static String[] STOP_WORDS =
            {
                "and", "are", "as", "at", "be", "but", "by",
                "for", "if", "in", "into", "is", "it",
                "no", "not", "of", "on", "or", "such",
                "that", "the", "their", "then", "there", "these",
                "they", "this", "to", "was", "will", "with"
            };

            private Hashtable stopTable;

            public ChineseFilter(TokenStream _in)
                : base(_in)
            {
                stopTable = new Hashtable(STOP_WORDS.Length);

                for (int i = 0; i < STOP_WORDS.Length; i++)
                    stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
            }

            public override Token Next()
            {
                for (Token token = input.Next(); token != null; token = input.Next())
                {
                    String text = token.TermText();

                    // why not key off token type here assuming ChineseTokenizer comes first?
                    if (stopTable[text] == null)
                    {
                        switch (Char.GetUnicodeCategory(text[0]))
                        {
                            case UnicodeCategory.LowercaseLetter:
                            case UnicodeCategory.UppercaseLetter:
                                // English word/token should larger than 1 character.
                                if (text.Length > 1)
                                {
                                    return token;
                                }
                                break;

                            case UnicodeCategory.OtherLetter:
                                // One Chinese character as one Chinese word.
                                // Chinese word extraction to be added later here.
                                return token;
                        }
                    }
                }
                return null;
            }
        }
    }
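
    As noted above, the stop table only covers English words. One possible extension, sketched below, is an additional TokenFilter that drops common Chinese stop words; the ChineseStopFilter name and its word list are purely illustrative, not part of the original project. Chaining it after ChineseFilter in ChineseAnalyzer.TokenStream (result = new ChineseStopFilter(result);) would apply it to every field.

    using System;
    using System.Collections;
    using Lucene.Net.Analysis;

    namespace Lucene.Net.Analysis.CnByKing
    {
        // Hypothetical variant: drops common Chinese stop words after ChineseFilter has run.
        public sealed class ChineseStopFilter : TokenFilter
        {
            private static readonly string[] CN_STOP_WORDS = { "的", "了", "和", "是", "在" }; // illustrative list only
            private Hashtable stopTable;

            public ChineseStopFilter(TokenStream _in)
                : base(_in)
            {
                stopTable = new Hashtable(CN_STOP_WORDS.Length);
                for (int i = 0; i < CN_STOP_WORDS.Length; i++)
                    stopTable[CN_STOP_WORDS[i]] = CN_STOP_WORDS[i];
            }

            public override Token Next()
            {
                // skip tokens found in the stop table, pass everything else through
                for (Token token = input.Next(); token != null; token = input.Next())
                {
                    if (stopTable[token.TermText()] == null)
                        return token;
                }
                return null;
            }
        }
    }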

    None of the above involves anything technically sophisticated; the benefit is that plugging in a new Chinese segmenter, whatever algorithm it uses, takes only a few lines of code, and the segmentation stays completely decoupled from DotLucene/Lucene.Net itself. To use it, just replace StandardAnalyzer with ChineseAnalyzer and you're done, as sketched below.
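
    A minimal indexing-and-search sketch, assuming the Lucene.Net 1.9-era API (string-path constructors, Field.Index.TOKENIZED, Hits); the index path and sample text are illustrative:

    using System;
    using Lucene.Net.Analysis.CnByKing;
    using Lucene.Net.Documents;
    using Lucene.Net.Index;
    using Lucene.Net.QueryParsers;
    using Lucene.Net.Search;

    class UsageDemo
    {
        static void Main()
        {
            // index a document with ChineseAnalyzer instead of StandardAnalyzer
            IndexWriter writer = new IndexWriter("index", new ChineseAnalyzer(), true);
            Document doc = new Document();
            doc.Add(new Field("content", "我是中国人", Field.Store.YES, Field.Index.TOKENIZED));
            writer.AddDocument(doc);
            writer.Close();

            // search with the same analyzer so the query is segmented identically
            IndexSearcher searcher = new IndexSearcher("index");
            Query query = new QueryParser("content", new ChineseAnalyzer()).Parse("中国");
            Hits hits = searcher.Search(query);
            Console.WriteLine("hits: " + hits.Length());
            searcher.Close();
        }
    }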
    Click Here To Download contains the compiled lucene.net 1.91 plus Lucene.Net.Analysis.CnByKing.dll and ShootSeg.dll; referencing these three assemblies is enough for simple Chinese search.


• Original post: https://www.cnblogs.com/HeroBeast/p/1361984.html