A very simple, though not particularly optimized, approach: subclass Lucene.Net.Analysis.Analyzer, Lucene.Net.Analysis.Tokenizer, and Lucene.Net.Analysis.TokenFilter. The implementation follows Lucene.Net.Analysis.Cn, which does unigram segmentation of Chinese (one character per token, e.g. 北京 becomes 北 | 京).

The ChineseAnalyzer class inherits from Lucene.Net.Analysis.Analyzer:

using System;
using System.IO;
using System.Text;
using System.Collections;
using ShootSeg; // the segmenter's namespace; this open-source component comes from http://www.shootsoft.net -- thanks to its author
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    public class ChineseAnalyzer : Analyzer
    {
        private Segment segment = new Segment(); // the Chinese word segmenter

        public ChineseAnalyzer()
        {
            segment.InitWordDics();  // load the dictionaries in the constructor
            segment.Separator = "|"; // separator placed between segmented words
        }

        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new ChineseTokenizer(reader, segment); // hand the segmenter to the tokenizer
            result = new ChineseFilter(result);                         // then filter the tokenized result
            return result;
        }
    }
}

The ChineseTokenizer class inherits from Lucene.Net.Analysis.Tokenizer:

using System;
using System.IO;
using System.Text;
using System.Collections;
using System.Globalization;
using ShootSeg;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    public sealed class ChineseTokenizer : Tokenizer
    {
        private Segment segment;
        private string[] Wordlist; // the segmented words end up in this array
        private string Allstr;     // the whole input stream read into a string
        private int offset = 0;    // search offset into Allstr
        private int start = 0;     // start position of the current word
        private int step = 0;      // index into Wordlist

        public ChineseTokenizer(TextReader _in, Segment segment)
        {
            input = _in;
            Allstr = input.ReadToEnd(); // read the stream into Allstr
            this.segment = segment;     // keep the reference (in hindsight this pass-through was muddled and could have been avoided)
            Wordlist = segment.SegmentText(Allstr).Split('|'); // load the segmented words into Wordlist
        }

        private Token Flush(string str)
        {
            if (str.Length > 0)
            {
                // return a Token carrying the word and its start/end positions in the stream
                return new Token(str, start, start + str.Length);
            }
            else
                return null;
        }

        public override Token Next() // override Next to hand back the next Token
        {
            Token token = null;
            if (step < Wordlist.Length) // must be <, not <=, or the final call indexes past the array
            {
                start = Allstr.IndexOf(Wordlist[step], offset); // locate this word's start position in Allstr
                offset = start + 1;            // advance the search offset
                token = Flush(Wordlist[step]); // emit the segmented word
                step = step + 1;               // move to the next word in Wordlist
            }
            return token;
        }
    }
}
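As a quick sanity check, a console sketch along these lines (assuming ShootSeg's dictionaries are installed; the sample text is illustrative) prints each token together with the offsets that Next() recovers via IndexOf:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.CnByKing;
using ShootSeg;

class TokenizerDemo
{
    static void Main()
    {
        // Same setup that ChineseAnalyzer performs internally.
        Segment segment = new Segment();
        segment.InitWordDics();
        segment.Separator = "|";

        TokenStream ts = new ChineseTokenizer(new StringReader("中文分词测试"), segment);
        for (Token t = ts.Next(); t != null; t = ts.Next())
        {
            // TermText/StartOffset/EndOffset are the Lucene.Net 1.9 Token accessors.
            Console.WriteLine("{0} [{1},{2})", t.TermText(), t.StartOffset(), t.EndOffset());
        }
    }
}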
This ChineseFilter inherits from Lucene.Net.Analysis.TokenFilter and is copied verbatim from the class of the same name in the Lucene.Net.Analysis.Cn project (it filters out digits and symbols as well as English stop words; to filter anything else, add the corresponding code):

using System;
using System.IO;
using System.Collections;
using System.Globalization;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    /// <summary>
    /// Title: ChineseFilter
    /// Description: Filter with a stop word table
    /// Rule: No digital is allowed.
    ///       English word/token should larger than 1 character.
    ///       One Chinese character as one Chinese word.
    /// TO DO:
    ///   1. Add Chinese stop words, such as \ue400
    ///   2. Dictionary based Chinese word extraction
    ///   3. Intelligent Chinese word extraction
    ///
    /// Copyright: Copyright (c) 2001
    /// Company:
    /// @author Yiyi Sun
    /// @version $Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $
    /// </summary>
    public sealed class ChineseFilter : TokenFilter
    {
        // Only English now, Chinese to be added later.
        public static String[] STOP_WORDS =
        {
            "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with"
        };

        private Hashtable stopTable;

        public ChineseFilter(TokenStream _in) : base(_in)
        {
            stopTable = new Hashtable(STOP_WORDS.Length);
            for (int i = 0; i < STOP_WORDS.Length; i++)
                stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
        }

        public override Token Next()
        {
            for (Token token = input.Next(); token != null; token = input.Next())
            {
                String text = token.TermText();

                // why not key off token type here assuming ChineseTokenizer comes first?
                if (stopTable[text] == null)
                {
                    switch (Char.GetUnicodeCategory(text[0]))
                    {
                        case UnicodeCategory.LowercaseLetter:
                        case UnicodeCategory.UppercaseLetter:
                            // English word/token should larger than 1 character.
                            if (text.Length > 1)
                            {
                                return token;
                            }
                            break;

                        case UnicodeCategory.OtherLetter:
                            // One Chinese character as one Chinese word.
                            // Chinese word extraction to be added later here.
                            return token;
                    }
                }
            }
            return null;
        }
    }
}
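The class's own TO DO list starts with Chinese stop words. One minimal way to add them -- the word list below is illustrative, not from the original project -- is to widen the stop table in the constructor shown above; since the stopTable lookup happens before the Unicode-category switch, Chinese entries are filtered the same way the English ones are:

// Illustrative Chinese stop words; extend to taste.
public static String[] CHINESE_STOP_WORDS = { "的", "了", "和", "是", "在" };

// Replacement constructor: folds both lists into the stop table.
public ChineseFilter(TokenStream _in) : base(_in)
{
    stopTable = new Hashtable(STOP_WORDS.Length + CHINESE_STOP_WORDS.Length);
    foreach (String w in STOP_WORDS)
        stopTable[w] = w;
    foreach (String w in CHINESE_STOP_WORDS)
        stopTable[w] = w;
}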
None of the above takes much technical skill, but the payoff is that plugging in a new Chinese segmenter, whatever algorithm it uses, is a matter of a few lines of code, and the segmentation stays completely independent of DotLucene/Lucene.Net itself. To use it, just replace StandardAnalyzer with ChineseAnalyzer, as in the sketch below.
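For example, a minimal index-and-search round trip under the Lucene.Net 1.9 API (the index path, field name, and sample text here are illustrative) differs from the English-only version only in the analyzer passed in:

using System;
using Lucene.Net.Analysis.CnByKing;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;

class SearchDemo
{
    static void Main()
    {
        ChineseAnalyzer analyzer = new ChineseAnalyzer(); // instead of StandardAnalyzer

        // Index a document with the Chinese analyzer.
        IndexWriter writer = new IndexWriter("index", analyzer, true);
        Document doc = new Document();
        doc.Add(new Field("content", "中文分词测试", Field.Store.YES, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
        writer.Close();

        // Search it back with the same analyzer.
        IndexSearcher searcher = new IndexSearcher("index");
        Query query = new QueryParser("content", analyzer).Parse("分词");
        Hits hits = searcher.Search(query);
        for (int i = 0; i < hits.Length(); i++)
            Console.WriteLine(hits.Doc(i).Get("content"));
        searcher.Close();
    }
}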
Click Here To Download contains the compiled Lucene.Net 1.9.1 plus Lucene.Net.Analysis.CnByKing.dll and ShootSeg.dll; reference these three assemblies and simple Chinese search is good to go.