重写分析器
Lucene.Net中每个分词器都是一个类,同时有一个辅助类,这个辅助类完成分词的大部分逻辑。分词类以Analyzer结尾,辅助类通常以Tokenizer结尾。分词类全部继承自Analyzer类,辅助类通常会继承CharTokenizer。比如说有新需求,希望我们加一个逗号分词器:
namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Comma analyzer: a custom analyzer whose token streams are produced by
    /// <see cref="CommaTokenizer"/>. Further custom analyzers follow the same
    /// recipe: derive one class from Analyzer and a companion class from
    /// CharTokenizer.
    /// </summary>
    public class CommaAnalyzer : Analyzer
    {
        /// <summary>
        /// Builds a brand-new comma tokenizer over <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>A new <see cref="CommaTokenizer"/> wrapping <paramref name="reader"/>.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // CommaTokenizer is the custom tokenizer paired with this analyzer.
            TokenStream commaStream = new CommaTokenizer(reader);
            return commaStream;
        }

        /// <summary>
        /// Returns the tokenizer kept in <c>PreviousTokenStream</c>, re-pointed at
        /// <paramref name="reader"/>; when none has been stored yet, creates a
        /// <see cref="CommaTokenizer"/> and stores it for later calls.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The stored or newly created tokenizer, reading from <paramref name="reader"/>.</returns>
        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {
            Tokenizer cached = (Tokenizer)this.PreviousTokenStream;
            if (cached != null)
            {
                // A tokenizer already exists: reuse it on the new input.
                cached.Reset(reader);
                return cached;
            }

            // First use: create the tokenizer and remember it for subsequent calls.
            Tokenizer created = new CommaTokenizer(reader);
            this.PreviousTokenStream = created;
            return created;
        }
    }
}
辅助类:
namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Companion tokenizer for the comma analyzer. Every character other than
    /// the ASCII comma is reported as a token character.
    /// </summary>
    public class CommaTokenizer : CharTokenizer
    {
        // The single character that is NOT considered part of a token.
        private const char Separator = ',';

        /// <summary>Creates a tokenizer that reads from the given reader.</summary>
        /// <param name="in_Renamed">Source of the text to tokenize.</param>
        public CommaTokenizer(TextReader in_Renamed)
            : base(in_Renamed)
        {
        }

        /// <summary>Creates a tokenizer from an attribute source and a reader.</summary>
        /// <param name="source">Attribute source forwarded to the base constructor.</param>
        /// <param name="in_Renamed">Source of the text to tokenize.</param>
        public CommaTokenizer(AttributeSource source, TextReader in_Renamed)
            : base(source, in_Renamed)
        {
        }

        /// <summary>Creates a tokenizer from an attribute factory and a reader.</summary>
        /// <param name="factory">Attribute factory forwarded to the base constructor.</param>
        /// <param name="in_Renamed">Source of the text to tokenize.</param>
        public CommaTokenizer(AttributeSource.AttributeFactory factory, TextReader in_Renamed)
            : base(factory, in_Renamed)
        {
        }

        /// <summary>
        /// Tells whether <paramref name="c"/> belongs inside a token.
        /// </summary>
        /// <param name="c">Character to classify.</param>
        /// <returns><c>false</c> for the ASCII comma, <c>true</c> for every other character.</returns>
        /// <remarks>
        /// NOTE(review): presumably the CharTokenizer base class ends a token at the
        /// first character for which this returns false; confirm against the base class.
        /// </remarks>
        protected override bool IsTokenChar(char c)
        {
            if (c == Separator)
            {
                return false;
            }

            return true;
        }
    }
}