• Lucene 自定义分词器小程序


    测试类
    package LuceneUtil;
    
    import java.io.Reader;
    import java.util.Set;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LetterTokenizer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.Version;
    //自定义过滤分词器
    
    public class MyStopAnalyzer extends Analyzer {
    	private Set stops;
    	public MyStopAnalyzer(String [] sws)//形参为 字符串数组
    	{
    		//会自动将字符串数组转换为Set
    		stops=StopFilter.makeStopSet(Version.LUCENE_35, sws,true);
    		//将原有的停用词加入到现在的停用词中
    		stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    	}
    	
    	public MyStopAnalyzer()
    	{
    		stops=StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    	}
    	
    	public TokenStream tokenStream(String FileName,Reader reader)
    	{
    		return new StopFilter(Version.LUCENE_35, 
    				new LowerCaseFilter(Version.LUCENE_35,
    				new LetterTokenizer(Version.LUCENE_35, reader)),  stops);
    		
    	}
    
    }
    

    
    
     
    package LuceneTest;
    
    import java.io.BufferedWriter;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Iterator;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.util.Version;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
    
    import LuceneUtil.AnalyzerUtils;
    import LuceneUtil.MyStopAnalyzer;
    
    public class TestAnalyzer {
    	static ArrayList<String> list=null;
    	
    	
    	/**
    	 * Program entry point; currently runs only the stop-word comparison demo.
    	 *
    	 * @param args command-line arguments (unused)
    	 * @throws IOException declared for the disabled dictionary helper; not thrown by the active call
    	 */
    	public static void main(String[] args) throws IOException {
    		// Disabled demos: addNewWord( "烟台大学 ") and test().
    		test01();
    	}
    	
    	/** Prints the tokens that MMSegAnalyzer produces for a fixed Chinese sample sentence. */
    	public static void test()
    	{
    		String sample = "我是一名大学生,我来自菏*,我现在烟台大学。";
    		Analyzer mmseg = new MMSegAnalyzer();
    		AnalyzerUtils.displayToken(sample, mmseg);
    	}
    	
    	
    	/**
    	 * Prints the same English sample twice: first tokenized by the custom
    	 * MyStopAnalyzer, then by Lucene's stock StopAnalyzer, so the two
    	 * output lines can be compared.
    	 */
    	public static void test01()
    	{
    		final String sample = " i say :how are You,nice to meet you. ***";

    		// Custom stop-word analyzer: meant to filter out "you", "meet" and "***".
    		String[] extraStopWords = {"you", "meet", "***"};
    		Analyzer customAnalyzer = new MyStopAnalyzer(extraStopWords);

    		// Lucene's built-in StopAnalyzer, for comparison.
    		Analyzer builtinAnalyzer = new StopAnalyzer(Version.LUCENE_35);

    		AnalyzerUtils.displayToken(sample, customAnalyzer);
    		AnalyzerUtils.displayToken(sample, builtinAnalyzer);
    	}
    
    package LuceneUtil;
    import java.io.IOException;
    import java.io.StringReader;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.omg.CORBA.portable.Streamable;
    //测试类
    
    /** Helper for printing the tokens an {@link Analyzer} produces. */
    public class AnalyzerUtils {

    	/**
    	 * Tokenizes {@code str} with {@code a} and prints every term to standard
    	 * output as {@code [term]}, followed by a line break.
    	 *
    	 * <p>The stream is consumed with the reset / incrementToken / end / close
    	 * sequence and is always closed, even when tokenizing fails. An
    	 * {@link IOException} is reported via {@code printStackTrace()} rather than
    	 * propagated.
    	 *
    	 * @param str text to analyze
    	 * @param a   analyzer used to tokenize the text
    	 */
    	public static void displayToken(String str, Analyzer a)
    	{
    		TokenStream ts = null;
    		try {
    			// NOTE(review): "cotents" looks like a typo for "contents"; kept as-is
    			// so the field name passed to the analyzer does not change.
    			ts = a.tokenStream("cotents", new StringReader(str));
    			// Attribute attached to the stream; it is read after each incrementToken() call.
    			CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);

    			ts.reset();
    			while (ts.incrementToken()) {
    				System.out.print("[" + cta + "]");
    			}
    			ts.end();
    			System.out.println();
    		} catch (IOException e) {
    			e.printStackTrace();
    		} finally {
    			if (ts != null) {
    				try {
    					ts.close();
    				} catch (IOException e) {
    					e.printStackTrace();
    				}
    			}
    		}
    	}

    }
    
    
    

    /*public static void addNewWord(String newWord) throws IOException{BufferedWriter bw=new BufferedWriter(new FileWriter("G:\\mmseg\\data\\words-my.dic"));ArrayList<String> list=new ArrayList<String>();list.add(newWord);Iterator<String> iterator=list.iterator();while (iterator.hasNext()){bw.write(iterator.next());bw.flush();bw.newLine();}bw.close();System.out.println("添加成功");}*/}
    
    

    测试结果如下:

    可见,我想和谐(过滤)掉的那几个词已经被过滤掉了。

    第一行为执行和谐后的结果

    第二行为未被和谐的
    [i][say][how][nice]
    [i][say][how][you][nice][meet][you][***]

    不足:目前还不能屏蔽单个汉语词语,汉语只能整句屏蔽,而英语可以按单词屏蔽。

  • 相关阅读:
    mac上的终端bash命令(二)基础
    Android 开发笔记___drawable
    Android 开发笔记___图像按钮__imageButton
    Android 开发笔记___滚动视图__scroll view
    Android 开发笔记___textvieww__跑马灯效果
    Android 开发笔记___textview_聊天室效果
    Android 开发笔记___图像视图__简单截屏
    Android 开发笔记___图像视图
    javaScript学习笔记(一)js基础
    iview2+ 表单密码验证
  • 原文地址:https://www.cnblogs.com/lixingle/p/3313037.html
Copyright © 2020-2023  润新知