import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class StandardAnalyzerTest {

    @SuppressWarnings("resource")
    public static void main(String[] args) {
        try {
            // Text to analyze
            String text = "你好*hello,哈哈! !'ni> < !' hao->。“我”.192.168.8.10";

            // Build a custom stop-word set (case-insensitive); add
            // application-specific stop words here
            List<String> sw = new LinkedList<String>();
            sw.add("");
            CharArraySet stopWords = new CharArraySet(sw, true);

            // Merge in Lucene's default English stop words
            Iterator<Object> itor = StandardAnalyzer.STOP_WORDS_SET.iterator();
            while (itor.hasNext()) {
                stopWords.add(itor.next());
            }

            // StandardAnalyzer (Lucene's built-in standard analyzer:
            // lowercases tokens and strips stop words and punctuation)
            StandardAnalyzer analyzer = new StandardAnalyzer(stopWords);
            TokenStream ts = analyzer.tokenStream("field", text);
            CharTermAttribute ch = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(ch.toString());
            }
            ts.end();
            ts.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}

The tokenization output is:

你
好
hello
哈
哈
ni
hao
我
192.168.8.10

Note that StandardAnalyzer splits CJK text into single-character tokens (你好 becomes 你 and 好), lowercases Latin terms, drops the punctuation, and keeps the IP address 192.168.8.10 as one token.
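For reference, since both Analyzer and TokenStream implement Closeable, the same loop can be written with try-with-resources, which closes the stream even if an exception is thrown and removes the need for @SuppressWarnings("resource"). This is a minimal sketch, assuming a Lucene 5.x/6.x API where StandardAnalyzer has a no-argument constructor (which uses the default English stop-word set):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StandardAnalyzerTryWithResources {

    public static void main(String[] args) throws IOException {
        String text = "你好*hello,哈哈!";
        // Both resources are closed automatically, in reverse order of declaration
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("field", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                        // mandatory before incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                          // finalize end-of-stream state
        }
    }
}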