1、工程结构
2、查询语法代码
- package org.itat.index;<font></font>
- <font></font>
- import java.io.File;<font></font>
- import java.io.IOException;<font></font>
- import java.io.StringReader;<font></font>
- import java.text.ParseException;<font></font>
- import java.text.SimpleDateFormat;<font></font>
- import java.util.Date;<font></font>
- import java.util.HashMap;<font></font>
- import java.util.Map;<font></font>
- import org.apache.lucene.analysis.Analyzer;<font></font>
- import org.apache.lucene.analysis.TokenStream;<font></font>
- import org.apache.lucene.analysis.standard.StandardAnalyzer;<font></font>
- import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;<font></font>
- import org.apache.lucene.document.Document;<font></font>
- import org.apache.lucene.document.Field;<font></font>
- import org.apache.lucene.document.NumericField;<font></font>
- import org.apache.lucene.index.CorruptIndexException;<font></font>
- import org.apache.lucene.index.IndexReader;<font></font>
- import org.apache.lucene.index.IndexWriter;<font></font>
- import org.apache.lucene.index.IndexWriterConfig;<font></font>
- import org.apache.lucene.index.Term;<font></font>
- import org.apache.lucene.queryParser.QueryParser;<font></font>
- import org.apache.lucene.search.BooleanClause.Occur;<font></font>
- import org.apache.lucene.search.BooleanQuery;<font></font>
- import org.apache.lucene.search.FuzzyQuery;<font></font>
- import org.apache.lucene.search.IndexSearcher;<font></font>
- import org.apache.lucene.search.NumericRangeQuery;<font></font>
- import org.apache.lucene.search.PhraseQuery;<font></font>
- import org.apache.lucene.search.PrefixQuery;<font></font>
- import org.apache.lucene.search.Query;<font></font>
- import org.apache.lucene.search.ScoreDoc;<font></font>
- import org.apache.lucene.search.TermQuery;<font></font>
- import org.apache.lucene.search.TermRangeQuery;<font></font>
- import org.apache.lucene.search.TopDocs;<font></font>
- import org.apache.lucene.search.WildcardQuery;<font></font>
- import org.apache.lucene.store.Directory;<font></font>
- import org.apache.lucene.store.FSDirectory;<font></font>
- import org.apache.lucene.store.LockObtainFailedException;<font></font>
- import org.apache.lucene.util.Version;<font></font>
- import org.wltea.analyzer.lucene.IKAnalyzer;<font></font>
- <font></font>
- public class SearcherUtil {<font></font>
- private Directory directory;<font></font>
- private Analyzer analyzer = new IKAnalyzer();<font></font>
- private IndexReader reader;<font></font>
- private String[] ids = {"1","2","3","4","5","6"};<font></font>
- private String[] emails = {"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};<font></font>
- private String[] contents = {<font></font>
- "welcome to visited the space,I like book",<font></font>
- "hello boy, I like pingpeng ball",<font></font>
- "my name is cc I like game",<font></font>
- "I like football",<font></font>
- "I like football and I like basketball too",<font></font>
- "I like movie and swim"<font></font>
- };<font></font>
- private Date[] dates = null;<font></font>
- private int[] attachs = {2,3,1,4,5,5};<font></font>
- private String[] names = {"zhangsan","lisi","john","jetty","mike","jake"};<font></font>
- private Map<String,Float> scores = new HashMap<String,Float>();<font></font>
- <font></font>
- public SearcherUtil() {<font></font>
- // directory = new RAMDirectory();<font></font>
- try {<font></font>
- directory = FSDirectory.open(new File("F:\\Workspaces\\lucenes\\02_lucene_searcher\\index"));<font></font>
- setDates();<font></font>
- scores.put("itat.org",2.0f);<font></font>
- scores.put("zttc.edu", 1.5f);<font></font>
- // index();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- private void setDates() {<font></font>
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");<font></font>
- try {<font></font>
- dates = new Date[ids.length];<font></font>
- dates[0] = sdf.parse("2010-02-19");<font></font>
- dates[1] = sdf.parse("2012-01-11");<font></font>
- dates[2] = sdf.parse("2011-09-19");<font></font>
- dates[3] = sdf.parse("2010-12-22");<font></font>
- dates[4] = sdf.parse("2012-01-01");<font></font>
- dates[5] = sdf.parse("2011-05-19");<font></font>
- } catch (ParseException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- <font></font>
- public void index() {<font></font>
- IndexWriter writer = null;<font></font>
- try {<font></font>
- writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));<font></font>
- writer.deleteAll();<font></font>
- Document doc = null;<font></font>
- for(int i=0;i<ids.length;i++) {<font></font>
- doc = new Document();<font></font>
- doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));<font></font>
- doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));<font></font>
- doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));<font></font>
- doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));<font></font>
- //存储数字<font></font>
- doc.add(new NumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));<font></font>
- //存储日期<font></font>
- doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));<font></font>
- String et = emails[i].substring(emails[i].lastIndexOf("@")+1);<font></font>
- if(scores.containsKey(et)) {<font></font>
- doc.setBoost(scores.get(et));<font></font>
- } else {<font></font>
- doc.setBoost(0.5f);<font></font>
- }<font></font>
- writer.addDocument(doc);<font></font>
- }<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (LockObtainFailedException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- } finally {<font></font>
- try {<font></font>
- if(writer!=null)writer.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- public IndexSearcher getSearcher() {<font></font>
- try {<font></font>
- if(reader==null) {<font></font>
- reader = IndexReader.open(directory);<font></font>
- } else {<font></font>
- IndexReader tr = IndexReader.openIfChanged(reader);<font></font>
- if(tr!=null) {<font></font>
- reader.close();<font></font>
- reader = tr;<font></font>
- }<font></font>
- }<font></font>
- return new IndexSearcher(reader);<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- return null;<font></font>
- }<font></font>
- <font></font>
- public IndexSearcher getSearcher(Directory directory) {<font></font>
- try {<font></font>
- if(reader==null) {<font></font>
- reader = IndexReader.open(directory);<font></font>
- } else {<font></font>
- IndexReader tr = IndexReader.openIfChanged(reader);<font></font>
- if(tr!=null) {<font></font>
- reader.close();<font></font>
- reader = tr;<font></font>
- }<font></font>
- }<font></font>
- return new IndexSearcher(reader);<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- return null;<font></font>
- }<font></font>
- <font></font>
- public void searchByTerm(String field,String name,int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- Query query = new TermQuery(new Term(field,name));<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- public void searchByTermToken(String field,String name,int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- // Query query = new TermQuery(new Term(field,name));<font></font>
- //当用户输入两个关键字时,QueryParser默认它们之间的关系为“或”关系<font></font>
- //下面这么写的话在对用户输入进行扫描时,就会用空格分开的关键字理解为“与”,<font></font>
- //其实也就是构建了一个“与”关系的布尔型查询<font></font>
- // parser.setDefaultOperator(Operator.AND);<font></font>
- QueryParser parser = new QueryParser(Version.LUCENE_35, field, analyzer);<font></font>
- String k = analyzerKey(name);<font></font>
- Query query = parser.parse(name);<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (Exception e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- private String analyzerKey(String key){<font></font>
- // StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);<font></font>
- StringReader reader = new StringReader(key);<font></font>
- TokenStream tokenStream = analyzer.tokenStream("", reader);<font></font>
- CharTermAttribute termattr = tokenStream.addAttribute(CharTermAttribute.class);<font></font>
- StringBuilder sb = new StringBuilder();<font></font>
- try {<font></font>
- while(tokenStream.incrementToken()){<font></font>
- String k = termattr.toString();<font></font>
- sb.append(k).append(" ");<font></font>
- }<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- key = sb.toString().trim();<font></font>
- key = key.replaceAll("\\s+", " AND ");<font></font>
- return sb.toString();<font></font>
- }<font></font>
- public void printDocument(IndexSearcher searcher,TopDocs tds){<font></font>
- System.out.println("共查询了【"+tds.totalHits+"】条");<font></font>
- for(ScoreDoc sd : tds.scoreDocs){<font></font>
- try {<font></font>
- Document doc = searcher.doc(sd.doc);<font></font>
- System.out.println("filename:"+doc.get("filename"));<font></font>
- System.out.println("path:"+doc.get("path"));<font></font>
- System.out.println("date:"+doc.get("date"));<font></font>
- System.out.println("size:"+doc.get("size"));<font></font>
- System.out.println("content:"+doc.get("content"));<font></font>
- System.out.println("-------------------------------------------");<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- }<font></font>
- public void searchByTermRange(String field,String start,String end,int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- Query query = new TermRangeQuery(field,start,end,true, true);<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- /**<font></font>
- * 建立索引时:使用的Field, 而使用NumericRangeQuery, 必须使用NumericField<font></font>
- * @param field<font></font>
- * @param start<font></font>
- * @param end<font></font>
- * @param num<font></font>
- */<font></font>
- public void searchByNumricRange(String field,int start,int end,int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- Query query = NumericRangeQuery.newIntRange(field,start, end,true,true);<font></font>
- // DateTools.dateToString(new Date(), null);<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- public void searchByPrefix(String field,String value,int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- Query query = new PrefixQuery(new Term(field,value));<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- public void searchByWildcard(String field,String value,int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- //在传入的value中可以使用通配符:?和*,?表示匹配一个字符,*表示匹配任意多个字符<font></font>
- Query query = new WildcardQuery(new Term(field,value));<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- public void searchByBoolean(int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- BooleanQuery query = new BooleanQuery();<font></font>
- /*<font></font>
- * BooleanQuery可以连接多个子查询<font></font>
- * Occur.MUST表示必须出现<font></font>
- * Occur.SHOULD表示可以出现<font></font>
- * Occur.MUSE_NOT表示不能出现<font></font>
- */<font></font>
- query.add(new TermQuery(new Term("name","3")), Occur.MUST_NOT);<font></font>
- query.add(new TermQuery(new Term("content","健壮")),Occur.SHOULD);<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- public void searchByPhrase(int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- PhraseQuery query = new PhraseQuery();<font></font>
- query.setSlop(10);<font></font>
- query.add(new Term("content","java"));<font></font>
- //第一个Term<font></font>
- query.add(new Term("content","程序"));<font></font>
- //产生距离之后的第二个Term<font></font>
- // query.add(new Term("content","football"));<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- /**<font></font>
- * 查询用于匹配与指定项相似的项<font></font>
- * 默认是匹配一个有不同的,其他一样的,比如like 和 mike,就是距离算法的相似距离为1<font></font>
- * 这种方式少用,影响效率<font></font>
- */<font></font>
- public void searchByFuzzy(int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- //最后两个参数为匹配率和距离<font></font>
- FuzzyQuery query = new FuzzyQuery(new Term("content","总统"),0.4f,0);<font></font>
- System.out.println(query.getPrefixLength());<font></font>
- System.out.println(query.getMinSimilarity());<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- public void searchByQueryParse(Query query,int num) {<font></font>
- try {<font></font>
- IndexSearcher searcher = getSearcher();<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- System.out.println("一共查询了:"+tds.totalHits);<font></font>
- for(ScoreDoc sd:tds.scoreDocs) {<font></font>
- Document doc = searcher.doc(sd.doc);<font></font>
- System.out.println(doc.get("id")+"---->"+<font></font>
- doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+<font></font>
- doc.get("attach")+","+doc.get("date")+"=="+sd.score);<font></font>
- }<font></font>
- searcher.close();<font></font>
- } catch (CorruptIndexException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- /**<font></font>
- * lucene3.5之前采用的是一种再查询的方式,也就是说先把全部的结果的docid查询出来,然后<font></font>
- * 分页得到该页的docid,然后根据docid得到document信息,<font></font>
- * lucene官方是说他的速度已经够快,再查询不会有效率问题<font></font>
- * @param query<font></font>
- * @param pageIndex<font></font>
- * @param pageSize<font></font>
- */<font></font>
- public void searchPage(String query,int pageIndex,int pageSize) {<font></font>
- try {<font></font>
- Directory dir = FileIndexUtils.getDirectory();<font></font>
- IndexSearcher searcher = getSearcher(dir);<font></font>
- QueryParser parser = new QueryParser(Version.LUCENE_35,"content",analyzer);<font></font>
- Query q = parser.parse(query);<font></font>
- TopDocs tds = searcher.search(q, 500);<font></font>
- ScoreDoc[] sds = tds.scoreDocs;<font></font>
- int start = (pageIndex-1)*pageSize;<font></font>
- int end = pageIndex*pageSize;<font></font>
- for(int i=start;i<end;i++) {<font></font>
- Document doc = searcher.doc(sds[i].doc);<font></font>
- System.out.println("filename:"+doc.get("filename"));<font></font>
- System.out.println("path:"+doc.get("path"));<font></font>
- System.out.println("date:"+doc.get("date"));<font></font>
- System.out.println("size:"+doc.get("size"));<font></font>
- System.out.println("content:"+doc.get("content"));<font></font>
- System.out.println("-------------------------------------------");<font></font>
- }<font></font>
- <font></font>
- searcher.close();<font></font>
- } catch (org.apache.lucene.queryParser.ParseException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- /**<font></font>
- * 目前没有办法只取当前这页的数据,而是要全部查询然后得到docid<font></font>
- * 一种增加效率的方式是取的条数做下限制,比如不要每次都取500条,<font></font>
- * 也是把取的条数设置为当前页的所在位置数,比如每页10条,<font></font>
- * 取第一页数据则取10条,取第二页则取20条,取五页则去50条<font></font>
- * 根据页码和分页大小获取上一次的最后一个ScoreDoc<font></font>
- */<font></font>
- private ScoreDoc getLastScoreDoc(int pageIndex,int pageSize,Query query,IndexSearcher searcher) throws IOException {<font></font>
- if(pageIndex==1)return null;//如果是第一页就返回空<font></font>
- int num = pageSize*(pageIndex-1);//获取上一页的数量<font></font>
- TopDocs tds = searcher.search(query, num);<font></font>
- return tds.scoreDocs[num-1];<font></font>
- }<font></font>
- /**<font></font>
- * 使用这种方式的话是把上一页的最后一个元素给拿到,然后再把pagesize传入,<font></font>
- * 就可以得到当页的数据,其实就是简便了查询,原理还是把全部的docid查询后在得到document<font></font>
- * @param query<font></font>
- * @param pageIndex<font></font>
- * @param pageSize<font></font>
- */<font></font>
- public void searchPageByAfter(String query,int pageIndex,int pageSize) {<font></font>
- try {<font></font>
- Directory dir = FileIndexUtils.getDirectory();<font></font>
- IndexSearcher searcher = getSearcher(dir);<font></font>
- QueryParser parser = new QueryParser(Version.LUCENE_35,"content",analyzer);<font></font>
- Query q = parser.parse(query);<font></font>
- //先获取上一页的最后一个元素<font></font>
- ScoreDoc lastSd = getLastScoreDoc(pageIndex, pageSize, q, searcher);<font></font>
- //通过最后一个元素搜索下页的pageSize个元素<font></font>
- TopDocs tds = searcher.searchAfter(lastSd,q, pageSize);<font></font>
- printDocument(searcher, tds);<font></font>
- searcher.close();<font></font>
- } catch (org.apache.lucene.queryParser.ParseException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- public void searchNoPage(String query) {<font></font>
- try {<font></font>
- Directory dir = FileIndexUtils.getDirectory();<font></font>
- IndexSearcher searcher = getSearcher(dir);<font></font>
- QueryParser parser = new QueryParser(Version.LUCENE_35,"content",new StandardAnalyzer(Version.LUCENE_35));<font></font>
- Query q = parser.parse(query);<font></font>
- TopDocs tds = searcher.search(q, 20);<font></font>
- ScoreDoc[] sds = tds.scoreDocs;<font></font>
- for(int i=0;i<sds.length;i++) {<font></font>
- Document doc = searcher.doc(sds[i].doc);<font></font>
- System.out.println(sds[i].doc+":"+doc.get("path")+"-->"+doc.get("filename"));<font></font>
- }<font></font>
- <font></font>
- searcher.close();<font></font>
- } catch (org.apache.lucene.queryParser.ParseException e) {<font></font>
- e.printStackTrace();<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- }
3、查询语法的测试单元类
- package org.itat.test;<font></font>
- <font></font>
- import java.io.File;<font></font>
- import java.io.IOException;<font></font>
- import org.apache.commons.io.FileUtils;<font></font>
- import org.apache.commons.io.FilenameUtils;<font></font>
- import org.apache.lucene.analysis.Analyzer;<font></font>
- import org.apache.lucene.analysis.standard.StandardAnalyzer;<font></font>
- import org.apache.lucene.queryParser.ParseException;<font></font>
- import org.apache.lucene.queryParser.QueryParser;<font></font>
- import org.apache.lucene.search.Query;<font></font>
- import org.apache.lucene.util.Version;<font></font>
- import org.itat.index.FileIndexUtils;<font></font>
- import org.itat.index.SearcherUtil;<font></font>
- import org.junit.Before;<font></font>
- import org.junit.Test;<font></font>
- import org.wltea.analyzer.lucene.IKAnalyzer;<font></font>
- <font></font>
- public class TestSearch {<font></font>
- private SearcherUtil su;<font></font>
- private Analyzer analyzer = new IKAnalyzer();<font></font>
- @Before<font></font>
- public void init() {<font></font>
- su = new SearcherUtil();<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void testCopyFiles() {<font></font>
- try {<font></font>
- File file = new File("F:\\Workspaces\\lucenes\\02_lucene_searcher\\resource");<font></font>
- for(File f:file.listFiles()) {<font></font>
- String destFileName = FilenameUtils.getFullPath(f.getAbsolutePath())+<font></font>
- FilenameUtils.getBaseName(f.getName())+".she";<font></font>
- FileUtils.copyFile(f, new File(destFileName));<font></font>
- }<font></font>
- } catch (IOException e) {<font></font>
- e.printStackTrace();<font></font>
- }<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByTerm() {<font></font>
- // su.searchByTerm("content","",10);<font></font>
- su.searchByTermToken("content","头脑风暴",10);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByTermRange() {<font></font>
- //查询name以a开头和s结尾的<font></font>
- // su.searchByTermRange("name","a","s",10);<font></font>
- //由于attachs是数字类型,使用TermRange无法查询<font></font>
- // su.searchByTermRange("size",new NumericField("200").stringValue(),new NumericField("500").stringValue(), 10);<font></font>
- QueryParser parser = new QueryParser(Version.LUCENE_35, "size", analyzer);<font></font>
- Query query;<font></font>
- try {<font></font>
- query = parser.parse("size:[100 TO 500]");<font></font>
- su.searchByQueryParse(query, 10);<font></font>
- } catch (ParseException e) {<font></font>
- e.printStackTrace();<font></font>
- } <font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByNumRange() {<font></font>
- // su.searchByNumricRange("attach",2,10, 5);<font></font>
- su.searchByNumricRange("size",100,300, 10);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByPrefix() {<font></font>
- su.searchByPrefix("content", "人", 10);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByWildcard() {<font></font>
- //匹配@itat.org结尾的所有字符<font></font>
- // su.searchByWildcard("email", "*@itat.org", 10);<font></font>
- //匹配j开头的有三个字符的name<font></font>
- // su.searchByWildcard("name", "j???", 10);<font></font>
- su.searchByWildcard("content", "类?", 10);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByBoolean() {<font></font>
- su.searchByBoolean(10);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByPhrase() {<font></font>
- su.searchByPhrase(10);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByFuzzy() {<font></font>
- su.searchByFuzzy(10);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void searchByQueryParse() throws ParseException {<font></font>
- //1、创建QueryParser对象,默认搜索域为content<font></font>
- QueryParser parser = new QueryParser(Version.LUCENE_35, "content", new StandardAnalyzer(Version.LUCENE_35));<font></font>
- //改变空格的默认操作符,以下可以改成AND<font></font>
- //parser.setDefaultOperator(Operator.AND);<font></font>
- //开启第一个字符的通配符匹配,默认关闭因为效率不高<font></font>
- parser.setAllowLeadingWildcard(true);<font></font>
- //搜索content中包含有like的<font></font>
- Query query = parser.parse("like");<font></font>
- <font></font>
- //有basketball或者football的,空格默认就是OR<font></font>
- query = parser.parse("basketball football");<font></font>
- <font></font>
- //改变搜索域为name为mike<font></font>
- //query = parser.parse("content:like");<font></font>
- <font></font>
- //同样可以使用*和?来进行通配符匹配<font></font>
- // query = parser.parse("name:j*");<font></font>
- <font></font>
- //通配符默认不能放在首位<font></font>
- // query = parser.parse("email:*@itat.org");<font></font>
- <font></font>
- //匹配name中没有mike但是content中必须有football的,+和-要放置到域说明前面<font></font>
- query = parser.parse("- name:mike + like");<font></font>
- <font></font>
- //匹配一个区间,注意:TO必须是大写<font></font>
- //query = parser.parse("id:[1 TO 6]");<font></font>
- <font></font>
- //闭区间匹配只会匹配到2<font></font>
- //query = parser.parse("id:{1 TO 3}");<font></font>
- <font></font>
- //完全匹配I Like Football的<font></font>
- //query = parser.parse("\"I like football\"");<font></font>
- <font></font>
- //匹配I 和football之间有一个单词距离的<font></font>
- //query = parser.parse("\"I football\"~1");<font></font>
- <font></font>
- //模糊查询<font></font>
- //query = parser.parse("name:make~");<font></font>
- <font></font>
- //没有办法匹配数字范围(自己扩展Parser)<font></font>
- //query = parser.parse("attach:[2 TO 10]");<font></font>
- su.searchByQueryParse(query, 10);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void indexFile() {<font></font>
- FileIndexUtils.index(true);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void testSearchPage01() {<font></font>
- su.searchPage("java", 2,5);<font></font>
- System.out.println("-------------------------------");<font></font>
- // su.searchNoPage("java");<font></font>
- su.searchPageByAfter("java", 2,2);<font></font>
- }<font></font>
- <font></font>
- @Test<font></font>
- public void testSearchPage02() {<font></font>
- su.searchPageByAfter("java", 3,20);<font></font>
- }<font></font>
- <font></font>
- }
4、创建索引的类
- package org.itat.index;
- import java.io.File;
- import java.io.FileReader;
- import java.io.IOException;
- import org.apache.commons.io.FileUtils;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.NumericField;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.util.Version;
- import org.wltea.analyzer.lucene.IKAnalyzer;
- public class FileIndexUtils {
- private static Directory directory = null;
- private static Analyzer analyzer = new IKAnalyzer();
- static{
- try {
- directory = FSDirectory.open(new File("F:\\Workspaces\\lucenes\\02_lucene_searcher\\index"));
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public static Directory getDirectory() {
- return directory;
- }
- public static void index(boolean hasNew) {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, analyzer));
- if(hasNew) {
- writer.deleteAll();
- }
- File file = new File("F:\\Workspaces\\lucenes\\02_lucene_searcher\\resource");
- Document doc = null;
- for(File f:file.listFiles()) {
- doc = new Document();
- doc.add(new Field("content",FileUtils.readFileToString(f),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(new Field("filename",f.getName(),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(new Field("path",f.getAbsolutePath(),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(f.lastModified()));
- doc.add(new NumericField("size",Field.Store.YES,true).setIntValue((int)(f.length())));
- writer.addDocument(doc);
- }
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
5、对索引进行操作的类
- package org.itat.index;
- import java.io.IOException;
- import java.text.ParseException;
- import java.text.SimpleDateFormat;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.NumericField;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.StaleReaderException;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.store.RAMDirectory;
- import org.apache.lucene.util.Version;
- public class IndexUtil {
- private String[] ids = {"1","2","3","4","5","6"};
- private String[] emails = {"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
- private String[] contents = {
- "welcome to visited the space,I like book",
- "hello boy, I like pingpeng ball",
- "my name is cc I like game",
- "I like football",
- "I like football and I like basketball too",
- "I like movie and swim"
- };
- private Date[] dates = null;
- private int[] attachs = {2,3,1,4,5,5};
- private String[] names = {"zhangsan","lisi","john","jetty","mike","jake"};
- private Directory directory = null;
- private Map<String,Float> scores = new HashMap<String,Float>();
/**
 * Creates an in-memory index: allocates the RAM directory, prepares the sample dates and the
 * per-domain boosts, then calls {@code index()}.
 */
public IndexUtil() {
    directory = new RAMDirectory();
    setDates();
    scores.put("itat.org", 2.0f);
    scores.put("zttc.edu", 1.5f);
    index();
}
- private void setDates() {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
- try {
- dates = new Date[ids.length];
- dates[0] = sdf.parse("2010-02-19");
- dates[1] = sdf.parse("2012-01-11");
- dates[2] = sdf.parse("2011-09-19");
- dates[3] = sdf.parse("2010-12-22");
- dates[4] = sdf.parse("2012-01-01");
- dates[5] = sdf.parse("2011-05-19");
- } catch (ParseException e) {
- e.printStackTrace();
- }
- }
- public void undelete() {
- //使用IndexReader进行恢复
- try {
- IndexReader reader = IndexReader.open(directory,false);
- //恢复时,必须把IndexReader的只读(readOnly)设置为false
- reader.undeleteAll();
- reader.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (StaleReaderException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void merge() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- //会将索引合并为2段,这两段中的被删除的数据会被清空
- //特别注意:此处Lucene在3.5之后不建议使用,因为会消耗大量的开销,
- //Lucene会根据情况自动处理的
- writer.forceMerge(2);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void forceDelete() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- writer.forceMergeDeletes();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void delete() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- //参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值
- //此时删除的文档并不会被完全删除,而是存储在一个回收站中的,可以恢复
- writer.deleteDocuments(new Term("id","1"));
- writer.commit();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void update() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
- /*
- * Lucene并没有提供更新,这里的更新操作其实是如下两个操作的合集
- * 先删除之后再添加
- */
- Document doc = new Document();
- doc.add(new Field("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(new Field("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(new Field("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(new Field("name",names[0],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- writer.updateDocument(new Term("id","1"), doc);
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null) writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- public void query() {
- try {
- IndexReader reader = IndexReader.open(directory);
- //通过reader可以有效的获取到文档的数量
- System.out.println("numDocs:"+reader.numDocs());
- System.out.println("maxDocs:"+reader.maxDoc());
- System.out.println("deleteDocs:"+reader.numDeletedDocs());
- reader.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public void index() {
- IndexWriter writer = null;
- try {
- writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
- writer.deleteAll();
- Document doc = null;
- for(int i=0;i<ids.length;i++) {
- doc = new Document();
- doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- //存储数字
- doc.add(new NumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
- //存储日期
- doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
- String et = emails[i].substring(emails[i].lastIndexOf("@")+1);
- System.out.println(et);
- if(scores.containsKey(et)) {
- doc.setBoost(scores.get(et));
- } else {
- doc.setBoost(0.5f);
- }
- writer.addDocument(doc);
- }
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (LockObtainFailedException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(writer!=null)writer.close();
- } catch (CorruptIndexException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }