Lucene是一个全文检索工具库,用于为文本建立索引并在索引上进行搜索;索引的作用类似于新华字典的目录。
这里使用的是lucene-4.4.0版本,入门代码所需jar包如下图所示(解压lucene-4.4.0后的目录):
入门代码:
import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; /*8 * luceneDemo * */ public class TestLucene { /** * 通过lucene 提供的api 对数据建立索引,indexWriter * @throws IOException * */ @Test public void testAdd() throws IOException{ //索引在硬盘上面存放的位置.. Directory directory=FSDirectory.open(new File("D:/INDEX")); //lucene 当前使用的版本... Version matchVersion=Version.LUCENE_44; //分词器...(把一段文本分词)(黑马程序员是高端的培训机构) //analzyer 是一个抽象类,具体的切分词规则由子类实现... Analyzer analyzer=new StandardAnalyzer(matchVersion); IndexWriterConfig config=new IndexWriterConfig(matchVersion, analyzer); //构造索引写入的对象.. IndexWriter indexWriter=new IndexWriter(directory, config); //往索引库里面写数据.. //索引库里面的数据都是document 一个document相当于是一条记录 //这个document里面的数据相当于索引结构.. 
Document document=new Document(); IndexableField indexableField=new IntField("id",1, Store.YES); IndexableField stringfield=new StringField("title","对王召廷的个人评价",Store.YES); IndexableField teIndexableField=new TextField("content","风流倜傥有点黄",Store.YES); document.add(indexableField); document.add(stringfield); document.add(teIndexableField); //索引库里面接收的数据都是document对象 indexWriter.addDocument(document); indexWriter.close(); } /** * 对建立的索引进行搜索... * 通过indexSearcher 去搜索... * @throws IOException */ @Test public void testSearcher() throws IOException{ //索引在硬盘上面存放的位置.. Directory directory=FSDirectory.open(new File("D:/INDEX")); //把索引目录里面的索引读取到IndexReader 当中... IndexReader indexReader=DirectoryReader.open(directory); // /构造搜索索引的对象.. IndexSearcher indexSearcher=new IndexSearcher(indexReader); //Query 它是一个查询条件对象,它是一个抽象类,不同的查询规则就构造不同的子类... Query query=new TermQuery(new Term("title", "对王召廷的个人评价")); //检索符合query 条件的前面N 条记录.. // TopDocs topDocs=indexSearcher.search(query, 10); //返回总记录数... System.out.println(topDocs.totalHits); //存放的都是document 的id ScoreDoc scoreDocs []=topDocs.scoreDocs; for(ScoreDoc scoreDoc:scoreDocs){ //返回的就是document id int docID=scoreDoc.doc; //我还需要根据id 检索到对应的document Document document=indexSearcher.doc(docID); System.out.println("id=="+document.get("id")); System.out.println("title=="+document.get("title")); System.out.println("content=="+document.get("content")); } } }
原理分析图:
demo演示:
根据入门代码流程提炼工具类代码:
import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; /** * lucene 工具类... * @author Administrator * */ /** * 提炼规则,假设这段代码可以完成一个功能,把这个代码提炼到一个方法里面去,假设这个方法在某个业务罗继承可以共用,那么往上抽取, * 假设在其它逻辑层也可以用,提炼到工具类里面去。 * */ public class LuceneUtils { private static IndexWriter indexWriter=null; private static IndexSearcher indexSearcher=null; //索引存放目录.. private static Directory directory=null; private static IndexWriterConfig indexWriterConfig=null; private static Version version=null; private static Analyzer analyzer=null; static { try { directory=FSDirectory.open(new File(Constants.URL)); version=Version.LUCENE_44; analyzer=new StandardAnalyzer(version); indexWriterConfig=new IndexWriterConfig(version, analyzer); } catch (IOException e) { e.printStackTrace(); } } /** * * @return 返回用于操作索引的对象... * @throws IOException */ public static IndexWriter getIndexWriter() throws IOException{ indexWriter=new IndexWriter(directory, indexWriterConfig); return indexWriter; } /** * 返回用于搜索索引的对象... * @return * @throws IOException */ public static IndexSearcher getIndexSearcher() throws IOException{ IndexReader indexReader=DirectoryReader.open(directory); indexSearcher=new IndexSearcher(indexReader); return indexSearcher; } /** * * 返回lucene 当前的版本... * @return */ public static Version getVersion() { return version; } /** * * 返回lucene 当前使用的分词器.. * @return */ public static Analyzer getAnalyzer() { return analyzer; } }
/**
 * Constants shared by the Lucene examples.
 */
public final class Constants {

    /** Directory in which the index is stored. */
    public static final String URL = "d:/indexdir/news";

    /** Not instantiable: this class only holds constants. */
    private Constants() {
    }
}
bean:
package cn.itcast.bean; public class Article { private int id; public int getId() { return id; } public void setId(int id) { this.id = id; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } private String title; private String content; private String author; private String url; }
转换工具类:
package cn.itcast.lucene; import org.apache.lucene.document.Document; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexableField; import cn.itcast.bean.Article; /*8 * 对象与索引库document 之间的转化 * */ public class ArticleToDocument { public static Document articleToDocument(Article article){ Document document=new Document(); IntField idfield=new IntField("id", article.getId(), Store.YES); //StringField 对应的值不分词,textField 分词.. TextField titleField=new TextField("title", article.getTitle(),Store.YES); TextField contentField=new TextField("content", article.getContent(),Store.YES); //修改这个字段对应的权重值,默认这个值为1f // contentField.setBoost(3f); StringField authorField=new StringField("author", article.getAuthor(), Store.YES); StringField urlField=new StringField("url", article.getUrl(), Store.YES); document.add(idfield); document.add(titleField); document.add(contentField); document.add(authorField); document.add(urlField); return document; } }
Dao层:
package cn.itcast.dao; import java.io.IOException; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import cn.itcast.bean.Article; import cn.itcast.lucene.ArticleToDocument; import cn.itcast.uitls.LuceneUtils; /** * 使用lucene 的API 来操作索引库.. * @author Administrator * */ public class LuceneDao { public void addIndex(Article article) throws IOException{ IndexWriter indexWriter=LuceneUtils.getIndexWriter(); Document doc=ArticleToDocument.articleToDocument(article); indexWriter.addDocument(doc); indexWriter.close(); } /** * 删除符合条件的记录... * @param fieldName * @param fieldValue * @throws IOException */ public void delIndex(String fieldName,String fieldValue) throws IOException{ IndexWriter indexWriter=LuceneUtils.getIndexWriter(); //一定要梦想,万一实现了勒 Term term=new Term(fieldName, fieldValue); indexWriter.deleteDocuments(term); indexWriter.close(); } /** * * 更新 * * update table set ? where condtion * @throws IOException * * */ public void updateIndex(String fieldName,String fieldValue,Article article) throws IOException{ IndexWriter indexWriter=LuceneUtils.getIndexWriter(); /** * 1:term 设置更新的条件... * * 2:设置更新的内容的对象.. * */ Term term=new Term(fieldName,fieldValue); Document doc=ArticleToDocument.articleToDocument(article); /** * * 在lucene 里面是先删除符合这个条件term 的记录,在创建一个doc 记录... * */ indexWriter.updateDocument(term, doc); indexWriter.close(); } /** * 0,10 * 10,10 * 20,10 * @param keywords * @throws Exception */ public void findIndex(String keywords,int firstResult,int maxResult) throws Exception{ IndexSearcher indexSearcher=LuceneUtils.getIndexSearcher(); //第一个条件.. 单字段查询... 
// Query query=new TermQuery(new Term("title","梦想")) //select * from table where fieldname="" or content="" String fields []={"title","content"}; //第二种条件:使用查询解析器,多字段。。。 我们需要重新导入一个jar queryParser 的jar... 位置在lucene解压后的queryparser文件夹下 QueryParser queryParser=new MultiFieldQueryParser(LuceneUtils.getVersion(),fields,LuceneUtils.getAnalyzer()); // /这个事一个条件.. Query query=queryParser.parse(keywords); //query 它是一个查询条件,query 是一个抽象类,不同的查询规则构造部同的子类即可 //检索符合query 条件的前面N 条记录... //检索的是索引目录... (总记录数,socreDOC (docID)) //使用lucene 提供的api 进行操作... TopDocs topDocs=indexSearcher.search(query,firstResult+maxResult); // /存放的是docID ScoreDoc scoreDocs []=topDocs.scoreDocs; //判断:scoreDocs 的length (实际取出来的数量..) 与 firstResult+maxResult 的值取小值... //在java jdk 里面提供了一个api int endResult=Math.min(scoreDocs.length, firstResult+maxResult); for(int i=firstResult;i<endResult;i++){ // /取出来的是docID,这个id 是lucene 自己来维护。 int docID=scoreDocs[i].doc; Document document=indexSearcher.doc(docID); System.out.println("id==="+document.get("id")); System.out.println("title==="+document.get("title")); System.out.println("content==="+document.get("content")); System.out.println("url==="+document.get("url")); System.out.println("author==="+document.get("author")); } } }
测试类:
package cn.itcast.junit; import java.io.IOException; import org.junit.Test; import cn.itcast.bean.Article; import cn.itcast.dao.LuceneDao; /** * 测试luceneDao * @author Administrator * */ public class LuceneDaoTest { private LuceneDao luceneDao=new LuceneDao(); @Test public void testCreate() throws IOException{ for(int i=28;i<=28;i++){ Article article=new Article(); article.setId(i); article.setTitle("一定要梦想,万一实现了勒"); article.setContent("矫情我觉得这句话太矫情了矫情矫情矫情矫情矫情矫情"); article.setUrl("http://www.tianmao.com"); article.setAuthor("马云"); luceneDao.addIndex(article); } } @Test public void testsearcher() throws Exception{ // article.setTitle("一定要梦想,万一实现了勒"); textfield 分词 标准分词器 // article.setContent("我觉得这句话太矫情了"); textfield 分词 标准分词器 luceneDao.findIndex("梦想",20,10); } @Test public void testdelete() throws IOException{ String fieldName="title"; String fieldValue="定"; luceneDao.delIndex(fieldName, fieldValue); } @Test public void testUpdate() throws IOException{ String fieldName="title"; String fieldValue="定"; Article article=new Article(); article.setId(9527); article.setTitle("一定要梦想,万一实现了勒"); article.setContent("我觉得这句话太矫情了"); article.setUrl("http://www.tianmao.com"); article.setAuthor("马云"); luceneDao.updateIndex(fieldName, fieldValue, article); } }
分词器的流程图:
关于分词器,网上可以找到很多种类的分词器配合Lucene使用,相关分词规则查看对应说明。
举例如下:
Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);// splits Chinese text into single characters and English text into words on whitespace
Analyzer analyzer=new CJKAnalyzer(Version.LUCENE_44);// bigram tokenization: every two adjacent Chinese characters form one index term
Analyzer analyzer=new IKAnalyzer();// third-party analyzer with better Chinese support; allows custom dictionary words and stop words
索引库优化
package cn.itcast.lucene; import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LogDocMergePolicy; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import cn.itcast.uitls.Constants; public class TestOptimise { /*8 * 优化的第一种方式:通过 IndexWriterConfig 优化设置mergePolicy(合并策略) * * */ public void testoptimise() throws IOException{ Directory directory=FSDirectory.open(new File(Constants.URL)); Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_44, analyzer); LogDocMergePolicy mergePolicy=new LogDocMergePolicy(); /** * 当这个值越小,更少的内存会被运用当创建索引的时候,搜索的时候越快,创建的时候越慢。 * 当这个值越大,更多的内存会被运用当创建索引的时候,搜索的时候越慢,创建的时候越快.. * larger values >10 * * 2<=smaller<=10 * */ //设置合并因子.. mergePolicy.setMergeFactor(10); // /设置索引的合并策略.. config.setMergePolicy(mergePolicy); IndexWriter indexWriter=new IndexWriter(directory, config); } /** * 通过directory 去优化.... * @throws IOException * */ @Test public void testoptimise2() throws IOException{ //现在的索引放在硬盘上面... Directory directory=FSDirectory.open(new File(Constants.URL)); // /通过这个对象吧directory 里面的数据读取到directory1 里面来.. IOContext ioContext=new IOContext(); //相办法吧directory 的索引读取到内存当中来... 
Directory directory1=new RAMDirectory(directory,ioContext); IndexReader indexReader=DirectoryReader.open(directory1); IndexSearcher indexSearcher=new IndexSearcher(indexReader); Query query=new TermQuery(new Term("title", "想")); TopDocs topDocs=indexSearcher.search(query, 100); System.out.println(topDocs.totalHits); } /** * 索引文件越大,会影响检索的速度.. (减少索引文件的大小) * * 1:排除停用词.. * */ public void testoptimise3(){ } /** * 将索引分目盘存放 将数据归类... * */ public void testoptimise4(){ } }