• Lucene之删除索引


    1.前言

    之前的博客《Lucene全文检索之HelloWorld》已经简单介绍了Lucene的索引生成和检索。本文着重介绍Lucene的索引删除。

    2.应用场景

    索引建立完成后,由于某些原因,被索引的文件可能已经被删除。此时,索引仍然存在。为了不产生“虚假检索结果”,需要将这些失效的索引删除。

    3.HelloLucene类(重点关注deleteIndexByQuery方法)

    package com.njupt.zhb;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.LongField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    /*
     *@author: ZhengHaibo  
     *web:     http://blog.csdn.net/nuptboyzhb
     *mail:    zhb931706659@126.com
     *2013-08-27  Nanjing,njupt,China
     */
    /**
     * Minimal Lucene 4.4 example: indexes the files under a directory, searches
     * the index by file contents, and deletes indexed documents that match a query.
     *
     * <p>All three entry points report {@link IOException} / {@link ParseException}
     * by printing the stack trace instead of propagating it, so callers never see a
     * checked exception. Every Lucene resource opened by a method is closed in a
     * {@code finally} block so that a failure does not leave the index locked.
     */
    public class HelloLucene {
    	/** Lucene compatibility version used for analyzers, writer configs and query parsers. */
    	private static final Version LUCENE_VERSION = Version.LUCENE_44;
    	/** Untokenized field storing the file path; also the key used to replace a document. */
    	private static final String FIELD_PATH = "path";
    	/** Untokenized field storing the bare file name. */
    	private static final String FIELD_FILENAME = "filename";
    	/** Numeric field storing the file's last-modified time in milliseconds. */
    	private static final String FIELD_MODIFIED = "modified";
    	/** Tokenized (not stored) field built from the file contents; the field that is searched. */
    	private static final String FIELD_CONTENTS = "contents";
    	/** Maximum number of hits printed by {@link #searcher(String, String)}. */
    	private static final int MAX_HITS = 10;

    	/**
    	 * Indexes all readable files under a directory.
    	 *
    	 * @param indexPath directory on disk where the index is stored
    	 * @param docsPath  file or directory whose contents are to be indexed
    	 */
    	public void index(String indexPath,String docsPath) {
    		try {
    			// 1. Open the on-disk Directory that holds the index.
    			Directory dir = FSDirectory.open(new File(indexPath));
    			try {
    				// 2. Create the IndexWriter in create-or-append mode.
    				Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
    				IndexWriterConfig iwc = new IndexWriterConfig(LUCENE_VERSION, analyzer);
    				iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    				IndexWriter writer = new IndexWriter(dir, iwc);
    				try {
    					indexDocs(writer, new File(docsPath));
    				} finally {
    					// Always close the writer, even when indexing fails part-way,
    					// so the index write lock is released.
    					writer.close();
    				}
    			} finally {
    				dir.close();
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    	}

    	/**
    	 * Recursively adds {@code file} (or, for a directory, every file below it)
    	 * to the index. Unreadable files and files that cannot be opened are skipped.
    	 *
    	 * @param writer open writer that receives the documents
    	 * @param file   file or directory to index
    	 * @throws IOException if reading a file or writing to the index fails
    	 */
    	public void indexDocs(IndexWriter writer, File file) throws IOException {
    		if (!file.canRead()) {
    			return;
    		}
    		if (file.isDirectory()) {
    			// Directory: walk every entry. list() returns null on an I/O error.
    			String[] files = file.list();
    			if (files != null) {
    				for (int i = 0; i < files.length; i++) {
    					indexDocs(writer, new File(file, files[i]));
    				}
    			}
    			return;
    		}

    		// Regular file.
    		FileInputStream fis;
    		try {
    			fis = new FileInputStream(file);
    		} catch (FileNotFoundException fnfe) {
    			// The file vanished or cannot be opened: skip it silently.
    			return;
    		}
    		try {
    			// 3. Create the Document.
    			Document doc = new Document();
    			// 4. Add the fields.
    			// "path": indexed as a single untokenized term and stored.
    			doc.add(new StringField(FIELD_PATH, file.getPath(), Field.Store.YES));
    			// "filename": indexed as a single untokenized term and stored.
    			doc.add(new StringField(FIELD_FILENAME, file.getName(), Field.Store.YES));
    			// "modified": last-modified time (milliseconds) as a stored numeric field.
    			doc.add(new LongField(FIELD_MODIFIED, file.lastModified(), Field.Store.YES));
    			// "contents": passed as a Reader, so the text is tokenized and indexed
    			// but not stored. The file is decoded as UTF-8.
    			doc.add(new TextField(FIELD_CONTENTS, new BufferedReader(new InputStreamReader(fis, "UTF-8"))));
    			if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
    				// Brand-new index: no older copy of this document can exist.
    				System.out.println("adding " + file);
    				writer.addDocument(doc);
    			} else {
    				// Existing index: replace any older document with the same path.
    				System.out.println("updating " + file);
    				writer.updateDocument(new Term(FIELD_PATH, file.getPath()), doc);
    			}
    		} finally {
    			fis.close();
    		}
    	}

    	/**
    	 * Searches the "contents" field and prints up to {@value #MAX_HITS} hits
    	 * (score, file name, path and modification time) to standard output.
    	 *
    	 * @param indexPath     directory on disk where the index is stored
    	 * @param searchKeyword query string, parsed with the classic {@link QueryParser}
    	 */
    	public void searcher(String indexPath,String searchKeyword){
    		try {
    			Directory dir = FSDirectory.open(new File(indexPath));
    			try {
    				IndexReader reader = DirectoryReader.open(dir);
    				try {
    					IndexSearcher searcher = new IndexSearcher(reader);
    					Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
    					QueryParser parser = new QueryParser(LUCENE_VERSION, FIELD_CONTENTS, analyzer);
    					Query query = parser.parse(searchKeyword);
    					TopDocs tds = searcher.search(query, MAX_HITS);
    					ScoreDoc[] sds = tds.scoreDocs;
    					for (ScoreDoc sd : sds) {
    						// Load the stored fields of each hit and print them.
    						Document document = searcher.doc(sd.doc);
    						System.out.println("score:"+sd.score+"--filename:"+document.get(FIELD_FILENAME)+
    								"--path:"+document.get(FIELD_PATH)+"--time"+document.get(FIELD_MODIFIED));
    					}
    				} finally {
    					// Close the reader even when parsing or searching fails.
    					reader.close();
    				}
    			} finally {
    				dir.close();
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    		} catch (ParseException e) {
    			e.printStackTrace();
    		}
    	}

    	/**
    	 * Deletes every indexed document whose "contents" field matches the given query.
    	 *
    	 * @param indexPath     directory on disk where the index is stored
    	 * @param deleteKeyword query string; documents matching it are deleted
    	 */
    	public void deleteIndexByQuery(String indexPath,String deleteKeyword){
    		try {
    			// 1. Build the Query first, so that an unparsable keyword fails
    			//    before any writer (and its write lock) has been opened.
    			Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
    			QueryParser parser = new QueryParser(LUCENE_VERSION, FIELD_CONTENTS, analyzer);
    			Query query = parser.parse(deleteKeyword);

    			// 2. Open an IndexWriter on the existing index.
    			Directory dir = FSDirectory.open(new File(indexPath));
    			try {
    				IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(LUCENE_VERSION, analyzer));
    				try {
    					// 3. Delete by query, then commit to make the deletion effective.
    					//    (Deleting by exact term would instead use
    					//    writer.deleteDocuments(new Term(field, value)).)
    					writer.deleteDocuments(query);
    					writer.commit();
    				} finally {
    					writer.close();
    				}
    			} finally {
    				dir.close();
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    		} catch (ParseException e) {
    			e.printStackTrace();
    		}
    	}
    }
    


    4.编写Junit测试

    package com.njupt.zhb;
    
    import org.junit.Test;
    /*
     *@author: ZhengHaibo  
     *web:     http://blog.csdn.net/nuptboyzhb
     *mail:    zhb931706659@126.com
     *2013-08-25  Nanjing,njupt,China
     */
    public class TestJunit {
       @Test
       public void TestIndex(){
    	   HelloLucene hLucene=new HelloLucene();
    	   hLucene.index("index", "D:\lucene");
       }
       @Test
       public void TestSearcher(){
    	   HelloLucene hLucene=new HelloLucene();
    	   hLucene.searcher("index","南京");
       }
       @Test
       public void TestDeleteIndexByQuery(){
    	   HelloLucene hLucene=new HelloLucene();
    	   System.out.println("未删除前,查询关键字:北京  --结果:");
    	   hLucene.searcher("index","北京");
    	   hLucene.deleteIndexByQuery("index", "北京");
    	   System.out.println("删除后,查询关键字:北京  --结果:");
    	   hLucene.searcher("index","北京");
       }
    }
    



    5.实验结果

    5.1运行TestIndex方法

    >控制台打印的信息

    updating D:\lucene\lucene1.txt
    updating D:\lucene\lucene2.txt
    updating D:\lucene\lucene3.txt
    updating D:\lucene\北京.txt
    updating D:\lucene\南京.txt


    此时的index目录下的截图:


    5.2运行TestSearcher方法

    >搜索含有关键字“南京”的文档

    score:0.53033006--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375
    score:0.48666292--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791
    score:0.2155931--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795
    score:0.1530931--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486


    5.3运行TestDeleteIndexByQuery方法

    >

    未删除前,查询关键字:北京  --结果:
    score:0.4847152--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791
    score:0.39226472--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795
    score:0.10348864--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375
    score:0.029874597--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486
    删除后,查询关键字:北京  --结果:
    

    删除后,再次查询关键字时,无查询结果。

    此时,index目录下的文件结构为:


    多出了一个_0_1.del文件。该文件用于记录被标记为“已删除”的文档:删除操作提交后,这些文档不会再出现在检索结果中,但其数据要等到段(segment)合并时才会从索引文件中真正清除。

    项目源代码:http://download.csdn.net/detail/nuptboyzhb/6041239

    未经允许,不得用于商业目的

  • 相关阅读:
    PAT甲级——A1091 Acute Stroke【30】
    PAT甲级——A1090 Highest Price in Supply Chain
    PAT甲级——A1089 Insert or Merge
    PAT甲级——A1088 Rational Arithmetic
    PAT甲级——A1087 All Roads Lead to Rome【30】
    【php中的curl】php中curl的详细解说
    【php中的curl】使用curl完成POST数据给飞信接口
    【php中的curl】php中curl的使用
    【socket】php实现socket
    【socket】用PHP的socket实现客户端到服务端的通信
  • 原文地址:https://www.cnblogs.com/pangblog/p/3293624.html
Copyright © 2020-2023  润新知