• Lucene.Net实现pdf,doc,xls,ppt,htm,html等格式文件的检索


    代码如下,代码没有优化,仅实现功能
    该代码复制到程序中不能直接使用,需要下载文章最后的例子,取得其中的dll后才可以使用

    using System;
    using System.Configuration;
    using System.Data;
    using System.Linq;
    using System.Web;
    using System.Web.Security;
    using System.Web.UI;
    using System.Web.UI.HtmlControls;
    using System.Web.UI.WebControls;
    using System.Web.UI.WebControls.WebParts;
    using System.Xml.Linq;
    using System.Text;
    using System.IO;

    using Lucene.Net.Documents;
    using Lucene.Net.Index;
    using Lucene.Net.Search;
    using Lucene.Net.QueryParsers;
    using Lucene.Net.Analysis.Standard;

    using Lucene.Net.Analysis.Cn;


    using org.pdfbox.pdmodel;
    using org.pdfbox.util;

    using System.Text.RegularExpressions;

    public partial class _Default : System.Web.UI.Page
    {
        
    // Moment the current indexing run began; Button2_Click sets it and uses it
    // to report the elapsed time.
    public DateTime start = default(DateTime);

    // Delegate with the same shape as IndexDirectory. Nothing in the visible
    // code references it.
    delegate void AsyncIndexDirectoryCaller(IndexWriter writer, FileInfo file);

    // Searcher opened by the search button handler.
    IndexSearcher searcher = null;

        
    /// <summary>
    /// On the first (non-postback) request, pre-fills TextBox3 with the
    /// physical path that "doc" maps to on the server.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (IsPostBack)
        {
            return;
        }

        TextBox3.Text = Server.MapPath("doc");
    }


        
    #region 建立索引
        
    /// <summary>
    /// Rebuilds the index from scratch: walks the path typed into TextBox3,
    /// indexes every supported file, optimizes the index and reports the
    /// elapsed time in TextBox1. Any failure message is shown in TextBox4.
    /// </summary>
    protected void Button2_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index");  // directory the index is stored in
        string sourcePath = TextBox3.Text;                // directory (or file) to be indexed

        IndexWriter writer = null;
        try
        {
            // 'true' creates a new index, replacing any existing one.
            writer = new IndexWriter(indexStorePath, new ChineseAnalyzer(), true);
            start = DateTime.Now;

            IndexDirectory(writer, new FileInfo(sourcePath));
            writer.Optimize();
            writer.Close();
            writer = null;  // closed cleanly; nothing left for the cleanup path

            TimeSpan elapsed = DateTime.Now - start;
            TextBox1.Text = "提示:索引完成,共用时 " + elapsed.TotalSeconds + " 秒\n";
        }
        catch (Exception ex)
        {
            TextBox4.Text = ex.Message;
        }
        finally
        {
            // If indexing failed part-way the writer is still open and would keep
            // the index directory locked for later runs, so release it here.
            if (writer != null)
            {
                try
                {
                    writer.Close();
                }
                catch (Exception)
                {
                    // Best effort only: the original failure is already shown in TextBox4.
                }
            }
        }
    }

        
    /// <summary>
    /// Recursively walks <paramref name="file"/>. When the path is an existing
    /// directory every entry inside it is visited; otherwise the file is handed
    /// to IndexFile if its extension is one of the supported types.
    /// </summary>
    /// <param name="writer">Index writer that receives the documents.</param>
    /// <param name="file">File or directory to process.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (Directory.Exists(file.FullName))
        {
            // Directory: recurse into each child entry (files and sub-directories).
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry));
            }

            return;
        }

        switch (file.Extension.ToLower())
        {
            case ".txt":
            case ".htm":
            case ".html":
            case ".pdf":
            case ".doc":
            case ".rtf":
            case ".ppt":
            case ".xls":
                IndexFile(file, writer);
                break;
        }
    }

        
    /// <summary>
    /// Extracts the text of one file according to its extension and adds it to
    /// the index as a document with a stored, untokenized "filename" field and a
    /// tokenized "contents" field.
    /// </summary>
    /// <param name="file">File to index.</param>
    /// <param name="writer">Open index writer that receives the document.</param>
    /// <remarks>
    /// A <see cref="FileNotFoundException"/> is appended to TextBox4 and the file
    /// is skipped; any other exception propagates to the caller.
    /// </remarks>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        StreamReader plainTextReader = null;
        try
        {
            Document doc = new Document();
            doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));

            switch (file.Extension.ToLowerInvariant())
            {
                case ".pdf":
                    doc.Add(CreateContentsField(ExtractPdfText(file.FullName)));
                    break;

                case ".doc":
                case ".rtf":    // Word can open RTF files as well
                    doc.Add(CreateContentsField(ExtractWordText(file.FullName)));
                    break;

                case ".ppt":
                    doc.Add(CreateContentsField(ExtractPowerPointText(file.FullName)));
                    break;

                case ".xls":
                    doc.Add(CreateContentsField(ExtractExcelText(file.FullName)));
                    break;

                case ".htm":
                case ".html":
                    // Index the tag-stripped text rather than the raw markup.
                    doc.Add(CreateContentsField(
                        NoHTML(File.ReadAllText(file.FullName, System.Text.Encoding.Default))));
                    break;

                default:    // everything else is treated as plain text
                    plainTextReader = new StreamReader(file.FullName, System.Text.Encoding.Default);
                    doc.Add(new Field("contents", plainTextReader));
                    break;
            }

            writer.AddDocument(doc);
        }
        catch (FileNotFoundException fnfe)
        {
            TextBox4.Text = TextBox4.Text + fnfe.Message + "\n";
            return;
        }
        finally
        {
            // The reader is consumed while the document is added; make sure the
            // file handle is released even when adding the document fails.
            if (plainTextReader != null)
            {
                plainTextReader.Close();
            }
        }
    }

    // Builds the tokenized, non-stored "contents" field from already-extracted text.
    private static Field CreateContentsField(string text)
    {
        return new Field("contents", text ?? string.Empty, Field.Store.NO, Field.Index.TOKENIZED);
    }

    // Returns the text of a PDF file, closing the PDF document afterwards.
    private static string ExtractPdfText(string path)
    {
        PDDocument pdf = PDDocument.load(path);
        try
        {
            return new PDFTextStripper().getText(pdf);
        }
        finally
        {
            pdf.close();
        }
    }

    // Returns the whole-story text of a document opened through Word automation,
    // always closing the document and quitting Word.
    private static string ExtractWordText(string path)
    {
        object filePath = path;
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Word.ApplicationClass wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
        Microsoft.Office.Interop.Word.Document wordDoc = null;
        try
        {
            wordDoc = wordApp.Documents.Open(
                ref filePath, ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing);
            wordDoc.ActiveWindow.Selection.WholeStory();
            return wordDoc.ActiveWindow.Selection.Text.ToString();
        }
        finally
        {
            try
            {
                if (wordDoc != null)
                {
                    wordDoc.Close(ref missing, ref missing, ref missing);
                }
            }
            finally
            {
                // Quit even if Close failed so no Word process is left behind.
                wordApp.Quit(ref missing, ref missing, ref missing);
            }
        }
    }

    // Returns the text of every shape on every slide, separated by spaces,
    // always closing the presentation and quitting PowerPoint.
    private static string ExtractPowerPointText(string path)
    {
        PowerPoint.ApplicationClass pptApp = new PowerPoint.ApplicationClass();
        PowerPoint.Presentation presentation = null;
        try
        {
            presentation = pptApp.Presentations.Open(path,
                Microsoft.Office.Core.MsoTriState.msoTrue,
                Microsoft.Office.Core.MsoTriState.msoFalse,
                Microsoft.Office.Core.MsoTriState.msoFalse);

            StringBuilder text = new StringBuilder();
            foreach (PowerPoint.Slide slide in presentation.Slides)
            {
                foreach (PowerPoint.Shape shape in slide.Shapes)
                {
                    try
                    {
                        // The separator keeps words from adjacent shapes from fusing into one token.
                        text.Append(shape.TextFrame.TextRange.Text).Append(' ');
                    }
                    catch (Exception)
                    {
                        // Deliberate best effort: a shape whose text cannot be read is skipped.
                    }
                }
            }

            return text.ToString();
        }
        finally
        {
            try
            {
                if (presentation != null)
                {
                    presentation.Close();
                }
            }
            finally
            {
                pptApp.Quit();
            }
        }
    }

    // Returns the values of all used cells on all sheets, separated by spaces,
    // always closing the workbook and quitting Excel.
    private static string ExtractExcelText(string path)
    {
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application excelApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
        Microsoft.Office.Interop.Excel.Workbook workbook = null;
        try
        {
            workbook = excelApp.Workbooks._Open(path,
                missing, missing, missing, missing, missing, missing,
                missing, missing, missing, missing, missing, missing);

            StringBuilder text = new StringBuilder();
            int sheetCount = workbook.Sheets.Count;
            for (int sheetIndex = 1; sheetIndex <= sheetCount; sheetIndex++)    // Office collections are 1-based
            {
                Microsoft.Office.Interop.Excel.Worksheet sheet =
                    (Microsoft.Office.Interop.Excel.Worksheet)workbook.Sheets[sheetIndex];
                Microsoft.Office.Interop.Excel.Range used = sheet.UsedRange;

                int rowCount = used.Rows.Count;
                int columnCount = used.Columns.Count;

                for (int row = 1; row <= rowCount; row++)
                {
                    for (int column = 1; column <= columnCount; column++)
                    {
                        // Index relative to the used range: it does not necessarily start at A1,
                        // so sheet-level coordinates could miss cells.
                        object value = ((Microsoft.Office.Interop.Excel.Range)used.Cells[row, column]).Value2;
                        if (value != null)
                        {
                            text.Append(value).Append(' ');
                        }
                    }
                }
            }

            return text.ToString();
        }
        finally
        {
            try
            {
                if (workbook != null)
                {
                    workbook.Close(missing, missing, missing);
                }
            }
            finally
            {
                excelApp.Quit();
            }
        }
    }

        
    /// <summary>
    /// Reduces an HTML string to text: removes script blocks, tags and comment
    /// markers, decodes a handful of common entities, drops any remaining angle
    /// brackets and CRLF pairs, then HTML-encodes and trims the result.
    /// </summary>
    /// <param name="Htmlstring">HTML markup to clean; null is treated as empty.</param>
    /// <returns>The cleaned, HTML-encoded text (never null).</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks. Singleline lets '.' cross line breaks so that
        // multi-line scripts are removed whole instead of leaving their code behind.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove HTML tags, line-leading whitespace and comment markers.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode common named/numeric entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

        // Drop any other numeric entity.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // Strings are immutable: the results must be assigned back or the calls do nothing.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // HttpUtility performs the same encoding as Server.HtmlEncode but does not
        // need a current HTTP request, so this static helper also works outside one.
        Htmlstring = HttpUtility.HtmlEncode(Htmlstring).Trim();

        return Htmlstring;
    }
        
    #endregion

        
    #region 搜索
        
    /// <summary>
    /// Runs the query typed into TextBox2 against the "contents" field of the
    /// stored index and prints the hits. Failure messages are appended to TextBox4.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index");  // directory the index is stored in
        string keyword = TextBox2.Text;

        try
        {
            searcher = new IndexSearcher(indexStorePath);

            QueryParser parser = new QueryParser("contents", new ChineseAnalyzer());
            Query query = parser.Parse(keyword);

            Hits hits = searcher.Search(query);
            printResult(hits);
        }
        catch (Exception ex)
        {
            TextBox4.Text = TextBox4.Text + ex.Message;
        }
        finally
        {
            // Close the searcher on every path; previously a parse or search
            // failure skipped the close and left the index files open.
            if (searcher != null)
            {
                try
                {
                    searcher.Close();
                }
                catch (Exception closeEx)
                {
                    TextBox4.Text = TextBox4.Text + closeEx.Message;
                }

                searcher = null;
            }
        }
    }

        
    /// <summary>
    /// Writes the search hits to TextBox1, one line per hit with its stored
    /// "filename" value, or a "nothing found" message when there are no hits.
    /// A hit that cannot be loaded has its error message appended to TextBox4.
    /// </summary>
    /// <param name="h">Hits returned by the searcher.</param>
    void printResult(Hits h)
    {
        StringBuilder output = new StringBuilder();
        int hitCount = h.Length();

        if (hitCount == 0)
        {
            output.Append("对不起,没有搜索到你要的结果。\n");
        }
        else
        {
            for (int i = 0; i < hitCount; i++)
            {
                try
                {
                    Document doc = h.Doc(i);
                    output.Append("这是第")
                          .Append(i + 1)
                          .Append("个搜索结果,文件路径为: ")
                          .Append(doc.Get("filename"))
                          .Append("\n");
                }
                catch (Exception ex)
                {
                    // Keep going: one unreadable hit should not hide the others.
                    TextBox4.Text = TextBox4.Text + ex.Message;
                }
            }
        }

        output.Append("---------------------------\n");
        TextBox1.Text = output.ToString();
    }

        
    #endregion

    }


    完整demo下载,点击下载

  • 相关阅读:
    表模块模式与事务脚本模式的代码编写
    解决方案下显示的网站名称被追加编号的问题解决方法
    应用层代码
    关于CodeReview(java)(转)
    关于事务的几个概念介绍(转)
    关于JVM的ClassLoader(转)
    svn相关
    .subversion
    linux用户与组的管理(命令加入、手动加入、加入组、用户之间的切换)
    回调函数
  • 原文地址:https://www.cnblogs.com/weekzero/p/1217521.html
Copyright © 2020-2023  润新知