• LuceneNet的使用


    先建立索引,再查询,速度很快。

    建立索引花费的时间比较多,但还可以接受:200M的文本需要1分多钟,40G的文本需要4个多小时。

    刚开始我用的是 2.9 版本,并选择将文本也保存在索引中,结果索引占据的空间是原先文本的2倍多。

    而且发现,如果查询邮箱地址时关键字不带 @ 及其后面的部分,就无法查询出来。这也许和所用的分词器有关,但我也不知道怎样才可以自定义分隔符。

    后来改用了4.8版本,索引的空间只比原先的文本大一点点,而且不带@的关键字也可以查询出来。

    但还是有个问题,查询出来的内容中文是乱码,也无法用中文查询。

    无论是 NLuke 还是 Luke , 都没法成功打开索引文件。

    IndexWriter 的构造函数有一个重载,其第三个参数为 bool:为 true 表示索引不存在就创建、存在就覆盖;为 false 表示不存在就出错、存在就追加。这并不方便,因为我们需要的是"不存在就创建、存在就追加"。怎样才能实现这个功能呢?改用不带这个 bool 参数的重载(即省略该参数),就实现这个功能了。
     
    // LINQPad entry point: runs one keyword search against an existing index
    // and dumps the hits.
    void Main()
    {
        // NOTE(review): both path literals look like their backslashes were lost
        // when the post was published - confirm the real locations before running.
        string idxpath = @"D:dataDB xtindex";
        string dir = @"D:dataDB xtsearch ianya";

        // TODO: how can the analyzer be made to split tokens on '@'?
        string keyword = "zhaoshu0997";

        var filehelper = new Utils.FullSearch.FileHelper(idxpath);

        // Uncomment to (re)build the index from the text files under `dir`.
        //filehelper.BuildIndex(dir);

        Utils.FullSearch.SearchResults results = filehelper.Search(keyword);

        Console.WriteLine(keyword);
        results.Dump();
    }

    namespace Utils.FullSearch
    {
        /// <summary>
        /// Outcome of a single query: the total number of matching documents and
        /// the top-scoring hits that were actually loaded.
        /// </summary>
        public class SearchResults
        {
            /// <summary>Total number of documents matching the query.</summary>
            public int TotalHits { get; set; }

            /// <summary>The loaded hits, in the order returned by the searcher.</summary>
            public List<Hit> SearchContents { get; set; }
        }

        /// <summary>One matching document: its relevance score and stored text.</summary>
        public class Hit
        {
            /// <summary>Score assigned to the document by the searcher.</summary>
            public float Score { get; set; }

            /// <summary>Stored value of the content field, or null when the field is absent.</summary>
            public string Content { get; set; }
        }

        /// <summary>
        /// Builds a Lucene index in which every line of every text file is one
        /// document, and runs keyword queries against that index.
        /// </summary>
        public class FileHelper
        {
            private const LuceneVersion MatchLuceneVersion = LuceneVersion.LUCENE_48;
            private const string FieldName = "content";
            private const int ResultsPerPage = 10;

            private readonly StandardAnalyzer _analyzer;
            private readonly QueryParser _queryParser;
            private readonly string _indexPath;

            /// <summary>Creates a helper bound to the index stored at <paramref name="indexPath"/>.</summary>
            /// <param name="indexPath">Directory that holds (or will hold) the index files.</param>
            public FileHelper(string indexPath)
            {
                _analyzer = new StandardAnalyzer(MatchLuceneVersion);
                _queryParser = new QueryParser(MatchLuceneVersion, FieldName, _analyzer);
                _indexPath = indexPath;
            }

            /// <summary>
            /// Indexes every file found under <paramref name="dir"/>, adding one
            /// document per text line. The writer is opened without an explicit
            /// open mode, so an existing index is appended to and a missing one is
            /// created.
            /// </summary>
            /// <param name="dir">Root directory whose files are indexed.</param>
            /// <param name="encoding">
            /// Encoding used to read the files; UTF-8 when omitted.
            /// NOTE(review): if the source files are not UTF-8 (e.g. GBK), the stored
            /// text will be garbled - pass the real encoding here.
            /// </param>
            public void BuildIndex(string dir, Encoding encoding = null)
            {
                var watch = Stopwatch.StartNew();
                Encoding sourceEncoding = encoding ?? Encoding.UTF8;
                List<string> fpaths = FindFile(dir);

                // `using` guarantees the writer (and with it the index write lock)
                // is released even when reading or indexing a file throws.
                using (FSDirectory indexDirectory = FSDirectory.Open(_indexPath))
                using (var writer = new IndexWriter(indexDirectory, new IndexWriterConfig(MatchLuceneVersion, _analyzer)))
                {
                    foreach (string fpath in fpaths)
                    {
                        // Stream the file line by line instead of loading it whole,
                        // so very large inputs do not have to fit in memory.
                        foreach (string content in File.ReadLines(fpath, sourceEncoding))
                        {
                            var doc = new Document
                            {
                                new TextField(FieldName, content, Field.Store.YES)
                            };
                            writer.AddDocument(doc);
                        }

                        // Elapsed time is cumulative since indexing started.
                        ($"index time for {fpath}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
                    }

                    writer.Commit();
                }

                watch.Stop();
                ($"index time for {dir}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
            }

            /// <summary>
            /// Collects the full paths of all files under <paramref name="sSourcePath"/>:
            /// first the files directly in the folder, then, for each immediate
            /// sub-folder, every file beneath it at any depth.
            /// </summary>
            /// <param name="sSourcePath">Root directory to scan.</param>
            /// <returns>Full paths of the files that were found.</returns>
            public static List<string> FindFile(string sSourcePath)
            {
                var list = new List<string>();
                var theFolder = new DirectoryInfo(sSourcePath);

                // Files located directly in the root folder.
                foreach (FileInfo nextFile in theFolder.GetFiles("*.*", SearchOption.TopDirectoryOnly))
                {
                    list.Add(nextFile.FullName);
                }

                // Files in each immediate sub-folder, searched recursively.
                foreach (DirectoryInfo nextFolder in theFolder.GetDirectories())
                {
                    foreach (FileInfo nextFile in nextFolder.GetFiles("*.*", SearchOption.AllDirectories))
                    {
                        list.Add(nextFile.FullName);
                    }
                }

                return list;
            }

            /// <summary>
            /// Parses <paramref name="queryString"/> against the content field and
            /// returns the best-scoring matches (at most ten).
            /// </summary>
            /// <remarks>
            /// The index is opened read-only and closed before returning, so a
            /// search neither takes the index write lock nor creates index files.
            /// When no index exists yet, an empty result is returned.
            /// </remarks>
            /// <param name="queryString">Query in Lucene query-parser syntax.</param>
            /// <returns>Total hit count plus the loaded top hits.</returns>
            public SearchResults Search(string queryString)
            {
                var watch = Stopwatch.StartNew();
                Query query = _queryParser.Parse(queryString);

                var searchResults = new SearchResults
                {
                    TotalHits = 0,
                    SearchContents = new List<Hit>()
                };

                using (FSDirectory indexDirectory = FSDirectory.Open(_indexPath))
                {
                    // Nothing has been indexed yet: report zero hits rather than
                    // failing or creating an empty index as a side effect.
                    if (DirectoryReader.IndexExists(indexDirectory))
                    {
                        using (DirectoryReader reader = DirectoryReader.Open(indexDirectory))
                        {
                            var searcher = new IndexSearcher(reader);
                            TopDocs topDocs = searcher.Search(query, ResultsPerPage);
                            searchResults.TotalHits = topDocs.TotalHits;

                            foreach (ScoreDoc result in topDocs.ScoreDocs)
                            {
                                Document document = searcher.Doc(result.Doc);
                                searchResults.SearchContents.Add(new Hit
                                {
                                    Score = result.Score,
                                    Content = document.GetField(FieldName)?.GetStringValue()
                                });
                            }
                        }
                    }
                }

                ($"search time for {queryString}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
                return searchResults;
            }
        }
    }

      

  • 相关阅读:
    【Java】+Lombok
    【Java】【陷阱】toString() 再也不用了
    【Excel】获取一列的有效行数
    【Excel】同一个表格 不同sheet表之间数据关联
    【Excel】+ 高亮显示当前单元格所在的行与列
    【Java】获取类中所有的方法
    捕获子线程中的异常
    Kafka documentation PDF
    Spring java8 LocalDatetime 格式化
    OneDrive同步文件夹不显示绿色对勾
  • 原文地址:https://www.cnblogs.com/sui84/p/12594735.html
Copyright © 2020-2023  润新知