Build the index first, then query; queries come back fast.
Indexing itself takes a while, but it's acceptable: a bit over a minute for 200 MB of text, and a little over 4 hours for 40 GB.
At first I used version 2.9 and chose to store the text in the index as well; the index took up more than twice the space of the original text.
I also found that an email address couldn't be matched unless the query included the @ and everything after it. That is probably down to the tokenizer used, and I couldn't figure out how to customize the delimiters.
Later I switched to version 4.8: the index is only slightly larger than the original text, and keywords without the @ part can now be matched too.
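To check what the analyzer actually emits, here is a small diagnostic sketch using the standard Lucene.NET 4.8 token-stream API (the address is a made-up example; the expected split is my reading of the UAX#29 rules that StandardTokenizer follows):

var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
using (TokenStream ts = analyzer.GetTokenStream("content", "zhaoshu0997@example.com"))
{
    var term = ts.AddAttribute<ICharTermAttribute>();   // Lucene.Net.Analysis.TokenAttributes
    ts.Reset();
    while (ts.IncrementToken())
        term.ToString().Dump();   // expected: "zhaoshu0997" and "example.com" as two separate tokens
    ts.End();
}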
But one problem remains: Chinese text in the search results comes back garbled, and querying in Chinese doesn't work either.
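My guess at the garbling (not verified): the source files are likely GBK/GB2312-encoded, while BuildIndex below reads them as UTF-8, so every Chinese character is mangled before it ever reaches the index. If that's the case, reading with the right codepage should fix both symptoms, since StandardAnalyzer in 4.8 indexes CJK text character by character:

// Assumption: the text dumps are GBK-encoded; adjust to the real encoding.
// (On .NET Core/5+, register CodePagesEncodingProvider first; on .NET Framework this just works.)
string[] contents = File.ReadAllLines(fpath, Encoding.GetEncoding("GBK"));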
Neither NLuke nor Luke manages to open the index files.
The IndexWriter constructor I had used was the overload whose third parameter is a bool: true means create the index if it doesn't exist and overwrite it if it does; false means fail if it doesn't exist and append if it does. Neither is convenient, because what we want is: create if it doesn't exist, append if it does. How do we get that behavior? Just use the overload that omits the parameter, and that's exactly what you get.
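In the 4.8 API used below, the same choice is spelled out on IndexWriterConfig.OpenMode, whose default, OpenMode.CREATE_OR_APPEND, is precisely create-if-missing, append-if-present (a sketch; analyzer and idxpath as in the code that follows):

var config = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)
{
    // the default; OpenMode.CREATE and OpenMode.APPEND force the other two behaviors
    OpenMode = OpenMode.CREATE_OR_APPEND
};
var writer = new IndexWriter(FSDirectory.Open(idxpath), config);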
// LINQPad "C# Program" query: .Dump() is LINQPad's extension method.
// Required namespaces (in LINQPad via Query Properties → Namespace Imports; in a
// regular project as using directives): System.Diagnostics, System.IO, System.Text,
// Lucene.Net.Analysis.Standard, Lucene.Net.Documents, Lucene.Net.Index,
// Lucene.Net.QueryParsers.Classic, Lucene.Net.Search, Lucene.Net.Store, Lucene.Net.Util
void Main()
{
    string idxpath = @"D:\data\DB\txt\index";
    string dir = @"D:\data\DB\txt\search\tianya";
    //!! How do we tokenize the @ so full addresses match? (see the analyzer sketch after this snippet)
    string keyword = "zhaoshu0997";
    Utils.FullSearch.FileHelper filehelper = new Utils.FullSearch.FileHelper(idxpath);
    //filehelper.BuildIndex(dir);   // uncomment to (re)build the index first
    Utils.FullSearch.SearchResults results = filehelper.Search(keyword);
    Console.WriteLine(keyword);
    results.Dump();
}
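On the `//!!` question in Main: one way to control the delimiters yourself is a custom CharTokenizer that treats @ (and the dots and dashes around it) as token characters. The EmailFriendlyAnalyzer below is my own sketch, not part of the original code; the first thing to try may actually be Lucene's built-in UAX29URLEmailAnalyzer, which exists precisely to keep URLs and email addresses whole.

using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

// Keeps letters, digits and the punctuation common in email addresses inside one token,
// so "zhaoshu0997@example.com" survives as a single term.
public sealed class EmailFriendlyAnalyzer : Analyzer
{
    private readonly LuceneVersion _version;
    public EmailFriendlyAnalyzer(LuceneVersion version) { _version = version; }

    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        => new TokenStreamComponents(new EmailTokenizer(_version, reader));

    private sealed class EmailTokenizer : CharTokenizer
    {
        public EmailTokenizer(LuceneVersion version, TextReader reader) : base(version, reader) { }

        // Every character returning false here acts as a delimiter.
        protected override bool IsTokenChar(int c)
            => char.IsLetterOrDigit((char)c) || c == '@' || c == '.' || c == '_' || c == '-';
    }
}

Whichever analyzer is chosen, the same one has to be handed to both the IndexWriter and the QueryParser, otherwise index-time and query-time tokens won't line up.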
namespace Utils.FullSearch
{
    public class SearchResults
    {
        public int TotalHits { get; set; }
        public List<Hit> SearchContents { get; set; }
    }

    public class Hit
    {
        public float Score { get; set; }
        public string Content { get; set; }
    }

    public class FileHelper
    {
        private const LuceneVersion MATCH_LUCENE_VERSION = LuceneVersion.LUCENE_48;
        private const string Field_Name = "content";
        private const int Results_Per_Page = 10;

        private StandardAnalyzer analyzer;
        private QueryParser queryParser;
        private string _indexPath;

        private StandardAnalyzer SetupAnalyzer() => new StandardAnalyzer(MATCH_LUCENE_VERSION);

        private QueryParser SetupQueryParser(StandardAnalyzer analyzer)
            => new QueryParser(MATCH_LUCENE_VERSION, Field_Name, analyzer);

        public FileHelper(string indexPath)
        {
            analyzer = SetupAnalyzer();
            queryParser = SetupQueryParser(analyzer);
            _indexPath = indexPath;
        }

        public void BuildIndex(string dir)
        {
            var watch = Stopwatch.StartNew();
            List<string> fpaths = FindFile(dir);

            // Default OpenMode is CREATE_OR_APPEND: create if missing, append if present.
            using (IndexWriter writer = new IndexWriter(
                FSDirectory.Open(_indexPath),
                new IndexWriterConfig(MATCH_LUCENE_VERSION, analyzer)))
            {
                foreach (string fpath in fpaths)
                {
                    // Each line of each file becomes one document.
                    string[] contents = File.ReadAllLines(fpath, Encoding.UTF8);
                    foreach (string content in contents)
                    {
                        Document doc = new Document
                        {
                            new TextField(Field_Name, content, Field.Store.YES)
                        };
                        writer.AddDocument(doc);
                    }
                    ($"index time for {fpath}: {watch.ElapsedMilliseconds / 1000.0} seconds").Dump();
                }
                writer.Flush(true, true);
                writer.Commit();
            }
            watch.Stop();
            ($"index time for {dir}: {watch.ElapsedMilliseconds / 1000.0} seconds").Dump();
        }

        public static List<string> FindFile(string sSourcePath)
        {
            List<string> list = new List<string>();
            DirectoryInfo theFolder = new DirectoryInfo(sSourcePath);

            // Files directly under the source folder.
            FileInfo[] theFileInfo = theFolder.GetFiles("*.*", SearchOption.TopDirectoryOnly);
            foreach (FileInfo nextFile in theFileInfo)
                list.Add(nextFile.FullName);

            // Files in all subfolders.
            DirectoryInfo[] dirInfo = theFolder.GetDirectories();
            foreach (DirectoryInfo nextFolder in dirInfo)
            {
                FileInfo[] fileInfo = nextFolder.GetFiles("*.*", SearchOption.AllDirectories);
                foreach (FileInfo nextFile in fileInfo)
                    list.Add(nextFile.FullName);
            }
            return list;
        }

        public SearchResults Search(string queryString)
        {
            var watch = Stopwatch.StartNew();
            Query query = queryParser.Parse(queryString);

            // Opening an IndexWriter takes the write.lock, so it must be disposed here,
            // otherwise the next BuildIndex/Search call fails.
            using (IndexWriter writer = new IndexWriter(
                FSDirectory.Open(_indexPath),
                new IndexWriterConfig(MATCH_LUCENE_VERSION, analyzer)))
            using (SearcherManager searchManager = new SearcherManager(writer, true, null))
            {
                searchManager.MaybeRefreshBlocking();
                IndexSearcher searcher = searchManager.Acquire();
                try
                {
                    TopDocs topDocs = searcher.Search(query, Results_Per_Page);
                    SearchResults searchResults = new SearchResults
                    {
                        TotalHits = topDocs.TotalHits,
                        SearchContents = new List<Hit>()
                    };
                    foreach (var result in topDocs.ScoreDocs)
                    {
                        Document document = searcher.Doc(result.Doc);
                        searchResults.SearchContents.Add(new Hit
                        {
                            Score = result.Score,
                            Content = document.GetField(Field_Name)?.GetStringValue()
                        });
                    }
                    ($"search time for {queryString}: {watch.ElapsedMilliseconds / 1000.0} seconds").Dump();
                    return searchResults;
                }
                finally
                {
                    searchManager.Release(searcher);
                }
            }
        }
    }
}
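One more design note on Search: it opens a full IndexWriter just to obtain a SearcherManager, which also takes the index's write.lock. For a pure read path, a DirectoryReader is enough and needs no writer at all (a sketch using the same constants as above):

using (DirectoryReader reader = DirectoryReader.Open(FSDirectory.Open(_indexPath)))
{
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs topDocs = searcher.Search(queryParser.Parse(queryString), Results_Per_Page);
    // ...hydrate SearchResults from topDocs.ScoreDocs exactly as in Search() above.
}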