-
总述 IndexSearch全过程源码分析
- 1.总述 IndexSearch全过程源码分析--->生成weight树
- IndexSearcher ---> search(createWeight(query), filter, n, sort)
- |--createWeight(query) |实际为生成Weight树
- |--return query.weight(this);
-
- 2.创建weight树总过程
- query.weight(this)
- |--Query query = searcher.rewrite(this); |重新解析Query,将Query生成为单个可供直接查询的Query
- |--Weight weight = query.createWeight(searcher);
- |--float sum = weight.sumOfSquaredWeights(); |计算sum分值
- |--float norm = getSimilarity(searcher).queryNorm(sum); |获取标准因子
- |--weight.normalize(norm); |标准化
- |--return weight; |返回weight权值树
-
- 3.重写Query对象,生成Query树
- IndexSearcher.rewrite(Query original) |重写Query对象,主要实现拆分
- |--for (Query rewrittenQuery = query.rewrite(reader); rewrittenQuery != query; rewrittenQuery = query.rewrite(reader)) |重写直至不能再拆分
- |--query = rewrittenQuery;
- |--eg1:BooleanQuery.rewrite(reader) 实现
- |--for (int i = 0 ; i < clauses.size(); i++)
- |--Query query = c.getQuery().rewrite(reader); |重写query对象,重复写的过程,最后都生成BooleanQuery对象
- |--clone.clauses.set(i, new BooleanClause(query, c.getOccur())); |合成新的BooleanQuery对象
- |--eg2:MultiTermQuery.rewrite(reader) 实现
- |--rewriteMethod.rewrite(reader, this);
- |--ConstantScoreFilterRewrite.rewrite(reader) |将所有Term当成一个Term处理
- |--Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
- |--result.setBoost(query.getBoost());
- |--ScoringBooleanQueryRewrite.rewrite(reader) |将Term逐个分离为BooleanQuery子句,风险在于Term数超过BooleanQuery最大子句数(默认1024)时会抛出TooManyClauses异常
- |--ConstantScoreBooleanQueryRewrite
- |--result.add(tq, BooleanClause.Occur.SHOULD); |逐一分离Term,生成Boolean查询
-
- |--ConstantScoreAutoRewrite.rewrite(reader) |结合上述二者,自动选择,以term < 350 为界,进行选择
- |--FilteredTermEnum enumerator = query.getEnum(reader); |根据需要改变的query生成枚举器
- |--Term t = enumerator.term(); |此时含IO操作?
- |--Iterator it = pendingTerms.iterator();
- |--BooleanQuery bq = new BooleanQuery(true);
- |--while(it.hasNext()) |逐个循环,生成term
- |--TermQuery tq = new TermQuery((Term) it.next());
- |--bq.add(tq, BooleanClause.Occur.SHOULD);
- |--Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
- |--result.setBoost(query.getBoost()); |设置分值
- |--query.incTotalNumberOfTerms(pendingTerms.size()); |增加Term总数
- |--return query; |返回最终生成的Query树
-
- 4.不同的Query查询,重写后生成新的Query
- |--eg2.1:PrefixQuery.getEnum(reader)
- |--return new PrefixTermEnum(reader, prefix); |返回FilterTermEnum的子类
- |--setEnum(reader.terms(new Term(prefix.field(), prefix.text())));
- |--if (term != null && termCompare(term)) |比较前缀
- |--currentTerm = term;
- |--else next()
- |--if (actualEnum.next()) |取下一个term,判断其是否仍以指定前缀(prefix)开头
- |-- Term term = actualEnum.term();
- |-- if (termCompare(term)) {
- |--currentTerm = term;
- |--eg2.2:FuzzyQuery.getEnum(reader)
- |--return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
- |--this.text = searchTerm.text().substring(realPrefixLength); |获取前缀及text文本内容
- |--this.prefix = searchTerm.text().substring(0, realPrefixLength);
- |--initializeMaxDistances(); |计算初始最大距离
- |--setEnum(reader.terms(new Term(searchTerm.field(), prefix))); |将枚举器定位到以prefix开头的第一个term(相似度在下面的termCompare中计算)
- |--termCompare(Term term)
- |--final String target = term.text().substring(prefix.length());
- |--this.similarity = similarity(target);
- |--return (similarity > minimumSimilarity);
-
-
- 5.weight.sumOfSquaredWeights() |--计算权重平方和
- |--BooleanWeight |计算后出现二种情况,Boolean及单个weight树,以BooleanWeight为准进行分析
- |--for (int i = 0 ; i < weights.size(); i++) |对每个子Weight逐一进行计算
- |--float s = w.sumOfSquaredWeights()
- |--sum += s;
- |--sum *= getBoost() * getBoost();
-
- |--TermWeight |以TermWeight为例
- |--queryWeight = idf * getBoost();
- |--return queryWeight * queryWeight; |求平方
-
-
- 6.float norm = getSimilarity(searcher).queryNorm(sum); |计算标准因子,默认为DefaultSimilarity
- |-- return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
-
-
- 7.weight.normalize(norm); |标准化norm因子,以BooleanWeight为例
- |--norm *= getBoost();
- |-- for (Iterator iter = weights.iterator(); iter.hasNext();)
- |--w.normalize(norm); |逐个标准化
-
- |--TermWeight.normalize(norm) |以TermWeight为例
- |--queryWeight *= queryNorm;
- |--value = queryWeight * idf
- |--实际值value = (idf * getBoost()) * queryNorm * idf; |即 idf² × boost × queryNorm
-
相关阅读:
Aizu 0033
Aizu 0118
【思维】贪心+细节——cf1361B
【思维】构造+凸包+向量叉积——LEETCODE 游乐园的迷宫
【思维】三元环计数+鸽笼定理/贪心——LEETCODE 游乐园的游览计划 好题
dp+线性筛——LEETCODE切分数组
【经典】带障碍的铺砖块——LEETCODE 覆盖
【思维】树形dp+构造——leetcode二叉树任务调度
【思维】状压dp—— 2020 联想杯 M
【思维】建图+排列组合+预处理+最短路—— 2020 联想杯 E
-
原文地址:https://www.cnblogs.com/l1pe1/p/2397442.html
Copyright © 2020-2023
润新知