2020import java .util .ArrayList ;
2121import java .util .List ;
2222import java .util .Objects ;
23+ import org .apache .lucene .index .AutomatonTermsEnum ;
24+ import org .apache .lucene .index .FilteredTermsEnum ;
2325import org .apache .lucene .index .LeafReaderContext ;
2426import org .apache .lucene .index .Term ;
2527import org .apache .lucene .index .TermState ;
2931import org .apache .lucene .util .Accountable ;
3032import org .apache .lucene .util .BytesRef ;
3133import org .apache .lucene .util .RamUsageEstimator ;
34+ import org .apache .lucene .util .automaton .CompiledAutomaton ;
3235
3336/**
3437 * Contains functionality common to both {@link MultiTermQueryConstantScoreBlendedWrapper} and
@@ -42,6 +45,12 @@ abstract class AbstractMultiTermQueryConstantScoreWrapper<Q extends MultiTermQue
4245 // mtq that matches 16 terms or fewer will be executed as a regular disjunction
4346 static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16 ;
4447
48+ // Maximum number of underlying TermsEnum operations allowed when probing a query with an
49+ // unknown term count during scorerSupplier construction. Prefix-like patterns need few
50+ // operations per match (the automaton seeks efficiently), whereas leading wildcards exhaust
51+ // the budget quickly and fall back to deferred collection.
52+ static final int AUTOMATON_TERM_COLLECT_VISIT_BUDGET = 256 ;
53+
4554 protected final Q query ;
4655
4756 protected AbstractMultiTermQueryConstantScoreWrapper (Q query ) {
@@ -228,62 +237,66 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
228237 final long cost ;
229238 final IOLongFunction <WeightOrDocIdSetIterator > weightOrIteratorSupplier ;
230239
231- // Only collect terms while building the ScorerSupplier when the query exposes a known,
232- // bounded term count (e.g. TermInSetQuery, getTermsCount() >= 0). There, collecting is
233- // cheap and lets us return a null supplier up-front so a parent BooleanQuery can
234- // short-circuit.
235- //
236- // For queries with an unknown term count (e.g. automaton queries: wildcard / regexp /
237- // prefix / range), collecting eagerly can scan the whole term dictionary during
238- // ScorerSupplier construction -- a leading wildcard such as "*foo*" cannot seek and must
239- // visit every term. That is supposed to be the cheap "planning" phase, and doing it there
240- // defeats a parent conjunction's ability to short-circuit (a sibling clause matching no
241- // documents can no longer skip this clause before the scan runs). So for an unknown term
242- // count we estimate the cost and defer term collection to ScorerSupplier#get().
243- if (q .getTermsCount () >= 0 ) {
244- List <TermAndState > collectedTerms = new ArrayList <>();
245- boolean collectResult = collectTerms (fieldDocCount , termsEnum , collectedTerms );
246- if (collectResult ) {
247- // Return a null supplier if no query terms were in the segment:
248- if (collectedTerms .isEmpty ()) {
249- return null ;
250- }
240+ // Try to eagerly collect matching terms. For queries with a known term count
241+ // (e.g. TermInSetQuery), we always collect eagerly. For queries with an unknown term
242+ // count (e.g. automaton queries: wildcard / regexp / prefix / range), we attempt a
243+ // budgeted probe: if the automaton can find all matching terms within a small number of
244+ // underlying TermsEnum operations, we use those results. Otherwise (probe exhausts its
245+ // budget, or no probe is possible), we estimate the cost and defer term collection to
246+ // ScorerSupplier#get() -- eagerly scanning the whole term dictionary during the
247+ // "planning" phase would defeat a parent conjunction's ability to short-circuit.
248+ List <TermAndState > eagerTerms = new ArrayList <>();
249+ TermsEnum deferredTermsEnum = termsEnum ;
250+ boolean eagerSuccess ;
251251
252- // TODO: Instead of replicating the cost logic of a BooleanQuery we could consider
253- // rewriting to a BQ eagerly at this point and delegating to its cost method (instead of
254- // lazily rewriting on #get). Not sure what the performance hit would be of doing this
255- // though.
256- long sumTermCost = 0 ;
257- for (TermAndState collectedTerm : collectedTerms ) {
258- sumTermCost += collectedTerm .docFreq ;
259- }
260- cost = sumTermCost ;
252+ if (q .getTermsCount () >= 0 ) {
253+ eagerSuccess = collectTerms (fieldDocCount , termsEnum , eagerTerms );
254+ } else {
255+ // Unknown term count. Try a cheap budgeted probe: if the automaton can find
256+ // all matching terms within a small number of underlying TermsEnum operations,
257+ // use those results eagerly. Otherwise, fall back to deferred collection.
258+ FilteredTermsEnum probeEnum = null ;
259+ if (termsEnum instanceof FilteredTermsEnum fte ) {
260+ probeEnum = fte ;
261+ } else if (q instanceof AutomatonQuery aq
262+ && aq .getCompiled ().type == CompiledAutomaton .AUTOMATON_TYPE .NORMAL ) {
263+ probeEnum = new AutomatonTermsEnum (terms .iterator (), aq .getCompiled ());
264+ }
265+ if (probeEnum != null ) {
266+ probeEnum .setVisitsBudget (AUTOMATON_TERM_COLLECT_VISIT_BUDGET );
267+ boolean probeResult = collectTerms (fieldDocCount , probeEnum , eagerTerms );
268+ eagerSuccess = probeResult && !probeEnum .isVisitsBudgetExhausted ();
261269 } else {
262- cost = estimateCost ( terms , q . getTermsCount ()) ;
270+ eagerSuccess = false ;
263271 }
264- weightOrIteratorSupplier =
265- leadCost -> {
266- if (collectResult ) {
267- return rewriteAsBooleanQuery (context , collectedTerms );
268- } else {
269- // Too many terms to rewrite as a simple bq.
270- // Invoke rewriteInner logic to handle rewriting:
271- return rewriteInner (
272- context , fieldDocCount , terms , termsEnum , collectedTerms , leadCost );
273- }
274- };
272+ if (!eagerSuccess ) {
273+ deferredTermsEnum = (probeEnum == termsEnum ) ? q .getTermsEnum (terms ) : termsEnum ;
274+ eagerTerms = new ArrayList <>();
275+ }
276+ }
277+
278+ if (eagerSuccess ) {
279+ if (eagerTerms .isEmpty ()) {
280+ return null ;
281+ }
282+ long sumTermCost = 0 ;
283+ for (TermAndState collectedTerm : eagerTerms ) {
284+ sumTermCost += collectedTerm .docFreq ;
285+ }
286+ cost = sumTermCost ;
287+ final List <TermAndState > finalTerms = eagerTerms ;
288+ weightOrIteratorSupplier = _ -> rewriteAsBooleanQuery (context , finalTerms );
275289 } else {
276290 cost = estimateCost (terms , q .getTermsCount ());
291+ final TermsEnum finalDeferredEnum = deferredTermsEnum ;
292+ final List <TermAndState > partialTerms = eagerTerms ;
277293 weightOrIteratorSupplier =
278294 leadCost -> {
279- List <TermAndState > collectedTerms = new ArrayList <>();
280- if (collectTerms (fieldDocCount , termsEnum , collectedTerms )) {
281- return rewriteAsBooleanQuery (context , collectedTerms );
295+ if (collectTerms (fieldDocCount , finalDeferredEnum , partialTerms )) {
296+ return rewriteAsBooleanQuery (context , partialTerms );
282297 } else {
283- // Too many terms to rewrite as a simple bq.
284- // Invoke rewriteInner logic to handle rewriting:
285298 return rewriteInner (
286- context , fieldDocCount , terms , termsEnum , collectedTerms , leadCost );
299+ context , fieldDocCount , terms , finalDeferredEnum , partialTerms , leadCost );
287300 }
288301 };
289302 }
0 commit comments