Limit term dictionary traversal in multi-term queries

msfroh · claude · msfroh · commit a375f4ce45af · 2026-06-10T16:11:02.000-07:00
This change builds on the previous commit (#16222) by adding a "visit budget" on FilteredTermsEnum. We can use that to build a ScorerSupplier with more accurate cost if it can be done cheaply (e.g. prefix / regex queries with a prefix). If the terms enum is expensive (e.g. leading wildcard query), then the budget is exhausted and we give up. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -397,6 +397,8 @@ Optimizations
   clause matching no documents) before a non-seekable scan, such as a leading wildcard, runs.
   (Tianxiao Wei)
 
+* GITHUB#16240: Build on GITHUB#16222 by letting automaton queries perform a bounded amount of term dictionary traversal during planning, so that queries whose terms can be enumerated cheaply get an accurate cost. (Michael Froh)
+
 * GITHUB#16176: Restore WANDScorer for TOP_SCORES + minShouldMatch > 1. (Tianxiao Wei)
 
 * GITHUB#16153: Use TernaryLongHeap in UpdateGraphsUtils for faster HNSW graph merging. (Prithvi S)
diff --git a/lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java
@@ -34,6 +34,7 @@ public abstract class FilteredTermsEnum extends TermsEnum {
 
   private BytesRef initialSeekTerm;
   private boolean doSeek;
+  private int visitsBudget = Integer.MAX_VALUE;
 
   /** Which term the enum is currently positioned to. */
   protected BytesRef actualTerm;
@@ -90,6 +91,26 @@ protected FilteredTermsEnum(final TermsEnum tenum, final boolean startWithSeek)
     doSeek = startWithSeek;
   }
 
+  /**
+   * Caps how many operations on the underlying {@link TermsEnum} ({@code seekCeil} and {@code
+   * next}) {@link #next()} is allowed to issue. Once the cap is used up, {@code next()} returns
+   * {@code null}, exactly as it does when no matching terms remain; callers that need to tell the
+   * two cases apart should consult {@link #isVisitsBudgetExhausted()}.
+   *
+   * <p>Unless this method is called, the budget is {@link Integer#MAX_VALUE}.
+   */
+  public void setVisitsBudget(int budget) {
+    visitsBudget = budget;
+  }
+
+  /**
+   * Returns {@code true} only if {@link #next()} has returned {@code null} because the visits
+   * budget ran out; spending the last permitted visit does not by itself count as exhaustion.
+   */
+  public boolean isVisitsBudgetExhausted() {
+    return visitsBudget < 0; // next() gives up only once its pre-decrement drops below zero
+  }
+
   /**
    * Use this method to set the initial {@link BytesRef} to seek before iterating. This is a
    * convenience method for subclasses that do not override {@link #nextSeekTerm}. If the initial
@@ -228,19 +249,20 @@ public BytesRef next() throws IOException {
       if (doSeek) {
         doSeek = false;
         final BytesRef t = nextSeekTerm(actualTerm);
-        // System.out.println("  seek to t=" + (t == null ? "null" : t.utf8ToString()) + " tenum=" +
-        // tenum);
         // Make sure we always seek forward:
         assert actualTerm == null || t == null || t.compareTo(actualTerm) > 0
             : "curTerm=" + actualTerm + " seekTerm=" + t;
-        if (t == null || tenum.seekCeil(t) == SeekStatus.END) {
-          // no more terms to seek to or enum exhausted
-          // System.out.println("  return null");
+        if (t == null) {
+          return null;
+        }
+        if (--visitsBudget < 0 || tenum.seekCeil(t) == SeekStatus.END) {
           return null;
         }
         actualTerm = tenum.term();
-        // System.out.println("  got term=" + actualTerm.utf8ToString());
       } else {
+        if (--visitsBudget < 0) {
+          return null;
+        }
         actualTerm = tenum.next();
         if (actualTerm == null) {
           // enum exhausted
diff --git a/lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java
@@ -20,6 +20,8 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import org.apache.lucene.index.AutomatonTermsEnum;
+import org.apache.lucene.index.FilteredTermsEnum;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermState;
@@ -29,6 +31,7 @@
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 
 /**
  * Contains functionality common to both {@link MultiTermQueryConstantScoreBlendedWrapper} and
@@ -42,6 +45,12 @@ abstract class AbstractMultiTermQueryConstantScoreWrapper<Q extends MultiTermQue
   // mtq that matches 16 terms or less will be executed as a regular disjunction
   static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
 
+  // Budget for underlying TermsEnum operations when probing an unknown-count query
+  // during scorerSupplier construction. Prefix-like patterns use few operations per
+  // match (the automaton seeks efficiently), while leading wildcards exhaust this
+  // quickly and fall back to deferred collection.
+  static final int AUTOMATON_TERM_COLLECT_VISIT_BUDGET = 256;
+
   protected final Q query;
 
   protected AbstractMultiTermQueryConstantScoreWrapper(Q query) {
@@ -228,62 +237,66 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
       final long cost;
       final IOLongFunction<WeightOrDocIdSetIterator> weightOrIteratorSupplier;
 
-      // Only collect terms while building the ScorerSupplier when the query exposes a known,
-      // bounded term count (e.g. TermInSetQuery, getTermsCount() >= 0). There, collecting is
-      // cheap and lets us return a null supplier up-front so a parent BooleanQuery can
-      // short-circuit.
-      //
-      // For queries with an unknown term count (e.g. automaton queries: wildcard / regexp /
-      // prefix / range), collecting eagerly can scan the whole term dictionary during
-      // ScorerSupplier construction -- a leading wildcard such as "*foo*" cannot seek and must
-      // visit every term. That is supposed to be the cheap "planning" phase, and doing it there
-      // defeats a parent conjunction's ability to short-circuit (a sibling clause matching no
-      // documents can no longer skip this clause before the scan runs). So for an unknown term
-      // count we estimate the cost and defer term collection to ScorerSupplier#get().
-      if (q.getTermsCount() >= 0) {
-        List<TermAndState> collectedTerms = new ArrayList<>();
-        boolean collectResult = collectTerms(fieldDocCount, termsEnum, collectedTerms);
-        if (collectResult) {
-          // Return a null supplier if no query terms were in the segment:
-          if (collectedTerms.isEmpty()) {
-            return null;
-          }
+      // Try to eagerly collect matching terms. For queries with a known term count
+      // (e.g. TermInSetQuery), we always collect eagerly. For queries with an unknown term
+      // count (e.g. automaton queries: wildcard / regexp / prefix / range), we attempt a
+      // budgeted probe: if the automaton can find all matching terms within a small number of
+      // underlying TermsEnum operations, we use those results. Otherwise (probe exhausts its
+      // budget, or no probe is possible), we estimate the cost and defer term collection to
+      // ScorerSupplier#get() -- eagerly scanning the whole term dictionary during the
+      // "planning" phase would defeat a parent conjunction's ability to short-circuit.
+      List<TermAndState> eagerTerms = new ArrayList<>();
+      TermsEnum deferredTermsEnum = termsEnum;
+      boolean eagerSuccess;
 
-          // TODO: Instead of replicating the cost logic of a BooleanQuery we could consider
-          // rewriting to a BQ eagerly at this point and delegating to its cost method (instead of
-          // lazily rewriting on #get). Not sure what the performance hit would be of doing this
-          // though.
-          long sumTermCost = 0;
-          for (TermAndState collectedTerm : collectedTerms) {
-            sumTermCost += collectedTerm.docFreq;
-          }
-          cost = sumTermCost;
+      if (q.getTermsCount() >= 0) {
+        eagerSuccess = collectTerms(fieldDocCount, termsEnum, eagerTerms);
+      } else {
+        // Unknown term count. Try a cheap budgeted probe: if the automaton can find
+        // all matching terms within a small number of underlying TermsEnum operations,
+        // use those results eagerly. Otherwise, fall back to deferred collection.
+        FilteredTermsEnum probeEnum = null;
+        if (termsEnum instanceof FilteredTermsEnum fte) {
+          probeEnum = fte;
+        } else if (q instanceof AutomatonQuery aq
+            && aq.getCompiled().type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
+          probeEnum = new AutomatonTermsEnum(terms.iterator(), aq.getCompiled());
+        }
+        if (probeEnum != null) {
+          probeEnum.setVisitsBudget(AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
+          boolean probeResult = collectTerms(fieldDocCount, probeEnum, eagerTerms);
+          eagerSuccess = probeResult && !probeEnum.isVisitsBudgetExhausted();
         } else {
-          cost = estimateCost(terms, q.getTermsCount());
+          eagerSuccess = false;
         }
-        weightOrIteratorSupplier =
-            leadCost -> {
-              if (collectResult) {
-                return rewriteAsBooleanQuery(context, collectedTerms);
-              } else {
-                // Too many terms to rewrite as a simple bq.
-                // Invoke rewriteInner logic to handle rewriting:
-                return rewriteInner(
-                    context, fieldDocCount, terms, termsEnum, collectedTerms, leadCost);
-              }
-            };
+        if (!eagerSuccess) {
+          deferredTermsEnum = (probeEnum == termsEnum) ? q.getTermsEnum(terms) : termsEnum;
+          eagerTerms = new ArrayList<>();
+        }
+      }
+
+      if (eagerSuccess) {
+        if (eagerTerms.isEmpty()) {
+          return null;
+        }
+        long sumTermCost = 0;
+        for (TermAndState collectedTerm : eagerTerms) {
+          sumTermCost += collectedTerm.docFreq;
+        }
+        cost = sumTermCost;
+        final List<TermAndState> finalTerms = eagerTerms;
+        weightOrIteratorSupplier = _ -> rewriteAsBooleanQuery(context, finalTerms);
       } else {
         cost = estimateCost(terms, q.getTermsCount());
+        final TermsEnum finalDeferredEnum = deferredTermsEnum;
+        final List<TermAndState> partialTerms = eagerTerms;
         weightOrIteratorSupplier =
             leadCost -> {
-              List<TermAndState> collectedTerms = new ArrayList<>();
-              if (collectTerms(fieldDocCount, termsEnum, collectedTerms)) {
-                return rewriteAsBooleanQuery(context, collectedTerms);
+              if (collectTerms(fieldDocCount, finalDeferredEnum, partialTerms)) {
+                return rewriteAsBooleanQuery(context, partialTerms);
               } else {
-                // Too many terms to rewrite as a simple bq.
-                // Invoke rewriteInner logic to handle rewriting:
                 return rewriteInner(
-                    context, fieldDocCount, terms, termsEnum, collectedTerms, leadCost);
+                    context, fieldDocCount, terms, finalDeferredEnum, partialTerms, leadCost);
               }
             };
       }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java
@@ -94,7 +94,7 @@ public void test() throws Exception {
     // perform.
     assertTrue(
         "too many calls to IndexInput.clone during TermRangeQuery: " + queryCloneCount,
-        queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 7);
+        queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 10);
     r.close();
     dir.close();
   }
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java
@@ -445,10 +445,9 @@ public void testCostEstimate() throws IOException {
     Query rewritten = searcher.rewrite(query);
     Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
     ScorerSupplier supplier = weight.scorerSupplier(lrc);
-    // Automaton queries have an unknown term count, so term collection is deferred to get() and the
-    // cost is the worst-case estimate (sum of doc freqs across all terms) rather than the sum over
-    // the matching terms only.
-    assertEquals(3000, supplier.cost());
+    // "foo*" matches only 2 terms ("foo bar" and "foo wuzzle"), which can be found cheaply
+    // within the visit budget. So the cost is the accurate sum of their docFreqs (2 * 1000).
+    assertEquals(2000, supplier.cost());
 
     query = new WildcardQuery(new Term("body", "bar*"));
     rewritten = searcher.rewrite(query);
@@ -491,19 +490,52 @@ public void testScorerSupplierDoesNotScanTermsEagerly() throws IOException {
     termsEnumNextCalls.set(0);
     ScorerSupplier supplier = weight.scorerSupplier(lrc);
     assertNotNull(supplier);
-    assertEquals(
-        "scorerSupplier() must not scan the term dictionary for an automaton MultiTermQuery",
-        0,
-        termsEnumNextCalls.get());
-
-    // The scan is deferred to get(): building the scorer is where the terms are actually walked.
+    // The cheap probe runs during scorerSupplier() but is bounded by the visit budget.
+    // For a leading wildcard, the budget is exhausted quickly and term collection is deferred.
+    assertTrue(
+        "scorerSupplier() should use at most the visit budget for a leading wildcard",
+        termsEnumNextCalls.get()
+            <= AbstractMultiTermQueryConstantScoreWrapper.AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
+
+    // The full scan is deferred to get():
     assertNotNull(supplier.get(Long.MAX_VALUE));
     assertTrue("get() should scan the term dictionary", termsEnumNextCalls.get() > 0);
 
     reader.close();
     dir.close();
   }
 
+  // Checks that a prefix-like wildcard matching no terms at all is resolved while building the
+  // ScorerSupplier, so scorerSupplier() can report null up-front and callers can short-circuit.
+  public void testScorerSupplierResolvesSparsePrefixEagerly() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);
+    for (int termId = 0; termId < 1000; termId++) {
+      Document document = new Document();
+      document.add(newStringField("body", "term" + termId, Field.Store.NO));
+      indexWriter.addDocument(document);
+    }
+    indexWriter.flush();
+    indexWriter.forceMerge(1);
+    indexWriter.close();
+
+    DirectoryReader indexReader = DirectoryReader.open(directory);
+    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+    LeafReaderContext onlyLeaf = indexReader.leaves().get(0);
+
+    // Nothing in the index starts with "zzznomatch": the budgeted probe finds zero terms
+    // without running out of budget, which is reported as a null ScorerSupplier.
+    Query prefixLike = indexSearcher.rewrite(new WildcardQuery(new Term("body", "zzznomatch*")));
+    Weight prefixWeight =
+        prefixLike.createWeight(indexSearcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
+    assertNull(
+        "scorerSupplier() should return null when no terms match a prefix-like wildcard",
+        prefixWeight.scorerSupplier(onlyLeaf));
+
+    indexReader.close();
+    directory.close();
+  }
+
   private static TermsEnum nextCountingTermsEnum(TermsEnum in, AtomicInteger counter) {
     return new FilterLeafReader.FilterTermsEnum(in) {
       @Override

Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ public void test() throws Exception {`
`94`	`94`	`// perform.`
`95`	`95`	`assertTrue(`
`96`	`96`	`"too many calls to IndexInput.clone during TermRangeQuery: " + queryCloneCount,`
`97`		`- queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 7);`
	`97`	`+ queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 10);`
`98`	`98`	`r.close();`
`99`	`99`	`dir.close();`
`100`	`100`	`}`