Limit term dictionary traversal in multi-term queries

msfroh · claude · msfroh · commit da54feba2841 · 2026-06-10T14:55:20.000-07:00
This change builds on the previous commit (#16222) by adding a "visit budget" to FilteredTermsEnum. The budget lets us build a ScorerSupplier with a more accurate cost when the matching terms can be found cheaply (e.g. prefix queries, or regexp queries that start with a literal prefix). If the terms enum is expensive (e.g. a leading-wildcard query), the budget is exhausted and we give up on the eager probe, falling back to deferred term collection with the worst-case cost estimate. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java
@@ -34,6 +34,7 @@ public abstract class FilteredTermsEnum extends TermsEnum {
 
   private BytesRef initialSeekTerm;
   private boolean doSeek;
+  private int visitsBudget = Integer.MAX_VALUE;
 
   /** Which term the enum is currently positioned to. */
   protected BytesRef actualTerm;
@@ -90,6 +91,24 @@ protected FilteredTermsEnum(final TermsEnum tenum, final boolean startWithSeek)
     doSeek = startWithSeek;
   }
 
+  /**
+   * Caps how many operations on the wrapped {@link TermsEnum} ({@code seekCeil} and {@code next})
+   * calls to {@link #next()} may spend. Once the cap is used up, {@code next()} returns {@code
+   * null} just as it does at the end of the matching terms; {@link #isVisitsBudgetExhausted()}
+   * tells the two situations apart.
+   *
+   * <p>Unless changed, the budget is {@link Integer#MAX_VALUE} (effectively unlimited).
+   */
+  public void setVisitsBudget(int budget) {
+    visitsBudget = budget;
+  }
+
+  /** Returns {@code true} if a previous {@link #next()} call returned {@code null} due to the
+   * visits budget running out. A budget that was spent exactly (counter at 0) is not exhausted. */
+  public boolean isVisitsBudgetExhausted() {
+    return visitsBudget < 0;
+  }
+
   /**
    * Use this method to set the initial {@link BytesRef} to seek before iterating. This is a
    * convenience method for subclasses that do not override {@link #nextSeekTerm}. If the initial
@@ -228,19 +247,20 @@ public BytesRef next() throws IOException {
       if (doSeek) {
         doSeek = false;
         final BytesRef t = nextSeekTerm(actualTerm);
-        // System.out.println("  seek to t=" + (t == null ? "null" : t.utf8ToString()) + " tenum=" +
-        // tenum);
         // Make sure we always seek forward:
         assert actualTerm == null || t == null || t.compareTo(actualTerm) > 0
             : "curTerm=" + actualTerm + " seekTerm=" + t;
-        if (t == null || tenum.seekCeil(t) == SeekStatus.END) {
-          // no more terms to seek to or enum exhausted
-          // System.out.println("  return null");
+        if (t == null) {
+          return null;
+        }
+        if (--visitsBudget < 0 || tenum.seekCeil(t) == SeekStatus.END) {
           return null;
         }
         actualTerm = tenum.term();
-        // System.out.println("  got term=" + actualTerm.utf8ToString());
       } else {
+        if (--visitsBudget < 0) {
+          return null;
+        }
         actualTerm = tenum.next();
         if (actualTerm == null) {
           // enum exhausted
diff --git a/lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java
@@ -20,6 +20,8 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import org.apache.lucene.index.AutomatonTermsEnum;
+import org.apache.lucene.index.FilteredTermsEnum;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermState;
@@ -29,6 +31,7 @@
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 
 /**
  * Contains functionality common to both {@link MultiTermQueryConstantScoreBlendedWrapper} and
@@ -42,6 +45,12 @@ abstract class AbstractMultiTermQueryConstantScoreWrapper<Q extends MultiTermQue
   // mtq that matches 16 terms or less will be executed as a regular disjunction
   static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
 
+  // Maximum number of underlying TermsEnum operations (seekCeil / next calls) spent probing
+  // an unknown-count query while the ScorerSupplier is being built. Prefix-like patterns
+  // need few operations per match, while leading wildcards burn through the budget
+  // quickly, in which case term collection is deferred.
+  static final int AUTOMATON_TERM_COLLECT_VISIT_BUDGET = 256;
+
   protected final Q query;
 
   protected AbstractMultiTermQueryConstantScoreWrapper(Q query) {
@@ -273,19 +282,52 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
               }
             };
       } else {
-        cost = estimateCost(terms, q.getTermsCount());
-        weightOrIteratorSupplier =
-            leadCost -> {
-              List<TermAndState> collectedTerms = new ArrayList<>();
-              if (collectTerms(fieldDocCount, termsEnum, collectedTerms)) {
-                return rewriteAsBooleanQuery(context, collectedTerms);
-              } else {
-                // Too many terms to rewrite as a simple bq.
-                // Invoke rewriteInner logic to handle rewriting:
-                return rewriteInner(
-                    context, fieldDocCount, terms, termsEnum, collectedTerms, leadCost);
-              }
-            };
+        // Unknown term count. Try a cheap budgeted probe: if the automaton can find
+        // all matching terms within a small number of underlying TermsEnum operations,
+        // use those results eagerly. Otherwise, fall back to deferred collection.
+        boolean probeSucceeded = false;
+        List<TermAndState> probeTerms = null;
+        FilteredTermsEnum probeEnum = null;
+        if (termsEnum instanceof FilteredTermsEnum fte) {
+          probeEnum = fte;
+        } else if (q instanceof AutomatonQuery aq
+            && aq.getCompiled().type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
+          probeEnum = new AutomatonTermsEnum(terms.iterator(), aq.getCompiled());
+        }
+        if (probeEnum != null) {
+          probeEnum.setVisitsBudget(AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
+          probeTerms = new ArrayList<>();
+          boolean probeResult = collectTerms(fieldDocCount, probeEnum, probeTerms);
+          if (probeResult && !probeEnum.isVisitsBudgetExhausted()) {
+            probeSucceeded = true;
+          }
+        }
+        if (probeSucceeded) {
+          if (probeTerms.isEmpty()) {
+            return null;
+          }
+          long sumTermCost = 0;
+          for (TermAndState collectedTerm : probeTerms) {
+            sumTermCost += collectedTerm.docFreq;
+          }
+          cost = sumTermCost;
+          final List<TermAndState> finalProbeTerms = probeTerms;
+          weightOrIteratorSupplier =
+              leadCost -> rewriteAsBooleanQuery(context, finalProbeTerms);
+        } else {
+          final TermsEnum deferredTermsEnum = q.getTermsEnum(terms);
+          cost = estimateCost(terms, q.getTermsCount());
+          weightOrIteratorSupplier =
+              leadCost -> {
+                List<TermAndState> collectedTerms = new ArrayList<>();
+                if (collectTerms(fieldDocCount, deferredTermsEnum, collectedTerms)) {
+                  return rewriteAsBooleanQuery(context, collectedTerms);
+                } else {
+                  return rewriteInner(
+                      context, fieldDocCount, terms, deferredTermsEnum, collectedTerms, leadCost);
+                }
+              };
+        }
       }
 
       return new ScorerSupplier() {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java
@@ -445,10 +445,9 @@ public void testCostEstimate() throws IOException {
     Query rewritten = searcher.rewrite(query);
     Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
     ScorerSupplier supplier = weight.scorerSupplier(lrc);
-    // Automaton queries have an unknown term count, so term collection is deferred to get() and the
-    // cost is the worst-case estimate (sum of doc freqs across all terms) rather than the sum over
-    // the matching terms only.
-    assertEquals(3000, supplier.cost());
+    // "foo*" matches only 2 terms ("foo bar" and "foo wuzzle"), which can be found cheaply
+    // within the visit budget. So the cost is the accurate sum of their docFreqs (2 * 1000).
+    assertEquals(2000, supplier.cost());
 
     query = new WildcardQuery(new Term("body", "bar*"));
     rewritten = searcher.rewrite(query);
@@ -491,19 +490,53 @@ public void testScorerSupplierDoesNotScanTermsEagerly() throws IOException {
     termsEnumNextCalls.set(0);
     ScorerSupplier supplier = weight.scorerSupplier(lrc);
     assertNotNull(supplier);
-    assertEquals(
-        "scorerSupplier() must not scan the term dictionary for an automaton MultiTermQuery",
-        0,
-        termsEnumNextCalls.get());
-
-    // The scan is deferred to get(): building the scorer is where the terms are actually walked.
+    // The cheap probe runs during scorerSupplier() but is bounded by the visit budget.
+    // For a leading wildcard, the budget is exhausted quickly and term collection is deferred.
+    assertTrue(
+        "scorerSupplier() should use at most the visit budget for a leading wildcard",
+        termsEnumNextCalls.get()
+            <= AbstractMultiTermQueryConstantScoreWrapper
+                .AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
+
+    // The full scan is deferred to get():
     assertNotNull(supplier.get(Long.MAX_VALUE));
     assertTrue("get() should scan the term dictionary", termsEnumNextCalls.get() > 0);
 
     reader.close();
     dir.close();
   }
 
+  // Verifies that a prefix-like wildcard matching no terms is resolved eagerly during
+  // scorerSupplier(), so the supplier short-circuits to null instead of deferring the scan.
+  public void testScorerSupplierResolvesSparsePrefixEagerly() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    for (int i = 0; i < 1000; i++) {
+      Document doc = new Document();
+      doc.add(newStringField("body", "term" + i, Field.Store.NO));
+      writer.addDocument(doc);
+    }
+    writer.flush();
+    writer.forceMerge(1);
+    writer.close();
+
+    DirectoryReader reader = DirectoryReader.open(dir);
+    IndexSearcher searcher = new IndexSearcher(reader);
+    LeafReaderContext lrc = reader.leaves().get(0);
+
+    // Every indexed term is "term<i>", so nothing matches "zzznomatch*": the probe should
+    // find zero terms cheaply and scorerSupplier() should return null.
+    WildcardQuery query = new WildcardQuery(new Term("body", "zzznomatch*"));
+    Query rewritten = searcher.rewrite(query);
+    Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
+    assertNull(
+        "scorerSupplier() should return null when no terms match a prefix-like wildcard",
+        weight.scorerSupplier(lrc));
+
+    reader.close();
+    dir.close();
+  }
+
   private static TermsEnum nextCountingTermsEnum(TermsEnum in, AtomicInteger counter) {
     return new FilterLeafReader.FilterTermsEnum(in) {
       @Override