Skip to content

Commit 4b5f391

Browse files
msfrohclaude
andcommitted
Limit term dictionary traversal in multi-term queries
This change builds on the previous commit (#16222) by adding a "visit budget" to FilteredTermsEnum. The budget lets us build a ScorerSupplier with a more accurate cost when the matching terms can be found cheaply (e.g. prefix queries, or regex queries with a literal prefix). If the terms enum is expensive to walk (e.g. a leading-wildcard query), the budget is exhausted and we fall back to deferred term collection with the worst-case cost estimate. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e720207 commit 4b5f391

3 files changed

Lines changed: 124 additions & 29 deletions

File tree

lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ public abstract class FilteredTermsEnum extends TermsEnum {
3434

3535
private BytesRef initialSeekTerm;
3636
private boolean doSeek;
37+
private int visitsBudget = Integer.MAX_VALUE;
3738

3839
/** Which term the enum is currently positioned to. */
3940
protected BytesRef actualTerm;
@@ -90,6 +91,26 @@ protected FilteredTermsEnum(final TermsEnum tenum, final boolean startWithSeek)
9091
doSeek = startWithSeek;
9192
}
9293

94+
/**
95+
* Sets a budget on the number of underlying {@link TermsEnum} operations ({@code seekCeil} and
96+
* {@code next}) that {@link #next()} may perform. When the budget is exhausted, {@code next()}
97+
* returns {@code null} as if the enum were exhausted. Use {@link #isVisitsBudgetExhausted()} to
98+
* distinguish budget exhaustion from a true end of matching terms.
99+
*
100+
* <p>The default budget is {@link Integer#MAX_VALUE} (effectively unlimited).
101+
*/
102+
public void setVisitsBudget(int budget) {
103+
this.visitsBudget = budget;
104+
}
105+
106+
/**
107+
* Returns {@code true} if a previous {@link #next()} call returned {@code null} due to the visits
108+
* budget being exhausted rather than the terms being truly exhausted.
109+
*/
110+
public boolean isVisitsBudgetExhausted() {
111+
return visitsBudget <= 0;
112+
}
113+
93114
/**
94115
* Use this method to set the initial {@link BytesRef} to seek before iterating. This is a
95116
* convenience method for subclasses that do not override {@link #nextSeekTerm}. If the initial
@@ -228,19 +249,20 @@ public BytesRef next() throws IOException {
228249
if (doSeek) {
229250
doSeek = false;
230251
final BytesRef t = nextSeekTerm(actualTerm);
231-
// System.out.println(" seek to t=" + (t == null ? "null" : t.utf8ToString()) + " tenum=" +
232-
// tenum);
233252
// Make sure we always seek forward:
234253
assert actualTerm == null || t == null || t.compareTo(actualTerm) > 0
235254
: "curTerm=" + actualTerm + " seekTerm=" + t;
236-
if (t == null || tenum.seekCeil(t) == SeekStatus.END) {
237-
// no more terms to seek to or enum exhausted
238-
// System.out.println(" return null");
255+
if (t == null) {
256+
return null;
257+
}
258+
if (--visitsBudget < 0 || tenum.seekCeil(t) == SeekStatus.END) {
239259
return null;
240260
}
241261
actualTerm = tenum.term();
242-
// System.out.println(" got term=" + actualTerm.utf8ToString());
243262
} else {
263+
if (--visitsBudget < 0) {
264+
return null;
265+
}
244266
actualTerm = tenum.next();
245267
if (actualTerm == null) {
246268
// enum exhausted

lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import java.util.ArrayList;
2121
import java.util.List;
2222
import java.util.Objects;
23+
import org.apache.lucene.index.AutomatonTermsEnum;
24+
import org.apache.lucene.index.FilteredTermsEnum;
2325
import org.apache.lucene.index.LeafReaderContext;
2426
import org.apache.lucene.index.Term;
2527
import org.apache.lucene.index.TermState;
@@ -29,6 +31,7 @@
2931
import org.apache.lucene.util.Accountable;
3032
import org.apache.lucene.util.BytesRef;
3133
import org.apache.lucene.util.RamUsageEstimator;
34+
import org.apache.lucene.util.automaton.CompiledAutomaton;
3235

3336
/**
3437
* Contains functionality common to both {@link MultiTermQueryConstantScoreBlendedWrapper} and
@@ -42,6 +45,12 @@ abstract class AbstractMultiTermQueryConstantScoreWrapper<Q extends MultiTermQue
4245
// mtq that matches 16 terms or less will be executed as a regular disjunction
4346
static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
4447

48+
// Budget for underlying TermsEnum operations when probing an unknown-count query
49+
// during scorerSupplier construction. Prefix-like patterns use few operations per
50+
// match (the automaton seeks efficiently), while leading wildcards exhaust this
51+
// quickly and fall back to deferred collection.
52+
static final int AUTOMATON_TERM_COLLECT_VISIT_BUDGET = 256;
53+
4554
protected final Q query;
4655

4756
protected AbstractMultiTermQueryConstantScoreWrapper(Q query) {
@@ -273,19 +282,51 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
273282
}
274283
};
275284
} else {
276-
cost = estimateCost(terms, q.getTermsCount());
277-
weightOrIteratorSupplier =
278-
leadCost -> {
279-
List<TermAndState> collectedTerms = new ArrayList<>();
280-
if (collectTerms(fieldDocCount, termsEnum, collectedTerms)) {
281-
return rewriteAsBooleanQuery(context, collectedTerms);
282-
} else {
283-
// Too many terms to rewrite as a simple bq.
284-
// Invoke rewriteInner logic to handle rewriting:
285-
return rewriteInner(
286-
context, fieldDocCount, terms, termsEnum, collectedTerms, leadCost);
287-
}
288-
};
285+
// Unknown term count. Try a cheap budgeted probe: if the automaton can find
286+
// all matching terms within a small number of underlying TermsEnum operations,
287+
// use those results eagerly. Otherwise, fall back to deferred collection.
288+
boolean probeSucceeded = false;
289+
List<TermAndState> probeTerms = null;
290+
FilteredTermsEnum probeEnum = null;
291+
if (termsEnum instanceof FilteredTermsEnum fte) {
292+
probeEnum = fte;
293+
} else if (q instanceof AutomatonQuery aq
294+
&& aq.getCompiled().type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
295+
probeEnum = new AutomatonTermsEnum(terms.iterator(), aq.getCompiled());
296+
}
297+
if (probeEnum != null) {
298+
probeEnum.setVisitsBudget(AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
299+
probeTerms = new ArrayList<>();
300+
boolean probeResult = collectTerms(fieldDocCount, probeEnum, probeTerms);
301+
if (probeResult && !probeEnum.isVisitsBudgetExhausted()) {
302+
probeSucceeded = true;
303+
}
304+
}
305+
if (probeSucceeded) {
306+
if (probeTerms.isEmpty()) {
307+
return null;
308+
}
309+
long sumTermCost = 0;
310+
for (TermAndState collectedTerm : probeTerms) {
311+
sumTermCost += collectedTerm.docFreq;
312+
}
313+
cost = sumTermCost;
314+
final List<TermAndState> finalProbeTerms = probeTerms;
315+
weightOrIteratorSupplier = leadCost -> rewriteAsBooleanQuery(context, finalProbeTerms);
316+
} else {
317+
final TermsEnum deferredTermsEnum = q.getTermsEnum(terms);
318+
cost = estimateCost(terms, q.getTermsCount());
319+
weightOrIteratorSupplier =
320+
leadCost -> {
321+
List<TermAndState> collectedTerms = new ArrayList<>();
322+
if (collectTerms(fieldDocCount, deferredTermsEnum, collectedTerms)) {
323+
return rewriteAsBooleanQuery(context, collectedTerms);
324+
} else {
325+
return rewriteInner(
326+
context, fieldDocCount, terms, deferredTermsEnum, collectedTerms, leadCost);
327+
}
328+
};
329+
}
289330
}
290331

291332
return new ScorerSupplier() {

lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -445,10 +445,9 @@ public void testCostEstimate() throws IOException {
445445
Query rewritten = searcher.rewrite(query);
446446
Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
447447
ScorerSupplier supplier = weight.scorerSupplier(lrc);
448-
// Automaton queries have an unknown term count, so term collection is deferred to get() and the
449-
// cost is the worst-case estimate (sum of doc freqs across all terms) rather than the sum over
450-
// the matching terms only.
451-
assertEquals(3000, supplier.cost());
448+
// "foo*" matches only 2 terms ("foo bar" and "foo wuzzle"), which can be found cheaply
449+
// within the visit budget. So the cost is the accurate sum of their docFreqs (2 * 1000).
450+
assertEquals(2000, supplier.cost());
452451

453452
query = new WildcardQuery(new Term("body", "bar*"));
454453
rewritten = searcher.rewrite(query);
@@ -491,19 +490,52 @@ public void testScorerSupplierDoesNotScanTermsEagerly() throws IOException {
491490
termsEnumNextCalls.set(0);
492491
ScorerSupplier supplier = weight.scorerSupplier(lrc);
493492
assertNotNull(supplier);
494-
assertEquals(
495-
"scorerSupplier() must not scan the term dictionary for an automaton MultiTermQuery",
496-
0,
497-
termsEnumNextCalls.get());
498-
499-
// The scan is deferred to get(): building the scorer is where the terms are actually walked.
493+
// The cheap probe runs during scorerSupplier() but is bounded by the visit budget.
494+
// For a leading wildcard, the budget is exhausted quickly and term collection is deferred.
495+
assertTrue(
496+
"scorerSupplier() should use at most the visit budget for a leading wildcard",
497+
termsEnumNextCalls.get()
498+
<= AbstractMultiTermQueryConstantScoreWrapper.AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
499+
500+
// The full scan is deferred to get():
500501
assertNotNull(supplier.get(Long.MAX_VALUE));
501502
assertTrue("get() should scan the term dictionary", termsEnumNextCalls.get() > 0);
502503

503504
reader.close();
504505
dir.close();
505506
}
506507

508+
// Verifies that a prefix-like wildcard with few (or zero) matches is resolved eagerly
509+
// during scorerSupplier(), enabling null-supplier short-circuiting.
510+
public void testScorerSupplierResolvesSparsePrefixEagerly() throws IOException {
511+
Directory dir = newDirectory();
512+
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
513+
for (int i = 0; i < 1000; i++) {
514+
Document doc = new Document();
515+
doc.add(newStringField("body", "term" + i, Field.Store.NO));
516+
writer.addDocument(doc);
517+
}
518+
writer.flush();
519+
writer.forceMerge(1);
520+
writer.close();
521+
522+
DirectoryReader reader = DirectoryReader.open(dir);
523+
IndexSearcher searcher = new IndexSearcher(reader);
524+
LeafReaderContext lrc = reader.leaves().get(0);
525+
526+
// No terms match "zzznomatch*", so the probe should find zero terms cheaply
527+
// and scorerSupplier() should return null.
528+
WildcardQuery query = new WildcardQuery(new Term("body", "zzznomatch*"));
529+
Query rewritten = searcher.rewrite(query);
530+
Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
531+
assertNull(
532+
"scorerSupplier() should return null when no terms match a prefix-like wildcard",
533+
weight.scorerSupplier(lrc));
534+
535+
reader.close();
536+
dir.close();
537+
}
538+
507539
private static TermsEnum nextCountingTermsEnum(TermsEnum in, AtomicInteger counter) {
508540
return new FilterLeafReader.FilterTermsEnum(in) {
509541
@Override

0 commit comments

Comments
 (0)