Skip to content

Commit a375f4c

Browse files
msfroh and claude committed
Limit term dictionary traversal in multi-term queries
This change builds on the previous commit (#16222) by adding a "visit budget" on FilteredTermsEnum. We can use that to build a ScorerSupplier with more accurate cost if it can be done cheaply (e.g. prefix / regex queries with a prefix). If the terms enum is expensive (e.g. leading wildcard query), then the budget is exhausted and we give up. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e720207 commit a375f4c

5 files changed

Lines changed: 133 additions & 64 deletions

File tree

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,8 @@ Optimizations
397397
clause matching no documents) before a non-seekable scan, such as a leading wildcard, runs.
398398
(Tianxiao Wei)
399399

400+
* GITHUB#16240: Build on GITHUB#16222 by letting automaton queries perform a bounded amount of term dictionary traversal during planning. (Michael Froh)
401+
400402
* GITHUB#16176: Restore WANDScorer for TOP_SCORES + minShouldMatch > 1. (Tianxiao Wei)
401403

402404
* GITHUB#16153: Use TernaryLongHeap in UpdateGraphsUtils for faster HNSW graph merging. (Prithvi S)

lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ public abstract class FilteredTermsEnum extends TermsEnum {
3434

3535
private BytesRef initialSeekTerm;
3636
private boolean doSeek;
37+
private int visitsBudget = Integer.MAX_VALUE;
3738

3839
/** Which term the enum is currently positioned to. */
3940
protected BytesRef actualTerm;
@@ -90,6 +91,26 @@ protected FilteredTermsEnum(final TermsEnum tenum, final boolean startWithSeek)
9091
doSeek = startWithSeek;
9192
}
9293

94+
/**
95+
* Sets a budget on the number of underlying {@link TermsEnum} operations ({@code seekCeil} and
96+
* {@code next}) that {@link #next()} may perform. When the budget is exhausted, {@code next()}
97+
* returns {@code null} as if the enum were exhausted. Use {@link #isVisitsBudgetExhausted()} to
98+
* distinguish budget exhaustion from a true end of matching terms.
99+
*
100+
* <p>The default budget is {@link Integer#MAX_VALUE} (effectively unlimited).
101+
*/
102+
public void setVisitsBudget(int budget) {
103+
this.visitsBudget = budget;
104+
}
105+
106+
/**
107+
* Returns {@code true} once the visits budget has been fully consumed, meaning a {@code null}
108+
* from {@link #next()} may reflect budget exhaustion rather than a true end of matching terms.
109+
*/
110+
public boolean isVisitsBudgetExhausted() {
111+
return visitsBudget <= 0;
112+
}
113+
93114
/**
94115
* Use this method to set the initial {@link BytesRef} to seek before iterating. This is a
95116
* convenience method for subclasses that do not override {@link #nextSeekTerm}. If the initial
@@ -228,19 +249,20 @@ public BytesRef next() throws IOException {
228249
if (doSeek) {
229250
doSeek = false;
230251
final BytesRef t = nextSeekTerm(actualTerm);
231-
// System.out.println(" seek to t=" + (t == null ? "null" : t.utf8ToString()) + " tenum=" +
232-
// tenum);
233252
// Make sure we always seek forward:
234253
assert actualTerm == null || t == null || t.compareTo(actualTerm) > 0
235254
: "curTerm=" + actualTerm + " seekTerm=" + t;
236-
if (t == null || tenum.seekCeil(t) == SeekStatus.END) {
237-
// no more terms to seek to or enum exhausted
238-
// System.out.println(" return null");
255+
if (t == null) {
256+
return null;
257+
}
258+
if (--visitsBudget < 0 || tenum.seekCeil(t) == SeekStatus.END) {
239259
return null;
240260
}
241261
actualTerm = tenum.term();
242-
// System.out.println(" got term=" + actualTerm.utf8ToString());
243262
} else {
263+
if (--visitsBudget < 0) {
264+
return null;
265+
}
244266
actualTerm = tenum.next();
245267
if (actualTerm == null) {
246268
// enum exhausted

lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java

Lines changed: 60 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import java.util.ArrayList;
2121
import java.util.List;
2222
import java.util.Objects;
23+
import org.apache.lucene.index.AutomatonTermsEnum;
24+
import org.apache.lucene.index.FilteredTermsEnum;
2325
import org.apache.lucene.index.LeafReaderContext;
2426
import org.apache.lucene.index.Term;
2527
import org.apache.lucene.index.TermState;
@@ -29,6 +31,7 @@
2931
import org.apache.lucene.util.Accountable;
3032
import org.apache.lucene.util.BytesRef;
3133
import org.apache.lucene.util.RamUsageEstimator;
34+
import org.apache.lucene.util.automaton.CompiledAutomaton;
3235

3336
/**
3437
* Contains functionality common to both {@link MultiTermQueryConstantScoreBlendedWrapper} and
@@ -42,6 +45,12 @@ abstract class AbstractMultiTermQueryConstantScoreWrapper<Q extends MultiTermQue
4245
// mtq that matches 16 terms or less will be executed as a regular disjunction
4346
static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
4447

48+
// Budget for underlying TermsEnum operations when probing an unknown-count query
49+
// during scorerSupplier construction. Prefix-like patterns use few operations per
50+
// match (the automaton seeks efficiently), while leading wildcards exhaust this
51+
// quickly and fall back to deferred collection.
52+
static final int AUTOMATON_TERM_COLLECT_VISIT_BUDGET = 256;
53+
4554
protected final Q query;
4655

4756
protected AbstractMultiTermQueryConstantScoreWrapper(Q query) {
@@ -228,62 +237,66 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
228237
final long cost;
229238
final IOLongFunction<WeightOrDocIdSetIterator> weightOrIteratorSupplier;
230239

231-
// Only collect terms while building the ScorerSupplier when the query exposes a known,
232-
// bounded term count (e.g. TermInSetQuery, getTermsCount() >= 0). There, collecting is
233-
// cheap and lets us return a null supplier up-front so a parent BooleanQuery can
234-
// short-circuit.
235-
//
236-
// For queries with an unknown term count (e.g. automaton queries: wildcard / regexp /
237-
// prefix / range), collecting eagerly can scan the whole term dictionary during
238-
// ScorerSupplier construction -- a leading wildcard such as "*foo*" cannot seek and must
239-
// visit every term. That is supposed to be the cheap "planning" phase, and doing it there
240-
// defeats a parent conjunction's ability to short-circuit (a sibling clause matching no
241-
// documents can no longer skip this clause before the scan runs). So for an unknown term
242-
// count we estimate the cost and defer term collection to ScorerSupplier#get().
243-
if (q.getTermsCount() >= 0) {
244-
List<TermAndState> collectedTerms = new ArrayList<>();
245-
boolean collectResult = collectTerms(fieldDocCount, termsEnum, collectedTerms);
246-
if (collectResult) {
247-
// Return a null supplier if no query terms were in the segment:
248-
if (collectedTerms.isEmpty()) {
249-
return null;
250-
}
240+
// Try to eagerly collect matching terms. For queries with a known term count
241+
// (e.g. TermInSetQuery), we always collect eagerly. For queries with an unknown term
242+
// count (e.g. automaton queries: wildcard / regexp / prefix / range), we attempt a
243+
// budgeted probe: if the automaton can find all matching terms within a small number of
244+
// underlying TermsEnum operations, we use those results. Otherwise (probe exhausts its
245+
// budget, or no probe is possible), we estimate the cost and defer term collection to
246+
// ScorerSupplier#get() -- eagerly scanning the whole term dictionary during the
247+
// "planning" phase would defeat a parent conjunction's ability to short-circuit.
248+
List<TermAndState> eagerTerms = new ArrayList<>();
249+
TermsEnum deferredTermsEnum = termsEnum;
250+
boolean eagerSuccess;
251251

252-
// TODO: Instead of replicating the cost logic of a BooleanQuery we could consider
253-
// rewriting to a BQ eagerly at this point and delegating to its cost method (instead of
254-
// lazily rewriting on #get). Not sure what the performance hit would be of doing this
255-
// though.
256-
long sumTermCost = 0;
257-
for (TermAndState collectedTerm : collectedTerms) {
258-
sumTermCost += collectedTerm.docFreq;
259-
}
260-
cost = sumTermCost;
252+
if (q.getTermsCount() >= 0) {
253+
eagerSuccess = collectTerms(fieldDocCount, termsEnum, eagerTerms);
254+
} else {
255+
// Unknown term count. Try a cheap budgeted probe: if the automaton can find
256+
// all matching terms within a small number of underlying TermsEnum operations,
257+
// use those results eagerly. Otherwise, fall back to deferred collection.
258+
FilteredTermsEnum probeEnum = null;
259+
if (termsEnum instanceof FilteredTermsEnum fte) {
260+
probeEnum = fte;
261+
} else if (q instanceof AutomatonQuery aq
262+
&& aq.getCompiled().type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
263+
probeEnum = new AutomatonTermsEnum(terms.iterator(), aq.getCompiled());
264+
}
265+
if (probeEnum != null) {
266+
probeEnum.setVisitsBudget(AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
267+
boolean probeResult = collectTerms(fieldDocCount, probeEnum, eagerTerms);
268+
eagerSuccess = probeResult && !probeEnum.isVisitsBudgetExhausted();
261269
} else {
262-
cost = estimateCost(terms, q.getTermsCount());
270+
eagerSuccess = false;
263271
}
264-
weightOrIteratorSupplier =
265-
leadCost -> {
266-
if (collectResult) {
267-
return rewriteAsBooleanQuery(context, collectedTerms);
268-
} else {
269-
// Too many terms to rewrite as a simple bq.
270-
// Invoke rewriteInner logic to handle rewriting:
271-
return rewriteInner(
272-
context, fieldDocCount, terms, termsEnum, collectedTerms, leadCost);
273-
}
274-
};
272+
if (!eagerSuccess) {
273+
deferredTermsEnum = (probeEnum == termsEnum) ? q.getTermsEnum(terms) : termsEnum;
274+
eagerTerms = new ArrayList<>();
275+
}
276+
}
277+
278+
if (eagerSuccess) {
279+
if (eagerTerms.isEmpty()) {
280+
return null;
281+
}
282+
long sumTermCost = 0;
283+
for (TermAndState collectedTerm : eagerTerms) {
284+
sumTermCost += collectedTerm.docFreq;
285+
}
286+
cost = sumTermCost;
287+
final List<TermAndState> finalTerms = eagerTerms;
288+
weightOrIteratorSupplier = _ -> rewriteAsBooleanQuery(context, finalTerms);
275289
} else {
276290
cost = estimateCost(terms, q.getTermsCount());
291+
final TermsEnum finalDeferredEnum = deferredTermsEnum;
292+
final List<TermAndState> partialTerms = eagerTerms;
277293
weightOrIteratorSupplier =
278294
leadCost -> {
279-
List<TermAndState> collectedTerms = new ArrayList<>();
280-
if (collectTerms(fieldDocCount, termsEnum, collectedTerms)) {
281-
return rewriteAsBooleanQuery(context, collectedTerms);
295+
if (collectTerms(fieldDocCount, finalDeferredEnum, partialTerms)) {
296+
return rewriteAsBooleanQuery(context, partialTerms);
282297
} else {
283-
// Too many terms to rewrite as a simple bq.
284-
// Invoke rewriteInner logic to handle rewriting:
285298
return rewriteInner(
286-
context, fieldDocCount, terms, termsEnum, collectedTerms, leadCost);
299+
context, fieldDocCount, terms, finalDeferredEnum, partialTerms, leadCost);
287300
}
288301
};
289302
}

lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ public void test() throws Exception {
9494
// perform.
9595
assertTrue(
9696
"too many calls to IndexInput.clone during TermRangeQuery: " + queryCloneCount,
97-
queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 7);
97+
queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 10);
9898
r.close();
9999
dir.close();
100100
}

lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -445,10 +445,9 @@ public void testCostEstimate() throws IOException {
445445
Query rewritten = searcher.rewrite(query);
446446
Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
447447
ScorerSupplier supplier = weight.scorerSupplier(lrc);
448-
// Automaton queries have an unknown term count, so term collection is deferred to get() and the
449-
// cost is the worst-case estimate (sum of doc freqs across all terms) rather than the sum over
450-
// the matching terms only.
451-
assertEquals(3000, supplier.cost());
448+
// "foo*" matches only 2 terms ("foo bar" and "foo wuzzle"), which can be found cheaply
449+
// within the visit budget. So the cost is the accurate sum of their docFreqs (2 * 1000).
450+
assertEquals(2000, supplier.cost());
452451

453452
query = new WildcardQuery(new Term("body", "bar*"));
454453
rewritten = searcher.rewrite(query);
@@ -491,19 +490,52 @@ public void testScorerSupplierDoesNotScanTermsEagerly() throws IOException {
491490
termsEnumNextCalls.set(0);
492491
ScorerSupplier supplier = weight.scorerSupplier(lrc);
493492
assertNotNull(supplier);
494-
assertEquals(
495-
"scorerSupplier() must not scan the term dictionary for an automaton MultiTermQuery",
496-
0,
497-
termsEnumNextCalls.get());
498-
499-
// The scan is deferred to get(): building the scorer is where the terms are actually walked.
493+
// The cheap probe runs during scorerSupplier() but is bounded by the visit budget.
494+
// For a leading wildcard, the budget is exhausted quickly and term collection is deferred.
495+
assertTrue(
496+
"scorerSupplier() should use at most the visit budget for a leading wildcard",
497+
termsEnumNextCalls.get()
498+
<= AbstractMultiTermQueryConstantScoreWrapper.AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
499+
500+
// The full scan is deferred to get():
500501
assertNotNull(supplier.get(Long.MAX_VALUE));
501502
assertTrue("get() should scan the term dictionary", termsEnumNextCalls.get() > 0);
502503

503504
reader.close();
504505
dir.close();
505506
}
506507

508+
// Verifies that a prefix-like wildcard with few (or zero) matches is resolved eagerly
509+
// during scorerSupplier(), enabling null-supplier short-circuiting.
510+
public void testScorerSupplierResolvesSparsePrefixEagerly() throws IOException {
511+
Directory dir = newDirectory();
512+
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
513+
for (int i = 0; i < 1000; i++) {
514+
Document doc = new Document();
515+
doc.add(newStringField("body", "term" + i, Field.Store.NO));
516+
writer.addDocument(doc);
517+
}
518+
writer.flush();
519+
writer.forceMerge(1);
520+
writer.close();
521+
522+
DirectoryReader reader = DirectoryReader.open(dir);
523+
IndexSearcher searcher = new IndexSearcher(reader);
524+
LeafReaderContext lrc = reader.leaves().get(0);
525+
526+
// No terms match "zzznomatch*", so the probe should find zero terms cheaply
527+
// and scorerSupplier() should return null.
528+
WildcardQuery query = new WildcardQuery(new Term("body", "zzznomatch*"));
529+
Query rewritten = searcher.rewrite(query);
530+
Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
531+
assertNull(
532+
"scorerSupplier() should return null when no terms match a prefix-like wildcard",
533+
weight.scorerSupplier(lrc));
534+
535+
reader.close();
536+
dir.close();
537+
}
538+
507539
private static TermsEnum nextCountingTermsEnum(TermsEnum in, AtomicInteger counter) {
508540
return new FilterLeafReader.FilterTermsEnum(in) {
509541
@Override

0 commit comments

Comments
 (0)