Skip to content

Commit da54feb

Browse files
msfroh and claude committed
Limit term dictionary traversal in multi-term queries
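This change builds on the previous commit (#16222) by adding a "visit budget" to FilteredTermsEnum that caps the number of underlying TermsEnum operations (seekCeil and next) it may perform. The budget lets us build a ScorerSupplier with a more accurate cost when the matching terms can be enumerated cheaply (e.g. prefix queries, or regex queries that begin with a fixed prefix). If enumerating the terms is expensive (e.g. a leading-wildcard query), the budget is exhausted, the eager probe is abandoned, and term collection is deferred using the worst-case cost estimate.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>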
1 parent e720207 commit da54feb

3 files changed

Lines changed: 124 additions & 29 deletions

File tree

lucene/core/src/java/org/apache/lucene/index/FilteredTermsEnum.java

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ public abstract class FilteredTermsEnum extends TermsEnum {
3434

3535
private BytesRef initialSeekTerm;
3636
private boolean doSeek;
37+
private int visitsBudget = Integer.MAX_VALUE;
3738

3839
/** Which term the enum is currently positioned to. */
3940
protected BytesRef actualTerm;
@@ -90,6 +91,24 @@ protected FilteredTermsEnum(final TermsEnum tenum, final boolean startWithSeek)
9091
doSeek = startWithSeek;
9192
}
9293

94+
/**
95+
* Sets a budget on the number of underlying {@link TermsEnum} operations ({@code seekCeil} and
96+
* {@code next}) that {@link #next()} may perform. When the budget is exhausted, {@code next()}
97+
* returns {@code null} as if the enum were exhausted. Use {@link #isVisitsBudgetExhausted()} to
98+
* distinguish budget exhaustion from a true end of matching terms.
99+
*
100+
* <p>The default budget is {@link Integer#MAX_VALUE} (effectively unlimited).
101+
*/
102+
public void setVisitsBudget(int budget) {
103+
this.visitsBudget = budget;
104+
}
105+
106+
/** Returns {@code true} if a previous {@link #next()} call returned {@code null} due to the
107+
* visits budget being exhausted rather than the terms being truly exhausted. */
108+
public boolean isVisitsBudgetExhausted() {
109+
return visitsBudget <= 0;
110+
}
111+
93112
/**
94113
* Use this method to set the initial {@link BytesRef} to seek before iterating. This is a
95114
* convenience method for subclasses that do not override {@link #nextSeekTerm}. If the initial
@@ -228,19 +247,20 @@ public BytesRef next() throws IOException {
228247
if (doSeek) {
229248
doSeek = false;
230249
final BytesRef t = nextSeekTerm(actualTerm);
231-
// System.out.println(" seek to t=" + (t == null ? "null" : t.utf8ToString()) + " tenum=" +
232-
// tenum);
233250
// Make sure we always seek forward:
234251
assert actualTerm == null || t == null || t.compareTo(actualTerm) > 0
235252
: "curTerm=" + actualTerm + " seekTerm=" + t;
236-
if (t == null || tenum.seekCeil(t) == SeekStatus.END) {
237-
// no more terms to seek to or enum exhausted
238-
// System.out.println(" return null");
253+
if (t == null) {
254+
return null;
255+
}
256+
if (--visitsBudget < 0 || tenum.seekCeil(t) == SeekStatus.END) {
239257
return null;
240258
}
241259
actualTerm = tenum.term();
242-
// System.out.println(" got term=" + actualTerm.utf8ToString());
243260
} else {
261+
if (--visitsBudget < 0) {
262+
return null;
263+
}
244264
actualTerm = tenum.next();
245265
if (actualTerm == null) {
246266
// enum exhausted

lucene/core/src/java/org/apache/lucene/search/AbstractMultiTermQueryConstantScoreWrapper.java

Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import java.util.ArrayList;
2121
import java.util.List;
2222
import java.util.Objects;
23+
import org.apache.lucene.index.AutomatonTermsEnum;
24+
import org.apache.lucene.index.FilteredTermsEnum;
2325
import org.apache.lucene.index.LeafReaderContext;
2426
import org.apache.lucene.index.Term;
2527
import org.apache.lucene.index.TermState;
@@ -29,6 +31,7 @@
2931
import org.apache.lucene.util.Accountable;
3032
import org.apache.lucene.util.BytesRef;
3133
import org.apache.lucene.util.RamUsageEstimator;
34+
import org.apache.lucene.util.automaton.CompiledAutomaton;
3235

3336
/**
3437
* Contains functionality common to both {@link MultiTermQueryConstantScoreBlendedWrapper} and
@@ -42,6 +45,12 @@ abstract class AbstractMultiTermQueryConstantScoreWrapper<Q extends MultiTermQue
4245
// mtq that matches 16 terms or less will be executed as a regular disjunction
4346
static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
4447

48+
// Budget for underlying TermsEnum operations when probing an unknown-count query
49+
// during scorerSupplier construction. Prefix-like patterns use few operations per
50+
// match (the automaton seeks efficiently), while leading wildcards exhaust this
51+
// quickly and fall back to deferred collection.
52+
static final int AUTOMATON_TERM_COLLECT_VISIT_BUDGET = 256;
53+
4554
protected final Q query;
4655

4756
protected AbstractMultiTermQueryConstantScoreWrapper(Q query) {
@@ -273,19 +282,52 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
273282
}
274283
};
275284
} else {
276-
cost = estimateCost(terms, q.getTermsCount());
277-
weightOrIteratorSupplier =
278-
leadCost -> {
279-
List<TermAndState> collectedTerms = new ArrayList<>();
280-
if (collectTerms(fieldDocCount, termsEnum, collectedTerms)) {
281-
return rewriteAsBooleanQuery(context, collectedTerms);
282-
} else {
283-
// Too many terms to rewrite as a simple bq.
284-
// Invoke rewriteInner logic to handle rewriting:
285-
return rewriteInner(
286-
context, fieldDocCount, terms, termsEnum, collectedTerms, leadCost);
287-
}
288-
};
285+
// Unknown term count. Try a cheap budgeted probe: if the automaton can find
286+
// all matching terms within a small number of underlying TermsEnum operations,
287+
// use those results eagerly. Otherwise, fall back to deferred collection.
288+
boolean probeSucceeded = false;
289+
List<TermAndState> probeTerms = null;
290+
FilteredTermsEnum probeEnum = null;
291+
if (termsEnum instanceof FilteredTermsEnum fte) {
292+
probeEnum = fte;
293+
} else if (q instanceof AutomatonQuery aq
294+
&& aq.getCompiled().type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
295+
probeEnum = new AutomatonTermsEnum(terms.iterator(), aq.getCompiled());
296+
}
297+
if (probeEnum != null) {
298+
probeEnum.setVisitsBudget(AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
299+
probeTerms = new ArrayList<>();
300+
boolean probeResult = collectTerms(fieldDocCount, probeEnum, probeTerms);
301+
if (probeResult && !probeEnum.isVisitsBudgetExhausted()) {
302+
probeSucceeded = true;
303+
}
304+
}
305+
if (probeSucceeded) {
306+
if (probeTerms.isEmpty()) {
307+
return null;
308+
}
309+
long sumTermCost = 0;
310+
for (TermAndState collectedTerm : probeTerms) {
311+
sumTermCost += collectedTerm.docFreq;
312+
}
313+
cost = sumTermCost;
314+
final List<TermAndState> finalProbeTerms = probeTerms;
315+
weightOrIteratorSupplier =
316+
leadCost -> rewriteAsBooleanQuery(context, finalProbeTerms);
317+
} else {
318+
final TermsEnum deferredTermsEnum = q.getTermsEnum(terms);
319+
cost = estimateCost(terms, q.getTermsCount());
320+
weightOrIteratorSupplier =
321+
leadCost -> {
322+
List<TermAndState> collectedTerms = new ArrayList<>();
323+
if (collectTerms(fieldDocCount, deferredTermsEnum, collectedTerms)) {
324+
return rewriteAsBooleanQuery(context, collectedTerms);
325+
} else {
326+
return rewriteInner(
327+
context, fieldDocCount, terms, deferredTermsEnum, collectedTerms, leadCost);
328+
}
329+
};
330+
}
289331
}
290332

291333
return new ScorerSupplier() {

lucene/core/src/test/org/apache/lucene/search/TestWildcardQuery.java

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -445,10 +445,9 @@ public void testCostEstimate() throws IOException {
445445
Query rewritten = searcher.rewrite(query);
446446
Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
447447
ScorerSupplier supplier = weight.scorerSupplier(lrc);
448-
// Automaton queries have an unknown term count, so term collection is deferred to get() and the
449-
// cost is the worst-case estimate (sum of doc freqs across all terms) rather than the sum over
450-
// the matching terms only.
451-
assertEquals(3000, supplier.cost());
448+
// "foo*" matches only 2 terms ("foo bar" and "foo wuzzle"), which can be found cheaply
449+
// within the visit budget. So the cost is the accurate sum of their docFreqs (2 * 1000).
450+
assertEquals(2000, supplier.cost());
452451

453452
query = new WildcardQuery(new Term("body", "bar*"));
454453
rewritten = searcher.rewrite(query);
@@ -491,19 +490,53 @@ public void testScorerSupplierDoesNotScanTermsEagerly() throws IOException {
491490
termsEnumNextCalls.set(0);
492491
ScorerSupplier supplier = weight.scorerSupplier(lrc);
493492
assertNotNull(supplier);
494-
assertEquals(
495-
"scorerSupplier() must not scan the term dictionary for an automaton MultiTermQuery",
496-
0,
497-
termsEnumNextCalls.get());
498-
499-
// The scan is deferred to get(): building the scorer is where the terms are actually walked.
493+
// The cheap probe runs during scorerSupplier() but is bounded by the visit budget.
494+
// For a leading wildcard, the budget is exhausted quickly and term collection is deferred.
495+
assertTrue(
496+
"scorerSupplier() should use at most the visit budget for a leading wildcard",
497+
termsEnumNextCalls.get()
498+
<= AbstractMultiTermQueryConstantScoreWrapper
499+
.AUTOMATON_TERM_COLLECT_VISIT_BUDGET);
500+
501+
// The full scan is deferred to get():
500502
assertNotNull(supplier.get(Long.MAX_VALUE));
501503
assertTrue("get() should scan the term dictionary", termsEnumNextCalls.get() > 0);
502504

503505
reader.close();
504506
dir.close();
505507
}
506508

509+
// Verifies that a prefix-like wildcard with few (or zero) matches is resolved eagerly
510+
// during scorerSupplier(), enabling null-supplier short-circuiting.
511+
public void testScorerSupplierResolvesSparsePrefixEagerly() throws IOException {
512+
Directory dir = newDirectory();
513+
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
514+
for (int i = 0; i < 1000; i++) {
515+
Document doc = new Document();
516+
doc.add(newStringField("body", "term" + i, Field.Store.NO));
517+
writer.addDocument(doc);
518+
}
519+
writer.flush();
520+
writer.forceMerge(1);
521+
writer.close();
522+
523+
DirectoryReader reader = DirectoryReader.open(dir);
524+
IndexSearcher searcher = new IndexSearcher(reader);
525+
LeafReaderContext lrc = reader.leaves().get(0);
526+
527+
// No terms match "zzznomatch*", so the probe should find zero terms cheaply
528+
// and scorerSupplier() should return null.
529+
WildcardQuery query = new WildcardQuery(new Term("body", "zzznomatch*"));
530+
Query rewritten = searcher.rewrite(query);
531+
Weight weight = rewritten.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
532+
assertNull(
533+
"scorerSupplier() should return null when no terms match a prefix-like wildcard",
534+
weight.scorerSupplier(lrc));
535+
536+
reader.close();
537+
dir.close();
538+
}
539+
507540
private static TermsEnum nextCountingTermsEnum(TermsEnum in, AtomicInteger counter) {
508541
return new FilterLeafReader.FilterTermsEnum(in) {
509542
@Override

0 commit comments

Comments
 (0)