Skip to content
This repository was archived by the owner on Jan 2, 2025. It is now read-only.

Commit 4bbd4dc

Browse files
authored
Fix long queries at /q (#1223)
This fixes code search against repos where the name is longer than 5 characters. The general strategy here is to bail out of case permutations if any tokens in the generated token stream are too long. This means that for a query like `repo:foobar quux`, the `quux` portion will match in a case-insensitive fashion, while the repo `foobar` must match case exactly.
1 parent c387128 commit 4bbd4dc

1 file changed

Lines changed: 15 additions & 6 deletions

File tree

server/bleep/src/query/compiler.rs

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ use crate::query::{
1919
planner,
2020
};
2121

22+
const MAX_CASE_PERMUTATION_LEN: usize = 5;
23+
2224
type DynQuery = Box<dyn tantivy::query::Query>;
2325

2426
enum Extraction<'a> {
@@ -106,12 +108,20 @@ impl Compiler {
106108
let mut token_stream = tokenizer.token_stream(&text);
107109
let tokens = std::iter::from_fn(move || {
108110
token_stream.next().map(|tok| CompactString::new(&tok.text))
109-
});
111+
})
112+
.collect::<Vec<_>>();
110113

111-
let terms = if query.is_case_sensitive() {
112-
tokens.map(|s| str_to_query(*field, &s)).collect::<Vec<_>>()
114+
// We skip case-insensitive matching if any token is longer than `MAX_CASE_PERMUTATION_LEN`.
115+
let terms = if query.is_case_sensitive()
116+
|| tokens.iter().any(|t| t.len() > MAX_CASE_PERMUTATION_LEN)
117+
{
118+
tokens
119+
.into_iter()
120+
.map(|s| str_to_query(*field, &s))
121+
.collect::<Vec<_>>()
113122
} else {
114123
tokens
124+
.into_iter()
115125
.map(|s| {
116126
let terms = case_permutations(&s)
117127
.map(|s| str_to_query(*field, &s))
@@ -253,9 +263,8 @@ pub fn case_permutations(s: &str) -> impl Iterator<Item = CompactString> {
253263
.map(|c| c.to_ascii_lowercase())
254264
.collect::<SmallVec<[char; 3]>>();
255265

256-
// Make sure not to overflow. The end condition is a mask with the highest bit set, and we use
257-
// `u32` masks.
258-
debug_assert!(chars.len() <= 5);
266+
// A string that is too long leads to an exponential explosion here, growing with 2^n.
267+
debug_assert!(chars.len() <= MAX_CASE_PERMUTATION_LEN);
259268

260269
let num_chars = chars.len();
261270

0 commit comments

Comments
 (0)