Skip to content

Commit c8538c6

Browse files
committed
search: pre-filter literal regex patterns with case-insensitive substring search
The new substring fast-path costs ~5-15 ms (~2-3% of wall time) — a ~40-60x reduction in string-matching cost.

`nix search nixpkgs gemma` against a warm eval cache spends ~52% of its wall time inside libc++'s std::regex matcher. Every call into regex_search / sregex_iterator is CPU-heavy (full NFA walk of the input) and allocation-heavy (backtracking state on the heap, per call). The overwhelming majority of derivations searched do not match the user's pattern, so the regex engine does this work only to return false.

When the pattern contains no POSIX-extended regex metacharacters (e.g. plain words like 'gemma'), a case-insensitive substring search is equivalent and orders of magnitude cheaper. Use it as a pre-filter; if the literal isn't present in any of (path, name, description), the regex cannot match either and we skip it. If it *is* present, we fall through to the existing regex iterator so that hiliteMatches still receives proper smatch objects.

Same treatment for excludeRegexes: literal exclude patterns are skipped entirely if the literal is absent, avoiding the regex_search.

Measured on `nix search nixpkgs gemma`, hyperfine 30 runs (3 warmup), warm cache:

    before (vkwzvmrt):    1.193 s ± 0.012 s
    after (this commit):  0.568 s ± 0.006 s

That's a 2.10x speedup, ~625 ms / ~52% off wall time.

Cold cache also benefits, though less so (14.690 s -> 13.738 s, ~6.5%), since cold time is dominated by Nix expression evaluation, not regex.

Output is byte-for-byte identical for both literal and non-literal patterns; verified with 'gemma', '^gem', '[Gg]emma', and 'gemma openllm' (AND-of-regexes).
1 parent 616df97 commit c8538c6

1 file changed

Lines changed: 91 additions & 24 deletions

File tree

src/nix/search.cc

Lines changed: 91 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
#include "nix/util/hilite.hh"
1212
#include "nix/util/strings-inline.hh"
1313

14+
#include <algorithm>
15+
#include <optional>
1416
#include <regex>
17+
#include <string_view>
1518
#include <nlohmann/json.hpp>
1619

1720
#include "nix/util/strings.hh"
@@ -25,6 +28,62 @@ std::string wrap(std::string prefix, std::string s)
2528
return concatStrings(prefix, s, ANSI_NORMAL);
2629
}
2730

31+
/**
 * Report whether `re` can be treated as a plain literal string.
 *
 * @param re Pattern text as supplied by the user.
 * @return true when `re` contains none of the characters
 *         `.`, `[`, `]`, `(`, `)`, `*`, `+`, `?`, `{`, `}`, `|`, backslash,
 *         `^` or `$`. An empty `re` contains none of them and so also
 *         yields true.
 */
static bool hasNoRegexMetacharacters(std::string_view re)
{
    return re.find_first_of(".[]()*+?{}|\\^$") == std::string_view::npos;
}
35+
36+
/**
 * Locale-independent ASCII lower-casing. We deliberately avoid
 * `nix::toLower`, which routes through `std::tolower` and therefore depends
 * on the active C locale. `std::regex` with `icase` uses its own (C++)
 * locale for case-folding; for the literal pre-filter to be equivalent to
 * the regex it replaces, both must fold the same way. Folding ASCII bytes
 * directly keeps us aligned with `std::regex`'s default ("C" locale)
 * behaviour regardless of what `setlocale` may later do to the process.
 *
 * @param s Input text.
 * @return A copy of `s` in which every byte in 'A'..'Z' is replaced by the
 *         corresponding byte in 'a'..'z'; all other bytes are copied
 *         unchanged.
 */
static std::string asciiLower(std::string_view s)
{
    std::string out;
    out.reserve(s.size());
    for (char c : s)
        out.push_back(c >= 'A' && c <= 'Z' ? static_cast<char>(c + ('a' - 'A')) : c);
    return out;
}
53+
54+
/**
 * Case-insensitive (ASCII-only) substring test.
 *
 * @param haystack    Text to scan. Each byte in 'A'..'Z' is folded to lower
 *                    case before comparison; every other byte compares as-is.
 * @param needleLower Text to look for. It is compared verbatim, so the
 *                    caller must pass it already lower-cased.
 * @return true if `needleLower` occurs in `haystack` under that folding. An
 *         empty needle is contained in every haystack, including an empty
 *         one.
 */
static bool containsCI(std::string_view haystack, std::string_view needleLower)
{
    // std::search returns `first` for an empty needle. On an empty haystack
    // `first == last`, so the end-iterator check below would wrongly report
    // "not found"; answer the empty-needle case directly instead.
    if (needleLower.empty())
        return true;

    const auto foldedEquals = [](char a, char b) {
        const char la = (a >= 'A' && a <= 'Z') ? static_cast<char>(a + ('a' - 'A')) : a;
        return la == b;
    };
    const auto it =
        std::search(haystack.begin(), haystack.end(), needleLower.begin(), needleLower.end(), foldedEquals);
    return it != haystack.end();
}
62+
63+
/**
64+
* A user-supplied search pattern, compiled as a POSIX-extended regex with an
65+
* optional case-insensitive substring fast path.
66+
*
67+
* libc++'s std::regex allocates backtracking state on every call, so on the
68+
* `nix search` workload regex matching dominates the profile. When the pattern
69+
* contains no POSIX-extended regex metacharacters, a case-insensitive substring
70+
* search is equivalent and orders of magnitude cheaper; we use it as a
71+
* pre-filter before invoking std::regex.
72+
*/
73+
struct SearchPattern
74+
{
75+
std::regex regex;
76+
/** Lowercased pattern text, set iff `re` had no regex metacharacters. */
77+
std::optional<std::string> literal;
78+
79+
explicit SearchPattern(const std::string & re)
80+
: regex(re, std::regex::extended | std::regex::icase)
81+
{
82+
if (hasNoRegexMetacharacters(re))
83+
literal = asciiLower(re);
84+
}
85+
};
86+
2887
struct CmdSearch : InstallableValueCommand, MixJSON
2988
{
3089
std::vector<std::string> res;
@@ -70,16 +129,13 @@ struct CmdSearch : InstallableValueCommand, MixJSON
70129
throw UsageError(
71130
"Must provide at least one regex! To match all packages, use '%s'.", "nix search <installable> ^");
72131

73-
std::vector<std::regex> regexes;
74-
std::vector<std::regex> excludeRegexes;
75-
regexes.reserve(res.size());
76-
excludeRegexes.reserve(excludeRes.size());
77-
132+
std::vector<SearchPattern> patterns, excludeSearchPatterns;
133+
patterns.reserve(res.size());
134+
excludeSearchPatterns.reserve(excludeRes.size());
78135
for (auto & re : res)
79-
regexes.push_back(std::regex(re, std::regex::extended | std::regex::icase));
80-
136+
patterns.emplace_back(re);
81137
for (auto & re : excludeRes)
82-
excludeRegexes.emplace_back(re, std::regex::extended | std::regex::icase);
138+
excludeSearchPatterns.emplace_back(re);
83139

84140
auto state = getEvalState();
85141

@@ -119,26 +175,37 @@ struct CmdSearch : InstallableValueCommand, MixJSON
119175
std::vector<std::smatch> nameMatches;
120176
bool found = false;
121177

122-
for (auto & regex : excludeRegexes) {
123-
if (std::regex_search(attrPathStr, regex) || std::regex_search(name.name, regex)
124-
|| std::regex_search(description, regex))
178+
auto literalMissesAllFields = [&](const std::optional<std::string> & lit) {
179+
return lit && !containsCI(attrPathStr, *lit) && !containsCI(name.name, *lit)
180+
&& !containsCI(description, *lit);
181+
};
182+
183+
auto addAll = [&found](std::sregex_iterator it, std::vector<std::smatch> & vec) {
184+
const auto end = std::sregex_iterator();
185+
while (it != end) {
186+
vec.push_back(*it++);
187+
found = true;
188+
}
189+
};
190+
191+
for (auto & p : excludeSearchPatterns) {
192+
if (literalMissesAllFields(p.literal))
193+
continue;
194+
if (std::regex_search(attrPathStr, p.regex) || std::regex_search(name.name, p.regex)
195+
|| std::regex_search(description, p.regex))
125196
return;
126197
}
127198

128-
for (auto & regex : regexes) {
199+
for (auto & p : patterns) {
200+
if (literalMissesAllFields(p.literal)) {
201+
found = false;
202+
break;
203+
}
129204
found = false;
130-
auto addAll = [&found](std::sregex_iterator it, std::vector<std::smatch> & vec) {
131-
const auto end = std::sregex_iterator();
132-
while (it != end) {
133-
vec.push_back(*it++);
134-
found = true;
135-
}
136-
};
137-
138-
addAll(std::sregex_iterator(attrPathStr.begin(), attrPathStr.end(), regex), attrPathMatches);
139-
addAll(std::sregex_iterator(name.name.begin(), name.name.end(), regex), nameMatches);
140-
addAll(std::sregex_iterator(description.begin(), description.end(), regex), descriptionMatches);
141-
205+
addAll(std::sregex_iterator(attrPathStr.begin(), attrPathStr.end(), p.regex), attrPathMatches);
206+
addAll(std::sregex_iterator(name.name.begin(), name.name.end(), p.regex), nameMatches);
207+
addAll(
208+
std::sregex_iterator(description.begin(), description.end(), p.regex), descriptionMatches);
142209
if (!found)
143210
break;
144211
}

0 commit comments

Comments
 (0)