Skip to content

Commit 0fbaf4d

Browse files
Regex builtins: re_match / re_find / re_find_all / re_replace / re_split
Third piece of the Python-ergonomics catch-up. Wraps the `regex` crate (already a workspace dep) behind 5 OMC builtins covering the common Python `re` module surface: re_match(pattern, text) -> 0 or 1 Returns 1 if the pattern matches anywhere in text. Use ^/$ to anchor. Compiles on every call (one-shot use cheap; for tight loops, wrap+memoize at the OMC level). re_find(pattern, text) -> string First match as a string. Returns "" if no match (NOT null, for easier downstream string-handling). re_find_all(pattern, text) -> array of strings All non-overlapping matches in order. Empty array if no matches. re_replace(pattern, text, replacement) -> string All matches replaced. Supports Rust regex's $1, $2, ${name} backref syntax for capture-group substitution. re_split(pattern, text) -> array of strings Split at each match. Adjacent matches produce empty strings in the output (Python's re.split semantics). Tests (examples/tests/test_regex.omc — 10 tests, all pass): - Basic match / anchored match / character class - First find / find-all - Replace literal / replace with backref-swap - Split on whitespace / CSV-style split - Edge: empty pattern matches everything, non-empty on empty text Registered in: - is_known_builtin matches!() - HEAL_BUILTIN_NAMES (so heal pass doesn't flag them as typos) - compiler type-tag: re_match returns int (JIT compatibility) Regression: 8 exception + 10 f-string + 57 substrate + 70 builtins + 18 harmonic libs + 16 heal + 10 new regex = 189 OMC tests pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 54c3714 commit 0fbaf4d

3 files changed

Lines changed: 151 additions & 0 deletions

File tree

examples/tests/test_regex.omc

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Tests for regex builtins: re_match, re_find, re_find_all, re_replace, re_split.
2+
3+
fn assert_eq(actual, expected, msg) {
4+
if actual != expected {
5+
test_record_failure(msg + ": expected " + to_string(expected) + " got " + to_string(actual));
6+
}
7+
}
8+
9+
fn test_re_match_basic() {
10+
assert_eq(re_match("foo", "barfoobaz"), 1, "foo matches in barfoobaz");
11+
assert_eq(re_match("xyz", "abc"), 0, "xyz does not match abc");
12+
}
13+
14+
fn test_re_match_anchored() {
15+
assert_eq(re_match("^abc", "abcdef"), 1, "^abc matches abcdef");
16+
assert_eq(re_match("^abc", "xabcdef"), 0, "^abc does NOT match xabcdef");
17+
}
18+
19+
fn test_re_match_classes() {
20+
assert_eq(re_match("[0-9]+", "abc 42 xyz"), 1, "digit class matches");
21+
assert_eq(re_match("^[a-z]+$", "hello"), 1, "lowercase-only matches");
22+
assert_eq(re_match("^[a-z]+$", "Hello"), 0, "uppercase rejected");
23+
}
24+
25+
fn test_re_find() {
26+
assert_eq(re_find("[0-9]+", "abc 42 xyz"), "42", "first digit match");
27+
assert_eq(re_find("[0-9]+", "abc"), "", "no match -> empty string");
28+
}
29+
30+
fn test_re_find_all() {
31+
h matches = re_find_all("[0-9]+", "1 2 3 hello 42");
32+
assert_eq(arr_len(matches), 4, "4 numeric tokens");
33+
assert_eq(arr_get(matches, 0), "1", "first match");
34+
assert_eq(arr_get(matches, 3), "42", "last match");
35+
}
36+
37+
fn test_re_replace_basic() {
38+
assert_eq(re_replace("[0-9]+", "abc 42 xyz", "N"), "abc N xyz", "replace digits with N");
39+
assert_eq(re_replace("o", "foo", "0"), "f00", "replace all o with 0");
40+
}
41+
42+
fn test_re_replace_backref() {
43+
# Rust regex syntax uses ${n} or $n for capture groups in replacement.
44+
assert_eq(re_replace("(\\w+) (\\w+)", "hello world", "$2 $1"),
45+
"world hello", "swap with backrefs");
46+
}
47+
48+
fn test_re_split() {
49+
h parts = re_split("\\s+", "hello world foo");
50+
assert_eq(arr_len(parts), 3, "3 whitespace-split parts");
51+
assert_eq(arr_get(parts, 0), "hello", "first part");
52+
assert_eq(arr_get(parts, 1), "world", "second part");
53+
assert_eq(arr_get(parts, 2), "foo", "third part");
54+
}
55+
56+
fn test_re_split_csv() {
57+
h parts = re_split(",\\s*", "a, b,c, d");
58+
assert_eq(arr_len(parts), 4, "4 comma-split parts");
59+
assert_eq(arr_get(parts, 2), "c", "trim works");
60+
}
61+
62+
fn test_re_match_empty_string() {
63+
assert_eq(re_match("", "anything"), 1, "empty pattern matches everything");
64+
assert_eq(re_match("anything", ""), 0, "non-empty pattern on empty text");
65+
}

omnimcode-core/src/compiler.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ impl Compiler {
182182
// String → int parsers + counters + size queries
183183
| "str_to_int" | "str_count" | "str_is_empty"
184184
| "dict_size" | "dict_pop"
185+
// Regex predicate returns int
186+
| "re_match"
185187
// Array index/aggregate ints
186188
| "arr_argmax" | "arr_argmin"
187189
// Bit/digit/modular int returns

omnimcode-core/src/interpreter.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1914,6 +1914,7 @@ impl Interpreter {
19141914
| "str_repeat" | "str_reverse" | "str_uppercase" | "str_lowercase"
19151915
| "str_split_lines" | "str_count" | "str_is_empty"
19161916
| "str_to_int" | "str_to_float" | "str_capitalize"
1917+
| "re_match" | "re_find" | "re_find_all" | "re_replace" | "re_split"
19171918
// Arrays
19181919
| "arr_new" | "arr_from_range" | "arr_len" | "arr_get" | "arr_set"
19191920
| "arr_push" | "arr_first" | "arr_last" | "arr_slice" | "arr_concat"
@@ -3099,6 +3100,88 @@ impl Interpreter {
30993100
}
31003101
Ok(Value::String(s.replace(&old, &new_s)))
31013102
}
3103+
// ---- Regex (Perl-like syntax via the `regex` crate; no look-around or backreferences) -----------
3104+
// Compiles the pattern on every call: cheap enough for one-shot
3105+
// use. No compiled regex is returned to the caller, so a hot loop
3106+
// can only avoid the recompile by memoizing results at the OMC level.
3107+
"re_match" => {
3108+
// re_match(pattern, text) -> 1 if pattern matches anywhere
3109+
// in text, 0 otherwise. Anchor with ^/$ if you need
3110+
// full-string matching.
3111+
if args.len() < 2 {
3112+
return Err("re_match requires (pattern, text)".to_string());
3113+
}
3114+
let pat = self.eval_expr(&args[0])?.to_display_string();
3115+
let text = self.eval_expr(&args[1])?.to_display_string();
3116+
match regex::Regex::new(&pat) {
3117+
Ok(re) => Ok(Value::HInt(HInt::new(if re.is_match(&text) { 1 } else { 0 }))),
3118+
Err(e) => Err(format!("re_match: invalid pattern {:?}: {}", pat, e)),
3119+
}
3120+
}
3121+
"re_find" => {
3122+
// re_find(pattern, text) -> first match as string, or "" if no match.
3123+
if args.len() < 2 {
3124+
return Err("re_find requires (pattern, text)".to_string());
3125+
}
3126+
let pat = self.eval_expr(&args[0])?.to_display_string();
3127+
let text = self.eval_expr(&args[1])?.to_display_string();
3128+
match regex::Regex::new(&pat) {
3129+
Ok(re) => {
3130+
let m = re.find(&text).map(|m| m.as_str().to_string()).unwrap_or_default();
3131+
Ok(Value::String(m))
3132+
}
3133+
Err(e) => Err(format!("re_find: invalid pattern {:?}: {}", pat, e)),
3134+
}
3135+
}
3136+
"re_find_all" => {
3137+
// re_find_all(pattern, text) -> array of all matches (in order).
3138+
if args.len() < 2 {
3139+
return Err("re_find_all requires (pattern, text)".to_string());
3140+
}
3141+
let pat = self.eval_expr(&args[0])?.to_display_string();
3142+
let text = self.eval_expr(&args[1])?.to_display_string();
3143+
match regex::Regex::new(&pat) {
3144+
Ok(re) => {
3145+
let matches: Vec<Value> = re.find_iter(&text)
3146+
.map(|m| Value::String(m.as_str().to_string()))
3147+
.collect();
3148+
Ok(Value::Array(HArray::from_vec(matches)))
3149+
}
3150+
Err(e) => Err(format!("re_find_all: invalid pattern {:?}: {}", pat, e)),
3151+
}
3152+
}
3153+
"re_replace" => {
3154+
// re_replace(pattern, text, replacement) -> text with all
3155+
// pattern matches replaced. Supports $1, $2 backrefs in
3156+
// replacement string (Rust regex syntax).
3157+
if args.len() < 3 {
3158+
return Err("re_replace requires (pattern, text, replacement)".to_string());
3159+
}
3160+
let pat = self.eval_expr(&args[0])?.to_display_string();
3161+
let text = self.eval_expr(&args[1])?.to_display_string();
3162+
let repl = self.eval_expr(&args[2])?.to_display_string();
3163+
match regex::Regex::new(&pat) {
3164+
Ok(re) => Ok(Value::String(re.replace_all(&text, repl.as_str()).into_owned())),
3165+
Err(e) => Err(format!("re_replace: invalid pattern {:?}: {}", pat, e)),
3166+
}
3167+
}
3168+
"re_split" => {
3169+
// re_split(pattern, text) -> array of substrings split at pattern.
3170+
if args.len() < 2 {
3171+
return Err("re_split requires (pattern, text)".to_string());
3172+
}
3173+
let pat = self.eval_expr(&args[0])?.to_display_string();
3174+
let text = self.eval_expr(&args[1])?.to_display_string();
3175+
match regex::Regex::new(&pat) {
3176+
Ok(re) => {
3177+
let parts: Vec<Value> = re.split(&text)
3178+
.map(|s| Value::String(s.to_string()))
3179+
.collect();
3180+
Ok(Value::Array(HArray::from_vec(parts)))
3181+
}
3182+
Err(e) => Err(format!("re_split: invalid pattern {:?}: {}", pat, e)),
3183+
}
3184+
}
31023185
"str_index_of" => {
31033186
if args.len() < 2 {
31043187
return Err("str_index_of requires (haystack, needle)".to_string());
@@ -7771,6 +7854,7 @@ pub(crate) const HEAL_BUILTIN_NAMES: &[&str] = &[
77717854
"str_pad_left", "str_pad_right",
77727855
"str_split_lines", "str_count", "str_is_empty",
77737856
"str_to_int", "str_to_float", "str_capitalize",
7857+
"re_match", "re_find", "re_find_all", "re_replace", "re_split",
77747858
// Arrays
77757859
"arr_new", "arr_from_range", "arr_len", "arr_get", "arr_set",
77767860
"arr_push", "arr_first", "arr_last", "arr_slice", "arr_concat",

0 commit comments

Comments
 (0)