Skip to content

Commit bfaaf17

Browse files
committed
[RegExp] Fix character escaping and add character classes
1 parent 0a4693a commit bfaaf17

3 files changed

Lines changed: 258 additions & 49 deletions

File tree

lib/src/regexp/node.dart

Lines changed: 91 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class DotNode extends Node {
4646
}
4747

4848
class LiteralNode extends Node {
49-
LiteralNode(String literal) : codePoint = literal.runes.single;
49+
LiteralNode(String literal) : codePoint = literal.codeUnits.single;
5050

5151
final int codePoint;
5252

@@ -69,6 +69,46 @@ class LiteralNode extends Node {
6969
int get hashCode => Object.hash(runtimeType, codePoint);
7070
}
7171

72+
/// A node that matches one code unit lying inside an inclusive range.
///
/// Both bounds are stored as UTF-16 code units taken from single-unit
/// strings.
class RangeNode extends Node {
  /// Creates a range spanning [start] through [end], inclusive.
  ///
  /// Each argument must contain exactly one code unit; `single` throws a
  /// [StateError] otherwise. Throws an [ArgumentError] when [start] is
  /// greater than [end].
  RangeNode(String start, String end)
    : startCodePoint = start.codeUnits.single,
      endCodePoint = end.codeUnits.single {
    if (startCodePoint <= endCodePoint) return;
    throw ArgumentError.value(
      '$start-$end',
      'start-end',
      'Start must be less than or equal to end',
    );
  }

  /// The lowest code unit matched by this range.
  final int startCodePoint;

  /// The highest code unit matched by this range.
  final int endCodePoint;

  /// Builds a two-state automaton with one transition per code unit in the
  /// range, all leading from the start state to the accepting state.
  @override
  Nfa toNfa() {
    final entry = NfaState(isEnd: false);
    final exit = NfaState(isEnd: true);
    var codeUnit = startCodePoint;
    while (codeUnit <= endCodePoint) {
      entry.transitions[codeUnit] = exit;
      codeUnit++;
    }
    return Nfa(start: entry, end: exit);
  }

  @override
  String toString() {
    final first = String.fromCharCode(startCodePoint);
    final last = String.fromCharCode(endCodePoint);
    return 'RangeNode($first-$last)';
  }

  /// Two ranges are equal when both of their bounds are equal.
  @override
  bool operator ==(Object other) {
    if (other is! RangeNode) return false;
    return other.startCodePoint == startCodePoint &&
        other.endCodePoint == endCodePoint;
  }

  @override
  int get hashCode => Object.hash(runtimeType, startCodePoint, endCodePoint);
}
111+
72112
class ConcatenationNode extends Node {
73113
ConcatenationNode(this.left, this.right);
74114

@@ -234,7 +274,19 @@ class ComplementNode extends Node {
234274
final Node child;
235275

236276
@override
237-
Nfa toNfa() => throw UnsupportedError(toString());
277+
Nfa toNfa() {
  // Code units the child consumes as a single symbol leading to acceptance,
  // as reported by _collectAcceptedCodePoints.
  final excluded = _collectAcceptedCodePoints(child.toNfa());

  final entry = NfaState(isEnd: false);
  final exit = NfaState(isEnd: true);

  // Every 16-bit code unit that is not excluded gets a direct transition to
  // the accepting state, so the result matches exactly one code unit.
  var codeUnit = 0;
  while (codeUnit <= 0xffff) {
    if (!excluded.contains(codeUnit)) {
      entry.transitions[codeUnit] = exit;
    }
    codeUnit++;
  }
  return Nfa(start: entry, end: exit);
}
238290

239291
@override
240292
String toString() => 'ComplementNode($child)';
@@ -284,3 +336,40 @@ class EndAnchorNode extends Node {
284336
@override
285337
int get hashCode => runtimeType.hashCode;
286338
}
339+
340+
/// Returns the code points that [childNfa] accepts as a single symbol.
///
/// A code point is included when some state reachable from the start state
/// through epsilon edges alone has a transition on it, and the target of that
/// transition is an end state or reaches one through epsilon edges alone.
Set<int> _collectAcceptedCodePoints(Nfa childNfa) {
  // Collects [origin] and every state reachable from it via epsilon edges.
  // Iterative, and each state is visited once, so epsilon cycles terminate.
  Set<NfaState> epsilonClosure(NfaState origin) {
    final closure = <NfaState>{origin};
    final pending = <NfaState>[origin];
    while (pending.isNotEmpty) {
      final state = pending.removeLast();
      for (final next in state.epsilons) {
        if (closure.add(next)) pending.add(next);
      }
    }
    return closure;
  }

  bool isAccepting(NfaState state) => state == childNfa.end || state.isEnd;

  // Memo of complete answers only. A result is stored after the whole closure
  // of the target has been examined, never from a search that was cut short by
  // a cycle guard; caching such partial results would record `false` for
  // states that do reach the end through a state still being explored.
  final reachesEnd = <NfaState, bool>{};

  final accepted = <int>{};
  for (final state in epsilonClosure(childNfa.start)) {
    state.transitions.forEach((codePoint, target) {
      final accepts = reachesEnd[target] ??= epsilonClosure(
        target,
      ).any(isAccepting);
      if (accepts) accepted.add(codePoint);
    });
  }
  return accepted;
}

lib/src/regexp/parser.dart

Lines changed: 70 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,82 @@ import 'package:petitparser/definition.dart';
22
import 'package:petitparser/expression.dart';
33
import 'package:petitparser/parser.dart';
44

5+
import 'classes.dart';
56
import 'node.dart';
67

7-
final nodeParser = () {
8-
final builder = ExpressionBuilder<Node>();
8+
class RegexpParserDefinition extends GrammarDefinition<Node> {
9+
/// Parses a backslash followed by any single character.
///
/// Yields the entry of `escapeClasses` registered for that character, or a
/// [LiteralNode] for the character itself when there is no such entry.
Parser<Node> escape() {
  final escaped = any().skip(before: char(r'\'));
  return escaped.map(
    (symbol) => escapeClasses[symbol] ?? LiteralNode(symbol),
  );
}
12+
/// Parses `.` into a [DotNode].
Parser<Node> dot() {
  return char('.').map((_) => DotNode());
}

/// Parses `^` into a [StartAnchorNode].
Parser<Node> startAnchor() {
  return char('^').map((_) => StartAnchorNode());
}

/// Parses `$` into an [EndAnchorNode].
Parser<Node> endAnchor() {
  return char(r'$').map((_) => EndAnchorNode());
}

/// Parses any single character other than `(`, `)`, `!`, `|` and `&` into a
/// [LiteralNode].
Parser<Node> other() {
  return noneOf('()!|&').map((symbol) => LiteralNode(symbol));
}
916

10-
const meta = r'\.()!*+?|&^$';
11-
builder
12-
..primitive(noneOf(meta).map(LiteralNode.new))
13-
..primitive(anyOf(meta).skip(before: char(r'\')).map(LiteralNode.new))
14-
..primitive(char('.').map((_) => DotNode()))
15-
..primitive(char('^').map((_) => StartAnchorNode()))
16-
..primitive(char(r'$').map((_) => EndAnchorNode()));
17+
/// Parses a bracketed character class such as `[abc]` or `[^abc]`.
///
/// A leading `^` wraps the parsed items in a [ComplementNode]; without it the
/// items are returned unchanged.
Parser<Node> charClass() {
  final body = seq2(char('^').optional(), ref0(charClassItems));
  final node = body.map2(
    (caret, items) => caret == null ? items : ComplementNode(items),
  );
  return node.skip(before: char('['), after: char(']'));
}
20+
/// Parses one or more character-class items and folds them, left to right,
/// into nested [AlternationNode]s.
///
/// A single item is returned as-is.
Parser<Node> charClassItems() {
  final oneOrMore = ref0(charClassItem).plus();
  return oneOrMore.map(
    (members) => members.reduce((left, right) => AlternationNode(left, right)),
  );
}
23+
/// Parses a single member of a character class.
///
/// Alternatives are tried in order: a range, an escape sequence, then any
/// single character other than `]` as a [LiteralNode].
Parser<Node> charClassItem() {
  final literal = noneOf(']').map((symbol) => LiteralNode(symbol));
  return [ref0(charClassRange), ref0(escape), literal].toChoiceParser();
}
28+
/// Parses `x-y` into a [RangeNode] spanning the two characters.
///
/// NOTE(review): both bounds accept any character, including `]` and `\`, so
/// input such as `[a-]` consumes the closing bracket as the upper bound.
/// Confirm whether that is intended.
Parser<Node> charClassRange() {
  final bounds = seq3(any(), char('-'), any());
  return bounds.map3((lower, _, upper) => RangeNode(lower, upper));
}
1733

18-
builder.group().wrapper(char('('), char(')'), (_, value, _) => value);
19-
20-
final integer = digit().plusString().trim().map(int.parse);
21-
final range =
22-
seq3(integer.optional(), char(',').trim().optional(), integer.optional())
34+
/// Parses one or more digits, trimmed on both sides, into an [int].
Parser<int> integer() {
  final digits = digit().plusString().trim();
  return digits.map((text) => int.parse(text));
}
35+
/// Parses a brace-delimited repetition count such as `{2}`, `{2,}`, `{,5}`
/// or `{2,5}`.
///
/// A missing lower bound defaults to `0`. A missing upper bound is `null`
/// when a comma is present, and equals the lower bound otherwise.
Parser<({int min, int? max})> range() {
  final lowerBound = ref0(integer).optional();
  final separator = char(',').trim().optional();
  final upperBound = ref0(integer).optional();
  final braced = seq3(
    lowerBound,
    separator,
    upperBound,
  ).skip(before: char('{'), after: char('}'));

  return braced.map3((lower, comma, upper) {
    // Without a comma the count is exact, so the upper bound mirrors the
    // lower one; with a comma and no upper bound the range is left open.
    final fallbackUpper = comma == null ? lower ?? 0 : null;
    return (min: lower ?? 0, max: upper ?? fallbackUpper);
  });
}
2846

29-
builder.group()
30-
..prefix(char('!'), (_, exp) => ComplementNode(exp))
31-
..postfix(char('*'), (exp, _) => QuantificationNode(exp, 0))
32-
..postfix(char('+'), (exp, _) => QuantificationNode(exp, 1))
33-
..postfix(char('?'), (exp, _) => QuantificationNode(exp, 0, 1))
34-
..postfix(
35-
range,
36-
(exp, range) => QuantificationNode(exp, range.$1, range.$2),
37-
);
38-
39-
builder.group()
40-
..left(epsilon(), (left, _, right) => ConcatenationNode(left, right))
41-
..optional(EmptyNode());
42-
43-
builder.group()
44-
..left(char('|'), (left, _, right) => AlternationNode(left, right))
45-
..left(char('&'), (left, _, right) => IntersectionNode(left, right));
46-
47-
return resolve(builder.build()).end();
48-
}();
47+
/// Assembles the full expression grammar and requires it to consume the
/// entire input.
///
/// Groups are registered from tightest to loosest binding: primitives,
/// parentheses, the `!` prefix and quantifier postfixes, implicit
/// concatenation, then `|` and `&`.
@override
Parser<Node> start() {
  final expression = ExpressionBuilder<Node>();

  // Atoms, registered in the order they are tried.
  for (final atom in [dot, startAnchor, endAnchor, charClass, escape, other]) {
    expression.primitive(ref0(atom));
  }

  // Parenthesised sub-expression.
  expression.group().wrapper(char('('), char(')'), (_, inner, _) => inner);

  // Complement prefix and repetition postfixes.
  final unary = expression.group();
  unary.prefix(char('!'), (_, operand) => ComplementNode(operand));
  unary.postfix(char('*'), (operand, _) => QuantificationNode(operand, 0));
  unary.postfix(char('+'), (operand, _) => QuantificationNode(operand, 1));
  unary.postfix(char('?'), (operand, _) => QuantificationNode(operand, 0, 1));
  unary.postfix(
    ref0(range),
    (operand, bounds) => QuantificationNode(operand, bounds.min, bounds.max),
  );

  // Juxtaposition concatenates; an absent operand becomes an EmptyNode.
  final sequence = expression.group();
  sequence.left(epsilon(), (head, _, tail) => ConcatenationNode(head, tail));
  sequence.optional(EmptyNode());

  // Alternation and intersection share the loosest precedence level.
  final binary = expression.group();
  binary.left(char('|'), (lhs, _, rhs) => AlternationNode(lhs, rhs));
  binary.left(char('&'), (lhs, _, rhs) => IntersectionNode(lhs, rhs));

  return resolve(expression.build()).end();
}
81+
}
82+
83+
/// The parser built from [RegexpParserDefinition]; it turns a pattern string
/// into a [Node] tree and must consume the whole input.
final nodeParser = RegexpParserDefinition().build();

0 commit comments

Comments
 (0)