Skip to content

Commit b909ff9

Browse files
authored
Merge pull request #1529 from WebFuzzing/regex-support-extension
Java more general solution for regex syntax characters escaping
2 parents 263a743 + 7fbe414 commit b909ff9

4 files changed

Lines changed: 36 additions & 27 deletions

File tree

core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,12 @@ patternCharacter
140140
// SourceCharacter but not one of ^ $ \ . * + ? ( ) [ ] { } |
141141
//: ~[^$\\.*+?()[\]{}|]
142142
: BaseChar
143+
| COMMA
143144
| MINUS
144145
| DecimalDigit
146+
// These are also allowed as literals when no matching pair exists
147+
| BRACE_close
148+
| BRACKET_close
145149
;
146150

147151

core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,6 @@ atom
9191
| atomEscape
9292
| characterClass
9393
| PAREN_open disjunction PAREN_close
94-
// These two rules are added to handle the . and + symbols in emails
95-
// A more general solution is needed for escaped control symbols in Java
96-
// regular expressions
97-
| ESCAPED_DOT
98-
| ESCAPED_PLUS
9994

10095
//TODO
10196
// | '(' '?' ':' disjunction ')'
@@ -176,9 +171,13 @@ patternCharacter
176171
// SourceCharacter but not one of ^ $ \ . * + ? ( ) [ ] { } |
177172
//: ~[^$\\.*+?()[\]{}|]
178173
: BaseChar
174+
| COMMA
179175
| MINUS
180176
| DecimalDigit
181177
| E | Q
178+
// These are also allowed as literals when no matching pair exists
179+
| BRACE_close
180+
| BRACKET_close
182181
;
183182

184183

@@ -222,7 +221,7 @@ classAtomNoDash
222221
| DecimalDigit
223222
| COMMA | CARET | DOLLAR | DOT | STAR | PLUS | QUESTION
224223
| PAREN_open | PAREN_close | BRACKET_open | BRACE_open | BRACE_close | OR | E | Q
225-
| ESCAPED_DOT | ESCAPED_PLUS;
224+
;
226225

227226
decimalDigits
228227
: DecimalDigit+
@@ -236,6 +235,7 @@ classEscape
236235
atomEscape
237236
: CharacterClassEscape
238237
| CharacterEscape
238+
| SyntaxEscapes
239239
// TODO
240240
// | '\\' DecimalEscape
241241
;
@@ -255,8 +255,10 @@ CharacterClassEscape
255255
;
256256

257257

258-
ESCAPED_PLUS : '\\+'; // Recognize \+
259-
ESCAPED_DOT : '\\.'; // Recognize \-
258+
SyntaxEscapes
259+
: SLASH [^$\\.*+?()[\]{}|/\-,:<>=!]
260+
;
261+
260262
CARET : '^';
261263
DOLLAR : '$';
262264
SLASH : '\\';

core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ private const val EOF_TOKEN = "<EOF>"
99
*/
1010
class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
1111

12+
/**
13+
* These are the Java regex syntax characters; any of them can be escaped so that it is treated as a literal.
14+
*/
15+
private val allowedSyntaxEscapes = "^$\\.*+?()[]{}|/-,:<>=!"
1216

1317
override fun visitPattern(ctx: RegexJavaParser.PatternContext): VisitResult {
1418

@@ -215,18 +219,6 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
215219
return ctx.characterClass().accept(this)
216220
}
217221

218-
if (ctx.ESCAPED_PLUS()!=null) {
219-
val name = "blankBlock"
220-
val char = ctx.ESCAPED_PLUS().text[1].toString()
221-
return VisitResult(PatternCharacterBlockGene(name, char))
222-
}
223-
224-
if (ctx.ESCAPED_DOT()!=null) {
225-
val name = "blankBlock"
226-
val char = ctx.ESCAPED_DOT().text[1].toString()
227-
return VisitResult(PatternCharacterBlockGene(name, char))
228-
}
229-
230222
throw IllegalStateException("No valid atom resolver for: ${ctx.text}")
231223
}
232224

@@ -281,10 +273,9 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
281273
start
282274
}
283275
} else {
284-
// This case handles the \. and \+ cases
285-
// wheren . and + should be treated as
286-
// regular chars
287-
assert(startText == "\\+" || startText == "\\.")
276+
// This case handles escaped syntax characters such as "\." and "\+",
277+
// where the escaped character ('.', '+', etc.) should be treated as a regular char
278+
assert(startText[0] == '\\' && startText[1] in allowedSyntaxEscapes)
288279
start = startText[1]
289280
end = start
290281
}
@@ -325,8 +316,13 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
325316
val rec = (ctx.classAtom() ?: ctx.classAtomNoDash()).accept(this).data as List<CharacterRange>
326317
list.addAll(rec)
327318
} else {
328-
val char = (ctx.classAtom() ?: ctx.classAtomNoDash()).text[0]
329-
list.add(CharacterRange(char, char))
319+
val text = (ctx.classAtom() ?: ctx.classAtomNoDash()).text
320+
if(text.length==1) {
321+
list.add(CharacterRange(text[0], text[0]))
322+
}
323+
else {
324+
list.add(CharacterRange(text[1], text[1]))
325+
}
330326
}
331327
}
332328

@@ -415,6 +411,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
415411
String(Character.toChars(hexValue))
416412
)
417413
}
414+
in allowedSyntaxEscapes -> PatternCharacterBlockGene(txt, txt.substring(1))
418415
else -> CharacterClassEscapeRxGene(txt.substring(1))
419416
})
420417
}

core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,13 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){
359359
checkCanSample("""\D""", listOf("\u0000", "\uffff"), 1_000_000)
360360
}
361361

362+
@Test
363+
fun testSyntaxEscapes(){
364+
checkSameAsJava("""}],-:=<>!""")
365+
checkSameAsJava("""\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/\-\,\:\<\>\=\!""")
366+
checkSameAsJava("""[\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/\-\,\:\<\>\=\!]""")
367+
}
368+
362369
@Test
363370
open fun testPredefinedCharClassInsideCharClass(){
364371
checkSameAsJava("""[abc\d0]""")
@@ -372,7 +379,6 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){
372379
@Test
373380
open fun testJSExclusiveEscapes(){
374381
checkCanSample("""\a""", "a", 100)
375-
checkCanSample("""\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/""", "^$\\.*+?()[]{}|/", 100)
376382
checkCanSample("""[\c0]""", "\u0010", 100)
377383
checkCanSample("""[\cP][\c0]""", "\u0010\u0010", 100)
378384
checkCanSample("""[\c_]""", "\u001f", 100)

0 commit comments

Comments
 (0)