Merge pull request #1529 from WebFuzzing/regex-support-extension

arcuri82 · web-flow · commit b909ff9cb367 · 2026-05-02T20:26:49.000+02:00
Java: more general solution for escaping regex syntax characters
diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4
@@ -140,8 +140,12 @@ patternCharacter
  // SourceCharacter but not one of ^ $ \ . * + ? ( ) [ ] { } |
  //: ~[^$\\.*+?()[\]{}|]
  : BaseChar
+ | COMMA
  | MINUS
  | DecimalDigit
+ // These are also allowed as literals when no matching pair exists
+ | BRACE_close
+ | BRACKET_close
  ;
 
 
diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4
@@ -91,11 +91,6 @@ atom
  | atomEscape
  | characterClass
  | PAREN_open disjunction PAREN_close
- // These two rules are added to handle the . and + symbols in emails
- // A more general solution is needed for escaped control symbols in Java
- // regular expressions
- | ESCAPED_DOT
- | ESCAPED_PLUS
 
  //TODO
 // | '(' '?' ':' disjunction ')'
@@ -176,9 +171,13 @@ patternCharacter
  // SourceCharacter but not one of ^ $ \ . * + ? ( ) [ ] { } |
  //: ~[^$\\.*+?()[\]{}|]
  : BaseChar
+ | COMMA
  | MINUS
  | DecimalDigit
  | E | Q
+ // These are also allowed as literals when no matching pair exists
+ | BRACE_close
+ | BRACKET_close
  ;
 
 
@@ -222,7 +221,7 @@ classAtomNoDash
  | DecimalDigit
  | COMMA | CARET | DOLLAR | DOT | STAR | PLUS | QUESTION
  | PAREN_open | PAREN_close | BRACKET_open | BRACE_open | BRACE_close | OR | E | Q
- | ESCAPED_DOT | ESCAPED_PLUS;
+ ;
 
 decimalDigits
  : DecimalDigit+
@@ -236,6 +235,7 @@ classEscape
 atomEscape
  : CharacterClassEscape
  | CharacterEscape
+ | SyntaxEscapes
 // TODO
 // | '\\' DecimalEscape
  ;
@@ -255,8 +255,10 @@ CharacterClassEscape
  ;
 
 
-ESCAPED_PLUS               : '\\+'; // Recognize \+
-ESCAPED_DOT                : '\\.'; // Recognize \-
+SyntaxEscapes
+ : SLASH [^$\\.*+?()[\]{}|/\-,:<>=!]
+ ;
+
 CARET                      : '^';
 DOLLAR                     : '$';
 SLASH                      : '\\';
diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt
@@ -9,6 +9,10 @@ private const val EOF_TOKEN = "<EOF>"
  */
 class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
 
+    /**
+     * Characters that may appear escaped (prefixed with a backslash) in a Java regex, in which case they are treated as literals.
+     */
+    private val allowedSyntaxEscapes = "^$\\.*+?()[]{}|/-,:<>=!"
 
     override fun visitPattern(ctx: RegexJavaParser.PatternContext): VisitResult {
 
@@ -215,18 +219,6 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
             return ctx.characterClass().accept(this)
         }
 
-        if (ctx.ESCAPED_PLUS()!=null) {
-            val name = "blankBlock"
-            val char = ctx.ESCAPED_PLUS().text[1].toString()
-            return VisitResult(PatternCharacterBlockGene(name, char))
-        }
-
-        if (ctx.ESCAPED_DOT()!=null) {
-            val name = "blankBlock"
-            val char = ctx.ESCAPED_DOT().text[1].toString()
-            return VisitResult(PatternCharacterBlockGene(name, char))
-        }
-
         throw IllegalStateException("No valid atom resolver for: ${ctx.text}")
     }
 
@@ -281,10 +273,9 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
                     start
                 }
             } else {
-                // This case handles the \. and \+ cases
-                // wheren . and + should be treated as
-                // regular chars
-                assert(startText == "\\+" || startText == "\\.")
+                // This case handles escaped syntax characters (e.g., "\." and "\+"),
+                // where the escaped character ('.', '+', etc.) should be treated as a regular char
+                assert(startText[0] == '\\' && startText[1] in allowedSyntaxEscapes)
                 start = startText[1]
                 end = start
             }
@@ -325,8 +316,13 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
                 val rec = (ctx.classAtom() ?: ctx.classAtomNoDash()).accept(this).data as List<CharacterRange>
                 list.addAll(rec)
             } else {
-                val char = (ctx.classAtom() ?: ctx.classAtomNoDash()).text[0]
-                list.add(CharacterRange(char, char))
+                val text = (ctx.classAtom() ?: ctx.classAtomNoDash()).text
+                if(text.length==1) {
+                    list.add(CharacterRange(text[0], text[0]))
+                }
+                else {
+                    list.add(CharacterRange(text[1], text[1]))
+                }
             }
         }
 
@@ -415,6 +411,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
                         String(Character.toChars(hexValue))
                 )
             }
+            in allowedSyntaxEscapes -> PatternCharacterBlockGene(txt, txt.substring(1))
             else -> CharacterClassEscapeRxGene(txt.substring(1))
         })
     }
diff --git a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt
@@ -359,6 +359,13 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){
         checkCanSample("""\D""", listOf("\u0000", "\uffff"), 1_000_000)
     }
 
+    @Test
+    fun testSyntaxEscapes(){
+        checkSameAsJava("""}],-:=<>!""")
+        checkSameAsJava("""\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/\-\,\:\<\>\=\!""")
+        checkSameAsJava("""[\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/\-\,\:\<\>\=\!]""")
+    }
+
     @Test
     open fun testPredefinedCharClassInsideCharClass(){
         checkSameAsJava("""[abc\d0]""")
@@ -372,7 +379,6 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){
     @Test
     open fun testJSExclusiveEscapes(){
         checkCanSample("""\a""", "a", 100)
-        checkCanSample("""\^\$\\\.\*\+\?\(\)\[\]\{\}\|\/""", "^$\\.*+?()[]{}|/", 100)
         checkCanSample("""[\c0]""", "\u0010", 100)
         checkCanSample("""[\cP][\c0]""", "\u0010\u0010", 100)
         checkCanSample("""[\c_]""", "\u001f", 100)