Skip to content

Commit 79cc4ee

Browse files
committed
more merge cleanup
1 parent 9dc49fd commit 79cc4ee

File tree

3 files changed

+67
-26
lines changed

3 files changed

+67
-26
lines changed

core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,11 +125,21 @@ CharacterEscape
125125
| SLASH HexEscapeSequence
126126
| SLASH UnicodeEscapeSequence
127127
| SLASH OctalEscapeSequence
128-
| SLASH 'p' BRACE_open PosixCharacterClassLabel BRACE_close // this is only implemented in Java at the moment as on JS this
129-
// is allowed only while certain flags are enabled
128+
| SLASH ('p' | 'P') BRACE_open PCharacterClassEscapeLabel BRACE_close // this is only implemented in Java at the moment
129+
// because in JavaScript this is allowed only when certain flags are enabled
130+
130131
//| IdentityEscape
131132
;
132133

134+
fragment PCharacterClassEscapeLabel
135+
: PosixCharacterClassLabel
136+
| UnicodeCategoriesLabel
137+
;
138+
139+
fragment UnicodeCategoriesLabel
140+
: 'Pe'
141+
;
142+
133143
// basic US-ASCII only predefined POSIX character classes
134144
// https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#:~:text=character%3A%20%5B%5E%5Cw%5D-,POSIX,-character%20classes%20(US
135145
fragment PosixCharacterClassLabel
@@ -146,7 +156,6 @@ fragment PosixCharacterClassLabel
146156
| 'Cntrl'
147157
| 'XDigit'
148158
| 'Space'
149-
| 'Pe'
150159
;
151160

152161
fragment ControlEscape

core/src/main/kotlin/org/evomaster/core/search/gene/regex/CharacterClassEscapeRxGene.kt

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -61,30 +61,45 @@ class CharacterClassEscapeRxGene(
6161
private val nonHorizontalSpaceMultiCharRange = MultiCharacterRange(true, horizontalSpaceSet)
6262
private val nonVerticalSpaceMultiCharRange = MultiCharacterRange(true, verticalSpaceSet)
6363

64-
// US-ASCII POSIX character classes (\p{X})
65-
private val posixMultiCharRanges = mapOf(
66-
"Lower" to listOf(CharacterRange('a', 'z')),
67-
"Upper" to listOf(CharacterRange('A', 'Z')),
68-
"ASCII" to listOf(CharacterRange(0, 0x7f)),
69-
"Alpha" to asciiLetterSet,
70-
"Digit" to digitSet,
71-
"Alnum" to digitSet + asciiLetterSet,
72-
"Punct" to punctuationSet,
73-
"Graph" to digitSet + asciiLetterSet + punctuationSet,
74-
"Print" to digitSet + asciiLetterSet + punctuationSet + stringToListOfCharacterRanges("\u0020"),
75-
"Blank" to stringToListOfCharacterRanges(" \t"),
76-
"Cntrl" to listOf(CharacterRange(0, 0x1f)) + stringToListOfCharacterRanges("\u007f"),
77-
"XDigit" to listOf(CharacterRange('0', '9'), CharacterRange('a', 'f'), CharacterRange('A', 'F')),
78-
"Space" to spaceSet,
79-
"Pe" to stringToListOfCharacterRanges(")]}")
80-
).mapValues { (_, value) -> MultiCharacterRange(false, value) }
64+
private val pEscapesMultiCharRanges: Map<String, MultiCharacterRange> = run {
65+
// US-ASCII POSIX character classes (\p{X})
66+
val posixAsciiSets = mapOf(
67+
"Lower" to listOf(CharacterRange('a', 'z')),
68+
"Upper" to listOf(CharacterRange('A', 'Z')),
69+
"ASCII" to listOf(CharacterRange(0, 0x7f)),
70+
"Alpha" to asciiLetterSet,
71+
"Digit" to digitSet,
72+
"Alnum" to digitSet + asciiLetterSet,
73+
"Punct" to punctuationSet,
74+
"Graph" to digitSet + asciiLetterSet + punctuationSet,
75+
"Print" to digitSet + asciiLetterSet + punctuationSet + stringToListOfCharacterRanges("\u0020"),
76+
"Blank" to stringToListOfCharacterRanges(" \t"),
77+
"Cntrl" to listOf(CharacterRange(0, 0x1f)) + stringToListOfCharacterRanges("\u007f"),
78+
"XDigit" to listOf(CharacterRange('0', '9'), CharacterRange('a', 'f'), CharacterRange('A', 'F')),
79+
"Space" to spaceSet,
80+
)
81+
82+
// Unicode category character classes (\p{X})
83+
val unicodeCategorySets = mapOf(
84+
"Pe" to stringToListOfCharacterRanges(")]}")
85+
// more Unicode categories will be added here
86+
)
87+
88+
// for every class, register both the normal version (under its label) and the negated version (under "^" + label)
89+
(posixAsciiSets + unicodeCategorySets).flatMap { (key, value) ->
90+
listOf(
91+
key to MultiCharacterRange(false, value),
92+
"^$key" to MultiCharacterRange(true, value)
93+
)
94+
}.toMap()
95+
}
8196
}
8297

8398
var value: String = ""
8499
var multiCharRange: MultiCharacterRange
85100

86101
init {
87-
if (type[0] !in "wWdDsSvVhHp") {
102+
if (type[0] !in "wWdDsSvVhHpP") {
88103
throw IllegalArgumentException("Invalid type: $type")
89104
}
90105

@@ -99,12 +114,16 @@ class CharacterClassEscapeRxGene(
99114
'V' -> nonVerticalSpaceMultiCharRange
100115
'h' -> horizontalSpaceMultiCharRange
101116
'H' -> nonHorizontalSpaceMultiCharRange
102-
'p' ->
103-
if (type.substring(2, type.length - 1) !in posixMultiCharRanges){
104-
throw IllegalArgumentException("$type invalid/unsupported POSIX character class")
117+
'p', 'P' -> {
118+
val pLabel = type.substring(2, type.length - 1)
119+
val negated = type[0].isUpperCase()
120+
val lookupKey = if (negated) "^$pLabel" else pLabel
121+
if (lookupKey !in pEscapesMultiCharRanges) {
122+
throw IllegalArgumentException("$type invalid/unsupported \\p escape character class")
105123
} else {
106-
posixMultiCharRanges[type.substring(2, type.length - 1)]!!
124+
pEscapesMultiCharRanges[lookupKey]!!
107125
}
126+
}
108127
else -> //this should never happen due to check in init
109128
throw IllegalStateException("Type '\\$type' not supported yet")
110129
}

core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,24 @@ class GeneRegexJavaVisitorTest : GeneRegexEcma262VisitorTest() {
9797
|\p{Print}\p{Blank}\p{Cntrl}\p{XDigit}\p{Space}""".trimMargin())
9898
}
9999

100+
@Test
101+
fun testUnicodeCategories(){
102+
checkSameAsJava("""\p{Pe}""")
103+
checkSameAsJava("""Pe""")
104+
}
105+
100106
@Test
101107
fun testPredefinedCharClassInsideCharClass(){
102-
checkSameAsJava("""[\V\p{Lower}\P{Upper}\W\d]""")
108+
checkSameAsJava("""[\V\p{Lower}\p{Upper}\W\d]""")
103109
checkSameAsJava("""[a\p{Pe}]""")
104110
checkSameAsJava("""[\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]""")
105111
checkCanSample("""[a\p{Pe}b]""", ")", 100_000)
106112
}
113+
114+
@Test
115+
fun testPEscapesComplements(){
116+
checkSameAsJava("""\P{Lower}\P{Upper}\P{ASCII}\P{Alpha}\P{Digit}\P{Alnum}\P{Punct}\P{Graph}
117+
|\P{Print}\P{Blank}\P{Cntrl}\P{XDigit}\P{Space}""".trimMargin())
118+
checkSameAsJava("""\P{Pe}""")
119+
}
107120
}

0 commit comments

Comments
 (0)