Skip to content

Commit 5d722be

Browse files
committed
Java regex: handle trailing digits in backreferences as literal characters.
1 parent a843106 commit 5d722be

2 files changed

Lines changed: 42 additions & 13 deletions

File tree

core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
140140
val remainingGenes = mutableListOf<Gene>()
141141
for (j in i + 1 until ctx.term().size) {
142142
val resTerm = ctx.term()[j].accept(this)
143-
resTerm.genes.firstOrNull()?.let { remainingGenes.add(it) }
143+
remainingGenes.addAll(resTerm.genes)
144144
}
145145

146146
currentFlags = previous
@@ -150,10 +150,9 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
150150
}
151151

152152
val resTerm = ctx.term()[i].accept(this)
153-
val gene = resTerm.genes.firstOrNull()
154153

155-
if(gene != null) {
156-
res.genes.add(gene)
154+
if(resTerm.genes.isNotEmpty()) {
155+
res.genes.addAll(resTerm.genes)
157156
} else {
158157

159158
val assertion = resTerm.data as String
@@ -187,18 +186,28 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
187186
}
188187

189188
val resAtom = ctx.atom().accept(this)
190-
val atom = resAtom.genes.firstOrNull()
191-
?: return res
189+
190+
if (resAtom.genes.isEmpty()) {
191+
return res
192+
}
192193

193194
if(ctx.quantifier() != null){
194195

195196
val limits = ctx.quantifier().accept(this).data as Pair<Int,Int>
196-
val q = QuantifierRxGene("q", atom, limits.first, limits.second)
197+
198+
// this is done so that visits that result in multiple genes (like a backref that interprets some
199+
// digits literally) work as expected, only applying quantifier to last gene
200+
201+
// add all genes to result, except for last gene
202+
res.genes.addAll(resAtom.genes.dropLast(1))
203+
204+
// the last gene gets wrapped with the quantifier gene, then that gets added to result
205+
val q = QuantifierRxGene("q", resAtom.genes.last(), limits.first, limits.second)
197206

198207
res.genes.add(q)
199208

200209
} else {
201-
res.genes.add(atom)
210+
res.genes.addAll(resAtom.genes)
202211
}
203212

204213
return res
@@ -521,14 +530,33 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
521530

522531
// unnamed backreference \N (N number)
523532
if (ctx.BackReference() != null) {
524-
val n = txt.drop(1).toInt() // strip leading \
525-
if (n < 1 || n > captureGroups.size) {
526-
throw IllegalStateException(
527-
"Backreference \\$n refers to group $n but only ${captureGroups.size} " +
533+
val allDigits = txt.drop(1)
534+
val maxDigits = captureGroups.size.toString().length
535+
536+
// In Java, a multi-digit back reference drops trailing digits that would exceed the number of capture groups and treats them as literals; see:
537+
// https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#groupname:~:text=the%20parser%20will-,drop%20digits,-until%20the%20number
538+
val backRefDigitCount = when {
539+
maxDigits > allDigits.length -> allDigits.length
540+
allDigits.take(maxDigits).toInt() <= captureGroups.size -> maxDigits
541+
maxDigits > 1 -> maxDigits - 1
542+
else -> throw IllegalStateException(
543+
"Backreference ${txt.take(2)} refers to group ${allDigits[0]} but only ${captureGroups.size} " +
528544
"capture group(s) have been defined so far"
529545
)
530546
}
531-
return VisitResult(BackReferenceRxGene(n, captureGroups[n - 1]!!))
547+
548+
val n = allDigits.take(backRefDigitCount).toInt()
549+
550+
val result = VisitResult(BackReferenceRxGene(n, captureGroups[n - 1]!!))
551+
552+
val remainingChars = allDigits.drop(backRefDigitCount)
553+
554+
for (char in remainingChars) {
555+
// we add the remaining digits as pattern genes to result as these should be interpreted literally
556+
result.genes.add(PatternCharacterBlockGene(char.toString(), char.toString()))
557+
}
558+
559+
return result
532560
}
533561

534562
// named backreference \k<name>

core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ class GeneRegexJavaVisitorTest : GeneRegexEcma262VisitorTest() {
244244
checkSameAsJava("""<>[(?<notAName>abc)]""")
245245
checkCanSample("""[(?<notAName>abc)]""", "N", 100)
246246
checkSameAsJava("""((A)(B(C)))\1\2\3\4""")
247+
checkSameAsJava("""(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\10\11\12\120{3}""")
247248
}
248249

249250
@Test

0 commit comments

Comments
 (0)