@@ -140,7 +140,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
140140 val remainingGenes = mutableListOf<Gene >()
141141 for (j in i + 1 until ctx.term().size) {
142142 val resTerm = ctx.term()[j].accept(this )
143- resTerm.genes.firstOrNull()?. let { remainingGenes.add(it) }
143+ remainingGenes.addAll( resTerm.genes)
144144 }
145145
146146 currentFlags = previous
@@ -150,10 +150,9 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
150150 }
151151
152152 val resTerm = ctx.term()[i].accept(this )
153- val gene = resTerm.genes.firstOrNull()
154153
155- if (gene != null ) {
156- res.genes.add(gene )
154+ if (resTerm.genes.isNotEmpty() ) {
155+ res.genes.addAll(resTerm.genes )
157156 } else {
158157
159158 val assertion = resTerm.data as String
@@ -187,18 +186,28 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
187186 }
188187
189188 val resAtom = ctx.atom().accept(this )
190- val atom = resAtom.genes.firstOrNull()
191- ? : return res
189+
190+ if (resAtom.genes.isEmpty()) {
191+ return res
192+ }
192193
193194 if (ctx.quantifier() != null ){
194195
195196 val limits = ctx.quantifier().accept(this ).data as Pair <Int ,Int >
196- val q = QuantifierRxGene (" q" , atom, limits.first, limits.second)
197+
198+ // this is done so that visits that result in multiple genes (like a backref that interprets some
199+ // digits literally) work as expected, only applying quantifier to last gene
200+
201+ // add all genes to result, except for last gene
202+ res.genes.addAll(resAtom.genes.dropLast(1 ))
203+
204+ // the last gene gets wrapped with the quantifier gene, then that gets added to result
205+ val q = QuantifierRxGene (" q" , resAtom.genes.last(), limits.first, limits.second)
197206
198207 res.genes.add(q)
199208
200209 } else {
201- res.genes.add(atom )
210+ res.genes.addAll(resAtom.genes )
202211 }
203212
204213 return res
@@ -521,14 +530,33 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
521530
522531 // unnamed backreference \N (N number)
523532 if (ctx.BackReference () != null ) {
524- val n = txt.drop(1 ).toInt() // strip leading \
525- if (n < 1 || n > captureGroups.size) {
526- throw IllegalStateException (
527- " Backreference \\ $n refers to group $n but only ${captureGroups.size} " +
533+ val allDigits = txt.drop(1 )
534+ val maxDigits = captureGroups.size.toString().length
535+
536+ // In Java, multi-digit back references interpret trailing digits literally, see more:
537+ // https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#groupname:~:text=the%20parser%20will-,drop%20digits,-until%20the%20number
538+ val backRefDigitCount = when {
539+ maxDigits > allDigits.length -> allDigits.length
540+ allDigits.take(maxDigits).toInt() <= captureGroups.size -> maxDigits
541+ maxDigits > 1 -> maxDigits - 1
542+ else -> throw IllegalStateException (
543+ " Backreference ${txt.take(2 )} refers to group ${allDigits[0 ]} but only ${captureGroups.size} " +
528544 " capture group(s) have been defined so far"
529545 )
530546 }
531- return VisitResult (BackReferenceRxGene (n, captureGroups[n - 1 ]!! ))
547+
548+ val n = allDigits.take(backRefDigitCount).toInt()
549+
550+ val result = VisitResult (BackReferenceRxGene (n, captureGroups[n - 1 ]!! ))
551+
552+ val remainingChars = allDigits.drop(backRefDigitCount)
553+
554+ for (char in remainingChars) {
555+ // we add the remaining digits as pattern genes to result as these should be interpreted literally
556+ result.genes.add(PatternCharacterBlockGene (char.toString(), char.toString()))
557+ }
558+
559+ return result
532560 }
533561
534562 // named backreference \k<name>
0 commit comments