Skip to content

Commit 7e98a2e

Browse files
authored
Merge pull request #495 from jawkio/447-optimize-string-concatenation
Optimize chained string concatenation with counted CONCAT
2 parents 904592a + 0f75d58 commit 7e98a2e

5 files changed

Lines changed: 236 additions & 5 deletions

File tree

src/jmh/java/io/jawk/backend/AVMExpressionBenchmark.java

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,9 @@ public class AVMExpressionBenchmark {
6161
private AwkExpression fieldConcatenation;
6262
private AwkExpression fieldRegexMatch;
6363
private AwkExpression multiStringConcatenation;
64+
private AwkExpression constantStringConcatenation;
65+
private AwkExpression stringConstantStringConstantConcatenation;
66+
private AwkExpression fourStringConcatenation;
6467
private AwkExpression mixedExpression;
6568

6669
/**
@@ -78,6 +81,9 @@ public void setup() throws IOException {
7881
this.fieldConcatenation = awk.compileExpression("$1 \" test\"");
7982
this.fieldRegexMatch = awk.compileExpression("$1 ~ /test/");
8083
this.multiStringConcatenation = awk.compileExpression("$1 \" test1\" \" test2\" \" test3\"");
84+
this.constantStringConcatenation = awk.compileExpression("\"constant\" \"constant\" \"constant\" \"constant\"");
85+
this.stringConstantStringConstantConcatenation = awk.compileExpression("$1 \"constant\" $2 \"constant\"");
86+
this.fourStringConcatenation = awk.compileExpression("$1 $2 $3 $4");
8187
this.mixedExpression = awk.compileExpression("($1 + $2) \":\" ($3 ~ /test/) \":\" $4");
8288
this.avm = new AVM(new AwkSettings(), Collections.emptyMap());
8389
this.avm.prepareForEval("42 3.14 test-value suffix");
@@ -159,6 +165,40 @@ public Object multiStringConcatenation() throws IOException {
159165
return this.avm.eval(this.multiStringConcatenation);
160166
}
161167

168+
/**
169+
* Measures the optimized constant-folded case for four constant string
170+
* operands.
171+
*
172+
* @return expression result
173+
* @throws IOException if input preparation or evaluation fails
174+
*/
175+
@Benchmark
176+
public Object constantStringConcatenation() throws IOException {
177+
return this.avm.eval(this.constantStringConcatenation);
178+
}
179+
180+
/**
181+
* Measures alternating field and constant string concatenation.
182+
*
183+
* @return expression result
184+
* @throws IOException if input preparation or evaluation fails
185+
*/
186+
@Benchmark
187+
public Object stringConstantStringConstantConcatenation() throws IOException {
188+
return this.avm.eval(this.stringConstantStringConstantConcatenation);
189+
}
190+
191+
/**
192+
* Measures concatenation of four field string operands.
193+
*
194+
* @return expression result
195+
* @throws IOException if input preparation or evaluation fails
196+
*/
197+
@Benchmark
198+
public Object fourStringConcatenation() throws IOException {
199+
return this.avm.eval(this.fourStringConcatenation);
200+
}
201+
162202
/**
163203
* Measures mixed numeric, string, field, and regular expression operations.
164204
*

src/main/java/io/jawk/backend/AVM.java

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1177,6 +1177,29 @@ private void executeTuples(PositionTracker position)
11771177
position.next();
11781178
break;
11791179
}
1180+
case MULTI_CONCAT: {
1181+
// arg[0] = number of stack items to concatenate
1182+
// stack[0] = last concatenation operand
1183+
CountTuple countTuple = (CountTuple) tuple;
1184+
int count = (int) countTuple.getCount();
1185+
// Store String references so appends run left-to-right. Converting
1186+
// operands to char[] would copy them once before StringBuilder
1187+
// copies them again, and front-inserting would shift existing
1188+
// content on each operand.
1189+
String[] values = new String[count];
1190+
int resultLength = 0;
1191+
for (int i = count - 1; i >= 0; i--) {
1192+
values[i] = jrt.toAwkString(pop());
1193+
resultLength += values[i].length();
1194+
}
1195+
StringBuilder resultString = new StringBuilder(resultLength);
1196+
for (String value : values) {
1197+
resultString.append(value);
1198+
}
1199+
push(resultString.toString());
1200+
position.next();
1201+
break;
1202+
}
11801203
case ASSIGN:
11811204
case ASSIGN_NOPUSH: {
11821205
// arg[0] = offset

src/main/java/io/jawk/intermediate/AwkTuples.java

Lines changed: 63 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
import java.util.HashMap;
3333
import java.util.HashSet;
3434
import java.util.IdentityHashMap;
35+
import java.util.List;
3536
import java.util.Map;
3637
import java.util.Set;
3738
import java.util.function.Supplier;
@@ -69,7 +70,7 @@ public class AwkTuples implements Serializable {
6970
* can be serialized and patched efficiently. A linked list would make every
7071
* lookup O(n) and complicate address reassignment.
7172
*/
72-
private java.util.List<Tuple> queue = new ArrayList<Tuple>(100) {
73+
private List<Tuple> queue = new ArrayList<Tuple>(100) {
7374
private static final long serialVersionUID = -6334362156408598578L;
7475

7576
@Override
@@ -1878,17 +1879,36 @@ private boolean peepholeOptimizePass() {
18781879
return false;
18791880
}
18801881

1881-
java.util.List<Tuple> original = new ArrayList<Tuple>(queue);
1882+
List<Tuple> original = new ArrayList<Tuple>(queue);
18821883
int[] indexMapping = new int[originalSize];
18831884
Arrays.fill(indexMapping, -1);
1884-
java.util.List<Tuple> optimizedQueue = new ArrayList<Tuple>(originalSize);
1885+
List<Tuple> optimizedQueue = new ArrayList<Tuple>(originalSize);
18851886
boolean[] isAddressTarget = addressTargets(original, originalSize);
18861887

18871888
boolean modified = false;
18881889
int oldIndex = 0;
18891890
int newIndex = 0;
18901891
while (oldIndex < originalSize) {
18911892
Tuple tuple = original.get(oldIndex);
1893+
// If an earlier rewrite already happened in this pass, wait for the
1894+
// next pass before collapsing concat runs. That gives literal folding
1895+
// priority so fully constant chains become one PUSH_STRING instead of a
1896+
// partially folded PUSH_STRING plus MULTI_CONCAT.
1897+
ConcatRun concatRun = !modified ? concatRun(original, isAddressTarget, oldIndex) : null;
1898+
if (concatRun != null) {
1899+
// Chained concatenations compile as a run of binary CONCAT tuples
1900+
// after all operands have been pushed. Collapse that postfix run into
1901+
// one counted MULTI_CONCAT, e.g. CONCAT, CONCAT, CONCAT ->
1902+
// MULTI_CONCAT 4.
1903+
Tuple replacement = createMultiConcat(concatRun.itemCount, tuple.getLineNumber());
1904+
optimizedQueue.add(replacement);
1905+
mapFoldedRange(indexMapping, oldIndex, concatRun.tupleCount, newIndex);
1906+
oldIndex += concatRun.tupleCount;
1907+
newIndex++;
1908+
modified = true;
1909+
continue;
1910+
}
1911+
18921912
if (tuple.getOpcode() == Opcode.ASSIGN && (oldIndex + 1) < originalSize) {
18931913
Tuple nextTuple = original.get(oldIndex + 1);
18941914
// Statement assignments compile as ASSIGN followed by POP because
@@ -1987,7 +2007,7 @@ private boolean peepholeOptimizePass() {
19872007
return true;
19882008
}
19892009

1990-
private boolean[] addressTargets(java.util.List<Tuple> tuples, int tupleCount) {
2010+
private boolean[] addressTargets(List<Tuple> tuples, int tupleCount) {
19912011
boolean[] targets = new boolean[tupleCount];
19922012
for (Tuple tuple : tuples) {
19932013
Address address = tuple.getAddress();
@@ -2007,6 +2027,29 @@ private void mapFoldedRange(int[] indexMapping, int startIndex, int length, int
20072027
}
20082028
}
20092029

2030+
private ConcatRun concatRun(List<Tuple> original, boolean[] isAddressTarget, int oldIndex) {
2031+
Tuple tuple = original.get(oldIndex);
2032+
if (tuple.getOpcode() != Opcode.CONCAT || isAddressTarget[oldIndex]) {
2033+
return null;
2034+
}
2035+
2036+
int itemCount = 2;
2037+
int tupleCount = 1;
2038+
int currentIndex = oldIndex + 1;
2039+
while (currentIndex < original.size()
2040+
&& original.get(currentIndex).getOpcode() == Opcode.CONCAT
2041+
&& !isAddressTarget[currentIndex]) {
2042+
itemCount++;
2043+
tupleCount++;
2044+
currentIndex++;
2045+
}
2046+
2047+
if (tupleCount < 2) {
2048+
return null;
2049+
}
2050+
return new ConcatRun(tupleCount, itemCount);
2051+
}
2052+
20102053
private Object literalValue(Tuple tuple) {
20112054
switch (tuple.getOpcode()) {
20122055
case PUSH_LONG:
@@ -2162,6 +2205,22 @@ private Tuple createGetInputFieldConst(long fieldIndex, int lineNumber) {
21622205
return tuple;
21632206
}
21642207

2208+
private Tuple createMultiConcat(int itemCount, int lineNumber) {
2209+
Tuple tuple = new Tuple.CountTuple(Opcode.MULTI_CONCAT, itemCount);
2210+
tuple.setLineNumber(lineNumber);
2211+
return tuple;
2212+
}
2213+
2214+
private static final class ConcatRun {
2215+
private final int tupleCount;
2216+
private final int itemCount;
2217+
2218+
private ConcatRun(int tupleCount, int itemCount) {
2219+
this.tupleCount = tupleCount;
2220+
this.itemCount = itemCount;
2221+
}
2222+
}
2223+
21652224
private void remapAddresses(int[] indexMapping) {
21662225
if (indexMapping.length == 0) {
21672226
return;

src/main/java/io/jawk/intermediate/Opcode.java

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,17 @@ public enum Opcode {
196196
* Stack after: x-concatenated-with-y ...
197197
*/
198198
CONCAT,
199+
/**
200+
* Pops and concatenates N values from the top-of-stack after AWK string
201+
* conversion; pushes the result onto the stack. The number of items is passed
202+
* in as a tuple argument.
203+
* <p>
204+
* Argument: # of items (N)
205+
* <p>
206+
* Stack before: x1 x2 x3 .. xN ...<br/>
207+
* Stack after: x1-concatenated-through-xN ...
208+
*/
209+
MULTI_CONCAT,
199210
/**
200211
* Assigns the top-of-stack to a variable and pushes the assigned value back
201212
* onto the stack.

src/test/java/io/jawk/AwkTupleOptimizationTest.java

Lines changed: 99 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -136,9 +136,84 @@ public void foldsLiteralStringConcatenation() throws Exception {
136136
AwkProgram tuples = new Awk().compile(script);
137137
List<Opcode> opcodes = collectOpcodes(tuples);
138138
assertFalse("Literal concatenation should eliminate CONCAT tuple", opcodes.contains(Opcode.CONCAT));
139+
assertFalse("Literal concatenation should eliminate MULTI_CONCAT tuple", opcodes.contains(Opcode.MULTI_CONCAT));
139140
assertTrue("Expected folded literal push of foobar", hasLiteralPush(tuples, "foobar"));
140141
}
141142

143+
@Test
144+
public void foldsChainedLiteralStringConcatenation() throws Exception {
145+
String script = "BEGIN { print \"foo\" \"bar\" \"baz\" \"qux\" }\n";
146+
AwkTestSupport
147+
.awkTest("folds chained literal string concatenation")
148+
.script(script)
149+
.expect("foobarbazqux\n")
150+
.runAndAssert();
151+
152+
AwkProgram tuples = new Awk().compile(script);
153+
List<Opcode> opcodes = collectOpcodes(tuples);
154+
assertFalse("Chained literal concatenation should eliminate CONCAT tuple", opcodes.contains(Opcode.CONCAT));
155+
assertFalse(
156+
"Chained literal concatenation should eliminate MULTI_CONCAT tuple",
157+
opcodes.contains(Opcode.MULTI_CONCAT));
158+
assertTrue("Expected folded literal push of foobarbazqux", hasLiteralPush(tuples, "foobarbazqux"));
159+
}
160+
161+
@Test
162+
public void optimizesChainedStringConcatenationAsSingleMultiConcat() throws Exception {
163+
String script = "BEGIN { s1 = \"alpha\"; s2 = \"beta\"; print s1 \"-\" s2 \":\" }\n";
164+
AwkTestSupport
165+
.awkTest("counted chained string concatenation")
166+
.script(script)
167+
.expect("alpha-beta:\n")
168+
.runAndAssert();
169+
170+
AwkProgram tuples = new Awk().compile(script);
171+
assertEquals(
172+
"Expected one counted MULTI_CONCAT for the mixed chain",
173+
1,
174+
countOpcodeWithCount(tuples, Opcode.MULTI_CONCAT, 4));
175+
assertEquals("Optimized mixed chain should not keep binary CONCAT tuples", 0, countOpcode(tuples, Opcode.CONCAT));
176+
}
177+
178+
@Test
179+
public void keepsParserConcatenationBinaryWhenOptimizationDisabled() throws Exception {
180+
String script = "BEGIN { s1 = \"alpha\"; s2 = \"beta\"; print s1 \"-\" s2 \":\" }\n";
181+
AwkProgram tuples = new Awk().compile(script, true);
182+
183+
assertEquals(
184+
"Unoptimized parser output should keep one binary CONCAT per expression pair",
185+
3,
186+
countOpcode(tuples, Opcode.CONCAT));
187+
assertEquals(
188+
"Unoptimized parser output should not emit counted chain MULTI_CONCAT",
189+
0,
190+
countOpcode(tuples, Opcode.MULTI_CONCAT));
191+
}
192+
193+
@Test
194+
public void keepsConcatRunWhenFirstConcatIsBranchTarget() {
195+
AwkTuples tuples = new AwkTuples();
196+
tuples.pushSourceLineNumber(1);
197+
Address concatTarget = tuples.createAddress("concat-target");
198+
199+
tuples.dereference(1, false, true);
200+
tuples.ifFalse(concatTarget);
201+
tuples.dereference(2, false, true);
202+
tuples.dereference(3, false, true);
203+
tuples.address(concatTarget);
204+
tuples.concat();
205+
tuples.dereference(4, false, true);
206+
tuples.concat();
207+
208+
tuples.optimize();
209+
210+
assertEquals("Targeted CONCAT run should remain binary", 2, countOpcode(tuples, Opcode.CONCAT));
211+
assertEquals(
212+
"Targeted CONCAT run should not be folded into MULTI_CONCAT",
213+
0,
214+
countOpcode(tuples, Opcode.MULTI_CONCAT));
215+
}
216+
142217
@Test
143218
public void foldsScalarAssignmentPopIntoNonPushingAssignment() throws Exception {
144219
String script = "BEGIN { a = -2; b = 2; c = 4; print a + b + c }\n";
@@ -204,6 +279,11 @@ public void doesNotFoldNumericConcatenation() throws Exception {
204279
AwkProgram tuples = new Awk().compile(script);
205280
List<Opcode> opcodes = collectOpcodes(tuples);
206281
assertTrue("Numeric literal concatenation should preserve CONCAT tuple", opcodes.contains(Opcode.CONCAT));
282+
assertEquals("Numeric literal concatenation should remain binary", 1, countOpcode(tuples, Opcode.CONCAT));
283+
assertEquals(
284+
"Binary numeric literal concatenation should not use MULTI_CONCAT",
285+
0,
286+
countOpcode(tuples, Opcode.MULTI_CONCAT));
207287
assertFalse("Optimizer should not fold numeric/string concatenation", hasLiteralPush(tuples, "1x"));
208288
}
209289

@@ -559,8 +639,12 @@ private static boolean hasAddressTargetWithPredecessor(AwkProgram tuples, Opcode
559639
}
560640

561641
private static int countOpcode(AwkProgram tuples, Opcode opcode) {
642+
return countOpcode(rawTuples(tuples), opcode);
643+
}
644+
645+
private static int countOpcode(AwkTuples tuples, Opcode opcode) {
562646
int count = 0;
563-
PositionTracker tracker = rawTuples(tuples).top();
647+
PositionTracker tracker = tuples.top();
564648
while (!tracker.isEOF()) {
565649
if (tracker.opcode() == opcode) {
566650
count++;
@@ -570,6 +654,20 @@ private static int countOpcode(AwkProgram tuples, Opcode opcode) {
570654
return count;
571655
}
572656

657+
private static int countOpcodeWithCount(AwkProgram tuples, Opcode opcode, long expectedCount) {
658+
int count = 0;
659+
PositionTracker tracker = rawTuples(tuples).top();
660+
while (!tracker.isEOF()) {
661+
if (tracker.opcode() == opcode
662+
&& tracker.current() instanceof Tuple.CountTuple
663+
&& ((Tuple.CountTuple) tracker.current()).getCount() == expectedCount) {
664+
count++;
665+
}
666+
tracker.next();
667+
}
668+
return count;
669+
}
670+
573671
private static String dumpTuples(AwkProgram tuples) throws Exception {
574672
ByteArrayOutputStream out = new ByteArrayOutputStream();
575673
try (PrintStream ps = new PrintStream(out, true, StandardCharsets.UTF_8.name())) {

0 commit comments

Comments
 (0)