From b24494ff8471782a98c67e429bef5ca44ea18dae Mon Sep 17 00:00:00 2001 From: Arun Pandian Date: Fri, 10 Apr 2026 04:42:47 +0000 Subject: [PATCH 1/7] Change TriggerState finished bitset coder to a SentinelBitSetCoder SentinelBitSetCoder is same as BitSetCoder except that it encodes empty bitset as a single element 0 byte array. This allows checking if the finished bitset is empty or missing. SentinelBitSetCoder and BitSetCoder are state compatible. Both coders can decode encoded bytes from the other coder successfully. --- CHANGES.md | 4 + .../serialization/SentinelBitSetCoder.java | 78 +++++++++ .../triggers/TriggerStateMachineRunner.java | 4 +- .../SentinelBitSetCoderTest.java | 150 ++++++++++++++++++ 4 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 runners/core-java/src/main/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoder.java create mode 100644 runners/core-java/src/test/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoderTest.java diff --git a/CHANGES.md b/CHANGES.md index 319520f94309..58cc8d16d088 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -69,6 +69,10 @@ ## New Features / Improvements * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* TriggerStateMachineRunner changes from BitSetCoder to SentinelBitSetCoder to + encode finished bitset [#38139](https://github.com/apache/beam/pull/38139). + SentinelBitSetCoder and BitSetCoder are state compatible. Both coders can + decode encoded bytes from the other coder. 
## Breaking Changes diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoder.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoder.java new file mode 100644 index 000000000000..e9f0582c5ca9 --- /dev/null +++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoder.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.core.serialization; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.BitSet; +import org.apache.beam.sdk.coders.AtomicCoder; +import org.apache.beam.sdk.coders.ByteArrayCoder; +import org.apache.beam.sdk.coders.CoderException; + +/** + * Coder for {@link BitSet} that stores an empty bit set as a byte array with a single 0 element. 
+ */ +public class SentinelBitSetCoder extends AtomicCoder { + private static final SentinelBitSetCoder INSTANCE = new SentinelBitSetCoder(); + private static final ByteArrayCoder BYTE_ARRAY_CODER = ByteArrayCoder.of(); + + private SentinelBitSetCoder() {} + + public static SentinelBitSetCoder of() { + return INSTANCE; + } + + @Override + public void encode(BitSet value, OutputStream outStream) throws CoderException, IOException { + encode(value, outStream, Context.NESTED); + } + + @Override + public void encode(BitSet value, OutputStream outStream, Context context) + throws CoderException, IOException { + if (value == null) { + throw new CoderException("cannot encode a null BitSet"); + } + byte[] bytes = value.isEmpty() ? new byte[] {0} : value.toByteArray(); + BYTE_ARRAY_CODER.encodeAndOwn(bytes, outStream, context); + } + + @Override + public BitSet decode(InputStream inStream) throws CoderException, IOException { + return decode(inStream, Context.NESTED); + } + + @Override + public BitSet decode(InputStream inStream, Context context) throws CoderException, IOException { + return BitSet.valueOf(BYTE_ARRAY_CODER.decode(inStream, context)); + } + + @Override + public void verifyDeterministic() throws NonDeterministicException { + verifyDeterministic( + this, + "SentinelBitSetCoder requires its ByteArrayCoder to be deterministic.", + BYTE_ARRAY_CODER); + } + + @Override + public boolean consistentWithEquals() { + return true; + } +} diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/triggers/TriggerStateMachineRunner.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/triggers/TriggerStateMachineRunner.java index cf29646ebaa3..e3791821b728 100644 --- a/runners/core-java/src/main/java/org/apache/beam/runners/core/triggers/TriggerStateMachineRunner.java +++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/triggers/TriggerStateMachineRunner.java @@ -26,7 +26,7 @@ import org.apache.beam.runners.core.StateAccessor; 
import org.apache.beam.runners.core.StateTag; import org.apache.beam.runners.core.StateTags; -import org.apache.beam.sdk.coders.BitSetCoder; +import org.apache.beam.runners.core.serialization.SentinelBitSetCoder; import org.apache.beam.sdk.state.Timers; import org.apache.beam.sdk.state.ValueState; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; @@ -59,7 +59,7 @@ public class TriggerStateMachineRunner { @VisibleForTesting public static final StateTag> FINISHED_BITS_TAG = - StateTags.makeSystemTagInternal(StateTags.value("closed", BitSetCoder.of())); + StateTags.makeSystemTagInternal(StateTags.value("closed", SentinelBitSetCoder.of())); private final ExecutableTriggerStateMachine rootTrigger; private final TriggerStateMachineContextFactory contextFactory; diff --git a/runners/core-java/src/test/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoderTest.java b/runners/core-java/src/test/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoderTest.java new file mode 100644 index 000000000000..91d7d369f70b --- /dev/null +++ b/runners/core-java/src/test/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoderTest.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.core.serialization; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; + +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; +import org.apache.beam.sdk.coders.BitSetCoder; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.Coder.Context; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.beam.sdk.testing.CoderProperties; +import org.apache.beam.sdk.util.CoderUtils; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link SentinelBitSetCoder}. */ +@RunWith(JUnit4.class) +public class SentinelBitSetCoderTest { + + private static final Coder TEST_CODER = SentinelBitSetCoder.of(); + + private static final List TEST_VALUES = + Arrays.asList( + BitSet.valueOf(new byte[] {0xa, 0xb, 0xc}), + BitSet.valueOf(new byte[] {0xd, 0x3}), + BitSet.valueOf(new byte[] {0xd, 0xe}), + BitSet.valueOf(new byte[] {0}), + BitSet.valueOf(new byte[] {})); + + @Test + public void testDecodeEncodeEquals() throws Exception { + for (BitSet value : TEST_VALUES) { + CoderProperties.coderDecodeEncodeEqual(TEST_CODER, value); + } + } + + @Test + public void testRegisterByteSizeObserver() throws Exception { + CoderProperties.testByteCount( + SentinelBitSetCoder.of(), Coder.Context.OUTER, TEST_VALUES.toArray(new BitSet[] {})); + + CoderProperties.testByteCount( + SentinelBitSetCoder.of(), Coder.Context.NESTED, TEST_VALUES.toArray(new BitSet[] {})); + } + + @Test + public void testStructuralValueConsistentWithEquals() throws Exception { + for (BitSet value1 : TEST_VALUES) { + for (BitSet value2 : TEST_VALUES) { + 
+ CoderProperties.structuralValueConsistentWithEquals(TEST_CODER, value1, value2); + } + } + } + + /** + * Generated data to check that the wire format has not changed. "CgsM" is {0xa, 0xb, 0xc} "DQM" + * is {0xd, 0x3} "DQ4" is {0xd, 0xe} "AA" is {0} (Sentinel for empty BitSet) + */ + private static final List TEST_ENCODINGS = + Arrays.asList("CgsM", "DQM", "DQ4", "AA", "AA"); + + @Test + public void testWireFormatEncode() throws Exception { + CoderProperties.coderEncodesBase64(TEST_CODER, TEST_VALUES, TEST_ENCODINGS); + } + + @Rule public ExpectedException thrown = ExpectedException.none(); + + @Test + public void encodeNullThrowsCoderException() throws Exception { + thrown.expect(CoderException.class); + thrown.expectMessage("cannot encode a null BitSet"); + + CoderUtils.encodeToBase64(TEST_CODER, null); + } + + @Test + public void testEncodedTypeDescriptor() throws Exception { + assertThat(TEST_CODER.getEncodedTypeDescriptor(), equalTo(TypeDescriptor.of(BitSet.class))); + } + + @Test + public void testEmptyBitSetEncoding() throws Exception { + { + byte[] encoded = CoderUtils.encodeToByteArray(TEST_CODER, new BitSet()); + // ByteArrayCoder in OUTER context encodes as is. 
+ assertThat(encoded, equalTo(new byte[] {0})); + } + { + byte[] encoded = CoderUtils.encodeToByteArray(TEST_CODER, new BitSet(), Context.NESTED); + // Varint length = 1, data = 0 + assertThat(encoded, equalTo(new byte[] {1, 0})); + } + } + + @Test + public void testCompatibilityWithBitSetCoder() throws Exception { + BitSetCoder bitSetCoder = BitSetCoder.of(); + SentinelBitSetCoder sentinelCoder = SentinelBitSetCoder.of(); + + for (BitSet bitset : TEST_VALUES) { + for (Coder.Context context : Arrays.asList(Coder.Context.OUTER, Coder.Context.NESTED)) { + // Test SentinelBitSetCoder can decode bytes encoded by BitSetCoder + { + byte[] encodedByBitSet = CoderUtils.encodeToByteArray(bitSetCoder, bitset, context); + BitSet decodedBySentinel = + CoderUtils.decodeFromByteArray(sentinelCoder, encodedByBitSet, context); + assertThat( + "Decoding BitSetCoder encoded value with context " + context, + decodedBySentinel, + equalTo(bitset)); + } + + // Test BitSetCoder can decode bytes encoded by SentinelBitSetCoder + { + byte[] encodedBySentinel = CoderUtils.encodeToByteArray(sentinelCoder, bitset, context); + BitSet decodedByBitSet = + CoderUtils.decodeFromByteArray(bitSetCoder, encodedBySentinel, context); + assertThat( + "Decoding SentinelBitSetCoder encoded value with context " + context, + decodedByBitSet, + equalTo(bitset)); + } + } + } + } +} From a4ff2c6ef010313064620fa9e43e121ec26d2e17 Mon Sep 17 00:00:00 2001 From: Arun Pandian Date: Fri, 10 Apr 2026 05:06:24 +0000 Subject: [PATCH 2/7] fix style --- CHANGES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 58cc8d16d088..acdce4928d72 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -99,6 +99,7 @@ ## Highlights + ## I/Os * DebeziumIO (Java): added `OffsetRetainer` interface and `FileSystemOffsetRetainer` implementation to persist and restore CDC offsets across pipeline restarts, and exposed `withStartOffset` / `withOffsetRetainer` on `DebeziumIO.Read` and the cross-language 
`ReadBuilder` ([#28248](https://github.com/apache/beam/issues/28248)). @@ -2422,4 +2423,4 @@ Schema Options, it will be removed in version `2.23.0`. ([BEAM-9704](https://iss ## Highlights -- For versions 2.19.0 and older release notes are available on [Apache Beam Blog](https://beam.apache.org/blog/). +- For versions 2.19.0 and older release notes are available on [Apache Beam Blog](https://beam.apache.org/blog/). \ No newline at end of file From c55b4cbb8691e61f88f150bedd048d0e73a03ed1 Mon Sep 17 00:00:00 2001 From: Arun Pandian Date: Fri, 10 Apr 2026 05:29:46 +0000 Subject: [PATCH 3/7] fix style --- CHANGES.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index acdce4928d72..58cc8d16d088 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -99,7 +99,6 @@ ## Highlights - ## I/Os * DebeziumIO (Java): added `OffsetRetainer` interface and `FileSystemOffsetRetainer` implementation to persist and restore CDC offsets across pipeline restarts, and exposed `withStartOffset` / `withOffsetRetainer` on `DebeziumIO.Read` and the cross-language `ReadBuilder` ([#28248](https://github.com/apache/beam/issues/28248)). @@ -2423,4 +2422,4 @@ Schema Options, it will be removed in version `2.23.0`. ([BEAM-9704](https://iss ## Highlights -- For versions 2.19.0 and older release notes are available on [Apache Beam Blog](https://beam.apache.org/blog/). \ No newline at end of file +- For versions 2.19.0 and older release notes are available on [Apache Beam Blog](https://beam.apache.org/blog/). From cf5050ea348ffb7f8270205d3ea2744ee139c82a Mon Sep 17 00:00:00 2001 From: Arun Pandian Date: Fri, 10 Apr 2026 05:47:33 +0000 Subject: [PATCH 4/7] fix style --- CHANGES.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 58cc8d16d088..ca76cc9d6a8b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -70,9 +70,9 @@ * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). 
* TriggerStateMachineRunner changes from BitSetCoder to SentinelBitSetCoder to - encode finished bitset [#38139](https://github.com/apache/beam/pull/38139). - SentinelBitSetCoder and BitSetCoder are state compatible. Both coders can - decode encoded bytes from the other coder. + encode finished bitset. SentinelBitSetCoder and BitSetCoder are state + compatible. Both coders can decode encoded bytes from the other coder + [#38139](https://github.com/apache/beam/pull/38139). ## Breaking Changes From 848617b5e199734c951bba451a8dcf6790e7d7ee Mon Sep 17 00:00:00 2001 From: Arun Pandian Date: Fri, 10 Apr 2026 07:26:03 +0000 Subject: [PATCH 5/7] fix style --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index ca76cc9d6a8b..44c4620229b7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -72,7 +72,7 @@ * TriggerStateMachineRunner changes from BitSetCoder to SentinelBitSetCoder to encode finished bitset. SentinelBitSetCoder and BitSetCoder are state compatible. Both coders can decode encoded bytes from the other coder - [#38139](https://github.com/apache/beam/pull/38139). + [#38139](https://github.com/apache/beam/issues/38139). ## Breaking Changes From 493fa8f61a174aea96da079a56a7a156de0acbd5 Mon Sep 17 00:00:00 2001 From: Arun Pandian Date: Fri, 10 Apr 2026 08:00:12 +0000 Subject: [PATCH 6/7] fix style --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 44c4620229b7..aa9a49a16e68 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -72,7 +72,7 @@ * TriggerStateMachineRunner changes from BitSetCoder to SentinelBitSetCoder to encode finished bitset. SentinelBitSetCoder and BitSetCoder are state compatible. Both coders can decode encoded bytes from the other coder - [#38139](https://github.com/apache/beam/issues/38139). + ([#38139](https://github.com/apache/beam/issues/38139)). 
## Breaking Changes From 0d929f146fac101a38adc1582f55463109843817 Mon Sep 17 00:00:00 2001 From: Arun Pandian Date: Sat, 11 Apr 2026 09:01:36 +0000 Subject: [PATCH 7/7] address comment --- .../beam/runners/core/serialization/SentinelBitSetCoder.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoder.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoder.java index e9f0582c5ca9..340816f6e0e1 100644 --- a/runners/core-java/src/main/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoder.java +++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/serialization/SentinelBitSetCoder.java @@ -26,9 +26,12 @@ import org.apache.beam.sdk.coders.CoderException; /** - * Coder for {@link BitSet} that stores an empty bit set as a byte array with a single 0 element. + * Coder for {@link BitSet} that stores an empty bit set as a byte array with a single 0 element. In + * general BitSetCoder should be preferred as it encodes an empty bit set as an empty byte array. + * However, there are cases where non-empty values are useful to indicate presence. */ public class SentinelBitSetCoder extends AtomicCoder { + private static final SentinelBitSetCoder INSTANCE = new SentinelBitSetCoder(); private static final ByteArrayCoder BYTE_ARRAY_CODER = ByteArrayCoder.of();