From ae4f68ec99b2a3331fb55d92dee889ebe458e286 Mon Sep 17 00:00:00 2001 From: SubramanyaV Date: Thu, 2 Apr 2026 10:06:47 +0530 Subject: [PATCH 1/5] Added the missing main coder.java file for reviewing --- .../org/apache/beam/sdk/coders/Coder.java | 148 ++++++++++++++---- 1 file changed, 117 insertions(+), 31 deletions(-) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java index 0a3650ca133b..6cf0af72cc57 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java @@ -37,41 +37,99 @@ import org.checkerframework.checker.nullness.qual.Nullable; /** - * A {@link Coder Coder<T>} defines how to encode and decode values of type {@code T} into - * byte streams. + * A {@link Coder} defines how values of type {@code T} are encoded into bytes and decoded back into + * objects. * - *

{@link Coder} instances are serialized during job creation and deserialized before use. This - * will generally be performed by serializing the object via Java Serialization. + *

Coders are used by Beam to serialize data when it is transferred between transforms, + * persisted, or sent across process boundaries. * - *

{@link Coder} classes for compound types are often composed from coder classes for types - * contains therein. The composition of {@link Coder} instances into a coder for the compound class - * is the subject of the {@link CoderProvider} type, which enables automatic generic composition of - * {@link Coder} classes within the {@link CoderRegistry}. See {@link CoderProvider} and {@link - * CoderRegistry} for more information about how coders are inferred. + *

The {@link #encode(Object, OutputStream, Context)} and {@link #decode(InputStream, Context)} + * methods must be consistent: values encoded by {@code encode} must be correctly reconstructed by + * {@code decode}. * - *

All methods of a {@link Coder} are required to be thread safe. + *

The {@link Context} parameter specifies whether the value is encoded as a top-level element or + * as part of a larger structure. This affects whether additional information (such as length + * prefixes) is required to ensure that encoded values can be unambiguously decoded. * - * @param the type of values being encoded and decoded + *

For example: + * + *

+ * + *

Coder implementations must be: + * + *

+ * + * @param the type of values handled by this {@link Coder} */ public abstract class Coder implements Serializable { /** - * The context in which encoding or decoding is being done. + * Represents the context in which encoding or decoding is performed. + * + *

The {@link Context} determines whether the value being encoded or decoded is part of a + * larger structure or is the outermost value in the stream. + * + *

This distinction is important because some coders need to include additional information + * (such as length prefixes) when values are nested inside other structures, but can omit them + * when operating on the outermost level. + * + *

There are two standard contexts: + * + *

    + *
  • {@link #OUTER} – Indicates that the value occupies the remainder of the input or output + * stream. In this case, coders may omit length information because the boundaries are + * implicitly known. + *
  • {@link #NESTED} – Indicates that the value is encoded as part of a larger structure. + * Coders must ensure that the encoded value is self-delimiting, typically by including + * length prefixes or other boundary markers. + *
+ * + *

For example: + * + *

    + *
  • When encoding a top-level element in a file → use {@code OUTER} + *
  • When encoding elements inside a collection (e.g., list, KV, etc.) → use {@code NESTED} + *
+ * + *

Correct usage of {@link Context} ensures that encoded data can be safely and correctly + * decoded without ambiguity. * - * @deprecated To implement a coder, do not use any {@link Context}. Just implement only those - * abstract methods which do not accept a {@link Context} and leave the default - * implementations for methods accepting a {@link Context}. + *

Note: Most coder implementations do not need to manually manage {@link Context}. They + * should delegate to component coders with the appropriate context when encoding nested + * structures. */ @Deprecated public static class Context { /** - * The outer context: the value being encoded or decoded takes up the remainder of the - * record/stream contents. + * The outer context indicates that the value being encoded or decoded occupies the remainder of + * the input or output stream. + * + *

In this context, the boundaries of the value are implicitly known, so coders do not need + * to include additional length information or delimiters when encoding. + * + *

This is typically used for top-level values, such as elements written directly to a file + * or stream. */ public static final Context OUTER = new Context(true); /** - * The nested context: the value being encoded or decoded is (potentially) a part of a larger - * record/stream contents, and may have other parts encoded or decoded after it. + * The nested context indicates that the value being encoded or decoded is part of a larger + * structure and does not occupy the entire stream. + * + *

In this context, coders must ensure that the encoded value is self-delimiting, typically + * by including length prefixes or other boundary markers, so that subsequent data in the stream + * can be correctly decoded. + * + *

This is commonly used when encoding elements inside collections, key-value pairs, or other + * composite data structures. */ public static final Context NESTED = new Context(false); @@ -112,13 +170,28 @@ public String toString() { } /** - * Encodes the given value of type {@code T} onto the given output stream. Multiple elements can - * be encoded next to each other on the output stream, each coder should encode information to - * know how many bytes to read when decoding. A common approach is to prefix the encoding with the - * element's encoded length. + * Encodes the given value of type {@code T} onto the provided output stream. * - * @throws IOException if writing to the {@code OutputStream} fails for some reason - * @throws CoderException if the value could not be encoded for some reason + *

The encoding must be consistent with {@link #decode}, such that values + * written by this method can be correctly reconstructed. + * + *

In the overload that accepts a {@link Context}, the context determines the encoding: + * + *

    + *
  • In {@link Context#OUTER}, the value is written as a top-level element and may omit length + * prefixes or delimiters since it consumes the remainder of the stream. + *
  • In {@link Context#NESTED}, the value is part of a larger structure and must include + * sufficient boundary information (such as length prefixes) to allow correct decoding of + * subsequent data. + *
+ * + *

Implementations must ensure that the encoding is unambiguous and that multiple encoded + * values can be safely concatenated and decoded in sequence. + * + * @param value the value to encode + * @param outStream the output stream to write the encoded bytes to + * @throws IOException if writing to the stream fails + * @throws CoderException if the value cannot be encoded */ public abstract void encode(T value, OutputStream outStream) throws CoderException, IOException; @@ -136,13 +209,26 @@ public void encode(T value, OutputStream outStream, Context context) } /** - * Decodes a value of type {@code T} from the given input stream in the given context. Returns the - * decoded value. Multiple elements can be encoded next to each other on the input stream, each - * coder should encode information to know how many bytes to read when decoding. A common approach - * is to prefix the encoding with the element's encoded length. + * Decodes a value of type {@code T} from the given input stream. * - * @throws IOException if reading from the {@code InputStream} fails for some reason - * @throws CoderException if the value could not be decoded for some reason + *

The decoding must be consistent with {@link #encode}, such that values encoded by this coder + * can be correctly reconstructed. + * + *

When multiple values are encoded sequentially in a stream, implementations must read exactly + * the bytes corresponding to a single encoded value and no more. This ensures that subsequent + * values in the stream can be decoded correctly. + * + *

Depending on how the value was encoded, the implementation may rely on implicit boundaries + * (for outer context) or explicit boundary information such as length prefixes (for nested + * context). + * + *

Implementations must ensure that decoding is unambiguous and does not consume bytes beyond + * the encoded representation of the value. + * + * @param inStream the input stream to read the encoded value from + * @return the decoded value + * @throws IOException if reading from the stream fails + * @throws CoderException if the value cannot be decoded */ public abstract T decode(InputStream inStream) throws CoderException, IOException; From 9981c4629e1183c75891f7c68022d9e3bf4778dc Mon Sep 17 00:00:00 2001 From: SubramanyaV Date: Sun, 5 Apr 2026 17:49:03 +0530 Subject: [PATCH 2/5] remove beam-env from repo and ignore it --- .gitignore | Bin 3882 -> 3904 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/.gitignore b/.gitignore index e9fe331cb3166132f5dbf5d213f081273e41236a..c73e08c9b1174f47a2da3b014a325c070652824c 100644 GIT binary patch delta 30 kcmZ1_cR+4~7Qa{$Ln=ceLoS0Zkj`T$W6)>dW#D1}0C#u=_W%F@ delta 7 OcmX>gw@Pk<7C!(CI0BFW From bda20e90e975b904ccd9bf9c4a6cbd7b6db79d36 Mon Sep 17 00:00:00 2001 From: SubramanyaV Date: Sun, 5 Apr 2026 20:59:41 +0530 Subject: [PATCH 3/5] Add missing @deprecated Javadoc for context --- .../core/src/main/java/org/apache/beam/sdk/coders/Coder.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java index 6cf0af72cc57..93507eb1c2bd 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java @@ -105,6 +105,9 @@ public abstract class Coder implements Serializable { *

Note: Most coder implementations do not need to manually manage {@link Context}. They * should delegate to component coders with the appropriate context when encoding nested * structures. + * + * @deprecated This class is deprecated. Use the newer encoding context mechanisms provided by the + * beam SDK instead. */ @Deprecated public static class Context { From 25af774d8c3875a4c9e640e11d022a41236eb8b1 Mon Sep 17 00:00:00 2001 From: SubramanyaV Date: Sun, 5 Apr 2026 23:25:23 +0530 Subject: [PATCH 4/5] fix deprecated javadoc in Coder.java properly --- .../core/src/main/java/org/apache/beam/sdk/coders/Coder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java index 93507eb1c2bd..7a7d30433d4d 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java @@ -106,8 +106,8 @@ public abstract class Coder implements Serializable { * should delegate to component coders with the appropriate context when encoding nested * structures. * - * @deprecated This class is deprecated. Use the newer encoding context mechanisms provided by the - * beam SDK instead. + * @deprecated This class is deprecated and will be removed in a future release. + * Use {@link Coder.Context} alternatives provided by the Beam SDK instead. 
*/ @Deprecated public static class Context { From d814ab70040c43ebd773afa0cf75c91a228d974b Mon Sep 17 00:00:00 2001 From: SubramanyaV Date: Fri, 17 Apr 2026 23:02:04 +0530 Subject: [PATCH 5/5] improved javadoc documentation --- .../main/java/org/apache/beam/sdk/coders/Coder.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java index 7a7d30433d4d..ac5d1a0582b3 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/Coder.java @@ -69,6 +69,16 @@ * * * @param the type of values handled by this {@link Coder} + * + *

The behavior of encoding and decoding depends on the {@link Context}. + * + *

    + *
  • In {@link Context#OUTER}, the value consumes the remainder of the stream. + *
  • In {@link Context#NESTED}, the value is part of a larger structure and must be + * self-delimiting so that subsequent values can be correctly decoded. + *
+ * + *

See {@link org.apache.beam.sdk.testing.CoderProperties} for utilities to test coder + * correctness and consistency. */ public abstract class Coder implements Serializable { /**