@@ -48,7 +48,7 @@ def start(self):
4848 ...
4949
5050 def advance(self):
51- # Move to the next record; ``False`` means no data right now (not EOF) .
51+ # Move to the next record; ``False`` means no data is available now, not end of stream.
5252 ...
5353
5454 def get_current(self):
@@ -164,7 +164,7 @@ def start(self) -> bool:
164164 raise NotImplementedError
165165
166166 def advance (self ) -> bool :
167- """Advances to the next record. ``False`` == no data *now*, not EOF ."""
167+ """Advances to the next record. ``False`` means no data is available now, not end of stream."""
168168 raise NotImplementedError
169169
170170 def get_current (self ) -> Any :
@@ -176,7 +176,7 @@ def get_current_timestamp(self) -> Timestamp:
176176 raise NotImplementedError
177177
178178 def get_watermark (self ) -> Timestamp :
179- """A best-effort lower bound on timestamps of future records.
179+ """An approximate lower bound on timestamps of future records.
180180
181181 Treated as monotonic by the wrapper. Return ``MAX_TIMESTAMP`` to signal that
182182 this reader has permanently finished.
@@ -222,8 +222,8 @@ def create_reader(
222222 produces the very first record of the source (or returns ``False`` if
223223 none yet).
224224 * When ``checkpoint_mark`` is not ``None``, the returned reader's
225- ``start()`` produces the FIRST record strictly AFTER the position
226- encoded by ``checkpoint_mark``. The reader must NOT re-deliver records
225+ ``start()`` produces the first record strictly after the position
226+ encoded by ``checkpoint_mark``. The reader must not re-deliver records
227227 already covered by the prior bundle.
228228 """
229229 raise NotImplementedError
@@ -265,7 +265,7 @@ class _UnboundedSourceRestriction(object):
265265
266266 Field roles:
267267 * ``checkpoint_mark`` -- RESUME state. A reader rebuilt from this mark
268- MUST produce the FIRST record strictly AFTER it (i.e. no re-delivery) .
268+ must produce the first record strictly after it.
269269 * ``finalization_checkpoint_mark`` -- COMMIT hook. Only set on a done
270270 primary that was just cut this bundle. Registered with the runner's
271271 bundle finalizer to acknowledge upstream. Independent of
@@ -284,12 +284,11 @@ class _UnboundedSourceRestrictionCoder(Coder):
284284
285285 Stateless: at encode time the source's own
286286 :meth:`UnboundedSource.get_checkpoint_mark_coder` is looked up from the
287- restriction; at decode time the source is decoded FIRST and its coder
287+ restriction; at decode time the source is decoded first and its coder
288288 drives the checkpoint-mark decoding. This avoids passing source-specific
289289 coder state into the coder's constructor, which in turn lets
290290 :class:`_UnboundedSourceRestrictionProvider` and
291- :class:`_ReadFromUnboundedSourceDoFn` be module-level classes (avoiding
292- stdlib-pickle gotchas for closure-defined DoFns on some runners).
291+ :class:`_ReadFromUnboundedSourceDoFn` be module-level classes.
293292
294293 Wire shape: source_bytes / checkpoint_bytes / watermark / done /
295294 finalization_checkpoint_bytes -- the checkpoint and finalization bytes
@@ -432,8 +431,7 @@ def _try_claim_inner(self, out: list[Any]) -> bool:
432431 self ._checkpoint_taken = True
433432 out [0 ] = _NO_DATA
434433 return False
435- # No data right now (not EOF): refresh the watermark so process() can
436- # advance it before deferring, then let process() self-checkpoint.
434+ # No data is available now. Refresh the watermark before deferring.
437435 self ._restriction = dataclasses .replace (
438436 self ._restriction , watermark = watermark )
439437 out [0 ] = _NO_DATA
@@ -514,9 +512,8 @@ class _UnboundedSourceRestrictionProvider(core.RestrictionProvider):
514512 Stateless module-level singleton (see :data:`_PROVIDER`): all
515513 source-specific state (e.g. the source's checkpoint coder) is derived
516514 per-call from the restriction's ``source`` field, which lets
517- :class:`_ReadFromUnboundedSourceDoFn` live at module level too -- avoiding
518- stdlib-pickle gotchas for closure-defined DoFns. The provider currently
519- passes ``None`` for the ``options`` forwarded to
515+ :class:`_ReadFromUnboundedSourceDoFn` live at module level too. The provider
516+ currently passes ``None`` for the ``options`` forwarded to
520517 :meth:`UnboundedSource.split`.
521518 """
522519 def __init__ (self ):
@@ -594,9 +591,8 @@ def truncate(self, element, restriction):
594591class _ReadFromUnboundedSourceDoFn (core .DoFn ):
595592 """SDF wrapper driving an :class:`UnboundedReader` for one restriction.
596593
597- Module-level (not nested inside ``ReadFromUnboundedSource.expand``) so stdlib
598- ``pickle`` -- not just cloudpickle -- can serialise the DoFn. The restriction
599- provider is the module-level :data:`_PROVIDER` singleton.
594+ Module-level so stdlib pickle and cloudpickle can serialise the DoFn. The
595+ restriction provider is the module-level :data:`_PROVIDER` singleton.
600596 """
601597 @core .DoFn .unbounded_per_element ()
602598 def process (
@@ -621,15 +617,15 @@ def process(
621617 break
622618 record = holder [0 ]
623619 if record is _NO_DATA :
624- # No data right now: advance the watermark and self-checkpoint so
625- # the runner reschedules us after a short delay.
620+ # No data is available now: advance the watermark and self-checkpoint
621+ # so the runner reschedules us after a short delay.
626622 _set_watermark_if_greater (
627623 watermark_estimator , tracker .current_restriction ().watermark )
628624 tracker .defer_remainder (
629625 Duration (seconds = _DEFAULT_POLL_INTERVAL_SECONDS ))
630626 break
631- # Advance the estimator with the source watermark (third slot), not
632- # the record's event time.
627+ # Advance the estimator with the source watermark (third tuple field),
628+ # not the record timestamp, which is only the output event time.
633629 value , record_timestamp , source_watermark = record
634630 _set_watermark_if_greater (watermark_estimator , source_watermark )
635631 yield TimestampedValue (value , record_timestamp )
@@ -641,7 +637,7 @@ def process(
641637 if current is not initial and finalize_mark is not None :
642638 bundle_finalizer .register (finalize_mark .finalize_checkpoint )
643639 finally :
644- # Best-effort reader release for the downstream-yield-raised path .
640+ # Best-effort reader release; also runs when a downstream yield raises.
645641 inner_tracker = tracker
646642 if hasattr (inner_tracker , '_threadsafe_restriction_tracker' ):
647643 inner_tracker = inner_tracker ._threadsafe_restriction_tracker
@@ -668,7 +664,7 @@ def _set_watermark_if_greater(
668664
669665
670666class ReadFromUnboundedSource (PTransform ):
671- """Reads an :class:`UnboundedSource` via a Splittable ``DoFn`` .
667+ """Reads an :class:`UnboundedSource`.
672668
673669 Most users should prefer :class:`apache_beam.io.Read`, which dispatches an
674670 ``UnboundedSource`` here automatically::
@@ -692,8 +688,8 @@ def expand(self, pbegin):
692688 | 'Create' >> core .Create ([source ])
693689 | 'ReadUnbounded' >> core .ParDo (_ReadFromUnboundedSourceDoFn ()))
694690 # Surface an element type only when the global registry already maps it to
695- # an equivalent coder; we don't mutate ``coders.registry`` (can't register a
696- # parameterized coder by class without leaking/losing state) .
691+ # an equivalent coder. Avoid mutating ``coders.registry`` for a
692+ # parameterized coder whose instance state would be lost .
697693 try :
698694 type_hint = output_coder .to_type_hint ()
699695 except NotImplementedError :
0 commit comments