feat(stdlib): add ChunkingStrategy ABC and built-in chunkers (#923)

planetf1 · web-flow · commit 4b27e2ecebf9 · 2026-04-27T12:24:52.000Z
* feat(stdlib): add ChunkingStrategy ABC and built-in chunkers Adds mellea/stdlib/chunking.py with ChunkingStrategy ABC and three built-in implementations: SentenceChunker, WordChunker, ParagraphChunker. split(accumulated_text) returns complete chunks, holding back trailing fragments for the next call. Closes #899 Part of #891 Assisted-by: Claude Code Signed-off-by: Nigel Jones <jonesn@uk.ibm.com> * fix(test): remove unused pytest import from test_chunking Pyright flagged import as unaccessed; no pytest.* calls in the file. Assisted-by: Claude Code Signed-off-by: Nigel Jones <jonesn@uk.ibm.com> * fix(stdlib): address PR #923 review feedback on ChunkingStrategy - Fix SentenceChunker whitespace leak: lstrip() after match.end() so double-space / tab separators don't bleed into the next chunk as leading whitespace - Add end-of-stream contract to ABC docstring (callers responsible for trailing fragment after stream terminates) - Fix incorrect comment "end-of-string" → "whitespace" - Compile _WHITESPACE / _PARA_BOUNDARY / _PARA_BOUNDARY_END at module level (consistent with _SENTENCE_BOUNDARY; avoids per-call recompile) - Expand SentenceChunker char class to include right curly double/single quotes (U+201D / U+2019) for common LLM output patterns - Document CRLF limitation on ParagraphChunker - Re-export ChunkingStrategy + chunkers from mellea.stdlib.__init__ - Add __all__ to chunking.py - Add tests: closing paren, double-space separator, tab separator, abbreviation edge case (known-bad split), WordChunker leading-whitespace Assisted-by: Claude Code * fix(stdlib): address PR #923 nits — comment accuracy and curly-quote test - Fix misleading comment on _SENTENCE_BOUNDARY: was "processed by re engine as \u escapes" but the file contained literal Unicode chars. Now uses chr(0x201d) + chr(0x2019) for Python 3.12 compatibility (U+2019 is treated as a string delimiter in single-quoted raw strings on 3.12). - Add test_sentence_chunker_curly_quotes to verify U+201D/U+2019 matching. Assisted-by: Claude Code Signed-off-by: Nigel Jones <jonesn@uk.ibm.com> * fix(stdlib): apply review feedback on ChunkingStrategy (#923) - Simplify _SENTENCE_BOUNDARY regex to use \u escapes instead of chr() concatenation (cleaner, same semantics, Python 3.12-safe) - Document that SentenceChunker discards inter-sentence whitespace via lstrip() - Add test_chunking_strategy_is_abstract to document the extension-point contract Assisted-by: Claude Code --------- Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
diff --git a/mellea/stdlib/__init__.py b/mellea/stdlib/__init__.py
@@ -8,4 +8,11 @@
 ``@mify`` decorator for turning ordinary Python objects into components. Import from
 the sub-packages — ``mellea.stdlib.components``, ``mellea.stdlib.sampling``, and
 ``mellea.stdlib.session`` — for day-to-day use.
+
+Streaming chunking strategies (for use with streaming validation) are available at
+``mellea.stdlib.chunking`` and re-exported here for convenience.
 """
+
+from .chunking import ChunkingStrategy, ParagraphChunker, SentenceChunker, WordChunker
+
+__all__ = ["ChunkingStrategy", "ParagraphChunker", "SentenceChunker", "WordChunker"]
diff --git a/mellea/stdlib/chunking.py b/mellea/stdlib/chunking.py
@@ -0,0 +1,170 @@
+"""ChunkingStrategy ABC and built-in implementations for streaming validation."""
+
+import re
+from abc import ABC, abstractmethod
+
+__all__ = ["ChunkingStrategy", "ParagraphChunker", "SentenceChunker", "WordChunker"]
+
+
+class ChunkingStrategy(ABC):
+    """Abstract base class for text chunking strategies used in streaming validation.
+
+    A chunking strategy receives the full accumulated text so far and returns a
+    list of complete chunks ready for downstream validation. Any trailing fragment
+    that has not yet reached a chunk boundary is withheld — it is not included in
+    the returned list. Each call is stateless and idempotent given the same input.
+
+    End-of-stream contract: ``split()`` always withholds the trailing fragment.
+    When the stream terminates, callers are responsible for processing any remainder:
+    take the full accumulated text, identify everything after the last returned
+    chunk boundary, and handle it appropriately (e.g. pass to a final validator
+    or discard).
+    """
+
+    @abstractmethod
+    def split(self, accumulated_text: str) -> list[str]:
+        """Return complete chunks from accumulated_text, excluding any trailing fragment.
+
+        Args:
+            accumulated_text: The full text accumulated so far, including all
+                previously seen tokens and the latest delta.
+
+        Returns:
+            A list of complete chunks. If no chunk boundary has been reached yet,
+            returns an empty list. Never includes the trailing incomplete fragment.
+        """
+        ...
+
+
+# Sentence boundary: sentence-ending punctuation, optionally followed by a closing
+# quote or paren, then whitespace.
+# Character class covers: straight double/single quotes, right double/single curly
+# quotes (U+201D, U+2019), and closing paren.
+_SENTENCE_BOUNDARY = re.compile("[.!?][\"'\u201d\u2019)]?\\s")
+
+# Whitespace run separator used by WordChunker.
+_WHITESPACE = re.compile(r"\s+")
+
+# Paragraph boundary patterns used by ParagraphChunker.
+_PARA_BOUNDARY = re.compile(r"\n{2,}")
+_PARA_BOUNDARY_END = re.compile(r"\n{2,}$")
+
+
+class SentenceChunker(ChunkingStrategy):
+    """Splits accumulated text on sentence boundaries.
+
+    Sentence boundaries are detected by ``.``, ``!``, or ``?``, optionally
+    followed by a closing quote (straight or curly) or parenthesis, then
+    whitespace. The final sentence is only returned once it is followed by
+    whitespace or another sentence — a trailing fragment with no following
+    whitespace is withheld. Abbreviations are a known edge case: they will
+    be split on (simple regex, not NLP). Inter-sentence whitespace (including
+    double-space or tab) is discarded and does not appear as leading whitespace
+    in subsequent chunks.
+    """
+
+    def split(self, accumulated_text: str) -> list[str]:
+        """Return complete sentences from accumulated_text.
+
+        Args:
+            accumulated_text: The full text accumulated so far.
+
+        Returns:
+            Complete sentences detected so far. The trailing fragment (if any)
+            is withheld.
+        """
+        if not accumulated_text:
+            return []
+
+        chunks: list[str] = []
+        remaining = accumulated_text
+
+        while True:
+            match = _SENTENCE_BOUNDARY.search(remaining)
+            if match is None:
+                break
+            # Include up to and including the punctuation (and optional quote/paren),
+            # but not the trailing whitespace character.
+            end = match.start() + len(match.group().rstrip())
+            chunks.append(remaining[:end])
+            # Advance past the entire whitespace separator; lstrip() handles
+            # multi-character gaps (double-space, tab, etc.) so they don't
+            # leak into the next chunk as leading whitespace.
+            remaining = remaining[match.end() :].lstrip()
+
+        return chunks
+
+
+class WordChunker(ChunkingStrategy):
+    """Splits accumulated text on whitespace boundaries.
+
+    Each word is a chunk. Trailing text not yet followed by whitespace is
+    withheld.
+    """
+
+    def split(self, accumulated_text: str) -> list[str]:
+        """Return complete words from accumulated_text.
+
+        Args:
+            accumulated_text: The full text accumulated so far.
+
+        Returns:
+            All whitespace-delimited words except the trailing fragment (if any).
+            An empty list is returned when no whitespace boundary has been seen.
+        """
+        if not accumulated_text:
+            return []
+
+        # Split on runs of whitespace; the last token is a trailing fragment
+        # unless accumulated_text ends with whitespace.
+        parts = _WHITESPACE.split(accumulated_text)
+
+        # re.split on leading whitespace produces an empty first element; strip it.
+        if parts and parts[0] == "":
+            parts = parts[1:]
+        if parts and parts[-1] == "":
+            parts = parts[:-1]
+
+        if not parts:
+            return []
+
+        # If the text does not end with whitespace, the last part is a fragment.
+        if not accumulated_text[-1].isspace():
+            return parts[:-1]
+
+        return parts
+
+
+class ParagraphChunker(ChunkingStrategy):
+    r"""Splits accumulated text on double-newline paragraph boundaries.
+
+    Two or more consecutive newline characters are treated as a paragraph
+    separator. The trailing paragraph fragment (text not yet followed by ``\n\n``)
+    is withheld.
+
+    Note: only Unix-style ``\n\n`` separators are recognised. CRLF
+    (``\r\n\r\n``) paragraph separators are not supported.
+    """
+
+    def split(self, accumulated_text: str) -> list[str]:
+        """Return complete paragraphs from accumulated_text.
+
+        Args:
+            accumulated_text: The full text accumulated so far.
+
+        Returns:
+            Complete paragraphs (separated by two or more newlines). The
+            trailing incomplete paragraph is withheld. Returns an empty list
+            if no paragraph boundary has been reached.
+        """
+        if not accumulated_text:
+            return []
+
+        parts = _PARA_BOUNDARY.split(accumulated_text)
+
+        # If the text does not end with \n\n, the last part is a trailing fragment.
+        if not _PARA_BOUNDARY_END.search(accumulated_text):
+            parts = parts[:-1]
+
+        # _PARA_BOUNDARY.split on leading \n\n produces an empty first element.
+        return [p for p in parts if p]
diff --git a/test/stdlib/test_chunking.py b/test/stdlib/test_chunking.py