weaviate · dirkkul · Apr 22, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026
@@ -26,9 +26,9 @@ env:
   WEAVIATE_132: 1.32.27
   WEAVIATE_133: 1.33.18
   WEAVIATE_134: 1.34.19
-  WEAVIATE_135: 1.35.17
-  WEAVIATE_136: 1.36.10
-  WEAVIATE_137: 1.37.1
+  WEAVIATE_135: 1.35.18
+  WEAVIATE_136: 1.36.12
+  WEAVIATE_137: 1.37.1-4e61e26.amd64
 
 jobs:
   lint-and-format:

diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py
diff --git a/weaviate/classes/__init__.py b/weaviate/classes/__init__.py
@@ -13,6 +13,7 @@
     rbac,
     replication,
     tenants,
+    tokenization,
 )  # noqa: F401
 from .config import ConsistencyLevel
 
@@ -29,6 +30,7 @@
     "init",
     "query",
     "tenants",
+    "tokenization",
     "rbac",
     "replication",
 ]
diff --git a/weaviate/classes/config.py b/weaviate/classes/config.py
@@ -11,8 +11,10 @@
     ReferenceProperty,
     ReplicationDeletionStrategy,
     Rerankers,
+    StopwordsCreate,
     StopwordsPreset,
     TextAnalyzerConfig,
+    TextAnalyzerConfigCreate,
     Tokenization,
     VectorDistances,
 )
@@ -39,8 +41,10 @@
     "PQEncoderType",
     "ReferenceProperty",
     "Rerankers",
+    "StopwordsCreate",
     "StopwordsPreset",
     "TextAnalyzerConfig",
+    "TextAnalyzerConfigCreate",
     "Tokenization",
     "Vectorizers",
     "VectorDistances",

diff --git a/weaviate/classes/tokenization.py b/weaviate/classes/tokenization.py
@@ -0,0 +1,17 @@
+from weaviate.collections.classes.config import (
+    StopwordsConfig,
+    StopwordsCreate,
+    StopwordsPreset,
+    TextAnalyzerConfigCreate,
+    Tokenization,
+)
+from weaviate.tokenization.models import TokenizeResult
+
+__all__ = [
+    "StopwordsConfig",
+    "StopwordsCreate",
+    "StopwordsPreset",
+    "TextAnalyzerConfigCreate",
+    "Tokenization",
+    "TokenizeResult",
+]
diff --git a/weaviate/collections/classes/config.py b/weaviate/collections/classes/config.py
@@ -1,5 +1,6 @@
 import datetime
 from dataclasses import dataclass
+from dataclasses import fields as _dataclass_fields
 from typing import (
     Any,
     ClassVar,
@@ -1647,6 +1648,26 @@ class _StopwordsConfig(_ConfigBase):
 
 
 StopwordsConfig = _StopwordsConfig
+StopwordsCreate = _StopwordsCreate
+
+# Invariant: the read-side dataclass (_StopwordsConfig) and the write-side
+# pydantic model (_StopwordsCreate) must carry the same set of field names so
+# that values round-tripped from ``collection.config.get()`` can flow back into
+# ``tokenization.text()`` without silent data loss. If a field is added to one
+# but not the other, importing this module fails loudly; the read→write
+# conversion in ``weaviate/tokenization/executor.py::_TokenizationExecutor.text``
+# depends on this parity.
+_read_fields = {f.name for f in _dataclass_fields(_StopwordsConfig)}
+_write_fields = set(_StopwordsCreate.model_fields.keys())
+if _read_fields != _write_fields:
+    raise RuntimeError(
+        "_StopwordsConfig / _StopwordsCreate field drift detected — "
+        f"read-only={_read_fields - _write_fields}, "
+        f"write-only={_write_fields - _read_fields}. "
+        "Update both classes together, or adapt the read→write conversion in "
+        "weaviate/tokenization/executor.py::_TokenizationExecutor.text."
+    )
+del _read_fields, _write_fields
 
 
 @dataclass
@@ -2224,6 +2245,9 @@ def _validate_ascii_fold_ignore(self) -> "_TextAnalyzerConfigCreate":
         return self
 
 
+TextAnalyzerConfigCreate = _TextAnalyzerConfigCreate
+
+
 class Property(_ConfigCreateModel):
     """This class defines the structure of a data property that a collection can have within Weaviate.
 

diff --git a/weaviate/tokenization/async_.pyi b/weaviate/tokenization/async_.pyi
@@ -1,21 +1,32 @@
-from typing import Dict, Optional
+from typing import Dict, List, Optional, Union, overload
 
 from weaviate.collections.classes.config import (
+    StopwordsConfig,
+    StopwordsCreate,
+    TextAnalyzerConfigCreate,
     Tokenization,
-    _StopwordsCreate,
-    _TextAnalyzerConfigCreate,
 )
 from weaviate.connect.v4 import ConnectionAsync
 from weaviate.tokenization.models import TokenizeResult
 
 from .executor import _TokenizationExecutor
 
 class _TokenizationAsync(_TokenizationExecutor[ConnectionAsync]):
+    @overload
     async def text(
         self,
         text: str,
         tokenization: Tokenization,
         *,
-        analyzer_config: Optional[_TextAnalyzerConfigCreate] = None,
-        stopword_presets: Optional[Dict[str, _StopwordsCreate]] = None,
+        analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
+        stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = ...,
+    ) -> TokenizeResult: ...
+    @overload
+    async def text(
+        self,
+        text: str,
+        tokenization: Tokenization,
+        *,
+        analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
+        stopword_presets: Optional[Dict[str, List[str]]] = ...,
     ) -> TokenizeResult: ...
diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py
@@ -1,13 +1,14 @@
 """Tokenize executor."""
 
-from typing import Any, Dict, Generic, Optional
+from typing import Any, Dict, Generic, List, Optional, Union, overload
 
 from httpx import Response
 
 from weaviate.collections.classes.config import (
+    StopwordsConfig,
+    StopwordsCreate,
+    TextAnalyzerConfigCreate,
     Tokenization,
-    _StopwordsCreate,
-    _TextAnalyzerConfigCreate,
 )
 from weaviate.connect import executor
 from weaviate.connect.v4 import ConnectionType, _ExpectedStatusCodes
@@ -27,32 +28,109 @@ def __check_version(self) -> None:
                 "1.37.0",
             )
 
+    # Overloads make ``stopwords`` and ``stopword_presets`` mutually exclusive
+    # at type-check time. Passing both is additionally rejected at runtime with
+    # ``ValueError`` in the implementation below. ``stopwords`` accepts either a
+    # ``StopwordsCreate`` (the write-side shape) or a ``StopwordsConfig`` (the
+    # read-side shape returned by ``collection.config.get()``), so values
+    # round-tripped through config reads can be passed back in directly.
+    @overload
     def text(
         self,
         text: str,
         tokenization: Tokenization,
         *,
-        analyzer_config: Optional[_TextAnalyzerConfigCreate] = None,
-        stopword_presets: Optional[Dict[str, _StopwordsCreate]] = None,
+        analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
+        stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = ...,
+    ) -> executor.Result[TokenizeResult]: ...
+
+    @overload
+    def text(
+        self,
+        text: str,
+        tokenization: Tokenization,
+        *,
+        analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
+        stopword_presets: Optional[Dict[str, List[str]]] = ...,
+    ) -> executor.Result[TokenizeResult]: ...
+
+    def text(
+        self,
+        text: str,
+        tokenization: Tokenization,
+        *,
+        analyzer_config: Optional[TextAnalyzerConfigCreate] = None,
+        stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = None,
+        stopword_presets: Optional[Dict[str, List[str]]] = None,
     ) -> executor.Result[TokenizeResult]:
         """Tokenize text using the generic /v1/tokenize endpoint.
 
+        For ``word`` tokenization the server defaults to the built-in ``en``
+        stopword preset when no stopword configuration is supplied. Pass
+        ``analyzer_config=TextAnalyzerConfigCreate(stopword_preset="none")``
+        or equivalent to opt out.
+
+        Call patterns for stopword handling (``stopwords`` and
+        ``stopword_presets`` are mutually exclusive — pass at most one):
+
+        1. **No stopword config** — rely on the server default (``en`` for
+           word tokenization, none otherwise)::
+
+               client.tokenization.text(text=..., tokenization=Tokenization.WORD)
+
+        2. **Apply a one-off stopwords block** via ``stopwords`` — the block
+           filters the query tokens directly, same shape as a collection's
+           ``invertedIndexConfig.stopwords``::
+
+               client.tokenization.text(
+                   text=...,
+                   tokenization=Tokenization.WORD,
+                   stopwords=StopwordsCreate(preset=StopwordsPreset.EN, additions=["foo"]),
+               )
+
+        3. **Register a named-preset catalog** via ``stopword_presets`` and
+           reference one by name from ``analyzer_config.stopword_preset``.
+           The catalog can also override built-in presets such as ``en``::
+
+               client.tokenization.text(
+                   text=...,
+                   tokenization=Tokenization.WORD,
+                   analyzer_config=TextAnalyzerConfigCreate(stopword_preset="custom"),
+                   stopword_presets={"custom": ["foo", "bar"]},
+               )
+
         Args:
             text: The text to tokenize.
-            tokenization: The tokenization method to use (e.g. Tokenization.WORD).
-            analyzer_config: Text analyzer settings (ASCII folding, stopword preset).
-            stopword_presets: Custom stopword preset definitions, keyed by name.
-                Each value is a ``_StopwordsCreate`` with optional preset, additions,
-                and removals fields.
+            tokenization: The tokenization method to use (e.g. ``Tokenization.WORD``).
+            analyzer_config: Text analyzer settings (ASCII folding, stopword
+                preset name), e.g. ``TextAnalyzerConfigCreate(...)`` or the
+                result of ``Configure.text_analyzer(...)``. ``stopword_preset`` may
+                name a built-in preset (``en`` / ``none``) or a key of ``stopword_presets``.
+            stopwords: One-off stopwords block applied directly to this request.
+                Mirrors the collection-level ``invertedIndexConfig.stopwords``
+                shape — hence the rich model with preset / additions / removals.
+                Mutually exclusive with ``stopword_presets``.
+            stopword_presets: Named-preset catalog (name → word list). Mirrors
+                the property-level preset catalog — a plain mapping, since a
+                property only references a preset by name (via
+                ``analyzer_config.stopword_preset``) rather than carrying the
+                full stopwords block. Entries can override built-ins like
+                ``en``. Mutually exclusive with ``stopwords``.
 
         Returns:
-            A TokenizeResult with indexed and query token lists.
+            A ``TokenizeResult`` with indexed and query token lists. The generic
+            endpoint does not echo request fields back in the response.
 
         Raises:
             WeaviateUnsupportedFeatureError: If the server version is below 1.37.0.
+            ValueError: If both ``stopwords`` and ``stopword_presets`` are passed,
+                or if any ``stopword_presets`` value is not a list/tuple of strings.
         """
         self.__check_version()
 
+        if stopwords is not None and stopword_presets is not None:
+            raise ValueError("stopwords and stopword_presets are mutually exclusive; pass only one")
+
         payload: Dict[str, Any] = {
             "text": text,
             "tokenization": tokenization.value,
@@ -63,10 +141,38 @@ def text(
             if ac_dict:
                 payload["analyzerConfig"] = ac_dict
 
+        if stopwords is not None:
+            if isinstance(stopwords, StopwordsConfig):
+                # Widen from the read-side shape returned by config.get() to the
+                # write-side shape the server expects. Field parity between the
+                # two classes is enforced at import time in
+                # ``weaviate/collections/classes/config.py``, so iterating
+                # ``StopwordsCreate.model_fields`` copies every field.
+                stopwords = StopwordsCreate(
+                    **{name: getattr(stopwords, name) for name in StopwordsCreate.model_fields}
+                )
+            sw_dict = stopwords._to_dict()
+            if sw_dict:
+                payload["stopwords"] = sw_dict
+
         if stopword_presets is not None:
-            payload["stopwordPresets"] = {
-                name: cfg._to_dict() for name, cfg in stopword_presets.items()
-            }
+            # Plain word-list shape matching a collection's
+            # invertedIndexConfig.stopwordPresets. Only list/tuple values are
+            # accepted: a bare str (which ``list()`` would silently split into
+            # characters), bytes, pydantic models and any other shape all fail
+            # the single list/tuple check below, so callers get a clear error
+            # instead of a malformed payload.
+            validated: Dict[str, List[str]] = {}
+            for name, words in stopword_presets.items():
+                if not isinstance(words, (list, tuple)):
+                    raise ValueError(
+                        f"stopword_presets[{name!r}] must be a list of strings, "
+                        f"got {type(words).__name__}"
+                    )
+                if not all(isinstance(w, str) for w in words):
+                    raise ValueError(f"stopword_presets[{name!r}] must contain only strings")
+                validated[name] = list(words)
+            payload["stopwordPresets"] = validated
 
         def resp(response: Response) -> TokenizeResult:
             return TokenizeResult.model_validate(response.json())

diff --git a/weaviate/tokenization/models.py b/weaviate/tokenization/models.py
@@ -1,56 +1,17 @@
 """Return types for tokenization operations."""
 
-from typing import Any, Dict, List, Optional
+from typing import List
 
-from pydantic import BaseModel, ConfigDict, Field, field_validator
-
-from weaviate.collections.classes.config import (
-    StopwordsConfig,
-    StopwordsPreset,
-    TextAnalyzerConfig,
-    Tokenization,
-)
+from pydantic import BaseModel
 
 
 class TokenizeResult(BaseModel):
     """Result of a tokenization operation.
 
     Attributes:
-        tokenization: The tokenization method that was applied.
         indexed: Tokens as they would be stored in the inverted index.
         query: Tokens as they would be used for querying (after stopword removal).
-        analyzer_config: The text analyzer configuration that was used, if any.
-        stopword_config: The stopword configuration that was used, if any.
     """
 
-    model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True)
-
-    tokenization: Tokenization
     indexed: List[str]
     query: List[str]
-    analyzer_config: Optional[TextAnalyzerConfig] = Field(default=None, alias="analyzerConfig")
-    stopword_config: Optional[StopwordsConfig] = Field(default=None, alias="stopwordConfig")
-
-    @field_validator("analyzer_config", mode="before")
-    @classmethod
-    def _parse_analyzer_config(cls, v: Optional[Dict[str, Any]]) -> Optional[TextAnalyzerConfig]:
-        if v is None:
-            return None
-        if "asciiFold" not in v and "stopwordPreset" not in v:
-            return None
-        return TextAnalyzerConfig(
-            ascii_fold=v.get("asciiFold", False),
-            ascii_fold_ignore=v.get("asciiFoldIgnore"),
-            stopword_preset=v.get("stopwordPreset"),
-        )
-
-    @field_validator("stopword_config", mode="before")
-    @classmethod
-    def _parse_stopword_config(cls, v: Optional[Dict[str, Any]]) -> Optional[StopwordsConfig]:
-        if v is None:
-            return None
-        return StopwordsConfig(
-            preset=StopwordsPreset(v["preset"]),
-            additions=v.get("additions"),
-            removals=v.get("removals"),
-        )