test: refactor output types and tests to config

amourao · amourao · commit 0f7fe47cac92 · 2026-04-21T14:24:33.000+01:00
diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py
@@ -403,6 +403,38 @@ def test_stopwords_and_stopword_presets_mutex(self, client: weaviate.WeaviateCli
                 stopword_presets={"custom": ["hello"]},
             )
 
+    @pytest.mark.parametrize(
+        "stopword_presets,match",
+        [
+            ({"custom": "hello"}, "must be a list of strings"),
+            (
+                {
+                    "custom": _StopwordsCreate(
+                        preset=StopwordsPreset.EN, additions=None, removals=None
+                    ),
+                },
+                "must be a list of strings",
+            ),
+            ({"custom": ["hello", 123]}, "must contain only strings"),
+        ],
+        ids=["str_value", "pydantic_model_value", "non_string_element"],
+    )
+    def test_stopword_presets_invalid_shape_raises(
+        self,
+        client: weaviate.WeaviateClient,
+        stopword_presets: dict,
+        match: str,
+    ) -> None:
+        """Client rejects malformed stopword_presets values locally before sending: a str would otherwise be silently split into characters, and a pydantic model would serialize to field tuples."""
+        if client._connection._weaviate_version.is_lower_than(1, 37, 0):
+            pytest.skip("Tokenization requires Weaviate >= 1.37.0")
+        with pytest.raises(ValueError, match=match):
+            client.tokenization.text(
+                text="hello",
+                tokenization=Tokenization.WORD,
+                stopword_presets=stopword_presets,
+            )
+
 
 # ---------------------------------------------------------------------------
 # Version gate
diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py
@@ -40,14 +40,15 @@ def text(
 
         For ``word`` tokenization the server defaults to the built-in ``en``
         stopword preset when no stopword configuration is supplied. Pass
-        ``analyzer_config=TextAnalyzerConfig(stopword_preset="none")`` or
-        equivalent to opt out.
+        ``analyzer_config=Configure.text_analyzer(stopword_preset=StopwordsPreset.NONE)``
+        (or equivalent) to opt out.
 
         Args:
             text: The text to tokenize.
             tokenization: The tokenization method to use (e.g. Tokenization.WORD).
             analyzer_config: Text analyzer settings (ASCII folding, stopword
-                preset name). ``stopword_preset`` may reference a built-in preset
+                preset name), built via ``Configure.text_analyzer(...)``.
+                ``stopword_preset`` may reference a built-in preset
                 (``en`` / ``none``) or a name defined in ``stopword_presets``.
             stopwords: Fallback stopword config applied when
                 ``analyzer_config.stopword_preset`` is not set. Same shape as a
@@ -64,13 +65,13 @@ def text(
             422 if both are supplied.
 
         Returns:
-            A TokenizeResult with indexed and query token lists. The generic
-            endpoint does not echo request fields (tokenization, analyzer_config,
-            stopwords, stopword_presets) back in the response.
+            A TokenizeResult with indexed and query token lists. The response
+            does not echo request fields back.
 
         Raises:
             WeaviateUnsupportedFeatureError: If the server version is below 1.37.0.
-            ValueError: If both ``stopwords`` and ``stopword_presets`` are passed.
+            ValueError: If both ``stopwords`` and ``stopword_presets`` are passed,
+                or if any ``stopword_presets`` value is not a list/tuple of strings.
         """
         self.__check_version()
 
@@ -94,10 +95,28 @@ def text(
 
         if stopword_presets is not None:
             # Plain word-list shape matching a collection's
-            # invertedIndexConfig.stopwordPresets.
-            payload["stopwordPresets"] = {
-                name: list(words) for name, words in stopword_presets.items()
-            }
+            # invertedIndexConfig.stopwordPresets. Reject str (would
+            # silently split into characters), bytes, pydantic models and
+            # any other non-list/tuple value up-front so callers get a
+            # clear error instead of a malformed payload.
+            validated: Dict[str, List[str]] = {}
+            for name, words in stopword_presets.items():
+                if isinstance(words, (str, bytes)):
+                    raise ValueError(
+                        f"stopword_presets[{name!r}] must be a list of strings, "
+                        f"got {type(words).__name__}"
+                    )
+                if not isinstance(words, (list, tuple)):
+                    raise ValueError(
+                        f"stopword_presets[{name!r}] must be a list of strings, "
+                        f"got {type(words).__name__}"
+                    )
+                if not all(isinstance(w, str) for w in words):
+                    raise ValueError(
+                        f"stopword_presets[{name!r}] must contain only strings"
+                    )
+                validated[name] = list(words)
+            payload["stopwordPresets"] = validated
 
         def resp(response: Response) -> TokenizeResult:
             return TokenizeResult.model_validate(response.json())
diff --git a/weaviate/tokenization/models.py b/weaviate/tokenization/models.py
@@ -13,14 +13,9 @@ class TokenizeResult(BaseModel):
     Attributes:
         indexed: Tokens as they would be stored in the inverted index.
         query: Tokens as they would be used for querying (after stopword removal).
-        tokenization: The tokenization method that was applied. Populated only by
-            the property-level endpoint, where the tokenization is resolved from
-            the property's schema. The generic ``/v1/tokenize`` endpoint does not
-            echo it back (the caller passed it).
     """
 
     model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True)
 
     indexed: List[str]
     query: List[str]
-    tokenization: Optional[Tokenization] = None