Skip to content

Commit 0f7fe47

Browse files
committed
test: refactor output types and tests to config
1 parent 959f554 commit 0f7fe47

3 files changed

Lines changed: 62 additions & 16 deletions

File tree

integration/test_tokenize.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,38 @@ def test_stopwords_and_stopword_presets_mutex(self, client: weaviate.WeaviateCli
403403
stopword_presets={"custom": ["hello"]},
404404
)
405405

406+
@pytest.mark.parametrize(
407+
"stopword_presets,match",
408+
[
409+
({"custom": "hello"}, "must be a list of strings"),
410+
(
411+
{
412+
"custom": _StopwordsCreate(
413+
preset=StopwordsPreset.EN, additions=None, removals=None
414+
),
415+
},
416+
"must be a list of strings",
417+
),
418+
({"custom": ["hello", 123]}, "must contain only strings"),
419+
],
420+
ids=["str_value", "pydantic_model_value", "non_string_element"],
421+
)
422+
def test_stopword_presets_invalid_shape_raises(
423+
self,
424+
client: weaviate.WeaviateClient,
425+
stopword_presets: dict,
426+
match: str,
427+
) -> None:
428+
"""Client rejects malformed stopword_presets values locally, before sending the request: a str would silently split into characters, and a pydantic model would serialize to field tuples."""
429+
if client._connection._weaviate_version.is_lower_than(1, 37, 0):
430+
pytest.skip("Tokenization requires Weaviate >= 1.37.0")
431+
with pytest.raises(ValueError, match=match):
432+
client.tokenization.text(
433+
text="hello",
434+
tokenization=Tokenization.WORD,
435+
stopword_presets=stopword_presets,
436+
)
437+
406438

407439
# ---------------------------------------------------------------------------
408440
# Version gate

weaviate/tokenization/executor.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,15 @@ def text(
4040
4141
For ``word`` tokenization the server defaults to the built-in ``en``
4242
stopword preset when no stopword configuration is supplied. Pass
43-
``analyzer_config=TextAnalyzerConfig(stopword_preset="none")`` or
44-
equivalent to opt out.
43+
``analyzer_config=Configure.text_analyzer(stopword_preset=StopwordsPreset.NONE)``
44+
(or equivalent) to opt out.
4545
4646
Args:
4747
text: The text to tokenize.
4848
tokenization: The tokenization method to use (e.g. Tokenization.WORD).
4949
analyzer_config: Text analyzer settings (ASCII folding, stopword
50-
preset name). ``stopword_preset`` may reference a built-in preset
50+
preset name), built via ``Configure.text_analyzer(...)``.
51+
``stopword_preset`` may reference a built-in preset
5152
(``en`` / ``none``) or a name defined in ``stopword_presets``.
5253
stopwords: Fallback stopword config applied when
5354
``analyzer_config.stopword_preset`` is not set. Same shape as a
@@ -64,13 +65,13 @@ def text(
6465
422 if both are supplied.
6566
6667
Returns:
67-
A TokenizeResult with indexed and query token lists. The generic
68-
endpoint does not echo request fields (tokenization, analyzer_config,
69-
stopwords, stopword_presets) back in the response.
68+
A TokenizeResult with indexed and query token lists. The response
69+
does not echo request fields back.
7070
7171
Raises:
7272
WeaviateUnsupportedFeatureError: If the server version is below 1.37.0.
73-
ValueError: If both ``stopwords`` and ``stopword_presets`` are passed.
73+
ValueError: If both ``stopwords`` and ``stopword_presets`` are passed,
74+
or if any ``stopword_presets`` value is not a list/tuple of strings.
7475
"""
7576
self.__check_version()
7677

@@ -94,10 +95,28 @@ def text(
9495

9596
if stopword_presets is not None:
9697
# Plain word-list shape matching a collection's
97-
# invertedIndexConfig.stopwordPresets.
98-
payload["stopwordPresets"] = {
99-
name: list(words) for name, words in stopword_presets.items()
100-
}
98+
# invertedIndexConfig.stopwordPresets. Reject str (would
99+
# silently split into characters) and pydantic models /
100+
# other non-list/tuple shapes up-front so callers get a clear
101+
# error instead of a malformed payload.
102+
validated: Dict[str, List[str]] = {}
103+
for name, words in stopword_presets.items():
104+
if isinstance(words, (str, bytes)):
105+
raise ValueError(
106+
f"stopword_presets[{name!r}] must be a list of strings, "
107+
f"got {type(words).__name__}"
108+
)
109+
if not isinstance(words, (list, tuple)):
110+
raise ValueError(
111+
f"stopword_presets[{name!r}] must be a list of strings, "
112+
f"got {type(words).__name__}"
113+
)
114+
if not all(isinstance(w, str) for w in words):
115+
raise ValueError(
116+
f"stopword_presets[{name!r}] must contain only strings"
117+
)
118+
validated[name] = list(words)
119+
payload["stopwordPresets"] = validated
101120

102121
def resp(response: Response) -> TokenizeResult:
103122
return TokenizeResult.model_validate(response.json())

weaviate/tokenization/models.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,9 @@ class TokenizeResult(BaseModel):
1313
Attributes:
1414
indexed: Tokens as they would be stored in the inverted index.
1515
query: Tokens as they would be used for querying (after stopword removal).
16-
tokenization: The tokenization method that was applied. Populated only by
17-
the property-level endpoint, where the tokenization is resolved from
18-
the property's schema. The generic ``/v1/tokenize`` endpoint does not
19-
echo it back (the caller passed it).
2016
"""
2117

2218
model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True)
2319

2420
indexed: List[str]
2521
query: List[str]
26-
tokenization: Optional[Tokenization] = None

0 commit comments

Comments
 (0)