Skip to content

Commit 9fd83b8

Browse files
committed
fix: refactor tokenization tests to use parameterized cases for improved readability and maintainability
1 parent 60887f3 commit 9fd83b8

1 file changed

Lines changed: 176 additions & 151 deletions

File tree

integration/test_tokenize.py

Lines changed: 176 additions & 151 deletions
Original file line number | Diff line number | Diff line change
@@ -109,131 +109,146 @@ def test_tokenization_enum(
109109
# Generic endpoint does not echo tokenization back.
110110
assert result.tokenization is None
111111

112-
def test_default_en_applied_for_word(self, client: weaviate.WeaviateClient) -> None:
113-
"""Word tokenization defaults to the 'en' preset when no stopword config is supplied."""
114-
result = client.tokenization.text(
115-
text="The quick brown fox", tokenization=Tokenization.WORD
116-
)
117-
assert result.indexed == ["the", "quick", "brown", "fox"]
118-
# "the" removed by the server's default en preset.
119-
assert result.query == ["quick", "brown", "fox"]
120-
121-
def test_opt_out_of_default_en(self, client: weaviate.WeaviateClient) -> None:
122-
"""analyzerConfig.stopwordPreset='none' disables the default en."""
123-
cfg = _TextAnalyzerConfigCreate(stopword_preset=StopwordsPreset.NONE)
124-
result = client.tokenization.text(
125-
text="The quick brown fox",
126-
tokenization=Tokenization.WORD,
127-
analyzer_config=cfg,
128-
)
129-
assert result.query == ["the", "quick", "brown", "fox"]
130-
131-
def test_ascii_fold(self, client: weaviate.WeaviateClient) -> None:
132-
cfg = _TextAnalyzerConfigCreate(ascii_fold=True)
133-
result = client.tokenization.text(
134-
text="L'école est fermée",
135-
tokenization=Tokenization.WORD,
136-
analyzer_config=cfg,
137-
)
138-
assert result.indexed == ["l", "ecole", "est", "fermee"]
139-
140-
def test_ascii_fold_with_ignore(self, client: weaviate.WeaviateClient) -> None:
141-
cfg = _TextAnalyzerConfigCreate(ascii_fold=True, ascii_fold_ignore=["é"])
142-
result = client.tokenization.text(
143-
text="L'école est fermée",
144-
tokenization=Tokenization.WORD,
145-
analyzer_config=cfg,
146-
)
147-
assert result.indexed == ["l", "école", "est", "fermée"]
148-
149-
def test_stopword_preset_enum(self, client: weaviate.WeaviateClient) -> None:
150-
cfg = _TextAnalyzerConfigCreate(stopword_preset=StopwordsPreset.EN)
151-
result = client.tokenization.text(
152-
text="The quick brown fox",
153-
tokenization=Tokenization.WORD,
154-
analyzer_config=cfg,
155-
)
156-
assert "the" not in result.query
157-
assert "quick" in result.query
158-
159-
def test_stopword_preset_string(self, client: weaviate.WeaviateClient) -> None:
160-
cfg = _TextAnalyzerConfigCreate(stopword_preset="en")
161-
result = client.tokenization.text(
162-
text="The quick brown fox",
163-
tokenization=Tokenization.WORD,
164-
analyzer_config=cfg,
165-
)
166-
assert "the" not in result.query
167-
168-
def test_ascii_fold_combined_with_stopwords(self, client: weaviate.WeaviateClient) -> None:
169-
cfg = _TextAnalyzerConfigCreate(
170-
ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset=StopwordsPreset.EN
171-
)
172-
result = client.tokenization.text(
173-
text="The école est fermée",
174-
tokenization=Tokenization.WORD,
175-
analyzer_config=cfg,
176-
)
177-
assert result.indexed == ["the", "école", "est", "fermée"]
178-
assert "the" not in result.query
179-
assert "école" in result.query
180-
181-
def test_stopwords_fallback(self, client: weaviate.WeaviateClient) -> None:
182-
"""Top-level stopwords acts as the fallback detector when no analyzerConfig.stopwordPreset is set."""
183-
sw = _StopwordsCreate(preset=StopwordsPreset.EN, additions=["quick"], removals=None)
184-
result = client.tokenization.text(
185-
text="the quick brown fox",
186-
tokenization=Tokenization.WORD,
187-
stopwords=sw,
188-
)
189-
assert result.indexed == ["the", "quick", "brown", "fox"]
190-
# "the" (en) and "quick" (addition) filtered.
191-
assert result.query == ["brown", "fox"]
192-
193-
def test_stopwords_additions_default_preset_to_en(
194-
self, client: weaviate.WeaviateClient
112+
@pytest.mark.parametrize(
113+
"call_kwargs,expected_indexed,expected_query",
114+
[
115+
(
116+
{"text": "The quick brown fox"},
117+
["the", "quick", "brown", "fox"],
118+
["quick", "brown", "fox"],
119+
),
120+
(
121+
{
122+
"text": "The quick brown fox",
123+
"analyzer_config": _TextAnalyzerConfigCreate(
124+
stopword_preset=StopwordsPreset.NONE
125+
),
126+
},
127+
["the", "quick", "brown", "fox"],
128+
["the", "quick", "brown", "fox"],
129+
),
130+
(
131+
{
132+
"text": "L'école est fermée",
133+
"analyzer_config": _TextAnalyzerConfigCreate(ascii_fold=True),
134+
},
135+
["l", "ecole", "est", "fermee"],
136+
["l", "ecole", "fermee"],
137+
),
138+
(
139+
{
140+
"text": "L'école est fermée",
141+
"analyzer_config": _TextAnalyzerConfigCreate(
142+
ascii_fold=True, ascii_fold_ignore=["é"]
143+
),
144+
},
145+
["l", "école", "est", "fermée"],
146+
["l", "école", "fermée"],
147+
),
148+
(
149+
{
150+
"text": "The quick brown fox",
151+
"analyzer_config": _TextAnalyzerConfigCreate(
152+
stopword_preset=StopwordsPreset.EN
153+
),
154+
},
155+
["the", "quick", "brown", "fox"],
156+
["quick", "brown", "fox"],
157+
),
158+
(
159+
{
160+
"text": "The quick brown fox",
161+
"analyzer_config": _TextAnalyzerConfigCreate(stopword_preset="en"),
162+
},
163+
["the", "quick", "brown", "fox"],
164+
["quick", "brown", "fox"],
165+
),
166+
(
167+
{
168+
"text": "The école est fermée",
169+
"analyzer_config": _TextAnalyzerConfigCreate(
170+
ascii_fold=True,
171+
ascii_fold_ignore=["é"],
172+
stopword_preset=StopwordsPreset.EN,
173+
),
174+
},
175+
["the", "école", "est", "fermée"],
176+
["école", "est", "fermée"],
177+
),
178+
(
179+
{
180+
"text": "the quick brown fox",
181+
"stopwords": _StopwordsCreate(
182+
preset=StopwordsPreset.EN, additions=["quick"], removals=None
183+
),
184+
},
185+
["the", "quick", "brown", "fox"],
186+
["brown", "fox"],
187+
),
188+
(
189+
{
190+
"text": "the quick hello world",
191+
"stopwords": _StopwordsCreate(
192+
preset=None, additions=["hello"], removals=None
193+
),
194+
},
195+
["the", "quick", "hello", "world"],
196+
["quick", "world"],
197+
),
198+
(
199+
{
200+
"text": "the quick is fast",
201+
"stopwords": _StopwordsCreate(
202+
preset=None, additions=None, removals=["the"]
203+
),
204+
},
205+
["the", "quick", "is", "fast"],
206+
["the", "quick", "fast"],
207+
),
208+
(
209+
{
210+
"text": "hello world test",
211+
"analyzer_config": _TextAnalyzerConfigCreate(stopword_preset="custom"),
212+
"stopword_presets": {"custom": ["test"]},
213+
},
214+
["hello", "world", "test"],
215+
["hello", "world"],
216+
),
217+
(
218+
{
219+
"text": "the quick hello world",
220+
"stopword_presets": {"en": ["hello"]},
221+
},
222+
["the", "quick", "hello", "world"],
223+
["the", "quick", "world"],
224+
),
225+
],
226+
ids=[
227+
"default_en_applied_for_word",
228+
"opt_out_of_default_en",
229+
"ascii_fold",
230+
"ascii_fold_with_ignore",
231+
"stopword_preset_enum",
232+
"stopword_preset_string",
233+
"ascii_fold_combined_with_stopwords",
234+
"stopwords_fallback",
235+
"stopwords_additions_default_preset_to_en",
236+
"stopwords_removals_default_preset_to_en",
237+
"stopword_presets_named_reference",
238+
"stopword_presets_override_builtin_en",
239+
],
240+
)
241+
def test_text_tokenize(
242+
self,
243+
client: weaviate.WeaviateClient,
244+
call_kwargs: dict,
245+
expected_indexed: list,
246+
expected_query: list,
195247
) -> None:
196-
"""Caller omits preset, passes only additions. Server defaults preset to 'en' and builds detector from en + additions."""
197-
sw = _StopwordsCreate(preset=None, additions=["hello"], removals=None)
198-
result = client.tokenization.text(
199-
text="the quick hello world",
200-
tokenization=Tokenization.WORD,
201-
stopwords=sw,
202-
)
203-
assert result.query == ["quick", "world"]
204-
205-
def test_stopwords_removals_default_preset_to_en(self, client: weaviate.WeaviateClient) -> None:
206-
"""Caller omits preset, passes only removals. 'the' is removed from the en list so it passes through."""
207-
sw = _StopwordsCreate(preset=None, additions=None, removals=["the"])
208-
result = client.tokenization.text(
209-
text="the quick is fast",
210-
tokenization=Tokenization.WORD,
211-
stopwords=sw,
212-
)
213-
# "is" still in en, "the" removed.
214-
assert result.query == ["the", "quick", "fast"]
215-
216-
def test_stopword_presets_named_reference(self, client: weaviate.WeaviateClient) -> None:
217-
"""Define a named preset via stopword_presets, select it via analyzerConfig.stopwordPreset. Word lists use the collection shape."""
218-
result = client.tokenization.text(
219-
text="hello world test",
220-
tokenization=Tokenization.WORD,
221-
analyzer_config=_TextAnalyzerConfigCreate(stopword_preset="custom"),
222-
stopword_presets={"custom": ["test"]},
223-
)
224-
assert result.indexed == ["hello", "world", "test"]
225-
assert result.query == ["hello", "world"]
226-
227-
def test_stopword_presets_override_builtin_en(self, client: weaviate.WeaviateClient) -> None:
228-
"""A user-defined preset sharing a name with a built-in replaces the built-in entirely, including on the default-en path for word tokenization."""
229-
result = client.tokenization.text(
230-
text="the quick hello world",
231-
tokenization=Tokenization.WORD,
232-
stopword_presets={"en": ["hello"]},
233-
)
234-
assert result.indexed == ["the", "quick", "hello", "world"]
235-
# "the" no longer filtered (built-in en replaced), "hello" is.
236-
assert result.query == ["the", "quick", "world"]
248+
result = client.tokenization.text(tokenization=Tokenization.WORD, **call_kwargs)
249+
assert isinstance(result, TokenizeResult)
250+
assert result.indexed == expected_indexed
251+
assert result.query == expected_query
237252

238253

239254
# ---------------------------------------------------------------------------
@@ -287,33 +302,44 @@ def test_property_result_populates_tokenization(self, client: weaviate.WeaviateC
287302
class TestClientSideValidation:
288303
"""Verify that client-side validation rejects invalid input before hitting the server."""
289304

290-
def test_ascii_fold_ignore_without_fold_raises(self) -> None:
291-
with pytest.raises(ValueError, match="asciiFoldIgnore"):
292-
_TextAnalyzerConfigCreate(ascii_fold=False, ascii_fold_ignore=["é"])
293-
294-
def test_ascii_fold_ignore_without_fold_default_raises(self) -> None:
305+
@pytest.mark.parametrize(
306+
"kwargs",
307+
[
308+
{"ascii_fold": False, "ascii_fold_ignore": ["é"]},
309+
{"ascii_fold_ignore": ["é"]},
310+
],
311+
ids=["explicit_false", "default"],
312+
)
313+
def test_ascii_fold_ignore_without_fold_raises(self, kwargs: dict) -> None:
295314
with pytest.raises(ValueError, match="asciiFoldIgnore"):
296-
_TextAnalyzerConfigCreate(ascii_fold_ignore=["é"])
315+
_TextAnalyzerConfigCreate(**kwargs)
297316

298-
def test_valid_config_does_not_raise(self) -> None:
299-
cfg = _TextAnalyzerConfigCreate(ascii_fold=True, ascii_fold_ignore=["é", "ñ"])
300-
assert cfg.asciiFold is True
301-
assert cfg.asciiFoldIgnore == ["é", "ñ"]
302-
303-
def test_fold_without_ignore_is_valid(self) -> None:
304-
cfg = _TextAnalyzerConfigCreate(ascii_fold=True)
305-
assert cfg.asciiFold is True
306-
assert cfg.asciiFoldIgnore is None
307-
308-
def test_stopword_preset_only_is_valid(self) -> None:
309-
cfg = _TextAnalyzerConfigCreate(stopword_preset="en")
310-
assert cfg.stopwordPreset == "en"
311-
312-
def test_empty_config_is_valid(self) -> None:
313-
cfg = _TextAnalyzerConfigCreate()
314-
assert cfg.asciiFold is None
315-
assert cfg.asciiFoldIgnore is None
316-
assert cfg.stopwordPreset is None
317+
@pytest.mark.parametrize(
318+
"kwargs,expected",
319+
[
320+
(
321+
{"ascii_fold": True, "ascii_fold_ignore": ["é", "ñ"]},
322+
{"asciiFold": True, "asciiFoldIgnore": ["é", "ñ"]},
323+
),
324+
(
325+
{"ascii_fold": True},
326+
{"asciiFold": True, "asciiFoldIgnore": None},
327+
),
328+
(
329+
{"stopword_preset": "en"},
330+
{"stopwordPreset": "en"},
331+
),
332+
(
333+
{},
334+
{"asciiFold": None, "asciiFoldIgnore": None, "stopwordPreset": None},
335+
),
336+
],
337+
ids=["fold_with_ignore", "fold_without_ignore", "stopword_preset_only", "empty"],
338+
)
339+
def test_valid_config(self, kwargs: dict, expected: dict) -> None:
340+
cfg = _TextAnalyzerConfigCreate(**kwargs)
341+
for attr, value in expected.items():
342+
assert getattr(cfg, attr) == value
317343

318344
def test_stopwords_and_stopword_presets_mutex(self, client: weaviate.WeaviateClient) -> None:
319345
"""Client rejects the mutex violation locally with ValueError, before sending the request (which the server would also reject with 422)."""
@@ -411,7 +437,6 @@ async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClien
411437
assert isinstance(result, TokenizeResult)
412438
assert result.tokenization == Tokenization.WORD
413439
assert result.indexed == ["the", "quick", "brown", "fox"]
414-
assert "the" not in result.query
415-
assert "quick" in result.query
440+
assert result.query == ["quick", "brown", "fox"]
416441
finally:
417442
await async_client.collections.delete("TestAsyncPropTokenize")

0 commit comments

Comments
 (0)