@@ -109,131 +109,146 @@ def test_tokenization_enum(
109109 # Generic endpoint does not echo tokenization back.
110110 assert result .tokenization is None
111111
112- def test_default_en_applied_for_word (self , client : weaviate .WeaviateClient ) -> None :
113- """Word tokenization defaults to the 'en' preset when no stopword config is supplied."""
114- result = client .tokenization .text (
115- text = "The quick brown fox" , tokenization = Tokenization .WORD
116- )
117- assert result .indexed == ["the" , "quick" , "brown" , "fox" ]
118- # "the" removed by the server's default en preset.
119- assert result .query == ["quick" , "brown" , "fox" ]
120-
121- def test_opt_out_of_default_en (self , client : weaviate .WeaviateClient ) -> None :
122- """analyzerConfig.stopwordPreset='none' disables the default en."""
123- cfg = _TextAnalyzerConfigCreate (stopword_preset = StopwordsPreset .NONE )
124- result = client .tokenization .text (
125- text = "The quick brown fox" ,
126- tokenization = Tokenization .WORD ,
127- analyzer_config = cfg ,
128- )
129- assert result .query == ["the" , "quick" , "brown" , "fox" ]
130-
131- def test_ascii_fold (self , client : weaviate .WeaviateClient ) -> None :
132- cfg = _TextAnalyzerConfigCreate (ascii_fold = True )
133- result = client .tokenization .text (
134- text = "L'école est fermée" ,
135- tokenization = Tokenization .WORD ,
136- analyzer_config = cfg ,
137- )
138- assert result .indexed == ["l" , "ecole" , "est" , "fermee" ]
139-
140- def test_ascii_fold_with_ignore (self , client : weaviate .WeaviateClient ) -> None :
141- cfg = _TextAnalyzerConfigCreate (ascii_fold = True , ascii_fold_ignore = ["é" ])
142- result = client .tokenization .text (
143- text = "L'école est fermée" ,
144- tokenization = Tokenization .WORD ,
145- analyzer_config = cfg ,
146- )
147- assert result .indexed == ["l" , "école" , "est" , "fermée" ]
148-
149- def test_stopword_preset_enum (self , client : weaviate .WeaviateClient ) -> None :
150- cfg = _TextAnalyzerConfigCreate (stopword_preset = StopwordsPreset .EN )
151- result = client .tokenization .text (
152- text = "The quick brown fox" ,
153- tokenization = Tokenization .WORD ,
154- analyzer_config = cfg ,
155- )
156- assert "the" not in result .query
157- assert "quick" in result .query
158-
159- def test_stopword_preset_string (self , client : weaviate .WeaviateClient ) -> None :
160- cfg = _TextAnalyzerConfigCreate (stopword_preset = "en" )
161- result = client .tokenization .text (
162- text = "The quick brown fox" ,
163- tokenization = Tokenization .WORD ,
164- analyzer_config = cfg ,
165- )
166- assert "the" not in result .query
167-
168- def test_ascii_fold_combined_with_stopwords (self , client : weaviate .WeaviateClient ) -> None :
169- cfg = _TextAnalyzerConfigCreate (
170- ascii_fold = True , ascii_fold_ignore = ["é" ], stopword_preset = StopwordsPreset .EN
171- )
172- result = client .tokenization .text (
173- text = "The école est fermée" ,
174- tokenization = Tokenization .WORD ,
175- analyzer_config = cfg ,
176- )
177- assert result .indexed == ["the" , "école" , "est" , "fermée" ]
178- assert "the" not in result .query
179- assert "école" in result .query
180-
181- def test_stopwords_fallback (self , client : weaviate .WeaviateClient ) -> None :
182- """Top-level stopwords acts as the fallback detector when no analyzerConfig.stopwordPreset is set."""
183- sw = _StopwordsCreate (preset = StopwordsPreset .EN , additions = ["quick" ], removals = None )
184- result = client .tokenization .text (
185- text = "the quick brown fox" ,
186- tokenization = Tokenization .WORD ,
187- stopwords = sw ,
188- )
189- assert result .indexed == ["the" , "quick" , "brown" , "fox" ]
190- # "the" (en) and "quick" (addition) filtered.
191- assert result .query == ["brown" , "fox" ]
192-
193- def test_stopwords_additions_default_preset_to_en (
194- self , client : weaviate .WeaviateClient
112+ @pytest .mark .parametrize (
113+ "call_kwargs,expected_indexed,expected_query" ,
114+ [
115+ (
116+ {"text" : "The quick brown fox" },
117+ ["the" , "quick" , "brown" , "fox" ],
118+ ["quick" , "brown" , "fox" ],
119+ ),
120+ (
121+ {
122+ "text" : "The quick brown fox" ,
123+ "analyzer_config" : _TextAnalyzerConfigCreate (
124+ stopword_preset = StopwordsPreset .NONE
125+ ),
126+ },
127+ ["the" , "quick" , "brown" , "fox" ],
128+ ["the" , "quick" , "brown" , "fox" ],
129+ ),
130+ (
131+ {
132+ "text" : "L'école est fermée" ,
133+ "analyzer_config" : _TextAnalyzerConfigCreate (ascii_fold = True ),
134+ },
135+ ["l" , "ecole" , "est" , "fermee" ],
136+ ["l" , "ecole" , "fermee" ],
137+ ),
138+ (
139+ {
140+ "text" : "L'école est fermée" ,
141+ "analyzer_config" : _TextAnalyzerConfigCreate (
142+ ascii_fold = True , ascii_fold_ignore = ["é" ]
143+ ),
144+ },
145+ ["l" , "école" , "est" , "fermée" ],
146+ ["l" , "école" , "fermée" ],
147+ ),
148+ (
149+ {
150+ "text" : "The quick brown fox" ,
151+ "analyzer_config" : _TextAnalyzerConfigCreate (
152+ stopword_preset = StopwordsPreset .EN
153+ ),
154+ },
155+ ["the" , "quick" , "brown" , "fox" ],
156+ ["quick" , "brown" , "fox" ],
157+ ),
158+ (
159+ {
160+ "text" : "The quick brown fox" ,
161+ "analyzer_config" : _TextAnalyzerConfigCreate (stopword_preset = "en" ),
162+ },
163+ ["the" , "quick" , "brown" , "fox" ],
164+ ["quick" , "brown" , "fox" ],
165+ ),
166+ (
167+ {
168+ "text" : "The école est fermée" ,
169+ "analyzer_config" : _TextAnalyzerConfigCreate (
170+ ascii_fold = True ,
171+ ascii_fold_ignore = ["é" ],
172+ stopword_preset = StopwordsPreset .EN ,
173+ ),
174+ },
175+ ["the" , "école" , "est" , "fermée" ],
176+ ["école" , "est" , "fermée" ],
177+ ),
178+ (
179+ {
180+ "text" : "the quick brown fox" ,
181+ "stopwords" : _StopwordsCreate (
182+ preset = StopwordsPreset .EN , additions = ["quick" ], removals = None
183+ ),
184+ },
185+ ["the" , "quick" , "brown" , "fox" ],
186+ ["brown" , "fox" ],
187+ ),
188+ (
189+ {
190+ "text" : "the quick hello world" ,
191+ "stopwords" : _StopwordsCreate (
192+ preset = None , additions = ["hello" ], removals = None
193+ ),
194+ },
195+ ["the" , "quick" , "hello" , "world" ],
196+ ["quick" , "world" ],
197+ ),
198+ (
199+ {
200+ "text" : "the quick is fast" ,
201+ "stopwords" : _StopwordsCreate (
202+ preset = None , additions = None , removals = ["the" ]
203+ ),
204+ },
205+ ["the" , "quick" , "is" , "fast" ],
206+ ["the" , "quick" , "fast" ],
207+ ),
208+ (
209+ {
210+ "text" : "hello world test" ,
211+ "analyzer_config" : _TextAnalyzerConfigCreate (stopword_preset = "custom" ),
212+ "stopword_presets" : {"custom" : ["test" ]},
213+ },
214+ ["hello" , "world" , "test" ],
215+ ["hello" , "world" ],
216+ ),
217+ (
218+ {
219+ "text" : "the quick hello world" ,
220+ "stopword_presets" : {"en" : ["hello" ]},
221+ },
222+ ["the" , "quick" , "hello" , "world" ],
223+ ["the" , "quick" , "world" ],
224+ ),
225+ ],
226+ ids = [
227+ "default_en_applied_for_word" ,
228+ "opt_out_of_default_en" ,
229+ "ascii_fold" ,
230+ "ascii_fold_with_ignore" ,
231+ "stopword_preset_enum" ,
232+ "stopword_preset_string" ,
233+ "ascii_fold_combined_with_stopwords" ,
234+ "stopwords_fallback" ,
235+ "stopwords_additions_default_preset_to_en" ,
236+ "stopwords_removals_default_preset_to_en" ,
237+ "stopword_presets_named_reference" ,
238+ "stopword_presets_override_builtin_en" ,
239+ ],
240+ )
241+ def test_text_tokenize (
242+ self ,
243+ client : weaviate .WeaviateClient ,
244+ call_kwargs : dict ,
245+ expected_indexed : list ,
246+ expected_query : list ,
195247 ) -> None :
196- """Caller omits preset, passes only additions. Server defaults preset to 'en' and builds detector from en + additions."""
197- sw = _StopwordsCreate (preset = None , additions = ["hello" ], removals = None )
198- result = client .tokenization .text (
199- text = "the quick hello world" ,
200- tokenization = Tokenization .WORD ,
201- stopwords = sw ,
202- )
203- assert result .query == ["quick" , "world" ]
204-
205- def test_stopwords_removals_default_preset_to_en (self , client : weaviate .WeaviateClient ) -> None :
206- """Caller omits preset, passes only removals. 'the' is removed from the en list so it passes through."""
207- sw = _StopwordsCreate (preset = None , additions = None , removals = ["the" ])
208- result = client .tokenization .text (
209- text = "the quick is fast" ,
210- tokenization = Tokenization .WORD ,
211- stopwords = sw ,
212- )
213- # "is" still in en, "the" removed.
214- assert result .query == ["the" , "quick" , "fast" ]
215-
216- def test_stopword_presets_named_reference (self , client : weaviate .WeaviateClient ) -> None :
217- """Define a named preset via stopword_presets, select it via analyzerConfig.stopwordPreset. Word lists use the collection shape."""
218- result = client .tokenization .text (
219- text = "hello world test" ,
220- tokenization = Tokenization .WORD ,
221- analyzer_config = _TextAnalyzerConfigCreate (stopword_preset = "custom" ),
222- stopword_presets = {"custom" : ["test" ]},
223- )
224- assert result .indexed == ["hello" , "world" , "test" ]
225- assert result .query == ["hello" , "world" ]
226-
227- def test_stopword_presets_override_builtin_en (self , client : weaviate .WeaviateClient ) -> None :
228- """A user-defined preset sharing a name with a built-in replaces the built-in entirely, including on the default-en path for word tokenization."""
229- result = client .tokenization .text (
230- text = "the quick hello world" ,
231- tokenization = Tokenization .WORD ,
232- stopword_presets = {"en" : ["hello" ]},
233- )
234- assert result .indexed == ["the" , "quick" , "hello" , "world" ]
235- # "the" no longer filtered (built-in en replaced), "hello" is.
236- assert result .query == ["the" , "quick" , "world" ]
248+ result = client .tokenization .text (tokenization = Tokenization .WORD , ** call_kwargs )
249+ assert isinstance (result , TokenizeResult )
250+ assert result .indexed == expected_indexed
251+ assert result .query == expected_query
237252
238253
239254# ---------------------------------------------------------------------------
@@ -287,33 +302,44 @@ def test_property_result_populates_tokenization(self, client: weaviate.WeaviateC
287302class TestClientSideValidation :
288303 """Verify that client-side validation rejects invalid input before hitting the server."""
289304
290- def test_ascii_fold_ignore_without_fold_raises (self ) -> None :
291- with pytest .raises (ValueError , match = "asciiFoldIgnore" ):
292- _TextAnalyzerConfigCreate (ascii_fold = False , ascii_fold_ignore = ["é" ])
293-
294- def test_ascii_fold_ignore_without_fold_default_raises (self ) -> None :
305+ @pytest .mark .parametrize (
306+ "kwargs" ,
307+ [
308+ {"ascii_fold" : False , "ascii_fold_ignore" : ["é" ]},
309+ {"ascii_fold_ignore" : ["é" ]},
310+ ],
311+ ids = ["explicit_false" , "default" ],
312+ )
313+ def test_ascii_fold_ignore_without_fold_raises (self , kwargs : dict ) -> None :
295314 with pytest .raises (ValueError , match = "asciiFoldIgnore" ):
296- _TextAnalyzerConfigCreate (ascii_fold_ignore = [ "é" ] )
315+ _TextAnalyzerConfigCreate (** kwargs )
297316
298- def test_valid_config_does_not_raise (self ) -> None :
299- cfg = _TextAnalyzerConfigCreate (ascii_fold = True , ascii_fold_ignore = ["é" , "ñ" ])
300- assert cfg .asciiFold is True
301- assert cfg .asciiFoldIgnore == ["é" , "ñ" ]
302-
303- def test_fold_without_ignore_is_valid (self ) -> None :
304- cfg = _TextAnalyzerConfigCreate (ascii_fold = True )
305- assert cfg .asciiFold is True
306- assert cfg .asciiFoldIgnore is None
307-
308- def test_stopword_preset_only_is_valid (self ) -> None :
309- cfg = _TextAnalyzerConfigCreate (stopword_preset = "en" )
310- assert cfg .stopwordPreset == "en"
311-
312- def test_empty_config_is_valid (self ) -> None :
313- cfg = _TextAnalyzerConfigCreate ()
314- assert cfg .asciiFold is None
315- assert cfg .asciiFoldIgnore is None
316- assert cfg .stopwordPreset is None
317+ @pytest .mark .parametrize (
318+ "kwargs,expected" ,
319+ [
320+ (
321+ {"ascii_fold" : True , "ascii_fold_ignore" : ["é" , "ñ" ]},
322+ {"asciiFold" : True , "asciiFoldIgnore" : ["é" , "ñ" ]},
323+ ),
324+ (
325+ {"ascii_fold" : True },
326+ {"asciiFold" : True , "asciiFoldIgnore" : None },
327+ ),
328+ (
329+ {"stopword_preset" : "en" },
330+ {"stopwordPreset" : "en" },
331+ ),
332+ (
333+ {},
334+ {"asciiFold" : None , "asciiFoldIgnore" : None , "stopwordPreset" : None },
335+ ),
336+ ],
337+ ids = ["fold_with_ignore" , "fold_without_ignore" , "stopword_preset_only" , "empty" ],
338+ )
339+ def test_valid_config (self , kwargs : dict , expected : dict ) -> None :
340+ cfg = _TextAnalyzerConfigCreate (** kwargs )
341+ for attr , value in expected .items ():
342+ assert getattr (cfg , attr ) == value
317343
318344 def test_stopwords_and_stopword_presets_mutex (self , client : weaviate .WeaviateClient ) -> None :
319345 """Client rejects the mutex violation locally with ValueError, before sending the request (which the server would also reject with 422)."""
@@ -411,7 +437,6 @@ async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClien
411437 assert isinstance (result , TokenizeResult )
412438 assert result .tokenization == Tokenization .WORD
413439 assert result .indexed == ["the" , "quick" , "brown" , "fox" ]
414- assert "the" not in result .query
415- assert "quick" in result .query
440+ assert result .query == ["quick" , "brown" , "fox" ]
416441 finally :
417442 await async_client .collections .delete ("TestAsyncPropTokenize" )
0 commit comments