@@ -40,14 +40,15 @@ def text(
4040
4141 For ``word`` tokenization the server defaults to the built-in ``en``
4242 stopword preset when no stopword configuration is supplied. Pass
43- ``analyzer_config=TextAnalyzerConfig (stopword_preset="none" )`` or
44- equivalent to opt out.
43+ ``analyzer_config=Configure.text_analyzer(stopword_preset=StopwordsPreset.NONE)``
44+ (or equivalent) to opt out.
4545
4646 Args:
4747 text: The text to tokenize.
4848 tokenization: The tokenization method to use (e.g. Tokenization.WORD).
4949 analyzer_config: Text analyzer settings (ASCII folding, stopword
50- preset name). ``stopword_preset`` may reference a built-in preset
50+ preset name), built via ``Configure.text_analyzer(...)``.
51+ ``stopword_preset`` may reference a built-in preset
5152 (``en`` / ``none``) or a name defined in ``stopword_presets``.
5253 stopwords: Fallback stopword config applied when
5354 ``analyzer_config.stopword_preset`` is not set. Same shape as a
@@ -64,13 +65,13 @@ def text(
6465 422 if both are supplied.
6566
6667 Returns:
67- A TokenizeResult with indexed and query token lists. The generic
68- endpoint does not echo request fields (tokenization, analyzer_config,
69- stopwords, stopword_presets) back in the response.
68+ A TokenizeResult with indexed and query token lists. The response
69+ does not echo request fields back.
7070
7171 Raises:
7272 WeaviateUnsupportedFeatureError: If the server version is below 1.37.0.
73- ValueError: If both ``stopwords`` and ``stopword_presets`` are passed.
73+ ValueError: If both ``stopwords`` and ``stopword_presets`` are passed,
74+ or if any ``stopword_presets`` value is not a list/tuple of strings.
7475 """
7576 self .__check_version ()
7677
@@ -94,10 +95,28 @@ def text(
9495
9596 if stopword_presets is not None :
9697 # Plain word-list shape matching a collection's
97- # invertedIndexConfig.stopwordPresets.
98- payload ["stopwordPresets" ] = {
99- name : list (words ) for name , words in stopword_presets .items ()
100- }
98+ # invertedIndexConfig.stopwordPresets. Reject str (would
99+ # silently split into characters) and pydantic models /
100+ # other non-sequence shapes up-front so callers get a clear
101+ # error instead of a malformed payload.
102+ validated : Dict [str , List [str ]] = {}
103+ for name , words in stopword_presets .items ():
104+ if isinstance (words , (str , bytes )):
105+ raise ValueError (
106+ f"stopword_presets[{ name !r} ] must be a list of strings, "
107+ f"got { type (words ).__name__ } "
108+ )
109+ if not isinstance (words , (list , tuple )):
110+ raise ValueError (
111+ f"stopword_presets[{ name !r} ] must be a list of strings, "
112+ f"got { type (words ).__name__ } "
113+ )
114+ if not all (isinstance (w , str ) for w in words ):
115+ raise ValueError (
116+ f"stopword_presets[{ name !r} ] must contain only strings"
117+ )
118+ validated [name ] = list (words )
119+ payload ["stopwordPresets" ] = validated
101120
102121 def resp (response : Response ) -> TokenizeResult :
103122 return TokenizeResult .model_validate (response .json ())
0 commit comments