Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
569 changes: 569 additions & 0 deletions integration/test_collection_config.py

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions test/collection/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Property,
Reconfigure,
ReferenceProperty,
StopwordsPreset,
TextAnalyzerConfig,
Tokenization,
Vectorizers,
Expand Down Expand Up @@ -3097,3 +3098,100 @@ def test_text_analyzer_rejects_wrong_types(self) -> None:
TextAnalyzerConfig(ascii_fold="yes") # type: ignore[arg-type]
with pytest.raises(ValidationError):
TextAnalyzerConfig(ascii_fold_ignore="é") # type: ignore[arg-type]

def test_text_analyzer_stopword_preset_builtin_enum(self) -> None:
    """A built-in StopwordsPreset enum serializes as its plain string value."""
    analyzer = TextAnalyzerConfig(stopword_preset=StopwordsPreset.EN)
    prop = Property(
        name="title",
        data_type=DataType.TEXT,
        tokenization=Tokenization.WORD,
        text_analyzer=analyzer,
    )
    serialized = prop._to_dict()
    assert serialized["textAnalyzer"] == {"stopwordPreset": "en"}

def test_text_analyzer_stopword_preset_user_defined_string(self) -> None:
    """A user-defined preset name given as a plain string passes through unchanged."""
    analyzer = TextAnalyzerConfig(stopword_preset="fr")
    prop = Property(
        name="title_fr",
        data_type=DataType.TEXT,
        tokenization=Tokenization.WORD,
        text_analyzer=analyzer,
    )
    serialized = prop._to_dict()
    assert serialized["textAnalyzer"] == {"stopwordPreset": "fr"}

def test_text_analyzer_combined_ascii_fold_and_stopword_preset(self) -> None:
    """ascii_fold, ascii_fold_ignore and stopword_preset serialize side by side."""
    analyzer = TextAnalyzerConfig(
        ascii_fold=True,
        ascii_fold_ignore=["é"],
        stopword_preset="fr",
    )
    prop = Property(
        name="title",
        data_type=DataType.TEXT,
        tokenization=Tokenization.WORD,
        text_analyzer=analyzer,
    )
    expected = {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
        "stopwordPreset": "fr",
    }
    assert prop._to_dict()["textAnalyzer"] == expected

def test_text_analyzer_stopword_preset_only_omits_other_keys(self) -> None:
    """When only stopword_preset is set, no asciiFold keys appear in the payload."""
    prop = Property(
        name="title",
        data_type=DataType.TEXT,
        tokenization=Tokenization.WORD,
        text_analyzer=TextAnalyzerConfig(stopword_preset="fr"),
    )
    analyzer_dict = prop._to_dict()["textAnalyzer"]
    for absent_key in ("asciiFold", "asciiFoldIgnore"):
        assert absent_key not in analyzer_dict


class TestInvertedIndexStopwordPresets:
    """Tests for user-defined stopword presets on the inverted index config.

    Covers both the create path (``Configure.inverted_index``) and the update
    path (``Reconfigure.inverted_index`` merged into an existing schema dict).
    """

    def test_configure_inverted_index_with_stopword_presets(self) -> None:
        """stopword_presets passed to Configure end up under 'stopwordPresets'."""
        ic = Configure.inverted_index(
            stopword_presets={
                "fr": ["le", "la", "les"],
                "es": ["el", "la", "los"],
            },
        )
        out = ic._to_dict()
        assert out["stopwordPresets"] == {
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        }

    def test_configure_inverted_index_without_stopword_presets_omits_key(self) -> None:
        """When not configured, the key is omitted from the wire payload entirely."""
        ic = Configure.inverted_index()
        assert "stopwordPresets" not in ic._to_dict()

    def test_reconfigure_inverted_index_merges_stopword_presets(self) -> None:
        """Reconfigure adds stopwordPresets without touching unrelated fields."""
        rc = Reconfigure.inverted_index(stopword_presets={"fr": ["le", "la"]})
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "cleanupIntervalSeconds": 60,
        }
        merged = rc.merge_with_existing(existing)
        assert merged["stopwordPresets"] == {"fr": ["le", "la"]}
        # other fields untouched
        assert merged["stopwords"]["preset"] == "en"
        assert merged["bm25"]["b"] == 0.75

    def test_reconfigure_inverted_index_replaces_existing_stopword_presets(self) -> None:
        """A new presets map fully replaces the previously stored one."""
        rc = Reconfigure.inverted_index(stopword_presets={"fr": ["le"]})
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "stopwordPresets": {"fr": ["le", "la", "les"], "es": ["el"]},
        }
        merged = rc.merge_with_existing(existing)
        # The new value fully replaces the prior dict (this matches the server-side
        # PUT semantics — see test_tokenize.py::test_remove_unused_preset_is_allowed).
        assert merged["stopwordPresets"] == {"fr": ["le"]}

    def test_reconfigure_inverted_index_without_stopword_presets_leaves_existing(self) -> None:
        """A reconfigure touching only bm25 must not clobber existing presets."""
        rc = Reconfigure.inverted_index(bm25_b=0.7, bm25_k1=1.1)
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "stopwordPresets": {"fr": ["le", "la"]},
        }
        merged = rc.merge_with_existing(existing)
        assert merged["stopwordPresets"] == {"fr": ["le", "la"]}
100 changes: 100 additions & 0 deletions test/collection/test_config_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from weaviate.collections.classes.config_methods import (
_collection_config_from_json,
_collection_configs_simple_from_json,
_nested_properties_from_config,
_properties_from_config,
Expand Down Expand Up @@ -145,3 +146,102 @@ def test_nested_properties_from_config_parses_text_analyzer() -> None:
"asciiFold": True,
"asciiFoldIgnore": ["ñ"],
}


def test_properties_from_config_parses_stopword_preset_only() -> None:
    """A property with only stopwordPreset (no asciiFold) must still produce a text_analyzer."""
    schema = {
        "vectorizer": "none",
        "properties": [_make_text_prop("title", textAnalyzer={"stopwordPreset": "fr"})],
    }
    title = _properties_from_config(schema)[0]
    analyzer = title.text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is False
    assert analyzer.ascii_fold_ignore is None
    assert analyzer.stopword_preset == "fr"


def test_properties_from_config_parses_combined_text_analyzer() -> None:
    """All three textAnalyzer fields parse together from a server schema dict."""
    analyzer_json = {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
        "stopwordPreset": "fr",
    }
    schema = {
        "vectorizer": "none",
        "properties": [_make_text_prop("title", textAnalyzer=analyzer_json)],
    }
    title = _properties_from_config(schema)[0]
    analyzer = title.text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is True
    assert analyzer.ascii_fold_ignore == ["é"]
    assert analyzer.stopword_preset == "fr"


def _full_schema(class_name: str, **inverted_overrides) -> dict:
inverted = {
"bm25": {"b": 0.75, "k1": 1.2},
"cleanupIntervalSeconds": 60,
"stopwords": {"preset": "en", "additions": None, "removals": None},
}
inverted.update(inverted_overrides)
return {
"class": class_name,
"vectorizer": "none",
"properties": [],
"invertedIndexConfig": inverted,
"replicationConfig": {"factor": 1, "deletionStrategy": "NoAutomatedResolution"},
"shardingConfig": {
"virtualPerPhysical": 128,
"desiredCount": 1,
"actualCount": 1,
"desiredVirtualCount": 128,
"actualVirtualCount": 128,
"key": "_id",
"strategy": "hash",
"function": "murmur3",
},
"vectorIndexType": "hnsw",
"vectorIndexConfig": {
"skip": False,
"cleanupIntervalSeconds": 300,
"maxConnections": 64,
"efConstruction": 128,
"ef": -1,
"dynamicEfMin": 100,
"dynamicEfMax": 500,
"dynamicEfFactor": 8,
"vectorCacheMaxObjects": 1000000000000,
"flatSearchCutoff": 40000,
"distance": "cosine",
},
}


def test_collection_config_parses_stopword_presets() -> None:
    """The inverted index config exposes stopwordPresets when present in the schema."""
    schema = _full_schema(
        "TestStopwordPresets",
        stopwordPresets={
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        },
    )
    parsed = _collection_config_from_json(schema)
    expected = {
        "fr": ["le", "la", "les"],
        "es": ["el", "la", "los"],
    }
    assert parsed.inverted_index_config.stopword_presets == expected


def test_collection_config_stopword_presets_absent() -> None:
    """If the server response omits stopwordPresets, the parsed value is None."""
    parsed = _collection_config_from_json(_full_schema("TestNoStopwordPresets"))
    assert parsed.inverted_index_config.stopword_presets is None
56 changes: 46 additions & 10 deletions weaviate/collections/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,12 +380,14 @@ class _InvertedIndexConfigCreate(_ConfigCreateModel):
indexPropertyLength: Optional[bool]
indexNullState: Optional[bool]
stopwords: _StopwordsCreate
stopwordPresets: Optional[Dict[str, List[str]]] = None


class _InvertedIndexConfigUpdate(_ConfigUpdateModel):
    """Mutable inverted-index settings sent when updating an existing collection.

    All fields are optional: a ``None`` value means "leave the server-side
    setting unchanged" when this model is merged into the existing schema.
    """

    # BM25 ranking parameters (b / k1) to change, if any.
    bm25: Optional[_BM25ConfigUpdate]
    # Tombstone/cleanup interval for the inverted index, in seconds.
    cleanupIntervalSeconds: Optional[int]
    # Collection-level stopword configuration (preset plus additions/removals).
    stopwords: Optional[_StopwordsUpdate]
    # User-defined named stopword lists, keyed by preset name. When provided,
    # replaces the whole preset map on the server (PUT semantics).
    stopwordPresets: Optional[Dict[str, List[str]]] = None


class _MultiTenancyConfigCreate(_ConfigCreateModel):
Expand Down Expand Up @@ -1647,6 +1649,7 @@ class _InvertedIndexConfig(_ConfigBase):
index_property_length: bool
index_timestamps: bool
stopwords: StopwordsConfig
stopword_presets: Optional[Dict[str, List[str]]] = None


InvertedIndexConfig = _InvertedIndexConfig
Expand Down Expand Up @@ -1675,6 +1678,7 @@ class _PropertyVectorizerConfig:
class _TextAnalyzerConfig(_ConfigBase):
    """Parsed per-property text-analyzer settings as returned by the server."""

    # Whether accents/diacritics are folded to base characters at index time.
    ascii_fold: bool
    # Characters explicitly excluded from ASCII folding; None when unset.
    ascii_fold_ignore: Optional[List[str]]
    # Name of the stopword preset applied to this property; None when unset.
    stopword_preset: Optional[str]


@dataclass
Expand Down Expand Up @@ -2174,24 +2178,42 @@ class _ShardStatus:
class TextAnalyzerConfig(_ConfigCreateModel):
    """Text analysis options for a property.

    Configures per-property text analysis for `text` and `text[]` properties that use an
    inverted index (searchable or filterable). Supports ASCII folding (accent/diacritic
    handling) and selecting a stopword preset that overrides the collection-level
    `invertedIndexConfig.stopwords` setting for this property only.

    Attributes:
        ascii_fold: If True, accent/diacritic marks are folded to their base characters
            during indexing and search (e.g. 'école' matches 'ecole'). If omitted, the
            field is not sent to the server and the server default (False) applies.
        ascii_fold_ignore: Optional list of characters that should be excluded from
            ASCII folding (e.g. ['é'] keeps 'é' from being folded to 'e'). If omitted,
            the field is not sent to the server.
        stopword_preset: Stopword preset name. Overrides the collection-level
            `invertedIndexConfig.stopwords` for this property. Only applies to
            properties using `Tokenization.WORD`. Accepts a built-in preset
            (`StopwordsPreset.EN` or `StopwordsPreset.NONE`) or the name of a
            user-defined preset declared in
            `Configure.inverted_index(stopword_presets=...)`.

    All settings are immutable after the property is created.
    """

    asciiFold: Optional[bool] = Field(default=None, alias="ascii_fold")
    asciiFoldIgnore: Optional[List[str]] = Field(default=None, alias="ascii_fold_ignore")
    stopwordPreset: Optional[Union[StopwordsPreset, str]] = Field(
        default=None, alias="stopword_preset"
    )

    @field_validator("stopwordPreset", mode="before")
    @classmethod
    def _coerce_stopword_preset(cls, v: Any) -> Any:
        """Normalize a StopwordsPreset enum to its string value before validation.

        Pydantic preserves the StopwordsPreset enum instance through model_dump,
        but the wire format must be a plain string. Coerce at construction time.
        """
        if isinstance(v, StopwordsPreset):
            return v.value
        return v


class Property(_ConfigCreateModel):
Expand Down Expand Up @@ -2615,11 +2637,17 @@ def inverted_index(
stopwords_preset: Optional[StopwordsPreset] = None,
stopwords_additions: Optional[List[str]] = None,
stopwords_removals: Optional[List[str]] = None,
stopword_presets: Optional[Dict[str, List[str]]] = None,
) -> _InvertedIndexConfigCreate:
"""Create an `InvertedIndexConfigCreate` object to be used when defining the configuration of the keyword searching algorithm of Weaviate.

Args:
See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details!
stopword_presets: User-defined named stopword lists keyed by preset name. Each value
is a flat list of stopword strings. A preset can be referenced from a property's
`text_analyzer.stopword_preset` to override the collection-level stopwords for
that property only. Requires Weaviate >= 1.37.0.

See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details on the other parameters.
""" # noqa: D417 (missing argument descriptions in the docstring)
if bm25_b is None and bm25_k1 is not None or bm25_k1 is None and bm25_b is not None:
raise ValueError("bm25_b and bm25_k1 must be specified together")
Expand All @@ -2639,6 +2667,7 @@ def inverted_index(
additions=stopwords_additions,
removals=stopwords_removals,
),
stopwordPresets=stopword_presets,
)

@staticmethod
Expand Down Expand Up @@ -2913,13 +2942,19 @@ def inverted_index(
stopwords_additions: Optional[List[str]] = None,
stopwords_preset: Optional[StopwordsPreset] = None,
stopwords_removals: Optional[List[str]] = None,
stopword_presets: Optional[Dict[str, List[str]]] = None,
) -> _InvertedIndexConfigUpdate:
"""Create an `InvertedIndexConfigUpdate` object.

Use this method when defining the `inverted_index_config` argument in `collection.update()`.

Args:
See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for a more detailed view!
stopword_presets: User-defined named stopword lists keyed by preset name. Each value
is a flat list of stopword strings. Passing this replaces the entire user-defined
stopword preset map for the collection. Removing a preset still referenced by a
property is rejected by the server. Requires Weaviate >= 1.37.0.

See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details on the other parameters.
""" # noqa: D417 (missing argument descriptions in the docstring)
return _InvertedIndexConfigUpdate(
bm25=_BM25ConfigUpdate(b=bm25_b, k1=bm25_k1),
Expand All @@ -2929,6 +2964,7 @@ def inverted_index(
additions=stopwords_additions,
removals=stopwords_removals,
),
stopwordPresets=stopword_presets,
)

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion weaviate/collections/classes/config_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
continue
if isinstance(val, Enum):
schema[cls_field] = str(val.value)
elif isinstance(val, (int, float, bool, str, list)):
elif isinstance(val, (int, float, bool, str, list, dict)):
schema[cls_field] = val
elif isinstance(val, _QuantizerConfigUpdate):
quantizers = ["pq", "bq", "sq"]
Expand Down
6 changes: 5 additions & 1 deletion weaviate/collections/classes/config_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,7 @@ def _collection_config_from_json(schema: Dict[str, Any]) -> _CollectionConfig:
additions=schema["invertedIndexConfig"]["stopwords"]["additions"],
removals=schema["invertedIndexConfig"]["stopwords"]["removals"],
),
stopword_presets=schema["invertedIndexConfig"].get("stopwordPresets"),
),
multi_tenancy_config=_MultiTenancyConfig(
enabled=schema.get("multiTenancyConfig", {}).get("enabled", False),
Expand Down Expand Up @@ -467,11 +468,14 @@ def _text_analyzer_from_config(prop: Dict[str, Any]) -> Optional[_TextAnalyzerCo
ta = prop.get("textAnalyzer")
if ta is None:
return None
if "asciiFold" not in ta:
# The server normalizes an empty TextAnalyzer to nil (see usecases/schema/validation.go),
# so the only meaningful signal is the presence of one of the configured fields.
if "asciiFold" not in ta and "stopwordPreset" not in ta:
return None
return _TextAnalyzerConfig(
ascii_fold=ta.get("asciiFold", False),
ascii_fold_ignore=ta.get("asciiFoldIgnore"),
stopword_preset=ta.get("stopwordPreset"),
)


Expand Down
Loading
Loading