Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
78ff5ec
feat: add TextAnalyzerConfig for ASCII folding in text properties
amourao Apr 9, 2026
6931a6f
refactor: ruff format
amourao Apr 9, 2026
bda3008
feat: add min version check
amourao Apr 9, 2026
77fc0ff
feat: update TextAnalyzerConfig docstring for ascii_fold attributes
amourao Apr 9, 2026
a8d6927
feat: add asciiFold check in _text_analyzer_from_config function
amourao Apr 9, 2026
e8919a3
test: fix ASCII folding tests
amourao Apr 9, 2026
3cc6306
feat: add support for stopword presets in inverted index configuratio…
amourao Apr 9, 2026
ef04dea
test: added live and config tests
amourao Apr 9, 2026
8f1b33b
refactor: improve docstrings for stopword presets and asciiFold tests
amourao Apr 9, 2026
03d6ff4
refactor: simplify _any_property_has_text_analyzer function using _pr…
amourao Apr 13, 2026
1342204
test: remove redundant insertion ascii fold tests from test_collectio…
amourao Apr 13, 2026
cb53d6a
test: add stopwords roundtrip test for collection configuration
amourao Apr 13, 2026
9de03f3
feat: add model validator to enforce asciiFoldIgnore constraints in T…
amourao Apr 13, 2026
7018927
feat: add factory class for text analyzer configurations with ASCII f…
amourao Apr 13, 2026
8e91984
refactor: update TextAnalyzerConfig usage to new Configure class methods
amourao Apr 13, 2026
30814fc
Merge branch 'feat/ascii-fold' into feat/stopword-presets
amourao Apr 13, 2026
db3009c
test: remove redundant line in stopword presets merge test
amourao Apr 13, 2026
50f7768
refactor: use factory pattern
amourao Apr 13, 2026
a0efe43
refactor: format text analyzer configuration for better readability
amourao Apr 14, 2026
fa92fc2
refactor: remove server side behavior tests
amourao Apr 14, 2026
27cd0a4
test: add stopword presets roundtrip tests for Weaviate collections
amourao Apr 14, 2026
83c2431
refactor: remove unnecessary stopword preset coercion from _TextAnaly…
amourao Apr 14, 2026
4e0a0f2
refactor: replace custom text analyzer method with a direct function …
amourao Apr 14, 2026
eaea155
Merge branch 'dev/1.37' into feat/ascii-fold
amourao Apr 14, 2026
38c7f44
chore: remove unused deprecated import from config.py
amourao Apr 14, 2026
ec43d53
Merge branch 'feat/stopword-presets' into feat/ascii-fold
amourao Apr 14, 2026
b3eb0ac
chore: update WEAVIATE_137 version to 1.37.0-rc.1-578c4eb in workflow
amourao Apr 14, 2026
ceef271
refactor: update text analyzer method to use new static method in Con…
amourao Apr 14, 2026
5e751bf
test: add stopwords roundtrip test with ASCII folding configuration
amourao Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
454 changes: 453 additions & 1 deletion integration/test_collection_config.py

Large diffs are not rendered by default.

162 changes: 159 additions & 3 deletions test/collection/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,21 @@
from pydantic import ValidationError

from weaviate.collections.classes.config import (
_AsyncReplicationConfig,
_ReplicationConfig,
_ReplicationConfigUpdate,
Configure,
DataType,
Property,
Reconfigure,
ReferenceProperty,
StopwordsPreset,
Tokenization,
Vectorizers,
_AsyncReplicationConfig,
_CollectionConfigCreate,
_GenerativeProvider,
_ReplicationConfig,
_ReplicationConfigUpdate,
_RerankerProvider,
_TextAnalyzerConfigCreate,
_VectorizerConfigCreate,
_ReplicationConfigCreate,
ReplicationDeletionStrategy,
Expand Down Expand Up @@ -3021,3 +3024,156 @@ def test_nested_property_with_id_name_is_allowed() -> None:
],
)
assert prop.nestedProperties[0].name == "id"


class Test_TextAnalyzerConfigCreate:
    """Unit tests for the per-property text-analyzer creation config."""

    def test_property_without_text_analyzer_omits_key(self) -> None:
        """No text_analyzer given -> serialized property has no textAnalyzer key."""
        serialized = Property(name="title", data_type=DataType.TEXT)._to_dict()
        assert "textAnalyzer" not in serialized

    def test_property_with_ascii_fold_only(self) -> None:
        """ascii_fold alone serializes to a single asciiFold entry."""
        analyzer = Configure.TextAnalyzer(ascii_fold=True)
        prop = Property(name="title", data_type=DataType.TEXT, text_analyzer=analyzer)
        assert prop._to_dict()["textAnalyzer"] == {"asciiFold": True}

    def test_property_with_ascii_fold_and_ignore(self) -> None:
        """ascii_fold plus an ignore list serializes both keys; tokenization is untouched."""
        analyzer = Configure.TextAnalyzer(ascii_fold=True, ascii_fold_ignore=["é", "ñ"])
        serialized = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=analyzer,
        )._to_dict()
        expected = {"asciiFold": True, "asciiFoldIgnore": ["é", "ñ"]}
        assert serialized["textAnalyzer"] == expected
        assert serialized["tokenization"] == "word"

    def test_text_analyzer_rejects_ignore_without_ascii_fold(self) -> None:
        """An ignore list without ascii_fold=True fails model validation."""
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold_ignore=["é"])

    def test_nested_property_with_text_analyzer(self) -> None:
        """A text analyzer on a nested property survives serialization of the parent."""
        nested = Property(
            name="title",
            data_type=DataType.TEXT,
            text_analyzer=Configure.TextAnalyzer(ascii_fold=True, ascii_fold_ignore=["ñ"]),
        )
        parent = Property(name="meta", data_type=DataType.OBJECT, nested_properties=[nested])
        serialized = parent._to_dict()["nestedProperties"][0]["textAnalyzer"]
        assert serialized == {"asciiFold": True, "asciiFoldIgnore": ["ñ"]}

    def test_text_analyzer_rejects_wrong_types(self) -> None:
        """Non-bool ascii_fold and non-list ascii_fold_ignore both fail validation."""
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold="yes")  # type: ignore[arg-type]
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold_ignore="é")

    def test_text_analyzer_stopword_preset_builtin_enum(self) -> None:
        """A built-in StopwordsPreset enum serializes to its string value."""
        serialized = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=Configure.TextAnalyzer(stopword_preset=StopwordsPreset.EN),
        )._to_dict()
        assert serialized["textAnalyzer"] == {"stopwordPreset": "en"}

    def test_text_analyzer_stopword_preset_user_defined_string(self) -> None:
        """A user-defined preset name passes through as a plain string."""
        serialized = Property(
            name="title_fr",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=Configure.TextAnalyzer(stopword_preset="fr"),
        )._to_dict()
        assert serialized["textAnalyzer"] == {"stopwordPreset": "fr"}

    def test_text_analyzer_combined_ascii_fold_and_stopword_preset(self) -> None:
        """ASCII folding and a stopword preset can be combined in one analyzer."""
        analyzer = Configure.TextAnalyzer(
            ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset="fr"
        )
        serialized = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=analyzer,
        )._to_dict()
        assert serialized["textAnalyzer"] == {
            "asciiFold": True,
            "asciiFoldIgnore": ["é"],
            "stopwordPreset": "fr",
        }

    def test_text_analyzer_stopword_preset_only_omits_other_keys(self) -> None:
        """A preset-only analyzer emits no asciiFold* keys."""
        serialized = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=Configure.TextAnalyzer(stopword_preset="fr"),
        )._to_dict()
        analyzer_dict = serialized["textAnalyzer"]
        assert "asciiFold" not in analyzer_dict
        assert "asciiFoldIgnore" not in analyzer_dict


class TestInvertedIndexStopwordPresets:
    """Unit tests for stopword-preset handling in the inverted index config."""

    def test_configure_inverted_index_with_stopword_presets(self) -> None:
        """Presets passed to Configure.inverted_index appear under stopwordPresets."""
        config = Configure.inverted_index(
            stopword_presets={
                "fr": ["le", "la", "les"],
                "es": ["el", "la", "los"],
            },
        )
        expected = {
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        }
        assert config._to_dict()["stopwordPresets"] == expected

    def test_configure_inverted_index_without_stopword_presets_omits_key(self) -> None:
        """Omitting stopword_presets keeps the key out of the serialized dict."""
        assert "stopwordPresets" not in Configure.inverted_index()._to_dict()

    def test_reconfigure_inverted_index_merges_stopword_presets(self) -> None:
        """merge_with_existing adds the new presets and leaves unrelated fields alone."""
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "cleanupIntervalSeconds": 60,
        }
        reconfig = Reconfigure.inverted_index(stopword_presets={"fr": ["le", "la"]})
        merged = reconfig.merge_with_existing(existing)
        assert merged["stopwordPresets"] == {"fr": ["le", "la"]}
        # unrelated fields are carried over untouched
        assert merged["stopwords"]["preset"] == "en"
        assert merged["bm25"]["b"] == 0.75

    def test_reconfigure_inverted_index_replaces_existing_stopword_presets(self) -> None:
        """New presets fully replace the prior dict.

        This matches the server-side PUT semantics — see
        test_tokenize.py::test_remove_unused_preset_is_allowed.
        """
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "stopwordPresets": {"fr": ["le", "la", "les"], "es": ["el"]},
        }
        reconfig = Reconfigure.inverted_index(stopword_presets={"fr": ["le"]})
        assert reconfig.merge_with_existing(existing)["stopwordPresets"] == {"fr": ["le"]}

    def test_reconfigure_inverted_index_without_stopword_presets_leaves_existing(self) -> None:
        """A reconfigure touching only bm25 keeps the existing presets as-is."""
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "stopwordPresets": {"fr": ["le", "la"]},
        }
        reconfig = Reconfigure.inverted_index(bm25_b=0.7, bm25_k1=1.1)
        assert reconfig.merge_with_existing(existing)["stopwordPresets"] == {"fr": ["le", "la"]}
179 changes: 178 additions & 1 deletion test/collection/test_config_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from weaviate.collections.classes.config_methods import _collection_configs_simple_from_json
from weaviate.collections.classes.config_methods import (
_collection_config_from_json,
_collection_configs_simple_from_json,
_nested_properties_from_config,
_properties_from_config,
)


def test_collection_config_simple_from_json_with_none_vectorizer_config() -> None:
Expand Down Expand Up @@ -68,3 +73,175 @@ def test_collection_config_simple_from_json_with_none_vectorizer_config() -> Non
assert "default" in vec_config
assert vec_config["default"].vectorizer.model == {}
assert vec_config["default"].vectorizer.source_properties is None


def _make_text_prop(name: str, **extra) -> dict:
base = {
"name": name,
"dataType": ["text"],
"indexFilterable": True,
"indexSearchable": True,
"indexRangeFilters": False,
"tokenization": "word",
}
base.update(extra)
return base


def test_properties_from_config_parses_text_analyzer() -> None:
    """textAnalyzer in a server schema is parsed onto the property and round-trips."""
    schema = {
        "vectorizer": "none",
        "properties": [
            _make_text_prop(
                "title",
                textAnalyzer={"asciiFold": True, "asciiFoldIgnore": ["é"]},
            ),
            _make_text_prop("body"),
        ],
    }
    by_name = {p.name: p for p in _properties_from_config(schema)}
    title = by_name["title"]
    body = by_name["body"]

    assert title.text_analyzer is not None
    assert title.text_analyzer.ascii_fold is True
    assert title.text_analyzer.ascii_fold_ignore == ["é"]
    assert body.text_analyzer is None

    # The dataclass round-trips back to the wire format.
    assert title.to_dict()["textAnalyzer"] == {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
    }
    assert "textAnalyzer" not in body.to_dict()


def test_properties_from_config_text_analyzer_omitted_when_no_ascii_fold() -> None:
    """If the server response omits asciiFold, the client treats text_analyzer as unset."""
    # Server response with textAnalyzer present but no asciiFold key
    prop_json = _make_text_prop("title", textAnalyzer={"asciiFoldIgnore": ["é"]})
    parsed = _properties_from_config({"vectorizer": "none", "properties": [prop_json]})
    assert parsed[0].text_analyzer is None


def test_nested_properties_from_config_parses_text_analyzer() -> None:
    """Nested properties parse textAnalyzer and round-trip it back to the wire format."""
    raw = _make_text_prop(
        "title",
        textAnalyzer={"asciiFold": True, "asciiFoldIgnore": ["ñ"]},
    )
    parsed = _nested_properties_from_config([raw])[0]
    analyzer = parsed.text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is True
    assert analyzer.ascii_fold_ignore == ["ñ"]
    assert parsed.to_dict()["textAnalyzer"] == {
        "asciiFold": True,
        "asciiFoldIgnore": ["ñ"],
    }


def test_properties_from_config_parses_stopword_preset_only() -> None:
    """A property with only stopwordPreset (no asciiFold) must still produce a text_analyzer."""
    parsed = _properties_from_config(
        {
            "vectorizer": "none",
            "properties": [_make_text_prop("title", textAnalyzer={"stopwordPreset": "fr"})],
        }
    )
    analyzer = parsed[0].text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is False
    assert analyzer.ascii_fold_ignore is None
    assert analyzer.stopword_preset == "fr"


def test_properties_from_config_parses_combined_text_analyzer() -> None:
    """ASCII folding and a stopword preset parse together from one textAnalyzer dict."""
    analyzer_json = {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
        "stopwordPreset": "fr",
    }
    schema = {
        "vectorizer": "none",
        "properties": [_make_text_prop("title", textAnalyzer=analyzer_json)],
    }
    analyzer = _properties_from_config(schema)[0].text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is True
    assert analyzer.ascii_fold_ignore == ["é"]
    assert analyzer.stopword_preset == "fr"


def _full_schema(class_name: str, **inverted_overrides) -> dict:
inverted = {
"bm25": {"b": 0.75, "k1": 1.2},
"cleanupIntervalSeconds": 60,
"stopwords": {"preset": "en", "additions": None, "removals": None},
}
inverted.update(inverted_overrides)
return {
"class": class_name,
"vectorizer": "none",
"properties": [],
"invertedIndexConfig": inverted,
"replicationConfig": {"factor": 1, "deletionStrategy": "NoAutomatedResolution"},
"shardingConfig": {
"virtualPerPhysical": 128,
"desiredCount": 1,
"actualCount": 1,
"desiredVirtualCount": 128,
"actualVirtualCount": 128,
"key": "_id",
"strategy": "hash",
"function": "murmur3",
},
"vectorIndexType": "hnsw",
"vectorIndexConfig": {
"skip": False,
"cleanupIntervalSeconds": 300,
"maxConnections": 64,
"efConstruction": 128,
"ef": -1,
"dynamicEfMin": 100,
"dynamicEfMax": 500,
"dynamicEfFactor": 8,
"vectorCacheMaxObjects": 1000000000000,
"flatSearchCutoff": 40000,
"distance": "cosine",
},
}


def test_collection_config_parses_stopword_presets() -> None:
    """The inverted index config exposes stopwordPresets when present in the schema."""
    schema = _full_schema(
        "TestStopwordPresets",
        stopwordPresets={
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        },
    )
    parsed = _collection_config_from_json(schema)
    expected = {
        "fr": ["le", "la", "les"],
        "es": ["el", "la", "los"],
    }
    assert parsed.inverted_index_config.stopword_presets == expected


def test_collection_config_stopword_presets_absent() -> None:
    """If the server response omits stopwordPresets, the parsed value is None."""
    parsed = _collection_config_from_json(_full_schema("TestNoStopwordPresets"))
    assert parsed.inverted_index_config.stopword_presets is None
2 changes: 2 additions & 0 deletions weaviate/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ReplicationDeletionStrategy,
Rerankers,
StopwordsPreset,
TextAnalyzerConfig,
Tokenization,
VectorDistances,
)
Expand Down Expand Up @@ -39,6 +40,7 @@
"ReferenceProperty",
"Rerankers",
"StopwordsPreset",
"TextAnalyzerConfig",
"Tokenization",
"Vectorizers",
"VectorDistances",
Expand Down
Loading
Loading