Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
78ff5ec
feat: add TextAnalyzerConfig for ASCII folding in text properties
amourao Apr 9, 2026
6931a6f
refactor: ruff format
amourao Apr 9, 2026
bda3008
feat: add min version check
amourao Apr 9, 2026
77fc0ff
feat: update TextAnalyzerConfig docstring for ascii_fold attributes
amourao Apr 9, 2026
a8d6927
feat: add asciiFold check in _text_analyzer_from_config function
amourao Apr 9, 2026
e8919a3
test: fix ASCII folding tests
amourao Apr 9, 2026
3cc6306
feat: add support for stopword presets in inverted index configuratio…
amourao Apr 9, 2026
ef04dea
test: added live and config tests
amourao Apr 9, 2026
8f1b33b
refactor: improve docstrings for stopword presets and asciiFold tests
amourao Apr 9, 2026
03d6ff4
refactor: simplify _any_property_has_text_analyzer function using _pr…
amourao Apr 13, 2026
1342204
test: remove redundant insertion ascii fold tests from test_collectio…
amourao Apr 13, 2026
cb53d6a
test: add stopwords roundtrip test for collection configuration
amourao Apr 13, 2026
9de03f3
feat: add model validator to enforce asciiFoldIgnore constraints in T…
amourao Apr 13, 2026
7018927
feat: add factory class for text analyzer configurations with ASCII f…
amourao Apr 13, 2026
8e91984
refactor: update TextAnalyzerConfig usage to new Configure class methods
amourao Apr 13, 2026
30814fc
Merge branch 'feat/ascii-fold' into feat/stopword-presets
amourao Apr 13, 2026
db3009c
test: remove redundant line in stopword presets merge test
amourao Apr 13, 2026
50f7768
refactor: use factory pattern
amourao Apr 13, 2026
a0efe43
refactor: format text analyzer configuration for better readability
amourao Apr 14, 2026
fa92fc2
refactor: remove server side behavior tests
amourao Apr 14, 2026
27cd0a4
test: add stopword presets roundtrip tests for Weaviate collections
amourao Apr 14, 2026
83c2431
refactor: remove unnecessary stopword preset coercion from _TextAnaly…
amourao Apr 14, 2026
4e0a0f2
refactor: replace custom text analyzer method with a direct function …
amourao Apr 14, 2026
eaea155
Merge branch 'dev/1.37' into feat/ascii-fold
amourao Apr 14, 2026
38c7f44
chore: remove unused deprecated import from config.py
amourao Apr 14, 2026
ec43d53
Merge branch 'feat/stopword-presets' into feat/ascii-fold
amourao Apr 14, 2026
b3eb0ac
chore: update WEAVIATE_137 version to 1.37.0-rc.1-578c4eb in workflow
amourao Apr 14, 2026
ceef271
refactor: update text analyzer method to use new static method in Con…
amourao Apr 14, 2026
5e751bf
test: add stopwords roundtrip test with ASCII folding configuration
amourao Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
454 changes: 453 additions & 1 deletion integration/test_collection_config.py

Large diffs are not rendered by default.

162 changes: 159 additions & 3 deletions test/collection/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,21 @@
from pydantic import ValidationError

from weaviate.collections.classes.config import (
_AsyncReplicationConfig,
_ReplicationConfig,
_ReplicationConfigUpdate,
Configure,
DataType,
Property,
Reconfigure,
ReferenceProperty,
StopwordsPreset,
Tokenization,
Vectorizers,
_AsyncReplicationConfig,
_CollectionConfigCreate,
_GenerativeProvider,
_ReplicationConfig,
_ReplicationConfigUpdate,
_RerankerProvider,
_TextAnalyzerConfigCreate,
_VectorizerConfigCreate,
_ReplicationConfigCreate,
ReplicationDeletionStrategy,
Expand Down Expand Up @@ -3021,3 +3024,156 @@ def test_nested_property_with_id_name_is_allowed() -> None:
],
)
assert prop.nestedProperties[0].name == "id"


class Test_TextAnalyzerConfigCreate:
    """Unit tests for the per-property text-analyzer creation config."""

    def test_property_without_text_analyzer_omits_key(self) -> None:
        """No text_analyzer given -> serialized property has no textAnalyzer key."""
        serialized = Property(name="title", data_type=DataType.TEXT)._to_dict()
        assert "textAnalyzer" not in serialized

    def test_property_with_ascii_fold_only(self) -> None:
        """ascii_fold alone serializes to a single asciiFold entry."""
        analyzer = Configure.TextAnalyzer(ascii_fold=True)
        prop = Property(name="title", data_type=DataType.TEXT, text_analyzer=analyzer)
        assert prop._to_dict()["textAnalyzer"] == {"asciiFold": True}

    def test_property_with_ascii_fold_and_ignore(self) -> None:
        """ascii_fold plus an ignore list serializes both keys; tokenization is untouched."""
        analyzer = Configure.TextAnalyzer(ascii_fold=True, ascii_fold_ignore=["é", "ñ"])
        serialized = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=analyzer,
        )._to_dict()
        expected = {"asciiFold": True, "asciiFoldIgnore": ["é", "ñ"]}
        assert serialized["textAnalyzer"] == expected
        assert serialized["tokenization"] == "word"

    def test_text_analyzer_rejects_ignore_without_ascii_fold(self) -> None:
        """An ignore list without ascii_fold=True fails model validation."""
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold_ignore=["é"])

    def test_nested_property_with_text_analyzer(self) -> None:
        """A text analyzer on a nested property survives serialization of the parent."""
        nested = Property(
            name="title",
            data_type=DataType.TEXT,
            text_analyzer=Configure.TextAnalyzer(ascii_fold=True, ascii_fold_ignore=["ñ"]),
        )
        parent = Property(name="meta", data_type=DataType.OBJECT, nested_properties=[nested])
        serialized = parent._to_dict()["nestedProperties"][0]["textAnalyzer"]
        assert serialized == {"asciiFold": True, "asciiFoldIgnore": ["ñ"]}

    def test_text_analyzer_rejects_wrong_types(self) -> None:
        """Non-bool ascii_fold and non-list ascii_fold_ignore both fail validation."""
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold="yes")  # type: ignore[arg-type]
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold_ignore="é")

    def test_text_analyzer_stopword_preset_builtin_enum(self) -> None:
        """A built-in StopwordsPreset enum serializes to its string value."""
        serialized = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=Configure.TextAnalyzer(stopword_preset=StopwordsPreset.EN),
        )._to_dict()
        assert serialized["textAnalyzer"] == {"stopwordPreset": "en"}

    def test_text_analyzer_stopword_preset_user_defined_string(self) -> None:
        """A user-defined preset name passes through as a plain string."""
        serialized = Property(
            name="title_fr",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=Configure.TextAnalyzer(stopword_preset="fr"),
        )._to_dict()
        assert serialized["textAnalyzer"] == {"stopwordPreset": "fr"}

    def test_text_analyzer_combined_ascii_fold_and_stopword_preset(self) -> None:
        """ASCII folding and a stopword preset can be combined in one analyzer."""
        analyzer = Configure.TextAnalyzer(
            ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset="fr"
        )
        serialized = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=analyzer,
        )._to_dict()
        assert serialized["textAnalyzer"] == {
            "asciiFold": True,
            "asciiFoldIgnore": ["é"],
            "stopwordPreset": "fr",
        }

    def test_text_analyzer_stopword_preset_only_omits_other_keys(self) -> None:
        """A preset-only analyzer emits no asciiFold* keys."""
        serialized = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=Configure.TextAnalyzer(stopword_preset="fr"),
        )._to_dict()
        analyzer_dict = serialized["textAnalyzer"]
        assert "asciiFold" not in analyzer_dict
        assert "asciiFoldIgnore" not in analyzer_dict


class TestInvertedIndexStopwordPresets:
    """Unit tests for stopword-preset handling in the inverted index config."""

    def test_configure_inverted_index_with_stopword_presets(self) -> None:
        """Presets passed to Configure.inverted_index appear under stopwordPresets."""
        config = Configure.inverted_index(
            stopword_presets={
                "fr": ["le", "la", "les"],
                "es": ["el", "la", "los"],
            },
        )
        expected = {
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        }
        assert config._to_dict()["stopwordPresets"] == expected

    def test_configure_inverted_index_without_stopword_presets_omits_key(self) -> None:
        """Omitting stopword_presets keeps the key out of the serialized dict."""
        assert "stopwordPresets" not in Configure.inverted_index()._to_dict()

    def test_reconfigure_inverted_index_merges_stopword_presets(self) -> None:
        """merge_with_existing adds the new presets and leaves unrelated fields alone."""
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "cleanupIntervalSeconds": 60,
        }
        reconfig = Reconfigure.inverted_index(stopword_presets={"fr": ["le", "la"]})
        merged = reconfig.merge_with_existing(existing)
        assert merged["stopwordPresets"] == {"fr": ["le", "la"]}
        # unrelated fields are carried over untouched
        assert merged["stopwords"]["preset"] == "en"
        assert merged["bm25"]["b"] == 0.75

    def test_reconfigure_inverted_index_replaces_existing_stopword_presets(self) -> None:
        """New presets fully replace the prior dict.

        This matches the server-side PUT semantics — see
        test_tokenize.py::test_remove_unused_preset_is_allowed.
        """
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "stopwordPresets": {"fr": ["le", "la", "les"], "es": ["el"]},
        }
        reconfig = Reconfigure.inverted_index(stopword_presets={"fr": ["le"]})
        assert reconfig.merge_with_existing(existing)["stopwordPresets"] == {"fr": ["le"]}

    def test_reconfigure_inverted_index_without_stopword_presets_leaves_existing(self) -> None:
        """A reconfigure touching only bm25 keeps the existing presets as-is."""
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "stopwordPresets": {"fr": ["le", "la"]},
        }
        reconfig = Reconfigure.inverted_index(bm25_b=0.7, bm25_k1=1.1)
        assert reconfig.merge_with_existing(existing)["stopwordPresets"] == {"fr": ["le", "la"]}
179 changes: 178 additions & 1 deletion test/collection/test_config_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from weaviate.collections.classes.config_methods import _collection_configs_simple_from_json
from weaviate.collections.classes.config_methods import (
_collection_config_from_json,
_collection_configs_simple_from_json,
_nested_properties_from_config,
_properties_from_config,
)


def test_collection_config_simple_from_json_with_none_vectorizer_config() -> None:
Expand Down Expand Up @@ -68,3 +73,175 @@ def test_collection_config_simple_from_json_with_none_vectorizer_config() -> Non
assert "default" in vec_config
assert vec_config["default"].vectorizer.model == {}
assert vec_config["default"].vectorizer.source_properties is None


def _make_text_prop(name: str, **extra) -> dict:
base = {
"name": name,
"dataType": ["text"],
"indexFilterable": True,
"indexSearchable": True,
"indexRangeFilters": False,
"tokenization": "word",
}
base.update(extra)
return base


def test_properties_from_config_parses_text_analyzer() -> None:
    """textAnalyzer in a server schema is parsed onto the property and round-trips."""
    schema = {
        "vectorizer": "none",
        "properties": [
            _make_text_prop(
                "title",
                textAnalyzer={"asciiFold": True, "asciiFoldIgnore": ["é"]},
            ),
            _make_text_prop("body"),
        ],
    }
    by_name = {p.name: p for p in _properties_from_config(schema)}
    title = by_name["title"]
    body = by_name["body"]

    assert title.text_analyzer is not None
    assert title.text_analyzer.ascii_fold is True
    assert title.text_analyzer.ascii_fold_ignore == ["é"]
    assert body.text_analyzer is None

    # The dataclass round-trips back to the wire format.
    assert title.to_dict()["textAnalyzer"] == {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
    }
    assert "textAnalyzer" not in body.to_dict()


def test_properties_from_config_text_analyzer_omitted_when_no_ascii_fold() -> None:
    """If the server response omits asciiFold, the client treats text_analyzer as unset."""
    # Server response with textAnalyzer present but no asciiFold key
    prop_json = _make_text_prop("title", textAnalyzer={"asciiFoldIgnore": ["é"]})
    parsed = _properties_from_config({"vectorizer": "none", "properties": [prop_json]})
    assert parsed[0].text_analyzer is None


def test_nested_properties_from_config_parses_text_analyzer() -> None:
    """Nested properties parse textAnalyzer and round-trip it back to the wire format."""
    raw = _make_text_prop(
        "title",
        textAnalyzer={"asciiFold": True, "asciiFoldIgnore": ["ñ"]},
    )
    parsed = _nested_properties_from_config([raw])[0]
    analyzer = parsed.text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is True
    assert analyzer.ascii_fold_ignore == ["ñ"]
    assert parsed.to_dict()["textAnalyzer"] == {
        "asciiFold": True,
        "asciiFoldIgnore": ["ñ"],
    }


def test_properties_from_config_parses_stopword_preset_only() -> None:
    """A property with only stopwordPreset (no asciiFold) must still produce a text_analyzer."""
    parsed = _properties_from_config(
        {
            "vectorizer": "none",
            "properties": [_make_text_prop("title", textAnalyzer={"stopwordPreset": "fr"})],
        }
    )
    analyzer = parsed[0].text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is False
    assert analyzer.ascii_fold_ignore is None
    assert analyzer.stopword_preset == "fr"


def test_properties_from_config_parses_combined_text_analyzer() -> None:
    """ASCII folding and a stopword preset parse together from one textAnalyzer dict."""
    analyzer_json = {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
        "stopwordPreset": "fr",
    }
    schema = {
        "vectorizer": "none",
        "properties": [_make_text_prop("title", textAnalyzer=analyzer_json)],
    }
    analyzer = _properties_from_config(schema)[0].text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is True
    assert analyzer.ascii_fold_ignore == ["é"]
    assert analyzer.stopword_preset == "fr"


def _full_schema(class_name: str, **inverted_overrides) -> dict:
inverted = {
"bm25": {"b": 0.75, "k1": 1.2},
"cleanupIntervalSeconds": 60,
"stopwords": {"preset": "en", "additions": None, "removals": None},
}
inverted.update(inverted_overrides)
return {
"class": class_name,
"vectorizer": "none",
"properties": [],
"invertedIndexConfig": inverted,
"replicationConfig": {"factor": 1, "deletionStrategy": "NoAutomatedResolution"},
"shardingConfig": {
"virtualPerPhysical": 128,
"desiredCount": 1,
"actualCount": 1,
"desiredVirtualCount": 128,
"actualVirtualCount": 128,
"key": "_id",
"strategy": "hash",
"function": "murmur3",
},
"vectorIndexType": "hnsw",
"vectorIndexConfig": {
"skip": False,
"cleanupIntervalSeconds": 300,
"maxConnections": 64,
"efConstruction": 128,
"ef": -1,
"dynamicEfMin": 100,
"dynamicEfMax": 500,
"dynamicEfFactor": 8,
"vectorCacheMaxObjects": 1000000000000,
"flatSearchCutoff": 40000,
"distance": "cosine",
},
}


def test_collection_config_parses_stopword_presets() -> None:
    """The inverted index config exposes stopwordPresets when present in the schema."""
    schema = _full_schema(
        "TestStopwordPresets",
        stopwordPresets={
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        },
    )
    parsed = _collection_config_from_json(schema)
    expected = {
        "fr": ["le", "la", "les"],
        "es": ["el", "la", "los"],
    }
    assert parsed.inverted_index_config.stopword_presets == expected


def test_collection_config_stopword_presets_absent() -> None:
    """If the server response omits stopwordPresets, the parsed value is None."""
    parsed = _collection_config_from_json(_full_schema("TestNoStopwordPresets"))
    assert parsed.inverted_index_config.stopword_presets is None
2 changes: 2 additions & 0 deletions weaviate/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ReplicationDeletionStrategy,
Rerankers,
StopwordsPreset,
TextAnalyzerConfig,
Tokenization,
VectorDistances,
)
Expand Down Expand Up @@ -39,6 +40,7 @@
"ReferenceProperty",
"Rerankers",
"StopwordsPreset",
"TextAnalyzerConfig",
"Tokenization",
"Vectorizers",
"VectorDistances",
Expand Down
Loading
Loading