Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
569 changes: 569 additions & 0 deletions integration/test_collection_config.py

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions test/collection/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Property,
Reconfigure,
ReferenceProperty,
StopwordsPreset,
TextAnalyzerConfig,
Tokenization,
Vectorizers,
Expand Down Expand Up @@ -3097,3 +3098,100 @@ def test_text_analyzer_rejects_wrong_types(self) -> None:
TextAnalyzerConfig(ascii_fold="yes") # type: ignore[arg-type]
with pytest.raises(ValidationError):
TextAnalyzerConfig(ascii_fold_ignore="é") # type: ignore[arg-type]

def test_text_analyzer_stopword_preset_builtin_enum(self) -> None:
    """A built-in StopwordsPreset enum serializes as its plain string value."""
    analyzer = TextAnalyzerConfig(stopword_preset=StopwordsPreset.EN)
    prop = Property(
        name="title",
        data_type=DataType.TEXT,
        tokenization=Tokenization.WORD,
        text_analyzer=analyzer,
    )
    serialized = prop._to_dict()
    assert serialized["textAnalyzer"] == {"stopwordPreset": "en"}

def test_text_analyzer_stopword_preset_user_defined_string(self) -> None:
    """A user-defined preset name given as a plain string passes through unchanged."""
    analyzer = TextAnalyzerConfig(stopword_preset="fr")
    prop = Property(
        name="title_fr",
        data_type=DataType.TEXT,
        tokenization=Tokenization.WORD,
        text_analyzer=analyzer,
    )
    serialized = prop._to_dict()
    assert serialized["textAnalyzer"] == {"stopwordPreset": "fr"}

def test_text_analyzer_combined_ascii_fold_and_stopword_preset(self) -> None:
    """ascii_fold, ascii_fold_ignore and stopword_preset serialize side by side."""
    analyzer = TextAnalyzerConfig(
        ascii_fold=True,
        ascii_fold_ignore=["é"],
        stopword_preset="fr",
    )
    prop = Property(
        name="title",
        data_type=DataType.TEXT,
        tokenization=Tokenization.WORD,
        text_analyzer=analyzer,
    )
    expected = {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
        "stopwordPreset": "fr",
    }
    assert prop._to_dict()["textAnalyzer"] == expected

def test_text_analyzer_stopword_preset_only_omits_other_keys(self) -> None:
    """When only stopword_preset is set, no asciiFold keys appear in the payload."""
    prop = Property(
        name="title",
        data_type=DataType.TEXT,
        tokenization=Tokenization.WORD,
        text_analyzer=TextAnalyzerConfig(stopword_preset="fr"),
    )
    analyzer_dict = prop._to_dict()["textAnalyzer"]
    for absent_key in ("asciiFold", "asciiFoldIgnore"):
        assert absent_key not in analyzer_dict


class TestInvertedIndexStopwordPresets:
    """Tests for user-defined stopword presets on the inverted index config.

    Covers both the create path (``Configure.inverted_index``) and the update
    path (``Reconfigure.inverted_index`` merged into an existing schema dict).
    """

    def test_configure_inverted_index_with_stopword_presets(self) -> None:
        """stopword_presets passed to Configure end up under 'stopwordPresets'."""
        ic = Configure.inverted_index(
            stopword_presets={
                "fr": ["le", "la", "les"],
                "es": ["el", "la", "los"],
            },
        )
        out = ic._to_dict()
        assert out["stopwordPresets"] == {
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        }

    def test_configure_inverted_index_without_stopword_presets_omits_key(self) -> None:
        """When not configured, the key is omitted from the wire payload entirely."""
        ic = Configure.inverted_index()
        assert "stopwordPresets" not in ic._to_dict()

    def test_reconfigure_inverted_index_merges_stopword_presets(self) -> None:
        """Reconfigure adds stopwordPresets without touching unrelated fields."""
        rc = Reconfigure.inverted_index(stopword_presets={"fr": ["le", "la"]})
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "cleanupIntervalSeconds": 60,
        }
        merged = rc.merge_with_existing(existing)
        assert merged["stopwordPresets"] == {"fr": ["le", "la"]}
        # other fields untouched
        assert merged["stopwords"]["preset"] == "en"
        assert merged["bm25"]["b"] == 0.75

    def test_reconfigure_inverted_index_replaces_existing_stopword_presets(self) -> None:
        """A new presets map fully replaces the previously stored one."""
        rc = Reconfigure.inverted_index(stopword_presets={"fr": ["le"]})
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "stopwordPresets": {"fr": ["le", "la", "les"], "es": ["el"]},
        }
        merged = rc.merge_with_existing(existing)
        # The new value fully replaces the prior dict (this matches the server-side
        # PUT semantics — see test_tokenize.py::test_remove_unused_preset_is_allowed).
        assert merged["stopwordPresets"] == {"fr": ["le"]}

    def test_reconfigure_inverted_index_without_stopword_presets_leaves_existing(self) -> None:
        """A reconfigure touching only bm25 must not clobber existing presets."""
        rc = Reconfigure.inverted_index(bm25_b=0.7, bm25_k1=1.1)
        existing = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "stopwordPresets": {"fr": ["le", "la"]},
        }
        merged = rc.merge_with_existing(existing)
        assert merged["stopwordPresets"] == {"fr": ["le", "la"]}
100 changes: 100 additions & 0 deletions test/collection/test_config_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from weaviate.collections.classes.config_methods import (
_collection_config_from_json,
_collection_configs_simple_from_json,
_nested_properties_from_config,
_properties_from_config,
Expand Down Expand Up @@ -145,3 +146,102 @@ def test_nested_properties_from_config_parses_text_analyzer() -> None:
"asciiFold": True,
"asciiFoldIgnore": ["ñ"],
}


def test_properties_from_config_parses_stopword_preset_only() -> None:
    """A property with only stopwordPreset (no asciiFold) must still produce a text_analyzer."""
    schema = {
        "vectorizer": "none",
        "properties": [_make_text_prop("title", textAnalyzer={"stopwordPreset": "fr"})],
    }
    title = _properties_from_config(schema)[0]
    analyzer = title.text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is False
    assert analyzer.ascii_fold_ignore is None
    assert analyzer.stopword_preset == "fr"


def test_properties_from_config_parses_combined_text_analyzer() -> None:
    """All three textAnalyzer fields parse together from a server schema dict."""
    analyzer_json = {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
        "stopwordPreset": "fr",
    }
    schema = {
        "vectorizer": "none",
        "properties": [_make_text_prop("title", textAnalyzer=analyzer_json)],
    }
    title = _properties_from_config(schema)[0]
    analyzer = title.text_analyzer
    assert analyzer is not None
    assert analyzer.ascii_fold is True
    assert analyzer.ascii_fold_ignore == ["é"]
    assert analyzer.stopword_preset == "fr"


def _full_schema(class_name: str, **inverted_overrides) -> dict:
inverted = {
"bm25": {"b": 0.75, "k1": 1.2},
"cleanupIntervalSeconds": 60,
"stopwords": {"preset": "en", "additions": None, "removals": None},
}
inverted.update(inverted_overrides)
return {
"class": class_name,
"vectorizer": "none",
"properties": [],
"invertedIndexConfig": inverted,
"replicationConfig": {"factor": 1, "deletionStrategy": "NoAutomatedResolution"},
"shardingConfig": {
"virtualPerPhysical": 128,
"desiredCount": 1,
"actualCount": 1,
"desiredVirtualCount": 128,
"actualVirtualCount": 128,
"key": "_id",
"strategy": "hash",
"function": "murmur3",
},
"vectorIndexType": "hnsw",
"vectorIndexConfig": {
"skip": False,
"cleanupIntervalSeconds": 300,
"maxConnections": 64,
"efConstruction": 128,
"ef": -1,
"dynamicEfMin": 100,
"dynamicEfMax": 500,
"dynamicEfFactor": 8,
"vectorCacheMaxObjects": 1000000000000,
"flatSearchCutoff": 40000,
"distance": "cosine",
},
}


def test_collection_config_parses_stopword_presets() -> None:
    """The inverted index config exposes stopwordPresets when present in the schema."""
    schema = _full_schema(
        "TestStopwordPresets",
        stopwordPresets={
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        },
    )
    parsed = _collection_config_from_json(schema)
    expected = {
        "fr": ["le", "la", "les"],
        "es": ["el", "la", "los"],
    }
    assert parsed.inverted_index_config.stopword_presets == expected


def test_collection_config_stopword_presets_absent() -> None:
    """If the server response omits stopwordPresets, the parsed value is None."""
    parsed = _collection_config_from_json(_full_schema("TestNoStopwordPresets"))
    assert parsed.inverted_index_config.stopword_presets is None
56 changes: 46 additions & 10 deletions weaviate/collections/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,12 +380,14 @@ class _InvertedIndexConfigCreate(_ConfigCreateModel):
indexPropertyLength: Optional[bool]
indexNullState: Optional[bool]
stopwords: _StopwordsCreate
stopwordPresets: Optional[Dict[str, List[str]]] = None


class _InvertedIndexConfigUpdate(_ConfigUpdateModel):
    """Mutable inverted-index settings sent when updating an existing collection.

    All fields are optional: a ``None`` value means "leave the server-side
    setting unchanged" when this model is merged into the existing schema.
    """

    # BM25 ranking parameters (b / k1) to change, if any.
    bm25: Optional[_BM25ConfigUpdate]
    # Tombstone/cleanup interval for the inverted index, in seconds.
    cleanupIntervalSeconds: Optional[int]
    # Collection-level stopword configuration (preset plus additions/removals).
    stopwords: Optional[_StopwordsUpdate]
    # User-defined named stopword lists, keyed by preset name. When provided,
    # replaces the whole preset map on the server (PUT semantics).
    stopwordPresets: Optional[Dict[str, List[str]]] = None


class _MultiTenancyConfigCreate(_ConfigCreateModel):
Expand Down Expand Up @@ -1647,6 +1649,7 @@ class _InvertedIndexConfig(_ConfigBase):
index_property_length: bool
index_timestamps: bool
stopwords: StopwordsConfig
stopword_presets: Optional[Dict[str, List[str]]] = None


InvertedIndexConfig = _InvertedIndexConfig
Expand Down Expand Up @@ -1675,6 +1678,7 @@ class _PropertyVectorizerConfig:
class _TextAnalyzerConfig(_ConfigBase):
    """Parsed per-property text-analyzer settings as returned by the server."""

    # Whether accents/diacritics are folded to base characters at index time.
    ascii_fold: bool
    # Characters explicitly excluded from ASCII folding; None when unset.
    ascii_fold_ignore: Optional[List[str]]
    # Name of the stopword preset applied to this property; None when unset.
    stopword_preset: Optional[str]


@dataclass
Expand Down Expand Up @@ -2174,24 +2178,42 @@ class _ShardStatus:
class TextAnalyzerConfig(_ConfigCreateModel):
    """Text analysis options for a property.

    Configures per-property text analysis for `text` and `text[]` properties that use an
    inverted index (searchable or filterable). Supports ASCII folding (accent/diacritic
    handling) and selecting a stopword preset that overrides the collection-level
    `invertedIndexConfig.stopwords` setting for this property only.

    Attributes:
        ascii_fold: If True, accent/diacritic marks are folded to their base characters
            during indexing and search (e.g. 'école' matches 'ecole'). If omitted, the
            field is not sent to the server and the server default (False) applies.
        ascii_fold_ignore: Optional list of characters that should be excluded from
            ASCII folding (e.g. ['é'] keeps 'é' from being folded to 'e'). If omitted,
            the field is not sent to the server.
        stopword_preset: Stopword preset name. Overrides the collection-level
            `invertedIndexConfig.stopwords` for this property. Only applies to
            properties using `Tokenization.WORD`. Accepts a built-in preset
            (`StopwordsPreset.EN` or `StopwordsPreset.NONE`) or the name of a
            user-defined preset declared in
            `Configure.inverted_index(stopword_presets=...)`.

    All settings are immutable after the property is created.
    """

    asciiFold: Optional[bool] = Field(default=None, alias="ascii_fold")
    asciiFoldIgnore: Optional[List[str]] = Field(default=None, alias="ascii_fold_ignore")
    stopwordPreset: Optional[Union[StopwordsPreset, str]] = Field(
        default=None, alias="stopword_preset"
    )

    @field_validator("stopwordPreset", mode="before")
    @classmethod
    def _coerce_stopword_preset(cls, v: Any) -> Any:
        """Normalize a StopwordsPreset enum to its string value before validation.

        Pydantic preserves the StopwordsPreset enum instance through model_dump,
        but the wire format must be a plain string. Coerce at construction time.
        """
        if isinstance(v, StopwordsPreset):
            return v.value
        return v


class Property(_ConfigCreateModel):
Expand Down Expand Up @@ -2615,11 +2637,17 @@ def inverted_index(
stopwords_preset: Optional[StopwordsPreset] = None,
stopwords_additions: Optional[List[str]] = None,
stopwords_removals: Optional[List[str]] = None,
stopword_presets: Optional[Dict[str, List[str]]] = None,
) -> _InvertedIndexConfigCreate:
"""Create an `InvertedIndexConfigCreate` object to be used when defining the configuration of the keyword searching algorithm of Weaviate.

Args:
See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details!
stopword_presets: User-defined named stopword lists keyed by preset name. Each value
is a flat list of stopword strings. A preset can be referenced from a property's
`text_analyzer.stopword_preset` to override the collection-level stopwords for
that property only. Requires Weaviate >= 1.37.0.

See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details on the other parameters.
""" # noqa: D417 (missing argument descriptions in the docstring)
if bm25_b is None and bm25_k1 is not None or bm25_k1 is None and bm25_b is not None:
raise ValueError("bm25_b and bm25_k1 must be specified together")
Expand All @@ -2639,6 +2667,7 @@ def inverted_index(
additions=stopwords_additions,
removals=stopwords_removals,
),
stopwordPresets=stopword_presets,
)

@staticmethod
Expand Down Expand Up @@ -2913,13 +2942,19 @@ def inverted_index(
stopwords_additions: Optional[List[str]] = None,
stopwords_preset: Optional[StopwordsPreset] = None,
stopwords_removals: Optional[List[str]] = None,
stopword_presets: Optional[Dict[str, List[str]]] = None,
) -> _InvertedIndexConfigUpdate:
"""Create an `InvertedIndexConfigUpdate` object.

Use this method when defining the `inverted_index_config` argument in `collection.update()`.

Args:
See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for a more detailed view!
stopword_presets: User-defined named stopword lists keyed by preset name. Each value
is a flat list of stopword strings. Passing this replaces the entire user-defined
stopword preset map for the collection. Removing a preset still referenced by a
property is rejected by the server. Requires Weaviate >= 1.37.0.

See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details on the other parameters.
""" # noqa: D417 (missing argument descriptions in the docstring)
return _InvertedIndexConfigUpdate(
bm25=_BM25ConfigUpdate(b=bm25_b, k1=bm25_k1),
Expand All @@ -2929,6 +2964,7 @@ def inverted_index(
additions=stopwords_additions,
removals=stopwords_removals,
),
stopwordPresets=stopword_presets,
)

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion weaviate/collections/classes/config_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def merge_with_existing(self, schema: Dict[str, Any]) -> Dict[str, Any]:
continue
if isinstance(val, Enum):
schema[cls_field] = str(val.value)
elif isinstance(val, (int, float, bool, str, list)):
elif isinstance(val, (int, float, bool, str, list, dict)):
schema[cls_field] = val
elif isinstance(val, _QuantizerConfigUpdate):
quantizers = ["pq", "bq", "sq"]
Expand Down
6 changes: 5 additions & 1 deletion weaviate/collections/classes/config_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,7 @@ def _collection_config_from_json(schema: Dict[str, Any]) -> _CollectionConfig:
additions=schema["invertedIndexConfig"]["stopwords"]["additions"],
removals=schema["invertedIndexConfig"]["stopwords"]["removals"],
),
stopword_presets=schema["invertedIndexConfig"].get("stopwordPresets"),
),
multi_tenancy_config=_MultiTenancyConfig(
enabled=schema.get("multiTenancyConfig", {}).get("enabled", False),
Expand Down Expand Up @@ -467,11 +468,14 @@ def _text_analyzer_from_config(prop: Dict[str, Any]) -> Optional[_TextAnalyzerCo
ta = prop.get("textAnalyzer")
if ta is None:
return None
if "asciiFold" not in ta:
# The server normalizes an empty TextAnalyzer to nil (see usecases/schema/validation.go),
# so the only meaningful signal is the presence of one of the configured fields.
if "asciiFold" not in ta and "stopwordPreset" not in ta:
return None
return _TextAnalyzerConfig(
ascii_fold=ta.get("asciiFold", False),
ascii_fold_ignore=ta.get("asciiFoldIgnore"),
stopword_preset=ta.get("stopwordPreset"),
)


Expand Down
Loading
Loading