-
Notifications
You must be signed in to change notification settings - Fork 120
feat: add support to stopword presets #2008
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
3cc6306
ef04dea
8f1b33b
30814fc
db3009c
50f7768
a0efe43
fa92fc2
27cd0a4
83c2431
4e0a0f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -380,12 +380,14 @@ class _InvertedIndexConfigCreate(_ConfigCreateModel): | |
| indexPropertyLength: Optional[bool] | ||
| indexNullState: Optional[bool] | ||
| stopwords: _StopwordsCreate | ||
| stopwordPresets: Optional[Dict[str, List[str]]] = None | ||
|
|
||
|
|
||
| class _InvertedIndexConfigUpdate(_ConfigUpdateModel): | ||
| bm25: Optional[_BM25ConfigUpdate] | ||
| cleanupIntervalSeconds: Optional[int] | ||
| stopwords: Optional[_StopwordsUpdate] | ||
| stopwordPresets: Optional[Dict[str, List[str]]] = None | ||
|
|
||
|
|
||
| class _MultiTenancyConfigCreate(_ConfigCreateModel): | ||
|
|
@@ -1647,6 +1649,7 @@ class _InvertedIndexConfig(_ConfigBase): | |
| index_property_length: bool | ||
| index_timestamps: bool | ||
| stopwords: StopwordsConfig | ||
| stopword_presets: Optional[Dict[str, List[str]]] = None | ||
|
|
||
|
|
||
| InvertedIndexConfig = _InvertedIndexConfig | ||
|
|
@@ -1675,6 +1678,7 @@ class _PropertyVectorizerConfig: | |
| class _TextAnalyzerConfig(_ConfigBase): | ||
| ascii_fold: bool | ||
| ascii_fold_ignore: Optional[List[str]] | ||
| stopword_preset: Optional[str] | ||
|
|
||
|
|
||
| @dataclass | ||
|
|
@@ -2174,24 +2178,42 @@ class _ShardStatus: | |
| class TextAnalyzerConfig(_ConfigCreateModel): | ||
| """Text analysis options for a property. | ||
|
|
||
| Configures ASCII folding behavior for `text` and `text[]` properties that use an | ||
| inverted index (searchable or filterable). When enabled, accent/diacritic marks are | ||
| folded to their base characters during indexing and search (e.g. 'école' matches | ||
| 'ecole'). | ||
| Configures per-property text analysis for `text` and `text[]` properties that use an | ||
| inverted index (searchable or filterable). Supports ASCII folding (accent/diacritic | ||
| handling) and selecting a stopword preset that overrides the collection-level | ||
| `invertedIndexConfig.stopwords` setting for this property only. | ||
|
|
||
| Attributes: | ||
| ascii_fold: If True, accent/diacritic marks are folded to their base characters | ||
| during indexing and search. If omitted, the field is not sent to the server | ||
| and the server default (False) applies. | ||
| during indexing and search (e.g. 'école' matches 'ecole'). If omitted, the | ||
| field is not sent to the server and the server default (False) applies. | ||
| ascii_fold_ignore: Optional list of characters that should be excluded from | ||
| ASCII folding (e.g. ['é'] keeps 'é' from being folded to 'e'). If omitted, | ||
| the field is not sent to the server. | ||
|
|
||
| Both settings are immutable after the property is created. | ||
| stopword_preset: Stopword preset name. Overrides the collection-level | ||
| `invertedIndexConfig.stopwords` for this property. Only applies to | ||
| properties using `Tokenization.WORD`. Accepts a built-in preset | ||
| (`StopwordsPreset.EN` or `StopwordsPreset.NONE`) or the name of a | ||
| user-defined preset declared in | ||
| `Configure.inverted_index(stopword_presets=...)`. | ||
|
|
||
| All settings are immutable after the property is created. | ||
| """ | ||
|
|
||
| asciiFold: Optional[bool] = Field(default=None, alias="ascii_fold") | ||
| asciiFoldIgnore: Optional[List[str]] = Field(default=None, alias="ascii_fold_ignore") | ||
| stopwordPreset: Optional[Union[StopwordsPreset, str]] = Field( | ||
| default=None, alias="stopword_preset" | ||
| ) | ||
|
|
||
| @field_validator("stopwordPreset", mode="before") | ||
| @classmethod | ||
| def _coerce_stopword_preset(cls, v: Any) -> Any: | ||
| # Pydantic preserves the StopwordsPreset enum instance through model_dump, | ||
| # but the wire format must be a plain string. Coerce at construction time. | ||
| if isinstance(v, StopwordsPreset): | ||
| return v.value | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We usually use a `_to_dict` method; see `nestedProperties` for an example. E.g. in [code snippet lost in extraction] and then add the [code snippet lost in extraction] |
||
| return v | ||
|
|
||
|
|
||
| class Property(_ConfigCreateModel): | ||
|
|
@@ -2615,11 +2637,17 @@ def inverted_index( | |
| stopwords_preset: Optional[StopwordsPreset] = None, | ||
| stopwords_additions: Optional[List[str]] = None, | ||
| stopwords_removals: Optional[List[str]] = None, | ||
| stopword_presets: Optional[Dict[str, List[str]]] = None, | ||
| ) -> _InvertedIndexConfigCreate: | ||
| """Create an `InvertedIndexConfigCreate` object to be used when defining the configuration of the keyword searching algorithm of Weaviate. | ||
|
|
||
| Args: | ||
| See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details! | ||
| stopword_presets: User-defined named stopword lists keyed by preset name. Each value | ||
| is a flat list of stopword strings. A preset can be referenced from a property's | ||
| `text_analyzer.stopword_preset` to override the collection-level stopwords for | ||
| that property only. Requires Weaviate >= 1.37.0. | ||
|
|
||
| See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details on the other parameters. | ||
| """ # noqa: D417 (missing argument descriptions in the docstring) | ||
| if bm25_b is None and bm25_k1 is not None or bm25_k1 is None and bm25_b is not None: | ||
| raise ValueError("bm25_b and bm25_k1 must be specified together") | ||
|
|
@@ -2639,6 +2667,7 @@ def inverted_index( | |
| additions=stopwords_additions, | ||
| removals=stopwords_removals, | ||
| ), | ||
| stopwordPresets=stopword_presets, | ||
| ) | ||
|
|
||
| @staticmethod | ||
|
|
@@ -2913,13 +2942,19 @@ def inverted_index( | |
| stopwords_additions: Optional[List[str]] = None, | ||
| stopwords_preset: Optional[StopwordsPreset] = None, | ||
| stopwords_removals: Optional[List[str]] = None, | ||
| stopword_presets: Optional[Dict[str, List[str]]] = None, | ||
| ) -> _InvertedIndexConfigUpdate: | ||
| """Create an `InvertedIndexConfigUpdate` object. | ||
|
|
||
| Use this method when defining the `inverted_index_config` argument in `collection.update()`. | ||
|
|
||
| Args: | ||
| See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for a more detailed view! | ||
| stopword_presets: User-defined named stopword lists keyed by preset name. Each value | ||
| is a flat list of stopword strings. Passing this replaces the entire user-defined | ||
| stopword preset map for the collection. Removing a preset still referenced by a | ||
| property is rejected by the server. Requires Weaviate >= 1.37.0. | ||
|
|
||
| See [the docs](https://weaviate.io/developers/weaviate/configuration/indexes#configure-the-inverted-index) for details on the other parameters. | ||
| """ # noqa: D417 (missing argument descriptions in the docstring) | ||
| return _InvertedIndexConfigUpdate( | ||
| bm25=_BM25ConfigUpdate(b=bm25_b, k1=bm25_k1), | ||
|
|
@@ -2929,6 +2964,7 @@ def inverted_index( | |
| additions=stopwords_additions, | ||
| removals=stopwords_removals, | ||
| ), | ||
| stopwordPresets=stopword_presets, | ||
| ) | ||
|
|
||
| @staticmethod | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.