Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ env:
WEAVIATE_132: 1.32.27
WEAVIATE_133: 1.33.18
WEAVIATE_134: 1.34.19
WEAVIATE_135: 1.35.17
WEAVIATE_136: 1.36.10
WEAVIATE_137: 1.37.1
WEAVIATE_135: 1.35.18
WEAVIATE_136: 1.36.12
WEAVIATE_137: 1.37.1-4e61e26.amd64

jobs:
lint-and-format:
Expand Down
506 changes: 319 additions & 187 deletions integration/test_tokenize.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions weaviate/classes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
rbac,
replication,
tenants,
tokenization,
) # noqa: F401
from .config import ConsistencyLevel

Expand All @@ -29,6 +30,7 @@
"init",
"query",
"tenants",
"tokenization",
"rbac",
"replication",
]
4 changes: 4 additions & 0 deletions weaviate/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
ReferenceProperty,
ReplicationDeletionStrategy,
Rerankers,
StopwordsCreate,
StopwordsPreset,
TextAnalyzerConfig,
TextAnalyzerConfigCreate,
Tokenization,
VectorDistances,
)
Expand All @@ -39,8 +41,10 @@
"PQEncoderType",
"ReferenceProperty",
"Rerankers",
"StopwordsCreate",
"StopwordsPreset",
"TextAnalyzerConfig",
"TextAnalyzerConfigCreate",
"Tokenization",
"Vectorizers",
"VectorDistances",
Expand Down
17 changes: 17 additions & 0 deletions weaviate/classes/tokenization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from weaviate.collections.classes.config import (
StopwordsConfig,
StopwordsCreate,
StopwordsPreset,
TextAnalyzerConfigCreate,
Tokenization,
)
from weaviate.tokenization.models import TokenizeResult

# Public API of ``weaviate.classes.tokenization``: the stopword / analyzer /
# tokenization configuration names re-exported from
# ``weaviate.collections.classes.config`` plus ``TokenizeResult`` from
# ``weaviate.tokenization.models``.
__all__ = [
"StopwordsConfig",
"StopwordsCreate",
"StopwordsPreset",
"TextAnalyzerConfigCreate",
"Tokenization",
"TokenizeResult",
]
24 changes: 24 additions & 0 deletions weaviate/collections/classes/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
from dataclasses import dataclass
from dataclasses import fields as _dataclass_fields
from typing import (
Any,
ClassVar,
Expand Down Expand Up @@ -1647,6 +1648,26 @@ class _StopwordsConfig(_ConfigBase):


StopwordsConfig = _StopwordsConfig
StopwordsCreate = _StopwordsCreate

# Import-time parity guard. The read-side dataclass (_StopwordsConfig) and the
# write-side pydantic model (_StopwordsCreate) have to expose exactly the same
# field names, so that a value obtained from ``collection.config.get()`` can be
# handed back to ``tokenization.text()`` without any field being silently
# dropped. The read→write conversion in
# ``weaviate/tokenization/executor.py::_TokenizationExecutor.text`` relies on
# this parity; if the two classes drift apart, importing this module fails
# loudly instead.
_read_fields = {field.name for field in _dataclass_fields(_StopwordsConfig)}
_write_fields = set(_StopwordsCreate.model_fields)
_read_only = _read_fields - _write_fields
_write_only = _write_fields - _read_fields
# Two sets differ exactly when at least one of the one-sided differences is
# non-empty, so this is the same condition as ``_read_fields != _write_fields``.
if _read_only or _write_only:
    raise RuntimeError(
        "_StopwordsConfig / _StopwordsCreate field drift detected — "
        f"read-only={_read_only}, "
        f"write-only={_write_only}. "
        "Update both classes together, or adapt the read→write conversion in "
        "weaviate/tokenization/executor.py::_TokenizationExecutor.text."
    )
# Drop the scratch names so they do not linger in the module namespace.
del _read_fields, _write_fields, _read_only, _write_only
Comment thread
dirkkul marked this conversation as resolved.


@dataclass
Expand Down Expand Up @@ -2224,6 +2245,9 @@ def _validate_ascii_fold_ignore(self) -> "_TextAnalyzerConfigCreate":
return self


TextAnalyzerConfigCreate = _TextAnalyzerConfigCreate


class Property(_ConfigCreateModel):
"""This class defines the structure of a data property that a collection can have within Weaviate.

Expand Down
21 changes: 16 additions & 5 deletions weaviate/tokenization/async_.pyi
Original file line number Diff line number Diff line change
@@ -1,21 +1,32 @@
from typing import Dict, Optional
from typing import Dict, List, Optional, Union, overload

from weaviate.collections.classes.config import (
StopwordsConfig,
StopwordsCreate,
TextAnalyzerConfigCreate,
Tokenization,
_StopwordsCreate,
_TextAnalyzerConfigCreate,
)
from weaviate.connect.v4 import ConnectionAsync
from weaviate.tokenization.models import TokenizeResult

from .executor import _TokenizationExecutor

class _TokenizationAsync(_TokenizationExecutor[ConnectionAsync]):
@overload
async def text(
self,
text: str,
tokenization: Tokenization,
*,
analyzer_config: Optional[_TextAnalyzerConfigCreate] = None,
stopword_presets: Optional[Dict[str, _StopwordsCreate]] = None,
analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = ...,
) -> TokenizeResult: ...
@overload
async def text(
self,
text: str,
tokenization: Tokenization,
*,
analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
stopword_presets: Optional[Dict[str, List[str]]] = ...,
) -> TokenizeResult: ...
138 changes: 124 additions & 14 deletions weaviate/tokenization/executor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
"""Tokenize executor."""

from typing import Any, Dict, Generic, Optional
from typing import Any, Dict, Generic, List, Optional, Union, overload

from httpx import Response

from weaviate.collections.classes.config import (
StopwordsConfig,
StopwordsCreate,
TextAnalyzerConfigCreate,
Tokenization,
_StopwordsCreate,
_TextAnalyzerConfigCreate,
)
from weaviate.connect import executor
from weaviate.connect.v4 import ConnectionType, _ExpectedStatusCodes
Expand All @@ -27,32 +28,109 @@ def __check_version(self) -> None:
"1.37.0",
)

# Overloads make ``stopwords`` and ``stopword_presets`` mutually exclusive
# at type-check time. Passing both is additionally rejected at runtime with
# ``ValueError`` in the implementation below. ``stopwords`` accepts either a
# ``StopwordsCreate`` (the write-side shape) or a ``StopwordsConfig`` (the
# read-side shape returned by ``collection.config.get()``), so values
# round-tripped through config reads can be passed back in directly.
@overload
def text(
self,
text: str,
tokenization: Tokenization,
*,
analyzer_config: Optional[_TextAnalyzerConfigCreate] = None,
stopword_presets: Optional[Dict[str, _StopwordsCreate]] = None,
analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = ...,
) -> executor.Result[TokenizeResult]: ...

@overload
def text(
self,
text: str,
tokenization: Tokenization,
*,
analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
stopword_presets: Optional[Dict[str, List[str]]] = ...,
) -> executor.Result[TokenizeResult]: ...

def text(
self,
text: str,
tokenization: Tokenization,
*,
analyzer_config: Optional[TextAnalyzerConfigCreate] = None,
stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = None,
stopword_presets: Optional[Dict[str, List[str]]] = None,
) -> executor.Result[TokenizeResult]:
"""Tokenize text using the generic /v1/tokenize endpoint.

For ``word`` tokenization the server defaults to the built-in ``en``
stopword preset when no stopword configuration is supplied. Pass
``analyzer_config=TextAnalyzerConfigCreate(stopword_preset="none")``
or equivalent to opt out.

Call patterns for stopword handling (``stopwords`` and
``stopword_presets`` are mutually exclusive — pass at most one):

1. **No stopword config** — rely on the server default (``en`` for
word tokenization, none otherwise)::

client.tokenization.text(text=..., tokenization=Tokenization.WORD)

2. **Apply a one-off stopwords block** via ``stopwords`` — the block
filters the query tokens directly, same shape as a collection's
``invertedIndexConfig.stopwords``::

client.tokenization.text(
text=...,
tokenization=Tokenization.WORD,
stopwords=StopwordsCreate(preset=StopwordsPreset.EN, additions=["foo"]),
)

3. **Register a named-preset catalog** via ``stopword_presets`` and
reference one by name from ``analyzer_config.stopword_preset``.
The catalog can also override built-in presets such as ``en``::

client.tokenization.text(
text=...,
tokenization=Tokenization.WORD,
analyzer_config=TextAnalyzerConfigCreate(stopword_preset="custom"),
stopword_presets={"custom": ["foo", "bar"]},
)

Args:
text: The text to tokenize.
tokenization: The tokenization method to use (e.g. Tokenization.WORD).
analyzer_config: Text analyzer settings (ASCII folding, stopword preset).
stopword_presets: Custom stopword preset definitions, keyed by name.
Each value is a ``_StopwordsCreate`` with optional preset, additions,
and removals fields.
tokenization: The tokenization method to use (e.g. ``Tokenization.WORD``).
analyzer_config: Text analyzer settings (ASCII folding, stopword
preset name), built via ``Configure.text_analyzer(...)``.
``stopword_preset`` may reference a built-in preset
(``en`` / ``none``) or a name defined in ``stopword_presets``.
stopwords: One-off stopwords block applied directly to this request.
Mirrors the collection-level ``invertedIndexConfig.stopwords``
shape — hence the rich model with preset / additions / removals.
Mutually exclusive with ``stopword_presets``.
stopword_presets: Named-preset catalog (name → word list). Mirrors
the property-level preset catalog — a plain mapping, since a
property only references a preset by name (via
``analyzer_config.stopword_preset``) rather than carrying the
full stopwords block. Entries can override built-ins like
``en``. Mutually exclusive with ``stopwords``.

Returns:
A TokenizeResult with indexed and query token lists.
A ``TokenizeResult`` with indexed and query token lists. The generic
endpoint does not echo request fields back in the response.

Raises:
WeaviateUnsupportedFeatureError: If the server version is below 1.37.0.
ValueError: If both ``stopwords`` and ``stopword_presets`` are passed,
or if any ``stopword_presets`` value is not a list/tuple of strings.
"""
self.__check_version()

if stopwords is not None and stopword_presets is not None:
raise ValueError("stopwords and stopword_presets are mutually exclusive; pass only one")

payload: Dict[str, Any] = {
"text": text,
"tokenization": tokenization.value,
Expand All @@ -63,10 +141,42 @@ def text(
if ac_dict:
payload["analyzerConfig"] = ac_dict

if stopwords is not None:
if isinstance(stopwords, StopwordsConfig):
# Widen from the read-side shape returned by config.get() to the
# write-side shape the server expects. Field parity between the
# two classes is enforced at import time in
# ``weaviate/collections/classes/config.py``, so iterating
# ``StopwordsCreate.model_fields`` copies every field.
stopwords = StopwordsCreate(
**{name: getattr(stopwords, name) for name in StopwordsCreate.model_fields}
)
sw_dict = stopwords._to_dict()
if sw_dict:
payload["stopwords"] = sw_dict

if stopword_presets is not None:
payload["stopwordPresets"] = {
name: cfg._to_dict() for name, cfg in stopword_presets.items()
}
# Plain word-list shape matching a collection's
# invertedIndexConfig.stopwordPresets. Reject str (would
# silently split into characters) and pydantic models /
# other non-sequence shapes up-front so callers get a clear
# error instead of a malformed payload.
validated: Dict[str, List[str]] = {}
for name, words in stopword_presets.items():
if isinstance(words, (str, bytes)):
raise ValueError(
f"stopword_presets[{name!r}] must be a list of strings, "
f"got {type(words).__name__}"
)
if not isinstance(words, (list, tuple)):
raise ValueError(
f"stopword_presets[{name!r}] must be a list of strings, "
f"got {type(words).__name__}"
)
if not all(isinstance(w, str) for w in words):
raise ValueError(f"stopword_presets[{name!r}] must contain only strings")
validated[name] = list(words)
payload["stopwordPresets"] = validated

def resp(response: Response) -> TokenizeResult:
return TokenizeResult.model_validate(response.json())
Expand Down
43 changes: 2 additions & 41 deletions weaviate/tokenization/models.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,17 @@
"""Return types for tokenization operations."""

from typing import Any, Dict, List, Optional
from typing import List

from pydantic import BaseModel, ConfigDict, Field, field_validator

from weaviate.collections.classes.config import (
StopwordsConfig,
StopwordsPreset,
TextAnalyzerConfig,
Tokenization,
)
from pydantic import BaseModel


class TokenizeResult(BaseModel):
"""Result of a tokenization operation.

Attributes:
tokenization: The tokenization method that was applied.
indexed: Tokens as they would be stored in the inverted index.
query: Tokens as they would be used for querying (after stopword removal).
Comment thread
amourao marked this conversation as resolved.
analyzer_config: The text analyzer configuration that was used, if any.
stopword_config: The stopword configuration that was used, if any.
"""

model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True)

tokenization: Tokenization
indexed: List[str]
query: List[str]
analyzer_config: Optional[TextAnalyzerConfig] = Field(default=None, alias="analyzerConfig")
stopword_config: Optional[StopwordsConfig] = Field(default=None, alias="stopwordConfig")

@field_validator("analyzer_config", mode="before")
@classmethod
def _parse_analyzer_config(cls, v: Optional[Dict[str, Any]]) -> Optional[TextAnalyzerConfig]:
if v is None:
return None
if "asciiFold" not in v and "stopwordPreset" not in v:
return None
return TextAnalyzerConfig(
ascii_fold=v.get("asciiFold", False),
ascii_fold_ignore=v.get("asciiFoldIgnore"),
stopword_preset=v.get("stopwordPreset"),
)

@field_validator("stopword_config", mode="before")
@classmethod
def _parse_stopword_config(cls, v: Optional[Dict[str, Any]]) -> Optional[StopwordsConfig]:
if v is None:
return None
return StopwordsConfig(
preset=StopwordsPreset(v["preset"]),
additions=v.get("additions"),
removals=v.get("removals"),
)
Loading
Loading