|
31 | 31 |
|
32 | 32 | from .exttypes.pyobjectpath import PyObjectPath |
33 | 33 | from .local_persistence import register_ccflow_import_path, sync_to_module |
| 34 | +from .utils.tokenize import DefaultTokenizer, Tokenizer, normalize_token |
34 | 35 |
|
35 | 36 | log = logging.getLogger(__name__) |
36 | 37 |
|
@@ -195,6 +196,41 @@ def type_(self) -> PyObjectPath: |
195 | 196 | # We want to track under what names a model has been registered |
196 | 197 | _registrations: List[Tuple["ModelRegistry", str]] = PrivateAttr(default_factory=list) |
197 | 198 |
|
    # Tokenization support
    # Class-level tokenizer used to produce deterministic content hashes for
    # model instances; subclasses may override __ccflow_tokenizer__ to change
    # the hashing strategy.
    __ccflow_tokenizer__: ClassVar[Tokenizer] = DefaultTokenizer.with_bytecode()
    # Memoized token value; None means "not computed yet" (see model_token below).
    _model_token: Optional[str] = PrivateAttr(default=None)
| 202 | + |
| 203 | + @property |
| 204 | + def model_token(self) -> str: |
| 205 | + """Return a deterministic content hash of this model. |
| 206 | +
|
| 207 | + The token is cached by default (controlled by ``cache_token`` in model_config). |
| 208 | + For frozen models, the token is computed once and never recomputed. |
| 209 | + For mutable models, the cache is cleared on field assignment (via ``validate_assignment``). |
| 210 | + Set ``cache_token=False`` in model_config to always compute fresh. |
| 211 | + """ |
| 212 | + cache = self.model_config.get("cache_token", True) |
| 213 | + if cache and self._model_token is not None: |
| 214 | + return self._model_token |
| 215 | + token = self.__ccflow_tokenizer__.tokenize(self) |
| 216 | + if cache: |
| 217 | + self.__pydantic_private__["_model_token"] = token |
| 218 | + return token |
| 219 | + |
| 220 | + @model_validator(mode="after") |
| 221 | + def _clear_token_cache(self): |
| 222 | + """Clear the cached token on construction and field assignment.""" |
| 223 | + if self.model_config.get("cache_token", True): |
| 224 | + self.__pydantic_private__["_model_token"] = None |
| 225 | + return self |
| 226 | + |
| 227 | + def model_copy(self, *, update=None, deep=False): |
| 228 | + """Override model_copy to clear the stale token cache on the copy.""" |
| 229 | + copy = super().model_copy(update=update, deep=deep) |
| 230 | + if update and copy.__pydantic_private__ is not None: |
| 231 | + copy.__pydantic_private__["_model_token"] = None |
| 232 | + return copy |
| 233 | + |
198 | 234 | model_config = ConfigDict( |
199 | 235 | # Note that validate_assignment only partially works: https://github.com/pydantic/pydantic/issues/7105 |
200 | 236 | validate_assignment=True, |
@@ -316,6 +352,18 @@ def __getstate__(self): |
316 | 352 | def __setstate__(self, state): |
317 | 353 | state["__pydantic_fields_set__"] = set(state["__pydantic_fields_set__"]) |
318 | 354 | super().__setstate__(state) |
| 355 | + # Clear stale token cache from pickle |
| 356 | + if self.__pydantic_private__ is not None and "_model_token" in self.__pydantic_private__: |
| 357 | + self.__pydantic_private__["_model_token"] = None |
| 358 | + |
| 359 | + |
| 360 | +# Register ccflow BaseModel-specific normalize_token handler |
| 361 | +# Delegates to the model's tokenizer so normalization is consistent |
| 362 | +# regardless of whether the model is accessed via model_token or |
| 363 | +# encountered as a value inside a container. |
# Register ccflow BaseModel-specific normalize_token handler
# Delegates to the model's tokenizer so normalization is consistent
# regardless of whether the model is accessed via model_token or
# encountered as a value inside a container.
@normalize_token.register(BaseModel)
def _normalize_ccflow_basemodel(obj):
    """Normalize a ccflow BaseModel using its own class-level tokenizer."""
    tokenizer = obj.__ccflow_tokenizer__
    return tokenizer.normalize(obj)
319 | 367 |
|
320 | 368 |
|
321 | 369 | class _ModelRegistryData(PydanticBaseModel): |
|
0 commit comments