diff --git a/ccflow/evaluators/common.py b/ccflow/evaluators/common.py index 7dd22f3..2cab984 100644 --- a/ccflow/evaluators/common.py +++ b/ccflow/evaluators/common.py @@ -7,7 +7,6 @@ from types import MappingProxyType from typing import Any, Callable, Dict, List, Optional, Set, Union -import dask.base from pydantic import Field, PrivateAttr, field_validator from typing_extensions import override @@ -59,6 +58,17 @@ def combine_evaluators(first: Optional[EvaluatorBase], second: Optional[Evaluato return MultiEvaluator(evaluators=[first, second]) +def _flatten_cache_key_context(flow_obj: ModelEvaluationContext) -> tuple[ModelEvaluationContext, str, List[EvaluatorBase]]: + fn = flow_obj.fn + non_transparent: List[EvaluatorBase] = [] + while isinstance(flow_obj.context, ModelEvaluationContext): + fn = flow_obj.fn if flow_obj.fn != "__call__" else fn + if not isinstance(flow_obj, TransparentModelEvaluationContext): + non_transparent.append(flow_obj.model) + flow_obj = flow_obj.context + return flow_obj, fn if fn != "__call__" else flow_obj.fn, non_transparent + + class MultiEvaluator(EvaluatorBase): """An evaluator that combines multiple evaluators. @@ -224,24 +234,28 @@ def cache_key(flow_obj: Union[ModelEvaluationContext, ContextBase, CallableModel only on the underlying model, context, fn, options, and any non-transparent evaluators in the chain. + When the underlying model has callable methods, a behavior token (SHA-256 of + method bytecode) is included so that code changes invalidate the cache. + Args: flow_obj: The object to be tokenized to form the cache key. """ + from ..utils.tokenize import compute_cache_token + if isinstance(flow_obj, ModelEvaluationContext): - fn = flow_obj.fn - non_transparent = [] - while isinstance(flow_obj.context, ModelEvaluationContext): - fn = flow_obj.fn if flow_obj.fn != "__call__" else fn - if not isinstance(flow_obj, TransparentModelEvaluationContext): - non_transparent.append(flow_obj.model) - flow_obj = flow_obj.context - d = flow_obj.model_dump(mode="python") - d["fn"] = fn if fn != "__call__" else flow_obj.fn - if non_transparent: - d["_evaluators"] = [e.model_dump(mode="python") for e in non_transparent] - return dask.base.tokenize(d).encode("utf-8") + flow_obj, fn, non_transparent = _flatten_cache_key_context(flow_obj) + return compute_cache_token( + data_values=[ + {**flow_obj.model_dump(mode="python"), "fn": fn}, + *(evaluator.model_dump(mode="python") for evaluator in non_transparent), + ], + behavior_classes=[type(flow_obj.model), *(type(evaluator) for evaluator in non_transparent)], + ).encode("utf-8") elif isinstance(flow_obj, (ContextBase, CallableModel)): - return dask.base.tokenize(flow_obj.model_dump(mode="python")).encode("utf-8") + return compute_cache_token( + data_values=[flow_obj.model_dump(mode="python")], + behavior_classes=[type(flow_obj)], + ).encode("utf-8") else: raise TypeError(f"object of type {type(flow_obj)} cannot be serialized by this function!") diff --git a/ccflow/tests/utils/test_tokenize.py b/ccflow/tests/utils/test_tokenize.py new file mode 100644 index 0000000..0166894 --- /dev/null +++ b/ccflow/tests/utils/test_tokenize.py @@ -0,0 +1,630 @@ +"""Tests for tokenize helpers used by cache keys.""" + +import pytest + +from ccflow.callable import CallableModel, ContextBase, EvaluatorBase, ModelEvaluationContext +from ccflow.context import NullContext +from ccflow.evaluators.common import cache_key +from ccflow.result import GenericResult +from ccflow.utils.tokenize import compute_behavior_token, compute_cache_token, compute_data_token + +# --------------------------------------------------------------------------- +# Data token +# --------------------------------------------------------------------------- + + +class TestComputeDataToken: + def test_deterministic(self): + value = {"a": [1, 2], "b": ("x", 3)} + + assert compute_data_token(value) == compute_data_token(value) + + def test_different_values_different_tokens(self): + assert compute_data_token({"x": 1}) != compute_data_token({"x": 2}) + + +class TestComputeCacheToken: + def test_deterministic(self): + class Helper: + def f(self): + return 1 + + token1 = compute_cache_token(data_values=[{"x": 1}], behavior_classes=[Helper]) + token2 = compute_cache_token(data_values=[{"x": 1}], behavior_classes=[Helper]) + assert token1 == token2 + + def test_data_changes_token(self): + class Helper: + def f(self): + return 1 + + token1 = compute_cache_token(data_values=[{"x": 1}], behavior_classes=[Helper]) + token2 = compute_cache_token(data_values=[{"x": 2}], behavior_classes=[Helper]) + assert token1 != token2 + + def test_behavior_changes_token(self): + class HelperA: + def f(self): + return 1 + + class HelperB: + def f(self): + return 2 + + token1 = compute_cache_token(data_values=[{"x": 1}], behavior_classes=[HelperA]) + token2 = compute_cache_token(data_values=[{"x": 1}], behavior_classes=[HelperB]) + assert token1 != token2 + + +# --------------------------------------------------------------------------- +# Basic behavior +# --------------------------------------------------------------------------- + + +class TestComputeBehaviorToken: + def test_returns_sha256_hex(self): + class M: + def f(self): + return 1 + + token = compute_behavior_token(M) + assert isinstance(token, str) + assert len(token) == 64 + + def test_deterministic(self): + class M: + def f(self): + return 1 + + assert compute_behavior_token(M) == compute_behavior_token(M) + + def test_different_bytecode_different_token(self): + class A: + def f(self): + return 1 + + class B: + def f(self): + return 2 + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_same_logic_same_token(self): + class A: + def f(self): + return 42 + + class B: + def f(self): + return 42 + + assert compute_behavior_token(A) == compute_behavior_token(B) + + def test_none_for_no_methods(self): + class Empty: + x = 1 + + assert compute_behavior_token(Empty) is None + + def test_cached_on_class(self): + class M: + def f(self): + return 1 + + token = compute_behavior_token(M) + assert M.__ccflow_tokenizer_cache__ == token + # Second call returns cached value + assert compute_behavior_token(M) is token + + def test_docstring_ignored(self): + class A: + def f(self): + """This is a docstring.""" + return 1 + + class B: + def f(self): + """Different docstring.""" + return 1 + + assert compute_behavior_token(A) == compute_behavior_token(B) + + def test_comments_ignored(self): + """Comments don't exist in bytecode, so they're inherently ignored.""" + + class A: + def f(self): + # a comment + return 1 + + class B: + def f(self): + # different comment + return 1 + + assert compute_behavior_token(A) == compute_behavior_token(B) + + def test_constants_matter(self): + class A: + def f(self): + return "hello" + + class B: + def f(self): + return "world" + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_defaults_matter(self): + class A: + def f(self, x=1): + return x + + class B: + def f(self, x=2): + return x + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_kwdefaults_matter(self): + class A: + def f(self, *, x=1): + return x + + class B: + def f(self, *, x=2): + return x + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_closure_values_matter(self): + def make_model(value): + class M: + def f(self): + return value + + return M + + assert compute_behavior_token(make_model(1)) != compute_behavior_token(make_model(2)) + + +# --------------------------------------------------------------------------- +# Method collection +# --------------------------------------------------------------------------- + + +class TestMethodCollection: + def test_includes_regular_methods(self): + class M: + def method_a(self): + return 1 + + def method_b(self): + return 2 + + token = compute_behavior_token(M) + assert token is not None + + def test_includes_staticmethod(self): + class A: + @staticmethod + def helper(): + return 1 + + class B: + @staticmethod + def helper(): + return 2 + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_includes_classmethod(self): + class A: + @classmethod + def factory(cls): + return cls() + + class B: + @classmethod + def factory(cls): + return None + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_method_order_insensitive(self): + """Alphabetical sorting means definition order doesn't matter.""" + + class A: + def beta(self): + return 2 + + def alpha(self): + return 1 + + class B: + def alpha(self): + return 1 + + def beta(self): + return 2 + + assert compute_behavior_token(A) == compute_behavior_token(B) + + def test_skips_ccflow_internal_attrs(self): + """Attributes starting with __ccflow_ are skipped.""" + + class A: + __ccflow_tokenizer_deps__ = [] + + def f(self): + return 1 + + class B: + def f(self): + return 1 + + assert compute_behavior_token(A) == compute_behavior_token(B) + + +# --------------------------------------------------------------------------- +# Dependencies (__ccflow_tokenizer_deps__) +# --------------------------------------------------------------------------- + + +def _helper_add(x): + return x + 1 + + +def _helper_mul(x): + return x * 2 + + +class TestDeps: + def test_deps_included(self): + class NoDeps: + def f(self): + return 1 + + class WithDeps: + __ccflow_tokenizer_deps__ = [_helper_add] + + def f(self): + return 1 + + assert compute_behavior_token(NoDeps) != compute_behavior_token(WithDeps) + + def test_dep_order_insensitive(self): + class A: + __ccflow_tokenizer_deps__ = [_helper_add, _helper_mul] + + def f(self): + return 1 + + class B: + __ccflow_tokenizer_deps__ = [_helper_mul, _helper_add] + + def f(self): + return 1 + + assert compute_behavior_token(A) == compute_behavior_token(B) + + def test_dep_change_changes_token(self): + def v1(): + return 1 + + def v2(): + return 2 + + class A: + __ccflow_tokenizer_deps__ = [v1] + + def f(self): + return 1 + + class B: + __ccflow_tokenizer_deps__ = [v2] + + def f(self): + return 1 + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_class_dep_included(self): + class HelperA: + def f(self): + return 1 + + class HelperB: + def f(self): + return 2 + + class A: + __ccflow_tokenizer_deps__ = [HelperA] + + def f(self): + return 1 + + class B: + __ccflow_tokenizer_deps__ = [HelperB] + + def f(self): + return 1 + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_subclass_deps_extend_inherited_deps(self): + def base_a(): + return 1 + + def base_b(): + return 2 + + def sub_dep(): + return 3 + + class BaseA: + __ccflow_tokenizer_deps__ = [base_a] + + def f(self): + return 1 + + class BaseB: + __ccflow_tokenizer_deps__ = [base_b] + + def f(self): + return 1 + + class SubA(BaseA): + __ccflow_tokenizer_deps__ = [sub_dep] + + class SubB(BaseB): + __ccflow_tokenizer_deps__ = [sub_dep] + + assert compute_behavior_token(SubA) != compute_behavior_token(SubB) + + def test_recursive_class_deps_raise(self): + class A: + def f(self): + return 1 + + class B: + def g(self): + return 2 + + A.__ccflow_tokenizer_deps__ = [B] + B.__ccflow_tokenizer_deps__ = [A] + + with pytest.raises(TypeError, match="Recursive __ccflow_tokenizer_deps__ class dependency"): + compute_behavior_token(A) + + +# --------------------------------------------------------------------------- +# Integration with cache_key() +# --------------------------------------------------------------------------- + + +class TestCacheKeyIntegration: + def test_callable_model_includes_behavior(self): + """cache_key for a CallableModel includes the behavior hash.""" + from ccflow import Flow + + class MyModel(CallableModel): + x: int = 1 + + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=self.x + 1) + + class MyModelV2(CallableModel): + x: int = 1 + + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=self.x + 2) + + key1 = cache_key(MyModel(x=1)) + key2 = cache_key(MyModelV2(x=1)) + # Same config, different __call__ → different cache key + assert key1 != key2 + + def test_same_callable_same_key(self): + from ccflow import Flow + + class MyModel(CallableModel): + x: int = 1 + + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=self.x) + + key1 = cache_key(MyModel(x=1)) + key2 = cache_key(MyModel(x=1)) + assert key1 == key2 + + def test_different_config_different_key(self): + from ccflow import Flow + + class MyModel(CallableModel): + x: int = 1 + + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=self.x) + + key1 = cache_key(MyModel(x=1)) + key2 = cache_key(MyModel(x=2)) + assert key1 != key2 + + def test_context_base_no_behavior(self): + """ContextBase with no custom methods has no behavior token — still works.""" + + class MyContext(ContextBase): + value: int = 1 + + key = cache_key(MyContext(value=1)) + assert isinstance(key, bytes) + + def test_helper_default_arg_changes_key(self): + from ccflow import Flow + + class A(CallableModel): + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=self.helper()) + + def helper(self, x=1): + return x + + class B(CallableModel): + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=self.helper()) + + def helper(self, x=2): + return x + + assert cache_key(A()) != cache_key(B()) + + def test_class_dep_changes_key(self): + from ccflow import Flow + + class HelperA: + def f(self): + return 1 + + class HelperB: + def f(self): + return 2 + + class A(CallableModel): + __ccflow_tokenizer_deps__ = [HelperA] + + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=1) + + class B(CallableModel): + __ccflow_tokenizer_deps__ = [HelperB] + + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=1) + + assert cache_key(A()) != cache_key(B()) + + def test_opaque_evaluator_behavior_changes_key(self): + from ccflow import Flow + + class MyModel(CallableModel): + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=1) + + class OpaqueA(EvaluatorBase): + tag: str = "same" + + def __call__(self, context: ModelEvaluationContext): + return context() + + class OpaqueB(EvaluatorBase): + tag: str = "same" + + def __call__(self, context: ModelEvaluationContext): + result = context() + return result + + inner = ModelEvaluationContext(model=MyModel(), context=NullContext()) + key1 = cache_key(ModelEvaluationContext(model=OpaqueA(), context=inner)) + key2 = cache_key(ModelEvaluationContext(model=OpaqueB(), context=inner)) + assert key1 != key2 + + +# --------------------------------------------------------------------------- +# Decorator unwrapping (Flow.call, etc.) +# --------------------------------------------------------------------------- + + +class TestDecoratorUnwrapping: + def test_flow_call_different_impls_differ(self): + """@Flow.call wrappers are unwrapped — different implementations hash differently.""" + from ccflow import Flow + + class A(CallableModel): + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=1) + + class B(CallableModel): + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=2) + + assert compute_behavior_token(A) != compute_behavior_token(B) + + def test_flow_call_same_impl_same(self): + """Same @Flow.call implementation produces the same token.""" + from ccflow import Flow + + class A(CallableModel): + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=42) + + class B(CallableModel): + @Flow.call + def __call__(self, context: NullContext) -> GenericResult: + return GenericResult(value=42) + + assert compute_behavior_token(A) == compute_behavior_token(B) + + +# --------------------------------------------------------------------------- +# MRO / inherited methods +# --------------------------------------------------------------------------- + + +class TestInheritedMethods: + def test_inherited_call_included(self): + """Subclass that inherits __call__ from parent picks up parent's method.""" + + class Base: + def __call__(self): + return 1 + + class Sub(Base): + pass + + # Sub inherits __call__, so it should have a behavior token + assert compute_behavior_token(Sub) is not None + assert compute_behavior_token(Sub) == compute_behavior_token(Base) + + def test_override_changes_token(self): + """Subclass overriding a method gets a different token.""" + + class Base: + def __call__(self): + return 1 + + class Sub(Base): + def __call__(self): + return 2 + + assert compute_behavior_token(Sub) != compute_behavior_token(Base) + + def test_subclass_cache_independent(self): + """Parent and subclass don't share __ccflow_tokenizer_cache__.""" + + class Base: + def f(self): + return 1 + + class Sub(Base): + def g(self): + return 2 + + t_base = compute_behavior_token(Base) + t_sub = compute_behavior_token(Sub) + # Sub has additional method g, so tokens differ + assert t_base != t_sub + # But base's cached token is unaffected + assert compute_behavior_token(Base) == t_base diff --git a/ccflow/utils/__init__.py b/ccflow/utils/__init__.py index e1b3188..7e9fa23 100644 --- a/ccflow/utils/__init__.py +++ b/ccflow/utils/__init__.py @@ -1,4 +1,4 @@ from .chunker import * from .core import * from .logging import * -from .tokenize import normalize_token, tokenize +from .tokenize import compute_behavior_token, compute_cache_token, compute_data_token, normalize_token, tokenize diff --git a/ccflow/utils/tokenize.py b/ccflow/utils/tokenize.py index 20b7161..528c5ed 100644 --- a/ccflow/utils/tokenize.py +++ b/ccflow/utils/tokenize.py @@ -1,2 +1,295 @@ # ruff: noqa: F401 +"""Tokenization utilities for ccflow models. + +Re-exports ``normalize_token`` and ``tokenize`` from dask for data hashing. +Adds helpers for dask-based data hashing, ccflow-specific behavior hashing, +and combined cache-token hashing, useful for cache-key invalidation when +callable logic changes. +""" + +import hashlib +import inspect +from typing import Any, Callable, Iterable, List, Optional, Tuple + from dask.base import normalize_token, tokenize + +__all__ = ( + "compute_behavior_token", + "compute_cache_token", + "compute_data_token", + "normalize_token", + "tokenize", +) + + +def _sha256_hexdigest(*parts: bytes | str) -> str: + """Return a SHA-256 hex digest for one or more byte/string parts.""" + + hasher = hashlib.sha256() + for part in parts: + if isinstance(part, str): + part = part.encode("utf-8") + hasher.update(part) + return hasher.hexdigest() + + +def compute_data_token(value: Any) -> str: + """Compute a deterministic data token using dask's tokenization.""" + + return tokenize(value) + + +def compute_cache_token(*, data_values: Iterable[Any] = (), behavior_classes: Iterable[type] = ()) -> str: + """Compute a cache token by combining data and behavior tokens. + + Args: + data_values: Values whose serialized data should affect the cache key. + behavior_classes: Classes whose behavior tokens should affect the cache key. + """ + + tokens = [compute_data_token(value) for value in data_values] + for cls in behavior_classes: + behavior = compute_behavior_token(cls) + if behavior is not None: + tokens.append(behavior) + return compute_data_token(tuple(tokens)) + + +# Methods that are never behavior-relevant (pydantic/python internals) +_SKIPPED_METHODS = frozenset( + { + "__eq__", + "__hash__", + "__repr__", + "__str__", + "__init__", + "__init_subclass__", + "__class_getitem__", + "__get_pydantic_core_schema__", + "__get_pydantic_json_schema__", + "model_post_init", + } +) + + +# --------------------------------------------------------------------------- +# Behavior hashing — bytecode-based fingerprinting of class methods +# --------------------------------------------------------------------------- + + +def _unwrap_function(func: object) -> Optional[Callable]: + """Unwrap descriptors and decorator chains to get the underlying function. + + Handles ``classmethod``, ``staticmethod``, ``property`` (fget), and + ``functools.wraps``-style ``__wrapped__`` chains (e.g. ``@Flow.call``). + Returns ``None`` if the underlying object has no ``__code__``. + """ + if isinstance(func, classmethod): + func = func.__func__ + elif isinstance(func, staticmethod): + func = func.__func__ + elif isinstance(func, property): + func = func.fget + if func is None: + return None + + # Unwrap decorator chains (e.g. @Flow.call sets __wrapped__) + try: + func = inspect.unwrap(func) + except (TypeError, ValueError): + pass + + if not callable(func) or not hasattr(func, "__code__"): + return None + return func + + +def _function_state(func: Callable) -> Tuple[Any, Any, Tuple[Tuple[str, bool, Any], ...] | None]: + """Return defaults/kwdefaults/closure state that affects runtime behavior.""" + + kwdefaults = getattr(func, "__kwdefaults__", None) + if kwdefaults is not None: + kwdefaults = tuple(sorted(kwdefaults.items())) + + closure = getattr(func, "__closure__", None) + closure_state = None + if closure: + closure_state = [] + for name, cell in zip(func.__code__.co_freevars, closure): + try: + closure_state.append((name, True, cell.cell_contents)) + except ValueError: + closure_state.append((name, False, None)) + closure_state = tuple(closure_state) + + return getattr(func, "__defaults__", None), kwdefaults, closure_state + + +def _hash_function_bytecode(func: Callable) -> Optional[str]: + """Return a SHA-256 hex digest of a function's behavior-relevant state. + + The function is first unwrapped through any decorator chains, so that + e.g. ``@Flow.call`` wrappers do not mask the real implementation. + + In addition to ``co_code`` and ``co_consts``, this includes: + - positional defaults (``__defaults__``) + - keyword-only defaults (``__kwdefaults__``) + - closure cell contents + so that behavior changes that do not affect bytecode alone still change + the token. + + Returns ``None`` for objects without ``__code__`` (C builtins, etc.). + """ + unwrapped = _unwrap_function(func) + if unwrapped is None: + return None + code = unwrapped.__code__ + # Include constants (skip first if it's the docstring) + consts = code.co_consts + if consts and isinstance(consts[0], str): + consts = consts[1:] + return _sha256_hexdigest( + code.co_code, + repr(consts), + compute_data_token(_function_state(unwrapped)), + ) + + +def _dependency_info(dep: object, *, _visited: Tuple[type, ...]) -> Optional[Tuple[Tuple[str, str, str, str], str, str]]: + """Return deterministic identity, name, and token for a dependency entry.""" + + if isinstance(dep, type): + module = getattr(dep, "__module__", "") + qualname = getattr(dep, "__qualname__", getattr(dep, "__name__", repr(dep))) + behavior = compute_behavior_token(dep, _visited=_visited) + if behavior is None: + return None + return ("class", module, qualname, behavior), f"__dep_class__:{qualname}", behavior + + unwrapped = _unwrap_function(dep) + if unwrapped is None: + return None + module = getattr(unwrapped, "__module__", "") + qualname = getattr(unwrapped, "__qualname__", getattr(unwrapped, "__name__", repr(unwrapped))) + behavior = _hash_function_bytecode(unwrapped) + if behavior is None: + return None + return ("callable", module, qualname, behavior), f"__dep__:{qualname}", behavior + + +def _collect_methods(cls: type) -> List[Tuple[str, Callable]]: + """Collect callable methods from *cls* (walking MRO). + + Methods are collected with MRO override semantics: for each method name, + the first definition found in the MRO wins. This means a subclass's + ``__call__`` overrides the base class's even if the subclass doesn't + redefine every method. + + Own methods are sorted alphabetically. + + Internal framework attributes (``__ccflow_*``) and pydantic/python + boilerplate methods are skipped. + """ + # Walk MRO to collect methods with override semantics + seen_names = set() + methods = [] + for klass in cls.__mro__: + if klass is object: + break + for name, value in klass.__dict__.items(): + if name in seen_names: + continue + seen_names.add(name) + if name.startswith("__ccflow_"): + continue + if name in _SKIPPED_METHODS: + continue + if isinstance(value, (classmethod, staticmethod, property)) or callable(value): + methods.append((name, value)) + + methods.sort(key=lambda pair: pair[0]) + + return methods + + +def _collect_dependency_hashes(cls: type, *, _visited: Tuple[type, ...]) -> List[Tuple[str, str]]: + """Collect hashed ``__ccflow_tokenizer_deps__`` entries from the full MRO. + + Dependencies are merged across the MRO, deduplicated, and sorted + deterministically so that declaration order does not affect the hash. + + Dependency entries may be either: + - function-like objects hashable via ``_hash_function_bytecode()`` + - classes, in which case ``compute_behavior_token(dep_class)`` is included + """ + deps = [] + seen_dep_keys = set() + for klass in cls.__mro__: + extra_deps = klass.__dict__.get("__ccflow_tokenizer_deps__") + if extra_deps is None: + continue + for dep in extra_deps: + dep_info = _dependency_info(dep, _visited=_visited) + if dep_info is None: + continue + dep_key, dep_name, dep_token = dep_info + if dep_key in seen_dep_keys: + continue + seen_dep_keys.add(dep_key) + deps.append((dep_key, dep_name, dep_token)) + + deps.sort(key=lambda item: item[0]) + return [(dep_name, dep_token) for _, dep_name, dep_token in deps] + + +def compute_behavior_token(cls: type, *, _visited: Tuple[type, ...] = ()) -> Optional[str]: + """Compute a SHA-256 behavior token for *cls* based on its method bytecode. + + The token captures behavior-relevant state for every method in *cls*'s MRO + (with standard override semantics): bytecode, constants (minus docstrings), + defaults, keyword-only defaults, and closure cell contents. It also + includes any functions or classes listed in ``cls.__ccflow_tokenizer_deps__``. + + Decorator chains (e.g. ``@Flow.call``) are automatically unwrapped so + that the hash reflects the user's implementation, not the wrapper. + + ``__ccflow_tokenizer_deps__`` values are merged across the full MRO, so + subclasses can add dependencies without dropping inherited ones. Class + entries contribute their own ``compute_behavior_token()`` recursively. + + Results are cached on the class in ``cls.__ccflow_tokenizer_cache__``. + The cache is stored directly on the class (not inherited), so subclass + tokens are independent of parent tokens. + + Returns ``None`` if the class has no hashable methods. + + .. note:: + + Monkey-patching methods on an existing class after its behavior token + has been computed will **not** invalidate the cached token. Redefining + the class (e.g. in Jupyter) creates a new class object and works fine. + """ + if cls in _visited: + raise TypeError(f"Recursive __ccflow_tokenizer_deps__ class dependency detected for {cls.__module__}.{cls.__qualname__}") + + # Check cache on cls itself (not inherited) + cache = cls.__dict__.get("__ccflow_tokenizer_cache__") + if cache is not None: + return cache + + visited = _visited + (cls,) + methods = _collect_methods(cls) + method_hashes = [(name, h) for name, func in methods if (h := _hash_function_bytecode(func)) is not None] + method_hashes.extend(_collect_dependency_hashes(cls, _visited=visited)) + + if not method_hashes: + return None + + token = _sha256_hexdigest(repr(method_hashes)) + + try: + cls.__ccflow_tokenizer_cache__ = token + except (TypeError, AttributeError): + pass # e.g. C extension types that don't allow attribute setting + + return token diff --git a/docs/wiki/Workflows.md b/docs/wiki/Workflows.md index 526a957..6789d79 100644 --- a/docs/wiki/Workflows.md +++ b/docs/wiki/Workflows.md @@ -528,6 +528,46 @@ with FlowOptionsOverride(options={"cacheable":True, "evaluator": evaluator}): #> GenericResult[int](value=1) ``` +#### How cache keys are built + +`MemoryCacheEvaluator` uses `ccflow.evaluators.cache_key()`, which now delegates to `ccflow.utils.compute_cache_token()`. + +At a high level, the cache key combines: + +- a **data token** for the serialized model/context payload +- a **behavior token** for each callable/evaluator class that can affect the result + +For a direct `CallableModel` call, the cache key depends on: + +- `model.model_dump(mode="python")` +- `compute_behavior_token(type(model))` + +For a `ModelEvaluationContext`, the cache key depends on: + +- the underlying context payload plus the function name being evaluated (`__call__` vs `__deps__`) +- the behavior token of the underlying model class +- the data and behavior tokens of any **non-transparent** evaluators in the chain + +Transparent evaluators are skipped, so wrapping a model with logging or other pass-through evaluators does not change its cache identity. + +`compute_behavior_token()` hashes the class's Python-defined methods and also consults `__ccflow_tokenizer_deps__` for behavior that lives outside the class body. This is useful when the callable depends on module-level helpers or shared helper classes that should also invalidate the cache key when they change. + +```python +def helper(x): + return x + 1 + + +class SharedLogic: + def transform(self, x): + return x * 2 + + +class MyModel(CallableModel): + __ccflow_tokenizer_deps__ = [helper, SharedLogic] +``` + +Entries in `__ccflow_tokenizer_deps__` may be functions or classes. Class dependencies are tokenized recursively via `compute_behavior_token()`. Recursive class dependency graphs are rejected with a `TypeError`. + ### Graph Evaluation Dependencies between tasks/steps in a workflow is one of the defining characteristics of a workflow orchestration framework. Earlier we covered how dependencies defined (via composition) between configuration objects.