Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions python/lib/sift_py/yaml/_utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""Exercises the three YAML loader paths in :mod:`sift_py.yaml.utils`.

The three paths, in priority order inside :func:`try_fast_yaml_load`:

1. ``rapidyaml`` (``ryml``) -- C++ bindings, declared as a required dependency.
2. pyyaml ``CSafeLoader`` -- libyaml-backed, available on platforms whose
pyyaml wheel ships libyaml (i.e. almost all of them).
3. pyyaml ``safe_load`` -- pure-Python fallback used when neither libyaml nor
rapidyaml is installed.

Each path should produce an identical dict for the shipped example ingest
config, including correct resolution of anchors (``&foo`` / ``*foo``) and
YAML 1.1 merge keys (``<<: *foo``) used heavily by that config.
"""

from pathlib import Path

import pytest
import yaml

from sift_py.yaml import utils

# Example ingest config loaded by every test in this module. ``parents[3]`` is
# the fourth ancestor directory of this file (``python/`` when this file lives
# at ``python/lib/sift_py/yaml/``), so the config is expected under
# ``<that directory>/examples/``.
EXAMPLE_CONFIG = Path(__file__).resolve().parents[3] / "examples" / "telemetry_config.example.yml"


def _force_pure_python_pyyaml(monkeypatch: pytest.MonkeyPatch) -> None:
    """Remove ``yaml.CSafeLoader`` for the duration of the current test.

    With the attribute gone, ``_pyyaml_load`` no longer sees a C loader and
    takes its ``safe_load`` branch instead. The removal goes through
    *monkeypatch*, so pytest undoes it when the test finishes.
    """
    monkeypatch.delattr(
        yaml,
        "CSafeLoader",
        # Do not error when the attribute is already missing (no libyaml).
        raising=False,
    )


def _assert_sift_ingest_shape(config: dict) -> None:
"""Sanity-check that *config* is a well-formed Sift ingest YAML dict.

Focuses on fields that a miswired loader would get wrong: anchors resolving
to inline dicts, merge keys (``<<:``) being expanded rather than left as
literal ``"<<"`` keys, and numeric scalars surviving the round-trip as
ints rather than strings.
"""
assert isinstance(config, dict)
assert isinstance(config.get("asset_name"), str)
assert isinstance(config["channels"], dict)
assert isinstance(config["flows"], list)
assert config["flows"], "example config should declare at least one flow"

for flow in config["flows"]:
assert isinstance(flow["name"], str)
for ch in flow.get("channels", []):
if not isinstance(ch, dict):
continue
# Merge keys must be inlined, not left as a literal "<<" key.
assert "<<" not in ch, f"unresolved merge key in flow channel: {ch!r}"
# Any numeric field (bit_field_elements[].index, enum_types[].key)
# must come back as int, not str.
for enum in ch.get("enum_types", []) or []:
assert isinstance(enum.get("key"), int), f"enum key not int: {enum!r}"
for bit in ch.get("bit_field_elements", []) or []:
assert isinstance(bit.get("index"), int), f"bit index not int: {bit!r}"
assert isinstance(bit.get("bit_count"), int), f"bit_count not int: {bit!r}"


def test_rapidyaml_load_path():
    """rapidyaml on its own must yield a well-formed ingest dict.

    Skipped when the ``ryml`` module cannot be imported.
    """
    pytest.importorskip("ryml")
    _assert_sift_ingest_shape(utils._rapidyaml_load(EXAMPLE_CONFIG))


def test_pyyaml_csafeloader_path():
    """``_pyyaml_load`` must yield a well-formed ingest dict via ``CSafeLoader``.

    Skipped when this pyyaml build does not expose ``CSafeLoader``.
    """
    libyaml_available = hasattr(yaml, "CSafeLoader")
    if not libyaml_available:
        pytest.skip("libyaml/CSafeLoader not installed in this environment")

    loaded = utils._pyyaml_load(EXAMPLE_CONFIG)
    _assert_sift_ingest_shape(loaded)


def test_pyyaml_pure_python_path(monkeypatch: pytest.MonkeyPatch):
    """``_pyyaml_load`` must still work once ``CSafeLoader`` has been hidden."""
    _force_pure_python_pyyaml(monkeypatch)
    # Guard against a silently ineffective patch: without it this test would
    # just re-run the CSafeLoader path.
    assert not hasattr(yaml, "CSafeLoader"), "monkeypatch failed to hide CSafeLoader"

    loaded = utils._pyyaml_load(EXAMPLE_CONFIG)
    _assert_sift_ingest_shape(loaded)


def test_all_three_loaders_agree(monkeypatch: pytest.MonkeyPatch):
    """All three loader paths must return equal dicts for the example config.

    Ordering is deliberate: both loads that need the unpatched ``yaml`` module
    (rapidyaml, then ``CSafeLoader``) happen first, and ``CSafeLoader`` is only
    hidden afterwards for the pure-Python load, so the patch cannot affect the
    libyaml result.
    """
    pytest.importorskip("ryml")
    if not hasattr(yaml, "CSafeLoader"):
        pytest.skip("libyaml/CSafeLoader not installed; cannot compare all three")

    from_ryml = utils._rapidyaml_load(EXAMPLE_CONFIG)
    from_libyaml = utils._pyyaml_load(EXAMPLE_CONFIG)

    # From here on ``_pyyaml_load`` takes its ``safe_load`` branch.
    _force_pure_python_pyyaml(monkeypatch)
    from_pure_python = utils._pyyaml_load(EXAMPLE_CONFIG)

    # Two separate asserts so a failure names the pair that diverged.
    assert from_ryml == from_libyaml
    assert from_libyaml == from_pure_python


def test_try_fast_yaml_load_dispatches_to_rapidyaml_when_available(
    monkeypatch: pytest.MonkeyPatch,
):
    """``try_fast_yaml_load`` must route through ``_rapidyaml_load`` when ``_HAS_RYML`` is true.

    Comparing parsed output alone cannot prove dispatch, because every loader
    is expected to return the same dict for the example config. After the
    end-to-end equality check, ``_rapidyaml_load`` is therefore replaced with
    a stub returning a sentinel object, and the dispatcher must hand back that
    exact object.
    """
    pytest.importorskip("ryml")
    assert utils._HAS_RYML, "rapidyaml declared as a required dep but not detected"

    # End-to-end: the public entry point agrees with the rapidyaml loader.
    assert utils.try_fast_yaml_load(EXAMPLE_CONFIG) == utils._rapidyaml_load(EXAMPLE_CONFIG)

    # Dispatch: only a call into ``_rapidyaml_load`` can produce the sentinel.
    sentinel = {"loaded_by": "rapidyaml-stub"}
    monkeypatch.setattr(utils, "_rapidyaml_load", lambda path: sentinel)
    assert utils.try_fast_yaml_load(EXAMPLE_CONFIG) is sentinel
65 changes: 59 additions & 6 deletions python/lib/sift_py/yaml/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
import json
from pathlib import Path
from typing import Any, Callable, Dict, Type, cast

import yaml

try:
import ryml
except ImportError:
ryml = None # type: ignore[assignment]

_HAS_RYML = ryml is not None


def _handle_subdir(path: Path, file_handler: Callable):
"""The file_handler callable must accept a Path object as its only argument."""
Expand All @@ -17,13 +25,58 @@ def _type_fqn(typ: Type) -> str:
return f"{typ.__module__}.{typ.__name__}"


def try_fast_yaml_load(path: Path) -> Dict[Any, Any]:
"""
Try to load the YAML file using the CSafeLoader, which is faster than the pyyaml safe loader but not built into the wheel for earlier versions of python..
If the CSafeLoader is not available, use the pyyaml safe loader.
def _rapidyaml_load(path: Path) -> Dict[Any, Any]:
"""Parse YAML via rapidyaml, round-tripping through JSON for a plain dict.

rapidyaml (``ryml``) is the C++ ryml binding; it is materially faster than
pyyaml on Sift telemetry configs (~3-4x on the example files, higher on
large files). ``tree.resolve()`` is called before emit, which inlines both
anchors/aliases (``&x`` / ``*x``) and merge keys (``<<: *x``); combined
with ``emit_json``'s scalar type inference, the returned dict matches
pyyaml's safe-load semantics on every config we ship today.

The one semantic diff to watch for is YAML 1.1-isms that rapidyaml (YAML
1.2) does not treat as typed: e.g. ``yes``/``no`` stay strings, and
sexagesimal numbers stay strings. Existing Sift configs do not use these.

Only call this when ``_HAS_RYML`` is true; the ``assert`` below narrows the
type for static analysis and is an invariant the dispatcher upholds.
"""
assert ryml is not None, "rapidyaml is not installed; call _pyyaml_load instead"
with open(path, "rb") as f:
tree = ryml.parse_in_arena(f.read())
resolve = getattr(tree, "resolve", None)
if callable(resolve):
resolve()
return cast(Dict[Any, Any], json.loads(ryml.emit_json(tree)))


def _pyyaml_load(path: Path) -> Dict[Any, Any]:
    """Load YAML with pyyaml, preferring the C-backed ``CSafeLoader``.

    The file is read as bytes so that pyyaml performs its own encoding
    detection instead of relying on the platform's default text encoding
    (which is not UTF-8 everywhere). This also matches ``_rapidyaml_load``,
    which reads the file in binary mode.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed document.
    """
    # Resolved on every call (not at import time) so that hiding
    # ``yaml.CSafeLoader`` at runtime switches to the pure-Python loader.
    # ``yaml.safe_load`` is ``yaml.load`` with ``SafeLoader``, so both
    # branches remain safe loaders.
    loader = getattr(yaml, "CSafeLoader", yaml.SafeLoader)
    with open(path, "rb") as f:
        return cast(Dict[Any, Any], yaml.load(f.read(), Loader=loader))


def try_fast_yaml_load(path: Path) -> Dict[Any, Any]:
    """
    Load the YAML file at ``path`` using the fastest available parser.

    Order of preference:

    1. ``rapidyaml`` (``ryml``) - C++ binding, used whenever the module was
       importable (``_HAS_RYML``). Anchors and merge keys are resolved by
       :func:`_rapidyaml_load`; see its docstring for the remaining caveats
       (notably: YAML 1.1-only scalars such as ``yes``/``no`` stay strings).
    2. ``pyyaml.CSafeLoader`` - libyaml-backed, bundled with most pyyaml
       wheels but not every Python/platform combination.
    3. ``pyyaml.safe_load`` - pure-Python fallback.

    Steps 2 and 3 are both handled by :func:`_pyyaml_load`.

    rapidyaml failures are not swallowed silently: if ``ryml`` is installed
    but raises while parsing ``path``, the exception propagates so the
    regression is visible rather than masked by the pyyaml fallback.
    """
    if _HAS_RYML:
        return _rapidyaml_load(path)
    return _pyyaml_load(path)
5 changes: 5 additions & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ keywords = ["sift", "sift-stack", "siftstack", "sift_py"]
dependencies = [
"grpcio~=1.13",
"PyYAML~=6.0",
"rapidyaml~=0.11",
"pandas>=2.0,<3.1",
"protobuf>=5.0",
"pydantic~=2.10",
Expand Down Expand Up @@ -319,6 +320,10 @@ ignore_errors = true
[[tool.mypy.overrides]]
module = "ruamel"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "ryml"
ignore_missing_imports = true
ignore_errors = true

[[tool.mypy.overrides]]
Expand Down
Loading