From a77df57e2e2d9071d810cbc84b95aab7c663702f Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 23 Apr 2026 17:10:59 -0700 Subject: [PATCH 1/4] python(feat): Add rapidyaml as the default yaml loader --- python/lib/sift_py/yaml/_utils_test.py | 115 +++++++++++++++++++++++++ python/lib/sift_py/yaml/utils.py | 61 +++++++++++-- python/pyproject.toml | 1 + 3 files changed, 171 insertions(+), 6 deletions(-) create mode 100644 python/lib/sift_py/yaml/_utils_test.py diff --git a/python/lib/sift_py/yaml/_utils_test.py b/python/lib/sift_py/yaml/_utils_test.py new file mode 100644 index 000000000..61b8c83cf --- /dev/null +++ b/python/lib/sift_py/yaml/_utils_test.py @@ -0,0 +1,115 @@ +"""Exercises the three YAML loader paths in :mod:`sift_py.yaml.utils`. + +The three paths, in priority order inside :func:`try_fast_yaml_load`: + +1. ``rapidyaml`` (``ryml``) -- C++ bindings, declared as a required dependency. +2. pyyaml ``CSafeLoader`` -- libyaml-backed, available on platforms whose + pyyaml wheel ships libyaml (i.e. almost all of them). +3. pyyaml ``safe_load`` -- pure-Python fallback used when neither libyaml nor + rapidyaml is installed. + +Each path should produce an identical dict for the shipped example ingest +config, including correct resolution of anchors (``&foo`` / ``*foo``) and +YAML 1.1 merge keys (``<<: *foo``) used heavily by that config. +""" + +from pathlib import Path + +import pytest +import yaml + +from sift_py.yaml import utils + +EXAMPLE_CONFIG = ( + Path(__file__).resolve().parents[3] / "examples" / "telemetry_config.example.yml" +) + + +def _force_pure_python_pyyaml(monkeypatch: pytest.MonkeyPatch) -> None: + """Hide ``yaml.CSafeLoader`` so ``_pyyaml_load`` falls through to ``safe_load``.""" + monkeypatch.delattr(yaml, "CSafeLoader", raising=False) + + +def _assert_sift_ingest_shape(config: dict) -> None: + """Sanity-check that *config* is a well-formed Sift ingest YAML dict. 
+ + Focuses on fields that a miswired loader would get wrong: anchors resolving + to inline dicts, merge keys (``<<:``) being expanded rather than left as + literal ``"<<"`` keys, and numeric scalars surviving the round-trip as + ints rather than strings. + """ + assert isinstance(config, dict) + assert isinstance(config.get("asset_name"), str) + assert isinstance(config["channels"], dict) + assert isinstance(config["flows"], list) + assert config["flows"], "example config should declare at least one flow" + + for flow in config["flows"]: + assert isinstance(flow["name"], str) + for ch in flow.get("channels", []): + if not isinstance(ch, dict): + continue + # Merge keys must be inlined, not left as a literal "<<" key. + assert "<<" not in ch, f"unresolved merge key in flow channel: {ch!r}" + # Any numeric field (bit_field_elements[].index, enum_types[].key) + # must come back as int, not str. + for enum in ch.get("enum_types", []) or []: + assert isinstance(enum.get("key"), int), f"enum key not int: {enum!r}" + for bit in ch.get("bit_field_elements", []) or []: + assert isinstance(bit.get("index"), int), f"bit index not int: {bit!r}" + assert isinstance(bit.get("bit_count"), int), ( + f"bit_count not int: {bit!r}" + ) + + +def test_rapidyaml_load_path(): + """Primary path: rapidyaml produces a schema-valid ingest dict.""" + pytest.importorskip("ryml") + result = utils._rapidyaml_load(EXAMPLE_CONFIG) + _assert_sift_ingest_shape(result) + + +def test_pyyaml_csafeloader_path(): + """Fallback path 1: pyyaml's libyaml-backed ``CSafeLoader``.""" + if not hasattr(yaml, "CSafeLoader"): + pytest.skip("libyaml/CSafeLoader not installed in this environment") + result = utils._pyyaml_load(EXAMPLE_CONFIG) + _assert_sift_ingest_shape(result) + + +def test_pyyaml_pure_python_path(monkeypatch: pytest.MonkeyPatch): + """Fallback path 2: pure-Python ``safe_load`` with ``CSafeLoader`` hidden.""" + _force_pure_python_pyyaml(monkeypatch) + assert not hasattr(yaml, "CSafeLoader"), 
"monkeypatch failed to hide CSafeLoader" + result = utils._pyyaml_load(EXAMPLE_CONFIG) + _assert_sift_ingest_shape(result) + + +def test_all_three_loaders_agree(monkeypatch: pytest.MonkeyPatch): + """rapidyaml, CSafeLoader, and pure-Python ``safe_load`` return the same dict. + + Dispatch order matters: call the two pyyaml paths *after* rapidyaml, and + perform the pure-Python load last so the ``CSafeLoader`` monkeypatch + cannot leak into the libyaml path. + """ + pytest.importorskip("ryml") + if not hasattr(yaml, "CSafeLoader"): + pytest.skip("libyaml/CSafeLoader not installed; cannot compare all three") + + via_ryml = utils._rapidyaml_load(EXAMPLE_CONFIG) + via_csafe = utils._pyyaml_load(EXAMPLE_CONFIG) + + _force_pure_python_pyyaml(monkeypatch) + via_safe = utils._pyyaml_load(EXAMPLE_CONFIG) + + assert via_ryml == via_csafe + assert via_csafe == via_safe + + +def test_try_fast_yaml_load_dispatches_to_rapidyaml_when_available(): + """``try_fast_yaml_load`` returns the rapidyaml result when ``_HAS_RYML`` is true.""" + pytest.importorskip("ryml") + assert utils._HAS_RYML, "rapidyaml declared as a required dep but not detected" + assert utils.try_fast_yaml_load(EXAMPLE_CONFIG) == utils._rapidyaml_load( + EXAMPLE_CONFIG + ) diff --git a/python/lib/sift_py/yaml/utils.py b/python/lib/sift_py/yaml/utils.py index 4270046f2..625ce58c7 100644 --- a/python/lib/sift_py/yaml/utils.py +++ b/python/lib/sift_py/yaml/utils.py @@ -1,8 +1,16 @@ +import json from pathlib import Path from typing import Any, Callable, Dict, Type, cast import yaml +try: + import ryml # type: ignore[import-not-found] + + _HAS_RYML = True +except ImportError: + _HAS_RYML = False + def _handle_subdir(path: Path, file_handler: Callable): """The file_handler callable must accept a Path object as its only argument.""" @@ -17,13 +25,54 @@ def _type_fqn(typ: Type) -> str: return f"{typ.__module__}.{typ.__name__}" -def try_fast_yaml_load(path: Path) -> Dict[Any, Any]: - """ - Try to load the YAML file using the 
CSafeLoader, which is faster than the pyyaml safe loader but not built into the wheel for earlier versions of python.. - If the CSafeLoader is not available, use the pyyaml safe loader. +def _rapidyaml_load(path: Path) -> Dict[Any, Any]: + """Parse YAML via rapidyaml, round-tripping through JSON for a plain dict. + + rapidyaml (``ryml``) is the C++ ryml binding; it is materially faster than + pyyaml on Sift telemetry configs (~3-4x on the example files, higher on + large files). ``tree.resolve()`` is called before emit, which inlines both + anchors/aliases (``&x`` / ``*x``) and merge keys (``<<: *x``); combined + with ``emit_json``'s scalar type inference, the returned dict matches + pyyaml's safe-load semantics on every config we ship today. + + The one semantic diff to watch for is YAML 1.1-isms that rapidyaml (YAML + 1.2) does not treat as typed: e.g. ``yes``/``no`` stay strings, and + sexagesimal numbers stay strings. Existing Sift configs do not use these. """ + with open(path, "rb") as f: + tree = ryml.parse_in_arena(f.read()) + resolve = getattr(tree, "resolve", None) + if callable(resolve): + resolve() + return cast(Dict[Any, Any], json.loads(ryml.emit_json(tree))) + + +def _pyyaml_load(path: Path) -> Dict[Any, Any]: + """Fallback loader using pyyaml's C-backed ``CSafeLoader`` when available.""" with open(path, "r") as f: if hasattr(yaml, "CSafeLoader"): return cast(Dict[Any, Any], yaml.load(f.read(), Loader=yaml.CSafeLoader)) - else: - return cast(Dict[Any, Any], yaml.safe_load(f.read())) + return cast(Dict[Any, Any], yaml.safe_load(f.read())) + + +def try_fast_yaml_load(path: Path) -> Dict[Any, Any]: + """ + Try to load the YAML file using the fastest available parser. + + Order of preference: + + 1. ``rapidyaml`` (``ryml``) - C++ binding, materially faster than pyyaml; + installed with ``pip install rapidyaml``. See :func:`_rapidyaml_load` + for caveats (notably: YAML 1.1-only scalars such as ``yes``/``no`` + and sexagesimal numbers stay strings). + 2.
``pyyaml.CSafeLoader`` - libyaml-backed, bundled with most pyyaml + wheels but not every Python/platform combination. + 3. ``pyyaml.safe_load`` - pure-Python fallback. + + rapidyaml failures are not swallowed silently: if ``ryml`` is installed + but raises while parsing ``path``, the exception propagates so the + regression is visible rather than masked by the pyyaml fallback. + """ + if _HAS_RYML: + return _rapidyaml_load(path) + return _pyyaml_load(path) diff --git a/python/pyproject.toml b/python/pyproject.toml index 048210a60..9084dabea 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -26,6 +26,7 @@ keywords = ["sift", "sift-stack", "siftstack", "sift_py"] dependencies = [ "grpcio~=1.13", "PyYAML~=6.0", + "rapidyaml~=0.11", "pandas>=2.0,<3.1", "protobuf>=5.0", "pydantic~=2.10", From f9b9b784e3f7d46efd41a709cf94989ac5c4e2b4 Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 23 Apr 2026 17:16:53 -0700 Subject: [PATCH 2/4] lint --- python/lib/sift_py/yaml/_utils_test.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/python/lib/sift_py/yaml/_utils_test.py b/python/lib/sift_py/yaml/_utils_test.py index 61b8c83cf..279fedc8b 100644 --- a/python/lib/sift_py/yaml/_utils_test.py +++ b/python/lib/sift_py/yaml/_utils_test.py @@ -20,9 +20,7 @@ from sift_py.yaml import utils -EXAMPLE_CONFIG = ( - Path(__file__).resolve().parents[3] / "examples" / "telemetry_config.example.yml" -) +EXAMPLE_CONFIG = Path(__file__).resolve().parents[3] / "examples" / "telemetry_config.example.yml" def _force_pure_python_pyyaml(monkeypatch: pytest.MonkeyPatch) -> None: @@ -57,9 +55,7 @@ def _assert_sift_ingest_shape(config: dict) -> None: assert isinstance(enum.get("key"), int), f"enum key not int: {enum!r}" for bit in ch.get("bit_field_elements", []) or []: assert isinstance(bit.get("index"), int), f"bit index not int: {bit!r}" - assert isinstance(bit.get("bit_count"), int), ( - f"bit_count not int: {bit!r}" - ) + assert 
isinstance(bit.get("bit_count"), int), f"bit_count not int: {bit!r}" def test_rapidyaml_load_path(): @@ -110,6 +106,4 @@ def test_try_fast_yaml_load_dispatches_to_rapidyaml_when_available(): """``try_fast_yaml_load`` returns the rapidyaml result when ``_HAS_RYML`` is true.""" pytest.importorskip("ryml") assert utils._HAS_RYML, "rapidyaml declared as a required dep but not detected" - assert utils.try_fast_yaml_load(EXAMPLE_CONFIG) == utils._rapidyaml_load( - EXAMPLE_CONFIG - ) + assert utils.try_fast_yaml_load(EXAMPLE_CONFIG) == utils._rapidyaml_load(EXAMPLE_CONFIG) From 26efef7b2c387e243bcee68edb5ab8d2c34e003e Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 23 Apr 2026 17:21:52 -0700 Subject: [PATCH 3/4] missing imports warning --- python/lib/sift_py/yaml/utils.py | 2 +- python/pyproject.toml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/lib/sift_py/yaml/utils.py b/python/lib/sift_py/yaml/utils.py index 625ce58c7..c31d5136e 100644 --- a/python/lib/sift_py/yaml/utils.py +++ b/python/lib/sift_py/yaml/utils.py @@ -5,7 +5,7 @@ import yaml try: - import ryml # type: ignore[import-not-found] + import ryml _HAS_RYML = True except ImportError: diff --git a/python/pyproject.toml b/python/pyproject.toml index 9084dabea..287a2a0d4 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -320,6 +320,10 @@ ignore_errors = true [[tool.mypy.overrides]] module = "ruamel" ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "ryml" +ignore_missing_imports = true ignore_errors = true [[tool.mypy.overrides]] From 3e3eacf5ce9589ce90092bdda2e7edbdb701216a Mon Sep 17 00:00:00 2001 From: Ian Later Date: Thu, 23 Apr 2026 17:36:50 -0700 Subject: [PATCH 4/4] pyright --- python/lib/sift_py/yaml/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/lib/sift_py/yaml/utils.py b/python/lib/sift_py/yaml/utils.py index c31d5136e..99fafc5ec 100644 --- a/python/lib/sift_py/yaml/utils.py +++ 
b/python/lib/sift_py/yaml/utils.py @@ -6,10 +6,10 @@ try: import ryml - - _HAS_RYML = True except ImportError: - _HAS_RYML = False + ryml = None # type: ignore[assignment] + +_HAS_RYML = ryml is not None def _handle_subdir(path: Path, file_handler: Callable): @@ -38,7 +38,11 @@ def _rapidyaml_load(path: Path) -> Dict[Any, Any]: The one semantic diff to watch for is YAML 1.1-isms that rapidyaml (YAML 1.2) does not treat as typed: e.g. ``yes``/``no`` stay strings, and sexagesimal numbers stay strings. Existing Sift configs do not use these. + + Only call this when ``_HAS_RYML`` is true; the ``assert`` below narrows the + type for static analysis and is an invariant the dispatcher upholds. """ + assert ryml is not None, "rapidyaml is not installed; call _pyyaml_load instead" with open(path, "rb") as f: tree = ryml.parse_in_arena(f.read()) resolve = getattr(tree, "resolve", None)