feat: Add sync_stages for ordered concurrent sync execution

jdrew82 · claude · jdrew82 · commit 8f331035d29f · 2026-03-19T09:34:35.000-05:00
Add sync_stages ClassVar on Adapter to control the order of model type
processing during concurrent sync. Stages execute sequentially while
elements within each stage run in parallel via ThreadPoolExecutor.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diffsync/__init__.py b/diffsync/__init__.py
@@ -487,6 +487,22 @@ class Adapter:  # pylint: disable=too-many-public-methods
     top_level: ClassVar[List[str]] = []
     """List of top-level modelnames to begin from when diffing or synchronizing."""
 
+    sync_stages: ClassVar[Optional[List[List[str]]]] = None
+    """Optional ordered groups of model types for staged concurrent sync.
+
+    Each inner list is a "stage" of model types that can safely execute in parallel.
+    Stages are processed sequentially — all elements in stage N complete before stage N+1 begins.
+    Ignored unless ``concurrent=True``; top-level types in no stage then sync serially after the last stage.
+
+    Example::
+
+        sync_stages = [
+            ["site", "vlan"],      # stage 1: independent types, run in parallel
+            ["device"],            # stage 2: depends on sites
+            ["interface"],         # stage 3: depends on devices
+        ]
+    """
+
     def __init__(
         self,
         name: Optional[str] = None,
@@ -528,6 +544,21 @@ def __init_subclass__(cls) -> None:
             if not isclass(value) or not issubclass(value, DiffSyncModel):
                 raise AttributeError(f'top_level references attribute "{name}" but it is not a DiffSyncModel subclass!')
 
+        if cls.sync_stages is not None:
+            top_level_set = set(cls.top_level)
+            seen: set = set()
+            for stage in cls.sync_stages:
+                for model_type in stage:
+                    if model_type not in top_level_set:
+                        raise AttributeError(
+                            f'sync_stages references "{model_type}" but it is not in top_level!'
+                        )
+                    if model_type in seen:
+                        raise AttributeError(
+                            f'sync_stages contains duplicate entry "{model_type}"!'
+                        )
+                    seen.add(model_type)
+
     def __new__(cls, **kwargs):  # type: ignore[no-untyped-def]
         """Document keyword arguments that were used to initialize Adapter."""
         meta_kwargs = {}
@@ -687,6 +718,7 @@ def sync_from(  # pylint: disable=too-many-arguments,R0917,too-many-locals
             batch_size=batch_size,
             concurrent=concurrent,
             max_workers=max_workers,
+            sync_stages=self.sync_stages,
         )
         result = syncer.perform_sync()
         if result:
diff --git a/diffsync/helpers.py b/diffsync/helpers.py
@@ -346,6 +346,7 @@ def __init__(  # pylint: disable=too-many-arguments,R0917
         batch_size: Optional[int] = None,
         concurrent: bool = False,
         max_workers: Optional[int] = None,
+        sync_stages: Optional[List[List[str]]] = None,
     ):
         """Create a DiffSyncSyncer instance, ready to call `perform_sync()` against."""
         self.diff = diff
@@ -363,6 +364,7 @@ def __init__(  # pylint: disable=too-many-arguments,R0917
         # Feature 3: Parallel sync of independent subtrees
         self.concurrent = concurrent
         self.max_workers = max_workers
+        self.sync_stages = sync_stages
 
         # Feature 4: Structured operations summary
         self.operations: Dict[str, Dict[str, List[Dict]]] = {}
@@ -397,11 +399,30 @@ def perform_sync(self) -> bool:
 
         # Feature 3: Parallel sync of independent subtrees
         if self.concurrent:
-            elements = list(self.diff.get_children())
-            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-                futures = {executor.submit(self.sync_diff_element, element): element for element in elements}
-                for future in as_completed(futures):
-                    changed |= future.result()
+            if self.sync_stages:
+                # Staged concurrent execution: process each stage sequentially,
+                # parallelizing elements within each stage.
+                for stage in self.sync_stages:
+                    stage_set = set(stage)
+                    stage_elements = [el for el in self.diff.get_children() if el.type in stage_set]
+                    if stage_elements:
+                        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                            futures = {executor.submit(self.sync_diff_element, el): el for el in stage_elements}
+                            for future in as_completed(futures):
+                                changed |= future.result()
+
+                # Handle any elements whose type is not covered by sync_stages (serial fallback)
+                staged_types = {t for stage in self.sync_stages for t in stage}
+                for element in self.diff.get_children():
+                    if element.type not in staged_types:
+                        changed |= self.sync_diff_element(element)
+            else:
+                # No stages defined — all elements in one pool (original behavior)
+                elements = list(self.diff.get_children())
+                with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                    futures = {executor.submit(self.sync_diff_element, element): element for element in elements}
+                    for future in as_completed(futures):
+                        changed |= future.result()
         else:
             for element in self.diff.get_children():
                 changed |= self.sync_diff_element(element)
diff --git a/tests/unit/test_diffsync_diff_and_sync_parameters.py b/tests/unit/test_diffsync_diff_and_sync_parameters.py
@@ -442,3 +442,189 @@ def test_filters_combined_with_sync_attrs():
             diffs = de.get_attrs_diffs()
             if "+" in diffs:
                 assert "tag" not in diffs["+"]
+
+
+# ---------------------------------------------------------------------------
+# sync_stages — ordered group execution for concurrent sync
+# ---------------------------------------------------------------------------
+
+
+# Models and adapters for sync_stages tests — uses multiple top-level types
+# to exercise staged parallelism.
+
+_creation_order: List = []
+
+
+class _Region(DiffSyncModel):
+    _modelname = "region"
+    _identifiers = ("name",)
+    _attributes = ("slug",)
+
+    name: str
+    slug: str = ""
+
+    @classmethod
+    def create(cls, adapter, ids, attrs):
+        _creation_order.append(("region", ids["name"]))
+        return super().create(adapter=adapter, ids=ids, attrs=attrs)
+
+
+class _Tenant(DiffSyncModel):
+    _modelname = "tenant"
+    _identifiers = ("name",)
+    _attributes = ("group",)
+
+    name: str
+    group: str = ""
+
+    @classmethod
+    def create(cls, adapter, ids, attrs):
+        _creation_order.append(("tenant", ids["name"]))
+        return super().create(adapter=adapter, ids=ids, attrs=attrs)
+
+
+class _Rack(DiffSyncModel):
+    _modelname = "rack"
+    _identifiers = ("name",)
+    _attributes = ("site_name",)
+
+    name: str
+    site_name: str = ""
+
+    @classmethod
+    def create(cls, adapter, ids, attrs):
+        _creation_order.append(("rack", ids["name"]))
+        return super().create(adapter=adapter, ids=ids, attrs=attrs)
+
+
+class _StagedAdapter(Adapter):
+    region = _Region
+    tenant = _Tenant
+    rack = _Rack
+    top_level = ["region", "tenant", "rack"]
+    sync_stages = [
+        ["region", "tenant"],  # stage 1: independent, can run in parallel
+        ["rack"],              # stage 2: depends on regions being created
+    ]
+
+
+class _UnstagedAdapter(Adapter):
+    """Same models, no sync_stages — for comparison."""
+    region = _Region
+    tenant = _Tenant
+    rack = _Rack
+    top_level = ["region", "tenant", "rack"]
+
+
+def _make_staged_pair(adapter_cls=_StagedAdapter):
+    """Build a source with regions/tenants/racks and an empty destination."""
+    src = adapter_cls()
+    dst = adapter_cls()
+
+    src.add(_Region(name="region1", slug="r1"))
+    src.add(_Region(name="region2", slug="r2"))
+    src.add(_Tenant(name="tenant1", group="g1"))
+    src.add(_Rack(name="rack1", site_name="region1"))
+    src.add(_Rack(name="rack2", site_name="region2"))
+
+    return src, dst
+
+
+def test_sync_stages_executes_in_order():
+    """All stage-1 types (region, tenant) must be created before any stage-2 type (rack)."""
+    _creation_order.clear()
+    src, dst = _make_staged_pair()
+    dst.sync_from(src, concurrent=True, max_workers=4)
+
+    # Collect the positions at which each model type was created
+    rack_indices = [i for i, (t, _) in enumerate(_creation_order) if t == "rack"]
+    region_indices = [i for i, (t, _) in enumerate(_creation_order) if t == "region"]
+    tenant_indices = [i for i, (t, _) in enumerate(_creation_order) if t == "tenant"]
+
+    assert len(rack_indices) == 2
+    assert len(region_indices) == 2
+    assert len(tenant_indices) == 1
+
+    # All stage-1 creations (regions + tenants) must come before any stage-2 creation (racks)
+    max_stage1_index = max(max(region_indices), max(tenant_indices))
+    min_stage2_index = min(rack_indices)
+    assert max_stage1_index < min_stage2_index, (
+        f"Stage 1 items must all complete before stage 2 begins. "
+        f"Order was: {_creation_order}"
+    )
+
+
+def test_sync_stages_parallelizes_within_stage():
+    """Every staged type is created under concurrent sync (checks presence only, not actual parallelism)."""
+    _creation_order.clear()
+    src, dst = _make_staged_pair()
+    dst.sync_from(src, concurrent=True, max_workers=4)
+
+    types_created = {t for t, _ in _creation_order}
+    assert "region" in types_created
+    assert "tenant" in types_created
+    assert "rack" in types_created
+
+
+def test_sync_stages_none_preserves_current_behavior():
+    """sync_stages=None with concurrent=True should behave like the original unstaged concurrent sync."""
+    _creation_order.clear()
+    src, dst = _make_staged_pair(_UnstagedAdapter)
+    dst.sync_from(src, concurrent=True, max_workers=2)
+
+    assert dst.get_or_none("region", "region1") is not None
+    assert dst.get_or_none("tenant", "tenant1") is not None
+    assert dst.get_or_none("rack", "rack1") is not None
+
+
+def test_sync_stages_ignored_when_serial():
+    """sync_stages should have no effect on serial sync — top_level order is used."""
+    _creation_order.clear()
+    src, dst = _make_staged_pair()
+    dst.sync_from(src, concurrent=False)
+
+    assert dst.get_or_none("region", "region1") is not None
+    assert dst.get_or_none("rack", "rack1") is not None
+
+
+def test_sync_stages_validation_rejects_unknown_type():
+    """A type in sync_stages that is not in top_level should raise AttributeError."""
+    import pytest
+
+    with pytest.raises(AttributeError, match="sync_stages.*not in top_level"):
+        class _BadAdapter(Adapter):
+            region = _Region
+            top_level = ["region"]
+            sync_stages = [["region", "nonexistent"]]
+
+
+def test_sync_stages_validation_rejects_duplicates():
+    """A type appearing in multiple stages should raise AttributeError."""
+    import pytest
+
+    with pytest.raises(AttributeError, match="sync_stages.*duplicate"):
+        class _BadAdapter(Adapter):
+            region = _Region
+            tenant = _Tenant
+            top_level = ["region", "tenant"]
+            sync_stages = [["region", "tenant"], ["region"]]
+
+
+def test_sync_stages_unstaged_types_still_sync():
+    """A type in top_level but not in any stage should still be synced (serially, after all stages)."""
+
+    class _PartialStagesAdapter(Adapter):
+        region = _Region
+        tenant = _Tenant
+        rack = _Rack
+        top_level = ["region", "tenant", "rack"]
+        sync_stages = [["region"]]  # tenant and rack not staged
+
+    _creation_order.clear()
+    src, dst = _make_staged_pair(_PartialStagesAdapter)
+    dst.sync_from(src, concurrent=True, max_workers=2)
+
+    # All types should still be synced
+    assert dst.get_or_none("region", "region1") is not None
+    assert dst.get_or_none("tenant", "tenant1") is not None
+    assert dst.get_or_none("rack", "rack1") is not None