Skip to content

Commit 979126f

Browse files
committed
Add distributed fault recovery receipts
1 parent 3182acf commit 979126f

8 files changed

Lines changed: 555 additions & 4 deletions

File tree

.blitz/test_state_v1/indexes/task_states.ndjson

Lines changed: 6 additions & 0 deletions
Large diffs are not rendered by default.

examples/gn_ten_distributed_stack/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,21 @@ Phase 9 extends the proof through the fugu router/model substrate:
2222
- keep Execution Plane lower-lane execution out of the default proof until a
2323
scenario requires it.
2424

25+
Phase 11 records local fault and recovery posture:
26+
27+
- emit bounded fault receipts for node crash, distribution disconnect/heal,
28+
facade timeout, stale DTO, duplicate delivery, and trace exporter failure;
29+
- cite owner recovery evidence for Mezzanine, Citadel, StackLab's existing
30+
pressure/failover drill, and AITrace;
31+
- keep WAN, production discovery, release boot, and live-provider retry
32+
semantics as explicit non-claims.
33+
2534
## Commands
2635

2736
```bash
2837
mix stack_lab.gn_ten.distributed.prove --profile context_6_node --json
2938
mix stack_lab.gn_ten.distributed.prove --profile router_model_6_node --json
39+
mix stack_lab.gn_ten.distributed.prove --profile partition_recovery --json
3040
```
3141

3242
## QC

examples/gn_ten_distributed_stack/lib/mix/tasks/stack_lab.gn_ten.distributed.prove.ex

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ defmodule Mix.Tasks.StackLab.GnTen.Distributed.Prove do
2828
"router_model_6_node" ->
2929
run_router_model(opts)
3030

31+
"partition_recovery" ->
32+
run_partition_recovery(opts)
33+
3134
other ->
3235
Mix.raise("unsupported distributed proof profile: #{other}")
3336
end
@@ -76,4 +79,26 @@ defmodule Mix.Tasks.StackLab.GnTen.Distributed.Prove do
7679
Mix.raise("distributed proof failed: #{inspect(reason)}")
7780
end
7881
end
82+
83+
# Runs the partition recovery proof and reports the outcome on the Mix shell.
#
# Only `:topology` entries are forwarded to the proof, renamed to
# `:topology_path`. With `json: true` the receipt is printed as JSON; otherwise
# the status, receipt ref and topology ref are printed one per line. An
# `{:error, reason}` result aborts the task through `Mix.raise/1`.
defp run_partition_recovery(opts) do
  json? = Keyword.get(opts, :json, false)

  proof_opts =
    opts
    |> Keyword.get_values(:topology)
    |> Enum.map(&{:topology_path, &1})

  case GnTenDistributedStack.run_partition_recovery(proof_opts) do
    {:error, reason} ->
      Mix.raise("distributed proof failed: #{inspect(reason)}")

    {:ok, receipt} ->
      emit_partition_recovery_receipt(receipt, json?)
  end
end

# JSON output is selected only by a literal `true`, never by a merely truthy value.
defp emit_partition_recovery_receipt(receipt, true) do
  Mix.shell().info(GnTenDistributedStack.to_json!(receipt))
end

defp emit_partition_recovery_receipt(receipt, _json?) do
  Mix.shell().info("status=#{receipt.status}")
  Mix.shell().info("receipt_ref=#{receipt.receipt_ref}")
  Mix.shell().info("topology_ref=#{receipt.topology_ref}")
end
79104
end

examples/gn_ten_distributed_stack/lib/stack_lab/examples/gn_ten_distributed_stack.ex

Lines changed: 146 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,24 +108,64 @@ defmodule StackLab.Examples.GnTenDistributedStack.RouterModelReceipt do
108108
}
109109
end
110110

111+
defmodule StackLab.Examples.GnTenDistributedStack.FaultRecoveryReceipt do
  @moduledoc """
  Receipt struct for the distributed gn-ten fault and recovery proof.

  Every field is listed in `@enforce_keys`, so all of them must be supplied
  when the struct is built; none has a default value.
  """

  # Single field list shared by @enforce_keys and defstruct so the two cannot drift apart.
  @fields ~w(
    receipt_ref
    schema_version
    status
    profile
    topology_ref
    baseline_receipt_ref
    fault_receipts
    owner_recovery_evidence
    trace_refs
    node_lab_run
    does_not_prove
  )a

  @enforce_keys @fields
  defstruct @fields

  @type t :: %__MODULE__{
          receipt_ref: String.t(),
          schema_version: String.t(),
          status: :pass | :open_defect,
          profile: String.t(),
          topology_ref: String.t(),
          baseline_receipt_ref: String.t(),
          fault_receipts: [map()],
          owner_recovery_evidence: [map()],
          trace_refs: [String.t()],
          node_lab_run: map(),
          does_not_prove: [String.t()]
        }
end
143+
111144
defmodule StackLab.Examples.GnTenDistributedStack do
112145
@moduledoc """
113146
Local distributed gn-ten proof scenarios.
114147
"""
115148

116-
alias StackLab.Examples.GnTenDistributedStack.{Receipt, RouterModelReceipt}
149+
alias StackLab.Examples.GnTenDistributedStack.{
150+
FaultRecoveryReceipt,
151+
Receipt,
152+
RouterModelReceipt
153+
}
117154

118155
@context_schema_version "stack_lab.gn_ten_distributed_stack.context_6_node.v1"
119156
@router_model_schema_version "stack_lab.gn_ten_distributed_stack.router_model_6_node.v1"
157+
@fault_recovery_schema_version "stack_lab.gn_ten_distributed_stack.partition_recovery.v1"
120158
@context_profile "context_6_node"
121159
@router_model_profile "router_model_6_node"
160+
@fault_recovery_profile "partition_recovery"
122161
@envelope_schema_version "stack_lab.distributed_envelope.v1"
123162
@context_roundtrip Module.concat([StackLab, Examples, ContextABIRoundtrip])
124163
@router_roundtrip Module.concat([StackLab, Examples, NSHKRRouterFabricRoundtrip])
125164
@aitrace_evidence Module.concat([AITrace, RemoteFacade, Evidence])
126165
@aitrace_fixture_transport Module.concat([AITrace, NSHKR, ExportTransport, Fixture])
127166
@replay_bundle Module.concat([AITrace, Trace, ReplayBundle])
128167
@envelope_scanner Module.concat([StackLab, GnTenNodeLab, EnvelopeScanner])
168+
@fault_drill Module.concat([StackLab, GnTenNodeLab, FaultDrill])
129169
@runner Module.concat([StackLab, GnTenNodeLab, Runner])
130170
@json Module.concat([Jason])
131171

@@ -252,11 +292,41 @@ defmodule StackLab.Examples.GnTenDistributedStack do
252292
end
253293
end
254294

255-
@spec to_map(Receipt.t() | RouterModelReceipt.t()) :: map()
295+
@doc """
Runs the `partition_recovery` proof on top of the router/model baseline.

The baseline is produced by `run_router_model_6_node/1` with the same `opts`.
When it succeeds, the fault drills are run against its node lab run and the
result is wrapped in a `FaultRecoveryReceipt` that reuses the baseline's
topology ref, trace refs and node lab run. The status is `:pass` only when
every fault receipt reports `"pass"`; otherwise it is `:open_defect`.

Any non-`{:ok, _}` baseline result is returned unchanged.
"""
@spec run_partition_recovery(keyword()) :: {:ok, FaultRecoveryReceipt.t()} | {:error, term()}
def run_partition_recovery(opts \\ []) when is_list(opts) do
  case run_router_model_6_node(opts) do
    {:ok, baseline} ->
      drill_receipts = fault_receipts(baseline)

      # Explicit non-claims: what this local proof deliberately does not cover.
      non_claims = [
        "WAN partition behavior",
        "production service discovery",
        "release artifact boot",
        "live provider retry semantics",
        "Execution Plane lower-lane partition behavior"
      ]

      receipt = %FaultRecoveryReceipt{
        receipt_ref: fault_recovery_receipt_ref(baseline),
        schema_version: @fault_recovery_schema_version,
        status: fault_recovery_status(drill_receipts),
        profile: @fault_recovery_profile,
        topology_ref: baseline.topology_ref,
        baseline_receipt_ref: baseline.receipt_ref,
        fault_receipts: drill_receipts,
        owner_recovery_evidence: owner_recovery_evidence(),
        trace_refs: baseline.trace_refs,
        node_lab_run: baseline.node_lab_run,
        does_not_prove: non_claims
      }

      {:ok, receipt}

    baseline_failure ->
      baseline_failure
  end
end
323+
324+
@doc """
Converts any of the supported receipt structs into a JSON-safe map.

Raises `FunctionClauseError` for anything that is not a `Receipt`,
`RouterModelReceipt` or `FaultRecoveryReceipt` struct.
"""
@spec to_map(Receipt.t() | RouterModelReceipt.t() | FaultRecoveryReceipt.t()) :: map()
def to_map(%receipt_module{} = receipt)
    when receipt_module in [Receipt, RouterModelReceipt, FaultRecoveryReceipt] do
  json_safe(receipt)
end
258328

259-
@spec to_json!(Receipt.t() | RouterModelReceipt.t()) :: String.t()
329+
@spec to_json!(Receipt.t() | RouterModelReceipt.t() | FaultRecoveryReceipt.t()) :: String.t()
260330
def to_json!(%Receipt{} = receipt) do
261331
call(@json, :encode!, [to_map(receipt), [pretty: true]])
262332
end
@@ -265,6 +335,10 @@ defmodule StackLab.Examples.GnTenDistributedStack do
265335
call(@json, :encode!, [to_map(receipt), [pretty: true]])
266336
end
267337

338+
def to_json!(%FaultRecoveryReceipt{} = receipt) do
339+
call(@json, :encode!, [to_map(receipt), [pretty: true]])
340+
end
341+
268342
defp envelopes(baseline, node_lab_run, scenario \\ :context) do
269343
node_lab_run
270344
|> Map.fetch!("boot_receipts")
@@ -376,6 +450,12 @@ defmodule StackLab.Examples.GnTenDistributedStack do
376450
"gn-ten-distributed-router-model://#{suffix}"
377451
end
378452

453+
# Derives the partition-recovery receipt ref from the baseline receipt ref by
# swapping the router-model URI scheme for the partition-recovery one. A ref
# without the router-model prefix is kept whole and simply gets the new prefix.
defp fault_recovery_receipt_ref(baseline) do
  suffix =
    String.replace_prefix(baseline.receipt_ref, "gn-ten-distributed-router-model://", "")

  "gn-ten-distributed-partition-recovery://" <> suffix
end
458+
379459
defp default_context_topology_path do
380460
Path.expand("../../../priv/topologies/context_6_node.exs", __DIR__)
381461
end
@@ -483,6 +563,69 @@ defmodule StackLab.Examples.GnTenDistributedStack do
483563
defp maybe_put(map, _key, nil), do: map
484564
defp maybe_put(map, key, value), do: Map.put(map, key, value)
485565

566+
# Runs the seven fault drills, in order, against the baseline's node lab run
# and returns their results as a list in that same order.
defp fault_receipts(baseline) do
  lab_run = baseline.node_lab_run
  runtime_node = "jido_model_runtime_0"
  workflow_node = "mezzanine_workflow_0"

  # Every drill takes the node lab run as its first argument.
  drill = fn name, args -> call(@fault_drill, name, [lab_run | args]) end

  [
    drill.(:crash_node!, [runtime_node]),
    drill.(:disconnect_nodes!, [workflow_node, runtime_node]),
    drill.(:heal_nodes!, [workflow_node, runtime_node]),
    drill.(:delay_facade!, [
      runtime_node,
      "JidoIntegration.RemoteFacade.ModelRuntime",
      5_001
    ]),
    drill.(:inject_stale_dto!, [
      "seam://mezzanine/jido/model-invocation",
      "fixture://stack_lab/partition_recovery/stale-model-invocation"
    ]),
    drill.(:duplicate_submit!, [baseline.model_invocation_ref]),
    drill.(:kill_exporter!, [:aitrace_evidence])
  ]
end
592+
593+
# Folds the individual fault receipts into the overall proof status.
#
# Returns `:pass` only when there is at least one receipt and every receipt
# carries `"status" => "pass"`; returns `:open_defect` otherwise.
#
# The empty list is handled explicitly: `Enum.all?/2` is vacuously true for
# `[]`, which would let a run that executed no drills at all report `:pass`.
defp fault_recovery_status([]), do: :open_defect

defp fault_recovery_status(fault_receipts) do
  if Enum.all?(fault_receipts, &(Map.get(&1, "status") == "pass")),
    do: :pass,
    else: :open_defect
end
598+
599+
# Static citations of recovery evidence, one map per owner, each with the
# string keys "owner", "package", "evidence_ref" and "safe_action".
defp owner_recovery_evidence do
  citations = [
    {"citadel", "surfaces/citadel_domain_surface",
     "citadel_domain_surface_fault_injection_and_operability_test",
     "bounded duplicate, timeout, and dead-letter posture"},
    {"mezzanine", "core/workflow_runtime", "Mezzanine.WorkflowRuntime.TemporalDispatchContract",
     "workflow start outbox and retry visibility contracts"},
    {"stack_lab", "examples/pressure_failover_drill",
     "PressureFailoverDrill duplicate_delivery and transport_interruption",
     "duplicate delivery converges and transport interruption remains pending until replay"},
    {"aitrace", "AITrace", "AITrace.RemoteFacade.Evidence",
     "export unavailable posture remains bounded"}
  ]

  Enum.map(citations, fn {owner, package, evidence_ref, safe_action} ->
    %{
      "owner" => owner,
      "package" => package,
      "evidence_ref" => evidence_ref,
      "safe_action" => safe_action
    }
  end)
end
628+
486629
defp json_safe(%_struct{} = value), do: value |> Map.from_struct() |> json_safe()
487630

488631
defp json_safe(value) when is_map(value) do

examples/gn_ten_distributed_stack/test/stack_lab/examples/gn_ten_distributed_stack_test.exs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,4 +68,23 @@ defmodule StackLab.Examples.GnTenDistributedStackTest do
6868
refute json =~ "raw_prompt"
6969
refute json =~ "provider_payload\":"
7070
end
71+
72+
test "runs the partition recovery fault receipt proof" do
  assert {:ok, receipt} = GnTenDistributedStack.run_partition_recovery()

  assert receipt.status == :pass
  assert receipt.profile == "partition_recovery"

  # Seven drills are expected, and each one must have passed.
  drill_receipts = receipt.fault_receipts
  assert length(drill_receipts) == 7
  assert Enum.all?(drill_receipts, &(&1["status"] == "pass"))

  fault_kinds = Enum.map(drill_receipts, & &1["fault_kind"])

  for expected_kind <- ["node_crash", "stale_dto", "trace_exporter_failure"] do
    assert expected_kind in fault_kinds
  end

  owners = Enum.map(receipt.owner_recovery_evidence, & &1["owner"])

  for expected_owner <- ["mezzanine", "citadel"] do
    assert expected_owner in owners
  end

  # The serialized receipt must not contain these sensitive markers.
  json = GnTenDistributedStack.to_json!(receipt)

  for forbidden <- ["cookie_value", "raw_prompt", "provider_payload\":"] do
    refute json =~ forbidden
  end
end
7190
end

support/gn_ten_node_lab/lib/stack_lab/gn_ten_node_lab.ex

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,15 @@ defmodule StackLab.GnTenNodeLab do
77
evidence semantics remain in their owner repos.
88
"""
99

10-
alias StackLab.GnTenNodeLab.{BootPlan, EnvelopeScanner, Peer, Preflight, Runner, Topology}
10+
alias StackLab.GnTenNodeLab.{
11+
BootPlan,
12+
EnvelopeScanner,
13+
FaultDrill,
14+
Peer,
15+
Preflight,
16+
Runner,
17+
Topology
18+
}
1119

1220
@spec preflight(keyword()) :: {:ok, map()} | {:error, map()}
1321
defdelegate preflight(opts \\ []), to: Preflight, as: :run
@@ -42,6 +50,27 @@ defmodule StackLab.GnTenNodeLab do
4250
@spec scan_envelopes([map()], keyword()) :: map()
4351
defdelegate scan_envelopes(envelopes, opts \\ []), to: EnvelopeScanner, as: :scan_many
4452

53+
@doc """
Delegates to `StackLab.GnTenNodeLab.FaultDrill.crash_node!/2`.

Takes a node lab run map and a node ref; per the spec, returns a map.
"""
@spec crash_node!(map(), String.t()) :: map()
defdelegate crash_node!(run, node_ref), to: FaultDrill

@doc """
Delegates to `StackLab.GnTenNodeLab.FaultDrill.disconnect_nodes!/3`.

Takes a node lab run map and the refs of the two nodes; per the spec, returns a map.
"""
@spec disconnect_nodes!(map(), String.t(), String.t()) :: map()
defdelegate disconnect_nodes!(run, left_ref, right_ref), to: FaultDrill

@doc """
Delegates to `StackLab.GnTenNodeLab.FaultDrill.heal_nodes!/3`.

Takes a node lab run map and the refs of the two nodes; per the spec, returns a map.
"""
@spec heal_nodes!(map(), String.t(), String.t()) :: map()
defdelegate heal_nodes!(run, left_ref, right_ref), to: FaultDrill

@doc """
Delegates to `StackLab.GnTenNodeLab.FaultDrill.delay_facade!/4`.

Takes a node lab run map, a node ref, a facade ref and a non-negative delay
(`delay_ms`); per the spec, returns a map.
"""
@spec delay_facade!(map(), String.t(), String.t(), non_neg_integer()) :: map()
defdelegate delay_facade!(run, node_ref, facade_ref, delay_ms), to: FaultDrill

@doc """
Delegates to `StackLab.GnTenNodeLab.FaultDrill.inject_stale_dto!/3`.

Takes a node lab run map, a seam ref and a fixture ref; per the spec, returns a map.
"""
@spec inject_stale_dto!(map(), String.t(), String.t()) :: map()
defdelegate inject_stale_dto!(run, seam_ref, fixture_ref), to: FaultDrill

@doc """
Delegates to `StackLab.GnTenNodeLab.FaultDrill.duplicate_submit!/2`.

Takes a node lab run map and an accepted ref; per the spec, returns a map.
"""
@spec duplicate_submit!(map(), String.t()) :: map()
defdelegate duplicate_submit!(run, accepted_ref), to: FaultDrill

@doc """
Delegates to `StackLab.GnTenNodeLab.FaultDrill.kill_exporter!/2`.

Takes a node lab run map and an exporter profile atom; per the spec, returns a map.
"""
@spec kill_exporter!(map(), atom()) :: map()
defdelegate kill_exporter!(run, exporter_profile), to: FaultDrill
73+
4574
@spec with_peer((Peer.t() -> term()), keyword()) :: {:ok, term()} | {:error, map()}
4675
defdelegate with_peer(fun, opts \\ []), to: Peer
4776
end

0 commit comments

Comments
 (0)