Kakeya-LLM-Inference-engine/proto/kakeya/v1/distributed.proto at main · FluffyAIcode/Kakeya-LLM-Inference-engine · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
// Copyright 2026 Kakeya contributors.
//
// Kakeya multi-host plane — capability exchange + remote block
// proposal (ADR 0009, v0.5-M1 milestone).
//
// This file is the wire contract for the agent capability exchange
// platform between Kakeya nodes (Mac minis and heterogeneous CPU
// hosts) and for the first exchanged capability: serving dLM-proposer
// draft blocks to a remote AR verifier's speculative-decoding loop.
//
// Design properties (see ADR 0009 §4 and
// docs/design/agent-capability-exchange-platform.md):
//
//   * Gossip, not registry: ExchangeCapabilities is a symmetric
//     push-pull merge; there is no coordinator node.
//   * Last-writer-wins per node_id on announced_at_unix; TTL expiry.
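//   * Token-ids-only proposal payloads (ProposerService): a wrong or
//     stale remote draft can only lower acceptance rate, never corrupt
//     verifier output, because the verifier-side greedy accept rule is
//     unchanged. DFlashProposerService, defined later in this file,
//     also carries tensors and documents the same containment.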
//   * mlx.distributed is an OPTIONAL bulk-tensor data plane advertised
//     via NodeCapability.ring_address; this gRPC plane is the control
//     plane and works on fleets with no MLX at all.
//
// Stability: Unstable until v0.5 GA (same policy runtime.proto had
// before v0.3 GA). Breaking edits require an ADR-update commit.

syntax = "proto3";

package kakeya.v1;

// -----------------------------------------------------------------------------
// Services
// -----------------------------------------------------------------------------

// CapabilityService is served by every Kakeya node that participates
// in a fleet. Both RPCs are safe to call from any peer at any time;
// neither mutates anything beyond the callee's capability registry.
service CapabilityService {
  // Gossip primitive: a symmetric push-pull merge.
  //
  //   Caller -> callee: every capability card the caller currently
  //     holds, the caller's own card included.
  //   Callee: merges the received cards into its registry
  //     (last-writer-wins on announced_at_unix; its own card is never
  //     overwritten) and replies with the merged view.
  //   Caller: merges the response.
  //
  // One round leaves both sides holding the union of their prior
  // views.
  rpc ExchangeCapabilities(ExchangeCapabilitiesRequest)
      returns (ExchangeCapabilitiesResponse);

  // Returns the callee's own card and nothing else. Meant as a cheap
  // probe for liveness checks and diagnostics; the callee's registry
  // is not mutated.
  rpc GetNodeCapability(GetNodeCapabilityRequest)
      returns (GetNodeCapabilityResponse);
}

// ProposerService serves speculative-decoding draft blocks. The
// contract is DLMProposer.propose_block (ADR 0001) lifted onto the
// wire: committed prefix in, exactly block_size draft token ids out.
service ProposerService {
  // Drafts exactly block_size tokens conditioned on the committed
  // prefix.
  //
  // Errors:
  //   NOT_FOUND        - model_id is unknown.
  //   INVALID_ARGUMENT - malformed arguments: empty prefix, or
  //                      non-positive block_size / num_steps.
  rpc ProposeBlock(ProposeBlockRequest)
      returns (ProposeBlockResponse);
}

// -----------------------------------------------------------------------------
// Capability messages
// -----------------------------------------------------------------------------

// CapabilityRole classifies what a node can do with a given model
// for another node's agent workload.
enum CapabilityRole {
  // Proto3 default (zero) value: what a reader sees when no role was
  // set on the wire.
  CAPABILITY_ROLE_UNSPECIFIED = 0;

  // The node can run this model as an AR verifier (sink+window KV).
  CAPABILITY_ROLE_VERIFIER = 1;

  // The node can serve ProposeBlock with this model (dLM, DFlash,
  // or the model-free n-gram proposer advertised as model_id
  // "ngram").
  CAPABILITY_ROLE_PROPOSER = 2;

  // Reserved for future exchanged capabilities (ADR 0009 §4). This
  // note covers both EMBEDDER and TOOL; no RPC in this file serves
  // either role.
  CAPABILITY_ROLE_EMBEDDER = 3;
  CAPABILITY_ROLE_TOOL = 4;
}

// ModelCapability is one (model, role) a node offers.
message ModelCapability {
  // HuggingFace-style id ("Qwen/Qwen3-0.6B",
  // "mlx-community/Qwen3-1.7B-4bit") or the reserved id "ngram" for
  // the model-free prompt-lookup proposer.
  string model_id = 1;

  // What the node can do with model_id for a peer; see
  // CapabilityRole. NodeCapability.models holds one entry per
  // (model, role), so a model offered in two roles appears twice.
  CapabilityRole role = 2;

  // Quantization label, informational: "bf16", "4bit-mlx", "none".
  string quantization = 3;

  // Measured throughput hint in tokens/second for this (model, role)
  // on this node. 0 = unmeasured. Placement uses this as the primary
  // score; nodes should publish a real measurement when they have
  // one.
  double tokens_per_second = 4;
}

// NodeCapability is one node's capability card. See
// docs/design/agent-capability-exchange-platform.md §1.
message NodeCapability {
  // Stable fleet-unique identity, operator-assigned ("mini-attic")
  // or generated. Two nodes sharing a node_id is an operator error;
  // last-writer-wins applies regardless. This is the merge key for
  // ExchangeCapabilities.
  string node_id = 1;

  // host:port where this node's gRPC services (RuntimeService,
  // CapabilityService, ProposerService) are reachable by peers.
  string grpc_address = 2;

  // Informational platform label: "mac-m4-24gb", "linux-x86", ...
  string platform = 3;

  // Unified-memory capacity in bytes; placement tiebreak signal.
  uint64 unified_memory_bytes = 4;

  // Installed mlx version; empty string when MLX is unavailable on
  // this node (e.g. Linux x86 fleet members).
  string mlx_version = 5;

  // Models this node offers, one entry per (model, role).
  repeated ModelCapability models = 6;

  // Wall-clock freshness stamp (seconds since the Unix epoch) set by
  // the owning node each time it re-announces. Merge keeps the card
  // with the larger value per node_id. Fleet clocks are assumed
  // NTP-synced; ttl_seconds must be much larger than worst-case
  // clock skew (design doc §2).
  double announced_at_unix = 7;

  // Card lifetime in seconds. A card is expired (dropped on merge
  // and snapshot) once now > announced_at_unix + ttl_seconds.
  double ttl_seconds = 8;

  // Optional mlx.distributed ring endpoint for bulk-tensor flows
  // (ADR 0009 §4 item 4). Empty when the node is not part of a ring.
  string ring_address = 9;
}

// Request for CapabilityService.ExchangeCapabilities: the caller's
// half of the push-pull gossip round.
message ExchangeCapabilitiesRequest {
  // Every card the caller currently holds, its own included.
  repeated NodeCapability known_nodes = 1;
}

// Response for CapabilityService.ExchangeCapabilities: the callee's
// half of the push-pull gossip round. The caller merges it.
message ExchangeCapabilitiesResponse {
  // The callee's post-merge view of the fleet, its own card included.
  repeated NodeCapability known_nodes = 1;
}

// Request for CapabilityService.GetNodeCapability. Carries no fields.
message GetNodeCapabilityRequest {}

// Response for CapabilityService.GetNodeCapability.
message GetNodeCapabilityResponse {
  // The callee's own card only.
  NodeCapability node = 1;
}

// -----------------------------------------------------------------------------
// Remote proposal messages
// -----------------------------------------------------------------------------

// Request for ProposerService.ProposeBlock.
message ProposeBlockRequest {
  // The committed prefix (prompt + accepted tokens), raw token ids in
  // the verifier tokenizer's vocabulary. Same opacity contract as
  // runtime.proto: the server never templates or interprets them.
  // An empty prefix is rejected with INVALID_ARGUMENT.
  repeated uint32 committed_token_ids = 1;

  // Number of draft tokens to return. The response carries exactly
  // this many token_ids or the RPC fails; partial blocks are a
  // malformed-proposer error, mirroring SpeculativeDecoder's refusal
  // to continue on short blocks.
  uint32 block_size = 2;

  // Diffusion denoising steps for dLM proposers. Proposers that do
  // not iterate (DFlash single-pass, n-gram lookup) accept and
  // ignore it, matching the in-process propose_block contract.
  // NOTE(review): ProposeBlock lists non-positive num_steps as
  // INVALID_ARGUMENT; whether that check also applies to proposers
  // that ignore num_steps is not stated in this file — confirm.
  uint32 num_steps = 3;

  // Which advertised proposer model to use. Empty string selects the
  // serving node's default proposer.
  string model_id = 4;
}

// Response for ProposerService.ProposeBlock.
message ProposeBlockResponse {
  // Exactly block_size draft token ids.
  repeated uint32 token_ids = 1;

  // Accounting mirrored from BlockProposal so the caller's
  // ProposerStats stay meaningful across the wire.
  // NOTE(review): the exact meaning of each counter is defined by
  // BlockProposal, which is not part of this file — confirm there.
  uint32 diffusion_steps = 2;
  uint32 forward_passes = 3;
  uint64 peak_activation_bytes = 4;
}

// -----------------------------------------------------------------------------
// F3 bulk-tensor data plane: remote DFlash + f_θ proposer (ADR 0009 §4 item F3).
// -----------------------------------------------------------------------------
//
// The production Kakeya config splits the engine across hosts: the gemma-4
// verifier (sink+window restored KV) runs on host A (e.g. a Mac mini, MLX),
// while the EAGLE-style DFlash drafter + the f_θ K/V projection run on host B
// (e.g. a GPU). DFlashProposerService is the stateful contract for that split.
//
// Unlike ProposerService (token-ids-only), DFlash is EAGLE-style: it needs the
// verifier's aux-layer hidden states to build its context K/V, and f_θ projects
// the drafter's K/V into the verifier's K/V space for sink+window restoration.
// Those are real tensors, carried via the framework-neutral Tensor message
// (see inference_engine/distributed/tensor_codec).
//
// Correctness containment is UNCHANGED: every emitted token is decided by the
// verifier's local greedy verify; a wrong/stale remote draft can only lower the
// acceptance rate, never change the output. A session ties host B's incremental
// drafter context K/V to host A's verifier session for its lifetime.

// Tensor is a framework-neutral dense tensor: a numpy-style dtype string, an
// int64 shape, and the little-endian raw buffer (numpy.tobytes). bfloat16 has
// no numpy scalar, so it travels as the logical dtype "bfloat16" over a uint16
// bit buffer and is rebuilt by the torch/mlx bridge at the endpoint.
message Tensor {
  // Logical element type as a numpy-style dtype string. One of:
  // float32, float16, bfloat16, int32, int64, uint32, bool.
  string dtype = 1;

  // Tensor shape, as int64 entries.
  repeated int64 shape = 2;

  // Raw little-endian element buffer (numpy.tobytes). For the logical
  // dtype "bfloat16" this is a uint16 bit buffer.
  bytes data = 3;
}

// LayerKV is one verifier layer's restored K and V banks.
message LayerKV {
  // Index of the verifier layer this K/V pair belongs to.
  int32 layer = 1;

  // Restored K bank for that layer.
  Tensor k = 2;

  // Restored V bank for that layer.
  Tensor v = 3;
}

// DFlashProposerService: stateful remote DFlash drafter + f_θ restoration.
// Per turn: Restore (prompt -> f_θ-projected verifier K/V) then SeedContext
// (verifier aux hidden -> drafter context K/V). Per decode block: DraftBlock
// (bonus + context_len -> draft tokens) then ExtendContext (committed aux ->
// grow drafter context). CloseSession frees host-B state.
service DFlashProposerService {
  // Opens the session and produces the f_θ-projected verifier K/V banks for
  // the prompt. Host B:
  //   1. embeds prompt_ids with the verifier embedding,
  //   2. runs the DFlash drafter,
  //   3. maps the drafter K/V through f_θ into verifier K/V space.
  // With s5_exact_full_attn the full-attention layers are left out (the
  // verifier's native cache owns them) and only sliding-layer banks come back.
  rpc Restore(RestoreRequest)
      returns (RestoreResponse);

  // Builds host B's drafter context K/V from the verifier's aux hidden states
  // over the prompt. Host A computes those states during its own prefill.
  rpc SeedContext(SeedContextRequest)
      returns (SeedContextResponse);

  // Returns exactly block_size draft tokens for the upcoming block. The draft
  // is conditioned on the verifier's bonus token and on the session's context
  // K/V.
  rpc DraftBlock(DraftBlockRequest)
      returns (DraftBlockResponse);

  // Grows host B's drafter context K/V by the aux hidden of the tokens that
  // were just committed. Cost is O(block_size), not O(prefix).
  rpc ExtendContext(ExtendContextRequest)
      returns (ExtendContextResponse);

  // Frees host B's per-session state. Idempotent.
  rpc CloseSession(DFlashProposerServiceCloseSessionRequest)
      returns (DFlashProposerServiceCloseSessionResponse);
}

// Request for DFlashProposerService.Restore, the RPC that opens a session.
message RestoreRequest {
  // Session this call opens. The SeedContext, DraftBlock, ExtendContext and
  // CloseSession requests each carry a session_id as well.
  // NOTE(review): who mints the id and its uniqueness rules are not stated
  // in this file — confirm.
  string session_id = 1;
  // Prompt token ids. Host B embeds these with the verifier embedding before
  // running the DFlash drafter.
  repeated uint32 prompt_ids = 2;
  // Sink and window parameters of the verifier's sink+window KV.
  // NOTE(review): presumably both are token counts — confirm.
  uint32 sink = 3;
  uint32 window = 4;
  // When true, full-attention (exact) layers are omitted from the response;
  // the verifier's native cache holds them (the S5 free lunch on gemma-4).
  bool s5_exact_full_attn = 5;
  // NOTE(review): undocumented in the original; presumably selects the model
  // on host B the way ProposeBlockRequest.model_id does — confirm.
  string model_id = 6;
}

// Response for DFlashProposerService.Restore.
message RestoreResponse {
  // f_θ-projected verifier K/V for the layers the verifier must restore.
  // When the request set s5_exact_full_attn, only sliding-layer banks appear.
  repeated LayerKV restored = 1;
  // Middle positions evicted from the sink+window (outside [sink, T-window)).
  // NOTE(review): "middle" positions would normally be the ones INSIDE the
  // half-open range [sink, T-window), i.e. neither sink nor window, so the
  // "outside" wording looks inverted — confirm against the server.
  repeated int32 evicted_positions = 2;
  // NOTE(review): undocumented; presumably the prompt length T in tokens —
  // confirm.
  uint32 prompt_len = 3;
}

// Request for DFlashProposerService.SeedContext.
message SeedContextRequest {
  // Session whose drafter context K/V is being built (opened by Restore).
  string session_id = 1;
  // num_aux tensors, each [1, T, hidden]: verifier aux-layer hidden over prompt.
  repeated Tensor aux = 2;
  // NOTE(review): undocumented; presumably the T sequence positions that the
  // aux rows correspond to — confirm.
  repeated int32 positions = 3;
}

// Response for DFlashProposerService.SeedContext.
message SeedContextResponse {
  // NOTE(review): undocumented; presumably the length of host B's drafter
  // context after seeding, matching DraftBlockRequest.context_len — confirm.
  uint32 context_len = 1;
}

// Request for DFlashProposerService.DraftBlock.
message DraftBlockRequest {
  // Session whose context K/V the draft is conditioned on.
  string session_id = 1;
  // The verifier's bonus token for the upcoming block.
  uint32 bonus_token_id = 2;
  // NOTE(review): undocumented; presumably the caller's view of the drafter
  // context length (cf. SeedContextResponse / ExtendContextResponse) —
  // confirm.
  uint32 context_len = 3;
  // Number of draft tokens to return (L-1 in the fused loop; the bonus is the
  // verifier's guaranteed-correct first token, handled caller-side).
  uint32 block_size = 4;
}

// Response for DFlashProposerService.DraftBlock.
message DraftBlockResponse {
  // Draft token ids; exactly block_size of them.
  repeated uint32 draft_token_ids = 1;

  // NOTE(review): the two counters below are undocumented here. They share
  // names with the accounting fields of ProposeBlockResponse and presumably
  // mean the same — confirm.
  uint32 forward_passes = 2;
  uint64 peak_activation_bytes = 3;
}

// Request for DFlashProposerService.ExtendContext.
message ExtendContextRequest {
  // Session whose drafter context K/V is being extended.
  string session_id = 1;
  // num_aux tensors, each [1, k, hidden] for the k newly committed positions.
  repeated Tensor aux = 2;
  // NOTE(review): undocumented; presumably the k sequence positions that the
  // aux rows correspond to — confirm.
  repeated int32 positions = 3;
}

// Response for DFlashProposerService.ExtendContext.
message ExtendContextResponse {
  // NOTE(review): undocumented; presumably the length of host B's drafter
  // context after the append — confirm.
  uint32 context_len = 1;
}

// Request for DFlashProposerService.CloseSession.
message DFlashProposerServiceCloseSessionRequest {
  // Session whose host-B state should be released. CloseSession is
  // documented as idempotent.
  string session_id = 1;
}

// Response for DFlashProposerService.CloseSession. Carries no fields.
message DFlashProposerServiceCloseSessionResponse {}