Skip to content

Commit 113caaf

Browse files
committed
docs: address feedback on multi-agent design proposal
1 parent 6fb5a31 commit 113caaf

1 file changed

Lines changed: 31 additions & 14 deletions

File tree

docs/design/multi-agent-runtime-proposal.md

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -207,16 +207,22 @@ In both policies, coordinator failure always causes full rollback and an error r
207207

208208
#### Dependency endpoint injection
209209

210-
Before a dependent role's sandbox is created, the verified pod IPs of its dependencies are injected as environment variables into the pod template. The naming convention is:
210+
Before a dependent role's sandbox is created, the verified pod IPs of its dependencies are injected as environment variables into the pod template. To ensure compatibility with standard shell naming conventions, any hyphens or non-alphanumeric characters in the role name are replaced by underscores.
211+
212+
The naming convention is:
211213

212214
```
213-
AGENTCUBE_DEP_{ROLE_NAME_UPPER}_ENDPOINT = {podIP}:{port}
215+
AGENTCUBE_DEP_{ROLE_NAME_SANITISED_UPPER}_ENDPOINT = {podIP}:{port}
214216
```
215217

216-
For a role with `dependencies: [planner]`, the pod receives:
218+
**Port Resolution Rule:**
219+
* If the dependency's `AgentRuntime` CRD defines a single port, that port is used.
220+
* If it defines multiple ports, the system first looks for a port named `http` or `default`. If no such port is found, it falls back to the first port in the ports list.
221+
222+
For a role with `dependencies: [my-planner]` (where the planner exposes `8080` as the first port), the dependent pod receives:
217223

218224
```
219-
AGENTCUBE_DEP_PLANNER_ENDPOINT = 10.0.0.4:8080
225+
AGENTCUBE_DEP_MY_PLANNER_ENDPOINT = 10.0.0.4:8080
220226
```
221227

222228
Injection happens in-memory inside `createSandboxGroup()` by mutating the pod template before it is passed to `buildSandboxByAgentRuntime()`. The referenced `AgentRuntime` CRD object in the informer cache is never written.
@@ -290,11 +296,12 @@ func (s *Server) createSandboxGroup(
290296
injectDependencyEndpoints(&sandbox.Spec.PodTemplate, role.Dependencies, created)
291297
}
292298
293-
resultChan := s.sandboxController.WatchSandboxOnce(ctx, sandbox.Namespace, sandbox.Name)
294-
defer s.sandboxController.UnWatchSandbox(sandbox.Namespace, sandbox.Name)
295-
296-
// createSandbox is called as-is (no changes to the function).
297-
resp, err := s.createSandbox(ctx, dynamicClient, sandbox, nil, sandboxEntry, resultChan)
299+
// Watch and create sandbox in a closure to prevent watcher resource accumulation from defer in a loop
300+
resp, err := func() (*types.CreateAgentResponse, error) {
301+
resultChan := s.sandboxController.WatchSandboxOnce(ctx, sandbox.Namespace, sandbox.Name)
302+
defer s.sandboxController.UnWatchSandbox(sandbox.Namespace, sandbox.Name)
303+
return s.createSandbox(ctx, dynamicClient, sandbox, nil, sandboxEntry, resultChan)
304+
}()
298305
if err != nil {
299306
if mar.Spec.StartupPolicy == StartupPolicyBestEffort && !role.IsCoordinator {
300307
klog.Warningf("group %s: role %s failed (BestEffort policy): %v", groupSessionID, role.Name, err)
@@ -312,7 +319,7 @@ func (s *Server) createSandboxGroup(
312319
})
313320
}
314321
315-
manifest := buildGroupManifest(groupSessionID, created)
322+
manifest := buildGroupManifest(groupSessionID, mar.Spec.Roles, created)
316323
if err := s.storeClient.SaveAgentGroup(ctx, groupSessionID, manifest); err != nil {
317324
return nil, fmt.Errorf("save group manifest: %w", err)
318325
}
@@ -408,6 +415,14 @@ func topoSort(roles []RoleSpec) ([]RoleSpec, error) {
408415
}
409416
410417
if len(sorted) != len(roles) {
418+
// Check for missing dependencies first to provide a better error message.
419+
for _, r := range roles {
420+
for _, dep := range r.Dependencies {
421+
if _, exists := roleMap[dep]; !exists {
422+
return nil, fmt.Errorf("role %s depends on non-existent role %s", r.Name, dep)
423+
}
424+
}
425+
}
411426
// Identify and name the roles involved in the cycle for the error message.
412427
var cycled []string
413428
for name, deg := range inDegree {
@@ -497,14 +512,16 @@ GetAgentGroup(ctx context.Context, groupSessionID string) (*types.AgentGroupMani
497512
// DeleteAgentGroup removes a group manifest by groupSessionID.
498513
DeleteAgentGroup(ctx context.Context, groupSessionID string) error
499514

500-
// UpdateAgentGroupRoleStatus atomically updates the status of a specific role
515+
// UpdateAgentGroupRoleStatus atomically updates the status and endpoint of a specific role
501516
// within a group manifest. Used by the reconciler during self-healing.
502-
UpdateAgentGroupRoleStatus(ctx context.Context, groupSessionID, roleName, status string) error
517+
UpdateAgentGroupRoleStatus(ctx context.Context, groupSessionID, roleName, status, endpoint string) error
503518
```
504519

505-
Both `store_redis.go` and `store_valkey.go` implement these methods using the key prefix `agentgroup:`. The serialization format is JSON, consistent with existing `sandbox:` entries.
520+
Both `store_redis.go` and `store_valkey.go` implement these methods using the key prefix `agentgroup:`.
506521

507-
> **Note:** `UpdateAgentGroupRoleStatus` performs a read-modify-write on the manifest JSON. Under high concurrency this could race, but group manifests are updated infrequently (only during self-healing) and only by the reconciler, so optimistic concurrency control is not required in the initial implementation.
522+
> **Store Implementation Note:** To prevent race conditions in a distributed environment during concurrent updates, the store backends (Redis/Valkey) implement group manifests using a **Redis Hash** (`HSET agentgroup:{groupSessionID}`) instead of a raw JSON string.
523+
>
524+
> The hash fields map directly to roles and their metadata (e.g., `HSET agentgroup:{groupSessionID} role:{roleName} <json>`), allowing atomic field-level updates without rewriting the full manifest JSON, avoiding read-modify-write races.
508525
509526
### CRD Types
510527

0 commit comments

Comments
 (0)