Skip to content

Commit e67c8b2

Browse files
committed
feat(picod): implement secure initialization improvements and persist bootstrap keys
1 parent a240b5d commit e67c8b2

37 files changed

Lines changed: 2020 additions & 746 deletions

cmd/picod/main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,13 @@ func main() {
4141
Workspace: *workspace,
4242
}
4343

44-
// Create server
45-
server := picod.NewServer(config)
46-
4744
// Setup signal handling with context cancellation
4845
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
4946
defer cancel()
5047

48+
// Create server
49+
server := picod.NewServer(ctx, config)
50+
5151
// Start PicoD server in goroutine
5252
errCh := make(chan error, 1)
5353
go func() {

cmd/workload-manager/main.go

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -97,21 +97,6 @@ func main() {
9797
os.Exit(1)
9898
}
9999

100-
sandboxReconciler := &workloadmanager.SandboxReconciler{
101-
Client: mgr.GetClient(),
102-
Scheme: mgr.GetScheme(),
103-
}
104-
105-
codeInterpreterReconciler := &workloadmanager.CodeInterpreterReconciler{
106-
Client: mgr.GetClient(),
107-
Scheme: mgr.GetScheme(),
108-
}
109-
110-
if err := setupControllers(mgr, sandboxReconciler, codeInterpreterReconciler); err != nil {
111-
fmt.Fprintf(os.Stderr, "unable to setup controllers: %v\n", err)
112-
os.Exit(1)
113-
}
114-
115100
// Create API server configuration
116101
config := &workloadmanager.Config{
117102
Port: *port,
@@ -123,12 +108,28 @@ func main() {
123108
MTLSConfig: tlsConfig,
124109
}
125110

111+
sandboxReconciler := &workloadmanager.SandboxReconciler{
112+
Client: mgr.GetClient(),
113+
Scheme: mgr.GetScheme(),
114+
}
115+
126116
// Create and initialize API server
127117
server, err := workloadmanager.NewServer(config, sandboxReconciler)
128118
if err != nil {
129119
klog.Fatalf("Failed to create API server: %v", err)
130120
}
131121

122+
codeInterpreterReconciler := &workloadmanager.CodeInterpreterReconciler{
123+
Client: mgr.GetClient(),
124+
Scheme: mgr.GetScheme(),
125+
BootstrapPublicKeyFunc: server.GetBootstrapPublicKeyPEM,
126+
}
127+
128+
if err := setupControllers(mgr, sandboxReconciler, codeInterpreterReconciler); err != nil {
129+
fmt.Fprintf(os.Stderr, "unable to setup controllers: %v\n", err)
130+
os.Exit(1)
131+
}
132+
132133
// Setup signal handling
133134
ctx, cancel := context.WithCancel(context.Background())
134135
defer cancel()

docs/agentcube/blog/release-v0.1.0/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ Key Capabilities:
101101

102102
### JWT Security Chain (Router → PicoD)
103103

104-
Sandbox pods are ephemeral and may be replaced at any time; embedding a shared secret in cluster config is fragile and hard to rotate. AgentCube establishes an RSA-based trust chain: the Router generates an RSA-2048 key pair at startup, stores the public key in a Kubernetes Secret (`picod-router-identity`), and the Workload Manager injects it as `PICOD_AUTH_PUBLIC_KEY` for `CodeInterpreter` sandboxes when authentication is enabled (the default is `picod`; `none` disables injection). The Router signs short-lived (5-minute) RS256 JWTs for every proxied request. PicoD verifies these tokens entirely in-process — no network round-trip, no shared database.
104+
Sandbox pods are ephemeral and may be replaced at any time; embedding a shared secret in cluster config is fragile and hard to rotate. AgentCube establishes a two-stage trust chain: the Router generates an RSA-2048 bootstrap key pair at startup, stores the public key in a Kubernetes Secret (`picod-router-identity`), and the Workload Manager injects it as `PICOD_BOOTSTRAP_PUBLIC_KEY` for `CodeInterpreter` sandboxes when authentication is enabled (the default is `picod`; `none` disables injection). A dynamic ECDSA (P-256) session key is then generated for each sandbox instance via an `/init` handshake. The Router signs short-lived (5-minute) ES256 JWTs for every proxied request. PicoD verifies these tokens entirely in-process — no network round-trip, no shared database.
105105

106106
Key Capabilities:
107107

docs/design/PicoD-Plain-Authentication-Design.md

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@ However, emerging use cases require a more flexible architecture where the clien
1212

1313
The existing self-signed key-pair model is incompatible with this centralized management flow, as it bypasses the Router's ability to mediate access. To address this, we propose a new **Plain Authentication** mechanism for `picod`. This design enables the Router/Gateway to manage credentials and connection security, simplifying the client-side workflow while maintaining robust access control.
1414

15+
> [!WARNING]
16+
> **Migration Note (Two-Stage Secure Initialization)**
17+
>
18+
> The flow described in this document was updated by PR #352 to address cross-sandbox token replay vulnerabilities. The original `PICOD_AUTH_PUBLIC_KEY` environment variable has been renamed to `PICOD_BOOTSTRAP_PUBLIC_KEY`.
19+
>
20+
> While `PICOD_AUTH_PUBLIC_KEY` is still supported as a fallback for backwards compatibility, deployments should migrate to `PICOD_BOOTSTRAP_PUBLIC_KEY`. Under the new model, this key is only used to verify the bootstrap payload during the `/init` handshake, which establishes a unique session keypair for subsequent requests.
21+
1522
## Use Cases
1623

1724
### Gateway-Managed Sandbox Access
@@ -28,7 +35,7 @@ To ensure **High Availability (HA)** across multiple Router replicas and enfor
2835

2936
- All Router replicas share a single cryptographic identity to function as a unified Token Issuer.
3037
- **Private Key Storage**: Stored in a Kubernetes Secret (picod-router-identity). The Private Key is accessible only by the Router component.
31-
- **Public Key Distribution**: Published to a Kubernetes ConfigMap (picod-router-public-key). This is accessible by the WorkloadManager and PicoD instances.
38+
- **Public Key Distribution**: Published to a Kubernetes ConfigMap (agentcube-bootstrap-identity). This is accessible by the WorkloadManager and PicoD instances.
3239

3340
2. **Decoupled Provisioning (WorkloadManager)**:
3441

@@ -52,13 +59,13 @@ Upon startup, every Router replica executes an **Atomic Initialization Routine*
5259
- If Missing: The Router generates a new RSA/ECDSA key pair in memory and attempts to **CREATE** the Secret.
5360
- Concurrency Handling: If the creation fails with 409 Conflict (implying another replica initialized it simultaneously), the Router discards its generated key and fetches the existing Secret created by the peer.
5461

55-
2. **Public Key Publication**: Once the Private Key is successfully loaded, the Router reconciles the picod-router-public-key ConfigMap. It ensures the Public Key in the ConfigMap matches the Private Key in memory.
62+
2. **Public Key Publication**: Once the Private Key is successfully loaded, the Router reconciles the agentcube-bootstrap-identity ConfigMap. It ensures the Public Key in the ConfigMap matches the Private Key in memory.
5663

5764
#### 2. Provisioning Phase
5865

5966
- The **Router** sends a sandbox allocation request to the **WorkloadManager**. Crucially, this request **does not** contain key data.
60-
- The **WorkloadManager** constructs the Pod specification. It defines an environment variable `PICOD_AUTH_PUBLIC_KEY` that sources its value from the `picod-router-public-key` ConfigMap (using `valueFrom: configMapKeyRef`).
61-
- **PicoD** starts, reads the key from the **environment**, and initializes its JWT verifier.
67+
- The **WorkloadManager** constructs the Pod specification. It defines an environment variable `PICOD_BOOTSTRAP_PUBLIC_KEY` that sources its value from the `agentcube-bootstrap-identity` ConfigMap. It also injects `PICOD_SESSION_ID` for defense-in-depth token validation.
68+
- **PicoD** starts, reads the key from the **environment**, and waits for the `/init` handshake, which establishes the per-session ECDSA keypair.
6269

6370
#### 3. Runtime Access Phase
6471

@@ -98,14 +105,14 @@ sequenceDiagram
98105
end
99106
100107
Router->>Router: Load Private Key into Memory
101-
Router->>K8s: APPLY ConfigMap "picod-router-public-key"
108+
Router->>K8s: APPLY ConfigMap "agentcube-bootstrap-identity"
102109
Note right of Router: Publishes Public Key for consumption
103110
104111
Note over Router, PicoD: 2. Provisioning
105112
Router->>WM: Request Sandbox (No Key Payload)
106-
WM->>K8s: Create Pod (valueFrom: picod-router-public-key)
107-
K8s-->>PicoD: Start Container (Env: PICOD_AUTH_PUBLIC_KEY)
108-
PicoD->>PicoD: Load Key from Env
113+
WM->>K8s: Create Pod (valueFrom: agentcube-bootstrap-identity)
114+
K8s-->>PicoD: Start Container (Env: PICOD_BOOTSTRAP_PUBLIC_KEY, PICOD_SESSION_ID)
115+
PicoD->>PicoD: Load Key from Env and await /init
109116
110117
Note over SDK, PicoD: 3. Runtime Access
111118
SDK->>Router: Request (No Auth)
@@ -142,14 +149,14 @@ data:
142149
143150
**B. Identity ConfigMap (Public)**
144151
145-
- **Name**: picod-router-public-key
152+
- **Name**: agentcube-bootstrap-identity
146153
- **Purpose**: Stores the public key mounted into PicoD instances.
147154
148155
```yaml
149156
apiVersion: v1
150157
kind: ConfigMap
151158
metadata:
152-
name: picod-router-public-key
159+
name: agentcube-bootstrap-identity
153160
namespace: agentcube-system
154161
data:
155162
# Plain text Public Key (PEM format)
@@ -170,18 +177,20 @@ spec:
170177
containers:
171178
- name: picod
172179
env:
173-
- name: PICOD_AUTH_PUBLIC_KEY
180+
- name: PICOD_BOOTSTRAP_PUBLIC_KEY
174181
valueFrom:
175182
configMapKeyRef:
176-
name: picod-router-public-key
183+
name: agentcube-bootstrap-identity
177184
key: public.pem
185+
- name: PICOD_SESSION_ID
186+
value: "<dynamic-uuid-per-sandbox>"
178187
```
179188
180189
### 3. PicoD Configuration
181190
The existing CLI flags for authentication are deprecated.
182191
183-
* Environment Variable: picod requires `PICOD_AUTH_PUBLIC_KEY` to be set.
184-
* Behavior: If the environment variable is present, `picod` initializes the Plain Auth provider. If missing, it fails to start (or falls back to legacy mode if we decide to keep it for a transition period).
192+
* Environment Variable: picod requires `PICOD_BOOTSTRAP_PUBLIC_KEY` to be set. `PICOD_SESSION_ID` is also strongly recommended to prevent cross-sandbox token replays.
193+
* Behavior: If `PICOD_BOOTSTRAP_PUBLIC_KEY` (or the legacy `PICOD_AUTH_PUBLIC_KEY` fallback) is present, `picod` initializes the Plain Auth provider. If neither is set, it fails to start.
185194

186195
### 4. JWT Token Spec
187196

@@ -198,7 +207,7 @@ The Router signs tokens using the standard JWT (RFC 7519) format.
198207
"iss": "agentcube-router", // Issuer: Fixed identifier for the Router
199208
"iat": 1716239000, // Issued At: Unix timestamp
200209
"exp": 1716242600, // Expiration: e.g., +1 hour
201-
"sub": "client-session-id", // Subject: Identifies the client/session
210+
"sub": "<dynamic-uuid-per-sandbox>", // Subject: Identifies the client/session
202211
"aud": "picod-service" // Audience: Intended recipient
203212
}
204213
```

docs/design/auth-proposal.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Author: Mahil Patel
1414
AgentCube currently has partial, ad-hoc authentication between its internal components but lacks a unified security model. The existing mechanisms are:
1515

1616
1. **Workload Manager Auth** (`pkg/workloadmanager/auth.go`): Optional Kubernetes TokenReview-based ServiceAccount token validation, gated behind `config.EnableAuth`, plus per-sandbox ownership checks using the extracted user identity (effectively relying on Kubernetes RBAC when using the user-scoped client).
17-
2. **Router → PicoD Auth** (`PicoD-Plain-Authentication-Design`): A custom RSA key-pair scheme where the Router signs JWTs and PicoD verifies them using a public key exposed via the `PICOD_AUTH_PUBLIC_KEY` environment variable. The key pair (`private.pem`, `public.pem`) is stored in the `picod-router-identity` Secret, and the WorkloadManager reads this Secret to inject the public key into PicoD pods. This works for the Router→PicoD channel but leaves other internal channels unauthenticated.
17+
2. **Router → PicoD Auth** (`PicoD-Plain-Authentication-Design`): A custom RSA/ECDSA key-pair scheme where the Router signs JWTs and PicoD verifies them using a public key exposed via the `PICOD_BOOTSTRAP_PUBLIC_KEY` environment variable (formerly `PICOD_AUTH_PUBLIC_KEY`). The key pair (`private.pem`, `public.pem`) is stored in the `agentcube-bootstrap-identity` Secret, and the WorkloadManager reads this Secret to inject the public key into PicoD pods. This works for the Router→PicoD channel but leaves other internal channels unauthenticated.
1818
3. **Router → WorkloadManager**: Optional, one-sided authentication. `pkg/router/session_manager.go` can attach an `Authorization: Bearer <serviceaccount token>` header, and WorkloadManager can validate it when `--enable-auth` is enabled. This is not mutual workload identity or a zero-trust model, and when auth is disabled any pod on the cluster network can call the WorkloadManager API.
1919
4. **External Clients → Router**: No authentication. The `handleInvoke` handler in `pkg/router/handlers.go` processes incoming requests without verifying the caller's identity.
2020

pkg/common/types/sandbox.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,16 @@ type SandboxInfo struct {
3838
// metav1.Duration marshals as a human-readable string (e.g. "15m0s") rather than
3939
// a raw nanosecond integer, making the persisted JSON unambiguous.
4040
IdleTimeout metav1.Duration `json:"idleTimeout,omitempty"`
41+
// SessionPrivateKey is intentionally excluded from JSON serialization so it
42+
// is not embedded in the SandboxInfo JSON payload in the KV store.
43+
// It is persisted separately via StoreSessionPrivateKey (keyed by session ID)
44+
// and populated transiently in the WM→Router HTTP response path.
45+
SessionPrivateKey string `json:"-"`
4146
// LastActivityAt is populated transiently from the store's last-activity sorted set
4247
// during ListInactiveSandboxes. It is intentionally excluded from JSON serialization.
4348
LastActivityAt time.Time `json:"-"`
4449
Status string `json:"status"`
50+
AuthMode string `json:"authMode,omitempty"`
4551
}
4652

4753
type SandboxEntryPoint struct {
@@ -62,6 +68,7 @@ type CreateSandboxResponse struct {
6268
SandboxID string `json:"sandboxId"`
6369
SandboxName string `json:"sandboxName"`
6470
EntryPoints []SandboxEntryPoint `json:"entryPoints"`
71+
AuthMode string `json:"authMode,omitempty"`
6572
}
6673

6774
func (car *CreateSandboxRequest) Validate() error {

0 commit comments

Comments
 (0)