Skip to content

Commit 16bc4cf

Browse files
hsinatfootprintai, hsinhoyeh, and claude
authored
Feat support redhat (#88)
* feat: add RHEL/Rocky Linux 9 container OS support

  Add OSType protobuf enum (UBUNTU_2404, ROCKY_9, RHEL_9) and --os-type CLI flag to support Rocky Linux 9 (dev/test) and RHEL 9 (production) as container OS alternatives to Ubuntu 24.04.

  - New internal/ostype package: maps OSType to Incus images and OS family
  - New internal/ospkg package: abstracts package management (apt vs dnf, adduser vs useradd, sudo vs wheel group, ssh vs sshd service)
  - Dual stack definitions: RHEL variants for all stacks in stacks.yaml
  - Refactored container/manager.go to use OS-aware provisioning
  - OS type stored as container label for subsequent operations

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat: add backend heartbeat metrics and Grafana dashboard

  Push containarium_backend_healthy metric (1=up, 0=down) per backend to VictoriaMetrics every 30s. Add status-history panel to the Grafana containarium-overview dashboard showing backend heartbeat timeline.

  - New containarium.backend.healthy OTel gauge in metrics collector
  - FetchPeerHealth() on PeerMetricsFetcherAdapter reports peer health
  - Grafana dashboard: "Backend Heartbeat" row with status-history panel
  - Enriched /v1/backends endpoint with live peer system info
  - Dashboard auto-updates on daemon restart (updateGrafanaDashboard)
  - PostgreSQL: wait up to 2min at startup for availability; add Restart=on-failure systemd override inside core-postgres container

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: harden peer stability and SSH reliability

  Peer node stability:
  - ClamAV: wait for freshclam DB download before starting clamd; bump security container memory from 3GB to 4GB to prevent OOM kills
  - Conntrack: increase event channel buffers from 256 to 8192; rate-limit "channel full" warning to once per 30s to stop log flooding
  - Tunnel: increase yamux keepalive from 15s to 60s on both client and server so tunnel survives CPU-heavy workloads (builds, GPU training)
  - Peers: add --sentinel-url for auto-update; fix restart policies

  SSH reliability:
  - Fix ForwardCreateContainer to use camelCase field names (sshKeys, not ssh_keys) matching gRPC-gateway protojson. This was silently dropping SSH keys when creating containers on peers.
  - Reject unknown JSON fields in gRPC-gateway (DiscardUnknown: false) so field name mismatches fail loudly instead of silently
  - Validate SSH public keys at API boundary and pre-write to reject placeholder strings like "YOUR_KEY"
  - Fix jump server account unlock: use 'usermod -p *' instead of 'passwd -d' which left accounts locked on Ubuntu 24.04
  - Raise sshpiper failtoban threshold from 3 to 20 (ssh-agent tries multiple keys per connection, each counted as a failure)

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: hsinhoyeh <yhh92u@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e74663c commit 16bc4cf

40 files changed

Lines changed: 1237 additions & 286 deletions

api/swagger/containarium.swagger.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3850,6 +3850,10 @@
38503850
"backendId": {
38513851
"type": "string",
38523852
"title": "Backend ID this container runs on (e.g., \"gcp-spot\", \"tunnel-fts-5900x-gpu\")"
3853+
},
3854+
"osType": {
3855+
"$ref": "#/definitions/OSType",
3856+
"title": "Operating system type of the container"
38533857
}
38543858
},
38553859
"title": "Container represents a complete container instance"
@@ -4025,6 +4029,10 @@
40254029
"backendId": {
40264030
"type": "string",
40274031
"title": "Target backend ID for creation (empty = primary backend)"
4032+
},
4033+
"osType": {
4034+
"$ref": "#/definitions/OSType",
4035+
"title": "Operating system type (takes precedence over image when set)"
40284036
}
40294037
},
40304038
"title": "CreateContainerRequest is the request to create a new container"
@@ -5222,6 +5230,18 @@
52225230
},
52235231
"title": "NetworkTopology represents the complete network visualization"
52245232
},
5233+
"OSType": {
5234+
"type": "string",
5235+
"enum": [
5236+
"OS_TYPE_UNSPECIFIED",
5237+
"OS_TYPE_UBUNTU_2404",
5238+
"OS_TYPE_ROCKY_9",
5239+
"OS_TYPE_RHEL_9"
5240+
],
5241+
"default": "OS_TYPE_UNSPECIFIED",
5242+
"description": "- OS_TYPE_UNSPECIFIED: Unspecified OS type (defaults to Ubuntu 24.04)\n - OS_TYPE_UBUNTU_2404: Ubuntu 24.04 LTS\n - OS_TYPE_ROCKY_9: Rocky Linux 9 (RHEL 9 rebuild, for dev/test)\n - OS_TYPE_RHEL_9: Red Hat Enterprise Linux 9 (for production, requires subscription)",
5243+
"title": "OSType represents the operating system type for a container"
5244+
},
52255245
"PassthroughRoute": {
52265246
"type": "object",
52275247
"properties": {

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ require (
2222
go.opentelemetry.io/otel/metric v1.42.0
2323
go.opentelemetry.io/otel/sdk v1.42.0
2424
go.opentelemetry.io/otel/sdk/metric v1.42.0
25+
golang.org/x/crypto v0.49.0
2526
golang.org/x/term v0.41.0
2627
google.golang.org/api v0.272.0
2728
google.golang.org/genproto/googleapis/api v0.0.0-20260226221140-a57be14db171
@@ -86,7 +87,6 @@ require (
8687
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
8788
go.opentelemetry.io/otel/trace v1.42.0 // indirect
8889
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
89-
golang.org/x/crypto v0.49.0 // indirect
9090
golang.org/x/net v0.52.0 // indirect
9191
golang.org/x/oauth2 v0.36.0 // indirect
9292
golang.org/x/sync v0.20.0 // indirect

internal/client/grpc.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func (c *GRPCClient) ListContainers() ([]incus.ContainerInfo, error) {
126126
}
127127

128128
// CreateContainer creates a container via gRPC
129-
func (c *GRPCClient) CreateContainer(username, image, cpu, memory, disk string, sshKeys []string, enablePodman bool, stack, gpu string) (*incus.ContainerInfo, error) {
129+
func (c *GRPCClient) CreateContainer(username, image, cpu, memory, disk string, sshKeys []string, enablePodman bool, stack, gpu string, osType pb.OSType) (*incus.ContainerInfo, error) {
130130
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // Container creation can take time (includes ultra-aggressive retry logic for google_guest_agent)
131131
defer cancel()
132132

@@ -142,6 +142,7 @@ func (c *GRPCClient) CreateContainer(username, image, cpu, memory, disk string,
142142
EnablePodman: enablePodman,
143143
Stack: stack,
144144
Gpu: gpu,
145+
OsType: osType,
145146
}
146147

147148
resp, err := c.client.CreateContainer(ctx, req)

internal/client/http.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"time"
1313

1414
"github.com/footprintai/containarium/internal/incus"
15+
pb "github.com/footprintai/containarium/pkg/pb/containarium/v1"
1516
)
1617

1718
// HTTPClient wraps an HTTP connection to the containarium REST API
@@ -206,7 +207,7 @@ func (c *HTTPClient) ListContainers() ([]incus.ContainerInfo, error) {
206207
}
207208

208209
// CreateContainer creates a container via HTTP
209-
func (c *HTTPClient) CreateContainer(username, image, cpu, memory, disk string, sshKeys []string, enablePodman bool, stack, gpu string) (*incus.ContainerInfo, error) {
210+
func (c *HTTPClient) CreateContainer(username, image, cpu, memory, disk string, sshKeys []string, enablePodman bool, stack, gpu string, osType pb.OSType) (*incus.ContainerInfo, error) {
210211
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
211212
defer cancel()
212213

@@ -222,6 +223,7 @@ func (c *HTTPClient) CreateContainer(username, image, cpu, memory, disk string,
222223
"enablePodman": enablePodman,
223224
"stack": stack,
224225
"gpu": gpu,
226+
"osType": osType,
225227
}
226228

227229
resp, err := c.doRequest(ctx, http.MethodPost, "/v1/containers", reqBody)

internal/cmd/create.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"github.com/footprintai/containarium/internal/client"
1010
"github.com/footprintai/containarium/internal/container"
1111
"github.com/footprintai/containarium/internal/incus"
12+
"github.com/footprintai/containarium/internal/ostype"
13+
pb "github.com/footprintai/containarium/pkg/pb/containarium/v1"
1214
"github.com/spf13/cobra"
1315
)
1416

@@ -24,6 +26,7 @@ var (
2426
forceRecreate bool
2527
stackID string
2628
gpuDevice string
29+
osTypeStr string
2730
)
2831

2932
var createCmd = &cobra.Command{
@@ -76,6 +79,7 @@ func init() {
7679
createCmd.Flags().StringVar(&gpuDevice, "gpu", "", "GPU device ID for passthrough (e.g., '0' for first GPU, PCI address)")
7780
createCmd.Flags().StringSliceVar(&labels, "labels", []string{}, "Labels in key=value format (can be specified multiple times)")
7881
createCmd.Flags().BoolVar(&forceRecreate, "force", false, "Delete and recreate if container already exists")
82+
createCmd.Flags().StringVar(&osTypeStr, "os-type", "", "Container OS type: ubuntu, rocky9, rhel9 (overrides --image)")
7983
}
8084

8185
func runCreate(cmd *cobra.Command, args []string) error {
@@ -234,15 +238,18 @@ func runCreate(cmd *cobra.Command, args []string) error {
234238
// Create container - use remote or local mode
235239
var info *incus.ContainerInfo
236240

241+
// Parse OS type from flag
242+
osType := ostype.OSTypeFromString(osTypeStr)
243+
237244
if httpMode && serverAddr != "" {
238245
// Remote mode via HTTP
239-
info, err = createRemoteHTTP(username, containerImage, cpuLimit, memoryLimit, diskLimit, sshKeys, enablePodman, stackID, gpuDevice)
246+
info, err = createRemoteHTTP(username, containerImage, cpuLimit, memoryLimit, diskLimit, sshKeys, enablePodman, stackID, gpuDevice, osType)
240247
if err != nil {
241248
return fmt.Errorf("failed to create container via HTTP API: %w", err)
242249
}
243250
} else if serverAddr != "" {
244251
// Remote mode via gRPC
245-
info, err = createRemote(username, containerImage, cpuLimit, memoryLimit, diskLimit, sshKeys, enablePodman, stackID, gpuDevice)
252+
info, err = createRemote(username, containerImage, cpuLimit, memoryLimit, diskLimit, sshKeys, enablePodman, stackID, gpuDevice, osType)
246253
if err != nil {
247254
return fmt.Errorf("failed to create container via remote server: %w", err)
248255
}
@@ -251,7 +258,7 @@ func runCreate(cmd *cobra.Command, args []string) error {
251258
if verbose {
252259
fmt.Println("Creating container...")
253260
}
254-
info, err = createLocal(username, containerImage, cpuLimit, memoryLimit, diskLimit, staticIP, sshKeys, parsedLabels, enablePodman, stackID, gpuDevice)
261+
info, err = createLocal(username, containerImage, cpuLimit, memoryLimit, diskLimit, staticIP, sshKeys, parsedLabels, enablePodman, stackID, gpuDevice, osType)
255262
if err != nil {
256263
// Cleanup jump server account on failure
257264
_ = container.DeleteJumpServerAccount(username, false)
@@ -313,7 +320,7 @@ func runCreate(cmd *cobra.Command, args []string) error {
313320
}
314321

315322
// createLocal creates a container using local Incus daemon
316-
func createLocal(username, image, cpu, memory, disk, staticIP string, sshKeys []string, labelMap map[string]string, enablePodman bool, stack, gpu string) (*incus.ContainerInfo, error) {
323+
func createLocal(username, image, cpu, memory, disk, staticIP string, sshKeys []string, labelMap map[string]string, enablePodman bool, stack, gpu string, osType pb.OSType) (*incus.ContainerInfo, error) {
317324
mgr, err := container.New()
318325
if err != nil {
319326
return nil, fmt.Errorf("failed to connect to Incus: %w (is Incus running?)", err)
@@ -334,6 +341,7 @@ func createLocal(username, image, cpu, memory, disk, staticIP string, sshKeys []
334341
AutoStart: true,
335342
Verbose: verbose,
336343
Stack: stack,
344+
OSType: osType,
337345
}
338346

339347
return mgr.Create(opts)
@@ -356,23 +364,23 @@ func parseLabels(labelSlice []string) map[string]string {
356364
}
357365

358366
// createRemote creates a container using remote gRPC server
359-
func createRemote(username, image, cpu, memory, disk string, sshKeys []string, enablePodman bool, stack, gpu string) (*incus.ContainerInfo, error) {
367+
func createRemote(username, image, cpu, memory, disk string, sshKeys []string, enablePodman bool, stack, gpu string, osType pb.OSType) (*incus.ContainerInfo, error) {
360368
grpcClient, err := client.NewGRPCClient(serverAddr, certsDir, insecure)
361369
if err != nil {
362370
return nil, err
363371
}
364372
defer grpcClient.Close()
365373

366-
return grpcClient.CreateContainer(username, image, cpu, memory, disk, sshKeys, enablePodman, stack, gpu)
374+
return grpcClient.CreateContainer(username, image, cpu, memory, disk, sshKeys, enablePodman, stack, gpu, osType)
367375
}
368376

369377
// createRemoteHTTP creates a container using remote HTTP API
370-
func createRemoteHTTP(username, image, cpu, memory, disk string, sshKeys []string, enablePodman bool, stack, gpu string) (*incus.ContainerInfo, error) {
378+
func createRemoteHTTP(username, image, cpu, memory, disk string, sshKeys []string, enablePodman bool, stack, gpu string, osType pb.OSType) (*incus.ContainerInfo, error) {
371379
httpClient, err := client.NewHTTPClient(serverAddr, authToken)
372380
if err != nil {
373381
return nil, err
374382
}
375383
defer httpClient.Close()
376384

377-
return httpClient.CreateContainer(username, image, cpu, memory, disk, sshKeys, enablePodman, stack, gpu)
385+
return httpClient.CreateContainer(username, image, cpu, memory, disk, sshKeys, enablePodman, stack, gpu, osType)
378386
}

internal/cmd/daemon.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,36 @@ func runDaemon(cmd *cobra.Command, args []string) error {
236236
}
237237
}
238238

239+
// Wait for PostgreSQL to be reachable before proceeding — services that
240+
// depend on it (SecurityService, AlertService, etc.) are registered based
241+
// on PostgreSQL availability at startup. If PostgreSQL is temporarily down
242+
// (e.g., container restarting), we wait up to 2 minutes rather than
243+
// starting with those services permanently disabled.
244+
if postgresConnString != "" {
245+
log.Printf("Waiting for PostgreSQL to become reachable...")
246+
waitCtx, waitCancel := context.WithTimeout(context.Background(), 2*time.Minute)
247+
for {
248+
pingPool, pingErr := pgxpool.New(waitCtx, postgresConnString)
249+
if pingErr == nil {
250+
if err := pingPool.Ping(waitCtx); err == nil {
251+
pingPool.Close()
252+
log.Printf("PostgreSQL is reachable")
253+
break
254+
}
255+
pingPool.Close()
256+
}
257+
select {
258+
case <-waitCtx.Done():
259+
log.Printf("Warning: PostgreSQL not reachable after 2 minutes, proceeding anyway (some services will be disabled)")
260+
goto pgWaitDone
261+
case <-time.After(5 * time.Second):
262+
log.Printf(" still waiting for PostgreSQL...")
263+
}
264+
}
265+
pgWaitDone:
266+
waitCancel()
267+
}
268+
239269
// Load persisted daemon config from PostgreSQL (values saved by previous runs).
240270
// CLI flags that were explicitly set always override DB values.
241271
// Uses retry logic to handle post-restart races with PostgreSQL.

internal/container/cgroup_wrapper.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package container
33
import (
44
"fmt"
55
"log"
6+
7+
"github.com/footprintai/containarium/internal/ostype"
68
)
79

810
// cgroupWrapperScript returns a bash wrapper script that intercepts run/create
@@ -154,7 +156,14 @@ exec $REAL_RUNC "$@"
154156
// injected from the LXC container.
155157
func (m *Manager) installDockerOCIRuntime(containerName string) error {
156158
// Step 1: Ensure jq is installed (needed by the runtime script)
157-
if err := m.incus.Exec(containerName, []string{"apt-get", "install", "-y", "jq"}); err != nil {
159+
// Detect OS family from container labels
160+
jqInstallCmd := []string{"apt-get", "install", "-y", "jq"}
161+
if info, err := m.incus.GetContainer(containerName); err == nil {
162+
if osLabel, ok := info.Labels[ostype.OSTypeLabelKey]; ok && ostype.FamilyFromLabel(osLabel) == ostype.RHEL {
163+
jqInstallCmd = []string{"dnf", "install", "-y", "jq"}
164+
}
165+
}
166+
if err := m.incus.Exec(containerName, jqInstallCmd); err != nil {
158167
return fmt.Errorf("failed to install jq: %w", err)
159168
}
160169

internal/container/collaborator.go

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import (
66
"time"
77

88
"github.com/footprintai/containarium/internal/collaborator"
9+
"github.com/footprintai/containarium/internal/ospkg"
10+
"github.com/footprintai/containarium/internal/ostype"
911
)
1012

1113
// CollaboratorManager handles collaborator operations for containers
@@ -105,13 +107,21 @@ func (cm *CollaboratorManager) AddCollaborator(ownerUsername, collaboratorUserna
105107

106108
// createCollaboratorUser creates a collaborator user inside the container
107109
func (cm *CollaboratorManager) createCollaboratorUser(containerName, ownerUsername, accountName, sshPublicKey string, grantSudo, grantContainerRuntime bool) error {
108-
// Create user with bash shell (they need interactive access)
109-
if err := cm.manager.incus.Exec(containerName, []string{
110-
"adduser",
111-
"--disabled-password",
112-
"--gecos", fmt.Sprintf("Collaborator %s", accountName),
113-
accountName,
114-
}); err != nil {
110+
// Detect OS family from container labels or by probing
111+
info, err := cm.manager.incus.GetContainer(containerName)
112+
family := ostype.Debian
113+
if err == nil {
114+
if osLabel, ok := info.Labels[ostype.OSTypeLabelKey]; ok {
115+
family = ostype.FamilyFromLabel(osLabel)
116+
} else {
117+
family = ostype.DetectFamily(cm.manager.incus, containerName)
118+
}
119+
}
120+
pkgMgr := ospkg.ForFamily(family)
121+
122+
// Create user (OS-aware: adduser on Debian, useradd on RHEL)
123+
gecos := fmt.Sprintf("Collaborator %s", accountName)
124+
if err := cm.manager.incus.Exec(containerName, pkgMgr.CreateUserCmd(accountName, gecos)); err != nil {
115125
return fmt.Errorf("failed to create user: %w", err)
116126
}
117127

internal/container/jump_server.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,13 @@ func EnsureJumpServerAccount(username string) error {
129129
return fmt.Errorf("useradd failed: %w", err)
130130
}
131131

132-
// Unlock account (useradd creates locked accounts, sshd rejects them)
132+
// Unlock account (useradd creates locked accounts, sshd rejects them).
133+
// Set password to '*' which means "no valid password" but account is not
134+
// locked. This allows public key auth while preventing password login.
135+
// Note: passwd -d sets an empty password which some distros reject;
136+
// usermod -p '*' is the portable approach.
133137
// #nosec G204 -- username validated by isValidUsername above
134-
_ = exec.Command("passwd", "-d", username).Run()
138+
_ = exec.Command("usermod", "-p", "*", username).Run()
135139

136140
// Set home dir permissions (sshd requires 755 or stricter)
137141
_ = os.Chmod(fmt.Sprintf("/home/%s", username), 0755) // #nosec G302 -- sshd requires home dir to be world-readable

0 commit comments

Comments
 (0)