Skip to content

Commit ffcb421

Browse files
authored
fix(BRE2-940): out of Nebius capacity error mapping and ufw regression (#118)
* fix(BRE2-940): additional Nebius capacity error mapping * fix ufw issue * cleaner * review feedback
1 parent 63808b7 commit ffcb421

8 files changed

Lines changed: 220 additions & 48 deletions

File tree

v1/providers/nebius/errors.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
v1 "github.com/brevdev/cloud/v1"
88
"github.com/nebius/gosdk/operations"
9+
"github.com/nebius/gosdk/serviceerror"
910
"google.golang.org/grpc/codes"
1011
"google.golang.org/grpc/status"
1112
)
@@ -37,6 +38,17 @@ func handleErrToCloudErr(e error) error {
3738
if e == nil {
3839
return nil
3940
}
41+
var serviceErr *serviceerror.Error
42+
if errors.As(e, &serviceErr) {
43+
for _, detail := range serviceErr.Details {
44+
switch detail.(type) {
45+
case *serviceerror.NotEnoughResources:
46+
return v1.ErrInsufficientResources
47+
case *serviceerror.QuotaFailure:
48+
return v1.ErrOutOfQuota
49+
}
50+
}
51+
}
4052
// Check for Nebius operations.Error for ResourceExhausted (returned by operation.Wait on async failures)
4153
var opErr *operations.Error
4254
if errors.As(e, &opErr) {

v1/providers/nebius/errors_test.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
package v1
2+
3+
import (
4+
"errors"
5+
"testing"
6+
7+
cloudv1 "github.com/brevdev/cloud/v1"
8+
common "github.com/nebius/gosdk/proto/nebius/common/v1"
9+
"github.com/nebius/gosdk/serviceerror"
10+
"github.com/stretchr/testify/require"
11+
"google.golang.org/grpc/codes"
12+
"google.golang.org/grpc/status"
13+
)
14+
15+
func TestHandleErrToCloudErrMapsNotEnoughResourcesToInsufficientResources(t *testing.T) {
16+
t.Parallel()
17+
18+
err := &serviceerror.Error{
19+
Wrapped: status.Error(codes.ResourceExhausted, "operation failed"),
20+
Details: []serviceerror.Detail{
21+
serviceerror.NewDetail(&common.ServiceError{
22+
Service: "compute",
23+
Code: "NotEnoughResources",
24+
Details: &common.ServiceError_NotEnoughResources{
25+
NotEnoughResources: &common.NotEnoughResources{
26+
Violations: []*common.NotEnoughResources_Violation{
27+
{
28+
ResourceType: "virtualMachine",
29+
Requested: "1gpu-16vcpu-64gb",
30+
Message: "VM schedule timeout, most likely due to insufficient hardware resources",
31+
},
32+
},
33+
},
34+
},
35+
}),
36+
},
37+
}
38+
39+
require.True(t, errors.Is(handleErrToCloudErr(err), cloudv1.ErrInsufficientResources))
40+
}
41+
42+
func TestHandleErrToCloudErrMapsQuotaFailureToOutOfQuota(t *testing.T) {
43+
t.Parallel()
44+
45+
err := &serviceerror.Error{
46+
Wrapped: status.Error(codes.ResourceExhausted, "operation failed"),
47+
Details: []serviceerror.Detail{
48+
serviceerror.NewDetail(&common.ServiceError{
49+
Service: "compute",
50+
Code: "QuotaFailure",
51+
Details: &common.ServiceError_QuotaFailure{
52+
QuotaFailure: &common.QuotaFailure{
53+
Violations: []*common.QuotaFailure_Violation{
54+
{
55+
Quota: "compute.instance.gpu.h100",
56+
Limit: "0",
57+
Requested: "1",
58+
Message: "quota exceeded",
59+
},
60+
},
61+
},
62+
},
63+
}),
64+
},
65+
}
66+
67+
require.True(t, errors.Is(handleErrToCloudErr(err), cloudv1.ErrOutOfQuota))
68+
}

v1/providers/nebius/instance.go

Lines changed: 66 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package v1
22

33
import (
44
"context"
5+
_ "embed"
6+
"encoding/base64"
57
"fmt"
68
"strings"
79
"time"
@@ -20,6 +22,12 @@ const (
2022
nebiusCPUImageFamily = "ubuntu24.04-driverless"
2123
)
2224

25+
//go:embed scripts/brev-apply-docker-firewall.sh
26+
var dockerFirewallScript string
27+
28+
//go:embed scripts/10-brev-firewall.conf
29+
var dockerFirewallDropIn string
30+
2331
//nolint:gocyclo,funlen // Complex instance creation with resource management
2432
func (c *NebiusClient) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) {
2533
// Track created resources for automatic cleanup on failure
@@ -868,7 +876,6 @@ func matchesTagFilters(instanceTags map[string]string, tagFilters map[string][]s
868876
return true
869877
}
870878

871-
//nolint:dupl // StopInstance and StartInstance have similar structure but different operations
872879
func (c *NebiusClient) StopInstance(ctx context.Context, instanceID v1.CloudProviderInstanceID) error {
873880
c.logger.Debug(ctx, "initiating instance stop operation",
874881
v1.LogField("instanceID", instanceID))
@@ -906,7 +913,6 @@ func (c *NebiusClient) StopInstance(ctx context.Context, instanceID v1.CloudProv
906913
return nil
907914
}
908915

909-
//nolint:dupl // StartInstance and StopInstance have similar structure but different operations
910916
func (c *NebiusClient) StartInstance(ctx context.Context, instanceID v1.CloudProviderInstanceID) error {
911917
c.logger.Debug(ctx, "initiating instance start operation",
912918
v1.LogField("instanceID", instanceID))
@@ -916,17 +922,18 @@ func (c *NebiusClient) StartInstance(ctx context.Context, instanceID v1.CloudPro
916922
Id: string(instanceID),
917923
})
918924
if err != nil {
919-
return fmt.Errorf("failed to initiate instance start: %w", err)
925+
return fmt.Errorf("failed to initiate instance start: %w", handleErrToCloudErr(err))
920926
}
921927

922928
// Wait for the start operation to complete
923929
finalOp, err := operation.Wait(ctx)
924930
if err != nil {
925-
return fmt.Errorf("failed to wait for instance start: %w", err)
931+
return fmt.Errorf("failed to wait for instance start: %w", handleErrToCloudErr(err))
926932
}
927933

928934
if !finalOp.Successful() {
929-
return fmt.Errorf("instance start failed: %v", finalOp.Status())
935+
statusErr := fmt.Errorf("instance start failed: %v", finalOp.Status())
936+
return handleErrToCloudErr(statusErr)
930937
}
931938

932939
c.logger.Debug(ctx, "start operation completed, waiting for instance to reach RUNNING state",
@@ -1577,13 +1584,12 @@ func (c *NebiusClient) cleanupOrphanedBootDisks(ctx context.Context, testID stri
15771584
}
15781585

15791586
// generateCloudInitUserData generates a cloud-init user-data script for SSH key injection and firewall configuration
1580-
// This is inspired by Shadeform's LaunchConfiguration approach but uses cloud-init instead of base64 scripts
1587+
// This is inspired by Shadeform's LaunchConfiguration approach but uses cloud-init directly.
15811588
func generateCloudInitUserData(publicKey string, firewallRules v1.FirewallRules) string {
15821589
// Start with cloud-init header
15831590
script := `#cloud-config
15841591
packages:
15851592
- ufw
1586-
- iptables-persistent
15871593
`
15881594

15891595
// Add SSH key configuration if provided
@@ -1593,35 +1599,19 @@ packages:
15931599
`, publicKey)
15941600
}
15951601

1602+
script += generateDockerFirewallWriteFiles()
1603+
15961604
var commands []string
15971605

1598-
// Fix a systemd race condition: ufw.service and netfilter-persistent.service
1599-
// both start in parallel (both are Before=network-pre.target with no mutual
1600-
// ordering). Both call iptables-restore concurrently, and with the iptables-nft
1601-
// backend the competing nftables transactions cause UFW to fail with
1602-
// "iptables-restore: line 4 failed". This drop-in forces UFW to wait for
1603-
// netfilter-persistent to finish first.
1604-
commands = append(commands,
1605-
"sudo mkdir -p /etc/systemd/system/ufw.service.d",
1606-
`printf '[Unit]\nAfter=netfilter-persistent.service\n' | sudo tee /etc/systemd/system/ufw.service.d/after-netfilter.conf > /dev/null`,
1607-
"sudo systemctl daemon-reload",
1608-
)
1606+
commands = append(commands, "sudo systemctl daemon-reload")
16091607

16101608
// Generate UFW firewall commands (similar to Shadeform's approach)
16111609
// UFW (Uncomplicated Firewall) is available on Ubuntu/Debian instances
16121610
commands = append(commands, generateUFWCommands(firewallRules)...)
16131611

1614-
// Generate IPTables firewall commands to ensure docker ports are not made immediately
1615-
// accessible from the internet by default.
1616-
commands = append(commands, generateIPTablesCommands()...)
1617-
1618-
// Save the complete iptables state (UFW chains + DOCKER-USER rules) so it
1619-
// survives instance stop/start cycles. Cloud-init runcmd only executes on
1620-
// first boot; on subsequent boots netfilter-persistent restores this snapshot,
1621-
// then UFW starts after it (due to the drop-in above) and re-applies its rules.
1622-
// This provides defense-in-depth: even if UFW fails for any reason, the
1623-
// netfilter-persistent snapshot ensures port 22 and DOCKER-USER rules persist.
1624-
commands = append(commands, "sudo netfilter-persistent save")
1612+
// Apply immediately for images where Docker is already running. The
1613+
// docker.service ExecStartPost hook handles images where Docker starts later.
1614+
commands = append(commands, "sudo /usr/local/sbin/brev-apply-docker-firewall.sh || true")
16251615

16261616
if len(commands) > 0 {
16271617
// Use runcmd to execute firewall setup commands
@@ -1663,25 +1653,53 @@ func generateUFWCommands(firewallRules v1.FirewallRules) []string {
16631653
return commands
16641654
}
16651655

1666-
// generateIPTablesCommands generates IPTables firewall commands to ensure docker ports are not made immediately
1667-
// accessible from the internet by default.
1668-
func generateIPTablesCommands() []string {
1669-
commands := []string{
1670-
"iptables -F DOCKER-USER",
1671-
"iptables -A DOCKER-USER -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT",
1672-
"iptables -A DOCKER-USER -i docker0 ! -o docker0 -j ACCEPT",
1673-
"iptables -A DOCKER-USER -i br+ ! -o br+ -j ACCEPT",
1674-
"iptables -A DOCKER-USER -i cni+ ! -o cni+ -j ACCEPT", // TODO: add these back in when we have a way to test it
1675-
"iptables -A DOCKER-USER -i cali+ ! -o cali+ -j ACCEPT",
1676-
"iptables -A DOCKER-USER -i docker0 -o docker0 -j ACCEPT",
1677-
"iptables -A DOCKER-USER -i br+ -o br+ -j ACCEPT",
1678-
"iptables -A DOCKER-USER -i cni+ -o cni+ -j ACCEPT",
1679-
"iptables -A DOCKER-USER -i cali+ -o cali+ -j ACCEPT",
1680-
"iptables -A DOCKER-USER -i lo -j ACCEPT",
1681-
"iptables -A DOCKER-USER -j DROP",
1682-
"iptables -A DOCKER-USER -j RETURN", // Expected by Docker
1683-
}
1684-
return commands
1656+
const (
1657+
// Keep these generated paths stable: cloud-init, systemd, and validation
1658+
// tests all depend on this Docker firewall handoff.
1659+
dockerFirewallScriptPath = "/usr/local/sbin/brev-apply-docker-firewall.sh"
1660+
1661+
// This is a docker.service drop-in because the firewall rules must be
1662+
// re-applied immediately after Docker initializes or resets DOCKER-USER. If
1663+
// we need a separately inspectable status surface later, this can move to a
1664+
// named oneshot unit such as brev-docker-firewall.service; for now the
1665+
// execution is visible through docker.service journal/status output.
1666+
dockerServiceDropInDir = "/etc/systemd/system/docker.service.d"
1667+
dockerFirewallDropInPath = dockerServiceDropInDir + "/10-brev-firewall.conf"
1668+
)
1669+
1670+
func generateDockerFirewallWriteFiles() string {
1671+
// This function emits the only write_files block in this cloud-config. If
1672+
// another generated file is added later, merge it into this block instead of
1673+
// adding a second top-level write_files key.
1674+
//
1675+
// Docker published ports are not governed by UFW's INPUT policy. Docker adds
1676+
// NAT/FORWARD rules that can make `docker run -p host:container` reachable
1677+
// from the public internet even when UFW says incoming traffic is denied.
1678+
//
1679+
// DOCKER-USER is Docker's documented filter hook for this traffic. The script
1680+
// ensures the chain exists before configuring it. If Docker already created
1681+
// the chain, the create command fails harmlessly and the script continues.
1682+
//
1683+
// The generated script exits successfully even if an iptables command fails
1684+
// because failing Docker startup would be worse operationally. Validation
1685+
// tests assert that the rule set is actually present and blocks published
1686+
// ports.
1687+
//
1688+
// UFW persists its own rules in /etc/ufw; Docker firewall rules are applied
1689+
// through cloud-init and the docker.service post-start hook.
1690+
return fmt.Sprintf(`
1691+
write_files:
1692+
- path: %s
1693+
owner: root:root
1694+
permissions: '0755'
1695+
encoding: b64
1696+
content: %s
1697+
- path: %s
1698+
owner: root:root
1699+
permissions: '0644'
1700+
encoding: b64
1701+
content: %s
1702+
`, dockerFirewallScriptPath, base64.StdEncoding.EncodeToString([]byte(dockerFirewallScript)), dockerFirewallDropInPath, base64.StdEncoding.EncodeToString([]byte(dockerFirewallDropIn)))
16851703
}
16861704

16871705
// convertIngressRuleToUFW converts an ingress firewall rule to UFW command(s)

v1/providers/nebius/instance_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,36 @@ func TestNebiusClient_MergeInstanceForUpdate(t *testing.T) {
8484
assert.Equal(t, newInstance.Status, merged.Status)
8585
}
8686

87+
func TestGenerateCloudInitUserDataInstallsDockerFirewallHook(t *testing.T) {
88+
script := generateCloudInitUserData("ssh-rsa test", v1.FirewallRules{})
89+
90+
assert.NotContains(t, script, "iptables-persistent")
91+
assert.NotContains(t, script, "netfilter-persistent")
92+
assert.Contains(t, script, "write_files:")
93+
assert.Contains(t, script, "encoding: b64")
94+
assert.Contains(t, script, "/usr/local/sbin/brev-apply-docker-firewall.sh")
95+
assert.Contains(t, script, "/etc/systemd/system/docker.service.d")
96+
assert.Contains(t, script, "sudo /usr/local/sbin/brev-apply-docker-firewall.sh || true")
97+
assert.NotContains(t, script, "content: |")
98+
assert.NotContains(t, script, " #!/bin/sh")
99+
assert.NotContains(t, script, "ExecStartPost=/usr/local/sbin/brev-apply-docker-firewall.sh")
100+
assert.NotContains(t, script, "printf '%s\\n'")
101+
assert.NotContains(t, script, "| sudo tee")
102+
}
103+
104+
func TestDockerFirewallScriptCreatesDockerUserChainBeforeFlush(t *testing.T) {
105+
createChainIndex := strings.Index(dockerFirewallScript, "iptables -N DOCKER-USER")
106+
flushChainIndex := strings.Index(dockerFirewallScript, "iptables -F DOCKER-USER")
107+
108+
assert.Greater(t, createChainIndex, -1)
109+
assert.Greater(t, flushChainIndex, createChainIndex)
110+
assert.Contains(t, dockerFirewallScript, "iptables -A DOCKER-USER -j DROP")
111+
}
112+
113+
func TestDockerFirewallDropInIgnoresExecStartPostFailure(t *testing.T) {
114+
assert.Contains(t, dockerFirewallDropIn, "ExecStartPost=-/usr/local/sbin/brev-apply-docker-firewall.sh")
115+
}
116+
87117
// BenchmarkCreateInstance benchmarks the CreateInstance method
88118
func BenchmarkCreateInstance(b *testing.B) {
89119
b.Skip("CreateInstance requires real SDK initialization - use integration tests instead")
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[Service]
2+
ExecStartPost=-/usr/local/sbin/brev-apply-docker-firewall.sh
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/sh
2+
3+
iptables -N DOCKER-USER 2>/dev/null || true
4+
iptables -F DOCKER-USER || true
5+
iptables -A DOCKER-USER -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT
6+
iptables -A DOCKER-USER -i docker0 ! -o docker0 -j ACCEPT
7+
iptables -A DOCKER-USER -i br+ ! -o br+ -j ACCEPT
8+
iptables -A DOCKER-USER -i cni+ ! -o cni+ -j ACCEPT
9+
iptables -A DOCKER-USER -i cali+ ! -o cali+ -j ACCEPT
10+
iptables -A DOCKER-USER -i docker0 -o docker0 -j ACCEPT
11+
iptables -A DOCKER-USER -i br+ -o br+ -j ACCEPT
12+
iptables -A DOCKER-USER -i cni+ -o cni+ -j ACCEPT
13+
iptables -A DOCKER-USER -i cali+ -o cali+ -j ACCEPT
14+
iptables -A DOCKER-USER -i lo -j ACCEPT
15+
iptables -A DOCKER-USER -j DROP
16+
iptables -A DOCKER-USER -j RETURN
17+
18+
exit 0

v1/providers/shadeform/firewall.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ const (
1515
ufwDefaultAllowPort2222 = "ufw allow 2222/tcp"
1616
ufwForceEnable = "ufw --force enable"
1717

18+
// Ensure DOCKER-USER exists before clearing it. Docker normally creates this
19+
// chain, but firewall setup can run before Docker has initialized iptables.
20+
ipTablesCreateDockerUserChain = "iptables -N DOCKER-USER || true"
21+
1822
// Clear DOCKER-USER policy.
1923
ipTablesResetDockerUserChain = "iptables -F DOCKER-USER"
2024

@@ -83,6 +87,7 @@ func (c *ShadeformClient) getUFWCommands(firewallRules v1.FirewallRules) []strin
8387

8488
func (c *ShadeformClient) getIPTablesCommands() []string {
8589
commands := []string{
90+
ipTablesCreateDockerUserChain,
8691
ipTablesResetDockerUserChain,
8792
ipTablesAllowDockerUserOutbound,
8893
ipTablesAllowDockerUserOutboundInit0,
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package v1
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestShadeformIPTablesCommandsCreateDockerUserChainBeforeFlush(t *testing.T) {
11+
client := &ShadeformClient{}
12+
commands := strings.Join(client.getIPTablesCommands(), "\n")
13+
14+
createChainIndex := strings.Index(commands, "iptables -N DOCKER-USER")
15+
flushChainIndex := strings.Index(commands, "iptables -F DOCKER-USER")
16+
17+
assert.Greater(t, createChainIndex, -1)
18+
assert.Greater(t, flushChainIndex, createChainIndex)
19+
}

0 commit comments

Comments
 (0)