Skip to content

Commit 542eece

Browse files
authored
fix: prevent Docker network resource exhaustion with improved cleanup (#41)
1 parent a560326 commit 542eece

4 files changed

Lines changed: 174 additions & 57 deletions

File tree

.claude/commands/pr-compress.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Deeply review existing change.
2+
3+
Try to find opportunities for DRY. i.e. where you can push the net balance of code additions as low as possible while simultaneously improving readability and behavior.
4+
5+
Do not venture into changes beyond the scope of the PR.

src/jail/linux/docker.rs

Lines changed: 134 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,51 @@ struct DockerNetwork {
1212
network_name: String,
1313
}
1414

15+
impl DockerNetwork {
16+
const NETWORK_PREFIX: &'static str = "httpjail_";
17+
18+
/// Generate network name from jail ID
19+
fn network_name_from_jail_id(jail_id: &str) -> String {
20+
format!("{}{}", Self::NETWORK_PREFIX, jail_id)
21+
}
22+
23+
/// Extract jail ID from network name
24+
fn jail_id_from_network_name(network_name: &str) -> Option<&str> {
25+
network_name.strip_prefix(Self::NETWORK_PREFIX)
26+
}
27+
28+
/// Check if a Docker command failed due to resource not existing
29+
fn is_not_found_error(stderr: &str) -> bool {
30+
stderr.contains("not found")
31+
|| stderr.contains("No such")
32+
|| stderr.contains("does not exist")
33+
}
34+
35+
/// Check if a Docker command failed due to resource already existing
36+
fn is_already_exists_error(stderr: &str) -> bool {
37+
stderr.contains("already exists")
38+
}
39+
}
40+
1541
/// Docker routing nftables resource that gets cleaned up on drop
1642
struct DockerRoutingTable {
1743
#[allow(dead_code)]
1844
jail_id: String,
1945
table_name: String,
2046
}
2147

48+
impl DockerRoutingTable {
49+
/// Generate table name from jail ID
50+
fn table_name_from_jail_id(jail_id: &str) -> String {
51+
format!("httpjail_docker_{}", jail_id)
52+
}
53+
}
54+
2255
impl SystemResource for DockerRoutingTable {
2356
fn create(jail_id: &str) -> Result<Self> {
24-
let table_name = format!("httpjail_docker_{}", jail_id);
2557
Ok(Self {
2658
jail_id: jail_id.to_string(),
27-
table_name,
59+
table_name: Self::table_name_from_jail_id(jail_id),
2860
})
2961
}
3062

@@ -38,10 +70,10 @@ impl SystemResource for DockerRoutingTable {
3870

3971
if !output.status.success() {
4072
let stderr = String::from_utf8_lossy(&output.stderr);
41-
if !stderr.contains("No such file or directory") && !stderr.contains("does not exist") {
42-
warn!("Failed to delete Docker routing table: {}", stderr);
43-
} else {
73+
if DockerNetwork::is_not_found_error(&stderr) {
4474
debug!("Docker routing table {} already removed", self.table_name);
75+
} else {
76+
warn!("Failed to delete Docker routing table: {}", stderr);
4577
}
4678
} else {
4779
info!("Removed Docker routing table {}", self.table_name);
@@ -51,25 +83,16 @@ impl SystemResource for DockerRoutingTable {
5183
}
5284

5385
fn for_existing(jail_id: &str) -> Self {
54-
let table_name = format!("httpjail_docker_{}", jail_id);
5586
Self {
5687
jail_id: jail_id.to_string(),
57-
table_name,
88+
table_name: Self::table_name_from_jail_id(jail_id),
5889
}
5990
}
6091
}
6192

62-
impl DockerNetwork {
63-
#[allow(dead_code)]
64-
fn new(jail_id: &str) -> Result<Self> {
65-
let network_name = format!("httpjail_{}", jail_id);
66-
Ok(Self { network_name })
67-
}
68-
}
69-
7093
impl SystemResource for DockerNetwork {
7194
fn create(jail_id: &str) -> Result<Self> {
72-
let network_name = format!("httpjail_{}", jail_id);
95+
let network_name = Self::network_name_from_jail_id(jail_id);
7396

7497
// Create Docker network with no default gateway (isolated)
7598
// Using a /24 subnet in the 172.20.x.x range
@@ -92,7 +115,7 @@ impl SystemResource for DockerNetwork {
92115

93116
if !output.status.success() {
94117
let stderr = String::from_utf8_lossy(&output.stderr);
95-
if stderr.contains("already exists") {
118+
if Self::is_already_exists_error(&stderr) {
96119
info!("Docker network {} already exists", network_name);
97120
} else {
98121
anyhow::bail!("Failed to create Docker network: {}", stderr);
@@ -117,7 +140,7 @@ impl SystemResource for DockerNetwork {
117140

118141
if !output.status.success() {
119142
let stderr = String::from_utf8_lossy(&output.stderr);
120-
if stderr.contains("not found") {
143+
if Self::is_not_found_error(&stderr) {
121144
debug!("Docker network {} already removed", self.network_name);
122145
} else {
123146
warn!("Failed to remove Docker network: {}", stderr);
@@ -130,8 +153,9 @@ impl SystemResource for DockerNetwork {
130153
}
131154

132155
fn for_existing(jail_id: &str) -> Self {
133-
let network_name = format!("httpjail_{}", jail_id);
134-
Self { network_name }
156+
Self {
157+
network_name: Self::network_name_from_jail_id(jail_id),
158+
}
135159
}
136160
}
137161

@@ -202,47 +226,76 @@ impl DockerLinux {
202226
})
203227
}
204228

229+
/// Clean up all orphaned Docker networks that don't have corresponding canary files
230+
fn cleanup_all_orphaned_docker_networks() -> Result<()> {
231+
debug!("Scanning for orphaned Docker networks");
232+
233+
// List all Docker networks
234+
let output = Command::new("docker")
235+
.args(["network", "ls", "--format", "{{.Name}}"])
236+
.output()
237+
.context("Failed to list Docker networks")?;
238+
239+
if !output.status.success() {
240+
warn!("Failed to list Docker networks for cleanup");
241+
return Ok(());
242+
}
243+
244+
let networks = String::from_utf8_lossy(&output.stdout);
245+
let canary_dir = crate::jail::get_canary_dir();
246+
247+
for network_name in networks.lines() {
248+
// Extract jail_id from network name (skip non-httpjail networks)
249+
let Some(jail_id) = DockerNetwork::jail_id_from_network_name(network_name) else {
250+
continue;
251+
};
252+
253+
// Check if canary file exists for this jail
254+
let canary_path = canary_dir.join(jail_id);
255+
if !canary_path.exists() {
256+
info!(
257+
"Found orphaned Docker network {} without canary, removing",
258+
network_name
259+
);
260+
261+
// Remove the orphaned network
262+
let rm_output = Command::new("docker")
263+
.args(["network", "rm", network_name])
264+
.output()
265+
.context("Failed to remove orphaned Docker network")?;
266+
267+
if !rm_output.status.success() {
268+
let stderr = String::from_utf8_lossy(&rm_output.stderr);
269+
if !DockerNetwork::is_not_found_error(&stderr) {
270+
warn!(
271+
"Failed to remove orphaned Docker network {}: {}",
272+
network_name, stderr
273+
);
274+
}
275+
}
276+
}
277+
}
278+
279+
Ok(())
280+
}
281+
282+
/// Docker flags that take a value as the next argument
283+
const FLAGS_WITH_VALUES: &'static [&'static str] =
284+
&["-e", "-v", "-p", "--name", "--entrypoint", "-w", "--user"];
285+
205286
/// Build the docker command with isolated network
206287
fn build_docker_command(
207288
&self,
208289
docker_args: &[String],
209290
extra_env: &[(String, String)],
210291
) -> Result<Command> {
211-
let network_name = format!("httpjail_{}", self.config.jail_id);
292+
let network_name = DockerNetwork::network_name_from_jail_id(&self.config.jail_id);
212293
// Parse docker arguments to filter out conflicting options and find the image
213294
let modified_args = Self::filter_network_args(docker_args);
214295

215296
// Find where the image name is in the args
216-
let mut image_idx = None;
217-
let mut skip_next = false;
218-
219-
for (i, arg) in modified_args.iter().enumerate() {
220-
if skip_next {
221-
skip_next = false;
222-
continue;
223-
}
224-
225-
// Skip known flags that take values
226-
if arg == "-e"
227-
|| arg == "-v"
228-
|| arg == "-p"
229-
|| arg == "--name"
230-
|| arg == "--entrypoint"
231-
|| arg == "-w"
232-
|| arg == "--user"
233-
{
234-
skip_next = true;
235-
continue;
236-
}
237-
238-
// If it doesn't start with -, it's likely the image
239-
if !arg.starts_with('-') {
240-
image_idx = Some(i);
241-
break;
242-
}
243-
}
244-
245-
let image_idx = image_idx.context("Could not find Docker image in arguments")?;
297+
let image_idx = Self::find_image_index(&modified_args)
298+
.context("Could not find Docker image in arguments")?;
246299

247300
// Split args into: docker options, image, and command
248301
let docker_opts = &modified_args[..image_idx];
@@ -302,6 +355,31 @@ impl DockerLinux {
302355
Ok(cmd)
303356
}
304357

358+
/// Find the index of the Docker image in the arguments
359+
fn find_image_index(args: &[String]) -> Option<usize> {
360+
let mut skip_next = false;
361+
362+
for (i, arg) in args.iter().enumerate() {
363+
if skip_next {
364+
skip_next = false;
365+
continue;
366+
}
367+
368+
// Skip known flags that take values
369+
if Self::FLAGS_WITH_VALUES.contains(&arg.as_str()) {
370+
skip_next = true;
371+
continue;
372+
}
373+
374+
// If it doesn't start with -, it's likely the image
375+
if !arg.starts_with('-') {
376+
return Some(i);
377+
}
378+
}
379+
380+
None
381+
}
382+
305383
/// Filter out any existing --network arguments from docker args
306384
fn filter_network_args(docker_args: &[String]) -> Vec<String> {
307385
let mut modified_args = Vec::new();
@@ -347,7 +425,7 @@ impl DockerLinux {
347425
// Add nftables rules to:
348426
// 1. Allow traffic from Docker network to jail's proxy ports
349427
// 2. DNAT HTTP/HTTPS traffic to the proxy
350-
let table_name = format!("httpjail_docker_{}", self.config.jail_id);
428+
let table_name = DockerRoutingTable::table_name_from_jail_id(&self.config.jail_id);
351429

352430
// Create nftables rules
353431
let nft_rules = format!(
@@ -412,6 +490,10 @@ impl DockerLinux {
412490

413491
impl Jail for DockerLinux {
414492
fn setup(&mut self, proxy_port: u16) -> Result<()> {
493+
// Clean up any orphaned Docker networks first
494+
// This handles cases where Docker networks exist without corresponding canary files
495+
Self::cleanup_all_orphaned_docker_networks()?;
496+
415497
// First setup the inner Linux jail
416498
self.inner_jail.setup(proxy_port)?;
417499

src/jail/managed.rs

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -225,12 +225,23 @@ impl<J: Jail> Jail for ManagedJail<J> {
225225
// Cleanup orphans first
226226
if self.enable_heartbeat {
227227
self.cleanup_orphans()?;
228+
229+
// Create canary BEFORE setting up jail to prevent race condition
230+
// where another jail's cleanup might delete our network
231+
self.create_canary()?;
228232
}
229233

230-
// Setup the inner jail
231-
self.jail.setup(proxy_port)?;
234+
// Setup the inner jail (which may create Docker networks)
235+
let setup_result = self.jail.setup(proxy_port);
236+
237+
// If setup failed, clean up the canary we created
238+
if setup_result.is_err() && self.enable_heartbeat {
239+
let _ = self.delete_canary();
240+
return setup_result;
241+
}
232242

233-
// Start heartbeat after successful setup
243+
// Start heartbeat thread after successful setup
244+
// Note: This will try to create canary again but create_canary is idempotent
234245
self.start_heartbeat()?;
235246

236247
Ok(())
@@ -272,8 +283,20 @@ impl<J: Jail> Drop for ManagedJail<J> {
272283
fn drop(&mut self) {
273284
// Best effort cleanup
274285
let _ = self.stop_heartbeat();
275-
if self.enable_heartbeat {
286+
287+
// Explicitly cleanup jail resources BEFORE deleting canary
288+
// This ensures resources are freed before we signal that the jail is gone
289+
let cleanup_result = self.jail.cleanup();
290+
291+
// Only delete canary if cleanup succeeded
292+
// If cleanup failed, leave the canary so orphan cleanup can retry later
293+
if self.enable_heartbeat && cleanup_result.is_ok() {
276294
let _ = self.delete_canary();
295+
} else if cleanup_result.is_err() {
296+
error!(
297+
"Failed to cleanup jail '{}', leaving canary for orphan cleanup",
298+
self.jail.jail_id()
299+
);
277300
}
278301
}
279302
}

src/jail/mod.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,14 @@ pub trait Jail: Send + Sync {
5151

5252
/// Get the canary directory for tracking jail lifetimes
5353
pub fn get_canary_dir() -> std::path::PathBuf {
54-
std::path::PathBuf::from("/tmp/httpjail")
54+
// Use user data directory instead of /tmp to avoid issues with tmp cleaners
55+
// This ensures canaries persist across reboots and are only removed when we want them to be
56+
if let Some(data_dir) = dirs::data_dir() {
57+
data_dir.join("httpjail").join("canaries")
58+
} else {
59+
// Fallback to /tmp if we can't get user data dir (should rarely happen)
60+
std::path::PathBuf::from("/tmp/httpjail")
61+
}
5562
}
5663

5764
/// Get the directory for httpjail temporary files (like resolv.conf)

0 commit comments

Comments
 (0)