Skip to content

Commit 8e21545

Browse files
jarlahclaude
andauthored
Fix reliability: container cleanup, wait strategies, and Ryuk socket (#243)
* Fix reliability: container cleanup, wait strategies, Ryuk socket, and ToxiproxyContainer - Retry Ryuk socket on all transient errors, not just econnrefused - Wrap Ryuk filter registration in retry loop for closed errors - Track containers in GenServer state, clean up on terminate and wait strategy failure - Make image cleanup opt-in via cleanup.images property - Switch ToxiproxyContainer from PortWaitStrategy to HttpWaitStrategy - Add retry logic with backoff to ToxiproxyContainer httpc API calls - Fix HttpWaitStrategy response validation and retry logic - Reduce LogWaitStrategy retry delay from 500ms to 200ms - Set global 300s test timeout, remove per-module overrides - Add tests for HttpWaitStrategy, terminate cleanup, and wait strategy failure Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Add property-controlled pull policy for offline support Allow configuring the default image pull behavior via `pull.policy` property (or `TESTCONTAINERS_PULL_POLICY` env var). Set to "missing" or "never" to skip pulling images that already exist locally, enabling offline usage. Per-container pull_policy set via `with_pull_policy/2` still takes precedence over the global setting. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Fix pull policy: use merged properties for env var support, apply to Ryuk - Fix state.properties being rebound to read_property_file() (user file only), losing env vars. Now uses read_property_sources() which merges env vars, user file, and project file with proper precedence. - Apply pull policy to Ryuk container so it also respects the setting. - Tag pull_policy_test.exs with :needs_registry for offline exclusion. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 644f5bd commit 8e21545

25 files changed

+347
-85
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,6 @@ testcontainers_elixir-*.tar
3838
.mix/
3939
.hex/
4040
.nix-mix/
41+
42+
# test runs
43+
test_run_*.log

lib/container.ex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ defmodule Testcontainers.Container do
2929
hostname: nil,
3030
reuse: false,
3131
force_reuse: false,
32-
pull_policy: Testcontainers.PullPolicy.always_pull()
32+
pull_policy: nil
3333
]
3434

3535
@doc """

lib/container/toxiproxy_container.ex

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ defmodule Testcontainers.ToxiproxyContainer do
1010

1111
alias Testcontainers.Container
1212
alias Testcontainers.ContainerBuilder
13-
alias Testcontainers.PortWaitStrategy
13+
alias Testcontainers.HttpWaitStrategy
1414
alias Testcontainers.ToxiproxyContainer
1515

1616
@default_image "ghcr.io/shopify/toxiproxy"
@@ -24,6 +24,8 @@ defmodule Testcontainers.ToxiproxyContainer do
2424
@proxy_port_count 31
2525

2626
@default_wait_timeout 60_000
27+
@max_retries 3
28+
@retry_delay_ms 500
2729

2830
@enforce_keys [:image, :wait_timeout]
2931
defstruct [:image, :wait_timeout, check_image: @default_image, reuse: false]
@@ -144,7 +146,7 @@ defmodule Testcontainers.ToxiproxyContainer do
144146

145147
headers = [{~c"content-type", ~c"application/json"}]
146148

147-
case :httpc.request(:post, {url, headers, ~c"application/json", body}, [], []) do
149+
case httpc_request_with_retry(:post, {url, headers, ~c"application/json", body}) do
148150
{:ok, {{_, code, _}, _, _}} when code in [200, 201] ->
149151
# Return the mapped port on the host
150152
{:ok, Container.mapped_port(container, listen_port)}
@@ -198,7 +200,7 @@ defmodule Testcontainers.ToxiproxyContainer do
198200

199201
url = ~c"http://#{host}:#{api_port}/proxies/#{name}"
200202

201-
case :httpc.request(:delete, {url, []}, [], []) do
203+
case httpc_request_with_retry(:delete, {url, []}) do
202204
{:ok, {{_, 204, _}, _, _}} -> :ok
203205
{:ok, {{_, 404, _}, _, _}} -> {:error, :not_found}
204206
{:ok, {{_, code, _}, _, body}} -> {:error, {:http_error, code, body}}
@@ -217,7 +219,7 @@ defmodule Testcontainers.ToxiproxyContainer do
217219

218220
url = ~c"http://#{host}:#{api_port}/reset"
219221

220-
case :httpc.request(:post, {url, [], ~c"application/json", "{}"}, [], []) do
222+
case httpc_request_with_retry(:post, {url, [], ~c"application/json", "{}"}) do
221223
{:ok, {{_, 204, _}, _, _}} -> :ok
222224
{:ok, {{_, code, _}, _, body}} -> {:error, {:http_error, code, body}}
223225
{:error, reason} -> {:error, reason}
@@ -237,7 +239,7 @@ defmodule Testcontainers.ToxiproxyContainer do
237239

238240
url = ~c"http://#{host}:#{api_port}/proxies"
239241

240-
case :httpc.request(:get, {url, []}, [], []) do
242+
case httpc_request_with_retry(:get, {url, []}) do
241243
{:ok, {{_, 200, _}, _, body}} ->
242244
{:ok, Jason.decode!(to_string(body))}
243245

@@ -254,6 +256,33 @@ defmodule Testcontainers.ToxiproxyContainer do
254256
"""
255257
def proxy_port_count, do: @proxy_port_count
256258

259+
defp httpc_request_with_retry(method, request, retries_left \\ @max_retries) do
260+
http_opts = [timeout: 5_000, connect_timeout: 5_000]
261+
262+
result = :httpc.request(method, request, http_opts, [])
263+
264+
case result do
265+
{:error, reason} when retries_left > 0 ->
266+
retryable =
267+
case reason do
268+
:socket_closed_remotely -> true
269+
{:failed_connect, _} -> true
270+
:econnrefused -> true
271+
_ -> false
272+
end
273+
274+
if retryable do
275+
Process.sleep(@retry_delay_ms)
276+
httpc_request_with_retry(method, request, retries_left - 1)
277+
else
278+
result
279+
end
280+
281+
_ ->
282+
result
283+
end
284+
end
285+
257286
# ContainerBuilder implementation
258287
defimpl ContainerBuilder do
259288
import Container
@@ -272,10 +301,10 @@ defmodule Testcontainers.ToxiproxyContainer do
272301
new(config.image)
273302
|> with_exposed_ports(all_ports)
274303
|> with_waiting_strategy(
275-
PortWaitStrategy.new(
276-
"127.0.0.1",
304+
HttpWaitStrategy.new(
305+
"/version",
277306
ToxiproxyContainer.control_port(),
278-
config.wait_timeout
307+
timeout: config.wait_timeout
279308
)
280309
)
281310
|> with_reuse(config.reuse)

lib/docker/api.ex

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ defmodule Testcontainers.Docker.Api do
6464
end
6565
end
6666

67+
def delete_image(image, conn) when is_binary(image) do
68+
case Api.Image.image_delete(conn, image, force: true) do
69+
{:ok, _} -> :ok
70+
{:error, _} = error -> error
71+
end
72+
end
73+
6774
def create_container(%Container{} = container, conn) do
6875
case Api.Container.container_create(conn, container_create_request(container)) do
6976
{:error, %Tesla.Env{status: other}} ->

lib/testcontainers.ex

Lines changed: 119 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ defmodule Testcontainers do
1515
alias Testcontainers.Connection
1616
alias Testcontainers.Container
1717
alias Testcontainers.ContainerBuilder
18+
alias Testcontainers.PullPolicy
1819
alias Testcontainers.Util.PropertiesParser
1920

2021
import Testcontainers.Constants
@@ -58,8 +59,7 @@ defmodule Testcontainers do
5859
|> Base.encode16()
5960

6061
with {:ok, docker_hostname} <- get_docker_hostname(docker_host_url, conn),
61-
{:ok} <- start_reaper(conn, session_id, properties, docker_host, docker_hostname),
62-
{:ok, properties} <- PropertiesParser.read_property_file() do
62+
{:ok} <- start_reaper(conn, session_id, properties, docker_host, docker_hostname) do
6363
Logger.info("Testcontainers initialized")
6464

6565
{:ok,
@@ -68,7 +68,9 @@ defmodule Testcontainers do
6868
docker_hostname: docker_hostname,
6969
session_id: session_id,
7070
properties: properties,
71-
networks: MapSet.new()
71+
networks: MapSet.new(),
72+
containers: MapSet.new(),
73+
images: MapSet.new()
7274
}}
7375
else
7476
error ->
@@ -170,15 +172,70 @@ defmodule Testcontainers do
170172
wait_for_call({:remove_network, network_name}, name)
171173
end
172174

175+
@impl true
176+
def handle_cast({:track_container, container_id, image}, state) do
177+
{:noreply,
178+
%{
179+
state
180+
| containers: MapSet.put(state.containers, container_id),
181+
images: MapSet.put(state.images, image)
182+
}}
183+
end
184+
185+
def handle_cast({:track_image, image}, state) do
186+
{:noreply, %{state | images: MapSet.put(state.images, image)}}
187+
end
188+
173189
@impl true
174190
def handle_info(_msg, state) do
175191
{:noreply, state}
176192
end
177193

194+
@impl true
195+
def terminate(_reason, state) do
196+
for container_id <- state.containers do
197+
Logger.info("Terminating container #{container_id}")
198+
Api.stop_container(container_id, state.conn)
199+
end
200+
201+
for network <- state.networks do
202+
Logger.info("Removing network #{network}")
203+
Api.remove_network(network, state.conn)
204+
end
205+
206+
if Map.get(state.properties, "cleanup.images", "false") == "true" do
207+
for image <- state.images do
208+
Logger.info("Removing image #{image}")
209+
Api.delete_image(image, state.conn)
210+
end
211+
end
212+
213+
:ok
214+
end
215+
178216
@impl true
179217
def handle_call({:start_container, config_builder}, from, state) do
218+
self_pid = self()
219+
180220
Task.async(fn ->
181-
GenServer.reply(from, start_and_wait(config_builder, state))
221+
result = start_and_wait(config_builder, state)
222+
223+
case result do
224+
{:ok, container} ->
225+
GenServer.cast(self_pid, {:track_container, container.container_id, container.image})
226+
227+
_ ->
228+
# Track the image even on failure so it gets cleaned up on terminate
229+
case config_builder do
230+
%Container{image: image} when is_binary(image) ->
231+
GenServer.cast(self_pid, {:track_image, image})
232+
233+
_ ->
234+
:ok
235+
end
236+
end
237+
238+
GenServer.reply(from, result)
182239
end)
183240

184241
{:noreply, state}
@@ -187,7 +244,7 @@ defmodule Testcontainers do
187244
@impl true
188245
def handle_call({:stop_container, container_id}, from, state) do
189246
Task.async(fn -> GenServer.reply(from, Api.stop_container(container_id, state.conn)) end)
190-
{:noreply, state}
247+
{:noreply, %{state | containers: MapSet.delete(state.containers, container_id)}}
191248
end
192249

193250
@impl true
@@ -275,20 +332,43 @@ defmodule Testcontainers do
275332
|> Container.with_auto_remove(true)
276333
|> Container.with_privileged(ryuk_privileged)
277334

278-
with {:ok, _} <- Api.pull_image(ryuk_config.image, conn),
335+
ryuk_config = resolve_pull_policy(ryuk_config, properties)
336+
337+
with :ok <- maybe_pull_image(ryuk_config, conn),
279338
{:ok, ryuk_container_id} <- Api.create_container(ryuk_config, conn),
280339
:ok <- Api.start_container(ryuk_container_id, conn),
281340
{:ok, container} <- Api.get_container(ryuk_container_id, conn),
282-
{:ok, socket} <- create_ryuk_socket(container, docker_hostname),
283-
:ok <- register_ryuk_filter(session_id, socket) do
341+
:ok <- connect_and_register_ryuk(container, docker_hostname, session_id) do
284342
{:ok}
285343
end
286344
end
287345

346+
defp connect_and_register_ryuk(container, docker_hostname, session_id, attempt \\ 1)
347+
348+
defp connect_and_register_ryuk(container, docker_hostname, session_id, attempt)
349+
when attempt <= 5 do
350+
with {:ok, socket} <- create_ryuk_socket(container, docker_hostname),
351+
:ok <- register_ryuk_filter(session_id, socket) do
352+
:ok
353+
else
354+
error ->
355+
Logger.info(
356+
"Failed to connect and register ryuk filter: #{inspect(error)}. Retrying... Attempt #{attempt}/5"
357+
)
358+
359+
:timer.sleep(1000)
360+
connect_and_register_ryuk(container, docker_hostname, session_id, attempt + 1)
361+
end
362+
end
363+
364+
defp connect_and_register_ryuk(_container, _docker_hostname, _session_id, _attempt) do
365+
{:error, :ryuk_connection_failed}
366+
end
367+
288368
defp create_ryuk_socket(container, docker_hostname, reattempt_count \\ 0)
289369

290370
defp create_ryuk_socket(%Container{} = container, docker_hostname, reattempt_count)
291-
when reattempt_count < 3 do
371+
when reattempt_count < 5 do
292372
host_port = Container.mapped_port(container, 8080)
293373

294374
case :gen_tcp.connect(~c"#{docker_hostname}", host_port, [
@@ -300,13 +380,10 @@ defmodule Testcontainers do
300380
{:ok, connected} ->
301381
{:ok, connected}
302382

303-
{:error, :econnrefused} ->
304-
Logger.info("Connection refused. Retrying... Attempt #{reattempt_count + 1}/3")
305-
:timer.sleep(5000)
383+
{:error, reason} ->
384+
Logger.info("Connection failed with #{inspect(reason)}. Retrying... Attempt #{reattempt_count + 1}/5")
385+
:timer.sleep(1000)
306386
create_ryuk_socket(container, docker_hostname, reattempt_count + 1)
307-
308-
{:error, error} ->
309-
{:error, error}
310387
end
311388
end
312389

@@ -366,13 +443,38 @@ defmodule Testcontainers do
366443
end
367444

368445
defp create_and_start_container(config, config_builder, state) do
446+
config = resolve_pull_policy(config, state.properties)
447+
369448
with :ok <- maybe_pull_image(config, state.conn),
370-
{:ok, id} <- Api.create_container(config, state.conn),
371-
:ok <- Api.start_container(id, state.conn),
449+
{:ok, id} <- Api.create_container(config, state.conn) do
450+
start_and_wait_container(id, config, config_builder, state)
451+
end
452+
end
453+
454+
defp resolve_pull_policy(%{pull_policy: nil} = config, properties) do
455+
pull_policy =
456+
case Map.get(properties, "pull.policy", "always") do
457+
"missing" -> PullPolicy.never_pull()
458+
"never" -> PullPolicy.never_pull()
459+
_ -> PullPolicy.always_pull()
460+
end
461+
462+
%{config | pull_policy: pull_policy}
463+
end
464+
465+
defp resolve_pull_policy(config, _properties), do: config
466+
467+
defp start_and_wait_container(id, config, config_builder, state) do
468+
with :ok <- Api.start_container(id, state.conn),
372469
{:ok, container} <- Api.get_container(id, state.conn),
373470
:ok <- ContainerBuilder.after_start(config_builder, container, state.conn),
374471
:ok <- wait_for_container(container, config.wait_strategies || [], state.conn) do
375472
{:ok, container}
473+
else
474+
error ->
475+
Logger.info("Cleaning up container #{id} after failed start")
476+
Api.stop_container(id, state.conn)
477+
error
376478
end
377479
end
378480

lib/wait_strategy/http_wait_strategy.ex

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ defmodule Testcontainers.HttpWaitStrategy do
33
Considers the container as ready when a http request is successful.
44
"""
55

6-
@timeout 5000
7-
@max_retries 3
6+
@timeout 60_000
7+
@max_retries 10
88

99
@typedoc """
1010
The HttpWaitStrategy struct
@@ -83,7 +83,7 @@ defmodule Testcontainers.HttpWaitStrategy do
8383
headers: wait_strategy.headers
8484
)
8585

86-
with response <- validate_response(raw_response),
86+
with {:ok, response} <- validate_response(raw_response),
8787
:ok <- verify_status_code(wait_strategy, response),
8888
:ok <- verify_match(wait_strategy, response) do
8989
:ok
@@ -95,7 +95,7 @@ defmodule Testcontainers.HttpWaitStrategy do
9595

9696
# Response evaluation
9797

98-
defp validate_response({:ok, response}), do: response
98+
defp validate_response({:ok, response}), do: {:ok, response}
9999
defp validate_response({:error, reason}), do: {:error, reason}
100100

101101
defp verify_status_code(wait_strategy, %{status: status_code})
@@ -130,11 +130,12 @@ defmodule Testcontainers.HttpWaitStrategy do
130130
{Tesla.Middleware.BaseUrl, base_url: base_url},
131131
{Tesla.Middleware.Timeout, timeout: request_timeout},
132132
{Tesla.Middleware.Retry,
133-
delay: 1,
133+
delay: 500,
134134
max_retries: wait_strategy.max_retries,
135-
max_delay: 10,
135+
max_delay: 5_000,
136136
should_retry: fn
137-
_, _env, _context -> true
137+
{:ok, _response}, _env, _context -> false
138+
{:error, _reason}, _env, _context -> true
138139
end}
139140
])
140141
end

0 commit comments

Comments
 (0)