Skip to content

Commit 01efc95

Browse files
authored
Fix recording shutdown for node docker (#3106)
1 parent 4f94c02 commit 01efc95

File tree

10 files changed

+189
-60
lines changed

10 files changed

+189
-60
lines changed

Makefile

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -980,7 +980,7 @@ test_parallel: hub chrome firefox edge chromium video
980980
cd ./tests || true ; \
981981
echo TAG=$(TAG_VERSION) > .env ; \
982982
echo VIDEO_TAG=$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) >> .env ; \
983-
echo TEST_DELAY_AFTER_TEST=$(or $(TEST_DELAY_AFTER_TEST), 2) >> .env ; \
983+
echo TEST_DELAY_AFTER_TEST=$(or $(TEST_DELAY_AFTER_TEST), 0) >> .env ; \
984984
echo TEST_DRAIN_AFTER_SESSION_COUNT=$(or $(TEST_DRAIN_AFTER_SESSION_COUNT), 2) >> .env ; \
985985
echo TEST_PARALLEL_HARDENING=$(or $(TEST_PARALLEL_HARDENING), "true") >> .env ; \
986986
echo TEST_PARALLEL_COUNT=$(or $(TEST_PARALLEL_COUNT), 5) >> .env ; \
@@ -1007,10 +1007,10 @@ test_parallel: hub chrome firefox edge chromium video
10071007
make test_video_integrity
10081008

10091009
test_video_standalone: standalone_chrome standalone_chromium standalone_firefox standalone_edge
1010-
DOCKER_COMPOSE_FILE=docker-compose-v3-test-standalone.yml TEST_DELAY_AFTER_TEST=2 HUB_CHECKS_INTERVAL=45 make test_video
1010+
DOCKER_COMPOSE_FILE=docker-compose-v3-test-standalone.yml TEST_DELAY_AFTER_TEST=0 HUB_CHECKS_INTERVAL=45 make test_video
10111011

10121012
test_video_dynamic_name:
1013-
VIDEO_FILE_NAME=auto TEST_DELAY_AFTER_TEST=2 HUB_CHECKS_INTERVAL=45 TEST_ADD_CAPS_RECORD_VIDEO=false \
1013+
VIDEO_FILE_NAME=auto TEST_DELAY_AFTER_TEST=0 HUB_CHECKS_INTERVAL=45 TEST_ADD_CAPS_RECORD_VIDEO=false \
10141014
make test_video
10151015

10161016
# This should run on its own CI job. There is no need to combine it with the other tests.
@@ -1039,7 +1039,7 @@ test_video: video hub chrome firefox edge chromium
10391039
echo UID=$$(id -u) >> .env ; \
10401040
echo BINDING_VERSION=$(BINDING_VERSION) >> .env ; \
10411041
echo BASE_VERSION=$(BASE_VERSION) >> .env ; \
1042-
echo TEST_DELAY_AFTER_TEST=$(or $(TEST_DELAY_AFTER_TEST), 2) >> .env ; \
1042+
echo TEST_DELAY_AFTER_TEST=$(or $(TEST_DELAY_AFTER_TEST), 0) >> .env ; \
10431043
echo HUB_CHECKS_INTERVAL=$(or $(HUB_CHECKS_INTERVAL), 45) >> .env ; \
10441044
echo SELENIUM_ENABLE_MANAGED_DOWNLOADS=$(or $(SELENIUM_ENABLE_MANAGED_DOWNLOADS), "true") >> .env ; \
10451045
echo TEST_FIREFOX_INSTALL_LANG_PACKAGE=$${TEST_FIREFOX_INSTALL_LANG_PACKAGE} >> .env ; \
@@ -1161,7 +1161,7 @@ test_node_docker: hub standalone_docker standalone_chrome standalone_firefox sta
11611161
echo LOG_LEVEL=$(or $(LOG_LEVEL), "INFO") >> .env ; \
11621162
echo REQUEST_TIMEOUT=$(or $(REQUEST_TIMEOUT), 300) >> .env ; \
11631163
echo SELENIUM_ENABLE_MANAGED_DOWNLOADS=$(or $(SELENIUM_ENABLE_MANAGED_DOWNLOADS), "false") >> .env ; \
1164-
echo TEST_DELAY_AFTER_TEST=$(or $(TEST_DELAY_AFTER_TEST), 2) >> .env ; \
1164+
echo TEST_DELAY_AFTER_TEST=$(or $(TEST_DELAY_AFTER_TEST), 0) >> .env ; \
11651165
echo RECORD_STANDALONE=$(or $(RECORD_STANDALONE), "true") >> .env ; \
11661166
echo GRID_URL=$(or $(GRID_URL), "") >> .env ; \
11671167
echo HUB_CHECKS_INTERVAL=$(or $(HUB_CHECKS_INTERVAL), 20) >> .env ; \
@@ -1170,6 +1170,7 @@ test_node_docker: hub standalone_docker standalone_chrome standalone_firefox sta
11701170
echo UID=$$(id -u) >> .env ; \
11711171
echo BINDING_VERSION=$(BINDING_VERSION) >> .env ; \
11721172
echo BASE_VERSION=$(BASE_VERSION) >> .env ; \
1173+
echo VIDEO_EVENT_DRIVEN=$(or $(VIDEO_EVENT_DRIVEN), "true") >> .env ; \
11731174
if [ "$$(uname)" != "Darwin" ]; then \
11741175
echo HOST_IP=$$(hostname -I | awk '{print $$1}') >> .env ; \
11751176
else \
@@ -1238,7 +1239,7 @@ test_video_integrity:
12381239
fi; \
12391240
for file in $$list_files; do \
12401241
echo "Checking video file: $$file"; \
1241-
docker run -u $$(id -u) -v $$(pwd):$$(pwd) -w $$(pwd) --entrypoint="" $(NAME)/video:$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) ffmpeg -v error -i "$$file" -f null - ; \
1242+
docker run --rm -u $$(id -u) -v $$(pwd):$$(pwd) -w $$(pwd) --entrypoint="" $(NAME)/video:$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) ffmpeg -v error -i "$$file" -f null - ; \
12421243
if [ $$? -ne 0 ]; then \
12431244
echo "Video file $$file is corrupted"; \
12441245
number_corrupted_files=$$((number_corrupted_files+1)); \
@@ -1275,7 +1276,7 @@ chart_test_autoscaling_deployment:
12751276
PLATFORMS=$(PLATFORMS) TEST_EXISTING_KEDA=true RELEASE_NAME=selenium CHART_ENABLE_TRACING=true TEST_PATCHED_KEDA=$(TEST_PATCHED_KEDA) AUTOSCALING_COOLDOWN_PERIOD=30 \
12761277
TRACING_EXPORTER_ENDPOINT=$(TRACING_EXPORTER_ENDPOINT) TEST_CUSTOM_SPECIFIC_NAME=true \
12771278
SECURE_CONNECTION_SERVER=true SECURE_USE_EXTERNAL_CERT=true SERVICE_TYPE_NODEPORT=true SELENIUM_GRID_PROTOCOL=https SELENIUM_GRID_HOST=$$(hostname -I | cut -d' ' -f1) SELENIUM_GRID_PORT=31444 \
1278-
SELENIUM_GRID_AUTOSCALING_MIN_REPLICA=1 SET_MAX_REPLICAS=3 TEST_DELAY_AFTER_TEST=2 TEST_NODE_DRAIN_AFTER_SESSION_COUNT=3 SELENIUM_GRID_MONITORING=false \
1279+
SELENIUM_GRID_AUTOSCALING_MIN_REPLICA=1 SET_MAX_REPLICAS=3 TEST_DELAY_AFTER_TEST=0 TEST_NODE_DRAIN_AFTER_SESSION_COUNT=3 SELENIUM_GRID_MONITORING=false \
12791280
VERSION=$(TAG_VERSION) VIDEO_TAG=$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) KEDA_BASED_NAME=$(KEDA_BASED_NAME) KEDA_BASED_TAG=$(KEDA_BASED_TAG) NAMESPACE=$(NAMESPACE) BINDING_VERSION=$(BINDING_VERSION) BASE_VERSION=$(BASE_VERSION) \
12801281
TEMPLATE_OUTPUT_FILENAME="k8s_prefixSelenium_enableTracing_secureServer_externalCerts_nodePort_autoScaling_scaledObject_existingKEDA_subPath.yaml" \
12811282
./tests/charts/make/chart_test.sh DeploymentAutoscaling

StandaloneDocker/Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,6 @@ ENV SE_SESSION_REQUEST_TIMEOUT="300" \
2121
# Boolean value, maps "--relax-checks"
2222
SE_RELAX_CHECKS="true" \
2323
SE_OTEL_SERVICE_NAME="selenium-standalone-docker" \
24-
SE_NODE_ENABLE_MANAGED_DOWNLOADS="true"
24+
SE_NODE_ENABLE_MANAGED_DOWNLOADS="true" \
25+
SE_BIND_BUS="true" \
26+
SE_EVENT_BUS_IMPLEMENTATION=""

StandaloneDocker/start-selenium-grid-docker.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,19 @@ if [ ! -z "${SE_EVENT_BUS_HEARTBEAT_PERIOD}" ]; then
108108
append_se_opts "--eventbus-heartbeat-period" "${SE_EVENT_BUS_HEARTBEAT_PERIOD}"
109109
fi
110110

111+
if [ ! -z "${SE_EVENT_BUS_IMPLEMENTATION}" ]; then
112+
append_se_opts "--events-implementation" "${SE_EVENT_BUS_IMPLEMENTATION}"
113+
fi
114+
115+
if [ "${SE_BIND_BUS}" = "true" ]; then
116+
append_se_opts "--bind-bus" "${SE_BIND_BUS}"
117+
append_se_opts "--publish-events" "tcp://*:${SE_EVENT_BUS_PUBLISH_PORT}"
118+
append_se_opts "--subscribe-events" "tcp://*:${SE_EVENT_BUS_SUBSCRIBE_PORT}"
119+
if [ -z "${SE_EVENT_BUS_IMPLEMENTATION}" ]; then
120+
append_se_opts "--events-implementation" "org.openqa.selenium.events.zeromq.ZeroMqEventBus"
121+
fi
122+
fi
123+
111124
if [ "${SE_ENABLE_TLS}" = "true" ]; then
112125
# Configure truststore for the server
113126
if [ ! -z "$SE_JAVA_SSL_TRUST_STORE" ]; then

Video/recorder.conf

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ autostart=%(ENV_SE_RECORD_VIDEO)s
66
startsecs=0
77
autorestart=%(ENV_SE_RECORD_VIDEO)s
88
stopsignal=TERM
9-
stopwaitsecs=60
9+
stopwaitsecs=30
1010

1111
;Logs (all activity redirected to stdout so it can be seen through "docker logs"
1212
redirect_stderr=true
@@ -20,7 +20,8 @@ killasgroup=true
2020
autostart=%(ENV_SE_RECORD_VIDEO)s
2121
startsecs=0
2222
autorestart=%(ENV_SE_RECORD_VIDEO)s
23-
stopsignal=KILL
23+
stopsignal=TERM
24+
stopwaitsecs=5
2425

2526
;Logs (all activity redirected to stdout so it can be seen through "docker logs"
2627
redirect_stderr=true

Video/video.sh

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,15 @@ function graceful_exit() {
212212
wait_util_uploader_shutdown
213213
}
214214

215+
_graceful_exit_done=false
215216
function graceful_exit_force() {
217+
if [[ "$_graceful_exit_done" = "true" ]]; then
218+
return
219+
fi
220+
_graceful_exit_done=true
216221
graceful_exit
217-
kill -SIGTERM "$(cat ${SE_SUPERVISORD_PID_FILE})" 2>/dev/null
222+
# Supervisord signaling is delegated to the Python controller (video_recorder.py)
223+
# which handles it uniformly for both shell and event-driven modes.
218224
echo "$(date -u +"${ts_format}") [${process_name}] - Ready to shutdown the recorder"
219225
exit 0
220226
}
@@ -234,7 +240,7 @@ if [[ "${VIDEO_UPLOAD_ENABLED}" != "true" ]] && [[ "${VIDEO_FILE_NAME}" != "auto
234240
-probesize 32M -analyzeduration 0 -y -f x11grab -video_size ${VIDEO_SIZE} -r ${FRAME_RATE} \
235241
-i ${DISPLAY} ${SE_AUDIO_SOURCE} -codec:v ${CODEC} ${PRESET:-"-preset veryfast"} \
236242
-tune zerolatency -crf ${SE_VIDEO_CRF:-28} -maxrate ${SE_VIDEO_MAXRATE:-1000k} -bufsize ${SE_VIDEO_BUFSIZE:-2000k} \
237-
-pix_fmt yuv420p -movflags +faststart "$video_file" &
243+
-pix_fmt yuv420p -movflags frag_keyframe+empty_moov+default_base_moof "$video_file" &
238244
FFMPEG_PID=$!
239245
if ps -p $FFMPEG_PID >/dev/null; then
240246
wait $FFMPEG_PID
@@ -270,7 +276,7 @@ else
270276
-probesize 32M -analyzeduration 0 -y -f x11grab -video_size ${VIDEO_SIZE} -r ${FRAME_RATE} \
271277
-i ${DISPLAY} ${SE_AUDIO_SOURCE} -codec:v ${CODEC} ${PRESET:-"-preset veryfast"} \
272278
-tune zerolatency -crf ${SE_VIDEO_CRF:-28} -maxrate ${SE_VIDEO_MAXRATE:-1000k} -bufsize ${SE_VIDEO_BUFSIZE:-2000k} \
273-
-pix_fmt yuv420p -movflags +faststart "$video_file" &
279+
-pix_fmt yuv420p -movflags frag_keyframe+empty_moov+default_base_moof "$video_file" &
274280
FFMPEG_PID=$!
275281
if ps -p $FFMPEG_PID >/dev/null; then
276282
recording_started="true"

Video/video_ready.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ def do_GET(self):
2828

2929
def graceful_shutdown(signum, frame):
3030
print("Trapped SIGTERM/SIGINT/x so shutting down video-ready...")
31-
httpd.shutdown()
31+
# httpd.shutdown() must be called from a different thread than serve_forever()
32+
# or it deadlocks. video-ready has no state to drain, so close the socket
33+
# and exit directly — supervisord will see the clean exit immediately.
34+
httpd.server_close()
3235
sys.exit(0)
3336

3437

Video/video_recorder.py

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@
66
77
When event-driven mode is enabled, this launches a single unified service
88
that handles both recording and uploading with shared state management.
9+
10+
After the video service exits for any reason (normal drain, session end, or
11+
supervisord-initiated shutdown), this controller signals supervisord so the
12+
container shuts down. Centralising this here means both shell and event-driven
13+
modes have identical container-lifecycle behaviour without video.sh needing to
14+
know about supervisord.
915
"""
1016

1117
import os
@@ -14,12 +20,47 @@
1420
import sys
1521

1622

23+
def _signal_supervisord() -> None:
24+
"""Signal supervisord to initiate a container-wide shutdown.
25+
26+
Safe to call even when supervisord is already shutting down — it will
27+
simply ignore a repeated SIGTERM if it is already in SHUTDOWN state.
28+
"""
29+
pid_file = os.environ.get("SE_SUPERVISORD_PID_FILE", "")
30+
if not pid_file:
31+
return
32+
try:
33+
with open(pid_file) as f:
34+
pid = int(f.read().strip())
35+
os.kill(pid, signal.SIGTERM)
36+
print("[video.recorder] - Signaled supervisord to shut down")
37+
except (OSError, ValueError, FileNotFoundError):
38+
pass
39+
40+
1741
def main():
1842
event_driven = os.environ.get("SE_VIDEO_EVENT_DRIVEN", "false").lower() == "true"
1943

2044
if event_driven:
2145
print("Starting unified event-driven video service...")
2246
print("This service handles both recording and uploading with shared state.")
47+
48+
# Capture whether shutdown was externally initiated (SIGTERM/SIGINT)
49+
# before asyncio.run() replaces the signal handlers via add_signal_handler.
50+
_external_shutdown = [False]
51+
52+
def _mark_external_shutdown(signum, frame):
53+
_external_shutdown[0] = True
54+
# This handler is only reachable before asyncio.run() installs its
55+
# own handlers via loop.add_signal_handler(). Setting the flag and
56+
# returning would swallow the signal — nothing would act on it and
57+
# the process would hang inside asyncio.run() indefinitely.
58+
# Exit immediately so supervisord sees a clean stop.
59+
sys.exit(0)
60+
61+
signal.signal(signal.SIGTERM, _mark_external_shutdown)
62+
signal.signal(signal.SIGINT, _mark_external_shutdown)
63+
2364
try:
2465
import asyncio
2566

@@ -31,24 +72,46 @@ def main():
3172
print("Ensure pyzmq is installed: pip install pyzmq")
3273
print("Falling back to shell-based recording...")
3374
_run_shell_recorder()
75+
return
76+
77+
# Only trigger container shutdown for self-initiated exits (drain).
78+
if not _external_shutdown[0]:
79+
_signal_supervisord()
3480
else:
3581
print("Starting shell-based video recording...")
3682
_run_shell_recorder()
3783

3884

3985
def _run_shell_recorder():
4086
proc = subprocess.Popen(["/opt/bin/video.sh"])
87+
_external_shutdown = False # True when supervisord (or user) told us to stop
4188

4289
def forward_signal(signum, frame):
43-
try:
44-
proc.send_signal(signum)
45-
except ProcessLookupError:
46-
pass # Process already exited before signal was forwarded
90+
nonlocal _external_shutdown
91+
# Forward the signal to video.sh at most once. supervisord uses
92+
# killasgroup=true so video.sh already received the signal directly;
93+
# re-forwarding on every re-entrant call amplifies the SIGTERM
94+
# ping-pong and can keep the process alive for 60 s.
95+
if not _external_shutdown:
96+
_external_shutdown = True
97+
try:
98+
proc.send_signal(signum)
99+
except ProcessLookupError:
100+
pass # Process already exited before signal was forwarded
47101
proc.wait()
48102

49103
signal.signal(signal.SIGTERM, forward_signal)
50104
signal.signal(signal.SIGINT, forward_signal)
51105
rc = proc.wait()
106+
107+
# Signal supervisord only for self-initiated exits (drain, node gone).
108+
# If the shutdown came FROM supervisord (_external_shutdown=True) it is
109+
# already in SHUTDOWN state — signalling it again is a no-op at best and
110+
# confusing at worst. If the recorder crashed (rc != 0) we must not bring
111+
# down the Selenium process alongside it.
112+
if not _external_shutdown and rc == 0:
113+
_signal_supervisord()
114+
52115
if rc != 0:
53116
sys.exit(rc)
54117

0 commit comments

Comments
 (0)