-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdevmapper-pool-init
More file actions
executable file
·165 lines (144 loc) · 6.54 KB
/
devmapper-pool-init
File metadata and controls
executable file
·165 lines (144 loc) · 6.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/bin/busybox sh
# devmapper-pool-init — Talos extension service (ADR-038)
#
# Creates dm-thin-pool from r-dm-data raw partition,
# writes devmapper CRI config, restarts CRI containerd.
# Runs once at boot (restart: never).
# Abort on the first command that fails without being explicitly handled.
set -o errexit

# Name of the device-mapper thin pool this service manages.
POOL_NAME="containerd-pool"

# Raw partition (looked up by partition label) that backs the pool's data.
DM_PART="/dev/disk/by-partlabel/r-dm-data"

# Thin-pool metadata lives under /run (tmpfs) rather than /var.
# Rationale recorded by the original author: bind mounts from /var are
# snapshots taken when the extension container starts, and that early in boot
# /var is still the read-only squashfs base (the EPHEMERAL mount has not yet
# propagated into this container's mount namespace). /run is always writable.
# The metadata is therefore lost on reboot, which is acceptable: a fresh boot
# has no running containers, so the pool is rebuilt from scratch. Only the
# data partition (r-dm-data) is persistent.
META_DIR="/run/devmapper"
META_FILE="${META_DIR}/meta"

# Talos CRI reads its config from /etc/cri/conf.d/, which is an overlay
# (lower = read-only rootfs, upper = /var/system/overlays/etc-cri-diff/).
# A file written into the upper dir shows up in /etc/cri/conf.d/ when CRI
# reads it. The upper dir is under /var, which is mounted into this container.
CRI_CONF_DIR="/var/system/overlays/etc-cri-diff/conf.d"
CRI_CONF="${CRI_CONF_DIR}/20-devmapper.part"

# Mount point of the host's /proc inside this container.
HOST_PROC="/host-proc"

# Marker that must survive a reboot so the node is rebooted at most once.
# /var/log/ is used because (per the original notes) Talos writes service logs
# there from Phase 1, so it is writable very early, and it sits on the
# EPHEMERAL partition, which persists across reboots.
REBOOT_MARKER="/var/log/devmapper-rebooted"

# log MESSAGE... — print the arguments on stdout, joined by spaces and
# prefixed with "devmapper-pool: ".
log() {
  printf '%s\n' "devmapper-pool: $*"
}
# ── 1. Check if pool already exists ──
# Fast path: when the thin pool is already registered with device-mapper AND
# the CRI config file is present, there is nothing left to do and the script
# exits successfully. When only the pool exists, execution falls through so
# the later steps can (re)write the config.
if /sbin/dmsetup info "$POOL_NAME" >/dev/null 2>&1; then
  log "pool exists"
  # Guard clause: config already in place -> done.
  [ ! -f "$CRI_CONF" ] || {
    log "config exists — nothing to do"
    exit 0
  }
  # Pool is there but the config is not: continue to the config-writing step.
  log "pool exists but config missing — writing config"
fi
# ── 2. Create thin pool (if not exists) ──
# Builds the dm-thin pool "$POOL_NAME" from:
#   data device     = the block device behind $DM_PART
#   metadata device = a loop device backed by the sparse file $META_FILE
# Every failure is logged and ends the script with exit status 1.

# Print the first loop device whose `losetup -a` entry mentions $META_FILE
# (prints nothing when there is none). The path is matched as a fixed string,
# and only one device is returned even if the file is attached several times,
# so the result is always safe to embed in a dm table.
meta_loop_for_file() {
  /bin/busybox losetup -a 2>/dev/null \
    | /bin/busybox grep -F -- "$META_FILE" \
    | /bin/busybox cut -d: -f1 \
    | /bin/busybox head -n 1
}

if ! /sbin/dmsetup info "$POOL_NAME" >/dev/null 2>&1; then
  # Wait (up to 30s) for udev to create the by-partlabel symlink.
  # The partition may exist but udev hasn't scanned it yet (~1-2s after creation).
  WAIT=0
  while [ ! -b "$DM_PART" ] && [ "$WAIT" -lt 30 ]; do
    /bin/busybox sleep 1
    WAIT=$((WAIT + 1))
  done
  if [ ! -b "$DM_PART" ]; then
    log "ERROR: $DM_PART not found after ${WAIT}s"
    exit 1
  fi
  log "partition ready after ${WAIT}s"

  # Resolve the symlink to the real device node for use in the dm table.
  DATA_DEV=$(/bin/busybox readlink -f "$DM_PART")
  log "data: $DATA_DEV"

  # Metadata: 4 GiB sparse file under $META_DIR (dm-thin requires a separate
  # metadata device). $META_DIR is /run/devmapper, i.e. tmpfs, so the file is
  # recreated on every boot.
  /bin/busybox mkdir -p "$META_DIR" || { log "ERROR: mkdir $META_DIR failed"; exit 1; }
  log "meta dir ready: $META_DIR"
  if [ ! -f "$META_FILE" ]; then
    # count=0 + seek only sets the file length; no data blocks are written.
    /bin/busybox dd if=/dev/zero of="$META_FILE" bs=1 count=0 seek=4294967296 2>/dev/null \
      || { log "ERROR: cannot create meta file $META_FILE"; exit 1; }
    log "meta file created"
  fi

  # Reuse a loop device left attached by an earlier (failed) run in this boot;
  # otherwise attach the file to the first free loop device.
  META_LOOP=$(meta_loop_for_file)
  if [ -z "$META_LOOP" ]; then
    /bin/busybox losetup -f "$META_FILE" \
      || { log "ERROR: losetup failed for $META_FILE"; exit 1; }
    META_LOOP=$(meta_loop_for_file)
  fi
  if [ ! -b "$META_LOOP" ]; then
    log "ERROR: no loop device found for $META_FILE (got '$META_LOOP')"
    exit 1
  fi

  # Size of the data device in 512-byte sectors; must be a plain number.
  SECTORS=$(/bin/busybox blockdev --getsz "$DATA_DEV") \
    || { log "ERROR: blockdev --getsz $DATA_DEV failed"; exit 1; }
  case "$SECTORS" in
    '' | *[!0-9]*)
      log "ERROR: invalid sector count '$SECTORS' for $DATA_DEV"
      exit 1
      ;;
  esac

  log "creating pool: data=$DATA_DEV sectors=$SECTORS meta=$META_LOOP"
  # thin-pool table: <meta dev> <data dev> <data block size, sectors>
  #                  <low water mark, blocks> <#features> <features...>
  /sbin/dmsetup create "$POOL_NAME" \
    --table "0 ${SECTORS} thin-pool ${META_LOOP} ${DATA_DEV} 512 32768 1 skip_block_zeroing" \
    || { log "ERROR: dmsetup create $POOL_NAME failed"; exit 1; }
  log "pool created"
fi
# ── 3. Write CRI devmapper config ──
# Writes the devmapper snapshotter + kata runtime settings to $CRI_CONF.
# The content goes to a temporary sibling file first and is then renamed into
# place, so an interrupted or failed write never leaves a truncated
# $CRI_CONF behind (step 1 treats any existing $CRI_CONF as a valid config).
if ! /bin/busybox mkdir -p "$CRI_CONF_DIR" 2>/dev/null; then
  # Best effort, as before: only give up when the directory really is missing.
  if [ ! -d "$CRI_CONF_DIR" ]; then
    log "ERROR: cannot create $CRI_CONF_DIR"
    exit 1
  fi
fi

# The temp name does not end in ".part".
# NOTE(review): presumably CRI only merges *.part files from conf.d, so the
# temp file would be ignored if it were ever left behind — confirm.
CRI_CONF_TMP="${CRI_CONF}.tmp.$$"

if ! /bin/busybox cat > "$CRI_CONF_TMP" << 'TOML'
[plugins."io.containerd.snapshotter.v1.devmapper"]
pool_name = "containerd-pool"
root_path = "/var/lib/containerd/devmapper"
base_image_size = "10GB"
discard_blocks = true
fs_type = "ext4"
async_remove = true
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata]
runtime_type = "io.containerd.kata.v2"
privileged_without_host_devices = true
pod_annotations = ["io.katacontainers.*"]
snapshotter = "devmapper"
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata.options]
ConfigPath = "/usr/local/share/kata-containers/configuration.toml"
TOML
then
  log "ERROR: cannot write $CRI_CONF_TMP"
  /bin/busybox rm -f "$CRI_CONF_TMP" 2>/dev/null || true
  exit 1
fi

# Rename within the same directory so the final file appears all at once.
if ! /bin/busybox mv -f "$CRI_CONF_TMP" "$CRI_CONF"; then
  log "ERROR: cannot move $CRI_CONF_TMP to $CRI_CONF"
  /bin/busybox rm -f "$CRI_CONF_TMP" 2>/dev/null || true
  exit 1
fi
log "CRI config written"
# ── 4. Restart CRI to load devmapper snapshotter ──
# Looks for an already-running CRI containerd in the host's /proc (mounted at
# $HOST_PROC). If none is found the script exits 0: the config is in place for
# whenever CRI starts. If one is found, the node is rebooted once (guarded by
# $REBOOT_MARKER) so CRI comes back up with the new config.

# Scan the numeric entries of the host /proc with a glob (no `ls` parsing).
# When nothing matches, the pattern stays literal, the cmdline read yields an
# empty string and the loop simply finds nothing.
CRI_PID=""
for proc_entry in "$HOST_PROC"/[0-9]*; do
  # cmdline is NUL-separated; turn the separators into spaces for matching.
  # A process that exited in the meantime just produces an empty string.
  cmdline=$(/bin/busybox cat "$proc_entry/cmdline" 2>/dev/null | /bin/busybox tr '\0' ' ')
  case "$cmdline" in
    *containerd*--config*/etc/cri*)
      CRI_PID="${proc_entry##*/}"
      break
      ;;
  esac
done

if [ -z "$CRI_PID" ]; then
  log "CRI not found — config ready for next CRI start"
  exit 0
fi
log "CRI already running (PID $CRI_PID) — needs restart to load devmapper plugin"

# Why reboot instead of kill?
# Per the original design notes, extension containers run in a child PID
# namespace of the host. From there the host's containerd cannot be signalled:
# host PIDs are not addressable from a child namespace, and setns(2) refuses
# to join an ancestor PID namespace (only the current one or a descendant is
# allowed). This is kernel design, not a tool limitation.
#
# reboot(2) is different: it acts on the whole system and only needs
# CAP_SYS_BOOT (which, per the original notes, extensions hold via
# AllGrantableCapabilities).
#
# Strategy: on the first run, after creating the pool and writing the config,
# reboot the node so CRI starts with the config in place. The marker file
# prevents a reboot loop.
# NOTE(review): the pool is device-mapper state and its metadata lives in
# /run (tmpfs), so it is recreated by step 2 on every boot rather than
# surviving the reboot — confirm CRI copes if it starts before step 2 is done.
if [ -f "$REBOOT_MARKER" ]; then
  log "reboot marker exists — this is post-reboot run, devmapper should be loaded"
  log "extension finished (no reboot needed)"
  exit 0
fi

log "writing reboot marker + triggering node reboot to reload CRI with devmapper"
# Without a persisted marker a reboot would repeat on every boot, so refuse to
# reboot when the marker cannot be written.
if ! /bin/busybox touch "$REBOOT_MARKER"; then
  log "ERROR: cannot write reboot marker $REBOOT_MARKER — not rebooting"
  exit 1
fi
# Flush the marker to disk; the forced reboots below do not sync.
/bin/busybox sync

# Try whatever `reboot` command is on PATH first.
if command -v reboot >/dev/null 2>&1; then
  log "calling reboot command"
  reboot -f 2>&1 || true
fi

# Fallback: sysrq reboot via host /proc (requires /host-proc mounted rw).
# The braces make 2>/dev/null also cover the shell's own "cannot open" error.
log "fallback: sysrq-trigger reboot"
{ echo 1 > "$HOST_PROC/sys/kernel/sysrq"; } 2>/dev/null || true
{ echo b > "$HOST_PROC/sysrq-trigger"; } 2>/dev/null || true

# Fallback: busybox reboot
/bin/busybox reboot -f 2>&1 || true

# If we get here, reboot failed
log "ERROR: all reboot methods failed — manual intervention required"
log "Run: talosctl -n <node-ip> reboot"
exit 1