Skip to content

Commit 0102e55

Browse files
fix(android): tighten VPN session lifecycle reliability
1 parent fb552c2 commit 0102e55

3 files changed

Lines changed: 108 additions & 39 deletions

File tree

android/app/src/main/java/com/therealaleph/mhrv/MhrvApp.kt

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,18 @@ class MhrvApp : Application() {
4242
)
4343
val previous = Thread.getDefaultUncaughtExceptionHandler()
4444
Thread.setDefaultUncaughtExceptionHandler { thread, throwable ->
45-
Log.e(
46-
CRASH_TAG,
47-
"uncaught on thread=${thread.name} (id=${thread.id}): ${throwable.message}",
48-
throwable,
49-
)
45+
// Log.e itself can throw on extreme conditions (logd dead,
46+
// OOM allocating the formatted message). If we let that
47+
// bubble up, we'd recurse into our own handler with a
48+
// half-handled original exception; swallow it so the
49+
// previous handler still fires with the real failure.
50+
try {
51+
Log.e(
52+
CRASH_TAG,
53+
"uncaught on thread=${thread.name} (id=${thread.id}): ${throwable.message}",
54+
throwable,
55+
)
56+
} catch (_: Throwable) { }
5057
// Let the default handler still terminate the process and
5158
// show the system "app closed" dialog — we just wanted to
5259
// get a log line out the door first.

android/app/src/main/java/com/therealaleph/mhrv/MhrvVpnService.kt

Lines changed: 65 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -242,13 +242,14 @@ class MhrvVpnService : VpnService() {
242242
tun = parcelFd
243243

244244
// 3) Start tun2proxy on a worker thread. It blocks until stop() or
245-
// shutdown. We detach the fd so ownership transfers cleanly; the
246-
// ParcelFileDescriptor (`tun`) still holds a reference, so closing
247-
// it at teardown reliably tears down the TUN even if tun2proxy
248-
// doesn't cleanly exit.
245+
// shutdown. We detach the fd so ownership transfers cleanly to
246+
// tun2proxy (closeFdOnDrop = true closes it on return from run()).
247+
// The ParcelFileDescriptor (`tun`) we keep is post-detach — its
248+
// own close() is a no-op for the underlying fd, so the worker is
249+
// the sole owner once it's running.
249250
val detachedFd = parcelFd.detachFd()
250251
tun2proxyRunning.set(true)
251-
tun2proxyThread = Thread({
252+
val worker = Thread({
252253
try {
253254
val rc = Tun2proxy.run(
254255
"socks5://127.0.0.1:$socks5Port",
@@ -264,7 +265,29 @@ class MhrvVpnService : VpnService() {
264265
} finally {
265266
tun2proxyRunning.set(false)
266267
}
267-
}, "tun2proxy").apply { start() }
268+
}, "tun2proxy")
269+
try {
270+
worker.start()
271+
tun2proxyThread = worker
272+
} catch (t: Throwable) {
273+
// Thread.start can throw OutOfMemoryError under extreme memory
274+
// pressure. The fd we just detached has no owner — without an
275+
// explicit close it leaks for the life of the process. Adopt
276+
// it into a fresh ParcelFileDescriptor purely so we can call
277+
// close() on it.
278+
Log.e(TAG, "tun2proxy thread start failed: ${t.message}", t)
279+
tun2proxyRunning.set(false)
280+
try {
281+
ParcelFileDescriptor.adoptFd(detachedFd).close()
282+
} catch (closeErr: Throwable) {
283+
Log.w(TAG, "adoptFd($detachedFd).close failed: ${closeErr.message}")
284+
}
285+
Native.stopProxy(proxyHandle)
286+
proxyHandle = 0L
287+
try { stopForeground(STOP_FOREGROUND_REMOVE) } catch (_: Throwable) {}
288+
stopSelf()
289+
return
290+
}
268291

269292
// (startForeground was already called at the top of this method
270293
// to satisfy Android 8+'s foreground-service contract — see the
@@ -291,12 +314,23 @@ class MhrvVpnService : VpnService() {
291314
* tun2proxy still forwarding packets into a half-dead Rust runtime
292315
* while the runtime is force-aborting its tasks — that's the scenario
293316
* that manifested as "Stop crashes the app" when there were in-flight
294-
* relay requests piled up against a dead Apps Script deployment. The
295-
* correct order is:
296-
* 1. Signal tun2proxy to stop (cooperative).
297-
* 2. Close the TUN fd — forces tun2proxy's read() to return EBADF.
298-
* 3. Join the tun2proxy thread (now it really will exit).
299-
* 4. Shut down the Rust proxy runtime (nothing left to forward to).
317+
* relay requests piled up against a dead Apps Script deployment.
318+
*
319+
* Steps, with the bound on each one called out so a hung native call
320+
* cannot stall the whole teardown thread:
321+
* 1. Signal tun2proxy to stop (cooperative). Bounded by a 2s
322+
* side-thread join — if the JNI call hangs we proceed anyway.
323+
* 2. Drop our `ParcelFileDescriptor` reference. Because we already
324+
* called detachFd() at startup, this is a no-op for the
325+
* underlying fd — the worker (closeFdOnDrop=true) owns it.
326+
* We keep the call only so the PROXY_ONLY / failed-establish
327+
* paths still null out the field cleanly.
328+
* 3. Join the tun2proxy thread, bounded at 4s. If the worker is
329+
* stuck we log and move on — the runtime shutdown below will
330+
* knock the rest of the world over.
331+
* 4. Shut down the Rust proxy runtime, bounded by `rt.shutdown_timeout`
332+
* on the Rust side (5s). This is the hard backstop: the listener
333+
* socket is released here regardless of what the worker is doing.
300334
*/
301335
private fun teardown() {
302336
// Idempotency guard. Without this, onDestroy racing the
@@ -315,17 +349,29 @@ class MhrvVpnService : VpnService() {
315349
"(tun2proxy running=${tun2proxyRunning.get()}, proxyHandle=$proxyHandle)",
316350
)
317351

318-
// 1. Cooperative stop signal.
352+
// 1. Cooperative stop signal — bounded so a hung Rust call cannot
353+
// stall the entire teardown thread. We've never observed
354+
// Tun2proxy.stop() block in practice, but the contract isn't
355+
// documented as bounded and the rest of teardown already takes
356+
// care to be timeout-bounded; this closes the gap.
319357
if (tun2proxyRunning.get()) {
320-
try { Tun2proxy.stop() } catch (t: Throwable) {
321-
Log.w(TAG, "Tun2proxy.stop: ${t.message}")
358+
val stopper = Thread({
359+
try { Tun2proxy.stop() } catch (t: Throwable) {
360+
Log.w(TAG, "Tun2proxy.stop: ${t.message}")
361+
}
362+
}, "mhrv-tun2proxy-stop").apply { start() }
363+
try { stopper.join(2_000) } catch (_: InterruptedException) {}
364+
if (stopper.isAlive) {
365+
Log.w(TAG, "Tun2proxy.stop did not return within 2s — proceeding")
322366
}
323367
}
324368

325-
// 2. Close the TUN fd. Since we called detachFd earlier the
326-
// ParcelFileDescriptor no longer owns the fd and close() here
327-
// is a no-op; the real fd is owned by tun2proxy (closeFdOnDrop
328-
// = true), which closes it on return from run().
369+
// 2. Drop our PFD reference. detachFd at startup means this
370+
// close() is a no-op for the underlying fd — tun2proxy owns
371+
// it (closeFdOnDrop = true) and closes it on return from
372+
// run(). The call is kept only to null the field cleanly on
373+
// paths that never reached detachFd (PROXY_ONLY, or an
374+
// establish() that failed mid-builder).
329375
try { tun?.close() } catch (t: Throwable) {
330376
Log.w(TAG, "tun.close: ${t.message}")
331377
}

android/app/src/main/java/com/therealaleph/mhrv/ui/HomeScreen.kt

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@ import com.therealaleph.mhrv.ui.theme.ErrRed
4646
import com.therealaleph.mhrv.ui.theme.OkGreen
4747
import kotlinx.coroutines.Dispatchers
4848
import kotlinx.coroutines.delay
49+
import kotlinx.coroutines.flow.first
4950
import kotlinx.coroutines.launch
5051
import kotlinx.coroutines.withContext
52+
import kotlinx.coroutines.withTimeoutOrNull
5153
import org.json.JSONObject
5254

5355
/**
@@ -122,18 +124,31 @@ fun HomeScreen(
122124
}
123125
}
124126

125-
// Cooldown on Start/Stop. Rapid taps during a VPN transition trigger
126-
// an emulator-specific EGL renderer crash
127-
// (F OpenGLRenderer: EGL_NOT_INITIALIZED during rendering) — the
128-
// service survives, but the Compose UI process dies and the app
129-
// appears to close. On real hardware this is rare, but debouncing
130-
// is useful UX anyway: neither start nor stop is truly instant,
131-
// and the user gets no feedback if they tap while one is in flight.
132-
var transitionCooldown by remember { mutableStateOf(false) }
133-
LaunchedEffect(transitionCooldown) {
134-
if (transitionCooldown) {
135-
delay(2000)
136-
transitionCooldown = false
127+
// Gate Start/Stop on the service's actual state transition rather
128+
// than a fixed timer. The previous 2s cooldown was shorter than the
129+
// worst-case teardown (Tun2proxy.stop + 4s join + 5s rt.shutdown_timeout
130+
// ≈ 9s on the slowest path), which let the user fire a fresh Connect
131+
// while the previous Stop's native cleanup was still releasing the
132+
// listener port — the new startProxy then failed with "Address already
133+
// in use".
134+
//
135+
// `awaitingRunning` holds the value we expect VpnState.isRunning to
136+
// settle on after the user's action; null means "no transition in
137+
// flight". The LaunchedEffect below suspends on the StateFlow until
138+
// the predicate matches, with a 12s backstop in case the service
139+
// failed before flipping the flag (e.g., establish() returned null).
140+
// Side benefit: this also debounces the rapid-tap EGL renderer crash
141+
// the old timer was guarding against.
142+
var awaitingRunning by remember { mutableStateOf<Boolean?>(null) }
143+
val transitioning = awaitingRunning != null
144+
LaunchedEffect(awaitingRunning) {
145+
val target = awaitingRunning ?: return@LaunchedEffect
146+
try {
147+
withTimeoutOrNull(12_000) {
148+
VpnState.isRunning.first { it == target }
149+
}
150+
} finally {
151+
awaitingRunning = null
137152
}
138153
}
139154

@@ -373,10 +388,11 @@ fun HomeScreen(
373388
val isVpnRunning by VpnState.isRunning.collectAsState()
374389
Button(
375390
onClick = {
376-
transitionCooldown = true
377391
if (isVpnRunning) {
392+
awaitingRunning = false
378393
onStop()
379394
} else {
395+
awaitingRunning = true
380396
// Connect flow: auto-resolve google_ip so we don't
381397
// hand the proxy a stale anycast target; repair
382398
// front_domain if it got corrupted into an IP
@@ -418,7 +434,7 @@ fun HomeScreen(
418434
},
419435
enabled = (isVpnRunning ||
420436
cfg.mode == Mode.GOOGLE_ONLY ||
421-
(cfg.hasDeploymentId && cfg.authKey.isNotBlank())) && !transitionCooldown,
437+
(cfg.hasDeploymentId && cfg.authKey.isNotBlank())) && !transitioning,
422438
colors = ButtonDefaults.buttonColors(
423439
containerColor = if (isVpnRunning) ErrRed else OkGreen,
424440
contentColor = androidx.compose.ui.graphics.Color.White,
@@ -430,7 +446,7 @@ fun HomeScreen(
430446
) {
431447
Text(
432448
when {
433-
transitionCooldown -> ""
449+
transitioning -> ""
434450
isVpnRunning -> stringResource(R.string.btn_disconnect)
435451
else -> stringResource(R.string.btn_connect)
436452
},

0 commit comments

Comments
 (0)