Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 89 additions & 60 deletions android/app/src/main/java/com/fleetdm/agent/ApiClient.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@ import androidx.datastore.preferences.preferencesDataStore
import java.math.BigInteger
import java.net.HttpURLConnection
import java.net.URL
import java.net.UnknownHostException
import java.util.Date
import kotlinx.coroutines.CancellationException
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.delay
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.first
import kotlinx.coroutines.flow.map
Expand Down Expand Up @@ -59,6 +62,13 @@ interface CertificateApiClient {

object ApiClient : CertificateApiClient {
private const val TAG = "fleet-ApiClient"

// Retry DNS resolution failures that occur when Android wakes from Doze mode.
// The active network may be reported as connected before its DNS servers are fully operational.
// Uses exponential backoff: 1s, 2s, 4s, 8s, 16s, 32s, 64s between attempts (127s total retry window, 8 attempts).
private const val DNS_MAX_RETRIES = 7
private const val DNS_INITIAL_RETRY_DELAY_MS = 1000L

Comment thread
getvictor marked this conversation as resolved.
Comment thread
getvictor marked this conversation as resolved.
private val json = Json { ignoreUnknownKeys = true }

private lateinit var dataStore: DataStore<Preferences>
Expand Down Expand Up @@ -118,78 +128,97 @@ object ApiClient : CertificateApiClient {
): Result<T> = withContext(Dispatchers.IO) {
require(method != "GET" || body == null) { "GET requests must not include a body" }

var connection: HttpURLConnection? = null
val baseUrl = getBaseUrl() ?: return@withContext Result.failure(
Exception("Base URL not configured"),
)

// Validate base URL format and scheme
try {
val baseUrl = getBaseUrl() ?: return@withContext Result.failure(
Exception("Base URL not configured"),
val parsedUrl = URL(baseUrl)
if (parsedUrl.protocol !in listOf("https", "http")) {
return@withContext Result.failure(
Exception("Base URL must use HTTP or HTTPS scheme"),
)
}
} catch (e: Exception) {
return@withContext Result.failure(
Exception("Invalid base URL format: ${e.message}"),
)
}

// Validate base URL format and scheme
val url = URL("$baseUrl$endpoint")

for (dnsAttempt in 0..DNS_MAX_RETRIES) {
var connection: HttpURLConnection? = null
try {
val parsedUrl = URL(baseUrl)
if (parsedUrl.protocol !in listOf("https", "http")) {
return@withContext Result.failure(
Exception("Base URL must use HTTP or HTTPS scheme"),
)
if (dnsAttempt > 0) {
val delayMs = DNS_INITIAL_RETRY_DELAY_MS shl (dnsAttempt - 1)
Log.w(TAG, "retrying $method $endpoint after DNS failure (attempt ${dnsAttempt + 1}, delay ${delayMs}ms)")
delay(delayMs)
}
} catch (e: Exception) {
return@withContext Result.failure(
Exception("Invalid base URL format: ${e.message}"),
)
}

val url = URL("$baseUrl$endpoint")
connection = openConnectionOnActiveNetwork(url)

connection.apply {
requestMethod = method
useCaches = false
doInput = true
if (authorized) {
getNodeKeyOrEnroll().fold(
onFailure = { throwable -> return@withContext Result.failure(throwable) },
onSuccess = { nodeKey ->
setRequestProperty("Authorization", "Node key $nodeKey")
},
)
connection = openConnectionOnActiveNetwork(url)
Comment thread
getvictor marked this conversation as resolved.

connection.apply {
requestMethod = method
useCaches = false
doInput = true
if (authorized) {
getNodeKeyOrEnroll().fold(
onFailure = { throwable -> return@withContext Result.failure(throwable) },
onSuccess = { nodeKey ->
setRequestProperty("Authorization", "Node key $nodeKey")
},
)
}
connectTimeout = 15000
readTimeout = 15000

if (body != null) {
requireNotNull(bodySerializer) { "bodySerializer required when body is provided" }
setRequestProperty("Content-Type", "application/json")
doOutput = true
val bodyJson = json.encodeToString(value = body, serializer = bodySerializer)
outputStream.use { it.write(bodyJson.toByteArray()) }
}
}
connectTimeout = 15000
readTimeout = 15000

if (body != null) {
requireNotNull(bodySerializer) { "bodySerializer required when body is provided" }
setRequestProperty("Content-Type", "application/json")
doOutput = true
val bodyJson = json.encodeToString(value = body, serializer = bodySerializer)
outputStream.use { it.write(bodyJson.toByteArray()) }

val responseCode = connection.responseCode
val response = if (responseCode in 200..299) {
connection.inputStream.bufferedReader().use { it.readText() }
} else {
connection.errorStream?.bufferedReader()?.use { it.readText() }
?: "HTTP $responseCode"
}
}

val responseCode = connection.responseCode
val response = if (responseCode in 200..299) {
connection.inputStream.bufferedReader().use { it.readText() }
} else {
connection.errorStream?.bufferedReader()?.use { it.readText() }
?: "HTTP $responseCode"
}
Log.d(TAG, "server response from $method $endpoint ($responseCode)")

Log.d(TAG, "server response from $method $endpoint ($responseCode)")

if (responseCode in 200..299) {
val parsed = json.decodeFromString(string = response, deserializer = responseSerializer)
Result.success(parsed)
} else if (responseCode == 401) {
Result.failure(UnauthorizedException(response))
} else if (responseCode == 404) {
Result.failure(NotFoundException(response))
} else {
Result.failure(Exception("HTTP $responseCode: $response"))
return@withContext if (responseCode in 200..299) {
val parsed = json.decodeFromString(string = response, deserializer = responseSerializer)
Result.success(parsed)
} else if (responseCode == 401) {
Result.failure(UnauthorizedException(response))
} else if (responseCode == 404) {
Result.failure(NotFoundException(response))
} else {
Result.failure(Exception("HTTP $responseCode: $response"))
}
} catch (e: UnknownHostException) {
Log.w(TAG, "DNS resolution failed for $method $endpoint: ${e.message}")
if (dnsAttempt >= DNS_MAX_RETRIES) {
return@withContext Result.failure(e)
}
} catch (e: CancellationException) {
throw e
} catch (e: Exception) {
return@withContext Result.failure(e)
} finally {
connection?.disconnect()
}
} catch (e: Exception) {
Result.failure(e)
} finally {
connection?.disconnect()
}

// Unreachable: the loop always returns. Required by the compiler since it can't prove the range is non-empty.
Comment thread
getvictor marked this conversation as resolved.
Result.failure(Exception("Exhausted DNS retries for $method $endpoint"))
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import androidx.datastore.preferences.core.stringPreferencesKey
import com.fleetdm.agent.scep.ScepClient
import com.fleetdm.agent.scep.ScepClientImpl
import java.math.BigInteger
import java.net.UnknownHostException
import java.security.PrivateKey
import java.security.cert.Certificate
import java.text.SimpleDateFormat
Expand Down Expand Up @@ -824,6 +825,10 @@ class CertificateOrchestrator(
* (template fetch, SCEP enrollment, status update), so sequential processing avoids
* overwhelming the server and the device's network stack.
*
* If DNS resolution fails for a certificate (after exhausting in-call retries), we abort the
* remaining certs since DNS failures are network-level, not cert-specific. The worker will
* return Result.retry() and WorkManager's backoff handles the DNS recovery.
*
* @param context Android context for certificate installation
* @param hostCertificates List of certificate templates to enroll
* @return Map of certificate ID to enrollment result
Expand All @@ -835,11 +840,28 @@ class CertificateOrchestrator(
): Map<Int, CertificateEnrollmentHandler.EnrollmentResult> {
Log.d(TAG, "Starting batch certificate enrollment for ${hostCertificates.size} certificates")

return hostCertificates.associate { cert ->
cert.id to enrollCertificate(context, cert.id, cert.uuid, certificateInstaller)
val results = mutableMapOf<Int, CertificateEnrollmentHandler.EnrollmentResult>()
for (cert in hostCertificates) {
val result = enrollCertificate(context, cert.id, cert.uuid, certificateInstaller)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a little confused why we have MAX_CERT_INSTALL_RETRIES=3 and MAX_RETRY_ATTEMPTS=5. When we call enrollCertificate and the result comes back as Failure, we call recordEnrollmentAttemptFailure. This increments retry counter. If we fail DNS 3 times the cert is permanently failed and this is reported to the server. But according to MAX_RETRY_ATTEMPTS we technically still have 2 more retries. I believe both counters are incremented on failure? Just trying to understand why we have 2 counters

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ksykulev This is arguably more complex than it should be. Maybe an eng-init issue to simplify it? DNS failures do not cause the cert to fail permanently.

Three retry levels handle failures at different time scales during certificate enrollment.

Level 1: DNS retry in makeRequest (DNS_MAX_RETRIES = 7, ~127s)

Handles the Android Doze wake issue where the DNS resolver needs seconds to become ready after the device wakes. Retries the same HTTP call with exponential backoff (1s, 2s, 4s, 8s, 16s, 32s, 64s). Without this, a brief DNS blip would fail the HTTP call, abort the entire batch, and trigger a full worker restart.

If DNS stays down past 127s, the batch aborts and Level 3 takes over.

Level 2: Per-cert SCEP retry (MAX_CERT_INSTALL_RETRIES = 3)

Handles cert-specific SCEP enrollment failures (bad challenge, server rejection). Persists in DataStore per certificate ID across worker runs. After 3 failures for the same cert, marks it permanently failed and reports to the server. Prevents a single bad cert from retrying forever.

Not involved in DNS failures (the early return in enrollCertificate bypasses recordEnrollmentAttemptFailure).

Level 3: Worker burst retry (MAX_RETRY_ATTEMPTS = 5)

Caps the rapid WorkManager retry burst and resets to the 15-minute periodic schedule. Without this, WorkManager's exponential backoff would escalate indefinitely (10s, 20s, 40s, 80s, 160s, ... up to 5 hours), and the periodic schedule would be paused while the retry chain is active.

After 5 rapid retries, the worker returns Result.success() to reset the retry state and hand control back to the periodic schedule, keeping the retry cadence at a predictable 15 minutes.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🫠

results[cert.id] = result
if (result is CertificateEnrollmentHandler.EnrollmentResult.Failure && result.isDnsFailure()) {
Log.w(
TAG,
"DNS resolution failed for certificate ${cert.id}, aborting batch (${hostCertificates.size - results.size} certs deferred to next run)",
)
break
}
}
return results
}

/**
* Returns true if this failure was caused by a DNS resolution problem, either directly or wrapped in
* another exception (e.g. SCEP wraps UnknownHostException in ScepNetworkException).
*/
private fun CertificateEnrollmentHandler.EnrollmentResult.Failure.isDnsFailure(): Boolean =
generateSequence(exception as Throwable?) { it.cause }.any { it is UnknownHostException }

/**
* Android-specific certificate installer using DevicePolicyManager.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1403,4 +1403,98 @@ class CertificateOrchestratorTest {
assertEquals(1, fakeApiClient.updateStatusCalls.size)
assertEquals(UpdateCertificateStatusStatus.FAILED, fakeApiClient.updateStatusCalls[0].status)
}

@Test
fun `enrollCertificates aborts batch on DNS failure`() = runTest {
// Scenario: DNS failure during enrollment of cert 1. Since DNS is a network-level issue
// (not cert-specific), we should abort the batch and defer remaining certs rather than
// burning the DNS retry window on each subsequent cert.
//
// Covers two cases:
// - Direct UnknownHostException from the Fleet API template fetch.
// - UnknownHostException wrapped in ScepNetworkException during SCEP enrollment (the
// orchestrator must walk the cause chain to detect it).

data class Case(val name: String, val configure: () -> Unit, val assertException: (Throwable?) -> Unit)

val cases = listOf(
Case(
name = "direct UnknownHostException from API",
configure = {
fakeApiClient.getCertificateTemplateHandler = { certId ->
if (certId == 1) {
Result.failure(java.net.UnknownHostException("Unable to resolve host"))
} else {
Result.success(successfulTemplateResult(certId))
}
}
},
assertException = { exception ->
assertTrue(
"Expected UnknownHostException but got ${exception?.javaClass?.name}",
exception is java.net.UnknownHostException,
)
},
),
Case(
name = "UnknownHostException wrapped in ScepNetworkException",
configure = {
fakeApiClient.getCertificateTemplateHandler = { certId ->
Result.success(successfulTemplateResult(certId))
}
mockScepClient.shouldThrowNetworkException = true
mockScepClient.networkExceptionCause = java.net.UnknownHostException("Unable to resolve host")
},
assertException = { exception ->
assertTrue(
"Expected ScepNetworkException but got ${exception?.javaClass?.name}",
exception is com.fleetdm.agent.scep.ScepNetworkException,
)
assertTrue(
"Expected UnknownHostException in cause chain",
exception?.cause is java.net.UnknownHostException,
)
},
),
)

for (case in cases) {
// Reset state between cases
clearDataStore()
fakeApiClient.reset()
mockScepClient.reset()

case.configure()

val hostCertificates = listOf(
HostCertificate(id = 1, status = "delivered", operation = "install", uuid = "uuid-1"),
HostCertificate(id = 2, status = "delivered", operation = "install", uuid = "uuid-2"),
HostCertificate(id = 3, status = "delivered", operation = "install", uuid = "uuid-3"),
)

val results = orchestrator.enrollCertificates(context, hostCertificates, mockInstaller)

// Only cert 1 is in results; certs 2 and 3 were skipped
assertEquals("${case.name}: only cert 1 processed", 1, results.size)
val failure = results[1] as CertificateEnrollmentHandler.EnrollmentResult.Failure
assertTrue("${case.name}: failure is retryable", failure.isRetryable)
case.assertException(failure.exception)
// API was only called for cert 1 - certs 2 and 3 never attempted
assertEquals("${case.name}: only cert 1 fetched", listOf(1), fakeApiClient.getCertificateTemplateCalls)
}
}

private fun successfulTemplateResult(certId: Int) = CertificateTemplateResult(
template = GetCertificateTemplateResponse(
id = certId,
name = "cert-$certId",
certificateAuthorityId = 1,
certificateAuthorityName = "TestCA",
createdAt = "2025-01-01T00:00:00Z",
subjectName = "CN=test",
certificateAuthorityType = "custom_scep_proxy",
status = "delivered",
),
scepUrl = TestCertificateTemplateFactory.DEFAULT_SCEP_URL,
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class MockScepClient : ScepClient {
var shouldThrowEnrollmentException = false
var shouldThrowNetworkException = false
var shouldThrowCertificateException = false
var networkExceptionCause: Throwable? = null
var enrollmentDelay = 0L
var capturedConfig: GetCertificateTemplateResponse? = null
var capturedScepUrl: String? = null
Expand All @@ -43,7 +44,7 @@ class MockScepClient : ScepClient {
}

when {
shouldThrowNetworkException -> throw ScepNetworkException("Mock network error")
shouldThrowNetworkException -> throw ScepNetworkException("Mock network error", networkExceptionCause)
shouldThrowEnrollmentException -> throw ScepEnrollmentException("Mock enrollment failed")
shouldThrowCertificateException -> throw ScepCertificateException("Mock certificate error")
!shouldSucceed -> throw ScepException("Mock general error")
Expand Down Expand Up @@ -101,6 +102,7 @@ class MockScepClient : ScepClient {
shouldThrowEnrollmentException = false
shouldThrowNetworkException = false
shouldThrowCertificateException = false
networkExceptionCause = null
enrollmentDelay = 0L
capturedConfig = null
capturedScepUrl = null
Expand Down
1 change: 1 addition & 0 deletions android/changes/43462-android-cert-dns-retry
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- Fixed Android agent to retry DNS resolution failures when waking from Doze mode, and to defer remaining certificates in a batch to the next enrollment cycle when a DNS failure persists.
Loading