Skip to content

Commit 6f692e7

Browse files
feat(agent): implement certificate renewal (#1775)
1 parent 87be5cd commit 6f692e7

3 files changed

Lines changed: 249 additions & 11 deletions

File tree

crates/agent-tunnel/src/listener.rs

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ pub struct AgentTunnelListener {
105105
endpoint: quinn::Endpoint,
106106
registry: Arc<AgentRegistry>,
107107
agent_connections: Arc<RwLock<HashMap<Uuid, quinn::Connection>>>,
108+
ca_manager: Arc<CaManager>,
108109
}
109110

110111
impl AgentTunnelListener {
@@ -158,6 +159,7 @@ impl AgentTunnelListener {
158159
endpoint,
159160
registry,
160161
agent_connections,
162+
ca_manager,
161163
};
162164

163165
Ok((listener, handle))
@@ -198,8 +200,9 @@ impl devolutions_gateway_task::Task for AgentTunnelListener {
198200

199201
let registry = Arc::clone(&self.registry);
200202
let agent_connections = Arc::clone(&self.agent_connections);
203+
let ca_manager = Arc::clone(&self.ca_manager);
201204

202-
conn_handles.spawn(run_agent_connection(registry, agent_connections, incoming));
205+
conn_handles.spawn(run_agent_connection(registry, agent_connections, ca_manager, incoming));
203206
}
204207

205208
// Reap completed connection tasks to prevent unbounded growth.
@@ -220,6 +223,7 @@ impl devolutions_gateway_task::Task for AgentTunnelListener {
220223
async fn run_agent_connection(
221224
registry: Arc<AgentRegistry>,
222225
agent_connections: Arc<RwLock<HashMap<Uuid, quinn::Connection>>>,
226+
ca_manager: Arc<CaManager>,
223227
incoming: quinn::Incoming,
224228
) {
225229
let peer_addr = incoming.remote_address();
@@ -253,7 +257,7 @@ async fn run_agent_connection(
253257
agent_connections.write().await.insert(agent_id, conn.clone());
254258

255259
// Accept the first bidirectional stream as the control stream.
256-
let control_result = run_control_loop(&conn, agent_id, &registry).await;
260+
let control_result = run_control_loop(&conn, agent_id, &agent_name, &registry, &ca_manager).await;
257261

258262
// Agent disconnected — clean up.
259263
info!(%agent_id, "Agent QUIC connection closed");
@@ -269,7 +273,13 @@ async fn run_agent_connection(
269273
}
270274
}
271275

272-
async fn run_control_loop(conn: &quinn::Connection, agent_id: Uuid, registry: &AgentRegistry) -> anyhow::Result<()> {
276+
async fn run_control_loop(
277+
conn: &quinn::Connection,
278+
agent_id: Uuid,
279+
agent_name: &str,
280+
registry: &AgentRegistry,
281+
ca_manager: &CaManager,
282+
) -> anyhow::Result<()> {
273283
let mut ctrl: ControlStream<_, _> = conn.accept_bi().await.context("accept control stream")?.into();
274284

275285
info!(%agent_id, "Control stream accepted");
@@ -290,7 +300,7 @@ async fn run_control_loop(conn: &quinn::Connection, agent_id: Uuid, registry: &A
290300
}
291301
};
292302

293-
handle_control_message(registry, agent_id, &mut ctrl, msg).await;
303+
handle_control_message(registry, ca_manager, agent_id, agent_name, &mut ctrl, msg).await;
294304
}
295305

296306
// Detect connection close.
@@ -306,7 +316,9 @@ async fn run_control_loop(conn: &quinn::Connection, agent_id: Uuid, registry: &A
306316

307317
async fn handle_control_message<S: tokio::io::AsyncWrite + Unpin, R: tokio::io::AsyncRead + Unpin>(
308318
registry: &AgentRegistry,
319+
ca_manager: &CaManager,
309320
agent_id: Uuid,
321+
agent_name: &str,
310322
ctrl: &mut ControlStream<S, R>,
311323
msg: ControlMessage,
312324
) {
@@ -358,8 +370,36 @@ async fn handle_control_message<S: tokio::io::AsyncWrite + Unpin, R: tokio::io::
358370
ControlMessage::HeartbeatAck { .. } => {
359371
debug!(%agent_id, "Unexpected HeartbeatAck from agent");
360372
}
361-
ControlMessage::CertRenewalRequest { .. } | ControlMessage::CertRenewalResponse { .. } => {
362-
debug!(%agent_id, "Certificate renewal not supported in this build");
373+
ControlMessage::CertRenewalRequest { csr_pem, .. } => {
374+
info!(%agent_id, "Agent requested certificate renewal");
375+
376+
// Reuse the agent_id and agent_name authenticated by mTLS — never
377+
// trust the CSR's subject. The CA only re-signs the public key the
378+
// agent put in its CSR; identity stays whatever the existing cert
379+
// already proved during the handshake.
380+
let result = match ca_manager.sign_agent_csr(agent_id, agent_name, &csr_pem, None) {
381+
Ok(signed) => {
382+
info!(%agent_id, %agent_name, "Renewed agent certificate");
383+
agent_tunnel_proto::CertRenewalResult::Success {
384+
client_cert_pem: signed.client_cert_pem,
385+
gateway_ca_cert_pem: signed.ca_cert_pem,
386+
}
387+
}
388+
Err(error) => {
389+
warn!(%agent_id, error = %format!("{error:#}"), "Certificate renewal failed");
390+
agent_tunnel_proto::CertRenewalResult::Error {
391+
reason: format!("{error:#}"),
392+
}
393+
}
394+
};
395+
396+
let response = ControlMessage::cert_renewal_response(result);
397+
let _ = ctrl.send(&response).await.inspect_err(|e| {
398+
warn!(%agent_id, error = %e, "Failed to send CertRenewalResponse");
399+
});
400+
}
401+
ControlMessage::CertRenewalResponse { .. } => {
402+
debug!(%agent_id, "Unexpected CertRenewalResponse from agent");
363403
}
364404
}
365405
}

devolutions-agent/src/enrollment.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,78 @@ fn persist_enrollment_response(
245245
})
246246
}
247247

248+
// ---------------------------------------------------------------------------
249+
// Certificate renewal helpers
250+
// ---------------------------------------------------------------------------
251+
252+
/// Check whether the PEM-encoded certificate at `cert_path` expires within
253+
/// `threshold_days`. The agent uses this on every reconnect to decide whether
254+
/// to ask the gateway for a new certificate before opening real traffic.
255+
pub fn is_cert_expiring(cert_path: &Utf8Path, threshold_days: u32) -> Result<bool> {
256+
use std::io::BufReader;
257+
258+
let pem_str = std::fs::read_to_string(cert_path).with_context(|| format!("read certificate from {cert_path}"))?;
259+
let der = rustls_pemfile::certs(&mut BufReader::new(pem_str.as_bytes()))
260+
.next()
261+
.context("empty PEM input")?
262+
.context("parse certificate PEM")?;
263+
let (_, cert) =
264+
x509_parser::parse_x509_certificate(&der).map_err(|e| anyhow::anyhow!("parse X.509 certificate: {e}"))?;
265+
266+
let not_after_epoch = cert.validity().not_after.timestamp();
267+
let now_epoch = i64::try_from(
268+
std::time::SystemTime::now()
269+
.duration_since(std::time::UNIX_EPOCH)
270+
.context("system clock before UNIX epoch")?
271+
.as_secs(),
272+
)
273+
.context("unix timestamp exceeds i64::MAX")?;
274+
275+
let threshold_secs = i64::from(threshold_days) * 86_400;
276+
Ok(not_after_epoch - now_epoch <= threshold_secs)
277+
}
278+
279+
/// Extract the `CommonName` from an existing PEM certificate. The renewal CSR
280+
/// must reuse the agent's name across renewals — the gateway looks the agent
281+
/// up in its registry by that name, and the most authoritative source for it
282+
/// is the cert the gateway itself signed last time.
283+
pub fn read_agent_name_from_cert(cert_path: &Utf8Path) -> Result<String> {
284+
use std::io::BufReader;
285+
286+
let pem_str = std::fs::read_to_string(cert_path).with_context(|| format!("read certificate from {cert_path}"))?;
287+
let der = rustls_pemfile::certs(&mut BufReader::new(pem_str.as_bytes()))
288+
.next()
289+
.context("empty PEM input")?
290+
.context("parse certificate PEM")?;
291+
let (_, cert) =
292+
x509_parser::parse_x509_certificate(&der).map_err(|e| anyhow::anyhow!("parse X.509 certificate: {e}"))?;
293+
294+
let cn = cert
295+
.subject()
296+
.iter_common_name()
297+
.next()
298+
.context("certificate subject has no Common Name")?
299+
.as_str()
300+
.context("certificate Common Name is not valid UTF-8")?;
301+
302+
Ok(cn.to_owned())
303+
}
304+
305+
/// Build a renewal CSR using the agent's existing private key. Reusing the key
306+
/// across renewals matches the design that says the private key never leaves
307+
/// the agent — the gateway only ever sees CSRs.
308+
pub fn generate_csr_from_existing_key(key_path: &Utf8Path, agent_name: &str) -> Result<String> {
309+
let key_pem = std::fs::read_to_string(key_path).with_context(|| format!("read private key from {key_path}"))?;
310+
let key_pair = rcgen::KeyPair::from_pem(&key_pem).context("parse private key PEM")?;
311+
312+
let mut params = rcgen::CertificateParams::default();
313+
params.distinguished_name.push(rcgen::DnType::CommonName, agent_name);
314+
315+
let csr = params.serialize_request(&key_pair).context("serialize renewal CSR")?;
316+
317+
csr.pem().context("encode CSR to PEM")
318+
}
319+
248320
#[cfg(test)]
249321
mod tests {
250322
use super::*;

devolutions-agent/src/tunnel.rs

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,17 @@ impl Task for TunnelTask {
133133
let start = std::time::Instant::now();
134134

135135
match run_single_connection(&self.conf_handle, &mut shutdown_signal).await {
136-
Ok(()) => {
136+
Ok(ConnectionOutcome::Shutdown) => {
137137
info!("Tunnel task stopped");
138138
return Ok(());
139139
}
140+
Ok(ConnectionOutcome::CertRenewed) => {
141+
// Renewal is a successful "completion", not a failure — skip
142+
// the backoff and reconnect immediately with the new cert.
143+
info!("Certificate renewed; reconnecting with new cert immediately");
144+
backoff.reset();
145+
continue;
146+
}
140147
Err(error) => {
141148
warn!(error = %format!("{error:#}"), "Tunnel connection lost");
142149
}
@@ -175,11 +182,23 @@ impl Task for TunnelTask {
175182
// Single connection lifetime
176183
// ---------------------------------------------------------------------------
177184

185+
/// Outcome of a single connection lifetime, telling the outer loop what to do next.
///
/// Derives `Debug` and equality so the outcome can be logged and asserted on
/// in tests; it is a plain tag with no payload, hence also `Copy`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ConnectionOutcome {
    /// Shutdown signal received — exit the tunnel task cleanly.
    Shutdown,
    /// Certificate was renewed successfully; reconnect immediately with the new cert.
    CertRenewed,
}
192+
178193
/// Run a single QUIC tunnel connection lifetime: config → connect → event loop.
179194
///
180-
/// Returns `Ok(())` on graceful shutdown (shutdown signal received).
181-
/// Returns `Err(...)` on any failure — the caller should retry with backoff.
182-
async fn run_single_connection(conf_handle: &ConfHandle, shutdown_signal: &mut ShutdownSignal) -> anyhow::Result<()> {
195+
/// - `Ok(Shutdown)`: graceful shutdown, exit the task.
196+
/// - `Ok(CertRenewed)`: certificate renewed; caller should reconnect immediately.
197+
/// - `Err(...)`: connection lost or handshake failed — caller should retry with backoff.
198+
async fn run_single_connection(
199+
conf_handle: &ConfHandle,
200+
shutdown_signal: &mut ShutdownSignal,
201+
) -> anyhow::Result<ConnectionOutcome> {
183202
// Ensure rustls crypto provider is installed (ring).
184203
let _ = rustls::crypto::ring::default_provider().install_default();
185204

@@ -356,6 +375,18 @@ async fn run_single_connection(conf_handle: &ConfHandle, shutdown_signal: &mut S
356375

357376
info!(epoch, "Sent initial RouteAdvertise");
358377

378+
// -- Certificate renewal (post-connect, pre-traffic) --
379+
//
380+
// Run once per reconnect rather than on a periodic timer: the QUIC session
381+
// has a 120s idle timeout and 15s keep-alive, so any blip / VPN reconnect
382+
// / host sleep / gateway restart drops the connection within minutes and
383+
// sends us back through this path. With a 1-year cert and a 15-day
384+
// threshold, the renewal window will be hit on the first reconnect after
385+
// T-15d, which is more than often enough in any real deployment.
386+
if let Some(outcome) = try_renew_certificate(&mut ctrl, &connection, cert_path, key_path, ca_path).await? {
387+
return Ok(outcome);
388+
}
389+
359390
// Split: recv half goes to a reader task, send half stays for periodic messages.
360391
let (mut ctrl_send, ctrl_recv) = ctrl.into_split();
361392
let mut task_handles = tokio::task::JoinSet::new();
@@ -409,7 +440,102 @@ async fn run_single_connection(conf_handle: &ConfHandle, shutdown_signal: &mut S
409440

410441
task_handles.shutdown().await;
411442

412-
Ok(())
443+
Ok(ConnectionOutcome::Shutdown)
444+
}
445+
446+
// ---------------------------------------------------------------------------
447+
// Certificate renewal
448+
// ---------------------------------------------------------------------------
449+
450+
/// Check if the client cert is near expiry; if so, renew it via the control
451+
/// stream before opening real traffic.
452+
///
453+
/// Returns:
454+
/// - `Ok(Some(CertRenewed))` — renewed successfully; outer loop must reconnect
455+
/// so the new cert takes effect on the next mTLS handshake.
456+
/// - `Ok(None)` — no renewal needed (or attempted renewal failed in a recoverable
457+
/// way, e.g. the gateway said no); proceed with the existing cert.
458+
/// - `Err(_)` — IO / protocol error on the control stream itself; treat as
459+
/// connection lost.
460+
async fn try_renew_certificate<S, R>(
461+
ctrl: &mut ControlStream<S, R>,
462+
connection: &quinn::Connection,
463+
cert_path: &camino::Utf8Path,
464+
key_path: &camino::Utf8Path,
465+
ca_path: &camino::Utf8Path,
466+
) -> anyhow::Result<Option<ConnectionOutcome>>
467+
where
468+
S: tokio::io::AsyncWrite + Unpin,
469+
R: tokio::io::AsyncRead + Unpin,
470+
{
471+
const RENEWAL_THRESHOLD_DAYS: u32 = 15;
472+
const RENEWAL_TIMEOUT: Duration = Duration::from_secs(30);
473+
474+
match crate::enrollment::is_cert_expiring(cert_path, RENEWAL_THRESHOLD_DAYS) {
475+
Ok(false) => {
476+
debug!("Client certificate not in renewal window");
477+
return Ok(None);
478+
}
479+
Err(error) => {
480+
warn!(error = %format!("{error:#}"), "Failed to check certificate expiry; skipping renewal");
481+
return Ok(None);
482+
}
483+
Ok(true) => {}
484+
}
485+
486+
info!(
487+
threshold_days = RENEWAL_THRESHOLD_DAYS,
488+
"Certificate within renewal window; requesting renewal"
489+
);
490+
491+
// Reuse the agent name from the existing cert as the renewal CSR's
492+
// CommonName. The gateway ignores CSR subject and trusts the
493+
// mTLS-authenticated identity, but matching the existing CN keeps the
494+
// CSR semantically correct in case validation tightens later.
495+
let agent_name = crate::enrollment::read_agent_name_from_cert(cert_path)
496+
.context("read agent name from existing certificate for renewal")?;
497+
let csr_pem =
498+
crate::enrollment::generate_csr_from_existing_key(key_path, &agent_name).context("generate renewal CSR")?;
499+
500+
ctrl.send(&ControlMessage::cert_renewal_request(csr_pem))
501+
.await
502+
.context("send CertRenewalRequest")?;
503+
504+
let response = tokio::time::timeout(RENEWAL_TIMEOUT, ctrl.recv())
505+
.await
506+
.context("timeout waiting for CertRenewalResponse")?
507+
.context("receive CertRenewalResponse")?;
508+
509+
match response {
510+
ControlMessage::CertRenewalResponse {
511+
result:
512+
agent_tunnel_proto::CertRenewalResult::Success {
513+
client_cert_pem,
514+
gateway_ca_cert_pem,
515+
},
516+
..
517+
} => {
518+
std::fs::write(cert_path.as_str(), &client_cert_pem).context("write renewed certificate")?;
519+
std::fs::write(ca_path.as_str(), &gateway_ca_cert_pem).context("write renewed CA certificate")?;
520+
info!("Certificate renewed; closing connection so new cert takes effect on reconnect");
521+
connection.close(0u32.into(), b"cert-renewed");
522+
Ok(Some(ConnectionOutcome::CertRenewed))
523+
}
524+
ControlMessage::CertRenewalResponse {
525+
result: agent_tunnel_proto::CertRenewalResult::Error { reason },
526+
..
527+
} => {
528+
warn!(%reason, "Gateway refused certificate renewal; continuing with existing cert");
529+
Ok(None)
530+
}
531+
other => {
532+
warn!(
533+
?other,
534+
"Unexpected response to renewal request; continuing with existing cert"
535+
);
536+
Ok(None)
537+
}
538+
}
413539
}
414540

415541
// ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)