Skip to content

Commit 18d4a3d

Browse files
fix(dgw): persist regenerated server key to avoid cert/key mismatch
1 parent f16f14a commit 18d4a3d

1 file changed

Lines changed: 168 additions & 14 deletions

File tree

crates/agent-tunnel/src/cert.rs

Lines changed: 168 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,50 @@ pub struct SignedAgentCert {
128128
pub ca_cert_pem: String,
129129
}
130130

131+
/// Provenance of the server key used to sign a (re)issued server certificate.
132+
///
133+
/// Tracked so that [`CaManager::ensure_server_cert`] always persists a freshly
134+
/// generated key to disk *before* the new cert document is written. If we only
135+
/// gated the write on `!key_path.exists()` (the previous behaviour), then for
136+
/// statuses like `ExpiringSoon` / `Unreadable` — where the key file may still
137+
/// exist but the keypair we just generated is brand new — the cert would be
138+
/// signed with a key that never gets persisted, producing a cert/key mismatch.
139+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
140+
enum KeyOrigin {
141+
LoadedFromDisk,
142+
FreshlyGenerated,
143+
}
144+
145+
/// Atomically write `contents` to `path`: write to a sibling temp file, then
146+
/// rename over the target. Prevents a crash mid-write from leaving the target
147+
/// truncated or partially written.
148+
fn write_atomically(path: &Utf8Path, contents: &[u8]) -> anyhow::Result<()> {
149+
let parent = path
150+
.parent()
151+
.with_context(|| format!("target path has no parent directory: {path}"))?;
152+
if !parent.as_str().is_empty() {
153+
std::fs::create_dir_all(parent).with_context(|| format!("create directory {parent}"))?;
154+
}
155+
let file_name = path
156+
.file_name()
157+
.with_context(|| format!("target path has no file name: {path}"))?;
158+
let tmp_path = parent.join(format!(".{file_name}.tmp.{}", Uuid::new_v4()));
159+
160+
// Best-effort cleanup if anything fails after the temp file is created.
161+
let write_result = std::fs::write(&tmp_path, contents);
162+
if let Err(e) = write_result {
163+
let _ = std::fs::remove_file(&tmp_path);
164+
return Err(anyhow::Error::new(e).context(format!("write temp file {tmp_path}")));
165+
}
166+
167+
if let Err(e) = std::fs::rename(&tmp_path, path) {
168+
let _ = std::fs::remove_file(&tmp_path);
169+
return Err(anyhow::Error::new(e).context(format!("rename {tmp_path} -> {path}")));
170+
}
171+
172+
Ok(())
173+
}
174+
131175
impl CaManager {
132176
/// Load an existing CA from disk, or generate a new one.
133177
pub fn load_or_generate(data_dir: &Utf8Path) -> anyhow::Result<Arc<Self>> {
@@ -278,7 +322,13 @@ impl CaManager {
278322
// The keypair is preserved across SAN regenerations to keep the SPKI pin
279323
// stable for already-enrolled agents. Only the cert document changes.
280324
// Generate a fresh key only if the existing key is missing/unreadable.
281-
let server_key_pair = match (status, std::fs::read_to_string(&key_path)) {
325+
//
326+
// `key_origin` tracks whether `server_key_pair` was loaded from disk or
327+
// freshly generated. When freshly generated we MUST persist the new key
328+
// (atomically) *before* writing the cert, otherwise a crash between the
329+
// two writes — or the cert-only write path used previously — would
330+
// leave a cert/key mismatch on disk and break Gateway TLS.
331+
let (server_key_pair, key_origin) = match (status, std::fs::read_to_string(&key_path)) {
282332
(ServerCertStatus::Valid, _) => {
283333
info!(%cert_path, ?expected_sans, "Using existing agent tunnel server certificate");
284334
return Ok((cert_path, key_path));
@@ -290,11 +340,14 @@ impl CaManager {
290340
new_sans = ?expected_sans,
291341
"Agent tunnel server cert SAN set changed; regenerating with the existing keypair",
292342
);
293-
KeyPair::from_pem(&key_pem).context("parse existing server key pair from PEM")?
343+
let kp = KeyPair::from_pem(&key_pem).context("parse existing server key pair from PEM")?;
344+
(kp, KeyOrigin::LoadedFromDisk)
294345
}
295346
(other_status, _) => {
296347
info!(%cert_path, ?other_status, "Generating new server certificate keypair");
297-
KeyPair::generate_for(&rcgen::PKCS_ECDSA_P256_SHA256).context("generate server key pair")?
348+
let kp =
349+
KeyPair::generate_for(&rcgen::PKCS_ECDSA_P256_SHA256).context("generate server key pair")?;
350+
(kp, KeyOrigin::FreshlyGenerated)
298351
}
299352
};
300353

@@ -322,21 +375,25 @@ impl CaManager {
322375
let server_cert_pem = server_cert.pem();
323376
let fingerprint = cert_fingerprint_from_pem(&server_cert_pem).unwrap_or_else(|_| "<unknown>".to_owned());
324377

325-
std::fs::write(&cert_path, &server_cert_pem).with_context(|| format!("write server cert to {cert_path}"))?;
326-
// Only write the key when generating a new one. Reusing the existing key
327-
// means we already validated the file is readable above.
328-
if !key_path.exists() {
329-
std::fs::write(&key_path, server_key_pair.serialize_pem())
378+
// Order matters: when the key was freshly generated, persist it BEFORE
379+
// the cert so we never end up with a new cert on disk paired with an
380+
// old/missing key. Use write-to-temp + rename for atomicity so a crash
381+
// mid-write cannot leave a truncated key on disk.
382+
if matches!(key_origin, KeyOrigin::FreshlyGenerated) {
383+
write_atomically(&key_path, server_key_pair.serialize_pem().as_bytes())
330384
.with_context(|| format!("write server key to {key_path}"))?;
331-
}
332385

333-
#[cfg(unix)]
334-
{
335-
use std::os::unix::fs::PermissionsExt as _;
336-
std::fs::set_permissions(&key_path, std::fs::Permissions::from_mode(0o600))
337-
.with_context(|| format!("set permissions on {key_path}"))?;
386+
#[cfg(unix)]
387+
{
388+
use std::os::unix::fs::PermissionsExt as _;
389+
std::fs::set_permissions(&key_path, std::fs::Permissions::from_mode(0o600))
390+
.with_context(|| format!("set permissions on {key_path}"))?;
391+
}
338392
}
339393

394+
write_atomically(&cert_path, server_cert_pem.as_bytes())
395+
.with_context(|| format!("write server cert to {cert_path}"))?;
396+
340397
info!(
341398
%cert_path,
342399
primary_name,
@@ -895,6 +952,103 @@ mod tests {
895952
assert!(msg.contains("empty"), "got: {msg}");
896953
}
897954

955+
#[test]
956+
fn ensure_server_cert_regenerates_key_atomically_when_cert_expiring_soon() {
957+
// Regression for the cert/key mismatch bug: when the cert is expiring
958+
// soon (or unreadable / missing) but a stale key file is still on disk
959+
// from a previous run, the regeneration path used to sign the new cert
960+
// with a freshly generated keypair while skipping the key write —
961+
// leaving cert and key out of sync. The fix tracks key provenance and
962+
// always persists a freshly generated key BEFORE writing the cert.
963+
//
964+
// This test backdates the cert's validity by overwriting cert+key on
965+
// disk with a deliberately near-expiry pair, then calls
966+
// `ensure_server_cert` and asserts the on-disk key is the one inside
967+
// the new cert (i.e. they match SPKI-wise).
968+
969+
let temp_dir = std::env::temp_dir().join(format!("dgw-expiring-{}", Uuid::new_v4()));
970+
let data_dir = Utf8PathBuf::from_path_buf(temp_dir.clone()).expect("temp path UTF-8");
971+
let ca = CaManager::load_or_generate(&data_dir).expect("CA");
972+
973+
let names = ["gateway.corp.example.com"];
974+
let (cert_path, key_path) = ca.ensure_server_cert(&names).expect("first issue");
975+
976+
// Mint a replacement (cert, key) pair where the cert is already
977+
// expiring (within the 7-day window) but the key on disk is a stale
978+
// keypair unrelated to whatever the regen path will generate. The
979+
// point: force the code into the `ExpiringSoon` branch with a key
980+
// file present on disk, and verify regeneration writes a matching key.
981+
let stale_key =
982+
KeyPair::generate_for(&rcgen::PKCS_ECDSA_P256_SHA256).expect("stale keypair");
983+
let mut expiring_params = CertificateParams::default();
984+
expiring_params
985+
.distinguished_name
986+
.push(DnType::CommonName, names[0]);
987+
expiring_params
988+
.subject_alt_names
989+
.push(SanType::DnsName(names[0].try_into().unwrap()));
990+
expiring_params.not_before = time::OffsetDateTime::now_utc() - Duration::from_secs(60 * SECS_PER_DAY);
991+
// Inside the 7-day "ExpiringSoon" threshold.
992+
expiring_params.not_after = time::OffsetDateTime::now_utc() + Duration::from_secs(SECS_PER_DAY);
993+
let ca_cert_for_sign = ca.reconstruct_ca_cert().expect("reconstruct CA");
994+
let expiring_cert = expiring_params
995+
.signed_by(&stale_key, &ca_cert_for_sign, &ca.ca_key_pair)
996+
.expect("sign expiring cert");
997+
std::fs::write(&cert_path, expiring_cert.pem()).expect("overwrite cert with expiring one");
998+
std::fs::write(&key_path, stale_key.serialize_pem()).expect("overwrite key with stale one");
999+
1000+
// Sanity: confirm the status check classifies this as ExpiringSoon.
1001+
let expected_sans = build_san_set(&names).expect("expected sans");
1002+
let status = check_server_cert(&cert_path, &key_path, &expected_sans);
1003+
assert!(
1004+
matches!(status, ServerCertStatus::ExpiringSoon),
1005+
"precondition: expected ExpiringSoon, got {status:?}"
1006+
);
1007+
1008+
// Trigger regeneration.
1009+
ca.ensure_server_cert(&names).expect("regen after expiring");
1010+
1011+
// The on-disk key must match the public key embedded in the new cert.
1012+
let new_cert_pem = std::fs::read_to_string(&cert_path).expect("read regenerated cert");
1013+
let new_key_pem = std::fs::read_to_string(&key_path).expect("read regenerated key");
1014+
1015+
let new_cert_der = cert_pem_to_der(&new_cert_pem).expect("parse new cert PEM");
1016+
let cert_spki = spki_sha256_from_der(&new_cert_der).expect("cert SPKI");
1017+
1018+
// Build the SPKI of the persisted key by self-signing a throwaway cert
1019+
// with it (rcgen doesn't expose the SPKI of a KeyPair directly).
1020+
let persisted_key = KeyPair::from_pem(&new_key_pem).expect("parse persisted key");
1021+
let mut probe_params = CertificateParams::default();
1022+
probe_params
1023+
.distinguished_name
1024+
.push(DnType::CommonName, "probe");
1025+
let probe_cert = probe_params.self_signed(&persisted_key).expect("self-sign probe");
1026+
let probe_der = cert_pem_to_der(&probe_cert.pem()).expect("probe DER");
1027+
let probe_spki = spki_sha256_from_der(&probe_der).expect("probe SPKI");
1028+
1029+
assert_eq!(
1030+
cert_spki, probe_spki,
1031+
"on-disk key must match the public key embedded in the regenerated cert"
1032+
);
1033+
1034+
// And confirm the key is no longer the stale one we wrote earlier.
1035+
let mut stale_probe_params = CertificateParams::default();
1036+
stale_probe_params
1037+
.distinguished_name
1038+
.push(DnType::CommonName, "stale-probe");
1039+
let stale_probe_cert = stale_probe_params
1040+
.self_signed(&stale_key)
1041+
.expect("self-sign stale probe");
1042+
let stale_probe_der = cert_pem_to_der(&stale_probe_cert.pem()).expect("stale probe DER");
1043+
let stale_spki = spki_sha256_from_der(&stale_probe_der).expect("stale SPKI");
1044+
assert_ne!(
1045+
cert_spki, stale_spki,
1046+
"regenerated cert must NOT use the stale on-disk key"
1047+
);
1048+
1049+
let _ = std::fs::remove_dir_all(&temp_dir);
1050+
}
1051+
8981052
#[test]
8991053
fn read_cert_chain_rejects_wrong_label() {
9001054
let (_, leaf_pem) = make_cert_pair();

0 commit comments

Comments
 (0)