@@ -128,6 +128,50 @@ pub struct SignedAgentCert {
128128 pub ca_cert_pem : String ,
129129}
130130
131+ /// Provenance of the server key used to sign a (re)issued server certificate.
132+ ///
133+ /// Tracked so that [`CaManager::ensure_server_cert`] always persists a freshly
134+ /// generated key to disk *before* the new cert document is written. If we only
135+ /// gated the write on `!key_path.exists()` (the previous behaviour), then for
136+ /// statuses like `ExpiringSoon` / `Unreadable` — where the key file may still
137+ /// exist but the keypair we just generated is brand new — the cert would be
138+ /// signed with a key that never gets persisted, producing a cert/key mismatch.
139+ #[ derive( Debug , Clone , Copy , PartialEq , Eq ) ]
140+ enum KeyOrigin {
141+ LoadedFromDisk ,
142+ FreshlyGenerated ,
143+ }
144+
145+ /// Atomically write `contents` to `path`: write to a sibling temp file, then
146+ /// rename over the target. Prevents a crash mid-write from leaving the target
147+ /// truncated or partially written.
148+ fn write_atomically ( path : & Utf8Path , contents : & [ u8 ] ) -> anyhow:: Result < ( ) > {
149+ let parent = path
150+ . parent ( )
151+ . with_context ( || format ! ( "target path has no parent directory: {path}" ) ) ?;
152+ if !parent. as_str ( ) . is_empty ( ) {
153+ std:: fs:: create_dir_all ( parent) . with_context ( || format ! ( "create directory {parent}" ) ) ?;
154+ }
155+ let file_name = path
156+ . file_name ( )
157+ . with_context ( || format ! ( "target path has no file name: {path}" ) ) ?;
158+ let tmp_path = parent. join ( format ! ( ".{file_name}.tmp.{}" , Uuid :: new_v4( ) ) ) ;
159+
160+ // Best-effort cleanup if anything fails after the temp file is created.
161+ let write_result = std:: fs:: write ( & tmp_path, contents) ;
162+ if let Err ( e) = write_result {
163+ let _ = std:: fs:: remove_file ( & tmp_path) ;
164+ return Err ( anyhow:: Error :: new ( e) . context ( format ! ( "write temp file {tmp_path}" ) ) ) ;
165+ }
166+
167+ if let Err ( e) = std:: fs:: rename ( & tmp_path, path) {
168+ let _ = std:: fs:: remove_file ( & tmp_path) ;
169+ return Err ( anyhow:: Error :: new ( e) . context ( format ! ( "rename {tmp_path} -> {path}" ) ) ) ;
170+ }
171+
172+ Ok ( ( ) )
173+ }
174+
131175impl CaManager {
132176 /// Load an existing CA from disk, or generate a new one.
133177 pub fn load_or_generate ( data_dir : & Utf8Path ) -> anyhow:: Result < Arc < Self > > {
@@ -278,7 +322,13 @@ impl CaManager {
278322 // The keypair is preserved across SAN regenerations to keep the SPKI pin
279323 // stable for already-enrolled agents. Only the cert document changes.
280324 // Generate a fresh key only if the existing key is missing/unreadable.
281- let server_key_pair = match ( status, std:: fs:: read_to_string ( & key_path) ) {
325+ //
326+ // `key_origin` tracks whether `server_key_pair` was loaded from disk or
327+ // freshly generated. When freshly generated we MUST persist the new key
328+ // (atomically) *before* writing the cert, otherwise a crash between the
329+ // two writes — or the cert-only write path used previously — would
330+ // leave a cert/key mismatch on disk and break Gateway TLS.
331+ let ( server_key_pair, key_origin) = match ( status, std:: fs:: read_to_string ( & key_path) ) {
282332 ( ServerCertStatus :: Valid , _) => {
283333 info ! ( %cert_path, ?expected_sans, "Using existing agent tunnel server certificate" ) ;
284334 return Ok ( ( cert_path, key_path) ) ;
@@ -290,11 +340,14 @@ impl CaManager {
290340 new_sans = ?expected_sans,
291341 "Agent tunnel server cert SAN set changed; regenerating with the existing keypair" ,
292342 ) ;
293- KeyPair :: from_pem ( & key_pem) . context ( "parse existing server key pair from PEM" ) ?
343+ let kp = KeyPair :: from_pem ( & key_pem) . context ( "parse existing server key pair from PEM" ) ?;
344+ ( kp, KeyOrigin :: LoadedFromDisk )
294345 }
295346 ( other_status, _) => {
296347 info ! ( %cert_path, ?other_status, "Generating new server certificate keypair" ) ;
297- KeyPair :: generate_for ( & rcgen:: PKCS_ECDSA_P256_SHA256 ) . context ( "generate server key pair" ) ?
348+ let kp =
349+ KeyPair :: generate_for ( & rcgen:: PKCS_ECDSA_P256_SHA256 ) . context ( "generate server key pair" ) ?;
350+ ( kp, KeyOrigin :: FreshlyGenerated )
298351 }
299352 } ;
300353
@@ -322,21 +375,25 @@ impl CaManager {
322375 let server_cert_pem = server_cert. pem ( ) ;
323376 let fingerprint = cert_fingerprint_from_pem ( & server_cert_pem) . unwrap_or_else ( |_| "<unknown>" . to_owned ( ) ) ;
324377
325- std:: fs:: write ( & cert_path, & server_cert_pem) . with_context ( || format ! ( "write server cert to {cert_path}" ) ) ?;
326- // Only write the key when generating a new one. Reusing the existing key
327- // means we already validated the file is readable above.
328- if !key_path. exists ( ) {
329- std:: fs:: write ( & key_path, server_key_pair. serialize_pem ( ) )
378+ // Order matters: when the key was freshly generated, persist it BEFORE
379+ // the cert so we never end up with a new cert on disk paired with an
380+ // old/missing key. Use write-to-temp + rename for atomicity so a crash
381+ // mid-write cannot leave a truncated key on disk.
382+ if matches ! ( key_origin, KeyOrigin :: FreshlyGenerated ) {
383+ write_atomically ( & key_path, server_key_pair. serialize_pem ( ) . as_bytes ( ) )
330384 . with_context ( || format ! ( "write server key to {key_path}" ) ) ?;
331- }
332385
333- #[ cfg( unix) ]
334- {
335- use std:: os:: unix:: fs:: PermissionsExt as _;
336- std:: fs:: set_permissions ( & key_path, std:: fs:: Permissions :: from_mode ( 0o600 ) )
337- . with_context ( || format ! ( "set permissions on {key_path}" ) ) ?;
386+ #[ cfg( unix) ]
387+ {
388+ use std:: os:: unix:: fs:: PermissionsExt as _;
389+ std:: fs:: set_permissions ( & key_path, std:: fs:: Permissions :: from_mode ( 0o600 ) )
390+ . with_context ( || format ! ( "set permissions on {key_path}" ) ) ?;
391+ }
338392 }
339393
394+ write_atomically ( & cert_path, server_cert_pem. as_bytes ( ) )
395+ . with_context ( || format ! ( "write server cert to {cert_path}" ) ) ?;
396+
340397 info ! (
341398 %cert_path,
342399 primary_name,
@@ -895,6 +952,103 @@ mod tests {
895952 assert ! ( msg. contains( "empty" ) , "got: {msg}" ) ;
896953 }
897954
955+ #[ test]
956+ fn ensure_server_cert_regenerates_key_atomically_when_cert_expiring_soon ( ) {
957+ // Regression for the cert/key mismatch bug: when the cert is expiring
958+ // soon (or unreadable / missing) but a stale key file is still on disk
959+ // from a previous run, the regeneration path used to sign the new cert
960+ // with a freshly generated keypair while skipping the key write —
961+ // leaving cert and key out of sync. The fix tracks key provenance and
962+ // always persists a freshly generated key BEFORE writing the cert.
963+ //
964+ // This test backdates the cert's validity by overwriting cert+key on
965+ // disk with a deliberately near-expiry pair, then calls
966+ // `ensure_server_cert` and asserts the on-disk key is the one inside
967+ // the new cert (i.e. they match SPKI-wise).
968+
969+ let temp_dir = std:: env:: temp_dir ( ) . join ( format ! ( "dgw-expiring-{}" , Uuid :: new_v4( ) ) ) ;
970+ let data_dir = Utf8PathBuf :: from_path_buf ( temp_dir. clone ( ) ) . expect ( "temp path UTF-8" ) ;
971+ let ca = CaManager :: load_or_generate ( & data_dir) . expect ( "CA" ) ;
972+
973+ let names = [ "gateway.corp.example.com" ] ;
974+ let ( cert_path, key_path) = ca. ensure_server_cert ( & names) . expect ( "first issue" ) ;
975+
976+ // Mint a replacement (cert, key) pair where the cert is already
977+ // expiring (within the 7-day window) but the key on disk is a stale
978+ // keypair unrelated to whatever the regen path will generate. The
979+ // point: force the code into the `ExpiringSoon` branch with a key
980+ // file present on disk, and verify regeneration writes a matching key.
981+ let stale_key =
982+ KeyPair :: generate_for ( & rcgen:: PKCS_ECDSA_P256_SHA256 ) . expect ( "stale keypair" ) ;
983+ let mut expiring_params = CertificateParams :: default ( ) ;
984+ expiring_params
985+ . distinguished_name
986+ . push ( DnType :: CommonName , names[ 0 ] ) ;
987+ expiring_params
988+ . subject_alt_names
989+ . push ( SanType :: DnsName ( names[ 0 ] . try_into ( ) . unwrap ( ) ) ) ;
990+ expiring_params. not_before = time:: OffsetDateTime :: now_utc ( ) - Duration :: from_secs ( 60 * SECS_PER_DAY ) ;
991+ // Inside the 7-day "ExpiringSoon" threshold.
992+ expiring_params. not_after = time:: OffsetDateTime :: now_utc ( ) + Duration :: from_secs ( SECS_PER_DAY ) ;
993+ let ca_cert_for_sign = ca. reconstruct_ca_cert ( ) . expect ( "reconstruct CA" ) ;
994+ let expiring_cert = expiring_params
995+ . signed_by ( & stale_key, & ca_cert_for_sign, & ca. ca_key_pair )
996+ . expect ( "sign expiring cert" ) ;
997+ std:: fs:: write ( & cert_path, expiring_cert. pem ( ) ) . expect ( "overwrite cert with expiring one" ) ;
998+ std:: fs:: write ( & key_path, stale_key. serialize_pem ( ) ) . expect ( "overwrite key with stale one" ) ;
999+
1000+ // Sanity: confirm the status check classifies this as ExpiringSoon.
1001+ let expected_sans = build_san_set ( & names) . expect ( "expected sans" ) ;
1002+ let status = check_server_cert ( & cert_path, & key_path, & expected_sans) ;
1003+ assert ! (
1004+ matches!( status, ServerCertStatus :: ExpiringSoon ) ,
1005+ "precondition: expected ExpiringSoon, got {status:?}"
1006+ ) ;
1007+
1008+ // Trigger regeneration.
1009+ ca. ensure_server_cert ( & names) . expect ( "regen after expiring" ) ;
1010+
1011+ // The on-disk key must match the public key embedded in the new cert.
1012+ let new_cert_pem = std:: fs:: read_to_string ( & cert_path) . expect ( "read regenerated cert" ) ;
1013+ let new_key_pem = std:: fs:: read_to_string ( & key_path) . expect ( "read regenerated key" ) ;
1014+
1015+ let new_cert_der = cert_pem_to_der ( & new_cert_pem) . expect ( "parse new cert PEM" ) ;
1016+ let cert_spki = spki_sha256_from_der ( & new_cert_der) . expect ( "cert SPKI" ) ;
1017+
1018+ // Build the SPKI of the persisted key by self-signing a throwaway cert
1019+ // with it (rcgen doesn't expose the SPKI of a KeyPair directly).
1020+ let persisted_key = KeyPair :: from_pem ( & new_key_pem) . expect ( "parse persisted key" ) ;
1021+ let mut probe_params = CertificateParams :: default ( ) ;
1022+ probe_params
1023+ . distinguished_name
1024+ . push ( DnType :: CommonName , "probe" ) ;
1025+ let probe_cert = probe_params. self_signed ( & persisted_key) . expect ( "self-sign probe" ) ;
1026+ let probe_der = cert_pem_to_der ( & probe_cert. pem ( ) ) . expect ( "probe DER" ) ;
1027+ let probe_spki = spki_sha256_from_der ( & probe_der) . expect ( "probe SPKI" ) ;
1028+
1029+ assert_eq ! (
1030+ cert_spki, probe_spki,
1031+ "on-disk key must match the public key embedded in the regenerated cert"
1032+ ) ;
1033+
1034+ // And confirm the key is no longer the stale one we wrote earlier.
1035+ let mut stale_probe_params = CertificateParams :: default ( ) ;
1036+ stale_probe_params
1037+ . distinguished_name
1038+ . push ( DnType :: CommonName , "stale-probe" ) ;
1039+ let stale_probe_cert = stale_probe_params
1040+ . self_signed ( & stale_key)
1041+ . expect ( "self-sign stale probe" ) ;
1042+ let stale_probe_der = cert_pem_to_der ( & stale_probe_cert. pem ( ) ) . expect ( "stale probe DER" ) ;
1043+ let stale_spki = spki_sha256_from_der ( & stale_probe_der) . expect ( "stale SPKI" ) ;
1044+ assert_ne ! (
1045+ cert_spki, stale_spki,
1046+ "regenerated cert must NOT use the stale on-disk key"
1047+ ) ;
1048+
1049+ let _ = std:: fs:: remove_dir_all ( & temp_dir) ;
1050+ }
1051+
8981052 #[ test]
8991053 fn read_cert_chain_rejects_wrong_label ( ) {
9001054 let ( _, leaf_pem) = make_cert_pair ( ) ;
0 commit comments