diff --git a/Cargo.lock b/Cargo.lock index 0de243037..f9032146e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1146,6 +1146,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + [[package]] name = "hyper" version = "1.8.1" @@ -3081,6 +3087,7 @@ dependencies = [ "axum", "clap", "futures-util", + "humantime", "hyper", "hyper-util", "k8s-openapi", diff --git a/Cargo.toml b/Cargo.toml index 9ca62620b..42a36c9d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ either = "1.13.0" futures = "0.3.30" futures-util = "0.3.30" http = "1.3.1" +humantime = "2.1.0" indexmap = "2.5.0" indoc = "2.0.6" jiff = "0.2.18" diff --git a/crates/stackable-webhook/CHANGELOG.md b/crates/stackable-webhook/CHANGELOG.md index 13efae6d9..70105bb74 100644 --- a/crates/stackable-webhook/CHANGELOG.md +++ b/crates/stackable-webhook/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Changed + +- Check for certificate rotation every 5 minutes to prevent expiry due to monotonic and wall clock divergence ([#1175]). + +[#1175]: https://github.com/stackabletech/operator-rs/pull/1175 + ## [0.9.0] - 2026-02-03 ### Added diff --git a/crates/stackable-webhook/Cargo.toml b/crates/stackable-webhook/Cargo.toml index 7c7ad3603..4bbdbeb9d 100644 --- a/crates/stackable-webhook/Cargo.toml +++ b/crates/stackable-webhook/Cargo.toml @@ -15,6 +15,7 @@ arc-swap.workspace = true async-trait.workspace = true axum.workspace = true futures-util.workspace = true +humantime.workspace = true hyper-util.workspace = true hyper.workspace = true k8s-openapi.workspace = true diff --git a/crates/stackable-webhook/src/tls/cert_resolver.rs b/crates/stackable-webhook/src/tls/cert_resolver.rs index d355b956c..ac1cfecb9 100644 --- a/crates/stackable-webhook/src/tls/cert_resolver.rs +++ b/crates/stackable-webhook/src/tls/cert_resolver.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{sync::Arc, time::SystemTime}; use arc_swap::ArcSwap; use snafu::{OptionExt, ResultExt, Snafu}; @@ -57,6 +57,11 @@ pub struct CertificateResolver { /// Using a [`ArcSwap`] (over e.g. [`tokio::sync::RwLock`]), so that we can easily /// (and performant) bridge between async write and sync read. current_certified_key: ArcSwap, + + /// The wall-clock expiry time (`not_after`) of the current certificate. + /// Used to detect clock drift between monotonic and wall-clock time. + current_not_after: ArcSwap, + subject_alterative_dns_names: Arc>, certificate_tx: mpsc::Sender, @@ -68,7 +73,7 @@ impl CertificateResolver { certificate_tx: mpsc::Sender, ) -> Result { let subject_alterative_dns_names = Arc::new(subject_alterative_dns_names); - let certified_key = Self::generate_new_certificate_inner( + let (certified_key, not_after) = Self::generate_new_certificate_inner( subject_alterative_dns_names.clone(), &certificate_tx, ) @@ -77,26 +82,51 @@ impl CertificateResolver { Ok(Self { subject_alterative_dns_names, current_certified_key: ArcSwap::new(certified_key), + current_not_after: ArcSwap::new(Arc::new(not_after)), certificate_tx, }) } pub async fn rotate_certificate(&self) -> Result<()> { - let certified_key = self.generate_new_certificate().await?; + let (certified_key, not_after) = self.generate_new_certificate().await?; // TODO: Sign the new cert somehow with the old cert. See https://github.com/stackabletech/decisions/issues/56 self.current_certified_key.store(certified_key); + self.current_not_after.store(Arc::new(not_after)); Ok(()) } - async fn generate_new_certificate(&self) -> Result> { + /// Returns `true` if the current certificate is expired or will expire + /// within the given `buffer` duration according to wall-clock time. + /// + /// This catches cases where the monotonic timer (used by `tokio::time`) + /// has drifted from wall-clock time, e.g. due to system hibernation. + pub fn needs_rotation(&self, buffer: std::time::Duration) -> bool { + let not_after = **self.current_not_after.load(); + // If subtraction underflows (buffer > time since epoch), fall back to + // UNIX_EPOCH so that the comparison always triggers rotation. + let deadline = not_after + .checked_sub(buffer) + .unwrap_or(SystemTime::UNIX_EPOCH); + + tracing::debug!( + x509.subject_alterative_names = ?self.subject_alterative_dns_names, + x509.not_after = %humantime::format_rfc3339(not_after), + deadline = %humantime::format_rfc3339(deadline), + "checking if certificate needs rotation" + ); + + SystemTime::now() >= deadline + } + + async fn generate_new_certificate(&self) -> Result<(Arc, SystemTime)> { let subject_alterative_dns_names = self.subject_alterative_dns_names.clone(); Self::generate_new_certificate_inner(subject_alterative_dns_names, &self.certificate_tx) .await } - /// Creates a new certificate and returns the certified key. + /// Creates a new certificate and returns the certified key as well as `notAfter` timestamp. /// /// The certificate is send to the passed `cert_tx`. /// @@ -106,7 +136,7 @@ impl CertificateResolver { async fn generate_new_certificate_inner( subject_alterative_dns_names: Arc>, certificate_tx: &mpsc::Sender, - ) -> Result> { + ) -> Result<(Arc, SystemTime)> { // The certificate generations can take a while, so we use `spawn_blocking` let (cert, certified_key) = tokio::task::spawn_blocking(move || { let tls_provider = @@ -144,12 +174,14 @@ impl CertificateResolver { .await .context(TokioSpawnBlockingSnafu)??; + let not_after = cert.tbs_certificate.validity.not_after.to_system_time(); + certificate_tx .send(cert) .await .map_err(|_err| CertificateResolverError::SendCertificateToChannel)?; - Ok(certified_key) + Ok((certified_key, not_after)) } } diff --git a/crates/stackable-webhook/src/tls/mod.rs b/crates/stackable-webhook/src/tls/mod.rs index 3c5d36482..20d12a1fc 100644 --- a/crates/stackable-webhook/src/tls/mod.rs +++ b/crates/stackable-webhook/src/tls/mod.rs @@ -38,9 +38,30 @@ use crate::{ mod cert_resolver; -pub const WEBHOOK_CA_LIFETIME: Duration = Duration::from_hours_unchecked(24); -pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration = Duration::from_hours_unchecked(24); -pub const WEBHOOK_CERTIFICATE_ROTATION_INTERVAL: Duration = Duration::from_hours_unchecked(20); +/// All certificates (CAs and leaf certificates) are valid for this amount of time in hours. If this +/// is ever reduced, ensure it stays well above [`CERTIFICATE_ROTATION_CHECK_INTERVAL`] +/// (currently 5 minutes), otherwise the certificate could expire between checks. +const CERTIFICATE_LIFETIME_HOURS: u64 = 24; + +/// The CA lifetime as a human-readable [`Duration`]. +pub const WEBHOOK_CA_LIFETIME: Duration = + Duration::from_hours_unchecked(CERTIFICATE_LIFETIME_HOURS); + +/// The leaf certificate lifetime as a human-readable [`Duration`]. +pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration = + Duration::from_hours_unchecked(CERTIFICATE_LIFETIME_HOURS); + +/// Rotate the certificate when less than 1/6 of its lifetime remains (4 hours for the current 24h +/// lifetime). Derived from [`CERTIFICATE_LIFETIME_HOURS`] so it scales if the lifetime changes. +const CERTIFICATE_EXPIRY_BUFFER_HOURS: u64 = CERTIFICATE_LIFETIME_HOURS * 60 / 6; + +const CERTIFICATE_EXPIRY_BUFFER: Duration = + Duration::from_minutes_unchecked(CERTIFICATE_EXPIRY_BUFFER_HOURS); + +/// How often to check whether the certificate needs rotation. This is intentionally independent of +/// the certificate lifetime - it controls how quickly we detect wall-clock drift (from hibernation, +/// VM migration, etc.), not how long the certificate lives. +const CERTIFICATE_ROTATION_CHECK_INTERVAL: Duration = Duration::from_minutes_unchecked(5); pub type Result = std::result::Result; @@ -153,8 +174,9 @@ impl TlsServer { router, } = self; - let start = tokio::time::Instant::now() + *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL; - let mut interval = tokio::time::interval_at(start, *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL); + let start = tokio::time::Instant::now() + *CERTIFICATE_ROTATION_CHECK_INTERVAL; + let mut rotation_check_interval = + tokio::time::interval_at(start, *CERTIFICATE_ROTATION_CHECK_INTERVAL); let tls_acceptor = TlsAcceptor::from(Arc::new(config)); let tcp_listener = TcpListener::bind(socket_addr) @@ -183,11 +205,10 @@ impl TlsServer { loop { let tls_acceptor = tls_acceptor.clone(); - // Wait for either a new TCP connection or the certificate rotation interval tick tokio::select! { // We opt for a biased execution of arms to make sure we always check if a - // shutdown signal was received or the certificate needs rotation based on the - // interval. This ensures, we always use a valid certificate for the TLS connection. + // shutdown signal was received or the certificate needs rotation before + // accepting new connections. biased; // Once a shutdown signal is received (this future becomes `Poll::Ready`), break out @@ -198,13 +219,16 @@ impl TlsServer { break; } - // This is cancellation-safe. If this branch is cancelled, the tick is NOT consumed. - // As such, we will not miss rotating the certificate. - _ = interval.tick() => { - cert_resolver - .rotate_certificate() - .await - .context(RotateCertificateSnafu)? + // Check wall-clock time to decide if the certificate needs rotation. + // This is cancellation-safe: if cancelled, the tick is NOT consumed. + _ = rotation_check_interval.tick() => { + if cert_resolver.needs_rotation(*CERTIFICATE_EXPIRY_BUFFER) { + tracing::info!("certificate approaching expiry, rotating"); + cert_resolver + .rotate_certificate() + .await + .context(RotateCertificateSnafu)?; + } } // This is cancellation-safe. If cancelled, no new connections are accepted.