From 37e00f738f73aea14f5f8fe10bfe36ba8f31aee9 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Sun, 29 Mar 2026 16:16:10 +0100 Subject: [PATCH 1/4] feat: increase staged rollout window from 1 hour to 24 hours Prevents clustered restarts when a new release is published by spreading node upgrades evenly across a 24-hour window instead of 1 hour. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.rs b/src/config.rs index 1fc3fa97..7b70dc17 100644 --- a/src/config.rs +++ b/src/config.rs @@ -369,7 +369,7 @@ const fn default_check_interval() -> u64 { } const fn default_staged_rollout_hours() -> u64 { - 1 // 1 hour window for staged rollout (testing) + 24 // 24 hour window for staged rollout } // ============================================================================ From b289215bf482d466ed18f994b8dd09bda2ac2b83 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Sun, 29 Mar 2026 22:42:13 +0100 Subject: [PATCH 2/4] feat: sleep for exact rollout delay instead of polling at check interval When an upgrade is pending, the monitor task now sleeps for precisely the remaining rollout delay rather than waiting for the next check interval tick. This eliminates restart clustering caused by quantization to the check interval. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/node.rs | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/node.rs b/src/node.rs index 2726f9d2..0d58ad51 100644 --- a/src/node.rs +++ b/src/node.rs @@ -585,16 +585,33 @@ impl RunningNode { warn!("Error during upgrade process: {}", e); } } - // Schedule next check with jitter to prevent fleet re-alignment - let jittered_duration = - jittered_interval(monitor.check_interval()); - let next_check = chrono::Utc::now() - + chrono::Duration::from_std(jittered_duration).unwrap_or_else(|e| { - warn!("chrono::Duration::from_std failed for interval ({e}), defaulting to 1 hour"); - chrono::Duration::hours(1) - }); - info!("Next upgrade check scheduled for {}", next_check.to_rfc3339()); - tokio::time::sleep(jittered_duration).await; + // If an upgrade is pending, sleep for exactly the remaining + // rollout delay so the node restarts at its scheduled time + // rather than waiting for the next check interval tick. + let sleep_duration = monitor.time_until_upgrade().map_or_else( + || { + // No pending upgrade - schedule next check with jitter + let jittered_duration = + jittered_interval(monitor.check_interval()); + let next_check = chrono::Utc::now() + + chrono::Duration::from_std(jittered_duration).unwrap_or_else(|e| { + warn!("chrono::Duration::from_std failed for interval ({e}), defaulting to 1 hour"); + chrono::Duration::hours(1) + }); + info!("Next upgrade check scheduled for {}", next_check.to_rfc3339()); + jittered_duration + }, + |remaining| { + let wake_time = chrono::Utc::now() + + chrono::Duration::from_std(remaining).unwrap_or_else(|e| { + warn!("chrono::Duration::from_std failed for rollout delay ({e}), defaulting to 1 minute"); + chrono::Duration::minutes(1) + }); + info!("Will apply upgrade at {}", wake_time.to_rfc3339()); + remaining + }, + ); + tokio::time::sleep(sleep_duration).await; } } } From 79f33282ef5094ea9d3be8ee81a80a71cca55eee Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Sun, 29 Mar 2026 22:47:56 +0100 Subject: [PATCH 3/4] fix: skip crates.io publish for pre-release versions Pre-release versions (alpha, beta, rc) should not be published to crates.io. Also removes publish-crate from the release job dependency chain so pre-release GitHub releases aren't blocked. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index dd34296c..22c1f53c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -371,7 +371,7 @@ jobs: name: Publish to crates.io needs: [validate, test] runs-on: ubuntu-latest - if: github.event.inputs.dry_run != 'true' + if: github.event.inputs.dry_run != 'true' && needs.validate.outputs.is_prerelease != 'true' steps: - uses: actions/checkout@v4 @@ -390,7 +390,7 @@ jobs: release: name: Create GitHub Release - needs: [validate, sign, publish-crate] + needs: [validate, sign] runs-on: ubuntu-latest if: github.event.inputs.dry_run != 'true' steps: From 4876f26bcb409508f16b607aae3ce3c8d65e81a3 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Mon, 30 Mar 2026 18:21:03 +0100 Subject: [PATCH 4/4] fix: address PR review feedback - Add backoff when rollout delay has elapsed but upgrade failed, to prevent a tight retry loop on Duration::ZERO - Wrap upgrade sleep in tokio::select! with shutdown.cancelled() so shutdown can interrupt long rollout delay sleeps - Restore publish-crate dependency on release job with conditional logic: release proceeds if publish-crate succeeds or was skipped (pre-release), but blocks if it fails (stable) Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/release.yml | 4 ++-- src/node.rs | 41 ++++++++++++++++++++++++++++------- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 22c1f53c..17e1cb82 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -390,9 +390,9 @@ jobs: release: name: Create GitHub Release - needs: [validate, sign] + needs: [validate, sign, publish-crate] + if: ${{ always() && !cancelled() && needs.validate.result == 'success' && needs.sign.result == 'success' && (needs.publish-crate.result == 'success' || needs.publish-crate.result == 'skipped') && github.event.inputs.dry_run != 'true' }} runs-on: ubuntu-latest - if: github.event.inputs.dry_run != 'true' steps: - uses: actions/checkout@v4 diff --git a/src/node.rs b/src/node.rs index 0d58ad51..e8a41bed 100644 --- a/src/node.rs +++ b/src/node.rs @@ -602,16 +602,41 @@ impl RunningNode { jittered_duration }, |remaining| { - let wake_time = chrono::Utc::now() - + chrono::Duration::from_std(remaining).unwrap_or_else(|e| { - warn!("chrono::Duration::from_std failed for rollout delay ({e}), defaulting to 1 minute"); - chrono::Duration::minutes(1) - }); - info!("Will apply upgrade at {}", wake_time.to_rfc3339()); - remaining + // If the rollout delay has fully elapsed but the upgrade was + // not successfully applied, avoid a tight loop by backing off + // at least one check interval before retrying. + if remaining.is_zero() { + let backoff = jittered_interval(monitor.check_interval()); + let next_check = chrono::Utc::now() + + chrono::Duration::from_std(backoff).unwrap_or_else(|e| { + warn!("chrono::Duration::from_std failed for backoff ({e}), defaulting to 1 hour"); + chrono::Duration::hours(1) + }); + info!( + "Upgrade rollout delay elapsed but previous apply did not succeed; \ + backing off, next check scheduled for {}", + next_check.to_rfc3339() + ); + backoff + } else { + let wake_time = chrono::Utc::now() + + chrono::Duration::from_std(remaining).unwrap_or_else(|e| { + warn!("chrono::Duration::from_std failed for rollout delay ({e}), defaulting to 1 minute"); + chrono::Duration::minutes(1) + }); + info!("Will apply upgrade at {}", wake_time.to_rfc3339()); + remaining + } }, ); - tokio::time::sleep(sleep_duration).await; + // Use select! so shutdown can interrupt long sleeps + // (e.g. during a full rollout window delay). + tokio::select! { + () = shutdown.cancelled() => { + break; + } + () = tokio::time::sleep(sleep_duration) => {} + } } } }