diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index dd34296..17e1cb8 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -371,7 +371,7 @@ jobs:
     name: Publish to crates.io
     needs: [validate, test]
     runs-on: ubuntu-latest
-    if: github.event.inputs.dry_run != 'true'
+    if: github.event.inputs.dry_run != 'true' && needs.validate.outputs.is_prerelease != 'true'
     steps:
       - uses: actions/checkout@v4
 
@@ -391,8 +391,8 @@ jobs:
   release:
     name: Create GitHub Release
     needs: [validate, sign, publish-crate]
+    if: ${{ always() && !cancelled() && needs.validate.result == 'success' && needs.sign.result == 'success' && (needs.publish-crate.result == 'success' || needs.publish-crate.result == 'skipped') && github.event.inputs.dry_run != 'true' }}
     runs-on: ubuntu-latest
-    if: github.event.inputs.dry_run != 'true'
     steps:
       - uses: actions/checkout@v4
 
diff --git a/src/config.rs b/src/config.rs
index 1fc3fa9..7b70dc1 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -369,7 +369,7 @@ const fn default_check_interval() -> u64 {
 }
 
 const fn default_staged_rollout_hours() -> u64 {
-    1 // 1 hour window for staged rollout (testing)
+    24 // 24 hour window for staged rollout
 }
 
 // ============================================================================
diff --git a/src/node.rs b/src/node.rs
index 2726f9d..e8a41be 100644
--- a/src/node.rs
+++ b/src/node.rs
@@ -585,16 +585,58 @@ impl RunningNode {
                                     warn!("Error during upgrade process: {}", e);
                                 }
                             }
-                            // Schedule next check with jitter to prevent fleet re-alignment
-                            let jittered_duration =
-                                jittered_interval(monitor.check_interval());
-                            let next_check = chrono::Utc::now()
-                                + chrono::Duration::from_std(jittered_duration).unwrap_or_else(|e| {
-                                    warn!("chrono::Duration::from_std failed for interval ({e}), defaulting to 1 hour");
-                                    chrono::Duration::hours(1)
-                                });
-                            info!("Next upgrade check scheduled for {}", next_check.to_rfc3339());
-                            tokio::time::sleep(jittered_duration).await;
+                            // While an upgrade is pending, sleep only until its rollout delay
+                            // ends so the node restarts at the scheduled time rather than on the
+                            // next check-interval tick.
+                            let sleep_duration = match monitor.time_until_upgrade() {
+                                // Nothing pending: schedule the next check, jittered.
+                                None => {
+                                    let interval = jittered_interval(monitor.check_interval());
+                                    let now = chrono::Utc::now();
+                                    let offset = chrono::Duration::from_std(interval).unwrap_or_else(|e| {
+                                        warn!("chrono::Duration::from_std failed for interval ({e}), defaulting to 1 hour");
+                                        chrono::Duration::hours(1)
+                                    });
+                                    let next_check = now + offset;
+                                    info!("Next upgrade check scheduled for {}", next_check.to_rfc3339());
+                                    interval
+                                }
+                                // The rollout delay has run out yet an upgrade is still pending,
+                                // i.e. the previous apply did not succeed. Back off for a jittered
+                                // check interval instead of spinning in a tight retry loop.
+                                Some(remaining) if remaining.is_zero() => {
+                                    let backoff = jittered_interval(monitor.check_interval());
+                                    let now = chrono::Utc::now();
+                                    let offset = chrono::Duration::from_std(backoff).unwrap_or_else(|e| {
+                                        warn!("chrono::Duration::from_std failed for backoff ({e}), defaulting to 1 hour");
+                                        chrono::Duration::hours(1)
+                                    });
+                                    let next_check = now + offset;
+                                    info!(
+                                        "Upgrade rollout delay elapsed but previous apply did not succeed; \
+                                         backing off, next check scheduled for {}",
+                                        next_check.to_rfc3339()
+                                    );
+                                    backoff
+                                }
+                                // Rollout delay still running: sleep for exactly what is left.
+                                Some(remaining) => {
+                                    let now = chrono::Utc::now();
+                                    let offset = chrono::Duration::from_std(remaining).unwrap_or_else(|e| {
+                                        warn!("chrono::Duration::from_std failed for rollout delay ({e}), defaulting to 1 minute");
+                                        chrono::Duration::minutes(1)
+                                    });
+                                    let wake_time = now + offset;
+                                    info!("Will apply upgrade at {}", wake_time.to_rfc3339());
+                                    remaining
+                                }
+                            };
+                            // Race the sleep against shutdown so a long sleep (e.g. a full
+                            // rollout window delay) can be interrupted.
+                            tokio::select! {
+                                () = shutdown.cancelled() => break,
+                                () = tokio::time::sleep(sleep_duration) => {}
+                            }
                         }
                     }
                 }