From 261bebe12420fd8fd55a4911df9c41ba62462074 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amaury=20Lev=C3=A9?= Date: Mon, 25 May 2026 22:52:10 +0200 Subject: [PATCH 1/2] Prototype: phased shutdown for MTP (#5345) Introduce a two-phase shutdown model for Microsoft.Testing.Platform so test sessions get a deterministic drain window before being aborted: - Extend internal ITestApplicationCancellationTokenSource with DrainingToken (graceful cancel) and AbortingToken (forceful abort), plus an Abort() entry point. The existing CancellationToken is kept as a back-compat alias for DrainingToken so current consumers keep observing graceful cancellation without changes. - Rewrite CTRLPlusCCancellationTokenSource with a Running -> Draining -> Aborting state machine: * 1st Ctrl+C enters Draining, starts a 30s grace timer that escalates to Aborting on elapse. * 2nd Ctrl+C escalates to Aborting immediately, starts a 10s abort-timeout safety net that FailFasts via IEnvironment. * 3rd Ctrl+C is no longer intercepted - the runtime terminates the process (matches docker compose / kubectl / npm UX). - Add 6 unit tests covering initial state, single/idempotent cancel, abort, grace-period escalation, and zero-grace escalation. Passes on net9.0 and net462. CLI options (--shutdown-grace-period, --shutdown-abort-timeout), env- var propagation to controlled hosts, TerminalOutputDevice UX updates, and migration of existing token consumers are intentionally deferred to follow-up PRs and tracked in the RFC. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../CTRLPlusCCancellationTokenSource.cs | 171 ++++++++++++++++-- ...ITestApplicationCancellationTokenSource.cs | 37 ++++ .../Hosts/CommonHostTests.cs | 8 + .../CTRLPlusCCancellationTokenSourceTests.cs | 112 ++++++++++++ 4 files changed, 315 insertions(+), 13 deletions(-) create mode 100644 test/UnitTests/Microsoft.Testing.Platform.UnitTests/Services/CTRLPlusCCancellationTokenSourceTests.cs diff --git a/src/Platform/Microsoft.Testing.Platform/Services/CTRLPlusCCancellationTokenSource.cs b/src/Platform/Microsoft.Testing.Platform/Services/CTRLPlusCCancellationTokenSource.cs index 9226f4c862..20a7122389 100644 --- a/src/Platform/Microsoft.Testing.Platform/Services/CTRLPlusCCancellationTokenSource.cs +++ b/src/Platform/Microsoft.Testing.Platform/Services/CTRLPlusCCancellationTokenSource.cs @@ -6,19 +6,65 @@ namespace Microsoft.Testing.Platform.Services; +/// +/// Two-phase, Ctrl+C-aware . +/// +/// +/// Phase machine (see RFC "Phased graceful shutdown for MTP", issue #5345): +/// +/// RUNNING ──Ctrl+C / Cancel()──▶ DRAINING ──grace elapsed / 2nd Ctrl+C / Abort()──▶ ABORTING +/// ──3rd Ctrl+C──▶ (process terminated by runtime) +/// +/// +/// Transitions are idempotent and one-way. Existing consumers reading +/// automatically observe Draining (back-compat). +/// +/// internal sealed class CTRLPlusCCancellationTokenSource : ITestApplicationCancellationTokenSource, IDisposable { - private readonly CancellationTokenSource _cancellationTokenSource = new(); + // Conservative defaults inspired by .NET HostOptions.ShutdownTimeout (30s) and + // Vitest's teardownTimeout (10s). Will become CLI options in a follow-up + // (--shutdown-grace-period, --shutdown-abort-timeout). + // TODO(#5345): wire to PlatformCommandLineProvider. + internal static readonly TimeSpan DefaultGracePeriod = TimeSpan.FromSeconds(30); + internal static readonly TimeSpan DefaultAbortTimeout = TimeSpan.FromSeconds(10); + + private const int PhaseRunning = 0; + private const int PhaseDraining = 1; + private const int PhaseAborting = 2; + + private readonly CancellationTokenSource _drainingCts = new(); + private readonly CancellationTokenSource _abortingCts = new(); + private readonly TimeSpan _gracePeriod; + private readonly TimeSpan _abortTimeout; + private readonly IEnvironment _environment; private readonly ILogger? _logger; + private int _phase = PhaseRunning; + private int _ctrlCCount; + public CTRLPlusCCancellationTokenSource(IConsole? console = null, ILogger? logger = null) + : this(console, logger, DefaultGracePeriod, DefaultAbortTimeout, environment: null) { + } + + // Test-friendly overload so we can exercise the phase machine without waiting 30s. + internal CTRLPlusCCancellationTokenSource( + IConsole? console, + ILogger? logger, + TimeSpan gracePeriod, + TimeSpan abortTimeout, + IEnvironment? environment = null) + { + _gracePeriod = gracePeriod; + _abortTimeout = abortTimeout; + _environment = environment ?? new SystemEnvironment(); + _logger = logger; + if (console is not null && !IsCancelKeyPressNotSupported()) { console.CancelKeyPress += OnConsoleCancelKeyPressed; } - - _logger = logger; } [SupportedOSPlatformGuard("android")] @@ -33,27 +79,126 @@ private static bool IsCancelKeyPressNotSupported() OperatingSystem.IsWasi() || OperatingSystem.IsBrowser(); - public void CancelAfter(TimeSpan timeout) => _cancellationTokenSource.CancelAfter(timeout); + /// + public CancellationToken CancellationToken => _drainingCts.Token; + + /// + public CancellationToken DrainingToken => _drainingCts.Token; + + /// + public CancellationToken AbortingToken => _abortingCts.Token; + + internal int CurrentPhase => Volatile.Read(ref _phase); + + public void CancelAfter(TimeSpan timeout) => _drainingCts.CancelAfter(timeout); - public CancellationToken CancellationToken - => _cancellationTokenSource.Token; + /// + public void Cancel() => EnterDraining(); + + /// + public void Abort() + { + EnterDraining(); + EnterAborting(); + } + + public void Dispose() + { + _drainingCts.Dispose(); + _abortingCts.Dispose(); + } private void OnConsoleCancelKeyPressed(object? sender, ConsoleCancelEventArgs e) { - e.Cancel = true; + int count = Interlocked.Increment(ref _ctrlCCount); + + switch (count) + { + case 1: + // 1st Ctrl+C: cooperative cancel. + e.Cancel = true; + EnterDraining(); + break; + case 2: + // 2nd Ctrl+C: escalate to abort. + e.Cancel = true; + EnterAborting(); + break; + default: + // 3rd+ Ctrl+C: stop intercepting and let the runtime terminate + // the process. This matches docker compose / kubectl / npm UX: + // the user has explicitly asked us to die. + e.Cancel = false; + break; + } + } + + private void EnterDraining() + { + if (Interlocked.CompareExchange(ref _phase, PhaseDraining, PhaseRunning) != PhaseRunning) + { + return; + } + try { - _cancellationTokenSource.Cancel(); + _drainingCts.Cancel(); } catch (AggregateException ex) { - _logger?.LogWarning($"Exception during CTRLPlusCCancellationTokenSource cancel:\n{ex}"); + _logger?.LogWarning($"Exception during shutdown (Draining):\n{ex}"); + } + + // Auto-escalate to Aborting after the grace period. + if (_gracePeriod > TimeSpan.Zero && _gracePeriod != Timeout.InfiniteTimeSpan) + { + ScheduleEscalation(_gracePeriod, EnterAborting); + } + else if (_gracePeriod == TimeSpan.Zero) + { + EnterAborting(); } } - public void Dispose() - => _cancellationTokenSource.Dispose(); + private void EnterAborting() + { + if (Interlocked.Exchange(ref _phase, PhaseAborting) == PhaseAborting) + { + return; + } - public void Cancel() - => _cancellationTokenSource.Cancel(); + try + { + _abortingCts.Cancel(); + } + catch (AggregateException ex) + { + _logger?.LogWarning($"Exception during shutdown (Aborting):\n{ex}"); + } + + // After abort timeout, if the host is still alive, hard-terminate. + // FailFast is intentional: at this point we asked twice and waited; any + // remaining work has had its chance. This is the safety net that breaks + // hangs in non-cooperative frameworks (issue #5345). + if (_abortTimeout > TimeSpan.Zero && _abortTimeout != Timeout.InfiniteTimeSpan) + { + ScheduleEscalation(_abortTimeout, ForceTerminate); + } + } + + private static void ScheduleEscalation(TimeSpan delay, Action action) + { + // Fire-and-forget timer. We don't dispose: the host is shutting down anyway, + // and a short-lived CTS is cheaper than holding a Timer reference we'd need + // to manage across the phase machine. + var timerCts = new CancellationTokenSource(delay); + timerCts.Token.Register(action); + } + + private void ForceTerminate() + { + _logger?.LogWarning( + $"Shutdown grace exhausted ({_gracePeriod} + {_abortTimeout}); terminating host."); + _environment.FailFast("Test platform shutdown grace period exhausted."); + } } diff --git a/src/Platform/Microsoft.Testing.Platform/Services/ITestApplicationCancellationTokenSource.cs b/src/Platform/Microsoft.Testing.Platform/Services/ITestApplicationCancellationTokenSource.cs index 4d194e02f6..c5a18c0dcc 100644 --- a/src/Platform/Microsoft.Testing.Platform/Services/ITestApplicationCancellationTokenSource.cs +++ b/src/Platform/Microsoft.Testing.Platform/Services/ITestApplicationCancellationTokenSource.cs @@ -3,9 +3,46 @@ namespace Microsoft.Testing.Platform.Services; +/// +/// Source of the platform's lifetime cancellation tokens. Exposes a two-phase +/// shutdown model (see Issue #5345 / RFC "Phased graceful shutdown for MTP"): +/// +/// — set when the platform +/// enters the Draining phase (Ctrl+C, programmatic , +/// test session abort). Consumers should stop dispatching new work and +/// flush in-flight state. +/// — set when the platform +/// enters the Aborting phase (2nd Ctrl+C, grace period elapsed, +/// programmatic ). Consumers should bail out of +/// long-running work as fast as possible. +/// +/// is kept as the back-compat alias for +/// ; existing consumers do not need to change. +/// internal interface ITestApplicationCancellationTokenSource { + /// + /// Gets the back-compat alias for . + /// CancellationToken CancellationToken { get; } + /// + /// Gets the token that is signalled when the platform enters the Draining phase. + /// + CancellationToken DrainingToken { get; } + + /// + /// Gets the token that is signalled when the platform enters the Aborting phase. + /// + CancellationToken AbortingToken { get; } + + /// + /// Request the Draining phase. Idempotent. + /// void Cancel(); + + /// + /// Request the Aborting phase. Idempotent. Equivalent to a second Ctrl+C. + /// + void Abort(); } diff --git a/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Hosts/CommonHostTests.cs b/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Hosts/CommonHostTests.cs index b8d5f76a62..9a1b1aa72f 100644 --- a/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Hosts/CommonHostTests.cs +++ b/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Hosts/CommonHostTests.cs @@ -112,9 +112,17 @@ private sealed class TestApplicationCancellationTokenSource : ITestApplicationCa { public CancellationToken CancellationToken => CancellationToken.None; + public CancellationToken DrainingToken => CancellationToken.None; + + public CancellationToken AbortingToken => CancellationToken.None; + public void Cancel() { } + + public void Abort() + { + } } private sealed class AsyncCleanableTestHostApplicationLifetime : ITestHostApplicationLifetime, IAsyncCleanableExtension diff --git a/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Services/CTRLPlusCCancellationTokenSourceTests.cs b/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Services/CTRLPlusCCancellationTokenSourceTests.cs new file mode 100644 index 0000000000..55d048a047 --- /dev/null +++ b/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Services/CTRLPlusCCancellationTokenSourceTests.cs @@ -0,0 +1,112 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using Microsoft.Testing.Platform.Services; + +namespace Microsoft.Testing.Platform.UnitTests; + +[TestClass] +public sealed class CTRLPlusCCancellationTokenSourceTests +{ + [TestMethod] + public void Initial_State_NeitherTokenIsCancelled() + { + using var source = new CTRLPlusCCancellationTokenSource( + console: null, + logger: null, + gracePeriod: Timeout.InfiniteTimeSpan, + abortTimeout: Timeout.InfiniteTimeSpan); + + Assert.IsFalse(source.CancellationToken.IsCancellationRequested); + Assert.IsFalse(source.DrainingToken.IsCancellationRequested); + Assert.IsFalse(source.AbortingToken.IsCancellationRequested); + } + + [TestMethod] + public void Cancel_OnlySignalsDrainingToken() + { + using var source = new CTRLPlusCCancellationTokenSource( + console: null, + logger: null, + gracePeriod: Timeout.InfiniteTimeSpan, + abortTimeout: Timeout.InfiniteTimeSpan); + + source.Cancel(); + + Assert.IsTrue(source.DrainingToken.IsCancellationRequested); + Assert.IsTrue(source.CancellationToken.IsCancellationRequested, "Legacy alias must follow DrainingToken."); + Assert.IsFalse(source.AbortingToken.IsCancellationRequested); + } + + [TestMethod] + public void Abort_SignalsBothTokens() + { + using var source = new CTRLPlusCCancellationTokenSource( + console: null, + logger: null, + gracePeriod: Timeout.InfiniteTimeSpan, + abortTimeout: Timeout.InfiniteTimeSpan); + + source.Abort(); + + Assert.IsTrue(source.DrainingToken.IsCancellationRequested); + Assert.IsTrue(source.AbortingToken.IsCancellationRequested); + } + + [TestMethod] + public void Cancel_IsIdempotent() + { + using var source = new CTRLPlusCCancellationTokenSource( + console: null, + logger: null, + gracePeriod: Timeout.InfiniteTimeSpan, + abortTimeout: Timeout.InfiniteTimeSpan); + + source.Cancel(); + source.Cancel(); + source.Cancel(); + + Assert.IsTrue(source.DrainingToken.IsCancellationRequested); + Assert.IsFalse(source.AbortingToken.IsCancellationRequested); + } + + [TestMethod] + public async Task GracePeriodElapse_EscalatesToAborting() + { + using var source = new CTRLPlusCCancellationTokenSource( + console: null, + logger: null, + gracePeriod: TimeSpan.FromMilliseconds(50), + abortTimeout: Timeout.InfiniteTimeSpan); + + source.Cancel(); + Assert.IsTrue(source.DrainingToken.IsCancellationRequested); + Assert.IsFalse(source.AbortingToken.IsCancellationRequested); + + // Wait for grace to elapse plus a margin. + using var waitCts = new CancellationTokenSource(TimeSpan.FromSeconds(5)); + while (!source.AbortingToken.IsCancellationRequested && !waitCts.IsCancellationRequested) + { + await Task.Delay(10, TestContext.CancellationToken).ConfigureAwait(false); + } + + Assert.IsTrue(source.AbortingToken.IsCancellationRequested, "Aborting must trip after the grace period."); + } + + public TestContext TestContext { get; set; } = null!; + + [TestMethod] + public void ZeroGracePeriod_ImmediatelyEscalatesToAborting() + { + using var source = new CTRLPlusCCancellationTokenSource( + console: null, + logger: null, + gracePeriod: TimeSpan.Zero, + abortTimeout: Timeout.InfiniteTimeSpan); + + source.Cancel(); + + Assert.IsTrue(source.DrainingToken.IsCancellationRequested); + Assert.IsTrue(source.AbortingToken.IsCancellationRequested); + } +} From c9ed2914f4f1b4df684a63633dd0b6e3afe19ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amaury=20Lev=C3=A9?= Date: Thu, 4 Jun 2026 23:13:46 +0200 Subject: [PATCH 2/2] Address review on phased-shutdown CTRLPlusCCancellationTokenSource - Dispose() now detaches the Console.CancelKeyPress handler and disposes any pending escalation CancellationTokenSources so callbacks don't fire after the source is disposed and we don't keep the instance alive through the static console event. - OnConsoleCancelKeyPressed guards against post-Dispose invocations and routes the 2nd Ctrl+C through Abort() rather than EnterAborting() directly, keeping the Running -> Draining -> Aborting invariant. - EnterAborting() now calls EnterDraining() first so the legacy CancellationToken / DrainingToken are always signaled when AbortingToken is canceled, and uses CompareExchange(Draining -> Aborting) instead of an unconditional Exchange. - ScheduleEscalation retains the timer CancellationTokenSource so it can be disposed in Dispose(). - Test GracePeriodElapse_EscalatesToAborting is now event-driven via a TaskCompletionSource registered on AbortingToken (no more Task.Delay polling loop). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../CTRLPlusCCancellationTokenSource.cs | 78 +++++++++++++++++-- .../CTRLPlusCCancellationTokenSourceTests.cs | 15 ++-- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/src/Platform/Microsoft.Testing.Platform/Services/CTRLPlusCCancellationTokenSource.cs b/src/Platform/Microsoft.Testing.Platform/Services/CTRLPlusCCancellationTokenSource.cs index 20a7122389..b60ba66d43 100644 --- a/src/Platform/Microsoft.Testing.Platform/Services/CTRLPlusCCancellationTokenSource.cs +++ b/src/Platform/Microsoft.Testing.Platform/Services/CTRLPlusCCancellationTokenSource.cs @@ -39,9 +39,16 @@ internal sealed class CTRLPlusCCancellationTokenSource : ITestApplicationCancell private readonly TimeSpan _abortTimeout; private readonly IEnvironment _environment; private readonly ILogger? _logger; + private readonly IConsole? _console; + private readonly object _escalationLock = new(); + + // Fire-and-forget escalation timers, retained so we can dispose them in + // Dispose() and prevent callbacks from firing after the host has shut down. + private List? _escalations; private int _phase = PhaseRunning; private int _ctrlCCount; + private int _disposed; public CTRLPlusCCancellationTokenSource(IConsole? console = null, ILogger? logger = null) : this(console, logger, DefaultGracePeriod, DefaultAbortTimeout, environment: null) @@ -63,6 +70,7 @@ internal CTRLPlusCCancellationTokenSource( if (console is not null && !IsCancelKeyPressNotSupported()) { + _console = console; console.CancelKeyPress += OnConsoleCancelKeyPressed; } } @@ -104,12 +112,47 @@ public void Abort() public void Dispose() { + if (Interlocked.Exchange(ref _disposed, 1) != 0) + { + return; + } + + // Detach from Console.CancelKeyPress so we don't keep this instance alive + // through the static event and don't invoke the handler after disposal. + if (_console is not null && !IsCancelKeyPressNotSupported()) + { + _console.CancelKeyPress -= OnConsoleCancelKeyPressed; + } + + List? escalations; + lock (_escalationLock) + { + escalations = _escalations; + _escalations = null; + } + + if (escalations is not null) + { + foreach (CancellationTokenSource escalation in escalations) + { + escalation.Dispose(); + } + } + _drainingCts.Dispose(); _abortingCts.Dispose(); } private void OnConsoleCancelKeyPressed(object? sender, ConsoleCancelEventArgs e) { + // Guard against handlers that may race with Dispose (the unsubscribe in + // Dispose() is best-effort: the event invocation list could already have + // captured this delegate before we removed it). + if (Volatile.Read(ref _disposed) != 0) + { + return; + } + int count = Interlocked.Increment(ref _ctrlCCount); switch (count) @@ -120,9 +163,13 @@ private void OnConsoleCancelKeyPressed(object? sender, ConsoleCancelEventArgs e) EnterDraining(); break; case 2: - // 2nd Ctrl+C: escalate to abort. + // 2nd Ctrl+C: escalate to abort. Go through Abort() (which calls + // EnterDraining() then EnterAborting()) so the + // Running → Draining → Aborting invariant holds even if a future + // refactor lets a 2nd Ctrl+C arrive before the 1st has been + // processed (or before EnterDraining was reached at all). e.Cancel = true; - EnterAborting(); + Abort(); break; default: // 3rd+ Ctrl+C: stop intercepting and let the runtime terminate @@ -162,7 +209,13 @@ private void EnterDraining() private void EnterAborting() { - if (Interlocked.Exchange(ref _phase, PhaseAborting) == PhaseAborting) + // Aborting implies Draining: ensure the legacy CancellationToken / + // DrainingToken are always signaled when Aborting is signaled, even if + // EnterAborting is invoked from a Running state (e.g., via Abort() or a + // future direct trigger). EnterDraining is idempotent. + EnterDraining(); + + if (Interlocked.CompareExchange(ref _phase, PhaseAborting, PhaseDraining) != PhaseDraining) { return; } @@ -186,13 +239,24 @@ private void EnterAborting() } } - private static void ScheduleEscalation(TimeSpan delay, Action action) + private void ScheduleEscalation(TimeSpan delay, Action action) { - // Fire-and-forget timer. We don't dispose: the host is shutting down anyway, - // and a short-lived CTS is cheaper than holding a Timer reference we'd need - // to manage across the phase machine. var timerCts = new CancellationTokenSource(delay); timerCts.Token.Register(action); + + // Retain the CTS so Dispose() can cancel pending escalations and + // release the underlying timer. Without this, the timer would run to + // completion and could invoke the callback after the source is disposed. + lock (_escalationLock) + { + if (Volatile.Read(ref _disposed) != 0) + { + timerCts.Dispose(); + return; + } + + (_escalations ??= []).Add(timerCts); + } } private void ForceTerminate() diff --git a/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Services/CTRLPlusCCancellationTokenSourceTests.cs b/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Services/CTRLPlusCCancellationTokenSourceTests.cs index 55d048a047..d8d7eced5e 100644 --- a/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Services/CTRLPlusCCancellationTokenSourceTests.cs +++ b/test/UnitTests/Microsoft.Testing.Platform.UnitTests/Services/CTRLPlusCCancellationTokenSourceTests.cs @@ -83,13 +83,16 @@ public async Task GracePeriodElapse_EscalatesToAborting() Assert.IsTrue(source.DrainingToken.IsCancellationRequested); Assert.IsFalse(source.AbortingToken.IsCancellationRequested); - // Wait for grace to elapse plus a margin. - using var waitCts = new CancellationTokenSource(TimeSpan.FromSeconds(5)); - while (!source.AbortingToken.IsCancellationRequested && !waitCts.IsCancellationRequested) - { - await Task.Delay(10, TestContext.CancellationToken).ConfigureAwait(false); - } + // Event-driven wait: complete a TaskCompletionSource as soon as the token + // is canceled rather than polling, so the test finishes immediately after + // the grace period elapses (avoids Task.Delay flakiness under load). + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + using CancellationTokenRegistration registration = source.AbortingToken.Register(() => tcs.TrySetResult(true)); + Task timeoutTask = Task.Delay(TimeSpan.FromSeconds(5), TestContext.CancellationToken); + Task completed = await Task.WhenAny(tcs.Task, timeoutTask).ConfigureAwait(false); + + Assert.AreSame(tcs.Task, completed, "Aborting must trip before the timeout."); Assert.IsTrue(source.AbortingToken.IsCancellationRequested, "Aborting must trip after the grace period."); }