diff --git a/RELEASENOTES.md b/RELEASENOTES.md index e69de29bb..fc2fa6ae5 100644 --- a/RELEASENOTES.md +++ b/RELEASENOTES.md @@ -0,0 +1 @@ +- Improved retry resilience for transient 5xx server errors (502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout) from the GitHub API, with dedicated exponential backoff and Retry-After header support. Non-transient 4xx client errors are no longer needlessly retried. diff --git a/src/Octoshift/RetryPolicy.cs b/src/Octoshift/RetryPolicy.cs index 550619756..34c098564 100644 --- a/src/Octoshift/RetryPolicy.cs +++ b/src/Octoshift/RetryPolicy.cs @@ -64,8 +64,12 @@ public async Task> RetryOnResult(Func> func, Func Retry(Func> func) => await CreateRetryPolicyForException().ExecuteAsync(func); + public async Task RetryOnAnyException(Func func) => await CreateRetryPolicyForAllExceptions().ExecuteAsync(func); + + public async Task RetryOnAnyException(Func> func) => await CreateRetryPolicyForAllExceptions().ExecuteAsync(func); + private AsyncRetryPolicy CreateRetryPolicyForException() where TException : Exception => Policy - .Handle() + .Handle(ex => IsRetryableException(ex)) .WaitAndRetryAsync(5, retry => retry * TimeSpan.FromMilliseconds(_retryInterval), (Exception ex, TimeSpan ts, Context ctx) => { if (ex is HttpRequestException httpEx) @@ -83,6 +87,35 @@ private AsyncRetryPolicy CreateRetryPolicyForException() where TExce _log?.LogVerbose("Retrying..."); }); + private AsyncRetryPolicy CreateRetryPolicyForAllExceptions() => Policy + .Handle() + .WaitAndRetryAsync(5, retry => retry * TimeSpan.FromMilliseconds(_retryInterval), (Exception ex, TimeSpan ts, Context ctx) => + { + _log?.LogVerbose(ex.ToString()); + _log?.LogVerbose("Retrying..."); + }); + + private static bool IsRetryableException(Exception ex) + { + if (ex is OctoshiftCliException) + { + return false; + } + + if (ex is not HttpRequestException httpEx) + { + return true; + } + + if (httpEx.StatusCode == System.Net.HttpStatusCode.Unauthorized) + { + return true; // Let it through so the retry handler can throw OctoshiftCliException + } + + // Retry on server errors (5xx) and network-level failures (null status code) + return httpEx.StatusCode is null || (int)httpEx.StatusCode >= 500; + } + private void ThrowUnauthorizedException(HttpRequestException ex) { var serviceContext = _serviceName is not null ? $" ({_serviceName})" : ""; diff --git a/src/Octoshift/Services/GithubClient.cs b/src/Octoshift/Services/GithubClient.cs index 6d846bc8d..8d833ec86 100644 --- a/src/Octoshift/Services/GithubClient.cs +++ b/src/Octoshift/Services/GithubClient.cs @@ -24,9 +24,13 @@ public class GithubClient private const int MILLISECONDS_PER_SECOND = 1000; private const int SECONDARY_RATE_LIMIT_MAX_RETRIES = 3; private const int SECONDARY_RATE_LIMIT_DEFAULT_DELAY = 60; // 60 seconds default delay + private const int SERVER_ERROR_MAX_RETRIES = 5; + private const int SERVER_ERROR_DEFAULT_DELAY = 5; // 5 seconds default delay internal int _secondaryRateLimitMaxRetries = SECONDARY_RATE_LIMIT_MAX_RETRIES; // Exposed for testing purposes internal int _secondaryRateLimitDefaultDelay = SECONDARY_RATE_LIMIT_DEFAULT_DELAY; // Exposed for testing purposes + internal int _serverErrorMaxRetries = SERVER_ERROR_MAX_RETRIES; // Exposed for testing purposes + internal int _serverErrorDefaultDelay = SERVER_ERROR_DEFAULT_DELAY; // Exposed for testing purposes public GithubClient(OctoLogger log, HttpClient httpClient, IVersionProvider versionProvider, RetryPolicy retryPolicy, DateTimeProvider dateTimeProvider, string personalAccessToken) { @@ -202,7 +206,8 @@ public virtual async Task PatchAsync(string url, object body, Dictionary object body = null, HttpStatusCode expectedStatus = HttpStatusCode.OK, Dictionary customHeaders = null, - int retryCount = 0) + int retryCount = 0, + int serverErrorRetryCount = 0) { await ApplyRetryDelayAsync(); _log.LogVerbose($"HTTP {httpMethod}: {url}"); @@ -245,12 +250,18 @@ public virtual async Task PatchAsync(string url, object body, Dictionary // Check for secondary rate limits before handling primary rate limits if (IsSecondaryRateLimit(response.StatusCode, content)) { - return await HandleSecondaryRateLimit(httpMethod, url, body, expectedStatus, customHeaders, headers, retryCount); + return await HandleSecondaryRateLimit(httpMethod, url, body, expectedStatus, customHeaders, headers, retryCount, serverErrorRetryCount); + } + + // Check for transient server errors (502, 503, 504) + if (IsTransientServerError(response.StatusCode)) + { + return await HandleServerError(httpMethod, url, body, expectedStatus, customHeaders, headers, retryCount, serverErrorRetryCount); } if (response.StatusCode == HttpStatusCode.Forbidden && _retryDelay > 0) { - (content, headers) = await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount); + (content, headers) = await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount, serverErrorRetryCount); } else if (expectedStatus == HttpStatusCode.OK) { @@ -375,20 +386,21 @@ private bool IsGraphQLSecondaryRateLimit(string content) HttpStatusCode expectedStatus, Dictionary customHeaders, KeyValuePair>[] headers, - int retryCount = 0) + int retryCount = 0, + int serverErrorRetryCount = 0) { if (retryCount >= _secondaryRateLimitMaxRetries) { - throw new OctoshiftCliException($"Secondary rate limit exceeded. Maximum retries ({SECONDARY_RATE_LIMIT_MAX_RETRIES}) reached. Please wait before retrying your request."); + throw new OctoshiftCliException($"Secondary rate limit exceeded. Maximum retries ({_secondaryRateLimitMaxRetries}) reached. Please wait before retrying your request."); } var delaySeconds = GetSecondaryRateLimitDelay(headers, retryCount); - _log.LogWarning($"Secondary rate limit detected (attempt {retryCount + 1}/{SECONDARY_RATE_LIMIT_MAX_RETRIES}). Waiting {delaySeconds} seconds before retrying..."); + _log.LogWarning($"Secondary rate limit detected (attempt {retryCount + 1}/{_secondaryRateLimitMaxRetries}). Waiting {delaySeconds} seconds before retrying..."); await Task.Delay(delaySeconds * MILLISECONDS_PER_SECOND); - return await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount + 1); + return await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount + 1, serverErrorRetryCount); } private int GetSecondaryRateLimitDelay(KeyValuePair>[] headers, int retryCount) @@ -417,4 +429,45 @@ private int GetSecondaryRateLimitDelay(KeyValuePair> // Otherwise use exponential backoff: 1m → 2m → 4m return _secondaryRateLimitDefaultDelay * (int)Math.Pow(2, retryCount); } + + private static bool IsTransientServerError(HttpStatusCode statusCode) => + statusCode is HttpStatusCode.BadGateway + or HttpStatusCode.ServiceUnavailable + or HttpStatusCode.GatewayTimeout; + + private async Task<(string Content, KeyValuePair>[] ResponseHeaders)> HandleServerError( + HttpMethod httpMethod, + string url, + object body, + HttpStatusCode expectedStatus, + Dictionary customHeaders, + KeyValuePair>[] headers, + int retryCount, + int serverErrorRetryCount) + { + if (serverErrorRetryCount >= _serverErrorMaxRetries) + { + throw new HttpRequestException($"Server error persisted after {_serverErrorMaxRetries} retries. Please try again later."); + } + + var delaySeconds = GetServerErrorDelay(headers, serverErrorRetryCount); + + _log.LogWarning($"Server error detected (attempt {serverErrorRetryCount + 1}/{_serverErrorMaxRetries}). Waiting {delaySeconds} seconds before retrying..."); + + await Task.Delay(delaySeconds * MILLISECONDS_PER_SECOND); + + return await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount, serverErrorRetryCount + 1); + } + + private int GetServerErrorDelay(KeyValuePair>[] headers, int serverErrorRetryCount) + { + var retryAfterHeader = ExtractHeaderValue("Retry-After", headers); + if (!string.IsNullOrEmpty(retryAfterHeader) && int.TryParse(retryAfterHeader, out var retryAfterSeconds)) + { + return retryAfterSeconds; + } + + // Exponential backoff: 5s → 10s → 20s → 40s → 80s + return _serverErrorDefaultDelay * (int)Math.Pow(2, serverErrorRetryCount); + } } diff --git a/src/OctoshiftCLI.Tests/Octoshift/RetryPolicyTests.cs b/src/OctoshiftCLI.Tests/Octoshift/RetryPolicyTests.cs new file mode 100644 index 000000000..dac894d48 --- /dev/null +++ b/src/OctoshiftCLI.Tests/Octoshift/RetryPolicyTests.cs @@ -0,0 +1,206 @@ +using System; +using System.Net; +using System.Net.Http; +using System.Threading.Tasks; +using FluentAssertions; +using Xunit; + +namespace OctoshiftCLI.Tests.Octoshift; + +public sealed class RetryPolicyTests +{ + private readonly RetryPolicy _retryPolicy; + + public RetryPolicyTests() + { + _retryPolicy = new RetryPolicy(null) + { + _retryInterval = 0, + _httpRetryInterval = 0 + }; + } + + [Theory] + [InlineData(HttpStatusCode.InternalServerError)] + [InlineData(HttpStatusCode.BadGateway)] + [InlineData(HttpStatusCode.ServiceUnavailable)] + [InlineData(HttpStatusCode.GatewayTimeout)] + public async Task Retry_Retries_On_5xx_HttpRequestException(HttpStatusCode statusCode) + { + // Arrange + var callCount = 0; + + // Act + var result = await _retryPolicy.Retry(async () => + { + callCount++; + return callCount == 1 + ? throw new HttpRequestException("server error", null, statusCode) + : await Task.FromResult("success"); + }); + + // Assert + result.Should().Be("success"); + callCount.Should().Be(2); + } + + [Theory] + [InlineData(HttpStatusCode.BadRequest)] + [InlineData(HttpStatusCode.NotFound)] + [InlineData(HttpStatusCode.Conflict)] + [InlineData(HttpStatusCode.Forbidden)] + public async Task Retry_Does_Not_Retry_On_4xx_HttpRequestException(HttpStatusCode statusCode) + { + // Arrange + var callCount = 0; + + // Act / Assert + await FluentActions + .Invoking(async () => await _retryPolicy.Retry(() => + { + callCount++; + return Task.FromException(new HttpRequestException("client error", null, statusCode)); + })) + .Should() + .ThrowAsync(); + + callCount.Should().Be(1); + } + + [Fact] + public async Task Retry_Retries_On_Null_StatusCode_HttpRequestException() + { + // Arrange - null status code represents network-level failures + var callCount = 0; + + // Act + var result = await _retryPolicy.Retry(async () => + { + callCount++; + return callCount == 1 + ? throw new HttpRequestException("network error") + : await Task.FromResult("success"); + }); + + // Assert + result.Should().Be("success"); + callCount.Should().Be(2); + } + + [Fact] + public async Task Retry_Retries_On_Non_Http_Exceptions() + { + // Arrange + var callCount = 0; + + // Act + var result = await _retryPolicy.Retry(async () => + { + callCount++; + return callCount == 1 + ? throw new TimeoutException("timed out") + : await Task.FromResult("success"); + }); + + // Assert + result.Should().Be("success"); + callCount.Should().Be(2); + } + + [Fact] + public async Task Retry_Does_Not_Retry_On_OctoshiftCliException() + { + // Arrange + var callCount = 0; + + // Act / Assert + await FluentActions + .Invoking(async () => await _retryPolicy.Retry(() => + { + callCount++; + return Task.FromException(new OctoshiftCliException("terminal error")); + })) + .Should() + .ThrowAsync(); + + callCount.Should().Be(1); + } + + [Fact] + public async Task Retry_Throws_OctoshiftCliException_On_401() + { + // Arrange + var callCount = 0; + + // Act / Assert + await FluentActions + .Invoking(async () => await _retryPolicy.Retry(() => + { + callCount++; + return Task.FromException(new HttpRequestException("unauthorized", null, HttpStatusCode.Unauthorized)); + })) + .Should() + .ThrowAsync() + .WithMessage("*Unauthorized*"); + } + + [Fact] + public async Task RetryOnAnyException_Retries_On_OctoshiftCliException() + { + // Arrange + var callCount = 0; + + // Act + var result = await _retryPolicy.RetryOnAnyException(async () => + { + callCount++; + return callCount == 1 + ? throw new OctoshiftCliException("transient error") + : await Task.FromResult("success"); + }); + + // Assert + result.Should().Be("success"); + callCount.Should().Be(2); + } + + [Fact] + public async Task RetryOnAnyException_Retries_On_HttpRequestException() + { + // Arrange + var callCount = 0; + + // Act + var result = await _retryPolicy.RetryOnAnyException(async () => + { + callCount++; + return callCount == 1 + ? throw new HttpRequestException("server error", null, HttpStatusCode.BadRequest) + : await Task.FromResult("success"); + }); + + // Assert + result.Should().Be("success"); + callCount.Should().Be(2); + } + + [Fact] + public async Task RetryOnAnyException_Retries_On_TimeoutException() + { + // Arrange + var callCount = 0; + + // Act + var result = await _retryPolicy.RetryOnAnyException(async () => + { + callCount++; + return callCount == 1 + ? throw new TimeoutException("timed out") + : await Task.FromResult("success"); + }); + + // Assert + result.Should().Be("success"); + callCount.Should().Be(2); + } +} diff --git a/src/OctoshiftCLI.Tests/Octoshift/Services/GithubClientTests.cs b/src/OctoshiftCLI.Tests/Octoshift/Services/GithubClientTests.cs index 5dbc939c6..f93353699 100644 --- a/src/OctoshiftCLI.Tests/Octoshift/Services/GithubClientTests.cs +++ b/src/OctoshiftCLI.Tests/Octoshift/Services/GithubClientTests.cs @@ -94,6 +94,148 @@ public async Task GetAsync_Retries_On_Non_Success(HttpStatusCode httpStatusCode) .ReturnsAsync(CreateHttpResponseFactory(statusCode: httpStatusCode, content: "FIRST_RESPONSE")) .ReturnsAsync(CreateHttpResponseFactory(content: EXPECTED_RESPONSE_CONTENT)); + using var httpClient = new HttpClient(handlerMock.Object); + var githubClient = new GithubClient(_mockOctoLogger.Object, httpClient, null, _retryPolicy, _dateTimeProvider.Object, PERSONAL_ACCESS_TOKEN) + { + _serverErrorDefaultDelay = 0 + }; + + // Act + var returnedContent = await githubClient.GetAsync(URL); + + // Assert + returnedContent.Should().Be(EXPECTED_RESPONSE_CONTENT); + } + + [Theory] + [InlineData(HttpStatusCode.BadGateway)] + [InlineData(HttpStatusCode.ServiceUnavailable)] + [InlineData(HttpStatusCode.GatewayTimeout)] + public async Task GetAsync_Retries_Transient_Server_Errors_With_Dedicated_Handler(HttpStatusCode httpStatusCode) + { + // Arrange + var handlerMock = new Mock(); + handlerMock + .Protected() + .SetupSequence>( + "SendAsync", + ItExpr.Is(req => req.Method == HttpMethod.Get), + ItExpr.IsAny()) + .ReturnsAsync(CreateHttpResponseFactory(statusCode: httpStatusCode, content: "SERVER_ERROR")) + .ReturnsAsync(CreateHttpResponseFactory(statusCode: httpStatusCode, content: "SERVER_ERROR")) + .ReturnsAsync(CreateHttpResponseFactory(content: EXPECTED_RESPONSE_CONTENT)); + + using var httpClient = new HttpClient(handlerMock.Object); + var githubClient = new GithubClient(_mockOctoLogger.Object, httpClient, null, _retryPolicy, _dateTimeProvider.Object, PERSONAL_ACCESS_TOKEN) + { + _serverErrorDefaultDelay = 0 + }; + + // Act + var returnedContent = await githubClient.GetAsync(URL); + + // Assert + returnedContent.Should().Be(EXPECTED_RESPONSE_CONTENT); + _mockOctoLogger.Verify(m => m.LogWarning(It.Is(s => s.Contains("Server error detected"))), Times.Exactly(2)); + } + + [Fact] + public async Task GetAsync_Throws_After_Max_Server_Error_Retries() + { + // Arrange + var handlerMock = new Mock(); + handlerMock + .Protected() + .Setup>( + "SendAsync", + ItExpr.Is(req => req.Method == HttpMethod.Get), + ItExpr.IsAny()) + .ReturnsAsync(CreateHttpResponseFactory(statusCode: HttpStatusCode.BadGateway, content: "SERVER_ERROR")); + + using var httpClient = new HttpClient(handlerMock.Object); + var githubClient = new GithubClient(_mockOctoLogger.Object, httpClient, null, _retryPolicy, _dateTimeProvider.Object, PERSONAL_ACCESS_TOKEN) + { + _serverErrorDefaultDelay = 0, + _serverErrorMaxRetries = 2 + }; + + // Act / Assert + await FluentActions + .Invoking(async () => await githubClient.GetAsync(URL)) + .Should() + .ThrowAsync() + .WithMessage("*Server error persisted after 2 retries*"); + } + + [Fact] + public async Task GetAsync_Server_Error_Respects_Retry_After_Header() + { + // Arrange + var handlerMock = new Mock(); + handlerMock + .Protected() + .SetupSequence>( + "SendAsync", + ItExpr.Is(req => req.Method == HttpMethod.Get), + ItExpr.IsAny()) + .ReturnsAsync(CreateHttpResponseFactory( + statusCode: HttpStatusCode.ServiceUnavailable, + content: "SERVER_ERROR", + headers: new[] { ("Retry-After", "0") })) + .ReturnsAsync(CreateHttpResponseFactory(content: EXPECTED_RESPONSE_CONTENT)); + + using var httpClient = new HttpClient(handlerMock.Object); + var githubClient = new GithubClient(_mockOctoLogger.Object, httpClient, null, _retryPolicy, _dateTimeProvider.Object, PERSONAL_ACCESS_TOKEN) + { + _serverErrorDefaultDelay = 0 + }; + + // Act + var returnedContent = await githubClient.GetAsync(URL); + + // Assert + returnedContent.Should().Be(EXPECTED_RESPONSE_CONTENT); + } + + [Fact] + public async Task GetAsync_Does_Not_Retry_4xx_Errors() + { + // Arrange + var handlerMock = MockHttpHandler( + req => req.Method == HttpMethod.Get, + CreateHttpResponseFactory(HttpStatusCode.BadRequest, content: "BAD_REQUEST")); + + using var httpClient = new HttpClient(handlerMock.Object); + var githubClient = new GithubClient(_mockOctoLogger.Object, httpClient, null, _retryPolicy, _dateTimeProvider.Object, PERSONAL_ACCESS_TOKEN); + + // Act / Assert + await FluentActions + .Invoking(async () => await githubClient.GetAsync(URL)) + .Should() + .ThrowExactlyAsync(); + + // Verify only one attempt was made (no retries) + handlerMock + .Protected() + .Verify("SendAsync", Times.Once(), + ItExpr.IsAny(), + ItExpr.IsAny()); + } + + [Fact] + public async Task GetAsync_500_Uses_Generic_Retry_Not_Dedicated_Handler() + { + // Arrange - 500 should NOT go through the dedicated server error handler + var handlerMock = new Mock(); + handlerMock + .Protected() + .SetupSequence>( + "SendAsync", + ItExpr.Is(req => req.Method == HttpMethod.Get), + ItExpr.IsAny()) + .ReturnsAsync(CreateHttpResponseFactory(statusCode: HttpStatusCode.InternalServerError, content: "INTERNAL_ERROR")) + .ReturnsAsync(CreateHttpResponseFactory(content: EXPECTED_RESPONSE_CONTENT)); + using var httpClient = new HttpClient(handlerMock.Object); var githubClient = new GithubClient(_mockOctoLogger.Object, httpClient, null, _retryPolicy, _dateTimeProvider.Object, PERSONAL_ACCESS_TOKEN); @@ -102,6 +244,8 @@ public async Task GetAsync_Retries_On_Non_Success(HttpStatusCode httpStatusCode) // Assert returnedContent.Should().Be(EXPECTED_RESPONSE_CONTENT); + // Should NOT log "Server error detected" since 500 goes through generic retry + _mockOctoLogger.Verify(m => m.LogWarning(It.Is(s => s.Contains("Server error detected"))), Times.Never); } [Fact] diff --git a/src/gei/Commands/MigrateRepo/MigrateRepoCommandHandler.cs b/src/gei/Commands/MigrateRepo/MigrateRepoCommandHandler.cs index 138fc0c99..b7c5445a8 100644 --- a/src/gei/Commands/MigrateRepo/MigrateRepoCommandHandler.cs +++ b/src/gei/Commands/MigrateRepo/MigrateRepoCommandHandler.cs @@ -247,7 +247,7 @@ private string ExtractGhesBaseUrl(string ghesApiUrl) bool keepArchive, bool useGithubStorage) { - var (gitArchiveUrl, metadataArchiveUrl, gitArchiveId, metadataArchiveId) = await _retryPolicy.Retry( + var (gitArchiveUrl, metadataArchiveUrl, gitArchiveId, metadataArchiveId) = await _retryPolicy.RetryOnAnyException( async () => await GenerateArchives(githubSourceOrg, sourceRepo, skipReleases, lockSourceRepo)); if (!useGithubStorage && !blobCredentialsRequired)