Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASENOTES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- Improved retry resilience for transient 5xx server errors (502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout) from the GitHub API, with dedicated exponential backoff and Retry-After header support. Non-transient 4xx client errors are no longer needlessly retried.
35 changes: 34 additions & 1 deletion src/Octoshift/RetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ public async Task<PolicyResult<T>> RetryOnResult<T>(Func<Task<T>> func, Func<T,

public async Task<T> Retry<T>(Func<Task<T>> func) => await CreateRetryPolicyForException<Exception>().ExecuteAsync(func);

public async Task RetryOnAnyException(Func<Task> func) => await CreateRetryPolicyForAllExceptions().ExecuteAsync(func);

public async Task<T> RetryOnAnyException<T>(Func<Task<T>> func) => await CreateRetryPolicyForAllExceptions().ExecuteAsync(func);

private AsyncRetryPolicy CreateRetryPolicyForException<TException>() where TException : Exception => Policy
.Handle<TException>()
.Handle<TException>(ex => IsRetryableException(ex))
.WaitAndRetryAsync(5, retry => retry * TimeSpan.FromMilliseconds(_retryInterval), (Exception ex, TimeSpan ts, Context ctx) =>
{
if (ex is HttpRequestException httpEx)
Expand All @@ -83,6 +87,35 @@ private AsyncRetryPolicy CreateRetryPolicyForException<TException>() where TExce
_log?.LogVerbose("Retrying...");
});

private AsyncRetryPolicy CreateRetryPolicyForAllExceptions() => Policy
.Handle<Exception>()
.WaitAndRetryAsync(5, retry => retry * TimeSpan.FromMilliseconds(_retryInterval), (Exception ex, TimeSpan ts, Context ctx) =>
{
_log?.LogVerbose(ex.ToString());
_log?.LogVerbose("Retrying...");
});

private static bool IsRetryableException(Exception ex)
{
if (ex is OctoshiftCliException)
{
return false;
}

if (ex is not HttpRequestException httpEx)
{
return true;
}

if (httpEx.StatusCode == System.Net.HttpStatusCode.Unauthorized)
{
return true; // Let it through so the retry handler can throw OctoshiftCliException
}

// Retry on server errors (5xx) and network-level failures (null status code)
return httpEx.StatusCode is null || (int)httpEx.StatusCode >= 500;
}

private void ThrowUnauthorizedException(HttpRequestException ex)
{
var serviceContext = _serviceName is not null ? $" ({_serviceName})" : "";
Expand Down
67 changes: 60 additions & 7 deletions src/Octoshift/Services/GithubClient.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,13 @@ public class GithubClient
private const int MILLISECONDS_PER_SECOND = 1000;
private const int SECONDARY_RATE_LIMIT_MAX_RETRIES = 3;
private const int SECONDARY_RATE_LIMIT_DEFAULT_DELAY = 60; // 60 seconds default delay
private const int SERVER_ERROR_MAX_RETRIES = 5;
private const int SERVER_ERROR_DEFAULT_DELAY = 5; // 5 seconds default delay

internal int _secondaryRateLimitMaxRetries = SECONDARY_RATE_LIMIT_MAX_RETRIES; // Exposed for testing purposes
internal int _secondaryRateLimitDefaultDelay = SECONDARY_RATE_LIMIT_DEFAULT_DELAY; // Exposed for testing purposes
internal int _serverErrorMaxRetries = SERVER_ERROR_MAX_RETRIES; // Exposed for testing purposes
internal int _serverErrorDefaultDelay = SERVER_ERROR_DEFAULT_DELAY; // Exposed for testing purposes

public GithubClient(OctoLogger log, HttpClient httpClient, IVersionProvider versionProvider, RetryPolicy retryPolicy, DateTimeProvider dateTimeProvider, string personalAccessToken)
{
Expand Down Expand Up @@ -202,7 +206,8 @@ public virtual async Task<string> PatchAsync(string url, object body, Dictionary
object body = null,
HttpStatusCode expectedStatus = HttpStatusCode.OK,
Dictionary<string, string> customHeaders = null,
int retryCount = 0)
int retryCount = 0,
int serverErrorRetryCount = 0)
{
await ApplyRetryDelayAsync();
_log.LogVerbose($"HTTP {httpMethod}: {url}");
Expand Down Expand Up @@ -245,12 +250,18 @@ public virtual async Task<string> PatchAsync(string url, object body, Dictionary
// Check for secondary rate limits before handling primary rate limits
if (IsSecondaryRateLimit(response.StatusCode, content))
{
return await HandleSecondaryRateLimit(httpMethod, url, body, expectedStatus, customHeaders, headers, retryCount);
return await HandleSecondaryRateLimit(httpMethod, url, body, expectedStatus, customHeaders, headers, retryCount, serverErrorRetryCount);
}

// Check for transient server errors (502, 503, 504)
if (IsTransientServerError(response.StatusCode))
{
return await HandleServerError(httpMethod, url, body, expectedStatus, customHeaders, headers, retryCount, serverErrorRetryCount);
}

if (response.StatusCode == HttpStatusCode.Forbidden && _retryDelay > 0)
{
(content, headers) = await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount);
(content, headers) = await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount, serverErrorRetryCount);
}
else if (expectedStatus == HttpStatusCode.OK)
{
Expand Down Expand Up @@ -375,20 +386,21 @@ private bool IsGraphQLSecondaryRateLimit(string content)
HttpStatusCode expectedStatus,
Dictionary<string, string> customHeaders,
KeyValuePair<string, IEnumerable<string>>[] headers,
int retryCount = 0)
int retryCount = 0,
int serverErrorRetryCount = 0)
{
if (retryCount >= _secondaryRateLimitMaxRetries)
{
throw new OctoshiftCliException($"Secondary rate limit exceeded. Maximum retries ({SECONDARY_RATE_LIMIT_MAX_RETRIES}) reached. Please wait before retrying your request.");
throw new OctoshiftCliException($"Secondary rate limit exceeded. Maximum retries ({_secondaryRateLimitMaxRetries}) reached. Please wait before retrying your request.");
}

var delaySeconds = GetSecondaryRateLimitDelay(headers, retryCount);

_log.LogWarning($"Secondary rate limit detected (attempt {retryCount + 1}/{SECONDARY_RATE_LIMIT_MAX_RETRIES}). Waiting {delaySeconds} seconds before retrying...");
_log.LogWarning($"Secondary rate limit detected (attempt {retryCount + 1}/{_secondaryRateLimitMaxRetries}). Waiting {delaySeconds} seconds before retrying...");

await Task.Delay(delaySeconds * MILLISECONDS_PER_SECOND);

return await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount + 1);
return await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount + 1, serverErrorRetryCount);
}

private int GetSecondaryRateLimitDelay(KeyValuePair<string, IEnumerable<string>>[] headers, int retryCount)
Expand Down Expand Up @@ -417,4 +429,45 @@ private int GetSecondaryRateLimitDelay(KeyValuePair<string, IEnumerable<string>>
// Otherwise use exponential backoff: 1m → 2m → 4m
return _secondaryRateLimitDefaultDelay * (int)Math.Pow(2, retryCount);
}

private static bool IsTransientServerError(HttpStatusCode statusCode) =>
statusCode is HttpStatusCode.BadGateway
or HttpStatusCode.ServiceUnavailable
or HttpStatusCode.GatewayTimeout;

private async Task<(string Content, KeyValuePair<string, IEnumerable<string>>[] ResponseHeaders)> HandleServerError(
HttpMethod httpMethod,
string url,
object body,
HttpStatusCode expectedStatus,
Dictionary<string, string> customHeaders,
KeyValuePair<string, IEnumerable<string>>[] headers,
int retryCount,
int serverErrorRetryCount)
{
if (serverErrorRetryCount >= _serverErrorMaxRetries)
{
throw new HttpRequestException($"Server error persisted after {_serverErrorMaxRetries} retries. Please try again later.");
}

var delaySeconds = GetServerErrorDelay(headers, serverErrorRetryCount);

_log.LogWarning($"Server error detected (attempt {serverErrorRetryCount + 1}/{_serverErrorMaxRetries}). Waiting {delaySeconds} seconds before retrying...");

await Task.Delay(delaySeconds * MILLISECONDS_PER_SECOND);

return await SendAsync(httpMethod, url, body, expectedStatus, customHeaders, retryCount, serverErrorRetryCount + 1);
}

private int GetServerErrorDelay(KeyValuePair<string, IEnumerable<string>>[] headers, int serverErrorRetryCount)
{
var retryAfterHeader = ExtractHeaderValue("Retry-After", headers);
if (!string.IsNullOrEmpty(retryAfterHeader) && int.TryParse(retryAfterHeader, out var retryAfterSeconds))
{
return retryAfterSeconds;
}

// Exponential backoff: 5s → 10s → 20s → 40s → 80s
return _serverErrorDefaultDelay * (int)Math.Pow(2, serverErrorRetryCount);
}
}
206 changes: 206 additions & 0 deletions src/OctoshiftCLI.Tests/Octoshift/RetryPolicyTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
using System;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;
using FluentAssertions;
using Xunit;

namespace OctoshiftCLI.Tests.Octoshift;

public sealed class RetryPolicyTests
{
private readonly RetryPolicy _retryPolicy;

public RetryPolicyTests()
{
_retryPolicy = new RetryPolicy(null)
{
_retryInterval = 0,
_httpRetryInterval = 0
};
}

[Theory]
[InlineData(HttpStatusCode.InternalServerError)]
[InlineData(HttpStatusCode.BadGateway)]
[InlineData(HttpStatusCode.ServiceUnavailable)]
[InlineData(HttpStatusCode.GatewayTimeout)]
public async Task Retry_Retries_On_5xx_HttpRequestException(HttpStatusCode statusCode)
{
// Arrange
var callCount = 0;

// Act
var result = await _retryPolicy.Retry(async () =>
{
callCount++;
return callCount == 1
? throw new HttpRequestException("server error", null, statusCode)
: await Task.FromResult("success");
});

// Assert
result.Should().Be("success");
callCount.Should().Be(2);
}

[Theory]
[InlineData(HttpStatusCode.BadRequest)]
[InlineData(HttpStatusCode.NotFound)]
[InlineData(HttpStatusCode.Conflict)]
[InlineData(HttpStatusCode.Forbidden)]
public async Task Retry_Does_Not_Retry_On_4xx_HttpRequestException(HttpStatusCode statusCode)
{
// Arrange
var callCount = 0;

// Act / Assert
await FluentActions
.Invoking(async () => await _retryPolicy.Retry<string>(() =>
{
callCount++;
return Task.FromException<string>(new HttpRequestException("client error", null, statusCode));
}))
.Should()
.ThrowAsync<HttpRequestException>();

callCount.Should().Be(1);
}

[Fact]
public async Task Retry_Retries_On_Null_StatusCode_HttpRequestException()
{
// Arrange - null status code represents network-level failures
var callCount = 0;

// Act
var result = await _retryPolicy.Retry(async () =>
{
callCount++;
return callCount == 1
? throw new HttpRequestException("network error")
: await Task.FromResult("success");
});

// Assert
result.Should().Be("success");
callCount.Should().Be(2);
}

[Fact]
public async Task Retry_Retries_On_Non_Http_Exceptions()
{
// Arrange
var callCount = 0;

// Act
var result = await _retryPolicy.Retry(async () =>
{
callCount++;
return callCount == 1
? throw new TimeoutException("timed out")
: await Task.FromResult("success");
});

// Assert
result.Should().Be("success");
callCount.Should().Be(2);
}

[Fact]
public async Task Retry_Does_Not_Retry_On_OctoshiftCliException()
{
// Arrange
var callCount = 0;

// Act / Assert
await FluentActions
.Invoking(async () => await _retryPolicy.Retry<string>(() =>
{
callCount++;
return Task.FromException<string>(new OctoshiftCliException("terminal error"));
}))
.Should()
.ThrowAsync<OctoshiftCliException>();

callCount.Should().Be(1);
}

[Fact]
public async Task Retry_Throws_OctoshiftCliException_On_401()
{
// Arrange
var callCount = 0;

// Act / Assert
await FluentActions
.Invoking(async () => await _retryPolicy.Retry<string>(() =>
{
callCount++;
return Task.FromException<string>(new HttpRequestException("unauthorized", null, HttpStatusCode.Unauthorized));
}))
.Should()
.ThrowAsync<OctoshiftCliException>()
.WithMessage("*Unauthorized*");
}

[Fact]
public async Task RetryOnAnyException_Retries_On_OctoshiftCliException()
{
// Arrange
var callCount = 0;

// Act
var result = await _retryPolicy.RetryOnAnyException(async () =>
{
callCount++;
return callCount == 1
? throw new OctoshiftCliException("transient error")
: await Task.FromResult("success");
});

// Assert
result.Should().Be("success");
callCount.Should().Be(2);
}

[Fact]
public async Task RetryOnAnyException_Retries_On_HttpRequestException()
{
// Arrange
var callCount = 0;

// Act
var result = await _retryPolicy.RetryOnAnyException(async () =>
{
callCount++;
return callCount == 1
? throw new HttpRequestException("server error", null, HttpStatusCode.BadRequest)
: await Task.FromResult("success");
});

// Assert
result.Should().Be("success");
callCount.Should().Be(2);
}

[Fact]
public async Task RetryOnAnyException_Retries_On_TimeoutException()
{
// Arrange
var callCount = 0;

// Act
var result = await _retryPolicy.RetryOnAnyException(async () =>
{
callCount++;
return callCount == 1
? throw new TimeoutException("timed out")
: await Task.FromResult("success");
});

// Assert
result.Should().Be("success");
callCount.Should().Be(2);
}
}
Loading
Loading