Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
da451a8
added the retry logic
Apr 20, 2026
1fc3df5
Merge branch 'main' into stevosyan/add-retry-to-complete-calls
Apr 27, 2026
4fc5200
Update src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
sophiatev Apr 27, 2026
61f8a21
Add tests for ExecuteWithRetryAsync retry logic
Copilot Apr 27, 2026
32cc282
removed redundant logs
Apr 27, 2026
d640ac6
fixed line endings
Apr 27, 2026
b18d970
Apply suggestion from @Copilot
sophiatev Apr 27, 2026
5a38c5f
returned the completion logs
Apr 27, 2026
9a57bf8
Merge branch 'stevosyan/add-retry-to-complete-calls' of https://githu…
Apr 27, 2026
614893e
Potential fix for pull request finding 'Local scope variable shadows …
sophiatev Apr 27, 2026
6826505
Update src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
sophiatev Apr 27, 2026
28b2839
fixed the line endings
Apr 27, 2026
55535e3
Add max-attempts exhaustion test for ExecuteWithRetryAsync
Copilot Apr 27, 2026
ba6c0c0
Assert status code in TransientGrpcRetry log test
Copilot Apr 27, 2026
e609227
simplified method extraction
Apr 27, 2026
9625812
Add status code assertion to MultipleTransientErrors log test
Copilot Apr 27, 2026
8ec4113
refactored so the retry also uses the shared backoff class
Apr 27, 2026
5d1b8e6
Merge branch 'stevosyan/add-retry-to-complete-calls' of https://githu…
Apr 27, 2026
08922fb
Trying to fix line endings
Apr 27, 2026
aa19300
reverting some unnecessary changes
Apr 27, 2026
3df94d7
missed the log changes
Apr 27, 2026
155b5ed
Potential fix for pull request finding 'Missed ternary opportunity'
sophiatev Apr 27, 2026
05a0958
Update src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs
sophiatev Apr 27, 2026
fc5c1ff
Fix thread-safety: create Random per ExecuteWithRetryAsync call, remo…
Copilot Apr 27, 2026
89957bb
fixed attempt logic
Apr 27, 2026
35051fa
fixing line endings
Apr 27, 2026
6d11687
reverting some more unnecessary changes
Apr 27, 2026
de9c357
fixing the failing tests
Apr 27, 2026
2eda8a9
fixed the log tests
Apr 27, 2026
d16cdc3
fixing the failing tests
Apr 27, 2026
1379dd4
test: add integration-level retry tests to RunBackgroundTaskLoggingTe…
Copilot Apr 27, 2026
cf46ceb
fixed the failing test
Apr 27, 2026
ed926d0
fixed another failing max attempt test
Apr 27, 2026
33700f2
test: add Non_Transient_Abandon_Orchestrator_Error_Is_Not_Retried int…
Copilot Apr 27, 2026
b3e93e3
updated the tests slightly
Apr 27, 2026
1142c65
removed try-catch, updated tests
Apr 27, 2026
c3c9571
remove json change
Apr 27, 2026
f9052dd
fix line endings, add a few more logs
Apr 28, 2026
81801a3
change order of a log and reconnect attempt
Apr 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
namespace Microsoft.DurableTask.Worker.Grpc;

/// <summary>
/// Helpers for computing reconnect backoff delays in the gRPC worker.
/// Helpers for computing reconnect and retry backoff delays in the gRPC worker.
/// </summary>
static class ReconnectBackoff
static class GrpcBackoff
{
/// <summary>
/// Creates a random source for reconnect jitter using an explicit random seed so multiple workers on
Expand All @@ -32,10 +32,11 @@ public static Random CreateRandom()
/// <param name="baseDelay">The base delay used for the exponential growth.</param>
/// <param name="cap">The maximum delay before jitter is applied.</param>
/// <param name="random">The random source used for jitter.</param>
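/// <param name="fullJitter">If true, applies full jitter: the delay is chosen uniformly between zero and the capped exponential value. If false, the delay is the capped exponential value plus up to 20% of additional random jitter, so it may exceed <paramref name="cap"/>.</param>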
/// <returns>The computed jittered delay.</returns>
public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random)
public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random, bool fullJitter)
{
if (baseDelay <= TimeSpan.Zero)
if (baseDelay <= TimeSpan.Zero || cap <= TimeSpan.Zero)
{
return TimeSpan.Zero;
}
Expand All @@ -48,13 +49,13 @@ public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Ra
// Cap the exponent to avoid overflow in 2^attempt for pathological attempt values.
int safeAttempt = Math.Min(attempt, 30);

double capMs = Math.Max(0, cap.TotalMilliseconds);
double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt);
double upperBoundMs = Math.Min(capMs, exponentialMs);
double upperBoundMs = Math.Min(cap.TotalMilliseconds, exponentialMs);

double jitteredMs = fullJitter
? random.NextDouble() * upperBoundMs
: upperBoundMs + (random.NextDouble() * (upperBoundMs * .2));

// Full jitter intentionally allows any value in the retry window. The wide spread keeps many
// workers that saw the same outage from reconnecting in lockstep against the backend.
double jitteredMs = random.NextDouble() * upperBoundMs;
return TimeSpan.FromMilliseconds(jitteredMs);
}
}
257 changes: 163 additions & 94 deletions src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,29 @@ internal class InternalOptions
/// </summary>
public TimeSpan ReconnectBackoffCap { get; set; } = TimeSpan.FromSeconds(30);

/// <summary>
/// Gets or sets the maximum number of attempts the worker will make when retrying a transient
/// gRPC call (such as completing or abandoning a work item). Once this many attempts have failed,
/// the most recent exception is rethrown. Defaults to 10.
/// </summary>
/// <remarks>
/// NOTE(review): the retry loop that consumes this value lives in the worker processor; confirm
/// whether the count includes the initial call and how values less than 1 are treated.
/// </remarks>
public int TransientRetryMaxAttempts { get; set; } = 10;

/// <summary>
/// Gets or sets the initial delay used when computing exponential backoff between retries of a
/// transient gRPC call. The delay doubles after each failed attempt, and the exponential component
/// is capped at <see cref="TransientRetryBackoffCap"/> before jitter is applied. In the default
/// biased-jitter mode, the final delay may therefore slightly exceed
/// <see cref="TransientRetryBackoffCap"/>. Defaults to 200 ms.
/// </summary>
/// <remarks>
/// When the delay is produced by <c>GrpcBackoff.Compute</c> with <c>fullJitter</c> set to false, the
/// jitter adds less than 20% on top of the capped exponential value. A base delay that is zero or
/// negative makes <c>GrpcBackoff.Compute</c> return <see cref="TimeSpan.Zero"/>, i.e. no wait
/// between attempts.
/// </remarks>
public TimeSpan TransientRetryBackoffBase { get; set; } = TimeSpan.FromMilliseconds(200);

/// <summary>
/// Gets or sets the cap applied to the exponential backoff component between retries of a transient
/// gRPC call before jitter is applied. In the default biased-jitter mode, the final computed delay
/// may be slightly greater than this value. Defaults to 15 seconds.
/// </summary>
/// <remarks>
/// When the delay is produced by <c>GrpcBackoff.Compute</c> with <c>fullJitter</c> set to false, the
/// result is the capped exponential value plus less than 20% of additional jitter, so with the
/// default cap the delay stays below 18 seconds. A cap that is zero or negative makes
/// <c>GrpcBackoff.Compute</c> return <see cref="TimeSpan.Zero"/>, i.e. no wait between attempts.
/// </remarks>
public TimeSpan TransientRetryBackoffCap { get; set; } = TimeSpan.FromSeconds(15);

/// <summary>
/// Gets or sets an optional callback invoked when the worker requests a fresh gRPC channel after
/// repeated connect failures. The callback receives the previously-used channel and should return
Expand Down
5 changes: 4 additions & 1 deletion src/Worker/Grpc/Logs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ static partial class Logs
public static partial void ReceivedHealthPing(this ILogger logger);

[LoggerMessage(EventId = 76, Level = LogLevel.Information, Message = "Work-item stream ended by the backend (graceful close). Will reconnect.")]
public static partial void StreamEndedByPeer(this ILogger logger);
public static partial void StreamEndedByPeer(this ILogger logger);

// Warning emitted for a transient gRPC failure that is about to be retried. The status code is
// logged as its integer value. The trailing Exception parameter has no placeholder in the message
// template; the LoggerMessage source generator attaches it to the log entry as the exception.
[LoggerMessage(EventId = 77, Level = LogLevel.Warning, Message = "Transient gRPC error for '{OperationName}'. Attempt {Attempt} of {MaxAttempts}. Retrying in {BackoffMs} ms. StatusCode={StatusCode}")]
public static partial void TransientGrpcRetry(this ILogger logger, string operationName, int attempt, int maxAttempts, double backoffMs, int statusCode, Exception exception);
Comment thread
sophiatev marked this conversation as resolved.
Comment thread
sophiatev marked this conversation as resolved.
}
}
Loading
Loading