From 4c42ec9c7df067e0e0e309610b61e25752d73f9f Mon Sep 17 00:00:00 2001 From: Adam Malczewski Date: Thu, 25 Jun 2026 18:09:12 +0900 Subject: feat(kernel): retry-with-backoff on retryable provider errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the upstream LLM API returns a retryable error (HTTP 429 / 5xx "overloaded"), the kernel now retries provider.stream() with a stepped backoff, visibly, until the 8h cumulative-sleep budget is exhausted — then emits the final error and seals the turn. Retries fire only when no content was emitted yet this step (safety invariant: never duplicate partial output). - wire: new transient TurnProviderRetryEvent AgentEvent variant (emitted before each sleep; not persisted to model history). - kernel contracts: RetryStrategy (pure delayFor + injected sleep) + optional retry? on RunTurnInput (omit = no retry, backward-compatible). - kernel run-turn: retry loop in executeStep; providerRetryEvent constructor. Kernel imports no timer (sleep injected). - session-orchestrator: concrete schedule (5s..30m, repeat 30m, 8h budget) + abortable setTimeout sleep, wired into RunTurnInput.retry. tsc -b EXIT 0; biome clean; 1574 vitest pass (+16 new: 11 kernel retry tests with injected fake sleep + pure delayFor, zero @dispatch/* mocks; 5 schedule tests). Transports unchanged (transport-ws forwards AgentEvent verbatim in chat.delta; transport-http is generic JSON.stringify). Plan: notes/retry-with-backoff-plan.md. tasks.md updated with milestone + optional CLI-renderer roadmap follow-up. --- packages/wire/src/index.ts | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'packages/wire/src') diff --git a/packages/wire/src/index.ts b/packages/wire/src/index.ts index bade977..eecd2f7 100644 --- a/packages/wire/src/index.ts +++ b/packages/wire/src/index.ts @@ -273,6 +273,7 @@ export type AgentEvent = | TurnUsageEvent | TurnStepCompleteEvent | TurnErrorEvent + | TurnProviderRetryEvent | TurnDoneEvent | TurnSealedEvent | TurnSteeringEvent; @@ -429,6 +430,31 @@ export interface TurnErrorEvent { readonly code?: string; } +/** + * A retryable provider error is being retried with backoff. Emitted once per + * scheduled retry, BEFORE the sleep, so the UI can show "⚠ Server overloaded — + * retrying in 5s…" immediately. TRANSIENT: emitted to the frontend but NOT + * persisted into the model's message history (it never pollutes the prompt). + * + * When the retry budget is exhausted, the existing `error` event is emitted and + * the turn seals — so the final failure is still a persisted error. `attempt` is + * 0-based (the Nth retry about to happen); `delayMs` is the scheduled sleep + * before that retry fires. + */ +export interface TurnProviderRetryEvent { + readonly type: "provider-retry"; + readonly conversationId: string; + readonly turnId: string; + /** 0-based: this is the Nth retry about to happen. */ + readonly attempt: number; + /** ms the client should expect to wait before the retry fires. */ + readonly delayMs: number; + /** The endpoint's error verbatim (e.g. "HTTP 429: {…overloaded_error…}"). */ + readonly message: string; + /** The HTTP code when known (e.g. "429"). */ + readonly code?: string; +} + /** The turn has completed (model finished generating). */ export interface TurnDoneEvent { readonly type: "done"; -- cgit v1.2.3