diff options
| author | Adam Malczewski <[email protected]> | 2026-06-25 18:09:12 +0900 |
|---|---|---|
| committer | Adam Malczewski <[email protected]> | 2026-06-25 18:09:12 +0900 |
| commit | 4c42ec9c7df067e0e0e309610b61e25752d73f9f (patch) | |
| tree | b8ad7c766dd2b0756f5015e09d1d79055446d3b8 /packages/kernel/src/contracts | |
| parent | c1bc7bfaaca7bdf4d9b2973f5dc88605217a7866 (diff) | |
| download | dispatch-4c42ec9c7df067e0e0e309610b61e25752d73f9f.tar.gz dispatch-4c42ec9c7df067e0e0e309610b61e25752d73f9f.zip | |
feat(kernel): retry-with-backoff on retryable provider errors
When the upstream LLM API returns a retryable error (HTTP 429 / 5xx
"overloaded"), the kernel now retries provider.stream() with a stepped
backoff, emitting a visible retry event before each sleep, until the 8h
cumulative-sleep budget is exhausted — then it emits the final error and seals
the turn. Retries fire only when no content has been emitted yet in this step
(safety invariant: never duplicate partial output).
- wire: new transient TurnProviderRetryEvent AgentEvent variant (emitted
before each sleep; not persisted to model history).
- kernel contracts: RetryStrategy (pure delayFor + injected sleep) + optional
retry? on RunTurnInput (omit = no retry, backward-compatible).
- kernel run-turn: retry loop in executeStep; providerRetryEvent constructor.
Kernel imports no timer (sleep injected).
- session-orchestrator: concrete schedule (5s..30m, repeat 30m, 8h budget) +
abortable setTimeout sleep, wired into RunTurnInput.retry.
tsc -b exits 0; biome clean; 1574 vitest tests pass (+16 new: 11 kernel retry
tests with injected fake sleep + pure delayFor, zero @dispatch/* mocks; 5
schedule tests). Transports are unchanged (transport-ws forwards AgentEvent
verbatim in chat.delta; transport-http serializes with a generic
JSON.stringify).
Plan: notes/retry-with-backoff-plan.md. tasks.md updated with milestone +
optional CLI-renderer roadmap follow-up.
Diffstat (limited to 'packages/kernel/src/contracts')
| -rw-r--r-- | packages/kernel/src/contracts/events.ts | 1 | ||||
| -rw-r--r-- | packages/kernel/src/contracts/index.ts | 2 | ||||
| -rw-r--r-- | packages/kernel/src/contracts/runtime.ts | 44 |
3 files changed, 47 insertions, 0 deletions
diff --git a/packages/kernel/src/contracts/events.ts b/packages/kernel/src/contracts/events.ts index 6c9652d..dca34c2 100644 --- a/packages/kernel/src/contracts/events.ts +++ b/packages/kernel/src/contracts/events.ts @@ -11,6 +11,7 @@ export type { TurnDoneEvent, TurnErrorEvent, TurnInputEvent, + TurnProviderRetryEvent, TurnReasoningDeltaEvent, TurnSealedEvent, TurnStartEvent, diff --git a/packages/kernel/src/contracts/index.ts b/packages/kernel/src/contracts/index.ts index c67607b..f3e5bca 100644 --- a/packages/kernel/src/contracts/index.ts +++ b/packages/kernel/src/contracts/index.ts @@ -40,6 +40,7 @@ export type { TurnDoneEvent, TurnErrorEvent, TurnInputEvent, + TurnProviderRetryEvent, TurnReasoningDeltaEvent, TurnSealedEvent, TurnStartEvent, @@ -109,6 +110,7 @@ export type { export type { EventEmitter, FinishReason, + RetryStrategy, RunTurnInput, RunTurnResult, } from "./runtime.js"; diff --git a/packages/kernel/src/contracts/runtime.ts b/packages/kernel/src/contracts/runtime.ts index 02fc446..8376e42 100644 --- a/packages/kernel/src/contracts/runtime.ts +++ b/packages/kernel/src/contracts/runtime.ts @@ -129,6 +129,22 @@ export interface RunTurnInput { * double-persist them. */ readonly onStepComplete?: (messages: readonly ChatMessage[]) => Promise<void> | void; + + /** + * Optional injected retry strategy for retryable provider errors (e.g. HTTP + * 429 / 5xx "overloaded"). When omitted, a retryable error ends the step + * exactly as before (backward-compatible). When provided, the runtime wraps + * `provider.stream()` consumption in a retry loop: on a retryable error + * (an emitted `error` ProviderEvent with `retryable === true`, OR a thrown + * error) — ONLY when no content was emitted yet this step (the safety + * invariant — never duplicate partial output) — it asks `retry.delayFor` + * for a delay, emits a transient `provider-retry` AgentEvent, sleeps via the + * injected `retry.sleep` (abortable), and re-calls `provider.stream()`. 
+ * + * Injected (not ambient): the kernel imports no timer and owns no schedule. + * Mirrors the `now`/`logger` injection pattern — optional + backward-compatible. + */ + readonly retry?: RetryStrategy; } /** @@ -145,3 +161,31 @@ export interface RunTurnResult { /** Why the turn ended. */ readonly finishReason: FinishReason; } + +/** + * Injected retry strategy for retryable provider errors (e.g. HTTP 429 / 5xx). + * + * The kernel provides the HOOK (this contract + the retry loop in `runTurn`); + * the shell (session-orchestrator) provides the POLICY (the concrete schedule) + * and the I/O (the actual sleep). The kernel imports no timer — `sleep` is an + * injected effect so the runtime stays pure and deterministic in tests. + * + * Retries are ONLY attempted when NO content was emitted yet this step (the + * safety invariant — never duplicate partial output). When omitted on + * `RunTurnInput`, no retry happens (backward-compatible: a retryable error ends + * the step exactly as before). + */ +export interface RetryStrategy { + /** + * Pure, deterministic decision: given the 0-based attempt index, return the + * delay in ms to sleep before the next retry, or `undefined` to stop (budget + * exhausted). No I/O, no clock — fully testable. + */ + readonly delayFor: (attempt: number) => number | undefined; + /** + * Injected effect: actually sleep for the given ms. Must honor the abort + * signal — reject when aborted so the turn seals `aborted`. The kernel + * imports no timer; the shell provides a `setTimeout`-based implementation. + */ + readonly sleep: (ms: number, signal: AbortSignal) => Promise<void>; +} |
