From dabcbc79831052effc6ce990021feee07d661f7e Mon Sep 17 00:00:00 2001 From: Adam Malczewski Date: Wed, 24 Jun 2026 14:10:03 +0900 Subject: fix(kernel+tool-shell): abort hanging tool calls without bricking the conversation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel: executeToolCall now races tool.execute against the abort signal via Promise.race; on abort resolves (not rejects) with an "Aborted" result so the step completes normally → finishReason "aborted" → turn seals cleanly (done event) → finally clears activeTurns → conversation freed, next message accepted. run-turn strips tool-call chunks from the assistant message on abort (keeps text/thinking) and omits tool-result messages to avoid persisting dangling tool calls that would 400 the provider next turn. tool-shell: realSpawn spawns detached (own process group); on abort AND timeout kills the entire group (process.kill(-pgid, SIGKILL)) and resolves immediately — no child.on("close") dependency, so a grandchild holding the pipes can't stall the spawn promise or leak. Also: ORCHESTRATOR.md migrated to dispatch CLI summon mechanism; .skills summary; bin/sync-env PATH injection; frontend handoff docs. 1453 vitest pass · tsc -b EXIT 0 · biome clean. --- packages/tool-shell/src/spawn.ts | 71 +++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 15 deletions(-) (limited to 'packages/tool-shell/src/spawn.ts') diff --git a/packages/tool-shell/src/spawn.ts b/packages/tool-shell/src/spawn.ts index 9025c26..9b1d7e4 100644 --- a/packages/tool-shell/src/spawn.ts +++ b/packages/tool-shell/src/spawn.ts @@ -3,24 +3,66 @@ import type { SpawnResult, SpawnShell } from "./shell.js"; export const realSpawn: SpawnShell = (params): Promise => { return new Promise((resolve) => { + // detached: true puts the child in its own process group (pgid = child.pid). + // This lets us kill the entire group (child + any grandchildren that inherit + // the pipes) via process.kill(-pgid, "SIGKILL") on abort/timeout, so a + // backgrounded grandchild can't keep the stdio pipes open and stall the + // promise on child.on("close"). const child = nodeSpawn("sh", ["-c", params.command], { cwd: params.cwd, stdio: ["ignore", "pipe", "pipe"], + detached: true, }); + let settled = false; let timedOut = false; - let killed = false; - const timer = setTimeout(() => { - timedOut = true; - child.kill("SIGKILL"); - }, params.timeout); + let timer: ReturnType | undefined; + + /** Kill the entire child process group (best-effort — group may be gone). */ + const killGroup = () => { + if (child.pid !== undefined) { + try { + process.kill(-child.pid, "SIGKILL"); + } catch { + // Process group may already be gone — ignore. + } + } + }; + + /** Remove the abort listener and clear the timeout timer (no leaks). */ + const cleanup = () => { + if (timer !== undefined) { + clearTimeout(timer); + timer = undefined; + } + params.signal.removeEventListener("abort", onAbort); + }; + + /** Resolve once, then clean up so listeners/timers never leak. */ + const settle = (result: SpawnResult) => { + if (settled) return; + settled = true; + cleanup(); + resolve(result); + }; const onAbort = () => { - killed = true; - child.kill("SIGKILL"); + if (settled) return; + killGroup(); + // Resolve immediately — do NOT wait for child.on("close"), which may + // never fire if a grandchild holds the pipes open. + settle({ exitCode: null, timedOut: false, aborted: true }); }; params.signal.addEventListener("abort", onAbort, { once: true }); + timer = setTimeout(() => { + if (settled) return; + timedOut = true; + killGroup(); + // Resolve immediately — same reasoning as abort. + settle({ exitCode: null, timedOut: true, aborted: false }); + }, params.timeout); + child.stdout.on("data", (chunk: Buffer) => { params.onOutput(chunk.toString(), "stdout"); }); @@ -29,18 +71,17 @@ export const realSpawn: SpawnShell = (params): Promise => { params.onOutput(chunk.toString(), "stderr"); }); + // Normal-completion path: wait for "close" so all stdout/stderr is captured. + // If abort/timeout already settled, this is a no-op (settled === true). child.on("close", (code) => { - clearTimeout(timer); - params.signal.removeEventListener("abort", onAbort); - resolve({ exitCode: code, timedOut }); + settle({ exitCode: code, timedOut, aborted: false }); }); + // Spawn error (e.g. bad cwd, sh not found). Kill the group just in case + // and resolve — never leave the promise pending. child.on("error", () => { - clearTimeout(timer); - params.signal.removeEventListener("abort", onAbort); - if (!killed && !timedOut) { - resolve({ exitCode: 1, timedOut: false }); - } + killGroup(); + settle({ exitCode: 1, timedOut: false, aborted: false }); }); }); }; -- cgit v1.2.3