From 9b91d1bca83e7599fb7d7de6038cedf186e61764 Mon Sep 17 00:00:00 2001 From: Adam Malczewski Date: Thu, 25 Jun 2026 22:15:04 +0900 Subject: fix(ssh): POST /computers/:alias/test hangs after successful SSH connect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test endpoint's runProbe() waited for the ssh2 stream's 'close' event, which some SSH servers never emit for short-lived exec channels (the command 'true' exits instantly). This caused the promise to hang forever — the HTTP response never returned, and the FE's Test spinner spun indefinitely. Three fixes: 1. runProbe now resolves on the 'exit' event (not 'close') — the command has finished and the exit code is available. 'close' is kept as a fallback. Stream data/stderr are drained to prevent buffer deadlocks. 2. runProbe has a 15s timeout safety net — if the exec callback or 'exit' event never fires (e.g. server requires a pty for exec), the probe resolves false instead of hanging forever. 3. The entire test() method is wrapped in a 30s Promise.race timeout — even if pool.acquire() or pool.drop() hangs, the endpoint ALWAYS responds with { ok, error? }. The probe is fully non-interactive (no blocking prompts). tsc EXIT 0, biome clean, 1756 tests pass. --- packages/ssh/src/service.ts | 89 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 17 deletions(-) diff --git a/packages/ssh/src/service.ts b/packages/ssh/src/service.ts index 629c9f8..cebe2ef 100644 --- a/packages/ssh/src/service.ts +++ b/packages/ssh/src/service.ts @@ -124,19 +124,18 @@ export function createSshService(deps: SshServiceDeps): { } // One-shot probe: acquire (connects), run a trivial command, then drop // the connection so a test never holds a pooled socket open (plan §9.1). + // Wrapped in a timeout safety net so the endpoint ALWAYS responds — + // even if the SSH connect/exec/drop hangs (the probe is non-interactive). + const PROBE_TOTAL_TIMEOUT_MS = 30_000; try { - const conn = await pool.acquire(alias); - const client = await conn.getClient(); - const ok = await runProbe(client); - if (ok) { - // Successful connect pins the host key (the accept-new analog); - // a fresh known_hosts read reflects the new pin. - deps.logger.info("computer test ok", { alias }); - } - await pool.drop(alias); - return ok - ? { alias, ok: true } - : { alias, ok: false, error: "remote command returned no exit code" }; + const result = await Promise.race([ + runTestProbe(pool, alias, deps.logger), + timeoutAfter( + PROBE_TOTAL_TIMEOUT_MS, + `test timed out after ${PROBE_TOTAL_TIMEOUT_MS / 1000}s`, + ), + ]); + return result; } catch (err: unknown) { await pool.drop(alias).catch(() => undefined); const message = err instanceof Error ? err.message : String(err); @@ -159,21 +158,77 @@ export function createSshService(deps: SshServiceDeps): { return { service, pool, remoteFactory }; } -/** Run `true` over SSH as a connectivity probe; resolve ok=true on exit 0. */ +/** + * The one-shot test probe: acquire → exec `true` → drop. Extracted so it can + * be raced against a timeout. Always drops the connection (even on success). + */ +async function runTestProbe( + pool: SshConnectionPool, + alias: string, + logger: Logger, +): Promise { + const conn = await pool.acquire(alias); + const client = await conn.getClient(); + const ok = await runProbe(client); + if (ok) { + logger.info("computer test ok", { alias }); + } + await pool.drop(alias); + return ok + ? { alias, ok: true } + : { alias, ok: false, error: "remote command returned no exit code" }; +} + +/** Reject with `message` after `ms`. Used to race against a hanging probe. */ +function timeoutAfter(ms: number, message: string): Promise { + return new Promise((_resolve, reject) => { + setTimeout(() => reject(new Error(message)), ms); + }); +} + +/** Probe timeout — the `true` command exits instantly; 15s is generous. */ +const PROBE_TIMEOUT_MS = 15_000; + +/** + * Run `true` over SSH as a connectivity probe; resolve `true` on exit 0. + * + * Resolves on the `exit` event (not `close` — some SSH servers don't emit + * `close` for short-lived exec channels, causing the promise to hang forever). + * A timeout safety net ensures we ALWAYS resolve, even if `exec` callback or + * the `exit` event never fires (e.g. the server requires a pty for exec). + */ function runProbe(client: import("ssh2").Client): Promise { return new Promise((resolve) => { + let settled = false; + const done = (result: boolean): void => { + if (!settled) { + settled = true; + clearTimeout(timer); + resolve(result); + } + }; + + const timer = setTimeout(() => { + done(false); + }, PROBE_TIMEOUT_MS); + client.exec("true", { pty: false }, (err, stream) => { if (err !== null && err !== undefined) { - resolve(false); + done(false); return; } - let exitCode: number | null = null; + // Resolve on `exit` — the command has finished. Don't wait for + // `close` (some servers never emit it for exec channels). stream.on("exit", (code: number | null) => { - exitCode = code; + done(code === 0); }); + // Safety net: if `exit` never fires, `close` might. stream.on("close", () => { - resolve(exitCode === 0); + done(false); }); + // Drain any output so the stream doesn't deadlock. + stream.on("data", () => {}); + stream.stderr.on("data", () => {}); }); }); } -- cgit v1.2.3