diff options
| author | Adam Malczewski <[email protected]> | 2026-06-25 22:15:04 +0900 |
|---|---|---|
| committer | Adam Malczewski <[email protected]> | 2026-06-25 22:15:04 +0900 |
| commit | 9b91d1bca83e7599fb7d7de6038cedf186e61764 (patch) | |
| tree | 7a6c2325e618eaf89743ef89734e7b5102744668 | |
| parent | 2cc9ddfb590dc60557bba3ed76a6c4639df5f596 (diff) | |
| download | dispatch-9b91d1bca83e7599fb7d7de6038cedf186e61764.tar.gz dispatch-9b91d1bca83e7599fb7d7de6038cedf186e61764.zip | |
fix(ssh): POST /computers/:alias/test hangs after successful SSH connect
The test endpoint's runProbe() waited for the ssh2 stream's 'close' event,
which some SSH servers never emit for short-lived exec channels (the command
'true' exits instantly). This caused the promise to hang forever — the HTTP
response never returned, and the FE's Test spinner spun indefinitely.
Three fixes:
1. runProbe now resolves on the 'exit' event (not 'close') — the command has
finished and the exit code is available. 'close' is kept as a fallback.
Stream data/stderr are drained to prevent buffer deadlocks.
2. runProbe has a 15s timeout safety net — if the exec callback or 'exit'
event never fires (e.g. server requires a pty for exec), the probe
resolves false instead of hanging forever.
3. The entire test() method is wrapped in a 30s Promise.race timeout —
even if pool.acquire() or pool.drop() hangs, the endpoint ALWAYS
responds with { ok, error? }.
The probe is fully non-interactive (no blocking prompts). tsc EXIT 0,
biome clean, 1756 tests pass.
| -rw-r--r-- | packages/ssh/src/service.ts | 89 |
1 files changed, 72 insertions, 17 deletions
diff --git a/packages/ssh/src/service.ts b/packages/ssh/src/service.ts index 629c9f8..cebe2ef 100644 --- a/packages/ssh/src/service.ts +++ b/packages/ssh/src/service.ts @@ -124,19 +124,18 @@ export function createSshService(deps: SshServiceDeps): { } // One-shot probe: acquire (connects), run a trivial command, then drop // the connection so a test never holds a pooled socket open (plan §9.1). + // Wrapped in a timeout safety net so the endpoint ALWAYS responds — + // even if the SSH connect/exec/drop hangs (the probe is non-interactive). + const PROBE_TOTAL_TIMEOUT_MS = 30_000; try { - const conn = await pool.acquire(alias); - const client = await conn.getClient(); - const ok = await runProbe(client); - if (ok) { - // Successful connect pins the host key (the accept-new analog); - // a fresh known_hosts read reflects the new pin. - deps.logger.info("computer test ok", { alias }); - } - await pool.drop(alias); - return ok - ? { alias, ok: true } - : { alias, ok: false, error: "remote command returned no exit code" }; + const result = await Promise.race<TestComputerResponse>([ + runTestProbe(pool, alias, deps.logger), + timeoutAfter<TestComputerResponse>( + PROBE_TOTAL_TIMEOUT_MS, + `test timed out after ${PROBE_TOTAL_TIMEOUT_MS / 1000}s`, + ), + ]); + return result; } catch (err: unknown) { await pool.drop(alias).catch(() => undefined); const message = err instanceof Error ? err.message : String(err); @@ -159,21 +158,77 @@ export function createSshService(deps: SshServiceDeps): { return { service, pool, remoteFactory }; } -/** Run `true` over SSH as a connectivity probe; resolve ok=true on exit 0. */ +/** + * The one-shot test probe: acquire → exec `true` → drop. Extracted so it can + * be raced against a timeout. Always drops the connection (even on success). + */ +async function runTestProbe( + pool: SshConnectionPool, + alias: string, + logger: Logger, +): Promise<TestComputerResponse> { + const conn = await pool.acquire(alias); + const client = await conn.getClient(); + const ok = await runProbe(client); + if (ok) { + logger.info("computer test ok", { alias }); + } + await pool.drop(alias); + return ok + ? { alias, ok: true } + : { alias, ok: false, error: "remote command returned no exit code" }; +} + +/** Reject with `message` after `ms`. Used to race against a hanging probe. */ +function timeoutAfter<T>(ms: number, message: string): Promise<T> { + return new Promise<T>((_resolve, reject) => { + setTimeout(() => reject(new Error(message)), ms); + }); +} + +/** Probe timeout — the `true` command exits instantly; 15s is generous. */ +const PROBE_TIMEOUT_MS = 15_000; + +/** + * Run `true` over SSH as a connectivity probe; resolve `true` on exit 0. + * + * Resolves on the `exit` event (not `close` — some SSH servers don't emit + * `close` for short-lived exec channels, causing the promise to hang forever). + * A timeout safety net ensures we ALWAYS resolve, even if `exec` callback or + * the `exit` event never fires (e.g. the server requires a pty for exec). + */ function runProbe(client: import("ssh2").Client): Promise<boolean> { return new Promise<boolean>((resolve) => { + let settled = false; + const done = (result: boolean): void => { + if (!settled) { + settled = true; + clearTimeout(timer); + resolve(result); + } + }; + + const timer = setTimeout(() => { + done(false); + }, PROBE_TIMEOUT_MS); + client.exec("true", { pty: false }, (err, stream) => { if (err !== null && err !== undefined) { - resolve(false); + done(false); return; } - let exitCode: number | null = null; + // Resolve on `exit` — the command has finished. Don't wait for + // `close` (some servers never emit it for exec channels). stream.on("exit", (code: number | null) => { - exitCode = code; + done(code === 0); }); + // Safety net: if `exit` never fires, `close` might. stream.on("close", () => { - resolve(exitCode === 0); + done(false); }); + // Drain any output so the stream doesn't deadlock. + stream.on("data", () => {}); + stream.stderr.on("data", () => {}); }); }); } |
