diff options
| author | Adam Malczewski <[email protected]> | 2026-06-27 20:06:47 +0900 |
|---|---|---|
| committer | Adam Malczewski <[email protected]> | 2026-06-27 20:06:47 +0900 |
| commit | 2e741d1c1ac309327aff4fed0e248bc5baa342d4 (patch) | |
| tree | fb84708f55e07572e4c69447884365d9b457755c | |
| parent | 2c91dc63802a386b1612ea0ed8c1e96b6f4421db (diff) | |
| download | dispatch-2e741d1c1ac309327aff4fed0e248bc5baa342d4.tar.gz dispatch-2e741d1c1ac309327aff4fed0e248bc5baa342d4.zip | |
feat(vision): store images in tmp dir instead of SQLite — compact URLs + purge on compaction/close
| -rw-r--r-- | packages/session-orchestrator/src/orchestrator.ts | 30 | ||||
| -rw-r--r-- | packages/transport-http/src/app.ts | 31 | ||||
| -rw-r--r-- | packages/vision-handoff/src/extension.ts | 88 | ||||
| -rw-r--r-- | packages/vision-handoff/src/service.ts | 135 |
4 files changed, 279 insertions, 5 deletions
diff --git a/packages/session-orchestrator/src/orchestrator.ts b/packages/session-orchestrator/src/orchestrator.ts index c0493f3..045b88d 100644 --- a/packages/session-orchestrator/src/orchestrator.ts +++ b/packages/session-orchestrator/src/orchestrator.ts @@ -49,6 +49,20 @@ import type { ToolAssembly } from "./tools-filter.js"; * call `consult_vision`) and the images are registered for tool access. */ export interface VisionHandoffService { + /** + * Store images to tmp files and return compact URLs. Each input image's data + * URL is saved to a tmp file and replaced with a compact HTTP path so the + * persisted conversation store holds a tiny string, not megabytes of base64. + * When `saveImageToTmp` is not configured, data URLs pass through unchanged. + */ + readonly storeImages: ( + conversationId: string, + images: readonly ImageInput[], + ) => Promise<readonly ImageInput[]>; + + /** Delete all tmp images for a conversation (on close). Best-effort. */ + readonly purgeConversationImages: (conversationId: string) => Promise<void>; + readonly prepareForProvider: ( messages: readonly ChatMessage[], currentModelName: string | undefined, @@ -625,7 +639,18 @@ export function createSessionOrchestrator( const effectiveModelName = resolveModelName(modelName, storedModel); const history = await deps.conversationStore.load(conversationId); - const userMsg = buildUserMessage(text, images); + + // Store images to tmp files (compact URLs) BEFORE building the user + // message so the persisted chunks hold tiny URL references, not + // megabytes of base64 data URLs. When the vision-handoff service isn't + // loaded, images pass through unchanged (backward compatible). + const visionHandoffForStore = deps.resolveVisionHandoff?.(); + const storedImages = + visionHandoffForStore !== undefined && images !== undefined + ? await visionHandoffForStore.storeImages(conversationId, images) + : images; + + const userMsg = buildUserMessage(text, storedImages); // Workspace assignment for new conversations happens BEFORE // effective-cwd resolution (see workspaceSetupPromise above) so @@ -988,6 +1013,9 @@ export function createSessionOrchestrator( }); }); void deps.conversationStore.setConversationStatus(conversationId, "closed"); + // Purge tmp images for this conversation (best-effort, fire-and-forget). + const vh = deps.resolveVisionHandoff?.(); + if (vh !== undefined) void vh.purgeConversationImages(conversationId); return { abortedTurn }; }, diff --git a/packages/transport-http/src/app.ts b/packages/transport-http/src/app.ts index ea216e1..16c4167 100644 --- a/packages/transport-http/src/app.ts +++ b/packages/transport-http/src/app.ts @@ -201,6 +201,37 @@ export function createApp(opts: CreateServerOptions): Hono { app.get("/health", (c) => c.json({ ok: true })); + // ── Tmp image serving (vision handoff) ────────────────────────────────────── + app.get("/images/:conversationId/:imageId", async (c) => { + const conversationId = c.req.param("conversationId"); + const imageId = c.req.param("imageId"); + if (imageId.includes("/") || imageId.includes("..")) { + return c.json({ error: "Invalid image ID" }, 400); + } + const imageDir = process.env.DISPATCH_IMAGE_DIR ?? "/tmp/dispatch/images"; + const { join } = await import("node:path"); + const { readFile: fsReadFile } = await import("node:fs/promises"); + const filePath = join(imageDir, conversationId, imageId); + try { + const buf = await fsReadFile(filePath); + const ext = imageId.toLowerCase(); + const mime = ext.endsWith(".png") + ? "image/png" + : ext.endsWith(".jpg") || ext.endsWith(".jpeg") + ? "image/jpeg" + : ext.endsWith(".webp") + ? "image/webp" + : ext.endsWith(".gif") + ? "image/gif" + : ext.endsWith(".bmp") + ? "image/bmp" + : "application/octet-stream"; + return new Response(buf, { headers: { "Content-Type": mime, "Cache-Control": "no-cache" } }); + } catch { + return c.json({ error: "Image not found" }, 404); + } + }); + app.get("/conversations/:id/metrics", async (c) => { const conversationId = c.req.param("id"); diff --git a/packages/vision-handoff/src/extension.ts b/packages/vision-handoff/src/extension.ts index af646aa..faf4621 100644 --- a/packages/vision-handoff/src/extension.ts +++ b/packages/vision-handoff/src/extension.ts @@ -9,13 +9,18 @@ * image + the model's specific question, and returns the conversation ID + the * vision model's answer. Follow-ups go through the dispatch CLI. * + * Images are saved to a tmp directory (`/tmp/dispatch/images/<convId>/`) so the + * conversation store (SQLite) only holds a compact URL reference — not + * megabytes of base64. Tmp files are purged on reboot (ephemeral dir), after + * compaction (the transcription replaces the image), and on conversation close. + * * Effects (filesystem, orchestrator) live here in the shell, injected into the * service. The pure decisions live in `pure.ts`. No `console.*`; logging via * `host.logger`. */ -import { readFile } from "node:fs/promises"; -import { extname, isAbsolute, resolve as pathResolve } from "node:path"; +import { mkdir, readFile, rm, unlink, writeFile } from "node:fs/promises"; +import { extname, isAbsolute, join, resolve as pathResolve } from "node:path"; import { conversationStoreHandle } from "@dispatch/conversation-store"; import type { CredentialStore } from "@dispatch/credential-store"; import { credentialStoreHandle } from "@dispatch/credential-store"; @@ -38,6 +43,8 @@ export const manifest: Manifest = { contributes: { services: ["vision-handoff/service"], tools: ["consult_vision"] }, }; +const IMAGE_DIR = process.env.DISPATCH_IMAGE_DIR ?? "/tmp/dispatch/images"; + /** MIME types for recognized image extensions. */ const MIME_BY_EXT: Readonly<Record<string, string>> = { ".png": "image/png", @@ -48,6 +55,15 @@ const MIME_BY_EXT: Readonly<Record<string, string>> = { ".bmp": "image/bmp", }; +/** Reverse: MIME → extension. */ +const EXT_BY_MIME: Readonly<Record<string, string>> = { + "image/png": ".png", + "image/jpeg": ".jpg", + "image/webp": ".webp", + "image/gif": ".gif", + "image/bmp": ".bmp", +}; + /** * Read an image file from disk as a base64 data URL. Resolves relative paths * against the cwd (the conversation's working directory). Throws on missing @@ -61,6 +77,70 @@ async function readFileAsDataUrl(path: string, cwd?: string): Promise<string> { return `data:${mime};base64,${buf.toString("base64")}`; } +/** + * Save a data URL image to a tmp file and return a compact HTTP path. + * The compact URL (`/images/<conversationId>/<uuid>.<ext>`) is what gets + * persisted in the conversation store — a tiny string, not megabytes of base64. + */ +async function saveImageToTmp( + conversationId: string, + dataUrl: string, + mimeType?: string, +): Promise<string> { + const mime = mimeType ?? "image/png"; + const ext = EXT_BY_MIME[mime] ?? ".png"; + const imageId = `${crypto.randomUUID()}${ext}`; + const dir = join(IMAGE_DIR, conversationId); + await mkdir(dir, { recursive: true }); + const filePath = join(dir, imageId); + const base64 = dataUrl.split(",")[1] ?? ""; + await writeFile(filePath, Buffer.from(base64, "base64")); + return `/images/${conversationId}/${imageId}`; +} + +/** + * Resolve a compact URL (`/images/<convId>/<imageId>`) back to a data URL by + * reading the tmp file. Data URLs and HTTP URLs pass through unchanged. + */ +async function resolveImageUrl(url: string): Promise<string> { + if (url.startsWith("data:") || url.startsWith("http")) return url; + if (!url.startsWith("/images/")) return url; + const parts = url.split("/"); // ["", "images", convId, imageId] + const convId = parts[2]; + const imageId = parts[3]; + if (convId === undefined || imageId === undefined) return url; + const filePath = join(IMAGE_DIR, convId, imageId); + const buf = await readFile(filePath); + const ext = extname(imageId).toLowerCase(); + const mime = MIME_BY_EXT[ext] ?? "image/png"; + return `data:${mime};base64,${buf.toString("base64")}`; +} + +/** Delete a single tmp image file (after compaction — best-effort). */ +async function deleteTmpImage(compactUrl: string): Promise<void> { + if (!compactUrl.startsWith("/images/")) return; + const parts = compactUrl.split("/"); + const convId = parts[2]; + const imageId = parts[3]; + if (convId === undefined || imageId === undefined) return; + const filePath = join(IMAGE_DIR, convId, imageId); + try { + await unlink(filePath); + } catch { + // Best-effort — file may already be deleted. + } +} + +/** Delete all tmp images for a conversation (on close — best-effort). */ +async function deleteConversationImages(conversationId: string): Promise<void> { + const dir = join(IMAGE_DIR, conversationId); + try { + await rm(dir, { recursive: true, force: true }); + } catch { + // Best-effort. + } +} + export async function activate(host: HostAPI): Promise<void> { const credentialStore = host.getService(credentialStoreHandle) as CredentialStore | undefined; if (credentialStore === undefined) { @@ -82,6 +162,10 @@ export async function activate(host: HostAPI): Promise<void> { credentialStore, resolveModel, readFileAsDataUrl, + saveImageToTmp, + resolveImageUrl, + deleteTmpImage, + deleteConversationImages, resolveOrchestrator: () => { const loaded = host.getExtensions().some((m) => m.id === "session-orchestrator"); if (!loaded) return undefined; diff --git a/packages/vision-handoff/src/service.ts b/packages/vision-handoff/src/service.ts index 7403c21..cc13d93 100644 --- a/packages/vision-handoff/src/service.ts +++ b/packages/vision-handoff/src/service.ts @@ -115,6 +115,37 @@ export interface VisionHandoffDeps { imageUrl: string, transcription: string, ) => Promise<void>; + /** + * Save an image data URL to a tmp file and return a compact URL + * (`/images/<conversationId>/<imageId>.<ext>`) that can be persisted in the + * conversation store instead of the full data URL (which would be megabytes). + * The frontend serves the image via `GET /images/...`; the provider resolves + * it back to a data URL via {@link resolveImageUrl} at runtime. When `undefined`, + * data URLs pass through unchanged (images persist in SQLite — the large-DB + * path, for environments without tmp file support). + */ + readonly saveImageToTmp?: ( + conversationId: string, + dataUrl: string, + mimeType?: string, + ) => Promise<string>; + /** + * Resolve a compact URL (`/images/...`) back to a data URL by reading the tmp + * file. Data URLs and HTTP URLs pass through unchanged. Paired with + * {@link saveImageToTmp}. + */ + readonly resolveImageUrl?: (url: string) => Promise<string>; + /** + * Delete a tmp image file (after it has been compacted to text — the + * transcription is cached, the raw image is no longer needed). Best-effort: + * errors are logged, not thrown. + */ + readonly deleteTmpImage?: (compactUrl: string) => Promise<void>; + /** + * Delete all tmp images for a conversation (on conversation close). + * Best-effort. + */ + readonly deleteConversationImages?: (conversationId: string) => Promise<void>; /** Generate a new conversation ID for a consultation. Defaults to crypto.randomUUID. */ readonly generateId?: () => string; readonly logger?: Logger; @@ -128,6 +159,24 @@ export interface VisionHandoffService { readonly isVisionCapable: (modelName: string | undefined) => Promise<boolean>; /** + * Store images to tmp files and return compact URLs. Each input image's data + * URL is saved to `/tmp/dispatch/images/<conversationId>/<uuid>.<ext>` and + * replaced with a compact HTTP path (`/images/<conversationId>/<uuid>.<ext>`) + * so the persisted conversation store holds a tiny string, not megabytes of + * base64. When `saveImageToTmp` is not configured, data URLs pass through + * unchanged (backward compatible). + */ + readonly storeImages: ( + conversationId: string, + images: readonly ImageInput[], + ) => Promise<readonly ImageInput[]>; + + /** + * Delete all tmp images for a conversation (on close). Best-effort. + */ + readonly purgeConversationImages: (conversationId: string) => Promise<void>; + + /** * Resolve a vision-capable model from the catalog (any provider). Returns * `undefined` when none is available. */ @@ -306,6 +355,15 @@ export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHando if (convId !== undefined && deps.setImageTranscription !== undefined) { await deps.setImageTranscription(convId, entry.url, text); } + // The image has been transcribed to text — delete the tmp file + // (the transcription is cached, the raw image is no longer needed). + if (deps.deleteTmpImage !== undefined) { + try { + await deps.deleteTmpImage(entry.url); + } catch { + // Best-effort — don't let cleanup failure break the turn. + } + } } catch (err) { const msg = err instanceof Error ? err.message : String(err); log?.warn("vision-handoff: image compaction transcription failed", { error: msg }); @@ -340,6 +398,42 @@ export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHando return result; } + async function resolveImageUrlsInMessages( + messages: readonly ChatMessage[], + ): Promise<readonly ChatMessage[]> { + if (deps.resolveImageUrl === undefined) return messages; + let hasCompact = false; + for (const msg of messages) { + if (msg.chunks.some((c) => c.type === "image")) { + hasCompact = true; + break; + } + } + if (!hasCompact) return messages; + const result: ChatMessage[] = []; + for (const msg of messages) { + if (!msg.chunks.some((c) => c.type === "image")) { + result.push(msg); + continue; + } + const newChunks: Chunk[] = []; + for (const chunk of msg.chunks) { + if (chunk.type === "image") { + const dataUrl = await deps.resolveImageUrl!(chunk.url); + newChunks.push({ + type: "image", + url: dataUrl, + ...(chunk.mimeType !== undefined ? { mimeType: chunk.mimeType } : {}), + }); + } else { + newChunks.push(chunk); + } + } + result.push({ role: msg.role, chunks: newChunks }); + } + return result; + } + const service: VisionHandoffService = { async isVisionCapable(modelName: string | undefined): Promise<boolean> { if (modelName === undefined) return false; @@ -347,6 +441,38 @@ export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHando return isVisionCapable(modelName, info); }, + async storeImages( + conversationId: string, + images: readonly ImageInput[], + ): Promise<readonly ImageInput[]> { + if (deps.saveImageToTmp === undefined) return images; + const result: ImageInput[] = []; + for (const img of images) { + if (img.url.startsWith("data:")) { + const compactUrl = await deps.saveImageToTmp(conversationId, img.url, img.mimeType); + result.push({ + url: compactUrl, + ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}), + }); + } else { + result.push(img); + } + } + return result; + }, + + async purgeConversationImages(conversationId: string): Promise<void> { + if (deps.deleteConversationImages === undefined) return; + try { + await deps.deleteConversationImages(conversationId); + } catch (err) { + log?.warn("vision-handoff: failed to purge conversation images", { + conversationId, + error: err instanceof Error ? err.message : String(err), + }); + } + }, + resolveVisionModel, async prepareForProvider( @@ -362,6 +488,11 @@ export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHando // Fast path: no images anywhere → nothing to do. if (!hasImageChunks(messages)) return messages; + // Resolve compact URLs (/images/...) → data URLs for the provider. + // The persisted chunks store compact URLs (tiny strings); the provider + // needs data URLs (read from tmp files at runtime). + const resolved = await resolveImageUrlsInMessages(messages); + const isCapable = currentModelName !== undefined && (await isVisionCapable(currentModelName, await getInfo(currentModelName))); @@ -371,7 +502,7 @@ export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHando // are transcribed to text (one-time, cached) and stripped from the // provider messages. Recent images (within the limit) stay native. if (isCapable) { - return compactImagesForVisionModel(messages, opts, currentModelName); + return compactImagesForVisionModel(resolved, opts, currentModelName); } // ── Non-vision model: placeholders + consult_vision ────────────────── @@ -388,7 +519,7 @@ export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHando // per-conversation registry so the consult_vision tool can look it up. let seqId = 0; const result: ChatMessage[] = []; - for (const msg of messages) { + for (const msg of resolved) { if (!msg.chunks.some((c) => c.type === "image")) { result.push(msg); continue; |
