diff options
| author | Adam Malczewski <[email protected]> | 2026-06-27 20:48:24 +0900 |
|---|---|---|
| committer | Adam Malczewski <[email protected]> | 2026-06-27 20:48:24 +0900 |
| commit | 04356c8678ae8dd1d7ddca2d0460b514116adc2e (patch) | |
| tree | 6c81894ef02d062570b12f4d3a871e58600dcb9c | |
| parent | 3184b10e614ce6249c83aa111368e98f6689f456 (diff) | |
| parent | b24ed99e89bc657e8c98c7cef8608e0c0b7594da (diff) | |
| download | dispatch-04356c8678ae8dd1d7ddca2d0460b514116adc2e.tar.gz dispatch-04356c8678ae8dd1d7ddca2d0460b514116adc2e.zip | |
Merge branch 'feature/vision-handoff' into dev
# Conflicts:
# packages/session-orchestrator/src/extension.ts
# packages/session-orchestrator/src/orchestrator.ts
37 files changed, 2707 insertions, 19 deletions
@@ -104,6 +104,7 @@ "@dispatch/tool-youtube-transcript": "workspace:*", "@dispatch/transport-http": "workspace:*", "@dispatch/transport-ws": "workspace:*", + "@dispatch/vision-handoff": "workspace:*", }, }, "packages/journal-sink": { @@ -371,6 +372,16 @@ "name": "@dispatch/ui-contract", "version": "0.2.0", }, + "packages/vision-handoff": { + "name": "@dispatch/vision-handoff", + "version": "0.0.0", + "dependencies": { + "@dispatch/conversation-store": "workspace:*", + "@dispatch/credential-store": "workspace:*", + "@dispatch/kernel": "workspace:*", + "@dispatch/openai-stream": "workspace:*", + }, + }, "packages/wire": { "name": "@dispatch/wire", "version": "0.12.0", @@ -473,6 +484,8 @@ "@dispatch/ui-contract": ["@dispatch/ui-contract@workspace:packages/ui-contract"], + "@dispatch/vision-handoff": ["@dispatch/vision-handoff@workspace:packages/vision-handoff"], + "@dispatch/wire": ["@dispatch/wire@workspace:packages/wire"], "@esbuild/aix-ppc64": ["@esbuild/[email protected]", "", { "os": "aix", "cpu": "ppc64" }, "sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg=="], diff --git a/packages/conversation-store/src/keys.ts b/packages/conversation-store/src/keys.ts index b2c635d..6ec2bc5 100644 --- a/packages/conversation-store/src/keys.ts +++ b/packages/conversation-store/src/keys.ts @@ -66,6 +66,14 @@ export function compactThresholdKey(conversationId: string): string { return `conv:${conversationId}:compact-percent`; } +/** Per-conversation image transcription cache (JSON map of imageUrl → transcription). */ +export function imageTranscriptionsKey(conversationId: string): string { + return `conv:${conversationId}:image-transcriptions`; +} + +/** Global vision settings (image compaction limit + compaction model). 
*/ +export const VISION_SETTINGS_KEY = "vision-settings"; + export function metaKey(conversationId: string): string { return `conv:${conversationId}:meta`; } diff --git a/packages/conversation-store/src/store.ts b/packages/conversation-store/src/store.ts index f90e809..69334e6 100644 --- a/packages/conversation-store/src/store.ts +++ b/packages/conversation-store/src/store.ts @@ -20,6 +20,7 @@ import { compactThresholdKey, computerKey, cwdKey, + imageTranscriptionsKey, metaKey, metricsKey, metricsPrefix, @@ -28,6 +29,7 @@ import { parseSeq, reasoningEffortKey, seqKey, + VISION_SETTINGS_KEY, workspaceKey, } from "./keys.js"; import { reconcileWithReport } from "./reconcile.js"; @@ -141,6 +143,35 @@ export interface ConversationStore { /** Set the compact percent (0-100, 0 = manual only). */ readonly setCompactPercent: (conversationId: string, percent: number) => Promise<void>; /** + * Get the per-conversation image transcription cache: a map of image URL → + * transcription text. Used by the vision handoff to avoid re-transcribing + * old images that were compacted to text on a previous turn. Returns an + * empty map when none are cached. + */ + readonly getImageTranscriptions: (conversationId: string) => Promise<ReadonlyMap<string, string>>; + /** + * Upsert a single image transcription into the per-conversation cache. + * Merges with any existing transcriptions (does NOT replace the whole map). + */ + readonly setImageTranscription: ( + conversationId: string, + imageUrl: string, + transcription: string, + ) => Promise<void>; + /** + * Get the global vision settings (image compaction limit + compaction model). + * The limit defaults to 10 when never set; the compaction model defaults to + * null (auto-select). Shared across ALL conversations and vision models. + */ + readonly getVisionSettings: () => Promise<{ + readonly imageLimit: number; + readonly compactionModel: string | null; + }>; + /** Set the global vision image compaction limit (0 = disabled). 
*/ + readonly setVisionImageLimit: (limit: number) => Promise<void>; + /** Set the global vision compaction model (null = auto-select). */ + readonly setVisionCompactionModel: (model: string | null) => Promise<void>; + /** * Set the `compactedFrom` field on a conversation's metadata, pointing to * the archive conversation that holds the pre-compaction history. */ @@ -1004,6 +1035,52 @@ export function createConversationStore( } }, + async getImageTranscriptions(conversationId) { + const raw = await storage.get(imageTranscriptionsKey(conversationId)); + if (raw === null) return new Map(); + try { + const obj = JSON.parse(raw) as Record<string, string>; + return new Map(Object.entries(obj)); + } catch { + return new Map(); + } + }, + + async setImageTranscription(conversationId, imageUrl, transcription) { + const existing = await this.getImageTranscriptions(conversationId); + const merged = new Map(existing); + merged.set(imageUrl, transcription); + const obj: Record<string, string> = {}; + for (const [k, v] of merged) obj[k] = v; + await storage.set(imageTranscriptionsKey(conversationId), JSON.stringify(obj)); + }, + + async getVisionSettings() { + const raw = await storage.get(VISION_SETTINGS_KEY); + if (raw === null) return { imageLimit: 10, compactionModel: null }; + try { + const obj = JSON.parse(raw) as { imageLimit?: number; compactionModel?: string | null }; + return { + imageLimit: typeof obj.imageLimit === "number" ? obj.imageLimit : 10, + compactionModel: obj.compactionModel ?? 
null, + }; + } catch { + return { imageLimit: 10, compactionModel: null }; + } + }, + + async setVisionImageLimit(limit) { + const current = await this.getVisionSettings(); + const obj = { imageLimit: limit, compactionModel: current.compactionModel }; + await storage.set(VISION_SETTINGS_KEY, JSON.stringify(obj)); + }, + + async setVisionCompactionModel(model) { + const current = await this.getVisionSettings(); + const obj = { imageLimit: current.imageLimit, compactionModel: model }; + await storage.set(VISION_SETTINGS_KEY, JSON.stringify(obj)); + }, + async setCompactedFrom(conversationId, newConversationId) { const raw = await storage.get(metaKey(conversationId)); const existing = raw !== null ? parseMetaRow(raw) : null; diff --git a/packages/host-bin/package.json b/packages/host-bin/package.json index e68251b..7d3b38c 100644 --- a/packages/host-bin/package.json +++ b/packages/host-bin/package.json @@ -34,6 +34,7 @@ "@dispatch/surface-loaded-extensions": "workspace:*", "@dispatch/surface-registry": "workspace:*", "@dispatch/transport-ws": "workspace:*", - "@dispatch/system-prompt": "workspace:*" + "@dispatch/system-prompt": "workspace:*", + "@dispatch/vision-handoff": "workspace:*" } } diff --git a/packages/host-bin/src/main.ts b/packages/host-bin/src/main.ts index 2ab1118..aa114d5 100644 --- a/packages/host-bin/src/main.ts +++ b/packages/host-bin/src/main.ts @@ -44,6 +44,7 @@ import { extension as toolWriteFileExt } from "@dispatch/tool-write-file"; import { extension as toolYoutubeTranscriptExt } from "@dispatch/tool-youtube-transcript"; import { createTransportHttpExtension } from "@dispatch/transport-http"; import { createTransportWsExtension } from "@dispatch/transport-ws"; +import { extension as visionHandoffExt } from "@dispatch/vision-handoff"; import type { ChildHandle } from "./collector-supervisor.js"; import { createCollectorSupervisor } from "./collector-supervisor.js"; import { configMapToAccess, envToConfigMap } from "./config.js"; @@ -206,6 +207,13 
@@ async function boot(): Promise<void> { const extensions: Extension[] = [ ...CORE_EXTENSIONS, createCredentialStoreExtension({ credentials }), + // vision-handoff activates AFTER credential-store (it resolves the + // credential-store service at activate time to find vision-capable models). + // Placed here, not in CORE_EXTENSIONS, so the service is available when it + // activates. The session-orchestrator resolves its service LAZILY + // (per-turn), so activation order between it and session-orchestrator + // doesn't matter. + visionHandoffExt, ...externalExtensions, ]; diff --git a/packages/host-bin/tsconfig.json b/packages/host-bin/tsconfig.json index cb85915..09b87df 100644 --- a/packages/host-bin/tsconfig.json +++ b/packages/host-bin/tsconfig.json @@ -63,6 +63,9 @@ }, { "path": "../transport-ws" + }, + { + "path": "../vision-handoff" } ] } diff --git a/packages/kernel/src/contracts/conversation.ts b/packages/kernel/src/contracts/conversation.ts index f074c52..80da86e 100644 --- a/packages/kernel/src/contracts/conversation.ts +++ b/packages/kernel/src/contracts/conversation.ts @@ -12,6 +12,8 @@ export type { ConversationMeta, ConversationStatus, ErrorChunk, + ImageChunk, + ImageInput, Role, StepId, StepMetrics, diff --git a/packages/kernel/src/contracts/index.ts b/packages/kernel/src/contracts/index.ts index 09e0a56..28e0a0b 100644 --- a/packages/kernel/src/contracts/index.ts +++ b/packages/kernel/src/contracts/index.ts @@ -19,6 +19,8 @@ export type { ConversationMeta, ConversationStatus, ErrorChunk, + ImageChunk, + ImageInput, Role, StepId, StepMetrics, diff --git a/packages/kernel/src/contracts/provider.ts b/packages/kernel/src/contracts/provider.ts index b6dc8ca..3137073 100644 --- a/packages/kernel/src/contracts/provider.ts +++ b/packages/kernel/src/contracts/provider.ts @@ -114,6 +114,16 @@ export interface ModelInfo { readonly displayName?: string; /** The model's max context window in tokens (e.g. 200000). 
Optional — providers that don't report it leave it undefined. */ readonly contextWindow?: number; + /** + * Whether this model can natively accept image input (vision/multimodal). + * When `true`, image chunks in a user message are passed through to the + * provider serialized to its image-content format. When `false`/absent, the + * orchestrator's vision handoff transcribes images to text (via a + * vision-capable model) before the model sees them. Optional — providers + * that cannot detect it leave it undefined (treated as non-vision); a + * provider that knows a model is vision-capable sets it `true`. + */ + readonly vision?: boolean; } /** diff --git a/packages/openai-stream/src/convert-messages.test.ts b/packages/openai-stream/src/convert-messages.test.ts index 3520eb5..57c7d81 100644 --- a/packages/openai-stream/src/convert-messages.test.ts +++ b/packages/openai-stream/src/convert-messages.test.ts @@ -35,6 +35,100 @@ describe("convertMessages", () => { expect(result).toEqual([{ role: "user", content: "Hello, world!" }]); }); + it("converts a user message with a text + image chunk to a multimodal content array", () => { + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "What is in this image?" }, + { type: "image", url: "data:image/png;base64,iVBORw0KGgo=" }, + ], + }, + ]; + + const result = convertMessages(messages); + expect(result).toEqual([ + { + role: "user", + content: [ + { type: "text", text: "What is in this image?" 
}, + { type: "image_url", image_url: { url: "data:image/png;base64,iVBORw0KGgo=" } }, + ], + }, + ]); + }); + + it("converts an image-only user message (no text) to a content array with just the image", () => { + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [{ type: "image", url: "https://example.com/cat.png" }], + }, + ]; + + const result = convertMessages(messages); + expect(result).toEqual([ + { + role: "user", + content: [{ type: "image_url", image_url: { url: "https://example.com/cat.png" } }], + }, + ]); + }); + + it("converts a user message with multiple images interspersed with text", () => { + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "Compare these:" }, + { type: "image", url: "data:image/png;base64,aaa" }, + { type: "text", text: "and" }, + { type: "image", url: "data:image/jpeg;base64,bbb" }, + ], + }, + ]; + + const result = convertMessages(messages); + expect(result).toHaveLength(1); + const content = result[0]?.content; + expect(Array.isArray(content)).toBe(true); + if (Array.isArray(content)) { + expect(content).toHaveLength(4); + expect(content[0]).toEqual({ type: "text", text: "Compare these:" }); + expect(content[1]).toEqual({ + type: "image_url", + image_url: { url: "data:image/png;base64,aaa" }, + }); + expect(content[2]).toEqual({ type: "text", text: "and" }); + expect(content[3]).toEqual({ + type: "image_url", + image_url: { url: "data:image/jpeg;base64,bbb" }, + }); + } + }); + + it("skips empty text parts in a multimodal message but keeps images", () => { + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "" }, + { type: "image", url: "data:image/png;base64,x" }, + ], + }, + ]; + + const result = convertMessages(messages); + const content = result[0]?.content; + expect(Array.isArray(content)).toBe(true); + if (Array.isArray(content)) { + // Empty text part is dropped; only the image remains. 
+ expect(content).toEqual([ + { type: "image_url", image_url: { url: "data:image/png;base64,x" } }, + ]); + } + }); + it("converts an assistant message with text only", () => { const messages: ChatMessage[] = [ { diff --git a/packages/openai-stream/src/convert-messages.ts b/packages/openai-stream/src/convert-messages.ts index e830243..eba3575 100644 --- a/packages/openai-stream/src/convert-messages.ts +++ b/packages/openai-stream/src/convert-messages.ts @@ -1,8 +1,28 @@ import type { ChatMessage, Chunk } from "@dispatch/kernel"; +/** A text part within a multimodal OpenAI content array. */ +export interface OpenAITextPart { + readonly type: "text"; + readonly text: string; +} + +/** An image part within a multimodal OpenAI content array (OpenAI vision format). */ +export interface OpenAIImagePart { + readonly type: "image_url"; + readonly image_url: { readonly url: string }; +} + +/** + * A part of a multimodal message content array. When a message has mixed text + * and image chunks, the content is serialized as an array of these parts + * (OpenAI's vision format). Plain-text messages keep a string `content` for + * byte-stability with providers that only accept strings. + */ +export type OpenAIContentPart = OpenAITextPart | OpenAIImagePart; + export interface OpenAIMessage { readonly role: "system" | "user" | "assistant" | "tool"; - readonly content: string | null; + readonly content: string | null | readonly OpenAIContentPart[]; readonly tool_calls?: readonly OpenAIToolCall[]; readonly tool_call_id?: string; } @@ -49,6 +69,29 @@ function convertSystemMessage(msg: ChatMessage): OpenAIMessage { } function convertUserMessage(msg: ChatMessage): OpenAIMessage { + // If the message has image chunks, serialize as a multimodal content array + // (OpenAI vision format): text parts + image_url parts in chunk order. 
+ // Plain text-only messages keep a string `content` for byte-stability with + // providers that only accept a string (and to keep prompt-cache prefixes + // unchanged for the common no-image case). + const hasImage = msg.chunks.some((c) => c.type === "image"); + if (hasImage) { + const parts: OpenAIContentPart[] = []; + for (const chunk of msg.chunks) { + if (chunk.type === "text") { + if (chunk.text.length > 0) { + parts.push({ type: "text", text: chunk.text }); + } + } else if (chunk.type === "image") { + parts.push({ type: "image_url", image_url: { url: chunk.url } }); + } + // Non-text/non-image chunks (tool-call, thinking, etc.) are not part of a + // user message's provider content and are skipped here. + } + // An image-only message (no text) still needs at least the image part. + return { role: "user", content: parts.length > 0 ? parts : "" }; + } + const text = msg.chunks .filter((c): c is Extract<Chunk, { type: "text" }> => c.type === "text") .map((c) => c.text) diff --git a/packages/openai-stream/src/index.ts b/packages/openai-stream/src/index.ts index bd2f673..3f76b99 100644 --- a/packages/openai-stream/src/index.ts +++ b/packages/openai-stream/src/index.ts @@ -1,8 +1,14 @@ -export type { OpenAIMessage, OpenAIToolCall } from "./convert-messages.js"; +export type { + OpenAIContentPart, + OpenAIImagePart, + OpenAIMessage, + OpenAITextPart, + OpenAIToolCall, +} from "./convert-messages.js"; export { convertMessages } from "./convert-messages.js"; export type { OpenAITool } from "./convert-tools.js"; export { convertTools } from "./convert-tools.js"; -export { parseModelList } from "./listModels.js"; +export { isVisionModelId, parseModelList } from "./listModels.js"; export { parseSSELines } from "./parse-sse.js"; export type { CreateOpenAICompatProviderOpts } from "./provider.js"; export { createOpenAICompatProvider } from "./provider.js"; diff --git a/packages/openai-stream/src/listModels.test.ts b/packages/openai-stream/src/listModels.test.ts index 
c2438bc..2e3b1a3 100644 --- a/packages/openai-stream/src/listModels.test.ts +++ b/packages/openai-stream/src/listModels.test.ts @@ -1,7 +1,7 @@ import type { ApiKeyCredentials, ModelInfo, ProviderContract } from "@dispatch/kernel"; import type { FetchLike } from "@dispatch/trace-replay"; import { describe, expect, it, vi } from "vitest"; -import { parseModelList } from "./listModels.js"; +import { isVisionModelId, parseModelList } from "./listModels.js"; import { createOpenAICompatProvider } from "./provider.js"; function makeProvider(fetchFn: FetchLike, apiKey = "sk-test-1234567890abcdef"): ProviderContract { @@ -35,6 +35,53 @@ describe("listModels — pure mapping (parseModelList)", () => { const result = parseModelList([]); expect(result).toEqual([]); }); + + it("extracts contextWindow from common field names", () => { + const result = parseModelList([ + { id: "m1", context_length: 128000 }, + { id: "m2", context_window: 200000 }, + { id: "m3", max_context_length: 64000 }, + { id: "m4", max_tokens: 8000 }, + ]); + expect(result).toEqual([ + { id: "m1", contextWindow: 128000 }, + { id: "m2", contextWindow: 200000 }, + { id: "m3", contextWindow: 64000 }, + { id: "m4", contextWindow: 8000 }, + ]); + }); +}); + +describe("listModels — vision capability detection", () => { + it("isVisionModelId returns true for umans kimi and qwen model ids", () => { + expect(isVisionModelId("umans-kimi-k2.7")).toBe(true); + expect(isVisionModelId("Umans-Kimi-K2.7")).toBe(true); // case-insensitive + expect(isVisionModelId("umans-qwen3.6-35b-a3b")).toBe(true); + }); + + it("isVisionModelId returns false for non-vision model ids", () => { + expect(isVisionModelId("umans-glm-5.2")).toBe(false); + expect(isVisionModelId("umans-coder")).toBe(false); + expect(isVisionModelId("umans-flash")).toBe(false); + expect(isVisionModelId("kimi-k2.7-code")).toBe(false); // opencode kimi, not umans + expect(isVisionModelId("qwen3.7-max")).toBe(false); // opencode qwen, not umans + 
expect(isVisionModelId("deepseek-v4-flash")).toBe(false); + }); + + it("parseModelList sets vision: true on umans kimi and qwen models only", () => { + const result = parseModelList([ + { id: "umans-kimi-k2.7", context_length: 262144 }, + { id: "umans-qwen3.6-35b-a3b", context_length: 262144 }, + { id: "umans-glm-5.2", context_length: 405504 }, + { id: "umans-coder" }, + ]); + expect(result).toEqual([ + { id: "umans-kimi-k2.7", contextWindow: 262144, vision: true }, + { id: "umans-qwen3.6-35b-a3b", contextWindow: 262144, vision: true }, + { id: "umans-glm-5.2", contextWindow: 405504 }, + { id: "umans-coder" }, + ]); + }); }); describe("listModels — provider contract", () => { diff --git a/packages/openai-stream/src/listModels.ts b/packages/openai-stream/src/listModels.ts index 0e94c43..df116b0 100644 --- a/packages/openai-stream/src/listModels.ts +++ b/packages/openai-stream/src/listModels.ts @@ -24,17 +24,39 @@ interface OpenAIModelListResponse { } /** + * Whether a model id is vision-capable (can natively accept image input). + * + * The OpenAI-compatible `/models` endpoint does not reliably report image + * capabilities, so this is a hardcoded heuristic by model id: the Umans Kimi + * (`umans-kimi-k2.7`) and Umans Qwen (`umans-qwen3.6-35b-a3b`) models are + * vision-capable; all others are treated as non-vision. This is the single + * source of truth — the orchestrator's vision handoff and the `consult_vision` + * tool both consult the `ModelInfo.vision` flag this sets, so adding a model + * here enables vision everywhere. Pure: id → boolean, no I/O. + * + * (When an endpoint gains reliable vision reporting, this can be replaced with + * a real capability check without changing callers.) + */ +export function isVisionModelId(id: string): boolean { + const lower = id.toLowerCase(); + return lower.includes("umans-kimi") || lower.includes("umans-qwen"); +} + +/** * Pure mapping: raw OpenAI-compatible model list → ModelInfo[]. 
- * Extracts `contextWindow` from common field names (providers vary). - * Extracted for direct unit testing with no I/O. + * Extracts `contextWindow` from common field names (providers vary) and + * detects vision capability via {@link isVisionModelId}. Extracted for direct + * unit testing with no I/O. */ export function parseModelList(data: readonly OpenAIModelEntry[]): readonly ModelInfo[] { return data.map((entry) => { const contextWindow = entry.context_length ?? entry.context_window ?? entry.max_context_length ?? entry.max_tokens; + const vision = isVisionModelId(entry.id); return { id: entry.id, ...(contextWindow !== undefined ? { contextWindow } : {}), + ...(vision ? { vision } : {}), }; }); } diff --git a/packages/session-orchestrator/src/extension.ts b/packages/session-orchestrator/src/extension.ts index 0cd83ef..783d894 100644 --- a/packages/session-orchestrator/src/extension.ts +++ b/packages/session-orchestrator/src/extension.ts @@ -12,6 +12,7 @@ import { createSessionOrchestrator, createWarmService, sessionOrchestratorHandle, + visionHandoffLocalHandle, } from "./orchestrator.js"; import { selectFirstProvider } from "./pure.js"; import { filterRemoteIncompatibleTools, toolsFilter } from "./tools-filter.js"; @@ -107,6 +108,20 @@ export function activate(host: HostAPI): void { return undefined; } }, + resolveVisionHandoff: () => { + // Lazily resolve the vision-handoff service. Returns undefined when the + // vision-handoff extension isn't loaded (images pass through unchanged — + // correct for vision-capable models; the feature degrades off cleanly for + // text-only turns). Lazy so activation order doesn't matter; the + // activated-manifests guard avoids a getService throw when absent. 
+ const loaded = host.getExtensions().some((m) => m.id === "vision-handoff"); + if (!loaded) return undefined; + try { + return host.getService(visionHandoffLocalHandle); + } catch { + return undefined; + } + }, }); host.provideService(sessionOrchestratorHandle, orchestrator); diff --git a/packages/session-orchestrator/src/orchestrator.ts b/packages/session-orchestrator/src/orchestrator.ts index 617c079..5c36922 100644 --- a/packages/session-orchestrator/src/orchestrator.ts +++ b/packages/session-orchestrator/src/orchestrator.ts @@ -5,6 +5,7 @@ import type { CompactionResult, ConversationStatus, EventHookDescriptor, + ImageInput, Logger, ModelInfo, ProviderContract, @@ -34,11 +35,71 @@ import { } from "./pure.js"; import type { ToolAssembly } from "./tools-filter.js"; +// --- Vision handoff (lazy, optional) --- + +/** + * Minimal contract the vision-handoff service satisfies. Defined here (not + * imported from the vision-handoff package) so the orchestrator has NO + * compile-time dependency on it — the service is resolved lazily at runtime + * (like the message-queue / system-prompt services), and the feature degrades + * off cleanly when the extension isn't loaded (images pass through unchanged, + * which is correct for vision-capable models and a no-op for text-only turns). + * + * `prepareForProvider` transforms a message list for the provider: if the + * active model is vision-capable, messages pass through unchanged; otherwise + * image chunks are replaced with numbered placeholders (telling the model to + * call `consult_vision`) and the images are registered for tool access. + */ +export interface VisionHandoffService { + /** + * Store images to tmp files and return compact URLs. Each input image's data + * URL is saved to a tmp file and replaced with a compact HTTP path so the + * persisted conversation store holds a tiny string, not megabytes of base64. + * When `saveImageToTmp` is not configured, data URLs pass through unchanged. 
+ */ + readonly storeImages: ( + conversationId: string, + images: readonly ImageInput[], + ) => Promise<readonly ImageInput[]>; + + /** Delete all tmp images for a conversation (on close). Best-effort. */ + readonly purgeConversationImages: (conversationId: string) => Promise<void>; + + readonly prepareForProvider: ( + messages: readonly ChatMessage[], + currentModelName: string | undefined, + opts?: { + readonly conversationId?: string; + readonly imageLimit?: number; + readonly signal?: AbortSignal; + readonly logger?: Logger; + }, + ) => Promise<readonly ChatMessage[]>; +} + +/** + * Local handle for the vision-handoff service, keyed by the same ID the + * vision-handoff extension registers under (`"vision-handoff/service"`). Defined + * locally (not imported) so the orchestrator has no compile-time dependency on + * the vision-handoff package — the service is resolved lazily at runtime, and + * the feature degrades off cleanly when the extension isn't loaded. + */ +export const visionHandoffLocalHandle: ServiceHandle<VisionHandoffService> = + defineService<VisionHandoffService>("vision-handoff/service"); + // --- Broadcast hub types --- export interface StartTurnInput { readonly conversationId: string; readonly text: string; + /** + * Images attached to this turn (e.g. user-pasted screenshots). Each is + * appended as an `image` chunk on the persisted user message. For a + * vision-capable model the images pass through to the provider natively; for + * a non-vision model the vision handoff transcribes them to text first. + * Optional — omit for a text-only turn. + */ + readonly images?: readonly ImageInput[]; readonly modelName?: string; readonly cwd?: string; /** @@ -77,6 +138,12 @@ export type StartTurnResult = export interface EnqueueInput { readonly conversationId: string; readonly text: string; + /** + * Images attached (the steering / opening message analog of + * `StartTurnInput.images`). 
Threaded to `startTurn` when the conversation is + * idle (the message starts a turn). Additive optional. + */ + readonly images?: readonly ImageInput[]; /** Workspace to stamp on a new conversation. Defaults to `"default"`. */ readonly workspaceId?: string; /** @@ -291,6 +358,8 @@ export interface SessionOrchestrator { workspaceId?: string; /** Explicit system-prompt override — see {@link StartTurnInput.systemPrompt}. */ systemPrompt?: string; + /** Images attached to this turn — see {@link StartTurnInput.images}. */ + images?: readonly ImageInput[]; }): Promise<void>; } @@ -345,6 +414,17 @@ export interface SessionOrchestratorDeps { * when the stream completes. Lazy so activation order doesn't matter. */ readonly resolveConcurrencyLimiter?: () => ConcurrencyLimiter | undefined; + /** + * Lazily resolves the vision-handoff service, or `undefined` when the + * vision-handoff extension isn't loaded. Used to transcribe image chunks to + * text for non-vision models before they reach the provider (so a text-only + * model can still reason about pasted/code images). When `undefined`, images + * pass through unchanged (correct for vision-capable models; a text-only model + * would then receive image content its API may reject — the feature degrades + * off cleanly for text-only turns since there are no images). Lazy so + * activation order doesn't matter; called per-turn. + */ + readonly resolveVisionHandoff?: () => VisionHandoffService | undefined; /** Apply the per-turn tools filter chain. Injected for testability. */ readonly applyToolsFilter: (assembly: ToolAssembly) => Promise<ToolAssembly>; /** Base logger (auto-scoped to this extension); childed per turn for span capture. 
*/ @@ -447,6 +527,7 @@ export function createSessionOrchestrator( reasoningEffortOverride: ReasoningEffort | undefined, workspaceId: string, systemPromptOverride: string | undefined, + images: readonly ImageInput[] | undefined, ): void { const turnId = generateTurnId(); const promptStartedAt = deps.now?.() ?? Date.now(); @@ -569,7 +650,18 @@ export function createSessionOrchestrator( const effectiveModelName = resolveModelName(modelName, storedModel); const history = await deps.conversationStore.load(conversationId); - const userMsg = buildUserMessage(text); + + // Store images to tmp files (compact URLs) BEFORE building the user + // message so the persisted chunks hold tiny URL references, not + // megabytes of base64 data URLs. When the vision-handoff service isn't + // loaded, images pass through unchanged (backward compatible). + const visionHandoffForStore = deps.resolveVisionHandoff?.(); + const storedImages = + visionHandoffForStore !== undefined && images !== undefined + ? await visionHandoffForStore.storeImages(conversationId, images) + : images; + + const userMsg = buildUserMessage(text, storedImages); // Workspace assignment for new conversations happens BEFORE // effective-cwd resolution (see workspaceSetupPromise above) so @@ -744,9 +836,35 @@ export function createSessionOrchestrator( return [{ role: "user", chunks: [{ type: "text", text: steerText }] }]; }; + // Vision handoff: transform the message list for the provider. When the + // active model is vision-capable, images pass through natively (no-op). + // When it is NOT vision-capable, image chunks are transcribed to text + // descriptions via a vision-capable model — so a text-only model can + // still reason about images. The PERSISTED user message keeps the + // original image chunks (appended below); only the provider's view is + // transcribed. 
When the vision-handoff service isn't loaded, images pass + // through unchanged (correct for vision models; text-only models would + // then receive image content their API may reject — degrades off cleanly + // for text-only turns with no images). + const visionHandoff = deps.resolveVisionHandoff?.(); + let providerMessages: readonly ChatMessage[] = [...history, userMsg]; + if (visionHandoff !== undefined) { + const visionSettings = await deps.conversationStore.getVisionSettings(); + providerMessages = await visionHandoff.prepareForProvider( + providerMessages, + effectiveModelName, + { + conversationId, + imageLimit: visionSettings.imageLimit, + signal: controller.signal, + ...(turnLogger !== undefined ? { logger: turnLogger } : {}), + }, + ); + } + const opts: RunTurnInput = { provider, - messages: [...history, userMsg], + messages: providerMessages, tools: assembled.tools, dispatch, emit: emitAndAccumulate, @@ -852,6 +970,7 @@ export function createSessionOrchestrator( reasoningEffort, workspaceId, systemPrompt, + images, }) { if (activeTurns.has(conversationId)) { return { started: false, reason: "already-active" }; @@ -865,18 +984,20 @@ export function createSessionOrchestrator( reasoningEffort, workspaceId ?? "default", systemPrompt, + images, ); const turn = activeTurns.get(conversationId); const turnId = turn !== undefined ? turn.turnId : ""; return { started: true, turnId }; }, - enqueue({ conversationId, text, workspaceId, computerId }) { + enqueue({ conversationId, text, workspaceId, computerId, images }) { const result = orchestrator.startTurn({ conversationId, text, ...(workspaceId !== undefined ? { workspaceId } : {}), ...(computerId !== undefined ? { computerId } : {}), + ...(images !== undefined ? 
{ images } : {}), }); if (result.started) { return { startedTurn: true, queue: [] }; @@ -939,6 +1060,9 @@ export function createSessionOrchestrator( }); }); void deps.conversationStore.setConversationStatus(conversationId, "closed"); + // Purge tmp images for this conversation (best-effort, fire-and-forget). + const vh = deps.resolveVisionHandoff?.(); + if (vh !== undefined) void vh.purgeConversationImages(conversationId); return { abortedTurn }; }, @@ -961,6 +1085,7 @@ export function createSessionOrchestrator( reasoningEffort, workspaceId, systemPrompt, + images, }) { const turnInput: StartTurnInput = { conversationId, @@ -971,6 +1096,7 @@ export function createSessionOrchestrator( ...(reasoningEffort !== undefined ? { reasoningEffort } : {}), ...(workspaceId !== undefined ? { workspaceId } : {}), ...(systemPrompt !== undefined ? { systemPrompt } : {}), + ...(images !== undefined ? { images } : {}), }; const result = orchestrator.startTurn(turnInput); if (!result.started) { diff --git a/packages/session-orchestrator/src/pure.test.ts b/packages/session-orchestrator/src/pure.test.ts index c75cb82..7a574f1 100644 --- a/packages/session-orchestrator/src/pure.test.ts +++ b/packages/session-orchestrator/src/pure.test.ts @@ -26,6 +26,39 @@ describe("buildUserMessage", () => { expect(msg.role).toBe("user"); expect(msg.chunks[0]).toEqual({ type: "text", text: "" }); }); + + it("appends image chunks after the text chunk when images are given", () => { + const msg = buildUserMessage("look at this", [ + { url: "data:image/png;base64,aaa" }, + { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" }, + ]); + expect(msg.chunks).toHaveLength(3); + expect(msg.chunks[0]).toEqual({ type: "text", text: "look at this" }); + expect(msg.chunks[1]).toEqual({ type: "image", url: "data:image/png;base64,aaa" }); + expect(msg.chunks[2]).toEqual({ + type: "image", + url: "data:image/jpeg;base64,bbb", + mimeType: "image/jpeg", + }); + }); + + it("builds an image-only message when text 
is empty", () => { + const msg = buildUserMessage("", [{ url: "data:image/png;base64,zzz" }]); + expect(msg.chunks).toHaveLength(1); + expect(msg.chunks[0]).toEqual({ type: "image", url: "data:image/png;base64,zzz" }); + }); + + it("includes mimeType when provided", () => { + const msg = buildUserMessage("hi", [ + { url: "data:image/webp;base64,x", mimeType: "image/webp" }, + ]); + expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBe("image/webp"); + }); + + it("omits mimeType when not provided", () => { + const msg = buildUserMessage("hi", [{ url: "https://example.com/x.png" }]); + expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBeUndefined(); + }); }); describe("selectFirstProvider", () => { diff --git a/packages/session-orchestrator/src/pure.ts b/packages/session-orchestrator/src/pure.ts index 2208e8f..0d2068f 100644 --- a/packages/session-orchestrator/src/pure.ts +++ b/packages/session-orchestrator/src/pure.ts @@ -1,12 +1,40 @@ import type { ChatMessage, + Chunk, + ImageInput, ProviderContract, ReasoningEffort, ToolDispatchPolicy, } from "@dispatch/kernel"; -export function buildUserMessage(text: string): ChatMessage { - return { role: "user", chunks: [{ type: "text", text }] }; +/** + * Build the persisted user message for a turn. When `images` are provided, each + * is appended as an `image` chunk AFTER the text chunk, so the persisted message + * carries both the prompt text and the attached images (the frontend renders + * the images; vision-capable providers receive them natively; non-vision + * providers have them transcribed by the vision handoff before streaming). + * + * Pure: inputs → a ChatMessage, no I/O. 
+ */ +export function buildUserMessage(text: string, images?: readonly ImageInput[]): ChatMessage { + const chunks: Chunk[] = []; + if (text.length > 0) { + chunks.push({ type: "text", text }); + } + if (images !== undefined) { + for (const img of images) { + chunks.push({ + type: "image", + url: img.url, + ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}), + }); + } + } + // An image-only message (empty text) is valid. + if (chunks.length === 0) { + chunks.push({ type: "text", text: "" }); + } + return { role: "user", chunks }; } // ── Provider-error retry backoff schedule ─────────────────────────────────── diff --git a/packages/transport-contract/src/contract.types.test.ts b/packages/transport-contract/src/contract.types.test.ts index 9d3d904..34ff544 100644 --- a/packages/transport-contract/src/contract.types.test.ts +++ b/packages/transport-contract/src/contract.types.test.ts @@ -20,6 +20,7 @@ import type { LspServerState, LspStatusResponse, McpStatusResponse, + ModelsResponse, SetConversationComputerRequest, SetCwdRequest, SetWorkspaceDefaultComputerRequest, @@ -55,6 +56,18 @@ const _chatWithoutComputer: ChatRequest = { message: "hello", }; +// ─── ChatRequest.images (additive optional) ────────────────────────────────── + +const _chatWithImages: ChatRequest = { + message: "What's in this screenshot?", + images: [{ url: "data:image/png;base64,iVBORw0KGgo=", mimeType: "image/png" }], +}; + +const _chatWithHttpImage: ChatRequest = { + message: "analyze this", + images: [{ url: "https://example.com/diagram.png" }], +}; + // ─── Computer list / single response ───────────────────────────────────────── const _computer: Computer = { @@ -255,6 +268,35 @@ describe("transport-contract types compile and are exported", () => { expect(_chatWithComputer.computerId).toBe("prod-box"); }); + // ─── ChatRequest.images (additive optional) ────────────────────────────── + + it("ChatRequest: images is additive optional (omittable)", () => { + 
expect(_chatWithoutComputer.images).toBeUndefined(); + }); + + it("ChatRequest: carries images (data URL) when set", () => { + expect(_chatWithImages.images).toHaveLength(1); + expect(_chatWithImages.images?.[0]?.url).toContain("base64"); + expect(_chatWithImages.images?.[0]?.mimeType).toBe("image/png"); + }); + + it("ChatRequest: carries images (http URL, mimeType optional)", () => { + expect(_chatWithHttpImage.images?.[0]?.url).toBe("https://example.com/diagram.png"); + expect(_chatWithHttpImage.images?.[0]?.mimeType).toBeUndefined(); + }); + + it("ModelsResponse: ModelMetadata carries optional vision flag", () => { + const resp: ModelsResponse = { + models: ["umans/kimi-k2.7", "umans/glm-5.2"], + modelInfo: { + "umans/kimi-k2.7": { contextWindow: 200000, vision: true }, + "umans/glm-5.2": { contextWindow: 128000 }, + }, + }; + expect(resp.modelInfo?.["umans/kimi-k2.7"]?.vision).toBe(true); + expect(resp.modelInfo?.["umans/glm-5.2"]?.vision).toBeUndefined(); + }); + // ─── Computers ─────────────────────────────────────────────────────────── it("ComputerListResponse: carries entries with usage counts", () => { diff --git a/packages/transport-contract/src/index.ts b/packages/transport-contract/src/index.ts index 400d9d5..d5f3000 100644 --- a/packages/transport-contract/src/index.ts +++ b/packages/transport-contract/src/index.ts @@ -26,6 +26,7 @@ import type { ComputerEntry, ConversationMeta, ConversationStatus, + ImageInput, QueuedMessage, ReasoningEffort, StoredChunk, @@ -41,6 +42,8 @@ export type { ComputerEntry, ConversationMeta, ConversationStatus, + ImageChunk, + ImageInput, QueuedMessage, ReasoningEffort, StepMetrics, @@ -68,6 +71,19 @@ export interface ChatRequest { readonly message: string; /** + * Images attached to this turn (e.g. a user-pasted screenshot). Each entry's + * `url` is a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` + * URL. The server converts these to `image` chunks on the persisted user + * message. 
For a VISION-capable model (e.g. kimi), the images are passed + * through to the provider natively. For a NON-vision model (e.g. glm-5.2), + * the server's vision handoff transcribes each image to a text description + * (via a vision-capable model) and feeds that text instead — so a text-only + * model can still reason about the image's contents. Optional — omit for a + * text-only turn (backward compatible). + */ + readonly images?: readonly ImageInput[]; + + /** * The model to use, as a model name in `<credentialName>/<model>` form — one * of the exact strings returned by `GET /models`. Omit to use the server's * default credential + model. @@ -124,6 +140,14 @@ export interface ModelsResponse { /** Per-model metadata returned alongside the model catalog. */ export interface ModelMetadata { readonly contextWindow?: number; + /** + * Whether this model can natively accept image input (vision/multimodal). + * When `true`, image chunks in a user message are passed through to the + * provider. When `false`/absent, the server's vision handoff transcribes + * images to text before the model sees them. A client may use this to show a + * vision badge in the model picker. Optional — absent when unknown. + */ + readonly vision?: boolean; } /** @@ -387,6 +411,23 @@ export interface SystemPromptVariablesResponse { readonly variables: readonly SystemPromptVariable[]; } +// ─── Vision settings (global) ────────────────────────────────────────────────── + +/** + * Response of `GET /settings/vision` — the global vision configuration shared + * across all conversations and vision models. + */ +export interface VisionSettingsResponse { + readonly imageLimit: number; + readonly compactionModel: string | null; +} + +/** Body of `PUT /settings/vision` — a partial update. 
*/ +export interface SetVisionSettingsRequest { + readonly imageLimit?: number; + readonly compactionModel?: string | null; +} + // ─── Message queue (steering) ───────────────────────────────────────────────── /** diff --git a/packages/transport-http/src/app.ts b/packages/transport-http/src/app.ts index 23f8dde..0fcc8f0 100644 --- a/packages/transport-http/src/app.ts +++ b/packages/transport-http/src/app.ts @@ -42,6 +42,7 @@ import type { ThroughputResponse, TitleResponse, UpdateHeartbeatRequest, + VisionSettingsResponse, WarmResponse, WorkspaceListResponse, WorkspaceResponse, @@ -212,6 +213,37 @@ export function createApp(opts: CreateServerOptions): Hono { app.get("/health", (c) => c.json({ ok: true })); + // ── Tmp image serving (vision handoff) ────────────────────────────────────── + app.get("/images/:conversationId/:imageId", async (c) => { + const conversationId = c.req.param("conversationId"); + const imageId = c.req.param("imageId"); + if (imageId.includes("/") || imageId.includes("..")) { + return c.json({ error: "Invalid image ID" }, 400); + } + const imageDir = process.env.DISPATCH_IMAGE_DIR ?? "/tmp/dispatch/images"; + const { join } = await import("node:path"); + const { readFile: fsReadFile } = await import("node:fs/promises"); + const filePath = join(imageDir, conversationId, imageId); + try { + const buf = await fsReadFile(filePath); + const ext = imageId.toLowerCase(); + const mime = ext.endsWith(".png") + ? "image/png" + : ext.endsWith(".jpg") || ext.endsWith(".jpeg") + ? "image/jpeg" + : ext.endsWith(".webp") + ? "image/webp" + : ext.endsWith(".gif") + ? "image/gif" + : ext.endsWith(".bmp") + ? 
"image/bmp" + : "application/octet-stream"; + return new Response(buf, { headers: { "Content-Type": mime, "Cache-Control": "no-cache" } }); + } catch { + return c.json({ error: "Image not found" }, 404); + } + }); + app.get("/conversations/:id/metrics", async (c) => { const conversationId = c.req.param("id"); @@ -306,11 +338,14 @@ export function createApp(opts: CreateServerOptions): Hono { app.get("/models", async (c) => { try { const models = await opts.credentialStore.listCatalog(); - const modelInfo: Record<string, { contextWindow?: number }> = {}; + const modelInfo: Record<string, { contextWindow?: number; vision?: boolean }> = {}; for (const modelName of models) { const info = await opts.credentialStore.getModelInfo(modelName); - if (info?.contextWindow !== undefined) { - modelInfo[modelName] = { contextWindow: info.contextWindow }; + if (info?.contextWindow !== undefined || info?.vision === true) { + const entry: { contextWindow?: number; vision?: boolean } = {}; + if (info?.contextWindow !== undefined) entry.contextWindow = info.contextWindow; + if (info?.vision === true) entry.vision = true; + modelInfo[modelName] = entry; } } const body: ModelsResponse = { @@ -410,8 +445,16 @@ export function createApp(opts: CreateServerOptions): Hono { return c.json({ error: result.error }, 400); } - const { conversationId, message, model, cwd, computerId, reasoningEffort, workspaceId } = - result; + const { + conversationId, + message, + model, + cwd, + computerId, + reasoningEffort, + workspaceId, + images, + } = result; log.info("chat: request accepted", { conversationId, hasModel: model !== undefined, @@ -419,6 +462,7 @@ export function createApp(opts: CreateServerOptions): Hono { hasComputerId: computerId !== undefined, hasReasoningEffort: reasoningEffort !== undefined, hasWorkspaceId: workspaceId !== undefined, + imageCount: images?.length ?? 
0, }); const events: AgentEvent[] = []; @@ -469,6 +513,7 @@ export function createApp(opts: CreateServerOptions): Hono { ...(computerId !== undefined ? { computerId } : {}), ...(reasoningEffort !== undefined ? { reasoningEffort } : {}), ...(workspaceId !== undefined ? { workspaceId } : {}), + ...(images !== undefined ? { images } : {}), }; opts.orchestrator @@ -1671,6 +1716,43 @@ export function createApp(opts: CreateServerOptions): Hono { return c.json(response, 200); }); + app.get("/settings/vision", async (c) => { + const settings = await opts.conversationStore.getVisionSettings(); + const body: VisionSettingsResponse = settings; + return c.json(body, 200); + }); + + app.put("/settings/vision", async (c) => { + let body: unknown; + try { + body = await c.req.json(); + } catch { + return c.json({ error: "Invalid JSON body" }, 400); + } + const obj = body as { imageLimit?: unknown; compactionModel?: unknown }; + if (obj.imageLimit !== undefined) { + if ( + typeof obj.imageLimit !== "number" || + !Number.isInteger(obj.imageLimit) || + obj.imageLimit < 0 + ) { + return c.json({ error: "imageLimit must be a non-negative integer" }, 400); + } + await opts.conversationStore.setVisionImageLimit(obj.imageLimit); + log.info("vision: image limit set", { imageLimit: obj.imageLimit }); + } + if (obj.compactionModel !== undefined) { + if (obj.compactionModel !== null && typeof obj.compactionModel !== "string") { + return c.json({ error: "compactionModel must be a string or null" }, 400); + } + await opts.conversationStore.setVisionCompactionModel(obj.compactionModel); + log.info("vision: compaction model set", { compactionModel: obj.compactionModel }); + } + const settings = await opts.conversationStore.getVisionSettings(); + const response: VisionSettingsResponse = settings; + return c.json(response, 200); + }); + // ─── Static frontend serving (catch-all, API routes take precedence) ────── if (opts.webDir !== undefined) { const webDir = opts.webDir; diff --git 
a/packages/transport-http/src/logic.test.ts b/packages/transport-http/src/logic.test.ts index fc8302e..67632f3 100644 --- a/packages/transport-http/src/logic.test.ts +++ b/packages/transport-http/src/logic.test.ts @@ -182,6 +182,69 @@ describe("parseChatBody", () => { expect(result.reasoningEffort).toBeUndefined(); } }); + + // ── images ────────────────────────────────────────────────────────────── + + it("parses images array with data URLs", () => { + const result = parseChatBody( + { + message: "what is this?", + images: [ + { url: "data:image/png;base64,aaa" }, + { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" }, + ], + }, + fakeId, + ); + expect(isParseError(result)).toBe(false); + if (!isParseError(result)) { + expect(result.images).toHaveLength(2); + expect(result.images?.[0]?.url).toBe("data:image/png;base64,aaa"); + expect(result.images?.[1]?.mimeType).toBe("image/jpeg"); + } + }); + + it("parses images with http URLs", () => { + const result = parseChatBody( + { message: "hi", images: [{ url: "https://example.com/x.png" }] }, + fakeId, + ); + expect(isParseError(result)).toBe(false); + if (!isParseError(result)) { + expect(result.images?.[0]?.url).toBe("https://example.com/x.png"); + } + }); + + it("returns error when images is not an array", () => { + const result = parseChatBody({ message: "hi", images: "not-an-array" }, fakeId); + expect(isParseError(result)).toBe(true); + }); + + it("returns error when an image lacks a url", () => { + const result = parseChatBody({ message: "hi", images: [{ mimeType: "image/png" }] }, fakeId); + expect(isParseError(result)).toBe(true); + }); + + it("returns error when an image url is empty", () => { + const result = parseChatBody({ message: "hi", images: [{ url: "" }] }, fakeId); + expect(isParseError(result)).toBe(true); + }); + + it("omits images when absent (backward compatible)", () => { + const result = parseChatBody({ message: "hi" }, fakeId); + expect(isParseError(result)).toBe(false); + if 
(!isParseError(result)) { + expect(result.images).toBeUndefined(); + } + }); + + it("omits images when the array is empty", () => { + const result = parseChatBody({ message: "hi", images: [] }, fakeId); + expect(isParseError(result)).toBe(false); + if (!isParseError(result)) { + expect(result.images).toBeUndefined(); + } + }); }); describe("parseSinceSeq", () => { diff --git a/packages/transport-http/src/logic.ts b/packages/transport-http/src/logic.ts index d5f2dea..c97f320 100644 --- a/packages/transport-http/src/logic.ts +++ b/packages/transport-http/src/logic.ts @@ -55,6 +55,13 @@ export interface ChatCommand { readonly computerId?: string; readonly reasoningEffort?: ReasoningEffort; readonly workspaceId?: string; + /** + * Images attached to this turn (data URLs or http URLs). Parsed from the + * `ChatRequest.images` field; forwarded to the orchestrator which converts + * them to `image` chunks on the user message. Each entry must have a non-empty + * string `url`; `mimeType` is optional. 
+ */ + readonly images?: readonly { readonly url: string; readonly mimeType?: string }[]; } export interface ParseError { @@ -121,6 +128,33 @@ export function parseChatBody(body: unknown, generateId: () => string): ParseRes (result as { workspaceId?: string }).workspaceId = obj.workspaceId; } + if (obj.images !== undefined) { + if (!Array.isArray(obj.images)) { + return { error: "Field 'images' must be an array" }; + } + const images: { url: string; mimeType?: string }[] = []; + for (const entry of obj.images) { + if (entry === null || typeof entry !== "object") { + return { error: "Each image must be an object with a 'url' string" }; + } + const img = entry as { url?: unknown; mimeType?: unknown }; + if (typeof img.url !== "string" || img.url.length === 0) { + return { error: "Each image must have a non-empty string 'url'" }; + } + const parsed: { url: string; mimeType?: string } = { url: img.url }; + if (img.mimeType !== undefined) { + if (typeof img.mimeType !== "string") { + return { error: "Field 'mimeType' on an image must be a string" }; + } + parsed.mimeType = img.mimeType; + } + images.push(parsed); + } + if (images.length > 0) { + (result as { images?: readonly { url: string; mimeType?: string }[] }).images = images; + } + } + return result; } diff --git a/packages/transport-ws/src/extension.ts b/packages/transport-ws/src/extension.ts index 3811ed7..d26712b 100644 --- a/packages/transport-ws/src/extension.ts +++ b/packages/transport-ws/src/extension.ts @@ -291,6 +291,7 @@ export function createTransportWsExtension(): Extension { : {}), ...(result.workspaceId !== undefined ? { workspaceId: result.workspaceId } : {}), ...(result.computerId !== undefined ? { computerId: result.computerId } : {}), + ...(result.images !== undefined ? 
{ images: result.images } : {}), }); if (!startResult.started) { send(ws, { diff --git a/packages/transport-ws/src/router.ts b/packages/transport-ws/src/router.ts index a33aa5a..0caf305 100644 --- a/packages/transport-ws/src/router.ts +++ b/packages/transport-ws/src/router.ts @@ -58,6 +58,12 @@ export interface ChatRouteResult { * conversation → workspace → local chain). */ readonly computerId?: string; + /** + * Images attached to this turn (data URLs or http URLs), forwarded verbatim to + * the orchestrator. Absent when the client omits it. Each entry must have a + * non-empty string `url`; `mimeType` is optional. + */ + readonly images?: readonly { readonly url: string; readonly mimeType?: string }[]; } /** A malformed chat.send that should yield a chat.error reply. */ @@ -174,6 +180,36 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError errorMessage: `chat.send: invalid reasoningEffort "${msg.reasoningEffort}" — must be one of: low, medium, high, xhigh, max`, }; } + // Validate images (if present): each must be an object with a non-empty url. 
+ let images: readonly { url: string; mimeType?: string }[] | undefined; + if (msg.images !== undefined) { + if (!Array.isArray(msg.images)) { + return { + kind: "chat-error", + conversationId: msg.conversationId, + errorMessage: "chat.send: 'images' must be an array", + }; + } + const parsed: { url: string; mimeType?: string }[] = []; + for (const entry of msg.images) { + if ( + entry === null || + typeof entry !== "object" || + typeof entry.url !== "string" || + entry.url.length === 0 + ) { + return { + kind: "chat-error", + conversationId: msg.conversationId, + errorMessage: "chat.send: each image must have a non-empty string 'url'", + }; + } + const p: { url: string; mimeType?: string } = { url: entry.url }; + if (entry.mimeType !== undefined) p.mimeType = entry.mimeType; + parsed.push(p); + } + if (parsed.length > 0) images = parsed; + } return { kind: "chat", conversationId: msg.conversationId, @@ -183,6 +219,7 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError ...(msg.reasoningEffort !== undefined ? { reasoningEffort: msg.reasoningEffort } : {}), ...(msg.workspaceId !== undefined ? { workspaceId: msg.workspaceId } : {}), ...(msg.computerId !== undefined ? { computerId: msg.computerId } : {}), + ...(images !== undefined ? 
{ images } : {}), }; } diff --git a/packages/vision-handoff/package.json b/packages/vision-handoff/package.json new file mode 100644 index 0000000..b11f7ee --- /dev/null +++ b/packages/vision-handoff/package.json @@ -0,0 +1,14 @@ +{ + "name": "@dispatch/vision-handoff", + "version": "0.0.0", + "type": "module", + "private": true, + "main": "dist/index.js", + "types": "dist/index.d.ts", + "dependencies": { + "@dispatch/conversation-store": "workspace:*", + "@dispatch/credential-store": "workspace:*", + "@dispatch/kernel": "workspace:*", + "@dispatch/openai-stream": "workspace:*" + } +} diff --git a/packages/vision-handoff/src/extension.ts b/packages/vision-handoff/src/extension.ts new file mode 100644 index 0000000..08fddca --- /dev/null +++ b/packages/vision-handoff/src/extension.ts @@ -0,0 +1,198 @@ +/** + * vision-handoff extension — registers the universal vision handoff service + + * the `consult_vision` tool. + * + * The service performs provider-agnostic vision handoff: when a non-vision model + * (e.g. glm-5.2) receives an image, it replaces the image with a numbered + * placeholder and registers it for tool access. The `consult_vision` tool opens + * a NEW conversation tab with a vision-capable model (e.g. Kimi), attaches the + * image + the model's specific question, and returns the conversation ID + the + * vision model's answer. Follow-ups go through the dispatch CLI. + * + * Images are saved to a tmp directory (`/tmp/dispatch/images/<convId>/`) so the + * conversation store (SQLite) only holds a compact URL reference — not + * megabytes of base64. Tmp files are purged on reboot (ephemeral dir), after + * compaction (the transcription replaces the image), and on conversation close. + * + * Effects (filesystem, orchestrator) live here in the shell, injected into the + * service. The pure decisions live in `pure.ts`. No `console.*`; logging via + * `host.logger`. 
+ */ + +import { mkdir, readFile, rm, unlink, writeFile } from "node:fs/promises"; +import { extname, isAbsolute, join, resolve as pathResolve } from "node:path"; +import { conversationStoreHandle } from "@dispatch/conversation-store"; +import type { CredentialStore } from "@dispatch/credential-store"; +import { credentialStoreHandle } from "@dispatch/credential-store"; +import type { Extension, HostAPI, Manifest } from "@dispatch/kernel"; +import { + createVisionHandoffService, + orchestratorLocalHandle, + visionHandoffHandle, +} from "./service.js"; +import { createConsultVisionTool } from "./tool.js"; + +export const manifest: Manifest = { + id: "vision-handoff", + name: "Vision Handoff", + version: "0.0.0", + apiVersion: "^0.1.0", + trust: "bundled", + activation: "eager", + capabilities: { network: true }, + contributes: { services: ["vision-handoff/service"], tools: ["consult_vision"] }, +}; + +const IMAGE_DIR = process.env.DISPATCH_IMAGE_DIR ?? "/tmp/dispatch/images"; + +/** MIME types for recognized image extensions. */ +const MIME_BY_EXT: Readonly<Record<string, string>> = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".webp": "image/webp", + ".gif": "image/gif", + ".bmp": "image/bmp", +}; + +/** Reverse: MIME → extension. */ +const EXT_BY_MIME: Readonly<Record<string, string>> = { + "image/png": ".png", + "image/jpeg": ".jpg", + "image/webp": ".webp", + "image/gif": ".gif", + "image/bmp": ".bmp", +}; + +/** + * Read an image file from disk as a base64 data URL. Resolves relative paths + * against the cwd (the conversation's working directory). Throws on missing + * file / read error (the caller surfaces it). The shell edge — real `node:fs`. + */ +async function readFileAsDataUrl(path: string, cwd?: string): Promise<string> { + const abs = cwd !== undefined && !isAbsolute(path) ? 
pathResolve(cwd, path) : pathResolve(path); + const buf = await readFile(abs); + const ext = extname(abs).toLowerCase(); + const mime = MIME_BY_EXT[ext] ?? "image/png"; + return `data:${mime};base64,${buf.toString("base64")}`; +} + +/** + * Save a data URL image to a tmp file and return a compact HTTP path. + * The compact URL (`/images/<conversationId>/<uuid>.<ext>`) is what gets + * persisted in the conversation store — a tiny string, not megabytes of base64. + */ +async function saveImageToTmp( + conversationId: string, + dataUrl: string, + mimeType?: string, +): Promise<string> { + const mime = mimeType ?? "image/png"; + const ext = EXT_BY_MIME[mime] ?? ".png"; + const imageId = `${crypto.randomUUID()}${ext}`; + const dir = join(IMAGE_DIR, conversationId); + await mkdir(dir, { recursive: true }); + const filePath = join(dir, imageId); + const base64 = dataUrl.split(",")[1] ?? ""; + await writeFile(filePath, Buffer.from(base64, "base64")); + return `/images/${conversationId}/${imageId}`; +} + +/** + * Resolve a compact URL (`/images/<convId>/<imageId>`) back to a data URL by + * reading the tmp file. Data URLs and HTTP URLs pass through unchanged. + */ +async function resolveImageUrl(url: string): Promise<string> { + if (url.startsWith("data:") || url.startsWith("http")) return url; + if (!url.startsWith("/images/")) return url; + const parts = url.split("/"); // ["", "images", convId, imageId] + const convId = parts[2]; + const imageId = parts[3]; + if (convId === undefined || imageId === undefined) return url; + const filePath = join(IMAGE_DIR, convId, imageId); + const buf = await readFile(filePath); + const ext = extname(imageId).toLowerCase(); + const mime = MIME_BY_EXT[ext] ?? "image/png"; + return `data:${mime};base64,${buf.toString("base64")}`; +} + +/** Delete a single tmp image file (after compaction — best-effort). 
*/ +async function deleteTmpImage(compactUrl: string): Promise<void> { + if (!compactUrl.startsWith("/images/")) return; + const parts = compactUrl.split("/"); + const convId = parts[2]; + const imageId = parts[3]; + if (convId === undefined || imageId === undefined) return; + const filePath = join(IMAGE_DIR, convId, imageId); + try { + await unlink(filePath); + } catch { + // Best-effort — file may already be deleted. + } +} + +/** Delete all tmp images for a conversation (on close — best-effort). */ +async function deleteConversationImages(conversationId: string): Promise<void> { + const dir = join(IMAGE_DIR, conversationId); + try { + await rm(dir, { recursive: true, force: true }); + } catch { + // Best-effort. + } +} + +export async function activate(host: HostAPI): Promise<void> { + const credentialStore = host.getService(credentialStoreHandle) as CredentialStore | undefined; + if (credentialStore === undefined) { + host.logger.warn( + "vision-handoff: credential-store service not available. 
The consult_vision tool and image handoff are disabled.", + ); + return; + } + + const resolveModel = (modelName: string) => { + const resolved = credentialStore.resolve(modelName); + if (resolved === undefined) return undefined; + const provider = host.getProviders().get(resolved.providerId); + if (provider === undefined) return undefined; + return { provider, model: resolved.model }; + }; + + const service = createVisionHandoffService({ + credentialStore, + resolveModel, + readFileAsDataUrl, + saveImageToTmp, + resolveImageUrl, + deleteTmpImage, + deleteConversationImages, + resolveOrchestrator: () => { + const loaded = host.getExtensions().some((m) => m.id === "session-orchestrator"); + if (!loaded) return undefined; + try { + return host.getService(orchestratorLocalHandle); + } catch { + return undefined; + } + }, + getImageTranscriptions: async (conversationId: string) => { + const store = host.getService(conversationStoreHandle); + return store.getImageTranscriptions(conversationId); + }, + setImageTranscription: async (conversationId: string, url: string, text: string) => { + const store = host.getService(conversationStoreHandle); + await store.setImageTranscription(conversationId, url, text); + }, + setConversationTitle: async (conversationId: string, title: string) => { + const store = host.getService(conversationStoreHandle); + await store.setConversationTitle(conversationId, title); + }, + logger: host.logger.child({ extensionId: "vision-handoff" }), + }); + + host.provideService(visionHandoffHandle, service); + host.defineTool(createConsultVisionTool(service)); + host.logger.info("vision-handoff: registered (consult_vision tool + handoff service)"); +} + +export const extension: Extension = { manifest, activate }; diff --git a/packages/vision-handoff/src/index.ts b/packages/vision-handoff/src/index.ts new file mode 100644 index 0000000..2713346 --- /dev/null +++ b/packages/vision-handoff/src/index.ts @@ -0,0 +1,21 @@ +export { extension, manifest } from 
"./extension.js"; +export { + collectTextFromStream, + findVisionModelName, + formatConsultResult, + formatImagePlaceholder, + formatNoVisionPlaceholder, + isVisionCapable, +} from "./pure.js"; +export type { + OrchestratorForVision, + ResolvedVisionModel, + VisionHandoffDeps, + VisionHandoffService, +} from "./service.js"; +export { + createVisionHandoffService, + orchestratorLocalHandle, + visionHandoffHandle, +} from "./service.js"; +export { createConsultVisionTool } from "./tool.js"; diff --git a/packages/vision-handoff/src/pure.test.ts b/packages/vision-handoff/src/pure.test.ts new file mode 100644 index 0000000..21b1224 --- /dev/null +++ b/packages/vision-handoff/src/pure.test.ts @@ -0,0 +1,180 @@ +import type { ModelInfo, ProviderEvent } from "@dispatch/kernel"; +import { describe, expect, it } from "vitest"; +import { + collectTextFromStream, + findVisionModelName, + formatConsultationTitle, + formatConsultResult, + formatImagePlaceholder, + formatNoVisionPlaceholder, + isVisionCapable, +} from "./pure.js"; + +describe("isVisionCapable", () => { + it("returns true when ModelInfo.vision is true", () => { + expect(isVisionCapable("umans/umans-kimi-k2.7", { id: "umans-kimi-k2.7", vision: true })).toBe( + true, + ); + }); + + it("returns false when ModelInfo.vision is false (overrides name heuristic)", () => { + expect(isVisionCapable("umans/umans-kimi-k2.7", { id: "umans-kimi-k2.7", vision: false })).toBe( + false, + ); + }); + + it("falls back to name heuristic when vision is absent (umans kimi + qwen)", () => { + expect(isVisionCapable("umans/umans-kimi-k2.7", undefined)).toBe(true); + expect(isVisionCapable("umans/umans-qwen3.6-35b-a3b", undefined)).toBe(true); + }); + + it("falls back to name heuristic when vision is absent (non-vision)", () => { + expect(isVisionCapable("umans/umans-glm-5.2", undefined)).toBe(false); + expect(isVisionCapable("umans/umans-coder", { id: "umans-coder" })).toBe(false); + }); + + it("returns false for undefined model name", 
() => { + expect(isVisionCapable(undefined, undefined)).toBe(false); + }); +}); + +describe("findVisionModelName", () => { + const getInfo = async (name: string): Promise<ModelInfo | undefined> => { + const map: Record<string, ModelInfo> = { + "umans/umans-kimi-k2.7": { id: "umans-kimi-k2.7", vision: true }, + "umans/umans-qwen3.6-35b-a3b": { id: "umans-qwen3.6-35b-a3b", vision: true }, + "umans/umans-glm-5.2": { id: "umans-glm-5.2" }, + "umans/llama-vision": { id: "llama-vision", vision: true }, + }; + return map[name]; + }; + + it("finds the first umans kimi model via name heuristic", async () => { + const name = await findVisionModelName( + ["umans/umans-glm-5.2", "umans/umans-kimi-k2.7", "umans/llama-vision"], + getInfo, + ); + expect(name).toBe("umans/umans-kimi-k2.7"); + }); + + it("finds a vision model via ModelInfo.vision when name heuristic misses", async () => { + const name = await findVisionModelName(["umans/umans-glm-5.2", "umans/llama-vision"], getInfo); + expect(name).toBe("umans/llama-vision"); + }); + + it("skips the excluded model and finds the next vision model", async () => { + const name = await findVisionModelName( + ["umans/umans-kimi-k2.7", "umans/umans-qwen3.6-35b-a3b"], + getInfo, + "umans/umans-kimi-k2.7", + ); + expect(name).toBe("umans/umans-qwen3.6-35b-a3b"); + }); + + it("returns undefined when no vision model is available", async () => { + const name = await findVisionModelName(["umans/umans-glm-5.2"], getInfo); + expect(name).toBeUndefined(); + }); + + it("returns undefined for empty catalog", async () => { + const name = await findVisionModelName([], getInfo); + expect(name).toBeUndefined(); + }); +}); + +describe("collectTextFromStream", () => { + async function* stream(events: ProviderEvent[]): AsyncIterable<ProviderEvent> { + for (const e of events) yield e; + } + + it("collects text-delta events into a single string", async () => { + const events: ProviderEvent[] = [ + { type: "text-delta", delta: "Hello " }, + { type: 
"text-delta", delta: "world!" }, + ]; + const text = await collectTextFromStream(stream(events)); + expect(text).toBe("Hello world!"); + }); + + it("ignores non-text events", async () => { + const events: ProviderEvent[] = [ + { type: "reasoning-delta", delta: "thinking..." }, + { type: "text-delta", delta: "answer" }, + { type: "usage", usage: { inputTokens: 5, outputTokens: 1 } }, + { type: "finish", reason: "stop" }, + ]; + const text = await collectTextFromStream(stream(events)); + expect(text).toBe("answer"); + }); + + it("throws on an error event", async () => { + const events: ProviderEvent[] = [ + { type: "text-delta", delta: "partial" }, + { type: "error", message: "boom" }, + ]; + await expect(collectTextFromStream(stream(events))).rejects.toThrow("boom"); + }); + + it("returns empty string for an empty stream", async () => { + const text = await collectTextFromStream(stream([])); + expect(text).toBe(""); + }); +}); + +describe("formatImagePlaceholder", () => { + it("includes the image ID and mentions consult_vision", () => { + const text = formatImagePlaceholder(1); + expect(text).toContain("Image 1"); + expect(text).toContain("consult_vision"); + expect(text).toContain("imageIds=[1]"); + }); + + it("increments the ID for each image", () => { + expect(formatImagePlaceholder(2)).toContain("Image 2"); + expect(formatImagePlaceholder(2)).toContain("imageIds=[2]"); + }); +}); + +describe("formatNoVisionPlaceholder", () => { + it("explains the limitation", () => { + const text = formatNoVisionPlaceholder(); + expect(text).toContain("no vision-capable model"); + }); +}); + +describe("formatConsultResult", () => { + it("includes the conversation ID, the response, and the dispatch CLI hint", () => { + const result = formatConsultResult("abc-123", "The error is on line 12."); + expect(result).toContain("abc-123"); + expect(result).toContain("The error is on line 12."); + expect(result).toContain("dispatch CLI"); + }); + + it("trims the response", () => { + const 
result = formatConsultResult("c1", " spaced "); + expect(result).toContain("spaced"); + expect(result).not.toContain("spaced "); + }); +}); + +describe("formatConsultationTitle", () => { + it("prefixes the question with 'IMAGE - '", () => { + expect(formatConsultationTitle("What error is shown?")).toBe("IMAGE - What error is shown?"); + }); + + it("truncates long questions to 80 chars with an ellipsis (matching the store's TITLE_MAX)", () => { + const long = "x".repeat(100); + const title = formatConsultationTitle(long); + expect(title).toBe(`IMAGE - ${"x".repeat(80)}…`); + expect(title.length).toBe("IMAGE - ".length + 80 + 1); // prefix + 80 + ellipsis + }); + + it("does not truncate questions at or under 80 chars", () => { + expect(formatConsultationTitle("x".repeat(80))).toBe(`IMAGE - ${"x".repeat(80)}`); + expect(formatConsultationTitle("x".repeat(79))).toBe(`IMAGE - ${"x".repeat(79)}`); + }); + + it("handles an empty question", () => { + expect(formatConsultationTitle("")).toBe("IMAGE - "); + }); +}); diff --git a/packages/vision-handoff/src/pure.ts b/packages/vision-handoff/src/pure.ts new file mode 100644 index 0000000..af3476f --- /dev/null +++ b/packages/vision-handoff/src/pure.ts @@ -0,0 +1,156 @@ +/** + * Pure decision helpers for the vision handoff. + * + * No I/O, no ambient state. The shell (the extension + the service) injects the + * effects (credential store lookups, orchestrator, provider streaming). This + * module owns only the policy: which model is vision-capable, how to format + * image placeholders for non-vision models, and how to format the + * consultation tool's result. + */ + +import type { ModelInfo, ProviderEvent } from "@dispatch/kernel"; +import { isVisionModelId } from "@dispatch/openai-stream"; + +/** + * Whether a model is vision-capable, given its catalog name and (optional) + * resolved `ModelInfo`. 
/**
 * Whether a model is vision-capable, given its catalog name and (optional)
 * resolved `ModelInfo`. When `ModelInfo.vision` is present it is authoritative;
 * otherwise fall back to the hardcoded name heuristic ({@link isVisionModelId}).
 *
 * The `modelName` is the `<credentialName>/<model>` catalog form; the heuristic
 * inspects the model SEGMENT (after the first `/`), so for `umans/kimi-k2.7`
 * the `kimi-k2.7` segment is checked. Pure.
 *
 * @param modelName Catalog name, or `undefined` when no model is known.
 * @param info Resolved model metadata, if available.
 * @returns `true` when the model is considered vision-capable.
 */
export function isVisionCapable(
  modelName: string | undefined,
  info: ModelInfo | undefined,
): boolean {
  // When ModelInfo explicitly reports vision (true OR false), it is authoritative
  // — an explicit false overrides the name heuristic (a provider that KNOWS a
  // model is non-vision wins over the name guess).
  if (info?.vision !== undefined) return info.vision;
  if (modelName === undefined) return false;
  const slash = modelName.indexOf("/");
  const modelId = slash >= 0 ? modelName.slice(slash + 1) : modelName;
  return isVisionModelId(modelId);
}

/**
 * Find the first vision-capable model name in a catalog, given a lookup that
 * resolves a `<credentialName>/<model>` → `ModelInfo`. Returns `undefined` when
 * no vision-capable model is available. Pure given the (async) lookup.
 *
 * Each candidate is judged by {@link isVisionCapable}, so the two helpers can
 * never disagree: an explicit `ModelInfo.vision === false` rejects a model even
 * when its name matches the heuristic, and the heuristic is only used when the
 * lookup does not report a `vision` flag.
 *
 * @param catalog The full list of model names (`<credentialName>/<model>`),
 *                scanned in order.
 * @param getInfo Async lookup of a model name → ModelInfo (from the credential store).
 * @param exclude Optional model name to skip (e.g. the current non-vision model).
 * @returns The first matching catalog name, or `undefined`.
 */
export async function findVisionModelName(
  catalog: readonly string[],
  getInfo: (modelName: string) => Promise<ModelInfo | undefined>,
  exclude?: string,
): Promise<string | undefined> {
  for (const name of catalog) {
    if (name === exclude) continue;
    // The lookup must happen BEFORE the name heuristic is trusted: a provider
    // that explicitly reports `vision: false` has to win over the name guess.
    const info = await getInfo(name);
    if (isVisionCapable(name, info)) return name;
  }
  return undefined;
}

/**
 * Fold a provider's streamed events into a single text string. Pure given the
 * async iterable — collects `text-delta` events and ignores everything else
 * (reasoning, usage, tool-calls).
 *
 * @param stream The provider event stream to drain.
 * @returns The concatenated `delta` text of every `text-delta` event.
 * @throws Error carrying the event's `message` as soon as an `error` event is
 *         seen, so the caller can decide how to degrade.
 */
export async function collectTextFromStream(stream: AsyncIterable<ProviderEvent>): Promise<string> {
  let text = "";
  for await (const event of stream) {
    if (event.type === "text-delta") {
      text += event.delta;
    } else if (event.type === "error") {
      throw new Error(event.message);
    }
  }
  return text;
}

/**
 * Format the placeholder text that replaces an `image` chunk when a non-vision
 * model is active. The placeholder tells the model an image is attached and it
 * should call `consult_vision` to analyze it — the model drives the analysis
 * (asking a specific question) rather than receiving a pre-emptive generic dump.
 *
 * Pure.
 *
 * @param imageId The 1-based ID assigned to this image (used by the tool to
 *                look up the registered image data).
 * @returns The bracketed placeholder sentence.
 */
export function formatImagePlaceholder(imageId: number): string {
  return (
    `[Image ${imageId} attached — you cannot view images. Call the ` +
    `consult_vision tool with imageIds=[${imageId}] and a specific question ` +
    `to analyze it via a vision-capable model.]`
  );
}

/**
 * Placeholder text used when NO vision-capable model is available (the
 * degraded path — the tool cannot function). Pure.
 *
 * @returns The bracketed "no vision model" notice.
 */
export function formatNoVisionPlaceholder(): string {
  return (
    "[Image attached — no vision-capable model is available to analyze it. " +
    "Install or configure a vision-capable model (e.g. kimi) to enable image analysis.]"
  );
}

/**
 * Maximum length of the consultation title body (matching the conversation
 * store's `TITLE_MAX`). The question is truncated to this before the
 * `"IMAGE - "` prefix is applied so the consultation tab's title stays in line
 * with the store's own title-derivation limit.
 */
const CONSULTATION_TITLE_MAX = 80;

/**
 * Format the title for a vision consultation conversation tab. The title is
 * `"IMAGE - "` prefixed to the (truncated) question so the tab is visually
 * distinguishable from normal conversation tabs. Questions longer than
 * `CONSULTATION_TITLE_MAX` UTF-16 code units are cut and suffixed with `…`.
 *
 * Pure.
 *
 * @param question The question the model asked the vision model.
 * @returns The prefixed, possibly truncated title.
 */
export function formatConsultationTitle(question: string): string {
  if (question.length <= CONSULTATION_TITLE_MAX) return `IMAGE - ${question}`;

  let cut = CONSULTATION_TITLE_MAX;
  // `slice` counts UTF-16 code units. If the last kept unit is a high surrogate,
  // its low half would be cut off and the title would end in a lone surrogate
  // (rendered as U+FFFD) — back off by one so the whole character is dropped.
  const lastKept = question.charCodeAt(cut - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;

  return `IMAGE - ${question.slice(0, cut)}…`;
}

/**
 * Format the `consult_vision` tool's result string. Returns the conversation ID
 * (so the model / user can continue the vision consultation), the vision model's
 * response, and a note that follow-up questions use the dispatch CLI.
 *
 * Pure.
 *
 * @param conversationId The new vision consultation conversation ID.
 * @param response       The vision model's answer; surrounding whitespace is trimmed.
 * @returns The multi-paragraph result text handed back to the calling model.
 */
export function formatConsultResult(conversationId: string, response: string): string {
  const trimmed = response.trim();
  return (
    `Vision consultation opened in conversation ${conversationId}.\n\n` +
    `Response: ${trimmed}\n\n` +
    `To ask follow-up questions about this image, use the dispatch CLI ` +
    `(conversation: ${conversationId}).`
  );
}
img.url : ""; + const text = describe(url); + async function* gen(): AsyncIterable<ProviderEvent> { + yield { type: "text-delta", delta: text }; + yield { type: "finish", reason: "stop" }; + } + return gen(); + }, + ), + }; +} + +function makeDeps(overrides: Partial<VisionHandoffDeps> = {}): VisionHandoffDeps { + const visionProvider = makeVisionProvider((url) => `DESCRIPTION of ${url}`); + const catalog = ["umans/umans-kimi-k2.7", "umans/umans-glm-5.2"]; + const infoMap: Record<string, ModelInfo> = { + "umans/umans-kimi-k2.7": { id: "umans-kimi-k2.7", vision: true }, + "umans/umans-glm-5.2": { id: "umans-glm-5.2" }, + }; + return { + credentialStore: { + listCatalog: vi.fn(async () => catalog), + getModelInfo: vi.fn(async (name: string) => infoMap[name]), + resolve: vi.fn((name: string) => { + if (name === "umans/umans-kimi-k2.7") + return { providerId: "umans", model: "umans-kimi-k2.7" }; + if (name === "umans/umans-glm-5.2") return { providerId: "umans", model: "umans-glm-5.2" }; + return undefined; + }), + }, + resolveModel: vi.fn((name: string) => + name === "umans/umans-kimi-k2.7" || name === "umans/umans-glm-5.2" + ? 
{ provider: visionProvider, model: name.split("/")[1] } + : undefined, + ), + readFileAsDataUrl: vi.fn(async (path: string) => `data:image/png;base64,FILE(${path})`), + setConversationTitle: vi.fn(async (_conversationId: string, _title: string) => {}), + ...overrides, + }; +} + +describe("VisionHandoffService.isVisionCapable", () => { + it("returns true for kimi (via ModelInfo)", async () => { + const svc = createVisionHandoffService(makeDeps()); + expect(await svc.isVisionCapable("umans/umans-kimi-k2.7")).toBe(true); + }); + + it("returns false for glm-5.2", async () => { + const svc = createVisionHandoffService(makeDeps()); + expect(await svc.isVisionCapable("umans/umans-glm-5.2")).toBe(false); + }); + + it("returns false for undefined model name", async () => { + const svc = createVisionHandoffService(makeDeps()); + expect(await svc.isVisionCapable(undefined)).toBe(false); + }); +}); + +describe("VisionHandoffService.resolveVisionModel", () => { + it("resolves the kimi model from the catalog", async () => { + const svc = createVisionHandoffService(makeDeps()); + const vision = await svc.resolveVisionModel(); + expect(vision?.modelName).toBe("umans/umans-kimi-k2.7"); + expect(vision?.model).toBe("umans-kimi-k2.7"); + }); + + it("excludes the given model", async () => { + const svc = createVisionHandoffService(makeDeps()); + const vision = await svc.resolveVisionModel("umans/umans-kimi-k2.7"); + expect(vision).toBeUndefined(); + }); +}); + +describe("VisionHandoffService.prepareForProvider", () => { + it("passes messages through unchanged when the model is vision-capable", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "What's this?" 
}, + { type: "image", url: "data:image/png;base64,abc" }, + ], + }, + ]; + const result = await svc.prepareForProvider(messages, "umans/umans-kimi-k2.7"); + expect(result).toBe(messages); // same reference — no copy, no change + }); + + it("passes messages through unchanged when there are no images", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [{ role: "user", chunks: [{ type: "text", text: "hi" }] }]; + const result = await svc.prepareForProvider(messages, "umans/umans-glm-5.2"); + expect(result).toBe(messages); + }); + + it("replaces image chunks with numbered placeholders for a non-vision model", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "Describe this" }, + { type: "image", url: "data:image/png;base64,img1" }, + ], + }, + ]; + const result = await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { + conversationId: "conv-1", + }); + expect(result).toHaveLength(1); + const chunks = result[0]?.chunks; + expect(chunks).toHaveLength(2); + // Text chunk unchanged. + expect(chunks?.[0]).toEqual({ type: "text", text: "Describe this" }); + // Image chunk → placeholder text. 
+ expect(chunks?.[1]?.type).toBe("text"); + const placeholder = (chunks?.[1] as { text: string }).text; + expect(placeholder).toContain("Image 1"); + expect(placeholder).toContain("consult_vision"); + }); + + it("assigns sequential image IDs across multiple messages", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,a" }] }, + { role: "assistant", chunks: [{ type: "text", text: "ok" }] }, + { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,b" }] }, + ]; + const result = await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { + conversationId: "conv-1", + }); + // First image → Image 1, second → Image 2. + expect((result[0]?.chunks[0] as { text: string }).text).toContain("Image 1"); + // Assistant message unchanged. + expect(result[1]?.chunks[0]?.type).toBe("text"); + expect((result[2]?.chunks[0] as { text: string }).text).toContain("Image 2"); + }); + + it("registers images so getRegisteredImage can look them up", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [{ type: "image", url: "data:image/png;base64,registered" }], + }, + ]; + await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-42" }); + const img = svc.getRegisteredImage("conv-42", 1); + expect(img?.url).toBe("data:image/png;base64,registered"); + }); + + it("uses no-vision placeholder when no vision model is available", async () => { + const deps = makeDeps(); + (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,abc" }] }, + ]; + const result = await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { 
+ conversationId: "conv-1", + }); + const text = (result[0]?.chunks[0] as { text: string }).text; + expect(text).toContain("no vision-capable model"); + expect(text).not.toContain("consult_vision"); + }); +}); + +describe("VisionHandoffService.consultVision", () => { + function makeOrchestratorDouble(response: string): { + orchestrator: NonNullable< + VisionHandoffDeps["resolveOrchestrator"] extends () => infer T ? T : never + >; + handleMessage: ReturnType<typeof vi.fn>; + } { + const handleMessage = vi.fn( + async (input: { + conversationId: string; + text: string; + onEvent: (event: AgentEvent) => void; + }): Promise<void> => { + input.onEvent({ + type: "text-delta", + conversationId: input.conversationId, + turnId: "t1", + delta: response, + }); + input.onEvent({ + type: "done", + conversationId: input.conversationId, + turnId: "t1", + reason: "stop", + }); + }, + ); + return { orchestrator: { handleMessage }, handleMessage }; + } + + it("opens a new consultation with a pasted image and returns convId + response", async () => { + const deps = makeDeps(); + const { orchestrator, handleMessage } = makeOrchestratorDouble("The error is on line 12."); + deps.resolveOrchestrator = () => orchestrator; + const svc = createVisionHandoffService(deps); + + // Register an image first (as prepareForProvider would). 
+ const messages: ChatMessage[] = [ + { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,img1" }] }, + ]; + await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-1" }); + + const result = await svc.consultVision("What error is shown?", { + conversationId: "conv-1", + imageIds: [1], + }); + + expect("error" in result).toBe(false); + if (!("error" in result)) { + expect(result.conversationId).toBeTruthy(); + expect(result.response).toContain("line 12"); + expect(result.response).toContain(result.conversationId); + expect(result.response).toContain("dispatch CLI"); + } + // The orchestrator was called with the vision model + the image. + expect(handleMessage).toHaveBeenCalledOnce(); + const call = handleMessage.mock.calls[0]?.[0]; + expect(call.modelName).toBe("umans/umans-kimi-k2.7"); + expect(call.images).toHaveLength(1); + expect(call.images?.[0]?.url).toBe("data:image/png;base64,img1"); + }); + + it("labels the consultation tab with an 'IMAGE - ' prefixed title", async () => { + const deps = makeDeps(); + const { orchestrator } = makeOrchestratorDouble("The error is on line 12."); + deps.resolveOrchestrator = () => orchestrator; + const svc = createVisionHandoffService(deps); + + // Register an image first (as prepareForProvider would). + const messages: ChatMessage[] = [ + { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,img1" }] }, + ]; + await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-1" }); + + const result = await svc.consultVision("What error is shown?", { + conversationId: "conv-1", + imageIds: [1], + }); + + expect("error" in result).toBe(false); + // The title was set with the IMAGE - prefix + the question. 
+ expect(deps.setConversationTitle).toHaveBeenCalledOnce(); + const [titleConvId, title] = (deps.setConversationTitle as ReturnType<typeof vi.fn>).mock + .calls[0]; + expect(titleConvId).toBe((result as { conversationId: string }).conversationId); + expect(title).toBe("IMAGE - What error is shown?"); + }); + + it("does not call setConversationTitle when it is not provided", async () => { + const deps = makeDeps({ setConversationTitle: undefined }); + const { orchestrator } = makeOrchestratorDouble("response"); + deps.resolveOrchestrator = () => orchestrator; + const svc = createVisionHandoffService(deps); + + const messages: ChatMessage[] = [ + { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,img1" }] }, + ]; + await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-1" }); + + // Should NOT throw — setConversationTitle is optional. + const result = await svc.consultVision("What?", { + conversationId: "conv-1", + imageIds: [1], + }); + expect("error" in result).toBe(false); + }); + + it("opens a consultation with a file path image", async () => { + const deps = makeDeps(); + const { orchestrator } = makeOrchestratorDouble("It's a diagram."); + deps.resolveOrchestrator = () => orchestrator; + const svc = createVisionHandoffService(deps); + + const result = await svc.consultVision("What is this diagram?", { + conversationId: "conv-1", + path: "diagram.png", + cwd: "/work", + }); + + expect("error" in result).toBe(false); + expect(deps.readFileAsDataUrl).toHaveBeenCalledWith("diagram.png", "/work"); + }); + + it("returns an error when imageId is not registered", async () => { + const deps = makeDeps(); + const { orchestrator } = makeOrchestratorDouble("response"); + deps.resolveOrchestrator = () => orchestrator; + const svc = createVisionHandoffService(deps); + + const result = await svc.consultVision("What?", { + conversationId: "conv-1", + imageIds: [99], // not registered + }); + expect("error" in 
result).toBe(true); + if ("error" in result) { + expect(result.error).toContain("Image 99"); + } + }); + + it("returns an error when no orchestrator is available", async () => { + const deps = makeDeps(); + // No resolveOrchestrator provided. + const svc = createVisionHandoffService(deps); + const result = await svc.consultVision("What?", { + conversationId: "conv-1", + imageIds: [1], + }); + expect("error" in result).toBe(true); + }); + + it("returns an error when no vision model is available", async () => { + const deps = makeDeps(); + (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]); + const { orchestrator } = makeOrchestratorDouble("response"); + deps.resolveOrchestrator = () => orchestrator; + const svc = createVisionHandoffService(deps); + const result = await svc.consultVision("What?", { + conversationId: "conv-1", + imageIds: [1], + }); + expect("error" in result).toBe(true); + if ("error" in result) { + expect(result.error).toContain("No vision-capable model"); + } + }); + + it("returns an error when no image source is provided", async () => { + const deps = makeDeps(); + const { orchestrator } = makeOrchestratorDouble("response"); + deps.resolveOrchestrator = () => orchestrator; + const svc = createVisionHandoffService(deps); + const result = await svc.consultVision("What?", { + conversationId: "conv-1", + }); + expect("error" in result).toBe(true); + }); +}); diff --git a/packages/vision-handoff/src/service.ts b/packages/vision-handoff/src/service.ts new file mode 100644 index 0000000..01245df --- /dev/null +++ b/packages/vision-handoff/src/service.ts @@ -0,0 +1,684 @@ +/** + * Vision handoff service — the imperative shell that performs the universal, + * provider-agnostic vision handoff. + * + * Two capabilities: + * 1. 
**prepareForProvider** (`prepareForProvider`): when a user message carries + * images but the active model cannot see them, this replaces each image chunk + * with a numbered placeholder (telling the model to call `consult_vision`) + * and registers the image data in a per-conversation registry for tool + * access. Vision-capable models pass through unchanged (images flow natively). + * 2. **consult_vision tool** (`consultVision`): opens a NEW conversation tab with + * a vision-capable model (resolved from the catalog — any provider), attaches + * the image(s) + the model's specific question, waits for the response, and + * returns the conversation ID + the vision model's answer. The model (e.g. + * GLM 5.2) directs the analysis — asking exactly what it needs — instead of + * receiving a pre-emptive generic dump. Follow-up questions go through the + * dispatch CLI (the conversation ID is the bridge), not another tool call. + * + * Effects (credential store, orchestrator, filesystem) are injected. The pure + * decisions live in `pure.ts`. This shell wires them. + */ + +import type { CredentialStore } from "@dispatch/credential-store"; +import type { + AgentEvent, + ChatMessage, + Chunk, + ImageInput, + Logger, + ModelInfo, + ProviderContract, +} from "@dispatch/kernel"; +import { defineService, type ServiceHandle } from "@dispatch/kernel"; +import { + collectTextFromStream, + findVisionModelName, + formatConsultationTitle, + formatConsultResult, + formatImagePlaceholder, + formatNoVisionPlaceholder, + isVisionCapable, +} from "./pure.js"; + +/** + * Minimal orchestrator interface the service needs to start vision consultation + * turns. Defined locally (not imported from session-orchestrator) to avoid a + * compile-time dependency — resolved lazily at runtime via a local handle keyed + * to the same service ID. 
+ */ +export interface OrchestratorForVision { + readonly handleMessage: (input: { + readonly conversationId: string; + readonly text: string; + readonly onEvent: (event: AgentEvent) => void; + readonly modelName?: string; + readonly cwd?: string; + readonly images?: readonly ImageInput[]; + readonly systemPrompt?: string; + }) => Promise<void>; +} + +/** Local handle for the session-orchestrator service (same ID, no import dep). */ +export const orchestratorLocalHandle: ServiceHandle<OrchestratorForVision> = + defineService<OrchestratorForVision>("session-orchestrator/orchestrator"); + +/** + * Resolved vision model — a provider + its model id, ready to stream from. + */ +export interface ResolvedVisionModel { + readonly provider: ProviderContract; + readonly model: string; + readonly modelName: string; +} + +/** A registered image (looked up by the consult_vision tool via imageId). */ +interface RegisteredImage { + readonly url: string; + readonly mimeType?: string; +} + +/** + * Dependencies the service needs — all injected (no ambient state). + */ +export interface VisionHandoffDeps { + readonly credentialStore: CredentialStore; + /** Resolve a `<credentialName>/<model>` → its provider + model id. */ + readonly resolveModel: ( + modelName: string, + ) => { provider: ProviderContract; model: string } | undefined; + /** + * Read a file from disk as a base64 data URL. Injected so the shell controls + * the filesystem edge. Returns the data URL, or throws on error. + */ + readonly readFileAsDataUrl: (path: string, cwd?: string) => Promise<string>; + /** + * Lazily resolve the session-orchestrator (for starting vision consultation + * turns). Returns `undefined` when not available — `consult_vision` degrades + * with an error. Lazy so activation order doesn't matter. + */ + readonly resolveOrchestrator?: () => OrchestratorForVision | undefined; + /** + * Get the per-conversation cached image transcriptions (imageUrl → text). 
+ * Used to avoid re-transcribing old images that were compacted to text on a + * previous turn. Optional — when absent, compaction still works but + * re-transcribes every turn (no caching). + */ + readonly getImageTranscriptions?: ( + conversationId: string, + ) => Promise<ReadonlyMap<string, string>>; + /** + * Upsert a single image transcription into the per-conversation cache. + * Optional — paired with getImageTranscriptions. + */ + readonly setImageTranscription?: ( + conversationId: string, + imageUrl: string, + transcription: string, + ) => Promise<void>; + /** + * Save an image data URL to a tmp file and return a compact URL + * (`/images/<conversationId>/<imageId>.<ext>`) that can be persisted in the + * conversation store instead of the full data URL (which would be megabytes). + * The frontend serves the image via `GET /images/...`; the provider resolves + * it back to a data URL via {@link resolveImageUrl} at runtime. When `undefined`, + * data URLs pass through unchanged (images persist in SQLite — the large-DB + * path, for environments without tmp file support). + */ + readonly saveImageToTmp?: ( + conversationId: string, + dataUrl: string, + mimeType?: string, + ) => Promise<string>; + /** + * Resolve a compact URL (`/images/...`) back to a data URL by reading the tmp + * file. Data URLs and HTTP URLs pass through unchanged. Paired with + * {@link saveImageToTmp}. + */ + readonly resolveImageUrl?: (url: string) => Promise<string>; + /** + * Delete a tmp image file (after it has been compacted to text — the + * transcription is cached, the raw image is no longer needed). Best-effort: + * errors are logged, not thrown. + */ + readonly deleteTmpImage?: (compactUrl: string) => Promise<void>; + /** + * Delete all tmp images for a conversation (on conversation close). + * Best-effort. + */ + readonly deleteConversationImages?: (conversationId: string) => Promise<void>; + /** + * Set the human-readable title of a conversation. 
Used to label vision + * consultation tabs with an `"IMAGE - "` prefix so they're visually + * distinguishable from normal conversation tabs. Backed by the conversation + * store's `setConversationTitle`. Optional — when absent, consultation tabs + * keep their default (question-derived) title. + */ + readonly setConversationTitle?: (conversationId: string, title: string) => Promise<void>; + /** Generate a new conversation ID for a consultation. Defaults to crypto.randomUUID. */ + readonly generateId?: () => string; + readonly logger?: Logger; +} + +export interface VisionHandoffService { + /** + * Whether a given model (by catalog name) is vision-capable. Uses the + * credential store's ModelInfo + the name heuristic. + */ + readonly isVisionCapable: (modelName: string | undefined) => Promise<boolean>; + + /** + * Store images to tmp files and return compact URLs. Each input image's data + * URL is saved to `/tmp/dispatch/images/<conversationId>/<uuid>.<ext>` and + * replaced with a compact HTTP path (`/images/<conversationId>/<uuid>.<ext>`) + * so the persisted conversation store holds a tiny string, not megabytes of + * base64. When `saveImageToTmp` is not configured, data URLs pass through + * unchanged (backward compatible). + */ + readonly storeImages: ( + conversationId: string, + images: readonly ImageInput[], + ) => Promise<readonly ImageInput[]>; + + /** + * Delete all tmp images for a conversation (on close). Best-effort. + */ + readonly purgeConversationImages: (conversationId: string) => Promise<void>; + + /** + * Resolve a vision-capable model from the catalog (any provider). Returns + * `undefined` when none is available. + */ + readonly resolveVisionModel: (excludeName?: string) => Promise<ResolvedVisionModel | undefined>; + + /** + * Transform a message list for the provider: if the active model is + * vision-capable, return messages unchanged (images pass through natively). 
+ * If NOT vision-capable, replace every `image` chunk with a numbered + * placeholder (telling the model to call `consult_vision`) and register the + * image data in the per-conversation registry for tool access. The PERSISTED + * history is NOT modified — only what the provider sees. Never throws. + */ + readonly prepareForProvider: ( + messages: readonly ChatMessage[], + currentModelName: string | undefined, + opts?: { + readonly conversationId?: string; + readonly imageLimit?: number; + readonly signal?: AbortSignal; + readonly logger?: Logger; + }, + ) => Promise<readonly ChatMessage[]>; + + /** + * Look up a registered image by conversation ID + image ID. Returns + * `undefined` when the image isn't registered (e.g. after a server restart). + */ + readonly getRegisteredImage: ( + conversationId: string, + imageId: number, + ) => RegisteredImage | undefined; + + /** + * Open a NEW vision consultation conversation: attach image(s) + the model's + * question to a vision-capable model, wait for the response, and return the + * conversation ID + the vision model's answer. The model drives the analysis + * — it asks exactly what it needs. Follow-ups go through the dispatch CLI. + * + * @returns The conversation ID + the vision model's response text, or an + * error string (never throws — the tool surfaces it). + */ + readonly consultVision: ( + question: string, + opts: { + readonly conversationId: string; + readonly imageIds?: readonly number[]; + readonly path?: string; + readonly cwd?: string; + readonly signal?: AbortSignal; + readonly logger?: Logger; + }, + ) => Promise< + { readonly conversationId: string; readonly response: string } | { readonly error: string } + >; +} + +export const visionHandoffHandle: ServiceHandle<VisionHandoffService> = + defineService<VisionHandoffService>("vision-handoff/service"); + +/** Whether a message list contains any image chunks. Pure. 
*/ +function hasImageChunks(messages: readonly ChatMessage[]): boolean { + return messages.some((m) => m.chunks.some((c) => c.type === "image")); +} + +export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHandoffService { + const log = deps.logger; + const generateId = deps.generateId ?? (() => crypto.randomUUID()); + + // Per-conversation image registry: conversationId → (imageId → image data). + // Populated by prepareForProvider; consulted by the consult_vision tool. + // In-memory only (cleared on restart — the user re-pastes if needed). + const imageRegistry = new Map<string, Map<number, RegisteredImage>>(); + + async function getInfo(modelName: string): Promise<ModelInfo | undefined> { + return deps.credentialStore.getModelInfo(modelName); + } + + async function resolveVisionModel( + excludeName?: string, + ): Promise<ResolvedVisionModel | undefined> { + const catalog = await deps.credentialStore.listCatalog(); + const name = await findVisionModelName(catalog, getInfo, excludeName); + if (name === undefined) return undefined; + const resolved = deps.resolveModel(name); + if (resolved === undefined) return undefined; + return { provider: resolved.provider, model: resolved.model, modelName: name }; + } + + /** + * Compact images for a vision-capable model: when the conversation has more + * image chunks than the limit, the oldest images are transcribed to text + * (one-time, cached in the conversation store) and stripped from the + * provider messages. Recent images (within the limit) stay native. + * + * The persisted history is NOT modified — only the provider's view. + * Transcriptions are cached so they're reused on subsequent turns (no + * re-transcription). When no caching deps are available, it still works but + * re-transcribes every turn. 
+ */ + async function compactImagesForVisionModel( + messages: readonly ChatMessage[], + opts: + | { + readonly conversationId?: string; + readonly imageLimit?: number; + readonly signal?: AbortSignal; + readonly logger?: Logger; + } + | undefined, + currentModelName: string | undefined, + ): Promise<readonly ChatMessage[]> { + void currentModelName; // reserved for future model-specific compaction logic + const limit = opts?.imageLimit; + // No limit or limit <= 0 → pass all images through (compaction disabled). + if (limit === undefined || limit <= 0) return messages; + + // Collect all image chunks in order (oldest first, across all messages). + const imageEntries: { msgIdx: number; chunkIdx: number; url: string }[] = []; + for (const [mi, msg] of messages.entries()) { + for (const [ci, chunk] of msg.chunks.entries()) { + if (chunk.type === "image") { + imageEntries.push({ msgIdx: mi, chunkIdx: ci, url: chunk.url }); + } + } + } + + // If within the limit, pass everything through natively. + if (imageEntries.length <= limit) return messages; + + // The oldest (imageEntries.length - limit) images need transcription. + const toTranscribeCount = imageEntries.length - limit; + const toTranscribe = imageEntries.slice(0, toTranscribeCount); + + // Load cached transcriptions. + const convId = opts?.conversationId; + const cache = + convId !== undefined && deps.getImageTranscriptions !== undefined + ? await deps.getImageTranscriptions(convId) + : new Map<string, string>(); + + // Transcribe any that aren't cached yet (via the vision model). + const transcriptions = new Map<string, string>(cache); + const vision = await resolveVisionModel(); + for (const entry of toTranscribe) { + if (transcriptions.has(entry.url)) continue; + if (vision === undefined) { + // No vision model available for transcription — use a placeholder. 
+ transcriptions.set( + entry.url, + "[Image was compacted — no vision model available to transcribe it.]", + ); + continue; + } + try { + const prompt = + "Describe this image in detail. Include visible text (transcribe verbatim), " + + "key objects, layout, and notable details. This description will replace " + + "the image in a conversation history, so be thorough."; + const userMessage: ChatMessage = { + role: "user", + chunks: [ + { type: "text", text: prompt }, + { type: "image", url: entry.url }, + ], + }; + const stream = vision.provider.stream([userMessage], [], { + model: vision.model, + systemPrompt: "You are a vision assistant. Describe images faithfully and thoroughly.", + }); + const description = (await collectTextFromStream(stream)).trim(); + const text = + description.length > 0 ? description : "[Image transcription produced no output.]"; + transcriptions.set(entry.url, text); + // Cache it in the conversation store (if available). + if (convId !== undefined && deps.setImageTranscription !== undefined) { + await deps.setImageTranscription(convId, entry.url, text); + } + // The image has been transcribed to text — delete the tmp file + // (the transcription is cached, the raw image is no longer needed). + if (deps.deleteTmpImage !== undefined) { + try { + await deps.deleteTmpImage(entry.url); + } catch { + // Best-effort — don't let cleanup failure break the turn. + } + } + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + log?.warn("vision-handoff: image compaction transcription failed", { error: msg }); + transcriptions.set(entry.url, `[Image transcription failed: ${msg}]`); + } + } + + // Build the provider messages: replace transcribed images with text, + // keep recent images (within the limit) native. 
+ const transcribedUrls = new Set(toTranscribe.map((e) => e.url)); + const result: ChatMessage[] = []; + for (const msg of messages) { + if (!msg.chunks.some((c) => c.type === "image")) { + result.push(msg); + continue; + } + const newChunks: Chunk[] = []; + for (const chunk of msg.chunks) { + if (chunk.type === "image" && transcribedUrls.has(chunk.url)) { + const transcription = transcriptions.get(chunk.url); + if (transcription !== undefined) { + newChunks.push({ type: "text", text: `[Compacted image]: ${transcription}` }); + } else { + newChunks.push(chunk); // fallback: keep the image + } + } else { + newChunks.push(chunk); + } + } + result.push({ role: msg.role, chunks: newChunks }); + } + return result; + } + + async function resolveImageUrlsInMessages( + messages: readonly ChatMessage[], + ): Promise<readonly ChatMessage[]> { + if (deps.resolveImageUrl === undefined) return messages; + let hasCompact = false; + for (const msg of messages) { + if (msg.chunks.some((c) => c.type === "image")) { + hasCompact = true; + break; + } + } + if (!hasCompact) return messages; + const result: ChatMessage[] = []; + for (const msg of messages) { + if (!msg.chunks.some((c) => c.type === "image")) { + result.push(msg); + continue; + } + const newChunks: Chunk[] = []; + for (const chunk of msg.chunks) { + if (chunk.type === "image") { + const dataUrl = await deps.resolveImageUrl!(chunk.url); + newChunks.push({ + type: "image", + url: dataUrl, + ...(chunk.mimeType !== undefined ? 
{ mimeType: chunk.mimeType } : {}), + }); + } else { + newChunks.push(chunk); + } + } + result.push({ role: msg.role, chunks: newChunks }); + } + return result; + } + + const service: VisionHandoffService = { + async isVisionCapable(modelName: string | undefined): Promise<boolean> { + if (modelName === undefined) return false; + const info = await getInfo(modelName); + return isVisionCapable(modelName, info); + }, + + async storeImages( + conversationId: string, + images: readonly ImageInput[], + ): Promise<readonly ImageInput[]> { + if (deps.saveImageToTmp === undefined) return images; + const result: ImageInput[] = []; + for (const img of images) { + if (img.url.startsWith("data:")) { + const compactUrl = await deps.saveImageToTmp(conversationId, img.url, img.mimeType); + result.push({ + url: compactUrl, + ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}), + }); + } else { + result.push(img); + } + } + return result; + }, + + async purgeConversationImages(conversationId: string): Promise<void> { + if (deps.deleteConversationImages === undefined) return; + try { + await deps.deleteConversationImages(conversationId); + } catch (err) { + log?.warn("vision-handoff: failed to purge conversation images", { + conversationId, + error: err instanceof Error ? err.message : String(err), + }); + } + }, + + resolveVisionModel, + + async prepareForProvider( + messages: readonly ChatMessage[], + currentModelName: string | undefined, + opts?: { + readonly conversationId?: string; + readonly imageLimit?: number; + readonly signal?: AbortSignal; + readonly logger?: Logger; + }, + ): Promise<readonly ChatMessage[]> { + // Fast path: no images anywhere → nothing to do. + if (!hasImageChunks(messages)) return messages; + + // Resolve compact URLs (/images/...) → data URLs for the provider. + // The persisted chunks store compact URLs (tiny strings); the provider + // needs data URLs (read from tmp files at runtime). 
+ const resolved = await resolveImageUrlsInMessages(messages); + + const isCapable = + currentModelName !== undefined && + (await isVisionCapable(currentModelName, await getInfo(currentModelName))); + + // ── Vision-capable model: image compaction ────────────────────────── + // When the conversation has more images than the limit, the oldest images + // are transcribed to text (one-time, cached) and stripped from the + // provider messages. Recent images (within the limit) stay native. + if (isCapable) { + return compactImagesForVisionModel(resolved, opts, currentModelName); + } + + // ── Non-vision model: placeholders + consult_vision ────────────────── + const vision = await resolveVisionModel(); + const convId = opts?.conversationId; + + const placeholderFn = + vision !== undefined && convId !== undefined + ? (id: number) => formatImagePlaceholder(id) + : () => formatNoVisionPlaceholder(); + + // Replace each image chunk with a numbered placeholder. Assign sequential + // 1-based IDs across all messages and register each image in the + // per-conversation registry so the consult_vision tool can look it up. + let seqId = 0; + const result: ChatMessage[] = []; + for (const msg of resolved) { + if (!msg.chunks.some((c) => c.type === "image")) { + result.push(msg); + continue; + } + const newChunks: Chunk[] = []; + for (const chunk of msg.chunks) { + if (chunk.type === "image") { + seqId++; + if (convId !== undefined && vision !== undefined) { + let convImages = imageRegistry.get(convId); + if (convImages === undefined) { + convImages = new Map(); + imageRegistry.set(convId, convImages); + } + convImages.set(seqId, { + url: chunk.url, + ...(chunk.mimeType !== undefined ? 
{ mimeType: chunk.mimeType } : {}), + }); + } + newChunks.push({ type: "text", text: placeholderFn(seqId) }); + } else { + newChunks.push(chunk); + } + } + result.push({ role: msg.role, chunks: newChunks }); + } + return result; + }, + + getRegisteredImage(conversationId: string, imageId: number): RegisteredImage | undefined { + return imageRegistry.get(conversationId)?.get(imageId); + }, + + async consultVision( + question: string, + opts: { + readonly conversationId: string; + readonly imageIds?: readonly number[]; + readonly path?: string; + readonly cwd?: string; + readonly signal?: AbortSignal; + readonly logger?: Logger; + }, + ): Promise< + { readonly conversationId: string; readonly response: string } | { readonly error: string } + > { + const orchestrator = deps.resolveOrchestrator?.(); + if (orchestrator === undefined) { + return { + error: "The session orchestrator is not available — cannot start a vision consultation.", + }; + } + + const vision = await resolveVisionModel(); + if (vision === undefined) { + return { + error: + "No vision-capable model is available in the catalog. Install or configure one (e.g. kimi) to enable image analysis.", + }; + } + + // Collect image data URLs to attach. + const images: ImageInput[] = []; + if (opts.imageIds !== undefined) { + for (const id of opts.imageIds) { + const img = service.getRegisteredImage(opts.conversationId, id); + if (img === undefined) { + return { + error: `Image ${id} is not registered. It may have been lost after a server restart — ask the user to re-paste the image.`, + }; + } + images.push({ + url: img.url, + ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}), + }); + } + } + if (opts.path !== undefined) { + try { + const dataUrl = await deps.readFileAsDataUrl(opts.path, opts.cwd); + images.push({ url: dataUrl }); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + return { error: `Failed to read image file "${opts.path}": ${msg}` }; + } + } + if (images.length === 0) { + return { + error: + "No image to consult about. Provide imageIds (for pasted images) or path (for a file).", + }; + } + + // Start a NEW conversation with the vision model. + const consultationId = generateId(); + log?.info("vision-handoff: starting consultation", { + consultationId, + visionModel: vision.modelName, + imageCount: images.length, + fromConversation: opts.conversationId, + }); + + // Label the consultation tab with an "IMAGE - " prefix so it's visually + // distinguishable from normal conversation tabs. Set BEFORE the turn + // starts so the tab shows the correct title from the first moment (the + // store keeps a non-"Untitled" title on first message append). + if (deps.setConversationTitle !== undefined) { + try { + await deps.setConversationTitle(consultationId, formatConsultationTitle(question)); + } catch (err) { + // Best-effort — don't let a title-write failure break the consultation. + log?.warn("vision-handoff: failed to set consultation title", { + consultationId, + error: err instanceof Error ? err.message : String(err), + }); + } + } + + let responseText = ""; + let errorMessage = ""; + try { + await orchestrator.handleMessage({ + conversationId: consultationId, + text: question, + images, + modelName: vision.modelName, + ...(opts.cwd !== undefined ? { cwd: opts.cwd } : {}), + systemPrompt: + "You are a vision assistant. A developer who cannot see images is asking you specific questions about an image they attached. Answer their question precisely and thoroughly.", + onEvent: (event: AgentEvent) => { + if (event.type === "text-delta") { + responseText += event.delta; + } else if (event.type === "error") { + errorMessage = event.message; + } + }, + }); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + return { error: `Vision consultation failed: ${msg}` }; + } + + if (errorMessage.length > 0 && responseText.trim().length === 0) { + return { error: `Vision consultation failed: ${errorMessage}` }; + } + + const response = formatConsultResult(consultationId, responseText); + return { conversationId: consultationId, response }; + }, + }; + + return service; +} diff --git a/packages/vision-handoff/src/tool.ts b/packages/vision-handoff/src/tool.ts new file mode 100644 index 0000000..86be2ed --- /dev/null +++ b/packages/vision-handoff/src/tool.ts @@ -0,0 +1,137 @@ +/** + * consult_vision tool — lets any model (vision-capable or not) consult a + * vision-capable model about an image by opening a NEW conversation tab. + * + * The tool attaches image(s) + the model's specific question to a vision-capable + * model (resolved from the catalog — e.g. Kimi), waits for the response, and + * returns the conversation ID + the vision model's answer. The MODEL directs the + * analysis — it asks exactly what it needs to know — instead of receiving a + * pre-emptive generic dump. + * + * For images PASTED into the chat, the model references them by `imageIds` (from + * the "[Image N attached]" placeholders the orchestrator injected). For image + * FILES on disk, the model passes a `path`. + * + * Follow-up questions are NOT handled by this tool — the model uses the dispatch + * CLI to continue the vision conversation (the returned conversation ID is the + * bridge; the model can load the `dispatch-cli` skill for the exact commands). + */ + +import type { ToolContract, ToolExecuteContext, ToolResult } from "@dispatch/kernel"; +import type { VisionHandoffService } from "./service.js"; + +export function createConsultVisionTool(service: VisionHandoffService): ToolContract { + return { + name: "consult_vision", + description: + "Consult a vision-capable model (e.g. Kimi) about an image by opening a new " + + "conversation tab. 
Attaches the image(s) + your specific question, waits for " + + "the vision model's response, and returns the conversation ID + the answer. " + + "Use this when you cannot view an image (e.g. a pasted screenshot or diagram) " + + "and need to know what it shows — ask a SPECIFIC question (e.g. 'What error " + + "message is on line 12?' rather than 'describe this image'). The conversation " + + "ID is returned so follow-up questions can be asked via the dispatch CLI.", + parameters: { + type: "object", + properties: { + question: { + type: "string", + description: + "Your specific question about the image. Be precise — the vision model " + + "will answer exactly this. E.g. 'What error message is displayed?' or " + + "'Compare the layout of these two screenshots.'", + }, + imageIds: { + type: "array", + items: { type: "number" }, + description: + "The IDs of pasted images to attach (from the '[Image N attached]' " + + "placeholders in the conversation). Pass multiple to attach several " + + "images to one consultation (e.g. [1, 2] to compare them).", + }, + path: { + type: "string", + description: + "Path to an image FILE on disk to attach (alternative to imageIds for " + + "code-referenced images). Relative paths resolve against the cwd.", + }, + }, + required: ["question"], + }, + concurrencySafe: true, + async execute(args: unknown, ctx: ToolExecuteContext): Promise<ToolResult> { + const input = args as { + question?: unknown; + imageIds?: unknown; + path?: unknown; + } | null; + + const question = input?.question; + if (typeof question !== "string" || question.trim().length === 0) { + return { + content: "Error: 'question' is required and must be a non-empty string.", + isError: true, + }; + } + + const imageIds = input?.imageIds; + const path = input?.path; + + // Parse imageIds (must be an array of numbers if present). 
+ let parsedImageIds: number[] | undefined; + if (imageIds !== undefined) { + if (!Array.isArray(imageIds)) { + return { content: "Error: 'imageIds' must be an array of numbers.", isError: true }; + } + parsedImageIds = imageIds.filter((n): n is number => typeof n === "number"); + if (parsedImageIds.length === 0) { + return { content: "Error: 'imageIds' must contain at least one number.", isError: true }; + } + } + + // path must be a string if present. + let parsedPath: string | undefined; + if (path !== undefined) { + if (typeof path !== "string" || path.trim().length === 0) { + return { content: "Error: 'path' must be a non-empty string.", isError: true }; + } + parsedPath = path; + } + + // At least one image source is required. + if (parsedImageIds === undefined && parsedPath === undefined) { + return { + content: + "Error: provide 'imageIds' (for pasted images) or 'path' (for a file) " + + "to attach an image to the consultation.", + isError: true, + }; + } + + const span = ctx.log.span("consult_vision.execute", { + imageCount: (parsedImageIds?.length ?? 0) + (parsedPath !== undefined ? 1 : 0), + }); + try { + const result = await service.consultVision(question, { + conversationId: ctx.conversationId ?? "", + ...(parsedImageIds !== undefined ? { imageIds: parsedImageIds } : {}), + ...(parsedPath !== undefined ? { path: parsedPath } : {}), + ...(ctx.cwd !== undefined ? { cwd: ctx.cwd } : {}), + signal: ctx.signal, + logger: ctx.log, + }); + span.end({ attrs: { ok: !("error" in result) } }); + if ("error" in result) { + return { content: result.error, isError: true }; + } + return { content: result.response }; + } catch (err: unknown) { + span.end({ err }); + return { + content: `Error during vision consultation: ${err instanceof Error ? 
err.message : String(err)}`, + isError: true, + }; + } + }, + }; +} diff --git a/packages/vision-handoff/tsconfig.json b/packages/vision-handoff/tsconfig.json new file mode 100644 index 0000000..b5439aa --- /dev/null +++ b/packages/vision-handoff/tsconfig.json @@ -0,0 +1,12 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { "rootDir": "src", "outDir": "dist", "composite": true }, + "include": ["src/**/*.ts"], + "references": [ + { "path": "../kernel" }, + { "path": "../wire" }, + { "path": "../conversation-store" }, + { "path": "../credential-store" }, + { "path": "../openai-stream" } + ] +} diff --git a/packages/wire/src/index.test.ts b/packages/wire/src/index.test.ts index 3f07e00..81d10c1 100644 --- a/packages/wire/src/index.test.ts +++ b/packages/wire/src/index.test.ts @@ -8,7 +8,7 @@ */ import { describe, expect, it } from "vitest"; -import type { Computer, ComputerEntry, Workspace } from "./index.js"; +import type { Chunk, Computer, ComputerEntry, ImageChunk, ImageInput, Workspace } from "./index.js"; describe("@dispatch/wire — Computer / Workspace shapes", () => { it("a Computer literal satisfies the Computer type", () => { @@ -57,3 +57,32 @@ describe("@dispatch/wire — Computer / Workspace shapes", () => { expect(local.defaultComputerId).toBeNull(); }); }); + +describe("@dispatch/wire — ImageChunk / ImageInput shapes", () => { + it("an ImageChunk carries a data URL and optional mimeType", () => { + const c: ImageChunk = { + type: "image", + url: "data:image/png;base64,iVBORw0KGgo=", + mimeType: "image/png", + }; + expect(c.type).toBe("image"); + expect(c.url).toContain("base64"); + expect(c.mimeType).toBe("image/png"); + }); + + it("an ImageChunk with only a url is valid (mimeType optional)", () => { + const c: ImageChunk = { type: "image", url: "https://example.com/cat.png" }; + expect(c.mimeType).toBeUndefined(); + }); + + it("ImageInput mirrors ImageChunk's url semantics", () => { + const input: ImageInput = { url: 
"data:image/jpeg;base64,/9j/4AAQ" }; + expect(input.url).toContain("jpeg"); + }); + + it("ImageChunk is a member of the Chunk union (assignable)", () => { + const chunk: Chunk = { type: "image", url: "data:image/png;base64,x" }; + // Compile-time proof: an ImageChunk satisfies the Chunk union. + expect(chunk.type).toBe("image"); + }); +}); diff --git a/packages/wire/src/index.ts b/packages/wire/src/index.ts index 6d10e0f..113f684 100644 --- a/packages/wire/src/index.ts +++ b/packages/wire/src/index.ts @@ -36,7 +36,8 @@ export type Chunk = | ToolCallChunk | ToolResultChunk | ErrorChunk - | SystemChunk; + | SystemChunk + | ImageChunk; /** A piece of plain text content from the assistant or user. */ export interface TextChunk { @@ -113,6 +114,46 @@ export interface SystemChunk { } /** + * An image attached to a message (e.g. a user-pasted screenshot or pasted + * photo). Carries a `url` that is EITHER a base64 data URL + * (`data:image/png;base64,…`) OR an `http(s)://` URL. Vision-capable models + * receive it natively (the provider serializes it to its image-content + * format); non-vision models never see it directly — the orchestrator's + * **vision handoff** transcribes it to a text description (via a + * vision-capable model) and feeds that text instead, so a text-only model can + * still reason about the image's contents. + * + * When a transcription was performed, it is persisted as a separate `text` + * chunk alongside the `image` chunk in the SAME user message, so the + * description is reused on every later turn (no re-transcription) and a + * client renders both the original image and its textual analysis. + */ +export interface ImageChunk { + readonly type: "image"; + /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */ + readonly url: string; + /** + * Optional MIME type of the image (e.g. `"image/png"`). 
Inferred from the + * data URL when absent; present so a client can render an icon/label without + * parsing the URL. Optional — callers that only have a URL omit it. + */ + readonly mimeType?: string; +} + +/** + * An image a client attaches to a chat message (`ChatRequest.images`). The + * transport-facing input shape; the orchestrator converts each `ImageInput` + * into an `ImageChunk` on the persisted user message. Carries the same `url` + * semantics as `ImageChunk.url`. + */ +export interface ImageInput { + /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */ + readonly url: string; + /** Optional MIME type (e.g. `"image/png"`). Optional — inferred from the data URL when absent. */ + readonly mimeType?: string; +} + +/** * A chat message: a role plus an ordered sequence of chunks. Messages are the * unit passed to and from the provider; chunks are the unit persisted and * rendered. diff --git a/tsconfig.json b/tsconfig.json index d31b44a..f97edde 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -41,6 +41,9 @@ "path": "./packages/credential-store" }, { + "path": "./packages/vision-handoff" + }, + { "path": "./packages/exec-backend" }, { |
