diff options
| author | Adam Malczewski <[email protected]> | 2026-06-27 03:40:38 +0900 |
|---|---|---|
| committer | Adam Malczewski <[email protected]> | 2026-06-27 03:40:38 +0900 |
| commit | d5633cf6e007eaf8255a44529a638d2466a74ba3 (patch) | |
| tree | 14fe72f5b585eb72c763073b4e7022b914bdbafb | |
| parent | ad9d135e583c99a0d93327115defa43187cde1c3 (diff) | |
| download | dispatch-d5633cf6e007eaf8255a44529a638d2466a74ba3.tar.gz dispatch-d5633cf6e007eaf8255a44529a638d2466a74ba3.zip | |
feat(vision-handoff): implement vision for capable models and universal vision handoff
35 files changed, 1727 insertions, 19 deletions
@@ -103,6 +103,7 @@ "@dispatch/tool-youtube-transcript": "workspace:*", "@dispatch/transport-http": "workspace:*", "@dispatch/transport-ws": "workspace:*", + "@dispatch/vision-handoff": "workspace:*", }, }, "packages/journal-sink": { @@ -361,6 +362,15 @@ "name": "@dispatch/ui-contract", "version": "0.2.0", }, + "packages/vision-handoff": { + "name": "@dispatch/vision-handoff", + "version": "0.0.0", + "dependencies": { + "@dispatch/credential-store": "workspace:*", + "@dispatch/kernel": "workspace:*", + "@dispatch/openai-stream": "workspace:*", + }, + }, "packages/wire": { "name": "@dispatch/wire", "version": "0.12.0", @@ -461,6 +471,8 @@ "@dispatch/ui-contract": ["@dispatch/ui-contract@workspace:packages/ui-contract"], + "@dispatch/vision-handoff": ["@dispatch/vision-handoff@workspace:packages/vision-handoff"], + "@dispatch/wire": ["@dispatch/wire@workspace:packages/wire"], "@esbuild/aix-ppc64": ["@esbuild/[email protected]", "", { "os": "aix", "cpu": "ppc64" }, "sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg=="], diff --git a/packages/host-bin/package.json b/packages/host-bin/package.json index 65ea305..b5ab954 100644 --- a/packages/host-bin/package.json +++ b/packages/host-bin/package.json @@ -33,6 +33,7 @@ "@dispatch/surface-loaded-extensions": "workspace:*", "@dispatch/surface-registry": "workspace:*", "@dispatch/transport-ws": "workspace:*", - "@dispatch/system-prompt": "workspace:*" + "@dispatch/system-prompt": "workspace:*", + "@dispatch/vision-handoff": "workspace:*" } } diff --git a/packages/host-bin/src/main.ts b/packages/host-bin/src/main.ts index 8633052..a5dabab 100644 --- a/packages/host-bin/src/main.ts +++ b/packages/host-bin/src/main.ts @@ -43,6 +43,7 @@ import { extension as toolWriteFileExt } from "@dispatch/tool-write-file"; import { extension as toolYoutubeTranscriptExt } from "@dispatch/tool-youtube-transcript"; import { createTransportHttpExtension } from "@dispatch/transport-http"; import { 
createTransportWsExtension } from "@dispatch/transport-ws"; +import { extension as visionHandoffExt } from "@dispatch/vision-handoff"; import type { ChildHandle } from "./collector-supervisor.js"; import { createCollectorSupervisor } from "./collector-supervisor.js"; import { configMapToAccess, envToConfigMap } from "./config.js"; @@ -204,6 +205,13 @@ async function boot(): Promise<void> { const extensions: Extension[] = [ ...CORE_EXTENSIONS, createCredentialStoreExtension({ credentials }), + // vision-handoff activates AFTER credential-store (it resolves the + // credential-store service at activate time to find vision-capable models). + // Placed here, not in CORE_EXTENSIONS, so the service is available when it + // activates. The session-orchestrator resolves its service LAZILY + // (per-turn), so activation order between it and session-orchestrator + // doesn't matter. + visionHandoffExt, ...externalExtensions, ]; diff --git a/packages/host-bin/tsconfig.json b/packages/host-bin/tsconfig.json index 2b1edf5..305274c 100644 --- a/packages/host-bin/tsconfig.json +++ b/packages/host-bin/tsconfig.json @@ -60,6 +60,9 @@ }, { "path": "../transport-ws" + }, + { + "path": "../vision-handoff" } ] } diff --git a/packages/kernel/src/contracts/conversation.ts b/packages/kernel/src/contracts/conversation.ts index f074c52..80da86e 100644 --- a/packages/kernel/src/contracts/conversation.ts +++ b/packages/kernel/src/contracts/conversation.ts @@ -12,6 +12,8 @@ export type { ConversationMeta, ConversationStatus, ErrorChunk, + ImageChunk, + ImageInput, Role, StepId, StepMetrics, diff --git a/packages/kernel/src/contracts/index.ts b/packages/kernel/src/contracts/index.ts index 09e0a56..28e0a0b 100644 --- a/packages/kernel/src/contracts/index.ts +++ b/packages/kernel/src/contracts/index.ts @@ -19,6 +19,8 @@ export type { ConversationMeta, ConversationStatus, ErrorChunk, + ImageChunk, + ImageInput, Role, StepId, StepMetrics, diff --git a/packages/kernel/src/contracts/provider.ts 
b/packages/kernel/src/contracts/provider.ts index b6dc8ca..3137073 100644 --- a/packages/kernel/src/contracts/provider.ts +++ b/packages/kernel/src/contracts/provider.ts @@ -114,6 +114,16 @@ export interface ModelInfo { readonly displayName?: string; /** The model's max context window in tokens (e.g. 200000). Optional — providers that don't report it leave it undefined. */ readonly contextWindow?: number; + /** + * Whether this model can natively accept image input (vision/multimodal). + * When `true`, image chunks in a user message are passed through to the + * provider serialized to its image-content format. When `false`/absent, the + * orchestrator's vision handoff transcribes images to text (via a + * vision-capable model) before the model sees them. Optional — providers + * that cannot detect it leave it undefined (treated as non-vision); a + * provider that knows a model is vision-capable sets it `true`. + */ + readonly vision?: boolean; } /** diff --git a/packages/openai-stream/src/convert-messages.test.ts b/packages/openai-stream/src/convert-messages.test.ts index 3520eb5..57c7d81 100644 --- a/packages/openai-stream/src/convert-messages.test.ts +++ b/packages/openai-stream/src/convert-messages.test.ts @@ -35,6 +35,100 @@ describe("convertMessages", () => { expect(result).toEqual([{ role: "user", content: "Hello, world!" }]); }); + it("converts a user message with a text + image chunk to a multimodal content array", () => { + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "What is in this image?" }, + { type: "image", url: "data:image/png;base64,iVBORw0KGgo=" }, + ], + }, + ]; + + const result = convertMessages(messages); + expect(result).toEqual([ + { + role: "user", + content: [ + { type: "text", text: "What is in this image?" 
}, + { type: "image_url", image_url: { url: "data:image/png;base64,iVBORw0KGgo=" } }, + ], + }, + ]); + }); + + it("converts an image-only user message (no text) to a content array with just the image", () => { + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [{ type: "image", url: "https://example.com/cat.png" }], + }, + ]; + + const result = convertMessages(messages); + expect(result).toEqual([ + { + role: "user", + content: [{ type: "image_url", image_url: { url: "https://example.com/cat.png" } }], + }, + ]); + }); + + it("converts a user message with multiple images interspersed with text", () => { + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "Compare these:" }, + { type: "image", url: "data:image/png;base64,aaa" }, + { type: "text", text: "and" }, + { type: "image", url: "data:image/jpeg;base64,bbb" }, + ], + }, + ]; + + const result = convertMessages(messages); + expect(result).toHaveLength(1); + const content = result[0]?.content; + expect(Array.isArray(content)).toBe(true); + if (Array.isArray(content)) { + expect(content).toHaveLength(4); + expect(content[0]).toEqual({ type: "text", text: "Compare these:" }); + expect(content[1]).toEqual({ + type: "image_url", + image_url: { url: "data:image/png;base64,aaa" }, + }); + expect(content[2]).toEqual({ type: "text", text: "and" }); + expect(content[3]).toEqual({ + type: "image_url", + image_url: { url: "data:image/jpeg;base64,bbb" }, + }); + } + }); + + it("skips empty text parts in a multimodal message but keeps images", () => { + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "" }, + { type: "image", url: "data:image/png;base64,x" }, + ], + }, + ]; + + const result = convertMessages(messages); + const content = result[0]?.content; + expect(Array.isArray(content)).toBe(true); + if (Array.isArray(content)) { + // Empty text part is dropped; only the image remains. 
+ expect(content).toEqual([ + { type: "image_url", image_url: { url: "data:image/png;base64,x" } }, + ]); + } + }); + it("converts an assistant message with text only", () => { const messages: ChatMessage[] = [ { diff --git a/packages/openai-stream/src/convert-messages.ts b/packages/openai-stream/src/convert-messages.ts index e830243..eba3575 100644 --- a/packages/openai-stream/src/convert-messages.ts +++ b/packages/openai-stream/src/convert-messages.ts @@ -1,8 +1,28 @@ import type { ChatMessage, Chunk } from "@dispatch/kernel"; +/** A text part within a multimodal OpenAI content array. */ +export interface OpenAITextPart { + readonly type: "text"; + readonly text: string; +} + +/** An image part within a multimodal OpenAI content array (OpenAI vision format). */ +export interface OpenAIImagePart { + readonly type: "image_url"; + readonly image_url: { readonly url: string }; +} + +/** + * A part of a multimodal message content array. When a message has mixed text + * and image chunks, the content is serialized as an array of these parts + * (OpenAI's vision format). Plain-text messages keep a string `content` for + * byte-stability with providers that only accept strings. + */ +export type OpenAIContentPart = OpenAITextPart | OpenAIImagePart; + export interface OpenAIMessage { readonly role: "system" | "user" | "assistant" | "tool"; - readonly content: string | null; + readonly content: string | null | readonly OpenAIContentPart[]; readonly tool_calls?: readonly OpenAIToolCall[]; readonly tool_call_id?: string; } @@ -49,6 +69,29 @@ function convertSystemMessage(msg: ChatMessage): OpenAIMessage { } function convertUserMessage(msg: ChatMessage): OpenAIMessage { + // If the message has image chunks, serialize as a multimodal content array + // (OpenAI vision format): text parts + image_url parts in chunk order. 
+ // Plain text-only messages keep a string `content` for byte-stability with + // providers that only accept a string (and to keep prompt-cache prefixes + // unchanged for the common no-image case). + const hasImage = msg.chunks.some((c) => c.type === "image"); + if (hasImage) { + const parts: OpenAIContentPart[] = []; + for (const chunk of msg.chunks) { + if (chunk.type === "text") { + if (chunk.text.length > 0) { + parts.push({ type: "text", text: chunk.text }); + } + } else if (chunk.type === "image") { + parts.push({ type: "image_url", image_url: { url: chunk.url } }); + } + // Non-text/non-image chunks (tool-call, thinking, etc.) are not part of a + // user message's provider content and are skipped here. + } + // An image-only message (no text) still needs at least the image part. + return { role: "user", content: parts.length > 0 ? parts : "" }; + } + const text = msg.chunks .filter((c): c is Extract<Chunk, { type: "text" }> => c.type === "text") .map((c) => c.text) diff --git a/packages/openai-stream/src/index.ts b/packages/openai-stream/src/index.ts index bd2f673..3f76b99 100644 --- a/packages/openai-stream/src/index.ts +++ b/packages/openai-stream/src/index.ts @@ -1,8 +1,14 @@ -export type { OpenAIMessage, OpenAIToolCall } from "./convert-messages.js"; +export type { + OpenAIContentPart, + OpenAIImagePart, + OpenAIMessage, + OpenAITextPart, + OpenAIToolCall, +} from "./convert-messages.js"; export { convertMessages } from "./convert-messages.js"; export type { OpenAITool } from "./convert-tools.js"; export { convertTools } from "./convert-tools.js"; -export { parseModelList } from "./listModels.js"; +export { isVisionModelId, parseModelList } from "./listModels.js"; export { parseSSELines } from "./parse-sse.js"; export type { CreateOpenAICompatProviderOpts } from "./provider.js"; export { createOpenAICompatProvider } from "./provider.js"; diff --git a/packages/openai-stream/src/listModels.test.ts b/packages/openai-stream/src/listModels.test.ts index 
c2438bc..3acf46e 100644 --- a/packages/openai-stream/src/listModels.test.ts +++ b/packages/openai-stream/src/listModels.test.ts @@ -1,7 +1,7 @@ import type { ApiKeyCredentials, ModelInfo, ProviderContract } from "@dispatch/kernel"; import type { FetchLike } from "@dispatch/trace-replay"; import { describe, expect, it, vi } from "vitest"; -import { parseModelList } from "./listModels.js"; +import { isVisionModelId, parseModelList } from "./listModels.js"; import { createOpenAICompatProvider } from "./provider.js"; function makeProvider(fetchFn: FetchLike, apiKey = "sk-test-1234567890abcdef"): ProviderContract { @@ -35,6 +35,48 @@ describe("listModels — pure mapping (parseModelList)", () => { const result = parseModelList([]); expect(result).toEqual([]); }); + + it("extracts contextWindow from common field names", () => { + const result = parseModelList([ + { id: "m1", context_length: 128000 }, + { id: "m2", context_window: 200000 }, + { id: "m3", max_context_length: 64000 }, + { id: "m4", max_tokens: 8000 }, + ]); + expect(result).toEqual([ + { id: "m1", contextWindow: 128000 }, + { id: "m2", contextWindow: 200000 }, + { id: "m3", contextWindow: 64000 }, + { id: "m4", contextWindow: 8000 }, + ]); + }); +}); + +describe("listModels — vision capability detection", () => { + it("isVisionModelId returns true for kimi-family model ids", () => { + expect(isVisionModelId("kimi-k2.7")).toBe(true); + expect(isVisionModelId("Kimi-K2.7")).toBe(true); // case-insensitive + expect(isVisionModelId("moonshot/kimi-k2-thinking")).toBe(true); + }); + + it("isVisionModelId returns false for non-kimi model ids", () => { + expect(isVisionModelId("glm-5.2")).toBe(false); + expect(isVisionModelId("deepseek-v4-flash")).toBe(false); + expect(isVisionModelId("umans-coder")).toBe(false); + }); + + it("parseModelList sets vision: true on kimi-family models", () => { + const result = parseModelList([ + { id: "kimi-k2.7", context_length: 200000 }, + { id: "glm-5.2", context_length: 128000 }, + { 
id: "deepseek-v4-flash" }, + ]); + expect(result).toEqual([ + { id: "kimi-k2.7", contextWindow: 200000, vision: true }, + { id: "glm-5.2", contextWindow: 128000 }, + { id: "deepseek-v4-flash" }, + ]); + }); }); describe("listModels — provider contract", () => { diff --git a/packages/openai-stream/src/listModels.ts b/packages/openai-stream/src/listModels.ts index 0e94c43..273fee3 100644 --- a/packages/openai-stream/src/listModels.ts +++ b/packages/openai-stream/src/listModels.ts @@ -24,17 +24,39 @@ interface OpenAIModelListResponse { } /** + * Whether a model id is vision-capable (can natively accept image input). + * + * The OpenAI-compatible `/models` endpoint does not reliably report image + * capabilities, so this is a hardcoded heuristic by model id: a model whose id + * contains "kimi" (e.g. `kimi-k2.7`, `moonshot/kimi-k2.7`) is vision-capable; + * all others are treated as non-vision. This is the single source of truth — + * the orchestrator's vision handoff and the `read_image` tool both consult the + * `ModelInfo.vision` flag this sets, so adding a model here enables vision + * everywhere. Pure: id → boolean, no I/O. + * + * (When an endpoint gains reliable vision reporting, this can be replaced with + * a real capability check without changing callers.) + */ +export function isVisionModelId(id: string): boolean { + const lower = id.toLowerCase(); + return lower.includes("kimi"); +} + +/** * Pure mapping: raw OpenAI-compatible model list → ModelInfo[]. - * Extracts `contextWindow` from common field names (providers vary). - * Extracted for direct unit testing with no I/O. + * Extracts `contextWindow` from common field names (providers vary) and + * detects vision capability via {@link isVisionModelId}. Extracted for direct + * unit testing with no I/O. */ export function parseModelList(data: readonly OpenAIModelEntry[]): readonly ModelInfo[] { return data.map((entry) => { const contextWindow = entry.context_length ?? entry.context_window ?? 
entry.max_context_length ?? entry.max_tokens; + const vision = isVisionModelId(entry.id); return { id: entry.id, ...(contextWindow !== undefined ? { contextWindow } : {}), + ...(vision ? { vision } : {}), }; }); } diff --git a/packages/session-orchestrator/src/extension.ts b/packages/session-orchestrator/src/extension.ts index 5afffd8..d080e90 100644 --- a/packages/session-orchestrator/src/extension.ts +++ b/packages/session-orchestrator/src/extension.ts @@ -11,6 +11,7 @@ import { createSessionOrchestrator, createWarmService, sessionOrchestratorHandle, + visionHandoffLocalHandle, } from "./orchestrator.js"; import { selectFirstProvider } from "./pure.js"; import { filterRemoteIncompatibleTools, toolsFilter } from "./tools-filter.js"; @@ -93,6 +94,20 @@ export function activate(host: HostAPI): void { return undefined; } }, + resolveVisionHandoff: () => { + // Lazily resolve the vision-handoff service. Returns undefined when the + // vision-handoff extension isn't loaded (images pass through unchanged — + // correct for vision-capable models; the feature degrades off cleanly for + // text-only turns). Lazy so activation order doesn't matter; the + // activated-manifests guard avoids a getService throw when absent. 
+ const loaded = host.getExtensions().some((m) => m.id === "vision-handoff"); + if (!loaded) return undefined; + try { + return host.getService(visionHandoffLocalHandle); + } catch { + return undefined; + } + }, }); host.provideService(sessionOrchestratorHandle, orchestrator); diff --git a/packages/session-orchestrator/src/orchestrator.ts b/packages/session-orchestrator/src/orchestrator.ts index 96cd3a3..ac1eaf4 100644 --- a/packages/session-orchestrator/src/orchestrator.ts +++ b/packages/session-orchestrator/src/orchestrator.ts @@ -5,6 +5,7 @@ import type { CompactionResult, ConversationStatus, EventHookDescriptor, + ImageInput, Logger, ModelInfo, ProviderContract, @@ -32,11 +33,52 @@ import { } from "./pure.js"; import type { ToolAssembly } from "./tools-filter.js"; +// --- Vision handoff (lazy, optional) --- + +/** + * Minimal contract the vision-handoff service satisfies. Defined here (not + * imported from the vision-handoff package) so the orchestrator has NO + * compile-time dependency on it — the service is resolved lazily at runtime + * (like the message-queue / system-prompt services), and the feature degrades + * off cleanly when the extension isn't loaded (images pass through unchanged, + * which is correct for vision-capable models and a no-op for text-only turns). + * + * `transcribeForProvider` transforms a message list for the provider: if the + * active model is vision-capable, messages pass through unchanged; otherwise + * image chunks are replaced with text descriptions (transcribed via a + * vision-capable model). Never throws — degrades to placeholders. 
+ */ +export interface VisionHandoffService { + readonly transcribeForProvider: ( + messages: readonly ChatMessage[], + currentModelName: string | undefined, + opts?: { readonly signal?: AbortSignal; readonly logger?: Logger }, + ) => Promise<readonly ChatMessage[]>; +} + +/** + * Local handle for the vision-handoff service, keyed by the same ID the + * vision-handoff extension registers under (`"vision-handoff/service"`). Defined + * locally (not imported) so the orchestrator has no compile-time dependency on + * the vision-handoff package — the service is resolved lazily at runtime, and + * the feature degrades off cleanly when the extension isn't loaded. + */ +export const visionHandoffLocalHandle: ServiceHandle<VisionHandoffService> = + defineService<VisionHandoffService>("vision-handoff/service"); + // --- Broadcast hub types --- export interface StartTurnInput { readonly conversationId: string; readonly text: string; + /** + * Images attached to this turn (e.g. user-pasted screenshots). Each is + * appended as an `image` chunk on the persisted user message. For a + * vision-capable model the images pass through to the provider natively; for + * a non-vision model the vision handoff transcribes them to text first. + * Optional — omit for a text-only turn. + */ + readonly images?: readonly ImageInput[]; readonly modelName?: string; readonly cwd?: string; /** @@ -75,6 +117,12 @@ export type StartTurnResult = export interface EnqueueInput { readonly conversationId: string; readonly text: string; + /** + * Images attached (the steering / opening message analog of + * `StartTurnInput.images`). Threaded to `startTurn` when the conversation is + * idle (the message starts a turn). Additive optional. + */ + readonly images?: readonly ImageInput[]; /** Workspace to stamp on a new conversation. Defaults to `"default"`. 
*/ readonly workspaceId?: string; /** @@ -289,6 +337,8 @@ export interface SessionOrchestrator { workspaceId?: string; /** Explicit system-prompt override — see {@link StartTurnInput.systemPrompt}. */ systemPrompt?: string; + /** Images attached to this turn — see {@link StartTurnInput.images}. */ + images?: readonly ImageInput[]; }): Promise<void>; } @@ -335,6 +385,17 @@ export interface SessionOrchestratorDeps { * order doesn't matter. */ readonly resolveSystemPrompt?: () => SystemPromptService | undefined; + /** + * Lazily resolves the vision-handoff service, or `undefined` when the + * vision-handoff extension isn't loaded. Used to transcribe image chunks to + * text for non-vision models before they reach the provider (so a text-only + * model can still reason about pasted/code images). When `undefined`, images + * pass through unchanged (correct for vision-capable models; a text-only model + * would then receive image content its API may reject — the feature degrades + * off cleanly for text-only turns since there are no images). Lazy so + * activation order doesn't matter; called per-turn. + */ + readonly resolveVisionHandoff?: () => VisionHandoffService | undefined; /** Apply the per-turn tools filter chain. Injected for testability. */ readonly applyToolsFilter: (assembly: ToolAssembly) => Promise<ToolAssembly>; /** Base logger (auto-scoped to this extension); childed per turn for span capture. 
*/ @@ -437,6 +498,7 @@ export function createSessionOrchestrator( reasoningEffortOverride: ReasoningEffort | undefined, workspaceId: string, systemPromptOverride: string | undefined, + images: readonly ImageInput[] | undefined, ): void { const turnId = generateTurnId(); const controller = new AbortController(); @@ -558,7 +620,7 @@ export function createSessionOrchestrator( const effectiveModelName = resolveModelName(modelName, storedModel); const history = await deps.conversationStore.load(conversationId); - const userMsg = buildUserMessage(text); + const userMsg = buildUserMessage(text, images); // Workspace assignment for new conversations happens BEFORE // effective-cwd resolution (see workspaceSetupPromise above) so @@ -697,9 +759,32 @@ export function createSessionOrchestrator( return [{ role: "user", chunks: [{ type: "text", text: steerText }] }]; }; + // Vision handoff: transform the message list for the provider. When the + // active model is vision-capable, images pass through natively (no-op). + // When it is NOT vision-capable, image chunks are transcribed to text + // descriptions via a vision-capable model — so a text-only model can + // still reason about images. The PERSISTED user message keeps the + // original image chunks (appended below); only the provider's view is + // transcribed. When the vision-handoff service isn't loaded, images pass + // through unchanged (correct for vision models; text-only models would + // then receive image content their API may reject — degrades off cleanly + // for text-only turns with no images). + const visionHandoff = deps.resolveVisionHandoff?.(); + let providerMessages: readonly ChatMessage[] = [...history, userMsg]; + if (visionHandoff !== undefined) { + providerMessages = await visionHandoff.transcribeForProvider( + providerMessages, + effectiveModelName, + { + signal: controller.signal, + ...(turnLogger !== undefined ? 
{ logger: turnLogger } : {}), + }, + ); + } + const opts: RunTurnInput = { provider, - messages: [...history, userMsg], + messages: providerMessages, tools: assembled.tools, dispatch, emit: emitAndAccumulate, @@ -805,6 +890,7 @@ export function createSessionOrchestrator( reasoningEffort, workspaceId, systemPrompt, + images, }) { if (activeTurns.has(conversationId)) { return { started: false, reason: "already-active" }; @@ -818,18 +904,20 @@ export function createSessionOrchestrator( reasoningEffort, workspaceId ?? "default", systemPrompt, + images, ); const turn = activeTurns.get(conversationId); const turnId = turn !== undefined ? turn.turnId : ""; return { started: true, turnId }; }, - enqueue({ conversationId, text, workspaceId, computerId }) { + enqueue({ conversationId, text, workspaceId, computerId, images }) { const result = orchestrator.startTurn({ conversationId, text, ...(workspaceId !== undefined ? { workspaceId } : {}), ...(computerId !== undefined ? { computerId } : {}), + ...(images !== undefined ? { images } : {}), }); if (result.started) { return { startedTurn: true, queue: [] }; @@ -914,6 +1002,7 @@ export function createSessionOrchestrator( reasoningEffort, workspaceId, systemPrompt, + images, }) { const turnInput: StartTurnInput = { conversationId, @@ -924,6 +1013,7 @@ export function createSessionOrchestrator( ...(reasoningEffort !== undefined ? { reasoningEffort } : {}), ...(workspaceId !== undefined ? { workspaceId } : {}), ...(systemPrompt !== undefined ? { systemPrompt } : {}), + ...(images !== undefined ? 
{ images } : {}), }; const result = orchestrator.startTurn(turnInput); if (!result.started) { diff --git a/packages/session-orchestrator/src/pure.test.ts b/packages/session-orchestrator/src/pure.test.ts index c75cb82..7a574f1 100644 --- a/packages/session-orchestrator/src/pure.test.ts +++ b/packages/session-orchestrator/src/pure.test.ts @@ -26,6 +26,39 @@ describe("buildUserMessage", () => { expect(msg.role).toBe("user"); expect(msg.chunks[0]).toEqual({ type: "text", text: "" }); }); + + it("appends image chunks after the text chunk when images are given", () => { + const msg = buildUserMessage("look at this", [ + { url: "data:image/png;base64,aaa" }, + { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" }, + ]); + expect(msg.chunks).toHaveLength(3); + expect(msg.chunks[0]).toEqual({ type: "text", text: "look at this" }); + expect(msg.chunks[1]).toEqual({ type: "image", url: "data:image/png;base64,aaa" }); + expect(msg.chunks[2]).toEqual({ + type: "image", + url: "data:image/jpeg;base64,bbb", + mimeType: "image/jpeg", + }); + }); + + it("builds an image-only message when text is empty", () => { + const msg = buildUserMessage("", [{ url: "data:image/png;base64,zzz" }]); + expect(msg.chunks).toHaveLength(1); + expect(msg.chunks[0]).toEqual({ type: "image", url: "data:image/png;base64,zzz" }); + }); + + it("includes mimeType when provided", () => { + const msg = buildUserMessage("hi", [ + { url: "data:image/webp;base64,x", mimeType: "image/webp" }, + ]); + expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBe("image/webp"); + }); + + it("omits mimeType when not provided", () => { + const msg = buildUserMessage("hi", [{ url: "https://example.com/x.png" }]); + expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBeUndefined(); + }); }); describe("selectFirstProvider", () => { diff --git a/packages/session-orchestrator/src/pure.ts b/packages/session-orchestrator/src/pure.ts index 2208e8f..0d2068f 100644 --- 
a/packages/session-orchestrator/src/pure.ts +++ b/packages/session-orchestrator/src/pure.ts @@ -1,12 +1,40 @@ import type { ChatMessage, + Chunk, + ImageInput, ProviderContract, ReasoningEffort, ToolDispatchPolicy, } from "@dispatch/kernel"; -export function buildUserMessage(text: string): ChatMessage { - return { role: "user", chunks: [{ type: "text", text }] }; +/** + * Build the persisted user message for a turn. When `images` are provided, each + * is appended as an `image` chunk AFTER the text chunk, so the persisted message + * carries both the prompt text and the attached images (the frontend renders + * the images; vision-capable providers receive them natively; non-vision + * providers have them transcribed by the vision handoff before streaming). + * + * Pure: inputs → a ChatMessage, no I/O. + */ +export function buildUserMessage(text: string, images?: readonly ImageInput[]): ChatMessage { + const chunks: Chunk[] = []; + if (text.length > 0) { + chunks.push({ type: "text", text }); + } + if (images !== undefined) { + for (const img of images) { + chunks.push({ + type: "image", + url: img.url, + ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}), + }); + } + } + // An image-only message (empty text) is valid. 
+ if (chunks.length === 0) { + chunks.push({ type: "text", text: "" }); + } + return { role: "user", chunks }; } // ── Provider-error retry backoff schedule ─────────────────────────────────── diff --git a/packages/transport-contract/src/contract.types.test.ts b/packages/transport-contract/src/contract.types.test.ts index 9d3d904..34ff544 100644 --- a/packages/transport-contract/src/contract.types.test.ts +++ b/packages/transport-contract/src/contract.types.test.ts @@ -20,6 +20,7 @@ import type { LspServerState, LspStatusResponse, McpStatusResponse, + ModelsResponse, SetConversationComputerRequest, SetCwdRequest, SetWorkspaceDefaultComputerRequest, @@ -55,6 +56,18 @@ const _chatWithoutComputer: ChatRequest = { message: "hello", }; +// ─── ChatRequest.images (additive optional) ────────────────────────────────── + +const _chatWithImages: ChatRequest = { + message: "What's in this screenshot?", + images: [{ url: "data:image/png;base64,iVBORw0KGgo=", mimeType: "image/png" }], +}; + +const _chatWithHttpImage: ChatRequest = { + message: "analyze this", + images: [{ url: "https://example.com/diagram.png" }], +}; + // ─── Computer list / single response ───────────────────────────────────────── const _computer: Computer = { @@ -255,6 +268,35 @@ describe("transport-contract types compile and are exported", () => { expect(_chatWithComputer.computerId).toBe("prod-box"); }); + // ─── ChatRequest.images (additive optional) ────────────────────────────── + + it("ChatRequest: images is additive optional (omittable)", () => { + expect(_chatWithoutComputer.images).toBeUndefined(); + }); + + it("ChatRequest: carries images (data URL) when set", () => { + expect(_chatWithImages.images).toHaveLength(1); + expect(_chatWithImages.images?.[0]?.url).toContain("base64"); + expect(_chatWithImages.images?.[0]?.mimeType).toBe("image/png"); + }); + + it("ChatRequest: carries images (http URL, mimeType optional)", () => { + 
expect(_chatWithHttpImage.images?.[0]?.url).toBe("https://example.com/diagram.png"); + expect(_chatWithHttpImage.images?.[0]?.mimeType).toBeUndefined(); + }); + + it("ModelsResponse: ModelMetadata carries optional vision flag", () => { + const resp: ModelsResponse = { + models: ["umans/kimi-k2.7", "umans/glm-5.2"], + modelInfo: { + "umans/kimi-k2.7": { contextWindow: 200000, vision: true }, + "umans/glm-5.2": { contextWindow: 128000 }, + }, + }; + expect(resp.modelInfo?.["umans/kimi-k2.7"]?.vision).toBe(true); + expect(resp.modelInfo?.["umans/glm-5.2"]?.vision).toBeUndefined(); + }); + // ─── Computers ─────────────────────────────────────────────────────────── it("ComputerListResponse: carries entries with usage counts", () => { diff --git a/packages/transport-contract/src/index.ts b/packages/transport-contract/src/index.ts index 6a9a29f..0444f29 100644 --- a/packages/transport-contract/src/index.ts +++ b/packages/transport-contract/src/index.ts @@ -26,6 +26,7 @@ import type { ComputerEntry, ConversationMeta, ConversationStatus, + ImageInput, QueuedMessage, ReasoningEffort, StoredChunk, @@ -41,6 +42,8 @@ export type { ComputerEntry, ConversationMeta, ConversationStatus, + ImageChunk, + ImageInput, QueuedMessage, ReasoningEffort, StepMetrics, @@ -68,6 +71,19 @@ export interface ChatRequest { readonly message: string; /** + * Images attached to this turn (e.g. a user-pasted screenshot). Each entry's + * `url` is a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` + * URL. The server converts these to `image` chunks on the persisted user + * message. For a VISION-capable model (e.g. kimi), the images are passed + * through to the provider natively. For a NON-vision model (e.g. glm-5.2), + * the server's vision handoff transcribes each image to a text description + * (via a vision-capable model) and feeds that text instead — so a text-only + * model can still reason about the image's contents. Optional — omit for a + * text-only turn (backward compatible). 
+ */ + readonly images?: readonly ImageInput[]; + + /** * The model to use, as a model name in `<credentialName>/<model>` form — one * of the exact strings returned by `GET /models`. Omit to use the server's * default credential + model. @@ -124,6 +140,14 @@ export interface ModelsResponse { /** Per-model metadata returned alongside the model catalog. */ export interface ModelMetadata { readonly contextWindow?: number; + /** + * Whether this model can natively accept image input (vision/multimodal). + * When `true`, image chunks in a user message are passed through to the + * provider. When `false`/absent, the server's vision handoff transcribes + * images to text before the model sees them. A client may use this to show a + * vision badge in the model picker. Optional — absent when unknown. + */ + readonly vision?: boolean; } /** diff --git a/packages/transport-http/src/app.ts b/packages/transport-http/src/app.ts index 4fb295e..a9a23da 100644 --- a/packages/transport-http/src/app.ts +++ b/packages/transport-http/src/app.ts @@ -294,11 +294,14 @@ export function createApp(opts: CreateServerOptions): Hono { app.get("/models", async (c) => { try { const models = await opts.credentialStore.listCatalog(); - const modelInfo: Record<string, { contextWindow?: number }> = {}; + const modelInfo: Record<string, { contextWindow?: number; vision?: boolean }> = {}; for (const modelName of models) { const info = await opts.credentialStore.getModelInfo(modelName); - if (info?.contextWindow !== undefined) { - modelInfo[modelName] = { contextWindow: info.contextWindow }; + if (info?.contextWindow !== undefined || info?.vision === true) { + const entry: { contextWindow?: number; vision?: boolean } = {}; + if (info?.contextWindow !== undefined) entry.contextWindow = info.contextWindow; + if (info?.vision === true) entry.vision = true; + modelInfo[modelName] = entry; } } const body: ModelsResponse = { @@ -398,8 +401,16 @@ export function createApp(opts: CreateServerOptions): Hono { 
return c.json({ error: result.error }, 400); } - const { conversationId, message, model, cwd, computerId, reasoningEffort, workspaceId } = - result; + const { + conversationId, + message, + model, + cwd, + computerId, + reasoningEffort, + workspaceId, + images, + } = result; log.info("chat: request accepted", { conversationId, hasModel: model !== undefined, @@ -407,6 +418,7 @@ export function createApp(opts: CreateServerOptions): Hono { hasComputerId: computerId !== undefined, hasReasoningEffort: reasoningEffort !== undefined, hasWorkspaceId: workspaceId !== undefined, + imageCount: images?.length ?? 0, }); const events: AgentEvent[] = []; @@ -457,6 +469,7 @@ export function createApp(opts: CreateServerOptions): Hono { ...(computerId !== undefined ? { computerId } : {}), ...(reasoningEffort !== undefined ? { reasoningEffort } : {}), ...(workspaceId !== undefined ? { workspaceId } : {}), + ...(images !== undefined ? { images } : {}), }; opts.orchestrator diff --git a/packages/transport-http/src/logic.test.ts b/packages/transport-http/src/logic.test.ts index fc8302e..67632f3 100644 --- a/packages/transport-http/src/logic.test.ts +++ b/packages/transport-http/src/logic.test.ts @@ -182,6 +182,69 @@ describe("parseChatBody", () => { expect(result.reasoningEffort).toBeUndefined(); } }); + + // ── images ────────────────────────────────────────────────────────────── + + it("parses images array with data URLs", () => { + const result = parseChatBody( + { + message: "what is this?", + images: [ + { url: "data:image/png;base64,aaa" }, + { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" }, + ], + }, + fakeId, + ); + expect(isParseError(result)).toBe(false); + if (!isParseError(result)) { + expect(result.images).toHaveLength(2); + expect(result.images?.[0]?.url).toBe("data:image/png;base64,aaa"); + expect(result.images?.[1]?.mimeType).toBe("image/jpeg"); + } + }); + + it("parses images with http URLs", () => { + const result = parseChatBody( + { message: "hi", 
images: [{ url: "https://example.com/x.png" }] }, + fakeId, + ); + expect(isParseError(result)).toBe(false); + if (!isParseError(result)) { + expect(result.images?.[0]?.url).toBe("https://example.com/x.png"); + } + }); + + it("returns error when images is not an array", () => { + const result = parseChatBody({ message: "hi", images: "not-an-array" }, fakeId); + expect(isParseError(result)).toBe(true); + }); + + it("returns error when an image lacks a url", () => { + const result = parseChatBody({ message: "hi", images: [{ mimeType: "image/png" }] }, fakeId); + expect(isParseError(result)).toBe(true); + }); + + it("returns error when an image url is empty", () => { + const result = parseChatBody({ message: "hi", images: [{ url: "" }] }, fakeId); + expect(isParseError(result)).toBe(true); + }); + + it("omits images when absent (backward compatible)", () => { + const result = parseChatBody({ message: "hi" }, fakeId); + expect(isParseError(result)).toBe(false); + if (!isParseError(result)) { + expect(result.images).toBeUndefined(); + } + }); + + it("omits images when the array is empty", () => { + const result = parseChatBody({ message: "hi", images: [] }, fakeId); + expect(isParseError(result)).toBe(false); + if (!isParseError(result)) { + expect(result.images).toBeUndefined(); + } + }); }); describe("parseSinceSeq", () => { diff --git a/packages/transport-http/src/logic.ts b/packages/transport-http/src/logic.ts index 97ad426..a928147 100644 --- a/packages/transport-http/src/logic.ts +++ b/packages/transport-http/src/logic.ts @@ -55,6 +55,13 @@ export interface ChatCommand { readonly computerId?: string; readonly reasoningEffort?: ReasoningEffort; readonly workspaceId?: string; + /** + * Images attached to this turn (data URLs or http URLs). Parsed from the + * `ChatRequest.images` field; forwarded to the orchestrator which converts + * them to `image` chunks on the user message. Each entry must have a non-empty + * string `url`; `mimeType` is optional. 
+ */ + readonly images?: readonly { readonly url: string; readonly mimeType?: string }[]; } export interface ParseError { @@ -121,6 +128,33 @@ export function parseChatBody(body: unknown, generateId: () => string): ParseRes (result as { workspaceId?: string }).workspaceId = obj.workspaceId; } + if (obj.images !== undefined) { + if (!Array.isArray(obj.images)) { + return { error: "Field 'images' must be an array" }; + } + const images: { url: string; mimeType?: string }[] = []; + for (const entry of obj.images) { + if (entry === null || typeof entry !== "object") { + return { error: "Each image must be an object with a 'url' string" }; + } + const img = entry as { url?: unknown; mimeType?: unknown }; + if (typeof img.url !== "string" || img.url.length === 0) { + return { error: "Each image must have a non-empty string 'url'" }; + } + const parsed: { url: string; mimeType?: string } = { url: img.url }; + if (img.mimeType !== undefined) { + if (typeof img.mimeType !== "string") { + return { error: "Field 'mimeType' on an image must be a string" }; + } + parsed.mimeType = img.mimeType; + } + images.push(parsed); + } + if (images.length > 0) { + (result as { images?: readonly { url: string; mimeType?: string }[] }).images = images; + } + } + return result; } diff --git a/packages/transport-ws/src/extension.ts b/packages/transport-ws/src/extension.ts index 3811ed7..d26712b 100644 --- a/packages/transport-ws/src/extension.ts +++ b/packages/transport-ws/src/extension.ts @@ -291,6 +291,7 @@ export function createTransportWsExtension(): Extension { : {}), ...(result.workspaceId !== undefined ? { workspaceId: result.workspaceId } : {}), ...(result.computerId !== undefined ? { computerId: result.computerId } : {}), + ...(result.images !== undefined ? 
{ images: result.images } : {}), }); if (!startResult.started) { send(ws, { diff --git a/packages/transport-ws/src/router.ts b/packages/transport-ws/src/router.ts index a33aa5a..0caf305 100644 --- a/packages/transport-ws/src/router.ts +++ b/packages/transport-ws/src/router.ts @@ -58,6 +58,12 @@ export interface ChatRouteResult { * conversation → workspace → local chain). */ readonly computerId?: string; + /** + * Images attached to this turn (data URLs or http URLs), forwarded verbatim to + * the orchestrator. Absent when the client omits it. Each entry must have a + * non-empty string `url`; `mimeType` is optional. + */ + readonly images?: readonly { readonly url: string; readonly mimeType?: string }[]; } /** A malformed chat.send that should yield a chat.error reply. */ @@ -174,6 +180,36 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError errorMessage: `chat.send: invalid reasoningEffort "${msg.reasoningEffort}" — must be one of: low, medium, high, xhigh, max`, }; } + // Validate images (if present): each must be an object with a non-empty url. 
+ let images: readonly { url: string; mimeType?: string }[] | undefined; + if (msg.images !== undefined) { + if (!Array.isArray(msg.images)) { + return { + kind: "chat-error", + conversationId: msg.conversationId, + errorMessage: "chat.send: 'images' must be an array", + }; + } + const parsed: { url: string; mimeType?: string }[] = []; + for (const entry of msg.images) { + if ( + entry === null || + typeof entry !== "object" || + typeof entry.url !== "string" || + entry.url.length === 0 + ) { + return { + kind: "chat-error", + conversationId: msg.conversationId, + errorMessage: "chat.send: each image must have a non-empty string 'url'", + }; + } + const p: { url: string; mimeType?: string } = { url: entry.url }; + if (entry.mimeType !== undefined) p.mimeType = entry.mimeType; + parsed.push(p); + } + if (parsed.length > 0) images = parsed; + } return { kind: "chat", conversationId: msg.conversationId, @@ -183,6 +219,7 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError ...(msg.reasoningEffort !== undefined ? { reasoningEffort: msg.reasoningEffort } : {}), ...(msg.workspaceId !== undefined ? { workspaceId: msg.workspaceId } : {}), ...(msg.computerId !== undefined ? { computerId: msg.computerId } : {}), + ...(images !== undefined ? 
{ images } : {}), }; } diff --git a/packages/vision-handoff/package.json b/packages/vision-handoff/package.json new file mode 100644 index 0000000..a88ab49 --- /dev/null +++ b/packages/vision-handoff/package.json @@ -0,0 +1,13 @@ +{ + "name": "@dispatch/vision-handoff", + "version": "0.0.0", + "type": "module", + "private": true, + "main": "dist/index.js", + "types": "dist/index.d.ts", + "dependencies": { + "@dispatch/credential-store": "workspace:*", + "@dispatch/kernel": "workspace:*", + "@dispatch/openai-stream": "workspace:*" + } +} diff --git a/packages/vision-handoff/src/extension.ts b/packages/vision-handoff/src/extension.ts new file mode 100644 index 0000000..aa745b7 --- /dev/null +++ b/packages/vision-handoff/src/extension.ts @@ -0,0 +1,106 @@ +/** + * vision-handoff extension — registers the universal vision handoff service + + * the `read_image` tool. + * + * The service performs provider-agnostic vision handoff: it resolves a + * vision-capable model from the catalog (any provider), streams an image to it + * via the standard `ProviderContract.stream` interface, and folds the textual + * description back — so a non-vision model (e.g. glm-5.2) can still reason about + * images, and any model can analyze image FILES referenced in code. + * + * Effects (filesystem, fetch) live here in the shell, injected into the service. + * The pure decisions live in `pure.ts`. No `console.*`; logging via `host.logger`. 
/** Extension manifest: an eagerly-activated bundled extension that needs network access. */
export const manifest: Manifest = {
  id: "vision-handoff",
  name: "Vision Handoff",
  version: "0.0.0",
  apiVersion: "^0.1.0",
  trust: "bundled",
  activation: "eager",
  capabilities: { network: true },
  contributes: { services: ["vision-handoff/service"], tools: ["read_image"] },
};

/** MIME types for recognized image extensions (keys are lowercase, with the leading dot). */
const MIME_BY_EXT: Readonly<Record<string, string>> = {
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".webp": "image/webp",
  ".gif": "image/gif",
  ".bmp": "image/bmp",
};

/** MIME type assumed when neither the file extension nor the response header names one. */
const DEFAULT_IMAGE_MIME = "image/png";

/**
 * Read an image file from disk and return it as a base64 data URL.
 *
 * @param path File path; a relative path is resolved against `cwd` when given,
 *   otherwise against the process working directory.
 * @param cwd Optional base directory for relative paths.
 * @returns `data:<mime>;base64,<payload>`; the MIME type comes from the file
 *   extension (case-insensitive) and falls back to `image/png` when unrecognized.
 * @throws Whatever `readFile` rejects with (missing file, permission error, …).
 */
async function readFileAsDataUrl(path: string, cwd?: string): Promise<string> {
  const abs = cwd !== undefined && !isAbsolute(path) ? pathResolve(cwd, path) : pathResolve(path);
  const buf = await readFile(abs);
  const mime = MIME_BY_EXT[extname(abs).toLowerCase()] ?? DEFAULT_IMAGE_MIME;
  return `data:${mime};base64,${buf.toString("base64")}`;
}

/**
 * Fetch an HTTP(S) image URL and return its body as a base64 data URL, so the
 * image can be sent inline regardless of whether the provider can fetch remote
 * URLs itself.
 *
 * @param url The URL to download.
 * @returns `data:<mime>;base64,<payload>`; the MIME type is the response's
 *   `Content-Type` media type with any parameters removed, or `image/png` when
 *   the header is missing or empty.
 * @throws Error when the response status is not 2xx, or when `fetch` itself rejects.
 */
async function fetchUrlAsDataUrl(url: string): Promise<string> {
  // NOTE(review): `url` may originate from a chat request, so this is a
  // server-side fetch of a caller-chosen address with no host allow-list,
  // timeout, or body-size cap. Confirm that is acceptable for the deployment.
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`Failed to fetch image: HTTP ${res.status}`);
  }
  // Keep only the media type: a header such as "image/png; charset=binary"
  // would otherwise put a stray parameter (and a space) into the data URL.
  const [rawType = ""] = (res.headers.get("content-type") ?? "").split(";");
  // `||` on purpose: an empty media type must also fall back to the default.
  const mime = rawType.trim().toLowerCase() || DEFAULT_IMAGE_MIME;
  // Encode in one native step, matching readFileAsDataUrl, instead of building
  // a binary string one character per byte and passing it through btoa.
  const base64 = Buffer.from(await res.arrayBuffer()).toString("base64");
  return `data:${mime};base64,${base64}`;
}

/**
 * Activate the extension: build the vision handoff service from the host's
 * credential store and providers, then register the service and the
 * `read_image` tool. When the credential-store service is unavailable it logs a
 * warning and registers nothing.
 */
export async function activate(host: HostAPI): Promise<void> {
  const credentialStore = host.getService(credentialStoreHandle) as CredentialStore | undefined;
  if (credentialStore === undefined) {
    host.logger.warn(
      "vision-handoff: credential-store service not available. The read_image tool and image transcription are disabled.",
    );
    return;
  }

  // Map a catalog model name to its provider instance; undefined when either
  // the credential store or the provider registry does not know it.
  const resolveModel = (modelName: string) => {
    const resolved = credentialStore.resolve(modelName);
    if (resolved === undefined) return undefined;
    const provider = host.getProviders().get(resolved.providerId);
    if (provider === undefined) return undefined;
    return { provider, model: resolved.model };
  };

  const service = createVisionHandoffService({
    credentialStore,
    resolveModel,
    readFileAsDataUrl,
    fetchUrlAsDataUrl,
    logger: host.logger.child({ extensionId: "vision-handoff" }),
  });

  host.provideService(visionHandoffHandle, service);
  host.defineTool(createReadImageTool(service));
  host.logger.info("vision-handoff: registered (read_image tool + transcription service)");
}

export const extension: Extension = { manifest, activate };
collectTextFromStream, + findVisionModelName, + formatNoVisionPlaceholder, + formatTranscriptionText, + isVisionCapable, +} from "./pure.js"; +export type { + ResolvedVisionModel, + VisionHandoffDeps, + VisionHandoffService, +} from "./service.js"; +export { + createVisionHandoffService, + visionHandoffHandle, +} from "./service.js"; +export { createReadImageTool } from "./tool.js"; diff --git a/packages/vision-handoff/src/pure.test.ts b/packages/vision-handoff/src/pure.test.ts new file mode 100644 index 0000000..89dac72 --- /dev/null +++ b/packages/vision-handoff/src/pure.test.ts @@ -0,0 +1,141 @@ +import type { ModelInfo, ProviderEvent } from "@dispatch/kernel"; +import { describe, expect, it } from "vitest"; +import { + buildTranscriptionPrompt, + collectTextFromStream, + findVisionModelName, + formatNoVisionPlaceholder, + formatTranscriptionText, + isVisionCapable, +} from "./pure.js"; + +describe("isVisionCapable", () => { + it("returns true when ModelInfo.vision is true", () => { + expect(isVisionCapable("umans/kimi-k2.7", { id: "kimi-k2.7", vision: true })).toBe(true); + }); + + it("returns false when ModelInfo.vision is false (overrides name heuristic)", () => { + expect(isVisionCapable("umans/kimi-k2.7", { id: "kimi-k2.7", vision: false })).toBe(false); + }); + + it("falls back to name heuristic when vision is absent (kimi)", () => { + expect(isVisionCapable("umans/kimi-k2.7", undefined)).toBe(true); + expect(isVisionCapable("umans/Kimi-K2.7", undefined)).toBe(true); // case-insensitive + }); + + it("falls back to name heuristic when vision is absent (non-kimi)", () => { + expect(isVisionCapable("umans/glm-5.2", undefined)).toBe(false); + expect(isVisionCapable("umans/deepseek-v4-flash", { id: "deepseek-v4-flash" })).toBe(false); + }); + + it("returns false for undefined model name", () => { + expect(isVisionCapable(undefined, undefined)).toBe(false); + }); +}); + +describe("findVisionModelName", () => { + const getInfo = async (name: string): 
Promise<ModelInfo | undefined> => { + const map: Record<string, ModelInfo> = { + "umans/kimi-k2.7": { id: "kimi-k2.7", vision: true }, + "umans/glm-5.2": { id: "glm-5.2" }, + "umans/llama-vision": { id: "llama-vision", vision: true }, + }; + return map[name]; + }; + + it("finds the first kimi-family model via name heuristic (no async lookup needed)", async () => { + const name = await findVisionModelName( + ["umans/glm-5.2", "umans/kimi-k2.7", "umans/llama-vision"], + getInfo, + ); + expect(name).toBe("umans/kimi-k2.7"); + }); + + it("finds a vision model via ModelInfo.vision when name heuristic misses", async () => { + const name = await findVisionModelName(["umans/glm-5.2", "umans/llama-vision"], getInfo); + expect(name).toBe("umans/llama-vision"); + }); + + it("skips the excluded model", async () => { + const name = await findVisionModelName( + ["umans/kimi-k2.7", "umans/llama-vision"], + getInfo, + "umans/kimi-k2.7", + ); + expect(name).toBe("umans/llama-vision"); + }); + + it("returns undefined when no vision model is available", async () => { + const name = await findVisionModelName(["umans/glm-5.2"], getInfo); + expect(name).toBeUndefined(); + }); + + it("returns undefined for empty catalog", async () => { + const name = await findVisionModelName([], getInfo); + expect(name).toBeUndefined(); + }); +}); + +describe("collectTextFromStream", () => { + async function* stream(events: ProviderEvent[]): AsyncIterable<ProviderEvent> { + for (const e of events) yield e; + } + + it("collects text-delta events into a single string", async () => { + const events: ProviderEvent[] = [ + { type: "text-delta", delta: "Hello " }, + { type: "text-delta", delta: "world!" }, + ]; + const text = await collectTextFromStream(stream(events)); + expect(text).toBe("Hello world!"); + }); + + it("ignores non-text events (reasoning, usage, tool-call, finish)", async () => { + const events: ProviderEvent[] = [ + { type: "reasoning-delta", delta: "thinking..." 
}, + { type: "text-delta", delta: "answer" }, + { type: "usage", usage: { inputTokens: 5, outputTokens: 1 } }, + { type: "finish", reason: "stop" }, + ]; + const text = await collectTextFromStream(stream(events)); + expect(text).toBe("answer"); + }); + + it("throws on an error event", async () => { + const events: ProviderEvent[] = [ + { type: "text-delta", delta: "partial" }, + { type: "error", message: "boom" }, + ]; + await expect(collectTextFromStream(stream(events))).rejects.toThrow("boom"); + }); + + it("returns empty string for an empty stream", async () => { + const text = await collectTextFromStream(stream([])); + expect(text).toBe(""); + }); +}); + +describe("prompt + formatting helpers", () => { + it("buildTranscriptionPrompt includes focus when a question is given", () => { + const prompt = buildTranscriptionPrompt("What error is shown?"); + expect(prompt).toContain("Describe this image in detail"); + expect(prompt).toContain('The user asked: "What error is shown?"'); + }); + + it("buildTranscriptionPrompt omits focus when no question", () => { + const prompt = buildTranscriptionPrompt(undefined); + expect(prompt).toContain("Describe this image in detail"); + expect(prompt).not.toContain("The user asked"); + }); + + it("formatTranscriptionText names the vision model", () => { + expect(formatTranscriptionText("a red car", "umans/kimi-k2.7")).toBe( + "[Image analysis (via umans/kimi-k2.7)]: a red car", + ); + }); + + it("formatNoVisionPlaceholder explains the limitation", () => { + const text = formatNoVisionPlaceholder(); + expect(text).toContain("no vision-capable model"); + }); +}); diff --git a/packages/vision-handoff/src/pure.ts b/packages/vision-handoff/src/pure.ts new file mode 100644 index 0000000..11eeefc --- /dev/null +++ b/packages/vision-handoff/src/pure.ts @@ -0,0 +1,129 @@ +/** + * Pure decision helpers for the vision handoff. + * + * No I/O, no ambient state. 
/**
 * Return the model segment of a `<credentialName>/<model>` catalog name: the
 * text after the first `/`, or the whole name when it contains no `/`.
 */
function modelSegment(modelName: string): string {
  const slash = modelName.indexOf("/");
  return slash >= 0 ? modelName.slice(slash + 1) : modelName;
}

/**
 * Whether a model is vision-capable, given its catalog name and (optional)
 * resolved `ModelInfo`.
 *
 * @param modelName Catalog name in `<credentialName>/<model>` form, or undefined.
 * @param info Resolved metadata for the model, if any.
 * @returns `info.vision` when it is present (an explicit `false` overrides the
 *   name heuristic); otherwise the result of `isVisionModelId` on the model
 *   segment; `false` when `modelName` is undefined.
 */
export function isVisionCapable(
  modelName: string | undefined,
  info: ModelInfo | undefined,
): boolean {
  // An explicit flag (true OR false) is authoritative over the name guess.
  if (info?.vision !== undefined) return info.vision;
  if (modelName === undefined) return false;
  return isVisionModelId(modelSegment(modelName));
}

/**
 * Find the first vision-capable model name in a catalog.
 *
 * Each candidate is judged with {@link isVisionCapable}, so an explicit
 * `ModelInfo.vision` (including `false`) wins over the name heuristic here
 * exactly as it does for the active model. This costs one `getInfo` lookup per
 * candidate up to the first match.
 *
 * @param catalog The full list of model names (`<credentialName>/<model>`).
 * @param getInfo Async lookup of a model name → ModelInfo.
 * @param exclude Optional model name to skip (e.g. the current non-vision model).
 * @returns The first matching name in catalog order, or `undefined` when none
 *   qualifies. Rejections from `getInfo` propagate to the caller.
 */
export async function findVisionModelName(
  catalog: readonly string[],
  getInfo: (modelName: string) => Promise<ModelInfo | undefined>,
  exclude?: string,
): Promise<string | undefined> {
  for (const name of catalog) {
    if (name === exclude) continue;
    const info = await getInfo(name);
    if (isVisionCapable(name, info)) return name;
  }
  return undefined;
}

/**
 * Fold a provider's streamed events into a single text string (the
 * transcription).
 *
 * @param stream The provider event stream.
 * @returns The concatenation of every `text-delta` event's `delta`, in order;
 *   `""` for an empty stream. All other non-error events are ignored.
 * @throws Error carrying the event's `message` as soon as an `error` event is
 *   seen, so the caller can decide how to degrade.
 */
export async function collectTextFromStream(stream: AsyncIterable<ProviderEvent>): Promise<string> {
  let text = "";
  for await (const event of stream) {
    if (event.type === "text-delta") {
      text += event.delta;
    } else if (event.type === "error") {
      throw new Error(event.message);
    }
  }
  return text;
}

/**
 * Build the prompt sent to the vision model to transcribe an image.
 *
 * @param userQuestion The user's own message text. When it is undefined, empty,
 *   or whitespace-only, the prompt is the generic description request; otherwise
 *   the trimmed question is appended so the description can focus on it.
 * @returns The prompt text.
 */
export function buildTranscriptionPrompt(userQuestion: string | undefined): string {
  const question = userQuestion?.trim() ?? "";
  const focus =
    question.length > 0
      ? `\n\nThe user asked: "${question}". Focus your description on what is relevant to that question, but still describe the whole image.`
      : "";
  return (
    "Describe this image in detail. Include: the overall scene/subject, " +
    "visible text (transcribe verbatim), key objects, layout, colors, and any " +
    "notable details a developer or user would need to understand the image." +
    focus
  );
}

/**
 * Format a single image's transcription as text, naming the vision model that
 * produced it.
 *
 * @param description The vision model's description of the image.
 * @param visionModelName Catalog name of the model; the literal `vision model`
 *   is used when undefined.
 * @returns `[Image analysis (via <source>)]: <description>`.
 */
export function formatTranscriptionText(
  description: string,
  visionModelName: string | undefined,
): string {
  const source = visionModelName ?? "vision model";
  return `[Image analysis (via ${source})]: ${description}`;
}

/**
 * Placeholder text used when no vision-capable model is available.
 *
 * @returns A fixed bracketed note explaining that the image could not be analyzed.
 */
export function formatNoVisionPlaceholder(): string {
  return (
    "[Image attached — no vision-capable model is available to analyze it. " +
    "Install or configure a vision-capable model (e.g. kimi) to enable image analysis.]"
  );
}
img.url : ""; + const text = describe(url); + async function* gen(): AsyncIterable<ProviderEvent> { + yield { type: "text-delta", delta: text }; + yield { type: "finish", reason: "stop" }; + } + return gen(); + }, + ), + }; +} + +function makeDeps(overrides: Partial<VisionHandoffDeps> = {}): VisionHandoffDeps { + const visionProvider = makeVisionProvider((url) => `DESCRIPTION of ${url}`); + const catalog = ["umans/kimi-k2.7", "umans/glm-5.2"]; + const infoMap: Record<string, ModelInfo> = { + "umans/kimi-k2.7": { id: "kimi-k2.7", vision: true }, + "umans/glm-5.2": { id: "glm-5.2" }, + }; + return { + credentialStore: { + listCatalog: vi.fn(async () => catalog), + getModelInfo: vi.fn(async (name: string) => infoMap[name]), + resolve: vi.fn((name: string) => { + if (name === "umans/kimi-k2.7") return { providerId: "umans", model: "kimi-k2.7" }; + if (name === "umans/glm-5.2") return { providerId: "umans", model: "glm-5.2" }; + return undefined; + }), + }, + resolveModel: vi.fn((name: string) => + name === "umans/kimi-k2.7" || name === "umans/glm-5.2" + ? 
{ provider: visionProvider, model: name.split("/")[1] } + : undefined, + ), + readFileAsDataUrl: vi.fn(async (path: string) => `data:image/png;base64,FILE(${path})`), + ...overrides, + }; +} + +describe("VisionHandoffService.isVisionCapable", () => { + it("returns true for kimi (via ModelInfo)", async () => { + const svc = createVisionHandoffService(makeDeps()); + expect(await svc.isVisionCapable("umans/kimi-k2.7")).toBe(true); + }); + + it("returns false for glm-5.2", async () => { + const svc = createVisionHandoffService(makeDeps()); + expect(await svc.isVisionCapable("umans/glm-5.2")).toBe(false); + }); + + it("returns false for undefined model name", async () => { + const svc = createVisionHandoffService(makeDeps()); + expect(await svc.isVisionCapable(undefined)).toBe(false); + }); +}); + +describe("VisionHandoffService.resolveVisionModel", () => { + it("resolves the kimi model from the catalog", async () => { + const svc = createVisionHandoffService(makeDeps()); + const vision = await svc.resolveVisionModel(); + expect(vision?.modelName).toBe("umans/kimi-k2.7"); + expect(vision?.model).toBe("kimi-k2.7"); + }); + + it("excludes the given model", async () => { + const svc = createVisionHandoffService(makeDeps()); + const vision = await svc.resolveVisionModel("umans/kimi-k2.7"); + // kimi is the only vision model; excluding it → undefined. + expect(vision).toBeUndefined(); + }); +}); + +describe("VisionHandoffService.transcribeImage", () => { + it("returns a formatted description from the vision model", async () => { + const svc = createVisionHandoffService(makeDeps()); + const result = await svc.transcribeImage("data:image/png;base64,xxx", "what is this?"); + expect(result).toBe( + "[Image analysis (via umans/kimi-k2.7)]: DESCRIPTION of data:image/png;base64,xxx", + ); + }); + + it("returns a placeholder when no vision model is available", async () => { + const deps = makeDeps(); + // Empty catalog → no vision model. 
+ (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]); + const svc = createVisionHandoffService(deps); + const result = await svc.transcribeImage("data:image/png;base64,xxx", undefined); + expect(result).toContain("no vision-capable model"); + }); + + it("returns an error note when the vision stream errors", async () => { + const errorProvider: ProviderContract = { + id: "umans", + stream: vi.fn(async function* (): AsyncIterable<ProviderEvent> { + yield { type: "error", message: "vision API down" }; + }), + }; + const deps = makeDeps({ + resolveModel: vi.fn(() => ({ provider: errorProvider, model: "kimi-k2.7" })), + }); + const svc = createVisionHandoffService(deps); + const result = await svc.transcribeImage("data:image/png;base64,xxx", undefined); + expect(result).toContain("Image analysis failed: vision API down"); + }); +}); + +describe("VisionHandoffService.transcribeForProvider", () => { + it("passes messages through unchanged when the model is vision-capable", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "What's this?" 
}, + { type: "image", url: "data:image/png;base64,abc" }, + ], + }, + ]; + const result = await svc.transcribeForProvider(messages, "umans/kimi-k2.7"); + expect(result).toBe(messages); // same reference — no copy, no transcription + }); + + it("passes messages through unchanged when there are no images", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [{ role: "user", chunks: [{ type: "text", text: "hi" }] }]; + const result = await svc.transcribeForProvider(messages, "umans/glm-5.2"); + expect(result).toBe(messages); + }); + + it("transcribes image chunks to text for a non-vision model", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "text", text: "Describe this" }, + { type: "image", url: "data:image/png;base64,img1" }, + ], + }, + ]; + const result = await svc.transcribeForProvider(messages, "umans/glm-5.2"); + expect(result).toHaveLength(1); + const chunks = result[0]?.chunks; + expect(chunks).toHaveLength(2); + expect(chunks?.[0]).toEqual({ type: "text", text: "Describe this" }); + // The image chunk was replaced with a transcribed text chunk. + expect(chunks?.[1]?.type).toBe("text"); + expect((chunks?.[1] as { text: string }).text).toContain("Image analysis"); + expect((chunks?.[1] as { text: string }).text).toContain("img1"); + }); + + it("caches transcription per unique image URL within a call", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { + role: "user", + chunks: [ + { type: "image", url: "data:image/png;base64,same" }, + { type: "image", url: "data:image/png;base64,same" }, + ], + }, + ]; + const result = await svc.transcribeForProvider(messages, "umans/glm-5.2"); + const chunks = result[0]?.chunks; + // Both image chunks → text, same description (cached). 
+ expect(chunks).toHaveLength(2); + expect((chunks?.[0] as { text: string }).text).toBe((chunks?.[1] as { text: string }).text); + // The vision provider was called only once (cache hit on the second). + const provider = deps.resolveModel("umans/kimi-k2.7")?.provider; + expect((provider?.stream as ReturnType<typeof vi.fn>).mock.calls).toHaveLength(1); + }); + + it("transcribes images in history messages too (non-vision model)", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,hist" }] }, + { role: "assistant", chunks: [{ type: "text", text: "got it" }] }, + { role: "user", chunks: [{ type: "text", text: "and now?" }] }, + ]; + const result = await svc.transcribeForProvider(messages, "umans/glm-5.2"); + // First message's image chunk is now text. + expect(result[0]?.chunks[0]?.type).toBe("text"); + expect((result[0]?.chunks[0] as { text: string }).text).toContain("Image analysis"); + // Assistant message unchanged. + expect(result[1]?.chunks[0]?.type).toBe("text"); + // Last user message unchanged. + expect(result[2]?.chunks[0]).toEqual({ type: "text", text: "and now?" 
}); + }); + + it("uses a placeholder when no vision model is available (non-vision model)", async () => { + const deps = makeDeps(); + (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]); + const svc = createVisionHandoffService(deps); + const messages: ChatMessage[] = [ + { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,abc" }] }, + ]; + const result = await svc.transcribeForProvider(messages, "umans/glm-5.2"); + expect((result[0]?.chunks[0] as { text: string }).text).toContain("no vision-capable model"); + }); +}); + +describe("VisionHandoffService.readImageFile", () => { + it("reads the file and transcribes it", async () => { + const deps = makeDeps(); + const svc = createVisionHandoffService(deps); + const result = await svc.readImageFile("screenshot.png", "/work"); + expect(deps.readFileAsDataUrl).toHaveBeenCalledWith("screenshot.png", "/work"); + expect(result).toContain("Image analysis"); + expect(result).toContain("FILE(screenshot.png)"); + }); +}); diff --git a/packages/vision-handoff/src/service.ts b/packages/vision-handoff/src/service.ts new file mode 100644 index 0000000..5e6ad70 --- /dev/null +++ b/packages/vision-handoff/src/service.ts @@ -0,0 +1,281 @@ +/** + * Vision handoff service — the imperative shell that performs the universal, + * provider-agnostic vision handoff. + * + * Two capabilities: + * 1. **Transcription for non-vision models** (`transcribeForProvider`): when a + * user message carries images but the active model cannot see them, this + * calls a vision-capable model (resolved from the catalog — any provider) to + * describe each image, then replaces the image chunks with text. Universal: + * it uses the standard `ProviderContract.stream` interface, never a + * provider-specific vision endpoint. + * 2. 
**`read_image` tool** (`readImageFile`): reads an image FILE from disk and + * transcribes it via a vision-capable model, returning the text description + * — so any model (vision or not) can analyze an image referenced in code. + * + * Effects (credential store, provider streaming, filesystem, fetch) are + * injected. The pure decisions live in `pure.ts`. This shell wires them. + */ + +import type { CredentialStore } from "@dispatch/credential-store"; +import type { + ChatMessage, + Chunk, + Logger, + ModelInfo, + ProviderContract, + ProviderStreamOptions, +} from "@dispatch/kernel"; +import { defineService, type ServiceHandle } from "@dispatch/kernel"; +import { + buildTranscriptionPrompt, + collectTextFromStream, + findVisionModelName, + formatNoVisionPlaceholder, + formatTranscriptionText, + isVisionCapable, +} from "./pure.js"; + +/** + * Resolved vision model — a provider + its model id, ready to stream from. + */ +export interface ResolvedVisionModel { + readonly provider: ProviderContract; + readonly model: string; + readonly modelName: string; +} + +/** + * Dependencies the service needs — all injected (no ambient state). + */ +export interface VisionHandoffDeps { + readonly credentialStore: CredentialStore; + /** Resolve a `<credentialName>/<model>` → its provider + model id. */ + readonly resolveModel: ( + modelName: string, + ) => { provider: ProviderContract; model: string } | undefined; + /** + * Read a file from disk as a base64 data URL. Injected so the shell controls + * the filesystem edge (and tests inject a fake). Returns the data URL, or + * throws on error (the caller surfaces it as a tool error). + */ + readonly readFileAsDataUrl: (path: string, cwd?: string) => Promise<string>; + /** + * Fetch an HTTP(S) URL to a data URL (for http image sources). Injected so + * tests inject a fake. Optional — when absent, HTTP image URLs are passed to + * the vision provider as-is (it fetches them). 
+ */ + readonly fetchUrlAsDataUrl?: (url: string) => Promise<string>; + readonly logger?: Logger; +} + +export interface VisionHandoffService { + /** + * Whether a given model (by catalog name) is vision-capable. Uses the + * credential store's ModelInfo + the name heuristic. Async because ModelInfo + * may require a listModels round-trip (cached by the credential store). + */ + readonly isVisionCapable: (modelName: string | undefined) => Promise<boolean>; + + /** + * Resolve a vision-capable model from the catalog (any provider). Returns + * `undefined` when none is available. + */ + readonly resolveVisionModel: (excludeName?: string) => Promise<ResolvedVisionModel | undefined>; + + /** + * Transcribe a single image URL to a text description via a vision-capable + * model. Returns the description, or a placeholder string when no vision + * model is available (does NOT throw — callers want graceful degradation). + */ + readonly transcribeImage: ( + imageUrl: string, + userQuestion: string | undefined, + opts?: { readonly signal?: AbortSignal; readonly logger?: Logger }, + ) => Promise<string>; + + /** + * Transform a message list for the provider: if the active model is + * vision-capable, return messages unchanged (images pass through natively). + * If NOT vision-capable, replace every `image` chunk with a text + * description (transcribed via a vision model — once per unique image URL, + * cached within the call) so a text-only model can still reason about the + * images. Never throws — on failure an image becomes a placeholder note. + * + * The PERSISTED history is NOT modified by this (the caller persists the + * original messages with images); this only transforms what the provider sees. 
+ */ + readonly transcribeForProvider: ( + messages: readonly ChatMessage[], + currentModelName: string | undefined, + opts?: { readonly signal?: AbortSignal; readonly logger?: Logger }, + ) => Promise<readonly ChatMessage[]>; + + /** + * Read an image FILE from disk and transcribe it (the `read_image` tool's + * core). Returns the description text. Throws on filesystem error (the tool + * surfaces it as a tool-error result). + */ + readonly readImageFile: ( + path: string, + cwd: string | undefined, + opts?: { readonly signal?: AbortSignal; readonly logger?: Logger }, + ) => Promise<string>; +} + +export const visionHandoffHandle: ServiceHandle<VisionHandoffService> = + defineService<VisionHandoffService>("vision-handoff/service"); + +/** Whether a message list contains any image chunks. Pure. */ +function hasImageChunks(messages: readonly ChatMessage[]): boolean { + return messages.some((m) => m.chunks.some((c) => c.type === "image")); +} + +export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHandoffService { + const log = deps.logger; + + async function getInfo(modelName: string): Promise<ModelInfo | undefined> { + return deps.credentialStore.getModelInfo(modelName); + } + + async function resolveVisionModel( + excludeName?: string, + ): Promise<ResolvedVisionModel | undefined> { + const catalog = await deps.credentialStore.listCatalog(); + const name = await findVisionModelName(catalog, getInfo, excludeName); + if (name === undefined) return undefined; + const resolved = deps.resolveModel(name); + if (resolved === undefined) return undefined; + return { provider: resolved.provider, model: resolved.model, modelName: name }; + } + + async function streamVisionText( + vision: ResolvedVisionModel, + imageUrl: string, + prompt: string, + opts?: { readonly signal?: AbortSignal; readonly logger?: Logger }, + ): Promise<string> { + // Build a single-turn user message: [text prompt, image]. 
The vision model + // receives the image natively via the OpenAI-compatible content array + // (convertMessages serializes the image chunk to image_url). + const userMessage: ChatMessage = { + role: "user", + chunks: [ + { type: "text", text: prompt }, + { type: "image", url: imageUrl }, + ], + }; + const providerOpts: ProviderStreamOptions = { + model: vision.model, + // Low temperature for faithful transcription. + temperature: 0, + // A short system prompt keeps the vision model focused on describing. + systemPrompt: + "You are a vision assistant. Describe images faithfully and thoroughly for a developer who cannot see them.", + }; + const streamOpts: Parameters<ProviderContract["stream"]>[2] = { + ...providerOpts, + ...(opts?.logger !== undefined ? { logger: opts.logger } : {}), + }; + const stream = vision.provider.stream([userMessage], [], streamOpts); + return collectTextFromStream(stream); + } + + const service: VisionHandoffService = { + async isVisionCapable(modelName: string | undefined): Promise<boolean> { + if (modelName === undefined) return false; + const info = await getInfo(modelName); + return isVisionCapable(modelName, info); + }, + + resolveVisionModel, + + async transcribeImage( + imageUrl: string, + userQuestion: string | undefined, + opts?: { readonly signal?: AbortSignal; readonly logger?: Logger }, + ): Promise<string> { + const vision = await resolveVisionModel(); + if (vision === undefined) { + log?.warn("vision-handoff: no vision-capable model available for transcription"); + return formatNoVisionPlaceholder(); + } + const prompt = buildTranscriptionPrompt(userQuestion); + try { + const description = await streamVisionText(vision, imageUrl, prompt, opts); + const trimmed = description.trim(); + if (trimmed.length === 0) { + return "[Image analysis produced no output.]"; + } + return formatTranscriptionText(trimmed, vision.modelName); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + log?.warn("vision-handoff: transcription failed", { error: msg }); + return `[Image analysis failed: ${msg}]`; + } + }, + + async transcribeForProvider( + messages: readonly ChatMessage[], + currentModelName: string | undefined, + opts?: { readonly signal?: AbortSignal; readonly logger?: Logger }, + ): Promise<readonly ChatMessage[]> { + // Fast path: no images anywhere → nothing to do. + if (!hasImageChunks(messages)) return messages; + + // If the active model IS vision-capable, pass images through natively. + if (currentModelName !== undefined) { + const capable = await isVisionCapable(currentModelName, await getInfo(currentModelName)); + if (capable) return messages; + } + + // Non-vision model: transcribe each unique image URL once (cached). + const cache = new Map<string, string>(); + const userText = messages + .filter((m) => m.role === "user") + .flatMap((m) => m.chunks) + .filter((c): c is { type: "text"; text: string } => c.type === "text") + .map((c) => c.text) + .join(" "); + + async function transcribeCached(url: string): Promise<string> { + const cached = cache.get(url); + if (cached !== undefined) return cached; + const description = await service.transcribeImage(url, userText, opts); + cache.set(url, description); + return description; + } + + const result: ChatMessage[] = []; + for (const msg of messages) { + if (!msg.chunks.some((c) => c.type === "image")) { + result.push(msg); + continue; + } + // Replace image chunks with transcribed text chunks; keep all else. 
+ const newChunks: Chunk[] = []; + for (const chunk of msg.chunks) { + if (chunk.type === "image") { + const description = await transcribeCached(chunk.url); + newChunks.push({ type: "text", text: description }); + } else { + newChunks.push(chunk); + } + } + result.push({ role: msg.role, chunks: newChunks }); + } + return result; + }, + + async readImageFile( + path: string, + cwd: string | undefined, + opts?: { readonly signal?: AbortSignal; readonly logger?: Logger }, + ): Promise<string> { + const dataUrl = await deps.readFileAsDataUrl(path, cwd); + return service.transcribeImage(dataUrl, undefined, opts); + }, + }; + + return service; +} diff --git a/packages/vision-handoff/src/tool.ts b/packages/vision-handoff/src/tool.ts new file mode 100644 index 0000000..3995598 --- /dev/null +++ b/packages/vision-handoff/src/tool.ts @@ -0,0 +1,68 @@ +/** + * read_image tool — lets any model (vision-capable or not) analyze an image + * FILE on disk by handing it off to a vision-capable model. + * + * The tool reads the image file into a base64 data URL, then asks the vision + * handoff service to transcribe it (via a vision-capable model resolved from + * the catalog) and returns the textual description as the tool result. This is + * the universal mechanism: it works regardless of whether the active model has + * vision, because the result is plain text the model reasons about. + * + * For images PASTED into the chat, the orchestrator's auto-transcription handles + * them (no tool call needed). This tool is for images REFERENCED IN CODE by path + * (e.g. a screenshot, diagram, or mockup the model discovered while reading files). 
+ */ + +import type { ToolContract, ToolExecuteContext, ToolResult } from "@dispatch/kernel"; +import type { VisionHandoffService } from "./service.js"; + +export function createReadImageTool(service: VisionHandoffService): ToolContract { + return { + name: "read_image", + description: + "Read and analyze an image file on disk (PNG, JPEG, WebP, GIF). Returns a " + + "detailed textual description of the image's contents — useful when you " + + "encounter a screenshot, diagram, UI mockup, or chart referenced in the " + + "codebase and need to understand what it shows. The analysis is performed " + + "by a vision-capable model, so you can use this even if you cannot " + + "directly view images. Pass a file path (relative to the cwd or absolute).", + parameters: { + type: "object", + properties: { + path: { + type: "string", + description: + "Path to the image file to analyze. Relative paths resolve against " + + "the conversation's working directory; absolute paths are used as-is.", + }, + }, + required: ["path"], + }, + concurrencySafe: true, + async execute(args: unknown, ctx: ToolExecuteContext): Promise<ToolResult> { + const input = args as { path?: unknown } | null; + const path = input?.path; + if (typeof path !== "string" || path.trim().length === 0) { + return { + content: "Error: 'path' is required and must be a non-empty string.", + isError: true, + }; + } + const span = ctx.log.span("read_image.execute", { path }); + try { + const description = await service.readImageFile(path, ctx.cwd, { + signal: ctx.signal, + logger: ctx.log, + }); + span.end({ attrs: { descriptionLength: description.length } }); + return { content: description }; + } catch (err: unknown) { + span.end({ err }); + return { + content: `Error reading image: ${err instanceof Error ? 
err.message : String(err)}`, + isError: true, + }; + } + }, + }; +} diff --git a/packages/vision-handoff/tsconfig.json b/packages/vision-handoff/tsconfig.json new file mode 100644 index 0000000..ec597fc --- /dev/null +++ b/packages/vision-handoff/tsconfig.json @@ -0,0 +1,11 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { "rootDir": "src", "outDir": "dist", "composite": true }, + "include": ["src/**/*.ts"], + "references": [ + { "path": "../kernel" }, + { "path": "../wire" }, + { "path": "../credential-store" }, + { "path": "../openai-stream" } + ] +} diff --git a/packages/wire/src/index.test.ts b/packages/wire/src/index.test.ts index 3f07e00..81d10c1 100644 --- a/packages/wire/src/index.test.ts +++ b/packages/wire/src/index.test.ts @@ -8,7 +8,7 @@ */ import { describe, expect, it } from "vitest"; -import type { Computer, ComputerEntry, Workspace } from "./index.js"; +import type { Chunk, Computer, ComputerEntry, ImageChunk, ImageInput, Workspace } from "./index.js"; describe("@dispatch/wire — Computer / Workspace shapes", () => { it("a Computer literal satisfies the Computer type", () => { @@ -57,3 +57,32 @@ describe("@dispatch/wire — Computer / Workspace shapes", () => { expect(local.defaultComputerId).toBeNull(); }); }); + +describe("@dispatch/wire — ImageChunk / ImageInput shapes", () => { + it("an ImageChunk carries a data URL and optional mimeType", () => { + const c: ImageChunk = { + type: "image", + url: "data:image/png;base64,iVBORw0KGgo=", + mimeType: "image/png", + }; + expect(c.type).toBe("image"); + expect(c.url).toContain("base64"); + expect(c.mimeType).toBe("image/png"); + }); + + it("an ImageChunk with only a url is valid (mimeType optional)", () => { + const c: ImageChunk = { type: "image", url: "https://example.com/cat.png" }; + expect(c.mimeType).toBeUndefined(); + }); + + it("ImageInput mirrors ImageChunk's url semantics", () => { + const input: ImageInput = { url: "data:image/jpeg;base64,/9j/4AAQ" }; + 
expect(input.url).toContain("jpeg"); + }); + + it("ImageChunk is a member of the Chunk union (assignable)", () => { + const chunk: Chunk = { type: "image", url: "data:image/png;base64,x" }; + // Compile-time proof: an ImageChunk satisfies the Chunk union. + expect(chunk.type).toBe("image"); + }); +}); diff --git a/packages/wire/src/index.ts b/packages/wire/src/index.ts index 16b7023..d6ea1c1 100644 --- a/packages/wire/src/index.ts +++ b/packages/wire/src/index.ts @@ -36,7 +36,8 @@ export type Chunk = | ToolCallChunk | ToolResultChunk | ErrorChunk - | SystemChunk; + | SystemChunk + | ImageChunk; /** A piece of plain text content from the assistant or user. */ export interface TextChunk { @@ -113,6 +114,46 @@ export interface SystemChunk { } /** + * An image attached to a message (e.g. a user-pasted screenshot or pasted + * photo). Carries a `url` that is EITHER a base64 data URL + * (`data:image/png;base64,…`) OR an `http(s)://` URL. Vision-capable models + * receive it natively (the provider serializes it to its image-content + * format); non-vision models never see it directly — the orchestrator's + * **vision handoff** transcribes it to a text description (via a + * vision-capable model) and feeds that text instead, so a text-only model can + * still reason about the image's contents. + * + * When a transcription was performed, it is persisted as a separate `text` + * chunk alongside the `image` chunk in the SAME user message, so the + * description is reused on every later turn (no re-transcription) and a + * client renders both the original image and its textual analysis. + */ +export interface ImageChunk { + readonly type: "image"; + /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */ + readonly url: string; + /** + * Optional MIME type of the image (e.g. `"image/png"`). Inferred from the + * data URL when absent; present so a client can render an icon/label without + * parsing the URL. 
Optional — callers that only have a URL omit it. + */ + readonly mimeType?: string; +} + +/** + * An image a client attaches to a chat message (`ChatRequest.images`). The + * transport-facing input shape; the orchestrator converts each `ImageInput` + * into an `ImageChunk` on the persisted user message. Carries the same `url` + * semantics as `ImageChunk.url`. + */ +export interface ImageInput { + /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */ + readonly url: string; + /** Optional MIME type (e.g. `"image/png"`). Optional — inferred from the data URL when absent. */ + readonly mimeType?: string; +} + +/** * A chat message: a role plus an ordered sequence of chunks. Messages are the * unit passed to and from the provider; chunks are the unit persisted and * rendered. diff --git a/tsconfig.json b/tsconfig.json index e4e833d..fe5ea92 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -41,6 +41,9 @@ "path": "./packages/credential-store" }, { + "path": "./packages/vision-handoff" + }, + { "path": "./packages/exec-backend" }, { |
