summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Adam Malczewski <[email protected]> 2026-06-27 03:40:38 +0900
committer: Adam Malczewski <[email protected]> 2026-06-27 03:40:38 +0900
commit: d5633cf6e007eaf8255a44529a638d2466a74ba3 (patch)
tree: 14fe72f5b585eb72c763073b4e7022b914bdbafb
parent: ad9d135e583c99a0d93327115defa43187cde1c3 (diff)
download: dispatch-d5633cf6e007eaf8255a44529a638d2466a74ba3.tar.gz
dispatch-d5633cf6e007eaf8255a44529a638d2466a74ba3.zip
feat(vision-handoff): implement vision for capable models and universal vision handoff
-rw-r--r-- bun.lock 12
-rw-r--r-- packages/host-bin/package.json 3
-rw-r--r-- packages/host-bin/src/main.ts 8
-rw-r--r-- packages/host-bin/tsconfig.json 3
-rw-r--r-- packages/kernel/src/contracts/conversation.ts 2
-rw-r--r-- packages/kernel/src/contracts/index.ts 2
-rw-r--r-- packages/kernel/src/contracts/provider.ts 10
-rw-r--r-- packages/openai-stream/src/convert-messages.test.ts 94
-rw-r--r-- packages/openai-stream/src/convert-messages.ts 45
-rw-r--r-- packages/openai-stream/src/index.ts 10
-rw-r--r-- packages/openai-stream/src/listModels.test.ts 44
-rw-r--r-- packages/openai-stream/src/listModels.ts 26
-rw-r--r-- packages/session-orchestrator/src/extension.ts 15
-rw-r--r-- packages/session-orchestrator/src/orchestrator.ts 96
-rw-r--r-- packages/session-orchestrator/src/pure.test.ts 33
-rw-r--r-- packages/session-orchestrator/src/pure.ts 32
-rw-r--r-- packages/transport-contract/src/contract.types.test.ts 42
-rw-r--r-- packages/transport-contract/src/index.ts 24
-rw-r--r-- packages/transport-http/src/app.ts 23
-rw-r--r-- packages/transport-http/src/logic.test.ts 63
-rw-r--r-- packages/transport-http/src/logic.ts 34
-rw-r--r-- packages/transport-ws/src/extension.ts 1
-rw-r--r-- packages/transport-ws/src/router.ts 37
-rw-r--r-- packages/vision-handoff/package.json 13
-rw-r--r-- packages/vision-handoff/src/extension.ts 106
-rw-r--r-- packages/vision-handoff/src/index.ts 19
-rw-r--r-- packages/vision-handoff/src/pure.test.ts 141
-rw-r--r-- packages/vision-handoff/src/pure.ts 129
-rw-r--r-- packages/vision-handoff/src/service.test.ts 242
-rw-r--r-- packages/vision-handoff/src/service.ts 281
-rw-r--r-- packages/vision-handoff/src/tool.ts 68
-rw-r--r-- packages/vision-handoff/tsconfig.json 11
-rw-r--r-- packages/wire/src/index.test.ts 31
-rw-r--r-- packages/wire/src/index.ts 43
-rw-r--r-- tsconfig.json 3
35 files changed, 1727 insertions, 19 deletions
diff --git a/bun.lock b/bun.lock
index 2261ba8..8a913d0 100644
--- a/bun.lock
+++ b/bun.lock
@@ -103,6 +103,7 @@
"@dispatch/tool-youtube-transcript": "workspace:*",
"@dispatch/transport-http": "workspace:*",
"@dispatch/transport-ws": "workspace:*",
+ "@dispatch/vision-handoff": "workspace:*",
},
},
"packages/journal-sink": {
@@ -361,6 +362,15 @@
"name": "@dispatch/ui-contract",
"version": "0.2.0",
},
+ "packages/vision-handoff": {
+ "name": "@dispatch/vision-handoff",
+ "version": "0.0.0",
+ "dependencies": {
+ "@dispatch/credential-store": "workspace:*",
+ "@dispatch/kernel": "workspace:*",
+ "@dispatch/openai-stream": "workspace:*",
+ },
+ },
"packages/wire": {
"name": "@dispatch/wire",
"version": "0.12.0",
@@ -461,6 +471,8 @@
"@dispatch/ui-contract": ["@dispatch/ui-contract@workspace:packages/ui-contract"],
+ "@dispatch/vision-handoff": ["@dispatch/vision-handoff@workspace:packages/vision-handoff"],
+
"@dispatch/wire": ["@dispatch/wire@workspace:packages/wire"],
"@esbuild/aix-ppc64": ["@esbuild/[email protected]", "", { "os": "aix", "cpu": "ppc64" }, "sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg=="],
diff --git a/packages/host-bin/package.json b/packages/host-bin/package.json
index 65ea305..b5ab954 100644
--- a/packages/host-bin/package.json
+++ b/packages/host-bin/package.json
@@ -33,6 +33,7 @@
"@dispatch/surface-loaded-extensions": "workspace:*",
"@dispatch/surface-registry": "workspace:*",
"@dispatch/transport-ws": "workspace:*",
- "@dispatch/system-prompt": "workspace:*"
+ "@dispatch/system-prompt": "workspace:*",
+ "@dispatch/vision-handoff": "workspace:*"
}
}
diff --git a/packages/host-bin/src/main.ts b/packages/host-bin/src/main.ts
index 8633052..a5dabab 100644
--- a/packages/host-bin/src/main.ts
+++ b/packages/host-bin/src/main.ts
@@ -43,6 +43,7 @@ import { extension as toolWriteFileExt } from "@dispatch/tool-write-file";
import { extension as toolYoutubeTranscriptExt } from "@dispatch/tool-youtube-transcript";
import { createTransportHttpExtension } from "@dispatch/transport-http";
import { createTransportWsExtension } from "@dispatch/transport-ws";
+import { extension as visionHandoffExt } from "@dispatch/vision-handoff";
import type { ChildHandle } from "./collector-supervisor.js";
import { createCollectorSupervisor } from "./collector-supervisor.js";
import { configMapToAccess, envToConfigMap } from "./config.js";
@@ -204,6 +205,13 @@ async function boot(): Promise<void> {
const extensions: Extension[] = [
...CORE_EXTENSIONS,
createCredentialStoreExtension({ credentials }),
+ // vision-handoff activates AFTER credential-store (it resolves the
+ // credential-store service at activate time to find vision-capable models).
+ // Placed here, not in CORE_EXTENSIONS, so the service is available when it
+ // activates. The session-orchestrator resolves the vision-handoff service
+ // LAZILY (per-turn), so activation order between vision-handoff and
+ // session-orchestrator doesn't matter.
+ visionHandoffExt,
...externalExtensions,
];
diff --git a/packages/host-bin/tsconfig.json b/packages/host-bin/tsconfig.json
index 2b1edf5..305274c 100644
--- a/packages/host-bin/tsconfig.json
+++ b/packages/host-bin/tsconfig.json
@@ -60,6 +60,9 @@
},
{
"path": "../transport-ws"
+ },
+ {
+ "path": "../vision-handoff"
}
]
}
diff --git a/packages/kernel/src/contracts/conversation.ts b/packages/kernel/src/contracts/conversation.ts
index f074c52..80da86e 100644
--- a/packages/kernel/src/contracts/conversation.ts
+++ b/packages/kernel/src/contracts/conversation.ts
@@ -12,6 +12,8 @@ export type {
ConversationMeta,
ConversationStatus,
ErrorChunk,
+ ImageChunk,
+ ImageInput,
Role,
StepId,
StepMetrics,
diff --git a/packages/kernel/src/contracts/index.ts b/packages/kernel/src/contracts/index.ts
index 09e0a56..28e0a0b 100644
--- a/packages/kernel/src/contracts/index.ts
+++ b/packages/kernel/src/contracts/index.ts
@@ -19,6 +19,8 @@ export type {
ConversationMeta,
ConversationStatus,
ErrorChunk,
+ ImageChunk,
+ ImageInput,
Role,
StepId,
StepMetrics,
diff --git a/packages/kernel/src/contracts/provider.ts b/packages/kernel/src/contracts/provider.ts
index b6dc8ca..3137073 100644
--- a/packages/kernel/src/contracts/provider.ts
+++ b/packages/kernel/src/contracts/provider.ts
@@ -114,6 +114,16 @@ export interface ModelInfo {
readonly displayName?: string;
/** The model's max context window in tokens (e.g. 200000). Optional — providers that don't report it leave it undefined. */
readonly contextWindow?: number;
+ /**
+ * Whether this model can natively accept image input (vision/multimodal).
+ * When `true`, image chunks in a user message are passed through to the
+ * provider serialized to its image-content format. When `false`/absent, the
+ * orchestrator's vision handoff transcribes images to text (via a
+ * vision-capable model) before the model sees them. Optional — providers
+ * that cannot detect it leave it undefined (treated as non-vision); a
+ * provider that knows a model is vision-capable sets it `true`.
+ */
+ readonly vision?: boolean;
}
/**
diff --git a/packages/openai-stream/src/convert-messages.test.ts b/packages/openai-stream/src/convert-messages.test.ts
index 3520eb5..57c7d81 100644
--- a/packages/openai-stream/src/convert-messages.test.ts
+++ b/packages/openai-stream/src/convert-messages.test.ts
@@ -35,6 +35,100 @@ describe("convertMessages", () => {
expect(result).toEqual([{ role: "user", content: "Hello, world!" }]);
});
+ it("converts a user message with a text + image chunk to a multimodal content array", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "text", text: "What is in this image?" },
+ { type: "image", url: "data:image/png;base64,iVBORw0KGgo=" },
+ ],
+ },
+ ];
+
+ const result = convertMessages(messages);
+ expect(result).toEqual([
+ {
+ role: "user",
+ content: [
+ { type: "text", text: "What is in this image?" },
+ { type: "image_url", image_url: { url: "data:image/png;base64,iVBORw0KGgo=" } },
+ ],
+ },
+ ]);
+ });
+
+ it("converts an image-only user message (no text) to a content array with just the image", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [{ type: "image", url: "https://example.com/cat.png" }],
+ },
+ ];
+
+ const result = convertMessages(messages);
+ expect(result).toEqual([
+ {
+ role: "user",
+ content: [{ type: "image_url", image_url: { url: "https://example.com/cat.png" } }],
+ },
+ ]);
+ });
+
+ it("converts a user message with multiple images interspersed with text", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "text", text: "Compare these:" },
+ { type: "image", url: "data:image/png;base64,aaa" },
+ { type: "text", text: "and" },
+ { type: "image", url: "data:image/jpeg;base64,bbb" },
+ ],
+ },
+ ];
+
+ const result = convertMessages(messages);
+ expect(result).toHaveLength(1);
+ const content = result[0]?.content;
+ expect(Array.isArray(content)).toBe(true);
+ if (Array.isArray(content)) {
+ expect(content).toHaveLength(4);
+ expect(content[0]).toEqual({ type: "text", text: "Compare these:" });
+ expect(content[1]).toEqual({
+ type: "image_url",
+ image_url: { url: "data:image/png;base64,aaa" },
+ });
+ expect(content[2]).toEqual({ type: "text", text: "and" });
+ expect(content[3]).toEqual({
+ type: "image_url",
+ image_url: { url: "data:image/jpeg;base64,bbb" },
+ });
+ }
+ });
+
+ it("skips empty text parts in a multimodal message but keeps images", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "text", text: "" },
+ { type: "image", url: "data:image/png;base64,x" },
+ ],
+ },
+ ];
+
+ const result = convertMessages(messages);
+ const content = result[0]?.content;
+ expect(Array.isArray(content)).toBe(true);
+ if (Array.isArray(content)) {
+ // Empty text part is dropped; only the image remains.
+ expect(content).toEqual([
+ { type: "image_url", image_url: { url: "data:image/png;base64,x" } },
+ ]);
+ }
+ });
+
it("converts an assistant message with text only", () => {
const messages: ChatMessage[] = [
{
diff --git a/packages/openai-stream/src/convert-messages.ts b/packages/openai-stream/src/convert-messages.ts
index e830243..eba3575 100644
--- a/packages/openai-stream/src/convert-messages.ts
+++ b/packages/openai-stream/src/convert-messages.ts
@@ -1,8 +1,28 @@
import type { ChatMessage, Chunk } from "@dispatch/kernel";
+/** A text part within a multimodal OpenAI content array. */
+export interface OpenAITextPart {
+ readonly type: "text";
+ readonly text: string;
+}
+
+/** An image part within a multimodal OpenAI content array (OpenAI vision format). */
+export interface OpenAIImagePart {
+ readonly type: "image_url";
+ readonly image_url: { readonly url: string };
+}
+
+/**
+ * A part of a multimodal message content array. When a user message contains
+ * at least one image chunk, its content is serialized as an array of these
+ * parts (OpenAI's vision format). Plain-text messages keep a string `content`
+ * for byte-stability with providers that only accept strings.
+ */
+export type OpenAIContentPart = OpenAITextPart | OpenAIImagePart;
+
export interface OpenAIMessage {
readonly role: "system" | "user" | "assistant" | "tool";
- readonly content: string | null;
+ readonly content: string | null | readonly OpenAIContentPart[];
readonly tool_calls?: readonly OpenAIToolCall[];
readonly tool_call_id?: string;
}
@@ -49,6 +69,29 @@ function convertSystemMessage(msg: ChatMessage): OpenAIMessage {
}
function convertUserMessage(msg: ChatMessage): OpenAIMessage {
+ // If the message has image chunks, serialize as a multimodal content array
+ // (OpenAI vision format): text parts + image_url parts in chunk order.
+ // Plain text-only messages keep a string `content` for byte-stability with
+ // providers that only accept a string (and to keep prompt-cache prefixes
+ // unchanged for the common no-image case).
+ const hasImage = msg.chunks.some((c) => c.type === "image");
+ if (hasImage) {
+ const parts: OpenAIContentPart[] = [];
+ for (const chunk of msg.chunks) {
+ if (chunk.type === "text") {
+ if (chunk.text.length > 0) {
+ parts.push({ type: "text", text: chunk.text });
+ }
+ } else if (chunk.type === "image") {
+ parts.push({ type: "image_url", image_url: { url: chunk.url } });
+ }
+ // Non-text/non-image chunks (tool-call, thinking, etc.) are not part of a
+ // user message's provider content and are skipped here.
+ }
+ // hasImage guarantees at least one image part here (even for an image-only message); the "" fallback is purely defensive.
+ return { role: "user", content: parts.length > 0 ? parts : "" };
+ }
+
const text = msg.chunks
.filter((c): c is Extract<Chunk, { type: "text" }> => c.type === "text")
.map((c) => c.text)
diff --git a/packages/openai-stream/src/index.ts b/packages/openai-stream/src/index.ts
index bd2f673..3f76b99 100644
--- a/packages/openai-stream/src/index.ts
+++ b/packages/openai-stream/src/index.ts
@@ -1,8 +1,14 @@
-export type { OpenAIMessage, OpenAIToolCall } from "./convert-messages.js";
+export type {
+ OpenAIContentPart,
+ OpenAIImagePart,
+ OpenAIMessage,
+ OpenAITextPart,
+ OpenAIToolCall,
+} from "./convert-messages.js";
export { convertMessages } from "./convert-messages.js";
export type { OpenAITool } from "./convert-tools.js";
export { convertTools } from "./convert-tools.js";
-export { parseModelList } from "./listModels.js";
+export { isVisionModelId, parseModelList } from "./listModels.js";
export { parseSSELines } from "./parse-sse.js";
export type { CreateOpenAICompatProviderOpts } from "./provider.js";
export { createOpenAICompatProvider } from "./provider.js";
diff --git a/packages/openai-stream/src/listModels.test.ts b/packages/openai-stream/src/listModels.test.ts
index c2438bc..3acf46e 100644
--- a/packages/openai-stream/src/listModels.test.ts
+++ b/packages/openai-stream/src/listModels.test.ts
@@ -1,7 +1,7 @@
import type { ApiKeyCredentials, ModelInfo, ProviderContract } from "@dispatch/kernel";
import type { FetchLike } from "@dispatch/trace-replay";
import { describe, expect, it, vi } from "vitest";
-import { parseModelList } from "./listModels.js";
+import { isVisionModelId, parseModelList } from "./listModels.js";
import { createOpenAICompatProvider } from "./provider.js";
function makeProvider(fetchFn: FetchLike, apiKey = "sk-test-1234567890abcdef"): ProviderContract {
@@ -35,6 +35,48 @@ describe("listModels — pure mapping (parseModelList)", () => {
const result = parseModelList([]);
expect(result).toEqual([]);
});
+
+ it("extracts contextWindow from common field names", () => {
+ const result = parseModelList([
+ { id: "m1", context_length: 128000 },
+ { id: "m2", context_window: 200000 },
+ { id: "m3", max_context_length: 64000 },
+ { id: "m4", max_tokens: 8000 },
+ ]);
+ expect(result).toEqual([
+ { id: "m1", contextWindow: 128000 },
+ { id: "m2", contextWindow: 200000 },
+ { id: "m3", contextWindow: 64000 },
+ { id: "m4", contextWindow: 8000 },
+ ]);
+ });
+});
+
+describe("listModels — vision capability detection", () => {
+ it("isVisionModelId returns true for kimi-family model ids", () => {
+ expect(isVisionModelId("kimi-k2.7")).toBe(true);
+ expect(isVisionModelId("Kimi-K2.7")).toBe(true); // case-insensitive
+ expect(isVisionModelId("moonshot/kimi-k2-thinking")).toBe(true);
+ });
+
+ it("isVisionModelId returns false for non-kimi model ids", () => {
+ expect(isVisionModelId("glm-5.2")).toBe(false);
+ expect(isVisionModelId("deepseek-v4-flash")).toBe(false);
+ expect(isVisionModelId("umans-coder")).toBe(false);
+ });
+
+ it("parseModelList sets vision: true on kimi-family models", () => {
+ const result = parseModelList([
+ { id: "kimi-k2.7", context_length: 200000 },
+ { id: "glm-5.2", context_length: 128000 },
+ { id: "deepseek-v4-flash" },
+ ]);
+ expect(result).toEqual([
+ { id: "kimi-k2.7", contextWindow: 200000, vision: true },
+ { id: "glm-5.2", contextWindow: 128000 },
+ { id: "deepseek-v4-flash" },
+ ]);
+ });
});
describe("listModels — provider contract", () => {
diff --git a/packages/openai-stream/src/listModels.ts b/packages/openai-stream/src/listModels.ts
index 0e94c43..273fee3 100644
--- a/packages/openai-stream/src/listModels.ts
+++ b/packages/openai-stream/src/listModels.ts
@@ -24,17 +24,39 @@ interface OpenAIModelListResponse {
}
/**
+ * Whether a model id is vision-capable (can natively accept image input).
+ *
+ * The OpenAI-compatible `/models` endpoint does not reliably report image
+ * capabilities, so this is a hardcoded heuristic by model id: an id containing
+ * "kimi" in any letter case (e.g. `kimi-k2.7`, `moonshot/kimi-k2.7`) is
+ * vision-capable; all others are treated as non-vision. This is the single
+ * source of truth — the orchestrator's vision handoff and the `read_image`
+ * tool both consult the `ModelInfo.vision` flag this sets, so adding a model
+ * here enables vision everywhere. Pure: id → boolean, no I/O.
+ *
+ * (When an endpoint gains reliable vision reporting, this can be replaced with
+ * a real capability check without changing callers.)
+ */
+export function isVisionModelId(id: string): boolean {
+ const lower = id.toLowerCase();
+ return lower.includes("kimi");
+}
+
+/**
* Pure mapping: raw OpenAI-compatible model list → ModelInfo[].
- * Extracts `contextWindow` from common field names (providers vary).
- * Extracted for direct unit testing with no I/O.
+ * Takes `contextWindow` from the first non-nullish of `context_length`,
+ * `context_window`, `max_context_length`, `max_tokens` (providers vary) and sets
+ * `vision` only when {@link isVisionModelId} matches. Pure, for direct unit tests.
*/
export function parseModelList(data: readonly OpenAIModelEntry[]): readonly ModelInfo[] {
return data.map((entry) => {
const contextWindow =
entry.context_length ?? entry.context_window ?? entry.max_context_length ?? entry.max_tokens;
+ const vision = isVisionModelId(entry.id);
return {
id: entry.id,
...(contextWindow !== undefined ? { contextWindow } : {}),
+ ...(vision ? { vision } : {}),
};
});
}
diff --git a/packages/session-orchestrator/src/extension.ts b/packages/session-orchestrator/src/extension.ts
index 5afffd8..d080e90 100644
--- a/packages/session-orchestrator/src/extension.ts
+++ b/packages/session-orchestrator/src/extension.ts
@@ -11,6 +11,7 @@ import {
createSessionOrchestrator,
createWarmService,
sessionOrchestratorHandle,
+ visionHandoffLocalHandle,
} from "./orchestrator.js";
import { selectFirstProvider } from "./pure.js";
import { filterRemoteIncompatibleTools, toolsFilter } from "./tools-filter.js";
@@ -93,6 +94,20 @@ export function activate(host: HostAPI): void {
return undefined;
}
},
+ resolveVisionHandoff: () => {
+ // Lazily resolve the vision-handoff service. Returns undefined when the
+ // vision-handoff extension isn't loaded (images pass through unchanged —
+ // correct for vision-capable models; the feature degrades off cleanly for
+ // text-only turns). Lazy so activation order doesn't matter; the
+ // activated-manifests guard avoids a getService throw when absent.
+ const loaded = host.getExtensions().some((m) => m.id === "vision-handoff");
+ if (!loaded) return undefined;
+ try {
+ return host.getService(visionHandoffLocalHandle);
+ } catch {
+ return undefined;
+ }
+ },
});
host.provideService(sessionOrchestratorHandle, orchestrator);
diff --git a/packages/session-orchestrator/src/orchestrator.ts b/packages/session-orchestrator/src/orchestrator.ts
index 96cd3a3..ac1eaf4 100644
--- a/packages/session-orchestrator/src/orchestrator.ts
+++ b/packages/session-orchestrator/src/orchestrator.ts
@@ -5,6 +5,7 @@ import type {
CompactionResult,
ConversationStatus,
EventHookDescriptor,
+ ImageInput,
Logger,
ModelInfo,
ProviderContract,
@@ -32,11 +33,52 @@ import {
} from "./pure.js";
import type { ToolAssembly } from "./tools-filter.js";
+// --- Vision handoff (lazy, optional) ---
+
+/**
+ * Minimal contract the vision-handoff service satisfies. Defined here (not
+ * imported from the vision-handoff package) so the orchestrator has NO
+ * compile-time dependency on it — the service is resolved lazily at runtime
+ * (like the message-queue / system-prompt services), and the feature degrades
+ * off cleanly when the extension isn't loaded (images pass through unchanged,
+ * which is correct for vision-capable models and a no-op for text-only turns).
+ *
+ * `transcribeForProvider` transforms a message list for the provider: if the
+ * active model is vision-capable, messages pass through unchanged; otherwise
+ * image chunks are replaced with text descriptions (transcribed via a
+ * vision-capable model). Never throws — degrades to placeholders.
+ */
+export interface VisionHandoffService {
+ readonly transcribeForProvider: (
+ messages: readonly ChatMessage[],
+ currentModelName: string | undefined,
+ opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+ ) => Promise<readonly ChatMessage[]>;
+}
+
+/**
+ * Local handle for the vision-handoff service, keyed by the same ID the
+ * vision-handoff extension registers under (`"vision-handoff/service"`). Defined
+ * locally (not imported) so the orchestrator has no compile-time dependency on
+ * the vision-handoff package — the service is resolved lazily at runtime, and
+ * the feature degrades off cleanly when the extension isn't loaded.
+ */
+export const visionHandoffLocalHandle: ServiceHandle<VisionHandoffService> =
+ defineService<VisionHandoffService>("vision-handoff/service");
+
// --- Broadcast hub types ---
export interface StartTurnInput {
readonly conversationId: string;
readonly text: string;
+ /**
+ * Images attached to this turn (e.g. user-pasted screenshots). Each is
+ * appended as an `image` chunk on the persisted user message. For a
+ * vision-capable model the images pass through to the provider natively; for
+ * a non-vision model the vision handoff transcribes them to text first.
+ * Optional — omit for a text-only turn.
+ */
+ readonly images?: readonly ImageInput[];
readonly modelName?: string;
readonly cwd?: string;
/**
@@ -75,6 +117,12 @@ export type StartTurnResult =
export interface EnqueueInput {
readonly conversationId: string;
readonly text: string;
+ /**
+ * Images attached (the steering / opening message analog of
+ * `StartTurnInput.images`). Threaded to `startTurn` when the conversation is
+ * idle (the message starts a turn). Additive optional.
+ */
+ readonly images?: readonly ImageInput[];
/** Workspace to stamp on a new conversation. Defaults to `"default"`. */
readonly workspaceId?: string;
/**
@@ -289,6 +337,8 @@ export interface SessionOrchestrator {
workspaceId?: string;
/** Explicit system-prompt override — see {@link StartTurnInput.systemPrompt}. */
systemPrompt?: string;
+ /** Images attached to this turn — see {@link StartTurnInput.images}. */
+ images?: readonly ImageInput[];
}): Promise<void>;
}
@@ -335,6 +385,17 @@ export interface SessionOrchestratorDeps {
* order doesn't matter.
*/
readonly resolveSystemPrompt?: () => SystemPromptService | undefined;
+ /**
+ * Lazily resolves the vision-handoff service, or `undefined` when the
+ * vision-handoff extension isn't loaded. Used to transcribe image chunks to
+ * text for non-vision models before they reach the provider (so a text-only
+ * model can still reason about pasted/code images). When `undefined`, images
+ * pass through unchanged (correct for vision-capable models; a text-only model
+ * would then receive image content its API may reject — the feature degrades
+ * off cleanly for text-only turns since there are no images). Lazy so
+ * activation order doesn't matter; called per-turn.
+ */
+ readonly resolveVisionHandoff?: () => VisionHandoffService | undefined;
/** Apply the per-turn tools filter chain. Injected for testability. */
readonly applyToolsFilter: (assembly: ToolAssembly) => Promise<ToolAssembly>;
/** Base logger (auto-scoped to this extension); childed per turn for span capture. */
@@ -437,6 +498,7 @@ export function createSessionOrchestrator(
reasoningEffortOverride: ReasoningEffort | undefined,
workspaceId: string,
systemPromptOverride: string | undefined,
+ images: readonly ImageInput[] | undefined,
): void {
const turnId = generateTurnId();
const controller = new AbortController();
@@ -558,7 +620,7 @@ export function createSessionOrchestrator(
const effectiveModelName = resolveModelName(modelName, storedModel);
const history = await deps.conversationStore.load(conversationId);
- const userMsg = buildUserMessage(text);
+ const userMsg = buildUserMessage(text, images);
// Workspace assignment for new conversations happens BEFORE
// effective-cwd resolution (see workspaceSetupPromise above) so
@@ -697,9 +759,32 @@ export function createSessionOrchestrator(
return [{ role: "user", chunks: [{ type: "text", text: steerText }] }];
};
+ // Vision handoff: transform the message list for the provider. When the
+ // active model is vision-capable, images pass through natively (no-op).
+ // When it is NOT vision-capable, image chunks are transcribed to text
+ // descriptions via a vision-capable model — so a text-only model can
+ // still reason about images. The PERSISTED user message keeps the
+ // original image chunks (appended below); only the provider's view is
+ // transcribed. When the vision-handoff service isn't loaded, images pass
+ // through unchanged (correct for vision models; text-only models would
+ // then receive image content their API may reject — degrades off cleanly
+ // for text-only turns with no images).
+ const visionHandoff = deps.resolveVisionHandoff?.();
+ let providerMessages: readonly ChatMessage[] = [...history, userMsg];
+ if (visionHandoff !== undefined) {
+ providerMessages = await visionHandoff.transcribeForProvider(
+ providerMessages,
+ effectiveModelName,
+ {
+ signal: controller.signal,
+ ...(turnLogger !== undefined ? { logger: turnLogger } : {}),
+ },
+ );
+ }
+
const opts: RunTurnInput = {
provider,
- messages: [...history, userMsg],
+ messages: providerMessages,
tools: assembled.tools,
dispatch,
emit: emitAndAccumulate,
@@ -805,6 +890,7 @@ export function createSessionOrchestrator(
reasoningEffort,
workspaceId,
systemPrompt,
+ images,
}) {
if (activeTurns.has(conversationId)) {
return { started: false, reason: "already-active" };
@@ -818,18 +904,20 @@ export function createSessionOrchestrator(
reasoningEffort,
workspaceId ?? "default",
systemPrompt,
+ images,
);
const turn = activeTurns.get(conversationId);
const turnId = turn !== undefined ? turn.turnId : "";
return { started: true, turnId };
},
- enqueue({ conversationId, text, workspaceId, computerId }) {
+ enqueue({ conversationId, text, workspaceId, computerId, images }) {
const result = orchestrator.startTurn({
conversationId,
text,
...(workspaceId !== undefined ? { workspaceId } : {}),
...(computerId !== undefined ? { computerId } : {}),
+ ...(images !== undefined ? { images } : {}),
});
if (result.started) {
return { startedTurn: true, queue: [] };
@@ -914,6 +1002,7 @@ export function createSessionOrchestrator(
reasoningEffort,
workspaceId,
systemPrompt,
+ images,
}) {
const turnInput: StartTurnInput = {
conversationId,
@@ -924,6 +1013,7 @@ export function createSessionOrchestrator(
...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
...(workspaceId !== undefined ? { workspaceId } : {}),
...(systemPrompt !== undefined ? { systemPrompt } : {}),
+ ...(images !== undefined ? { images } : {}),
};
const result = orchestrator.startTurn(turnInput);
if (!result.started) {
diff --git a/packages/session-orchestrator/src/pure.test.ts b/packages/session-orchestrator/src/pure.test.ts
index c75cb82..7a574f1 100644
--- a/packages/session-orchestrator/src/pure.test.ts
+++ b/packages/session-orchestrator/src/pure.test.ts
@@ -26,6 +26,39 @@ describe("buildUserMessage", () => {
expect(msg.role).toBe("user");
expect(msg.chunks[0]).toEqual({ type: "text", text: "" });
});
+
+ it("appends image chunks after the text chunk when images are given", () => {
+ const msg = buildUserMessage("look at this", [
+ { url: "data:image/png;base64,aaa" },
+ { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" },
+ ]);
+ expect(msg.chunks).toHaveLength(3);
+ expect(msg.chunks[0]).toEqual({ type: "text", text: "look at this" });
+ expect(msg.chunks[1]).toEqual({ type: "image", url: "data:image/png;base64,aaa" });
+ expect(msg.chunks[2]).toEqual({
+ type: "image",
+ url: "data:image/jpeg;base64,bbb",
+ mimeType: "image/jpeg",
+ });
+ });
+
+ it("builds an image-only message when text is empty", () => {
+ const msg = buildUserMessage("", [{ url: "data:image/png;base64,zzz" }]);
+ expect(msg.chunks).toHaveLength(1);
+ expect(msg.chunks[0]).toEqual({ type: "image", url: "data:image/png;base64,zzz" });
+ });
+
+ it("includes mimeType when provided", () => {
+ const msg = buildUserMessage("hi", [
+ { url: "data:image/webp;base64,x", mimeType: "image/webp" },
+ ]);
+ expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBe("image/webp");
+ });
+
+ it("omits mimeType when not provided", () => {
+ const msg = buildUserMessage("hi", [{ url: "https://example.com/x.png" }]);
+ expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBeUndefined();
+ });
});
describe("selectFirstProvider", () => {
diff --git a/packages/session-orchestrator/src/pure.ts b/packages/session-orchestrator/src/pure.ts
index 2208e8f..0d2068f 100644
--- a/packages/session-orchestrator/src/pure.ts
+++ b/packages/session-orchestrator/src/pure.ts
@@ -1,12 +1,40 @@
import type {
ChatMessage,
+ Chunk,
+ ImageInput,
ProviderContract,
ReasoningEffort,
ToolDispatchPolicy,
} from "@dispatch/kernel";
-export function buildUserMessage(text: string): ChatMessage {
- return { role: "user", chunks: [{ type: "text", text }] };
+/**
+ * Build the persisted user message for a turn: the non-empty `text` chunk first,
+ * then one `image` chunk per entry in `images` (vision-capable providers receive
+ * them natively; non-vision providers have them transcribed by the vision
+ * handoff before streaming). Empty `text` with images yields an image-only
+ * message; with neither text nor images, a single empty text chunk is kept.
+ *
+ * Pure: inputs → a ChatMessage, no I/O.
+ */
+export function buildUserMessage(text: string, images?: readonly ImageInput[]): ChatMessage {
+ const chunks: Chunk[] = [];
+ if (text.length > 0) {
+ chunks.push({ type: "text", text });
+ }
+ if (images !== undefined) {
+ for (const img of images) {
+ chunks.push({
+ type: "image",
+ url: img.url,
+ ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}),
+ });
+ }
+ }
+ // Fallback for no text AND no images: keep one empty text chunk so the message is never chunk-less.
+ if (chunks.length === 0) {
+ chunks.push({ type: "text", text: "" });
+ }
+ return { role: "user", chunks };
}
// ── Provider-error retry backoff schedule ───────────────────────────────────
diff --git a/packages/transport-contract/src/contract.types.test.ts b/packages/transport-contract/src/contract.types.test.ts
index 9d3d904..34ff544 100644
--- a/packages/transport-contract/src/contract.types.test.ts
+++ b/packages/transport-contract/src/contract.types.test.ts
@@ -20,6 +20,7 @@ import type {
LspServerState,
LspStatusResponse,
McpStatusResponse,
+ ModelsResponse,
SetConversationComputerRequest,
SetCwdRequest,
SetWorkspaceDefaultComputerRequest,
@@ -55,6 +56,18 @@ const _chatWithoutComputer: ChatRequest = {
message: "hello",
};
+// ─── ChatRequest.images (additive optional) ──────────────────────────────────
+
+const _chatWithImages: ChatRequest = {
+ message: "What's in this screenshot?",
+ images: [{ url: "data:image/png;base64,iVBORw0KGgo=", mimeType: "image/png" }],
+};
+
+const _chatWithHttpImage: ChatRequest = {
+ message: "analyze this",
+ images: [{ url: "https://example.com/diagram.png" }],
+};
+
// ─── Computer list / single response ─────────────────────────────────────────
const _computer: Computer = {
@@ -255,6 +268,35 @@ describe("transport-contract types compile and are exported", () => {
expect(_chatWithComputer.computerId).toBe("prod-box");
});
+ // ─── ChatRequest.images (additive optional) ──────────────────────────────
+
+ it("ChatRequest: images is additive optional (omittable)", () => {
+ expect(_chatWithoutComputer.images).toBeUndefined();
+ });
+
+ it("ChatRequest: carries images (data URL) when set", () => {
+ expect(_chatWithImages.images).toHaveLength(1);
+ expect(_chatWithImages.images?.[0]?.url).toContain("base64");
+ expect(_chatWithImages.images?.[0]?.mimeType).toBe("image/png");
+ });
+
+ it("ChatRequest: carries images (http URL, mimeType optional)", () => {
+ expect(_chatWithHttpImage.images?.[0]?.url).toBe("https://example.com/diagram.png");
+ expect(_chatWithHttpImage.images?.[0]?.mimeType).toBeUndefined();
+ });
+
+ it("ModelsResponse: ModelMetadata carries optional vision flag", () => {
+ const resp: ModelsResponse = {
+ models: ["umans/kimi-k2.7", "umans/glm-5.2"],
+ modelInfo: {
+ "umans/kimi-k2.7": { contextWindow: 200000, vision: true },
+ "umans/glm-5.2": { contextWindow: 128000 },
+ },
+ };
+ expect(resp.modelInfo?.["umans/kimi-k2.7"]?.vision).toBe(true);
+ expect(resp.modelInfo?.["umans/glm-5.2"]?.vision).toBeUndefined();
+ });
+
// ─── Computers ───────────────────────────────────────────────────────────
it("ComputerListResponse: carries entries with usage counts", () => {
diff --git a/packages/transport-contract/src/index.ts b/packages/transport-contract/src/index.ts
index 6a9a29f..0444f29 100644
--- a/packages/transport-contract/src/index.ts
+++ b/packages/transport-contract/src/index.ts
@@ -26,6 +26,7 @@ import type {
ComputerEntry,
ConversationMeta,
ConversationStatus,
+ ImageInput,
QueuedMessage,
ReasoningEffort,
StoredChunk,
@@ -41,6 +42,8 @@ export type {
ComputerEntry,
ConversationMeta,
ConversationStatus,
+ ImageChunk,
+ ImageInput,
QueuedMessage,
ReasoningEffort,
StepMetrics,
@@ -68,6 +71,19 @@ export interface ChatRequest {
readonly message: string;
/**
+ * Images attached to this turn (e.g. a user-pasted screenshot). Each entry's
+ * `url` is a base64 data URL (`data:image/…;base64,…`) or an `http(s)://`
+ * URL. The server converts these to `image` chunks on the persisted user
+ * message. For a VISION-capable model (e.g. kimi), the images are passed
+ * through to the provider natively. For a NON-vision model (e.g. glm-5.2),
+ * the server's vision handoff transcribes each image to a text description
+ * (via a vision-capable model) and feeds that text instead — so a text-only
+ * model can still reason about the image's contents. Optional — omit for a
+ * text-only turn (backward compatible).
+ */
+ readonly images?: readonly ImageInput[];
+
+ /**
* The model to use, as a model name in `<credentialName>/<model>` form — one
* of the exact strings returned by `GET /models`. Omit to use the server's
* default credential + model.
@@ -124,6 +140,14 @@ export interface ModelsResponse {
/** Per-model metadata returned alongside the model catalog. */
export interface ModelMetadata {
readonly contextWindow?: number;
+ /**
+ * Whether this model can natively accept image input (vision/multimodal).
+ * When `true`, image chunks in a user message are passed through to the
+ * provider. When `false`/absent, the server's vision handoff transcribes
+ * images to text before the model sees them. A client may use this to show a
+ * vision badge in the model picker. Optional — absent when unknown.
+ */
+ readonly vision?: boolean;
}
/**
diff --git a/packages/transport-http/src/app.ts b/packages/transport-http/src/app.ts
index 4fb295e..a9a23da 100644
--- a/packages/transport-http/src/app.ts
+++ b/packages/transport-http/src/app.ts
@@ -294,11 +294,14 @@ export function createApp(opts: CreateServerOptions): Hono {
app.get("/models", async (c) => {
try {
const models = await opts.credentialStore.listCatalog();
- const modelInfo: Record<string, { contextWindow?: number }> = {};
+ const modelInfo: Record<string, { contextWindow?: number; vision?: boolean }> = {};
for (const modelName of models) {
const info = await opts.credentialStore.getModelInfo(modelName);
- if (info?.contextWindow !== undefined) {
- modelInfo[modelName] = { contextWindow: info.contextWindow };
+ if (info?.contextWindow !== undefined || info?.vision === true) {
+ const entry: { contextWindow?: number; vision?: boolean } = {};
+ if (info?.contextWindow !== undefined) entry.contextWindow = info.contextWindow;
+ if (info?.vision === true) entry.vision = true;
+ modelInfo[modelName] = entry;
}
}
const body: ModelsResponse = {
@@ -398,8 +401,16 @@ export function createApp(opts: CreateServerOptions): Hono {
return c.json({ error: result.error }, 400);
}
- const { conversationId, message, model, cwd, computerId, reasoningEffort, workspaceId } =
- result;
+ const {
+ conversationId,
+ message,
+ model,
+ cwd,
+ computerId,
+ reasoningEffort,
+ workspaceId,
+ images,
+ } = result;
log.info("chat: request accepted", {
conversationId,
hasModel: model !== undefined,
@@ -407,6 +418,7 @@ export function createApp(opts: CreateServerOptions): Hono {
hasComputerId: computerId !== undefined,
hasReasoningEffort: reasoningEffort !== undefined,
hasWorkspaceId: workspaceId !== undefined,
+ imageCount: images?.length ?? 0,
});
const events: AgentEvent[] = [];
@@ -457,6 +469,7 @@ export function createApp(opts: CreateServerOptions): Hono {
...(computerId !== undefined ? { computerId } : {}),
...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
...(workspaceId !== undefined ? { workspaceId } : {}),
+ ...(images !== undefined ? { images } : {}),
};
opts.orchestrator
diff --git a/packages/transport-http/src/logic.test.ts b/packages/transport-http/src/logic.test.ts
index fc8302e..67632f3 100644
--- a/packages/transport-http/src/logic.test.ts
+++ b/packages/transport-http/src/logic.test.ts
@@ -182,6 +182,69 @@ describe("parseChatBody", () => {
expect(result.reasoningEffort).toBeUndefined();
}
});
+
+ // ── images ──────────────────────────────────────────────────────────────
+
+ it("parses images array with data URLs", () => {
+ const result = parseChatBody(
+ {
+ message: "what is this?",
+ images: [
+ { url: "data:image/png;base64,aaa" },
+ { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" },
+ ],
+ },
+ fakeId,
+ );
+ expect(isParseError(result)).toBe(false);
+ if (!isParseError(result)) {
+ expect(result.images).toHaveLength(2);
+ expect(result.images?.[0]?.url).toBe("data:image/png;base64,aaa");
+ expect(result.images?.[1]?.mimeType).toBe("image/jpeg");
+ }
+ });
+
+ it("parses images with http URLs", () => {
+ const result = parseChatBody(
+ { message: "hi", images: [{ url: "https://example.com/x.png" }] },
+ fakeId,
+ );
+ expect(isParseError(result)).toBe(false);
+ if (!isParseError(result)) {
+ expect(result.images?.[0]?.url).toBe("https://example.com/x.png");
+ }
+ });
+
+ it("returns error when images is not an array", () => {
+ const result = parseChatBody({ message: "hi", images: "not-an-array" }, fakeId);
+ expect(isParseError(result)).toBe(true);
+ });
+
+ it("returns error when an image lacks a url", () => {
+ const result = parseChatBody({ message: "hi", images: [{ mimeType: "image/png" }] }, fakeId);
+ expect(isParseError(result)).toBe(true);
+ });
+
+ it("returns error when an image url is empty", () => {
+ const result = parseChatBody({ message: "hi", images: [{ url: "" }] }, fakeId);
+ expect(isParseError(result)).toBe(true);
+ });
+
+ it("omits images when absent (backward compatible)", () => {
+ const result = parseChatBody({ message: "hi" }, fakeId);
+ expect(isParseError(result)).toBe(false);
+ if (!isParseError(result)) {
+ expect(result.images).toBeUndefined();
+ }
+ });
+
+ it("omits images when the array is empty", () => {
+ const result = parseChatBody({ message: "hi", images: [] }, fakeId);
+ expect(isParseError(result)).toBe(false);
+ if (!isParseError(result)) {
+ expect(result.images).toBeUndefined();
+ }
+ });
});
describe("parseSinceSeq", () => {
diff --git a/packages/transport-http/src/logic.ts b/packages/transport-http/src/logic.ts
index 97ad426..a928147 100644
--- a/packages/transport-http/src/logic.ts
+++ b/packages/transport-http/src/logic.ts
@@ -55,6 +55,13 @@ export interface ChatCommand {
readonly computerId?: string;
readonly reasoningEffort?: ReasoningEffort;
readonly workspaceId?: string;
+ /**
+ * Images attached to this turn (data URLs or http URLs). Parsed from the
+ * `ChatRequest.images` field; forwarded to the orchestrator which converts
+ * them to `image` chunks on the user message. Each entry must have a non-empty
+ * string `url`; `mimeType` is optional.
+ */
+ readonly images?: readonly { readonly url: string; readonly mimeType?: string }[];
}
export interface ParseError {
@@ -121,6 +128,33 @@ export function parseChatBody(body: unknown, generateId: () => string): ParseRes
(result as { workspaceId?: string }).workspaceId = obj.workspaceId;
}
+ if (obj.images !== undefined) {
+ if (!Array.isArray(obj.images)) {
+ return { error: "Field 'images' must be an array" };
+ }
+ const images: { url: string; mimeType?: string }[] = [];
+ for (const entry of obj.images) {
+ if (entry === null || typeof entry !== "object") {
+ return { error: "Each image must be an object with a 'url' string" };
+ }
+ const img = entry as { url?: unknown; mimeType?: unknown };
+ if (typeof img.url !== "string" || img.url.length === 0) {
+ return { error: "Each image must have a non-empty string 'url'" };
+ }
+ const parsed: { url: string; mimeType?: string } = { url: img.url };
+ if (img.mimeType !== undefined) {
+ if (typeof img.mimeType !== "string") {
+ return { error: "Field 'mimeType' on an image must be a string" };
+ }
+ parsed.mimeType = img.mimeType;
+ }
+ images.push(parsed);
+ }
+ if (images.length > 0) {
+ (result as { images?: readonly { url: string; mimeType?: string }[] }).images = images;
+ }
+ }
+
return result;
}
diff --git a/packages/transport-ws/src/extension.ts b/packages/transport-ws/src/extension.ts
index 3811ed7..d26712b 100644
--- a/packages/transport-ws/src/extension.ts
+++ b/packages/transport-ws/src/extension.ts
@@ -291,6 +291,7 @@ export function createTransportWsExtension(): Extension {
: {}),
...(result.workspaceId !== undefined ? { workspaceId: result.workspaceId } : {}),
...(result.computerId !== undefined ? { computerId: result.computerId } : {}),
+ ...(result.images !== undefined ? { images: result.images } : {}),
});
if (!startResult.started) {
send(ws, {
diff --git a/packages/transport-ws/src/router.ts b/packages/transport-ws/src/router.ts
index a33aa5a..0caf305 100644
--- a/packages/transport-ws/src/router.ts
+++ b/packages/transport-ws/src/router.ts
@@ -58,6 +58,12 @@ export interface ChatRouteResult {
* conversation → workspace → local chain).
*/
readonly computerId?: string;
+ /**
+ * Images attached to this turn (data URLs or http URLs), forwarded verbatim to
+ * the orchestrator. Absent when the client omits it. Each entry must have a
+ * non-empty string `url`; `mimeType` is optional.
+ */
+ readonly images?: readonly { readonly url: string; readonly mimeType?: string }[];
}
/** A malformed chat.send that should yield a chat.error reply. */
@@ -174,6 +180,36 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError
errorMessage: `chat.send: invalid reasoningEffort "${msg.reasoningEffort}" — must be one of: low, medium, high, xhigh, max`,
};
}
+ // Validate images (if present): each must be an object with a non-empty url.
+ let images: readonly { url: string; mimeType?: string }[] | undefined;
+ if (msg.images !== undefined) {
+ if (!Array.isArray(msg.images)) {
+ return {
+ kind: "chat-error",
+ conversationId: msg.conversationId,
+ errorMessage: "chat.send: 'images' must be an array",
+ };
+ }
+ const parsed: { url: string; mimeType?: string }[] = [];
+ for (const entry of msg.images) {
+ if (
+ entry === null ||
+ typeof entry !== "object" ||
+ typeof entry.url !== "string" ||
+ entry.url.length === 0
+ ) {
+ return {
+ kind: "chat-error",
+ conversationId: msg.conversationId,
+ errorMessage: "chat.send: each image must have a non-empty string 'url'",
+ };
+ }
+ const p: { url: string; mimeType?: string } = { url: entry.url };
+ if (entry.mimeType !== undefined) p.mimeType = entry.mimeType;
+ parsed.push(p);
+ }
+ if (parsed.length > 0) images = parsed;
+ }
return {
kind: "chat",
conversationId: msg.conversationId,
@@ -183,6 +219,7 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError
...(msg.reasoningEffort !== undefined ? { reasoningEffort: msg.reasoningEffort } : {}),
...(msg.workspaceId !== undefined ? { workspaceId: msg.workspaceId } : {}),
...(msg.computerId !== undefined ? { computerId: msg.computerId } : {}),
+ ...(images !== undefined ? { images } : {}),
};
}
diff --git a/packages/vision-handoff/package.json b/packages/vision-handoff/package.json
new file mode 100644
index 0000000..a88ab49
--- /dev/null
+++ b/packages/vision-handoff/package.json
@@ -0,0 +1,13 @@
+{
+ "name": "@dispatch/vision-handoff",
+ "version": "0.0.0",
+ "type": "module",
+ "private": true,
+ "main": "dist/index.js",
+ "types": "dist/index.d.ts",
+ "dependencies": {
+ "@dispatch/credential-store": "workspace:*",
+ "@dispatch/kernel": "workspace:*",
+ "@dispatch/openai-stream": "workspace:*"
+ }
+}
diff --git a/packages/vision-handoff/src/extension.ts b/packages/vision-handoff/src/extension.ts
new file mode 100644
index 0000000..aa745b7
--- /dev/null
+++ b/packages/vision-handoff/src/extension.ts
@@ -0,0 +1,106 @@
+/**
+ * vision-handoff extension — registers the universal vision handoff service +
+ * the `read_image` tool.
+ *
+ * The service performs provider-agnostic vision handoff: it resolves a
+ * vision-capable model from the catalog (any provider), streams an image to it
+ * via the standard `ProviderContract.stream` interface, and folds the textual
+ * description back — so a non-vision model (e.g. glm-5.2) can still reason about
+ * images, and any model can analyze image FILES referenced in code.
+ *
+ * Effects (filesystem, fetch) live here in the shell, injected into the service.
+ * The pure decisions live in `pure.ts`. No `console.*`; logging via `host.logger`.
+ */
+
+import { readFile } from "node:fs/promises";
+import { extname, isAbsolute, resolve as pathResolve } from "node:path";
+import type { CredentialStore } from "@dispatch/credential-store";
+import { credentialStoreHandle } from "@dispatch/credential-store";
+import type { Extension, HostAPI, Manifest } from "@dispatch/kernel";
+import { createVisionHandoffService, visionHandoffHandle } from "./service.js";
+import { createReadImageTool } from "./tool.js";
+
+export const manifest: Manifest = {
+ id: "vision-handoff",
+ name: "Vision Handoff",
+ version: "0.0.0",
+ apiVersion: "^0.1.0",
+ trust: "bundled",
+ activation: "eager",
+ capabilities: { network: true },
+ contributes: { services: ["vision-handoff/service"], tools: ["read_image"] },
+};
+
+/**
+ * MIME types for recognized image extensions, keyed by the lowercase file
+ * extension including its leading dot (the form `extname(...).toLowerCase()`
+ * produces). Extensions not listed here fall back to `image/png` in
+ * `readFileAsDataUrl`.
+ */
+const MIME_BY_EXT: Readonly<Record<string, string>> = {
+ ".png": "image/png",
+ ".jpg": "image/jpeg",
+ ".jpeg": "image/jpeg",
+ ".webp": "image/webp",
+ ".gif": "image/gif",
+ ".bmp": "image/bmp",
+};
+
+/**
+ * Read an image file from disk as a base64 data URL. The shell edge — real
+ * `node:fs`.
+ *
+ * A relative `path` is resolved against `cwd` (the conversation's working
+ * directory) when one is given; otherwise `path` goes through `path.resolve`
+ * on its own. The MIME type is chosen from the file extension via
+ * `MIME_BY_EXT`, falling back to `image/png` for any unrecognized extension.
+ * The file's contents are not inspected, so a non-image file is encoded as-is
+ * and labelled with whatever MIME type its extension maps to.
+ *
+ * @param path Absolute or relative path of the file to read.
+ * @param cwd Optional base directory used only when `path` is relative.
+ * @returns A `data:<mime>;base64,<payload>` URL holding the file's bytes.
+ * @throws Whatever `readFile` rejects with (missing file / read error); the
+ * caller surfaces it.
+ */
+async function readFileAsDataUrl(path: string, cwd?: string): Promise<string> {
+ const abs = cwd !== undefined && !isAbsolute(path) ? pathResolve(cwd, path) : pathResolve(path);
+ const buf = await readFile(abs);
+ const ext = extname(abs).toLowerCase();
+ const mime = MIME_BY_EXT[ext] ?? "image/png";
+ return `data:${mime};base64,${buf.toString("base64")}`;
+}
+
+/**
+ * Fetch an HTTP(S) image URL and convert it to a base64 data URL (so it can be
+ * sent to the vision model inline, regardless of whether the provider can fetch
+ * remote URLs). The shell edge — real `globalThis.fetch`.
+ *
+ * @param url The remote image URL to download.
+ * @returns A `data:<mime>;base64,<payload>` URL holding the response body.
+ * @throws Error when the response status is not OK; failures raised by `fetch`
+ * itself propagate unchanged.
+ */
+async function fetchUrlAsDataUrl(url: string): Promise<string> {
+  // NOTE(review): `url` is fetched exactly as given — no timeout, no size cap,
+  // no host allow-list. Confirm that is acceptable for client-supplied URLs.
+  const res = await fetch(url);
+  if (!res.ok) {
+    throw new Error(`Failed to fetch image: HTTP ${res.status}`);
+  }
+  // Keep only the media type: a Content-Type header may carry parameters
+  // (e.g. `image/png; charset=binary`) that do not belong in the data URL
+  // prefix. An absent or empty header falls back to `image/png`.
+  const mediaType = (res.headers.get("content-type") ?? "").split(";")[0]?.trim() ?? "";
+  const mime = mediaType.length > 0 ? mediaType : "image/png";
+  // Encode via Buffer, the same base64 path readFileAsDataUrl relies on,
+  // instead of building a one-char-per-byte string for btoa.
+  const base64 = Buffer.from(await res.arrayBuffer()).toString("base64");
+  return `data:${mime};base64,${base64}`;
+}
+
+export async function activate(host: HostAPI): Promise<void> {
+ const credentialStore = host.getService(credentialStoreHandle) as CredentialStore | undefined;
+ if (credentialStore === undefined) {
+ host.logger.warn(
+ "vision-handoff: credential-store service not available. The read_image tool and image transcription are disabled.",
+ );
+ return;
+ }
+
+ const resolveModel = (modelName: string) => {
+ const resolved = credentialStore.resolve(modelName);
+ if (resolved === undefined) return undefined;
+ const provider = host.getProviders().get(resolved.providerId);
+ if (provider === undefined) return undefined;
+ return { provider, model: resolved.model };
+ };
+
+ const service = createVisionHandoffService({
+ credentialStore,
+ resolveModel,
+ readFileAsDataUrl,
+ fetchUrlAsDataUrl,
+ logger: host.logger.child({ extensionId: "vision-handoff" }),
+ });
+
+ host.provideService(visionHandoffHandle, service);
+ host.defineTool(createReadImageTool(service));
+ host.logger.info("vision-handoff: registered (read_image tool + transcription service)");
+}
+
+export const extension: Extension = { manifest, activate };
diff --git a/packages/vision-handoff/src/index.ts b/packages/vision-handoff/src/index.ts
new file mode 100644
index 0000000..4a13e65
--- /dev/null
+++ b/packages/vision-handoff/src/index.ts
@@ -0,0 +1,19 @@
+export { extension, manifest } from "./extension.js";
+export {
+ buildTranscriptionPrompt,
+ collectTextFromStream,
+ findVisionModelName,
+ formatNoVisionPlaceholder,
+ formatTranscriptionText,
+ isVisionCapable,
+} from "./pure.js";
+export type {
+ ResolvedVisionModel,
+ VisionHandoffDeps,
+ VisionHandoffService,
+} from "./service.js";
+export {
+ createVisionHandoffService,
+ visionHandoffHandle,
+} from "./service.js";
+export { createReadImageTool } from "./tool.js";
diff --git a/packages/vision-handoff/src/pure.test.ts b/packages/vision-handoff/src/pure.test.ts
new file mode 100644
index 0000000..89dac72
--- /dev/null
+++ b/packages/vision-handoff/src/pure.test.ts
@@ -0,0 +1,141 @@
+import type { ModelInfo, ProviderEvent } from "@dispatch/kernel";
+import { describe, expect, it } from "vitest";
+import {
+ buildTranscriptionPrompt,
+ collectTextFromStream,
+ findVisionModelName,
+ formatNoVisionPlaceholder,
+ formatTranscriptionText,
+ isVisionCapable,
+} from "./pure.js";
+
+describe("isVisionCapable", () => {
+ it("returns true when ModelInfo.vision is true", () => {
+ expect(isVisionCapable("umans/kimi-k2.7", { id: "kimi-k2.7", vision: true })).toBe(true);
+ });
+
+ it("returns false when ModelInfo.vision is false (overrides name heuristic)", () => {
+ expect(isVisionCapable("umans/kimi-k2.7", { id: "kimi-k2.7", vision: false })).toBe(false);
+ });
+
+ it("falls back to name heuristic when vision is absent (kimi)", () => {
+ expect(isVisionCapable("umans/kimi-k2.7", undefined)).toBe(true);
+ expect(isVisionCapable("umans/Kimi-K2.7", undefined)).toBe(true); // case-insensitive
+ });
+
+ it("falls back to name heuristic when vision is absent (non-kimi)", () => {
+ expect(isVisionCapable("umans/glm-5.2", undefined)).toBe(false);
+ expect(isVisionCapable("umans/deepseek-v4-flash", { id: "deepseek-v4-flash" })).toBe(false);
+ });
+
+ it("returns false for undefined model name", () => {
+ expect(isVisionCapable(undefined, undefined)).toBe(false);
+ });
+});
+
+describe("findVisionModelName", () => {
+ const getInfo = async (name: string): Promise<ModelInfo | undefined> => {
+ const map: Record<string, ModelInfo> = {
+ "umans/kimi-k2.7": { id: "kimi-k2.7", vision: true },
+ "umans/glm-5.2": { id: "glm-5.2" },
+ "umans/llama-vision": { id: "llama-vision", vision: true },
+ };
+ return map[name];
+ };
+
+ it("finds the first kimi-family model via name heuristic (no async lookup needed)", async () => {
+ const name = await findVisionModelName(
+ ["umans/glm-5.2", "umans/kimi-k2.7", "umans/llama-vision"],
+ getInfo,
+ );
+ expect(name).toBe("umans/kimi-k2.7");
+ });
+
+ it("finds a vision model via ModelInfo.vision when name heuristic misses", async () => {
+ const name = await findVisionModelName(["umans/glm-5.2", "umans/llama-vision"], getInfo);
+ expect(name).toBe("umans/llama-vision");
+ });
+
+ it("skips the excluded model", async () => {
+ const name = await findVisionModelName(
+ ["umans/kimi-k2.7", "umans/llama-vision"],
+ getInfo,
+ "umans/kimi-k2.7",
+ );
+ expect(name).toBe("umans/llama-vision");
+ });
+
+ it("returns undefined when no vision model is available", async () => {
+ const name = await findVisionModelName(["umans/glm-5.2"], getInfo);
+ expect(name).toBeUndefined();
+ });
+
+ it("returns undefined for empty catalog", async () => {
+ const name = await findVisionModelName([], getInfo);
+ expect(name).toBeUndefined();
+ });
+});
+
+describe("collectTextFromStream", () => {
+ async function* stream(events: ProviderEvent[]): AsyncIterable<ProviderEvent> {
+ for (const e of events) yield e;
+ }
+
+ it("collects text-delta events into a single string", async () => {
+ const events: ProviderEvent[] = [
+ { type: "text-delta", delta: "Hello " },
+ { type: "text-delta", delta: "world!" },
+ ];
+ const text = await collectTextFromStream(stream(events));
+ expect(text).toBe("Hello world!");
+ });
+
+ it("ignores non-text events (reasoning, usage, tool-call, finish)", async () => {
+ const events: ProviderEvent[] = [
+ { type: "reasoning-delta", delta: "thinking..." },
+ { type: "text-delta", delta: "answer" },
+ { type: "usage", usage: { inputTokens: 5, outputTokens: 1 } },
+ { type: "finish", reason: "stop" },
+ ];
+ const text = await collectTextFromStream(stream(events));
+ expect(text).toBe("answer");
+ });
+
+ it("throws on an error event", async () => {
+ const events: ProviderEvent[] = [
+ { type: "text-delta", delta: "partial" },
+ { type: "error", message: "boom" },
+ ];
+ await expect(collectTextFromStream(stream(events))).rejects.toThrow("boom");
+ });
+
+ it("returns empty string for an empty stream", async () => {
+ const text = await collectTextFromStream(stream([]));
+ expect(text).toBe("");
+ });
+});
+
+describe("prompt + formatting helpers", () => {
+ it("buildTranscriptionPrompt includes focus when a question is given", () => {
+ const prompt = buildTranscriptionPrompt("What error is shown?");
+ expect(prompt).toContain("Describe this image in detail");
+ expect(prompt).toContain('The user asked: "What error is shown?"');
+ });
+
+ it("buildTranscriptionPrompt omits focus when no question", () => {
+ const prompt = buildTranscriptionPrompt(undefined);
+ expect(prompt).toContain("Describe this image in detail");
+ expect(prompt).not.toContain("The user asked");
+ });
+
+ it("formatTranscriptionText names the vision model", () => {
+ expect(formatTranscriptionText("a red car", "umans/kimi-k2.7")).toBe(
+ "[Image analysis (via umans/kimi-k2.7)]: a red car",
+ );
+ });
+
+ it("formatNoVisionPlaceholder explains the limitation", () => {
+ const text = formatNoVisionPlaceholder();
+ expect(text).toContain("no vision-capable model");
+ });
+});
diff --git a/packages/vision-handoff/src/pure.ts b/packages/vision-handoff/src/pure.ts
new file mode 100644
index 0000000..11eeefc
--- /dev/null
+++ b/packages/vision-handoff/src/pure.ts
@@ -0,0 +1,129 @@
+/**
+ * Pure decision helpers for the vision handoff.
+ *
+ * No I/O, no ambient state. The shell (the extension + the service) injects the
+ * effects (credential store lookups, provider streaming). This module owns only
+ * the policy: which model is vision-capable, how to build a transcription
+ * request, and how to fold a provider's streamed text into a description.
+ */
+
+import type { ModelInfo, ProviderEvent } from "@dispatch/kernel";
+import { isVisionModelId } from "@dispatch/openai-stream";
+
+/**
+ * Whether a model is vision-capable, given its catalog name and (optional)
+ * resolved `ModelInfo`. When `ModelInfo.vision` is present it is authoritative;
+ * otherwise fall back to the hardcoded name heuristic ({@link isVisionModelId}).
+ *
+ * The `modelName` is the `<credentialName>/<model>` catalog form; the heuristic
+ * inspects the model SEGMENT (after the first `/`) so `umans/kimi-k2.7` → the
+ * `kimi-k2.7` segment is checked. A name with no `/` is passed to the heuristic
+ * whole. Pure.
+ *
+ * @param modelName Catalog name of the model, or `undefined` when unknown.
+ * @param info Resolved metadata for the model, if any.
+ * @returns `info.vision` when it is set (true or false); otherwise the name
+ * heuristic's verdict, or `false` when `modelName` is `undefined`.
+ */
+export function isVisionCapable(
+ modelName: string | undefined,
+ info: ModelInfo | undefined,
+): boolean {
+ // When ModelInfo explicitly reports vision (true OR false), it is authoritative
+ // — an explicit false overrides the name heuristic (a provider that KNOWS a
+ // model is non-vision wins over the name guess).
+ if (info?.vision !== undefined) return info.vision;
+ if (modelName === undefined) return false;
+ const slash = modelName.indexOf("/");
+ const modelId = slash >= 0 ? modelName.slice(slash + 1) : modelName;
+ return isVisionModelId(modelId);
+}
+
+/**
+ * Find the first vision-capable model name in a catalog, given a lookup that
+ * resolves a `<credentialName>/<model>` → `ModelInfo`. Returns `undefined` when
+ * no vision-capable model is available (the handoff degrades: images are
+ * replaced with a placeholder note). Pure given the (async) lookup — no
+ * ambient state, no side effects.
+ *
+ * Capability is decided by {@link isVisionCapable}, so the same policy applies
+ * here as everywhere else: an explicit `ModelInfo.vision` (true OR false) is
+ * authoritative and the name heuristic is only a fallback. A model whose name
+ * looks like a vision family but whose metadata says `vision: false` is
+ * therefore skipped rather than picked as the handoff target.
+ *
+ * @param catalog The full list of model names (`<credentialName>/<model>`).
+ * @param getInfo Async lookup of a model name → ModelInfo (from the credential store).
+ * @param exclude Optional model name to skip (e.g. the current non-vision model).
+ * @returns The first catalog entry judged vision-capable, or `undefined`.
+ */
+export async function findVisionModelName(
+  catalog: readonly string[],
+  getInfo: (modelName: string) => Promise<ModelInfo | undefined>,
+  exclude?: string,
+): Promise<string | undefined> {
+  for (const name of catalog) {
+    if (exclude !== undefined && name === exclude) continue;
+    // Always consult the lookup before trusting the name: this costs one extra
+    // lookup for a name-matched model, but it is the only way to honor an
+    // explicit `vision: false` reported for that model.
+    const info = await getInfo(name);
+    if (isVisionCapable(name, info)) return name;
+  }
+  return undefined;
+}
+
+/**
+ * Fold a provider's streamed events into a single text string (the
+ * transcription). Collects `text-delta` events in arrival order and ignores
+ * every other non-error event (reasoning, usage, tool-calls, finish). An
+ * `error` event is NOT ignored: it is surfaced as a thrown Error so the caller
+ * can decide how to degrade (placeholder vs. fail); any text collected before
+ * the error is discarded. Performs no I/O of its own beyond consuming the
+ * given async iterable.
+ *
+ * @param stream The provider's event stream to drain.
+ * @returns The concatenation of every `text-delta` payload (`""` when none).
+ * @throws Error carrying the event's `message` when an `error` event arrives.
+ */
+export async function collectTextFromStream(stream: AsyncIterable<ProviderEvent>): Promise<string> {
+ let text = "";
+ for await (const event of stream) {
+ if (event.type === "text-delta") {
+ text += event.delta;
+ } else if (event.type === "error") {
+ throw new Error(event.message);
+ }
+ }
+ return text;
+}
+
+/**
+ * Build the prompt sent to the vision model to transcribe an image. Kept here
+ * (pure) so the prompt is testable and stable. The prompt asks for a thorough
+ * description so the text-only model has enough detail to reason about the
+ * image's contents. Pure.
+ *
+ * When a question is supplied it is trimmed and interpolated verbatim, inside
+ * double quotes, into a trailing "The user asked" paragraph — no escaping is
+ * applied, so quotes inside the question appear as-is. An `undefined`, empty,
+ * or whitespace-only question yields the base prompt alone.
+ *
+ * @param userQuestion The user's own message text (may be empty) — passed so
+ * the vision model can tailor its description to what the user actually asked.
+ * @returns The full prompt text.
+ */
+export function buildTranscriptionPrompt(userQuestion: string | undefined): string {
+ const focus =
+ userQuestion && userQuestion.trim().length > 0
+ ? `\n\nThe user asked: "${userQuestion.trim()}". Focus your description on what is relevant to that question, but still describe the whole image.`
+ : "";
+ return (
+ "Describe this image in detail. Include: the overall scene/subject, " +
+ "visible text (transcribe verbatim), key objects, layout, colors, and any " +
+ "notable details a developer or user would need to understand the image." +
+ focus
+ );
+}
+
+/**
+ * Format a single image's transcription as a text chunk string for the
+ * persisted user message. The note names the vision model so the consumer knows
+ * the description's provenance. Pure.
+ *
+ * @param description The vision model's description of the image, used as-is.
+ * @param visionModelName Catalog name of the model that produced it; when
+ * `undefined` the generic label `vision model` is used instead.
+ * @returns `[Image analysis (via <source>)]: <description>`.
+ */
+export function formatTranscriptionText(
+ description: string,
+ visionModelName: string | undefined,
+): string {
+ const source = visionModelName ?? "vision model";
+ return `[Image analysis (via ${source})]: ${description}`;
+}
+
+/**
+ * Placeholder text used when NO vision-capable model is available (the
+ * degraded path). Takes no input and always returns the same bracketed note,
+ * which states the limitation and suggests configuring a vision-capable
+ * model. Pure.
+ *
+ * @returns The fixed placeholder string.
+ */
+export function formatNoVisionPlaceholder(): string {
+ return (
+ "[Image attached — no vision-capable model is available to analyze it. " +
+ "Install or configure a vision-capable model (e.g. kimi) to enable image analysis.]"
+ );
+}
diff --git a/packages/vision-handoff/src/service.test.ts b/packages/vision-handoff/src/service.test.ts
new file mode 100644
index 0000000..fe99d17
--- /dev/null
+++ b/packages/vision-handoff/src/service.test.ts
@@ -0,0 +1,242 @@
+import type {
+ ChatMessage,
+ ModelInfo,
+ ProviderContract,
+ ProviderEvent,
+ ProviderStreamOptions,
+ ToolContract,
+} from "@dispatch/kernel";
+import { describe, expect, it, vi } from "vitest";
+import { createVisionHandoffService, type VisionHandoffDeps } from "./service.js";
+
+// ── Test doubles (outermost-edge fakes — NOT @dispatch/* mocks) ──────────────
+
+/**
+ * Fake vision provider. Its `stream` mock finds the first image chunk in the
+ * messages it receives, passes that chunk's URL ("" when there is none) to
+ * `describe`, and replays the returned text as a single `text-delta` event
+ * followed by a `finish` event.
+ *
+ * @param describe Maps the image URL to the text the fake model "says".
+ * @param id Provider id to report; defaults to "umans".
+ */
+function makeVisionProvider(
+  describe: (imageUrl: string) => string,
+  id = "umans",
+): ProviderContract {
+  // URL of the first image chunk across all messages, or "" if none exists.
+  const firstImageUrl = (messages: readonly ChatMessage[]): string => {
+    for (const message of messages) {
+      for (const chunk of message.chunks) {
+        if (chunk.type === "image") return chunk.url;
+      }
+    }
+    return "";
+  };
+
+  const stream = vi.fn(
+    (
+      messages: readonly ChatMessage[],
+      _tools: readonly ToolContract[],
+      _opts?: ProviderStreamOptions,
+    ): AsyncIterable<ProviderEvent> => {
+      // The reply is computed eagerly, when stream() is called — not on iteration.
+      const reply = describe(firstImageUrl(messages));
+      return (async function* (): AsyncIterable<ProviderEvent> {
+        yield { type: "text-delta", delta: reply };
+        yield { type: "finish", reason: "stop" };
+      })();
+    },
+  );
+
+  return { id, stream };
+}
+
+/**
+ * Build a `VisionHandoffDeps` fixture backed by vitest mocks.
+ *
+ * The fixture catalog holds two models: "umans/kimi-k2.7" (ModelInfo has
+ * `vision: true`) and "umans/glm-5.2" (no vision flag). Both resolve to the
+ * same fake provider, which answers `DESCRIPTION of <image url>`.
+ *
+ * @param overrides Fields that replace the corresponding defaults.
+ */
+function makeDeps(overrides: Partial<VisionHandoffDeps> = {}): VisionHandoffDeps {
+  const kimi = "umans/kimi-k2.7";
+  const glm = "umans/glm-5.2";
+  const catalog = [kimi, glm];
+  const infoByName: Record<string, ModelInfo> = {
+    [kimi]: { id: "kimi-k2.7", vision: true },
+    [glm]: { id: "glm-5.2" },
+  };
+  const provider = makeVisionProvider((url) => `DESCRIPTION of ${url}`);
+
+  const defaults: VisionHandoffDeps = {
+    credentialStore: {
+      listCatalog: vi.fn(async () => catalog),
+      getModelInfo: vi.fn(async (name: string) => infoByName[name]),
+      resolve: vi.fn((name: string) => {
+        if (name === kimi) return { providerId: "umans", model: "kimi-k2.7" };
+        if (name === glm) return { providerId: "umans", model: "glm-5.2" };
+        return undefined;
+      }),
+    },
+    resolveModel: vi.fn((name: string) => {
+      // Only the two catalog names resolve; anything else is unknown.
+      if (name !== kimi && name !== glm) return undefined;
+      return { provider, model: name.split("/")[1] };
+    }),
+    readFileAsDataUrl: vi.fn(async (path: string) => `data:image/png;base64,FILE(${path})`),
+  };
+
+  // Caller-supplied fields win over the defaults.
+  return { ...defaults, ...overrides };
+}
+
+// Capability lookup by catalog name. In the makeDeps fixture only
+// "umans/kimi-k2.7" carries `vision: true`; an undefined name must report false.
+describe("VisionHandoffService.isVisionCapable", () => {
+ it("returns true for kimi (via ModelInfo)", async () => {
+ const svc = createVisionHandoffService(makeDeps());
+ expect(await svc.isVisionCapable("umans/kimi-k2.7")).toBe(true);
+ });
+
+ it("returns false for glm-5.2", async () => {
+ const svc = createVisionHandoffService(makeDeps());
+ expect(await svc.isVisionCapable("umans/glm-5.2")).toBe(false);
+ });
+
+ it("returns false for undefined model name", async () => {
+ const svc = createVisionHandoffService(makeDeps());
+ expect(await svc.isVisionCapable(undefined)).toBe(false);
+ });
+});
+
+// Vision-model resolution against the fixture catalog, including the
+// `excludeName` argument (excluding the only vision model leaves nothing).
+describe("VisionHandoffService.resolveVisionModel", () => {
+ it("resolves the kimi model from the catalog", async () => {
+ const svc = createVisionHandoffService(makeDeps());
+ const vision = await svc.resolveVisionModel();
+ expect(vision?.modelName).toBe("umans/kimi-k2.7");
+ expect(vision?.model).toBe("kimi-k2.7");
+ });
+
+ it("excludes the given model", async () => {
+ const svc = createVisionHandoffService(makeDeps());
+ const vision = await svc.resolveVisionModel("umans/kimi-k2.7");
+ // kimi is the only vision model; excluding it → undefined.
+ expect(vision).toBeUndefined();
+ });
+});
+
+// Single-image transcription: the success path, the no-vision-model
+// placeholder, and a provider stream that emits an `error` event. Every case
+// expects a resolved string rather than a rejection.
+describe("VisionHandoffService.transcribeImage", () => {
+ it("returns a formatted description from the vision model", async () => {
+ const svc = createVisionHandoffService(makeDeps());
+ const result = await svc.transcribeImage("data:image/png;base64,xxx", "what is this?");
+ expect(result).toBe(
+ "[Image analysis (via umans/kimi-k2.7)]: DESCRIPTION of data:image/png;base64,xxx",
+ );
+ });
+
+ it("returns a placeholder when no vision model is available", async () => {
+ const deps = makeDeps();
+ // Empty catalog → no vision model.
+ (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+ const svc = createVisionHandoffService(deps);
+ const result = await svc.transcribeImage("data:image/png;base64,xxx", undefined);
+ expect(result).toContain("no vision-capable model");
+ });
+
+ it("returns an error note when the vision stream errors", async () => {
+ const errorProvider: ProviderContract = {
+ id: "umans",
+ stream: vi.fn(async function* (): AsyncIterable<ProviderEvent> {
+ yield { type: "error", message: "vision API down" };
+ }),
+ };
+ const deps = makeDeps({
+ resolveModel: vi.fn(() => ({ provider: errorProvider, model: "kimi-k2.7" })),
+ });
+ const svc = createVisionHandoffService(deps);
+ const result = await svc.transcribeImage("data:image/png;base64,xxx", undefined);
+ expect(result).toContain("Image analysis failed: vision API down");
+ });
+});
+
+// Message-list transformation. Covers the two pass-through cases (vision-capable
+// model, no images), replacement of image chunks by text for a non-vision
+// model, per-call caching by image URL, images in earlier history messages, and
+// the placeholder used when the catalog has no vision model.
+describe("VisionHandoffService.transcribeForProvider", () => {
+ it("passes messages through unchanged when the model is vision-capable", async () => {
+ const deps = makeDeps();
+ const svc = createVisionHandoffService(deps);
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "text", text: "What's this?" },
+ { type: "image", url: "data:image/png;base64,abc" },
+ ],
+ },
+ ];
+ const result = await svc.transcribeForProvider(messages, "umans/kimi-k2.7");
+ expect(result).toBe(messages); // same reference — no copy, no transcription
+ });
+
+ it("passes messages through unchanged when there are no images", async () => {
+ const deps = makeDeps();
+ const svc = createVisionHandoffService(deps);
+ const messages: ChatMessage[] = [{ role: "user", chunks: [{ type: "text", text: "hi" }] }];
+ const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+ expect(result).toBe(messages);
+ });
+
+ it("transcribes image chunks to text for a non-vision model", async () => {
+ const deps = makeDeps();
+ const svc = createVisionHandoffService(deps);
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "text", text: "Describe this" },
+ { type: "image", url: "data:image/png;base64,img1" },
+ ],
+ },
+ ];
+ const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+ expect(result).toHaveLength(1);
+ const chunks = result[0]?.chunks;
+ expect(chunks).toHaveLength(2);
+ expect(chunks?.[0]).toEqual({ type: "text", text: "Describe this" });
+ // The image chunk was replaced with a transcribed text chunk.
+ expect(chunks?.[1]?.type).toBe("text");
+ expect((chunks?.[1] as { text: string }).text).toContain("Image analysis");
+ expect((chunks?.[1] as { text: string }).text).toContain("img1");
+ });
+
+ it("caches transcription per unique image URL within a call", async () => {
+ const deps = makeDeps();
+ const svc = createVisionHandoffService(deps);
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "image", url: "data:image/png;base64,same" },
+ { type: "image", url: "data:image/png;base64,same" },
+ ],
+ },
+ ];
+ const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+ const chunks = result[0]?.chunks;
+ // Both image chunks → text, same description (cached).
+ expect(chunks).toHaveLength(2);
+ expect((chunks?.[0] as { text: string }).text).toBe((chunks?.[1] as { text: string }).text);
+ // The vision provider was called only once (cache hit on the second).
+ const provider = deps.resolveModel("umans/kimi-k2.7")?.provider;
+ expect((provider?.stream as ReturnType<typeof vi.fn>).mock.calls).toHaveLength(1);
+ });
+
+ it("transcribes images in history messages too (non-vision model)", async () => {
+ const deps = makeDeps();
+ const svc = createVisionHandoffService(deps);
+ const messages: ChatMessage[] = [
+ { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,hist" }] },
+ { role: "assistant", chunks: [{ type: "text", text: "got it" }] },
+ { role: "user", chunks: [{ type: "text", text: "and now?" }] },
+ ];
+ const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+ // First message's image chunk is now text.
+ expect(result[0]?.chunks[0]?.type).toBe("text");
+ expect((result[0]?.chunks[0] as { text: string }).text).toContain("Image analysis");
+ // Assistant message unchanged.
+ expect(result[1]?.chunks[0]?.type).toBe("text");
+ // Last user message unchanged.
+ expect(result[2]?.chunks[0]).toEqual({ type: "text", text: "and now?" });
+ });
+
+ it("uses a placeholder when no vision model is available (non-vision model)", async () => {
+ const deps = makeDeps();
+ (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+ const svc = createVisionHandoffService(deps);
+ const messages: ChatMessage[] = [
+ { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,abc" }] },
+ ];
+ const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+ expect((result[0]?.chunks[0] as { text: string }).text).toContain("no vision-capable model");
+ });
+});
+
+// File-based entry point: the path and cwd must reach `readFileAsDataUrl`
+// untouched, and the data URL it returns must be what gets transcribed.
+describe("VisionHandoffService.readImageFile", () => {
+ it("reads the file and transcribes it", async () => {
+ const deps = makeDeps();
+ const svc = createVisionHandoffService(deps);
+ const result = await svc.readImageFile("screenshot.png", "/work");
+ expect(deps.readFileAsDataUrl).toHaveBeenCalledWith("screenshot.png", "/work");
+ expect(result).toContain("Image analysis");
+ expect(result).toContain("FILE(screenshot.png)");
+ });
+});
diff --git a/packages/vision-handoff/src/service.ts b/packages/vision-handoff/src/service.ts
new file mode 100644
index 0000000..5e6ad70
--- /dev/null
+++ b/packages/vision-handoff/src/service.ts
@@ -0,0 +1,281 @@
+/**
+ * Vision handoff service — the imperative shell that performs the universal,
+ * provider-agnostic vision handoff.
+ *
+ * Two capabilities:
+ * 1. **Transcription for non-vision models** (`transcribeForProvider`): when a
+ * user message carries images but the active model cannot see them, this
+ * calls a vision-capable model (resolved from the catalog — any provider) to
+ * describe each image, then replaces the image chunks with text. Universal:
+ * it uses the standard `ProviderContract.stream` interface, never a
+ * provider-specific vision endpoint.
+ * 2. **`read_image` tool** (`readImageFile`): reads an image FILE from disk and
+ * transcribes it via a vision-capable model, returning the text description
+ * — so any model (vision or not) can analyze an image referenced in code.
+ *
+ * Effects (credential store, provider streaming, filesystem, fetch) are
+ * injected. The pure decisions live in `pure.ts`. This shell wires them.
+ */
+
+import type { CredentialStore } from "@dispatch/credential-store";
+import type {
+ ChatMessage,
+ Chunk,
+ Logger,
+ ModelInfo,
+ ProviderContract,
+ ProviderStreamOptions,
+} from "@dispatch/kernel";
+import { defineService, type ServiceHandle } from "@dispatch/kernel";
+import {
+ buildTranscriptionPrompt,
+ collectTextFromStream,
+ findVisionModelName,
+ formatNoVisionPlaceholder,
+ formatTranscriptionText,
+ isVisionCapable,
+} from "./pure.js";
+
+/**
+ * Resolved vision model — a provider + its model id, ready to stream from.
+ *
+ * `provider` and `model` are the values `VisionHandoffDeps.resolveModel`
+ * returned for the chosen catalog entry; `modelName` is that catalog entry's
+ * own name (the `<credentialName>/<model>` form) and is what appears in the
+ * "via …" part of a transcription note.
+ */
+export interface ResolvedVisionModel {
+ readonly provider: ProviderContract;
+ readonly model: string;
+ readonly modelName: string;
+}
+
+/**
+ * Dependencies the service needs — all injected (no ambient state).
+ */
+export interface VisionHandoffDeps {
+ /** Source of the model catalog (`listCatalog`) and per-model `ModelInfo` (`getModelInfo`). */
+ readonly credentialStore: CredentialStore;
+ /** Resolve a `<credentialName>/<model>` → its provider + model id. */
+ readonly resolveModel: (
+ modelName: string,
+ ) => { provider: ProviderContract; model: string } | undefined;
+ /**
+ * Read a file from disk as a base64 data URL. Injected so the shell controls
+ * the filesystem edge (and tests inject a fake). Returns the data URL, or
+ * throws on error (the caller surfaces it as a tool error).
+ */
+ readonly readFileAsDataUrl: (path: string, cwd?: string) => Promise<string>;
+ /**
+ * Fetch an HTTP(S) URL to a data URL (for http image sources). Injected so
+ * tests inject a fake. Optional — when absent, HTTP image URLs are passed to
+ * the vision provider as-is (it fetches them).
+ */
+ readonly fetchUrlAsDataUrl?: (url: string) => Promise<string>;
+ /** Optional logger; the service reports degraded and failed transcriptions through `warn`. */
+ readonly logger?: Logger;
+}
+
+/**
+ * Public surface of the vision handoff service, as returned by
+ * `createVisionHandoffService`. Every method is asynchronous. Each of the three
+ * transcription methods accepts an optional per-call `signal` and `logger`.
+ */
+export interface VisionHandoffService {
+ /**
+ * Whether a given model (by catalog name) is vision-capable. Uses the
+ * credential store's ModelInfo + the name heuristic. Async because ModelInfo
+ * may require a listModels round-trip (cached by the credential store).
+ */
+ readonly isVisionCapable: (modelName: string | undefined) => Promise<boolean>;
+
+ /**
+ * Resolve a vision-capable model from the catalog (any provider). Returns
+ * `undefined` when none is available.
+ */
+ readonly resolveVisionModel: (excludeName?: string) => Promise<ResolvedVisionModel | undefined>;
+
+ /**
+ * Transcribe a single image URL to a text description via a vision-capable
+ * model. Returns the description, or a placeholder string when no vision
+ * model is available (does NOT throw — callers want graceful degradation).
+ */
+ readonly transcribeImage: (
+ imageUrl: string,
+ userQuestion: string | undefined,
+ opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+ ) => Promise<string>;
+
+ /**
+ * Transform a message list for the provider: if the active model is
+ * vision-capable, return messages unchanged (images pass through natively).
+ * If NOT vision-capable, replace every `image` chunk with a text
+ * description (transcribed via a vision model — once per unique image URL,
+ * cached within the call) so a text-only model can still reason about the
+ * images. Never throws — on failure an image becomes a placeholder note.
+ *
+ * The PERSISTED history is NOT modified by this (the caller persists the
+ * original messages with images); this only transforms what the provider sees.
+ */
+ readonly transcribeForProvider: (
+ messages: readonly ChatMessage[],
+ currentModelName: string | undefined,
+ opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+ ) => Promise<readonly ChatMessage[]>;
+
+ /**
+ * Read an image FILE from disk and transcribe it (the `read_image` tool's
+ * core). Returns the description text. Throws on filesystem error (the tool
+ * surfaces it as a tool-error result).
+ */
+ readonly readImageFile: (
+ path: string,
+ cwd: string | undefined,
+ opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+ ) => Promise<string>;
+}
+
+/** Kernel service handle for `VisionHandoffService`, defined under the key "vision-handoff/service". */
+export const visionHandoffHandle: ServiceHandle<VisionHandoffService> =
+ defineService<VisionHandoffService>("vision-handoff/service");
+
+/** True when at least one chunk of any message has type "image". Pure. */
+function hasImageChunks(messages: readonly ChatMessage[]): boolean {
+  for (const message of messages) {
+    for (const chunk of message.chunks) {
+      if (chunk.type === "image") return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Create the vision handoff service from its injected dependencies.
+ *
+ * Every effect goes through `deps`: the credential store (catalog + model
+ * info), `resolveModel` (provider lookup), `readFileAsDataUrl`, and the
+ * optional `fetchUrlAsDataUrl`.
+ *
+ * Failure handling:
+ * - `transcribeImage` resolves to a bracketed note instead of rejecting. This
+ *   also covers errors raised while resolving the vision model, which the
+ *   interface documents as non-throwing.
+ * - `transcribeForProvider` treats a failed capability lookup as "not
+ *   vision-capable" and falls back to transcription rather than rejecting.
+ * - `readImageFile` surfaces errors from `readFileAsDataUrl` to its caller.
+ *
+ * @param deps Injected effects; see `VisionHandoffDeps`.
+ * @returns The service instance.
+ */
+export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHandoffService {
+  type CallOptions = { readonly signal?: AbortSignal; readonly logger?: Logger };
+
+  /** Reduce an unknown thrown value to a printable message. */
+  const describeError = (err: unknown): string =>
+    err instanceof Error ? err.message : String(err);
+
+  /** Prefer the per-call logger when the caller supplied one. */
+  const loggerFor = (opts: CallOptions | undefined): Logger | undefined =>
+    opts?.logger ?? deps.logger;
+
+  async function getInfo(modelName: string): Promise<ModelInfo | undefined> {
+    return deps.credentialStore.getModelInfo(modelName);
+  }
+
+  async function resolveVisionModel(
+    excludeName?: string,
+  ): Promise<ResolvedVisionModel | undefined> {
+    const catalog = await deps.credentialStore.listCatalog();
+    const name = await findVisionModelName(catalog, getInfo, excludeName);
+    if (name === undefined) return undefined;
+    const resolved = deps.resolveModel(name);
+    if (resolved === undefined) return undefined;
+    return { provider: resolved.provider, model: resolved.model, modelName: name };
+  }
+
+  /**
+   * Turn an `http(s)://` image URL into a data URL via the injected fetcher.
+   * Data URLs, and every URL when no fetcher was injected, pass through
+   * unchanged. A failed fetch is logged and the original URL is used, so the
+   * provider can still try to fetch it itself.
+   */
+  async function toProviderImageUrl(imageUrl: string, log: Logger | undefined): Promise<string> {
+    const fetchUrl = deps.fetchUrlAsDataUrl;
+    if (fetchUrl === undefined || !/^https?:\/\//i.test(imageUrl)) return imageUrl;
+    try {
+      return await fetchUrl(imageUrl);
+    } catch (err: unknown) {
+      log?.warn("vision-handoff: fetching image URL failed; passing it to the provider as-is", {
+        error: describeError(err),
+      });
+      return imageUrl;
+    }
+  }
+
+  async function streamVisionText(
+    vision: ResolvedVisionModel,
+    imageUrl: string,
+    prompt: string,
+    opts?: CallOptions,
+  ): Promise<string> {
+    // Build a single-turn user message: [text prompt, image]. The vision model
+    // receives the image natively via the OpenAI-compatible content array
+    // (convertMessages serializes the image chunk to image_url).
+    const userMessage: ChatMessage = {
+      role: "user",
+      chunks: [
+        { type: "text", text: prompt },
+        { type: "image", url: imageUrl },
+      ],
+    };
+    const providerOpts: ProviderStreamOptions = {
+      model: vision.model,
+      // Low temperature for faithful transcription.
+      temperature: 0,
+      // A short system prompt keeps the vision model focused on describing.
+      systemPrompt:
+        "You are a vision assistant. Describe images faithfully and thoroughly for a developer who cannot see them.",
+    };
+    const streamOpts: Parameters<ProviderContract["stream"]>[2] = {
+      ...providerOpts,
+      ...(opts?.logger !== undefined ? { logger: opts.logger } : {}),
+      // Forward the caller's abort signal instead of silently dropping it.
+      // NOTE(review): assumes the provider's stream options accept `signal` —
+      // confirm against the ProviderContract definition.
+      ...(opts?.signal !== undefined ? { signal: opts.signal } : {}),
+    };
+    const stream = vision.provider.stream([userMessage], [], streamOpts);
+    return collectTextFromStream(stream);
+  }
+
+  const service: VisionHandoffService = {
+    async isVisionCapable(modelName: string | undefined): Promise<boolean> {
+      if (modelName === undefined) return false;
+      const info = await getInfo(modelName);
+      return isVisionCapable(modelName, info);
+    },
+
+    resolveVisionModel,
+
+    async transcribeImage(
+      imageUrl: string,
+      userQuestion: string | undefined,
+      opts?: CallOptions,
+    ): Promise<string> {
+      const log = loggerFor(opts);
+      // Model resolution sits inside the try as well: a credential-store
+      // failure must degrade to a note, not reject.
+      try {
+        const vision = await resolveVisionModel();
+        if (vision === undefined) {
+          log?.warn("vision-handoff: no vision-capable model available for transcription");
+          return formatNoVisionPlaceholder();
+        }
+        const prompt = buildTranscriptionPrompt(userQuestion);
+        const providerUrl = await toProviderImageUrl(imageUrl, log);
+        const description = await streamVisionText(vision, providerUrl, prompt, opts);
+        const trimmed = description.trim();
+        if (trimmed.length === 0) {
+          return "[Image analysis produced no output.]";
+        }
+        return formatTranscriptionText(trimmed, vision.modelName);
+      } catch (err: unknown) {
+        const msg = describeError(err);
+        log?.warn("vision-handoff: transcription failed", { error: msg });
+        return `[Image analysis failed: ${msg}]`;
+      }
+    },
+
+    async transcribeForProvider(
+      messages: readonly ChatMessage[],
+      currentModelName: string | undefined,
+      opts?: CallOptions,
+    ): Promise<readonly ChatMessage[]> {
+      // Fast path: no images anywhere → nothing to do.
+      if (!hasImageChunks(messages)) return messages;
+
+      // If the active model IS vision-capable, pass images through natively.
+      if (currentModelName !== undefined) {
+        try {
+          const capable = await isVisionCapable(currentModelName, await getInfo(currentModelName));
+          if (capable) return messages;
+        } catch (err: unknown) {
+          // Capability unknown: fall through to transcription, the path that
+          // works for a text-only model, rather than rejecting.
+          loggerFor(opts)?.warn(
+            "vision-handoff: capability lookup failed; transcribing images instead",
+            { error: describeError(err) },
+          );
+        }
+      }
+
+      // Non-vision model: transcribe each unique image URL once (cached).
+      const cache = new Map<string, string>();
+      const userText = messages
+        .filter((m) => m.role === "user")
+        .flatMap((m) => m.chunks)
+        .filter((c): c is { type: "text"; text: string } => c.type === "text")
+        .map((c) => c.text)
+        .join(" ");
+
+      async function transcribeCached(url: string): Promise<string> {
+        const cached = cache.get(url);
+        if (cached !== undefined) return cached;
+        const description = await service.transcribeImage(url, userText, opts);
+        cache.set(url, description);
+        return description;
+      }
+
+      const result: ChatMessage[] = [];
+      for (const msg of messages) {
+        if (!msg.chunks.some((c) => c.type === "image")) {
+          result.push(msg);
+          continue;
+        }
+        // Replace image chunks with transcribed text chunks; keep all else.
+        const newChunks: Chunk[] = [];
+        for (const chunk of msg.chunks) {
+          if (chunk.type === "image") {
+            const description = await transcribeCached(chunk.url);
+            newChunks.push({ type: "text", text: description });
+          } else {
+            newChunks.push(chunk);
+          }
+        }
+        // Spread the original so any field besides `chunks` is carried over.
+        result.push({ ...msg, chunks: newChunks });
+      }
+      return result;
+    },
+
+    async readImageFile(
+      path: string,
+      cwd: string | undefined,
+      opts?: CallOptions,
+    ): Promise<string> {
+      const dataUrl = await deps.readFileAsDataUrl(path, cwd);
+      return service.transcribeImage(dataUrl, undefined, opts);
+    },
+  };
+
+  return service;
+}
diff --git a/packages/vision-handoff/src/tool.ts b/packages/vision-handoff/src/tool.ts
new file mode 100644
index 0000000..3995598
--- /dev/null
+++ b/packages/vision-handoff/src/tool.ts
@@ -0,0 +1,68 @@
+/**
+ * read_image tool — lets any model (vision-capable or not) analyze an image
+ * FILE on disk by handing it off to a vision-capable model.
+ *
+ * The tool reads the image file into a base64 data URL, then asks the vision
+ * handoff service to transcribe it (via a vision-capable model resolved from
+ * the catalog) and returns the textual description as the tool result. This is
+ * the universal mechanism: it works regardless of whether the active model has
+ * vision, because the result is plain text the model reasons about.
+ *
+ * For images PASTED into the chat, the orchestrator's auto-transcription handles
+ * them (no tool call needed). This tool is for images REFERENCED IN CODE by path
+ * (e.g. a screenshot, diagram, or mockup the model discovered while reading files).
+ */
+
+import type { ToolContract, ToolExecuteContext, ToolResult } from "@dispatch/kernel";
+import type { VisionHandoffService } from "./service.js";
+
+/**
+ * Build the `read_image` tool around a vision handoff service.
+ *
+ * `execute` validates that `args.path` is a non-empty string, delegates to
+ * `service.readImageFile` (passing the context's cwd, abort signal and logger),
+ * and returns the description as the tool result. A missing or blank path, and
+ * any error thrown while reading or transcribing, yield an `isError` result
+ * instead of a rejection.
+ *
+ * @param service The vision handoff service that performs the read + transcription.
+ */
+export function createReadImageTool(service: VisionHandoffService): ToolContract {
+  const toolDescription =
+    "Read and analyze an image file on disk (PNG, JPEG, WebP, GIF). Returns a " +
+    "detailed textual description of the image's contents — useful when you " +
+    "encounter a screenshot, diagram, UI mockup, or chart referenced in the " +
+    "codebase and need to understand what it shows. The analysis is performed " +
+    "by a vision-capable model, so you can use this even if you cannot " +
+    "directly view images. Pass a file path (relative to the cwd or absolute).";
+
+  // Shape of every failure result this tool returns.
+  const fail = (content: string): ToolResult => ({ content, isError: true });
+
+  return {
+    name: "read_image",
+    description: toolDescription,
+    parameters: {
+      type: "object",
+      properties: {
+        path: {
+          type: "string",
+          description:
+            "Path to the image file to analyze. Relative paths resolve against " +
+            "the conversation's working directory; absolute paths are used as-is.",
+        },
+      },
+      required: ["path"],
+    },
+    concurrencySafe: true,
+    async execute(args: unknown, ctx: ToolExecuteContext): Promise<ToolResult> {
+      // Arguments arrive untyped; only `path` is read.
+      const path = (args as { path?: unknown } | null)?.path;
+      if (typeof path !== "string" || path.trim().length === 0) {
+        return fail("Error: 'path' is required and must be a non-empty string.");
+      }
+
+      const span = ctx.log.span("read_image.execute", { path });
+      try {
+        const analysis = await service.readImageFile(path, ctx.cwd, {
+          signal: ctx.signal,
+          logger: ctx.log,
+        });
+        span.end({ attrs: { descriptionLength: analysis.length } });
+        return { content: analysis };
+      } catch (err: unknown) {
+        span.end({ err });
+        const reason = err instanceof Error ? err.message : String(err);
+        return fail(`Error reading image: ${reason}`);
+      }
+    },
+  };
+}
diff --git a/packages/vision-handoff/tsconfig.json b/packages/vision-handoff/tsconfig.json
new file mode 100644
index 0000000..ec597fc
--- /dev/null
+++ b/packages/vision-handoff/tsconfig.json
@@ -0,0 +1,11 @@
+{
+ "extends": "../../tsconfig.base.json",
+ "compilerOptions": { "rootDir": "src", "outDir": "dist", "composite": true },
+ "include": ["src/**/*.ts"],
+ "references": [
+ { "path": "../kernel" },
+ { "path": "../wire" },
+ { "path": "../credential-store" },
+ { "path": "../openai-stream" }
+ ]
+}
diff --git a/packages/wire/src/index.test.ts b/packages/wire/src/index.test.ts
index 3f07e00..81d10c1 100644
--- a/packages/wire/src/index.test.ts
+++ b/packages/wire/src/index.test.ts
@@ -8,7 +8,7 @@
*/
import { describe, expect, it } from "vitest";
-import type { Computer, ComputerEntry, Workspace } from "./index.js";
+import type { Chunk, Computer, ComputerEntry, ImageChunk, ImageInput, Workspace } from "./index.js";
describe("@dispatch/wire — Computer / Workspace shapes", () => {
it("a Computer literal satisfies the Computer type", () => {
@@ -57,3 +57,32 @@ describe("@dispatch/wire — Computer / Workspace shapes", () => {
expect(local.defaultComputerId).toBeNull();
});
});
+
+// Shape checks for the image wire types. The type annotations on the literals
+// are the real assertions (they fail at compile time); the runtime `expect`
+// calls only confirm the values round-trip.
+describe("@dispatch/wire — ImageChunk / ImageInput shapes", () => {
+ it("an ImageChunk carries a data URL and optional mimeType", () => {
+ const c: ImageChunk = {
+ type: "image",
+ url: "data:image/png;base64,iVBORw0KGgo=",
+ mimeType: "image/png",
+ };
+ expect(c.type).toBe("image");
+ expect(c.url).toContain("base64");
+ expect(c.mimeType).toBe("image/png");
+ });
+
+ it("an ImageChunk with only a url is valid (mimeType optional)", () => {
+ const c: ImageChunk = { type: "image", url: "https://example.com/cat.png" };
+ expect(c.mimeType).toBeUndefined();
+ });
+
+ it("ImageInput mirrors ImageChunk's url semantics", () => {
+ const input: ImageInput = { url: "data:image/jpeg;base64,/9j/4AAQ" };
+ expect(input.url).toContain("jpeg");
+ });
+
+ it("ImageChunk is a member of the Chunk union (assignable)", () => {
+ const chunk: Chunk = { type: "image", url: "data:image/png;base64,x" };
+ // Compile-time proof: an ImageChunk satisfies the Chunk union.
+ expect(chunk.type).toBe("image");
+ });
+});
diff --git a/packages/wire/src/index.ts b/packages/wire/src/index.ts
index 16b7023..d6ea1c1 100644
--- a/packages/wire/src/index.ts
+++ b/packages/wire/src/index.ts
@@ -36,7 +36,8 @@ export type Chunk =
| ToolCallChunk
| ToolResultChunk
| ErrorChunk
- | SystemChunk;
+ | SystemChunk
+ | ImageChunk;
/** A piece of plain text content from the assistant or user. */
export interface TextChunk {
@@ -113,6 +114,46 @@ export interface SystemChunk {
}
/**
+ * An image attached to a message (e.g. a user-pasted screenshot or pasted
+ * photo). Carries a `url` that is EITHER a base64 data URL
+ * (`data:image/png;base64,…`) OR an `http(s)://` URL. Vision-capable models
+ * receive it natively (the provider serializes it to its image-content
+ * format); non-vision models never see it directly — the orchestrator's
+ * **vision handoff** transcribes it to a text description (via a
+ * vision-capable model) and feeds that text instead, so a text-only model can
+ * still reason about the image's contents.
+ *
+ * When a transcription was performed, it is persisted as a separate `text`
+ * chunk alongside the `image` chunk in the SAME user message, so the
+ * description is reused on every later turn (no re-transcription) and a
+ * client renders both the original image and its textual analysis.
+ *
+ * NOTE(review): the vision-handoff service documents `transcribeForProvider`
+ * as leaving the persisted history untouched and only transforming what the
+ * provider sees — confirm which component, if any, persists the transcription
+ * chunk described above.
+ */
+export interface ImageChunk {
+ readonly type: "image";
+ /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */
+ readonly url: string;
+ /**
+ * Optional MIME type of the image (e.g. `"image/png"`). Inferred from the
+ * data URL when absent; present so a client can render an icon/label without
+ * parsing the URL. Optional — callers that only have a URL omit it.
+ */
+ readonly mimeType?: string;
+}
+
+/**
+ * An image a client attaches to a chat message (`ChatRequest.images`). The
+ * transport-facing input shape; the orchestrator converts each `ImageInput`
+ * into an `ImageChunk` on the persisted user message. Carries the same `url`
+ * semantics as `ImageChunk.url`.
+ *
+ * Field-for-field this is `ImageChunk` without the `type` discriminant: a
+ * required `url` plus an optional `mimeType`.
+ */
+export interface ImageInput {
+ /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */
+ readonly url: string;
+ /** Optional MIME type (e.g. `"image/png"`). Optional — inferred from the data URL when absent. */
+ readonly mimeType?: string;
+}
+
+/**
* A chat message: a role plus an ordered sequence of chunks. Messages are the
* unit passed to and from the provider; chunks are the unit persisted and
* rendered.
diff --git a/tsconfig.json b/tsconfig.json
index e4e833d..fe5ea92 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -41,6 +41,9 @@
"path": "./packages/credential-store"
},
{
+ "path": "./packages/vision-handoff"
+ },
+ {
"path": "./packages/exec-backend"
},
{