summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorAdam Malczewski <[email protected]>2026-06-27 20:48:24 +0900
committerAdam Malczewski <[email protected]>2026-06-27 20:48:24 +0900
commit04356c8678ae8dd1d7ddca2d0460b514116adc2e (patch)
tree6c81894ef02d062570b12f4d3a871e58600dcb9c
parent3184b10e614ce6249c83aa111368e98f6689f456 (diff)
parentb24ed99e89bc657e8c98c7cef8608e0c0b7594da (diff)
downloaddispatch-04356c8678ae8dd1d7ddca2d0460b514116adc2e.tar.gz
dispatch-04356c8678ae8dd1d7ddca2d0460b514116adc2e.zip
Merge branch 'feature/vision-handoff' into dev
# Conflicts: # packages/session-orchestrator/src/extension.ts # packages/session-orchestrator/src/orchestrator.ts
-rw-r--r--bun.lock13
-rw-r--r--packages/conversation-store/src/keys.ts8
-rw-r--r--packages/conversation-store/src/store.ts77
-rw-r--r--packages/host-bin/package.json3
-rw-r--r--packages/host-bin/src/main.ts8
-rw-r--r--packages/host-bin/tsconfig.json3
-rw-r--r--packages/kernel/src/contracts/conversation.ts2
-rw-r--r--packages/kernel/src/contracts/index.ts2
-rw-r--r--packages/kernel/src/contracts/provider.ts10
-rw-r--r--packages/openai-stream/src/convert-messages.test.ts94
-rw-r--r--packages/openai-stream/src/convert-messages.ts45
-rw-r--r--packages/openai-stream/src/index.ts10
-rw-r--r--packages/openai-stream/src/listModels.test.ts49
-rw-r--r--packages/openai-stream/src/listModels.ts26
-rw-r--r--packages/session-orchestrator/src/extension.ts15
-rw-r--r--packages/session-orchestrator/src/orchestrator.ts132
-rw-r--r--packages/session-orchestrator/src/pure.test.ts33
-rw-r--r--packages/session-orchestrator/src/pure.ts32
-rw-r--r--packages/transport-contract/src/contract.types.test.ts42
-rw-r--r--packages/transport-contract/src/index.ts41
-rw-r--r--packages/transport-http/src/app.ts92
-rw-r--r--packages/transport-http/src/logic.test.ts63
-rw-r--r--packages/transport-http/src/logic.ts34
-rw-r--r--packages/transport-ws/src/extension.ts1
-rw-r--r--packages/transport-ws/src/router.ts37
-rw-r--r--packages/vision-handoff/package.json14
-rw-r--r--packages/vision-handoff/src/extension.ts198
-rw-r--r--packages/vision-handoff/src/index.ts21
-rw-r--r--packages/vision-handoff/src/pure.test.ts180
-rw-r--r--packages/vision-handoff/src/pure.ts156
-rw-r--r--packages/vision-handoff/src/service.test.ts375
-rw-r--r--packages/vision-handoff/src/service.ts684
-rw-r--r--packages/vision-handoff/src/tool.ts137
-rw-r--r--packages/vision-handoff/tsconfig.json12
-rw-r--r--packages/wire/src/index.test.ts31
-rw-r--r--packages/wire/src/index.ts43
-rw-r--r--tsconfig.json3
37 files changed, 2707 insertions, 19 deletions
diff --git a/bun.lock b/bun.lock
index 602e42a..493da15 100644
--- a/bun.lock
+++ b/bun.lock
@@ -104,6 +104,7 @@
"@dispatch/tool-youtube-transcript": "workspace:*",
"@dispatch/transport-http": "workspace:*",
"@dispatch/transport-ws": "workspace:*",
+ "@dispatch/vision-handoff": "workspace:*",
},
},
"packages/journal-sink": {
@@ -371,6 +372,16 @@
"name": "@dispatch/ui-contract",
"version": "0.2.0",
},
+ "packages/vision-handoff": {
+ "name": "@dispatch/vision-handoff",
+ "version": "0.0.0",
+ "dependencies": {
+ "@dispatch/conversation-store": "workspace:*",
+ "@dispatch/credential-store": "workspace:*",
+ "@dispatch/kernel": "workspace:*",
+ "@dispatch/openai-stream": "workspace:*",
+ },
+ },
"packages/wire": {
"name": "@dispatch/wire",
"version": "0.12.0",
@@ -473,6 +484,8 @@
"@dispatch/ui-contract": ["@dispatch/ui-contract@workspace:packages/ui-contract"],
+ "@dispatch/vision-handoff": ["@dispatch/vision-handoff@workspace:packages/vision-handoff"],
+
"@dispatch/wire": ["@dispatch/wire@workspace:packages/wire"],
"@esbuild/aix-ppc64": ["@esbuild/[email protected]", "", { "os": "aix", "cpu": "ppc64" }, "sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg=="],
diff --git a/packages/conversation-store/src/keys.ts b/packages/conversation-store/src/keys.ts
index b2c635d..6ec2bc5 100644
--- a/packages/conversation-store/src/keys.ts
+++ b/packages/conversation-store/src/keys.ts
@@ -66,6 +66,14 @@ export function compactThresholdKey(conversationId: string): string {
return `conv:${conversationId}:compact-percent`;
}
+/**
+ * Storage key for one conversation's image transcription cache. The value
+ * stored under it is a JSON map of imageUrl → transcription.
+ */
+export function imageTranscriptionsKey(conversationId: string): string {
+  const prefix = "conv:";
+  const suffix = ":image-transcriptions";
+  return prefix.concat(conversationId, suffix);
+}
+
+/** Global vision settings (image compaction limit + compaction model). */
+export const VISION_SETTINGS_KEY = "vision-settings";
+
export function metaKey(conversationId: string): string {
return `conv:${conversationId}:meta`;
}
diff --git a/packages/conversation-store/src/store.ts b/packages/conversation-store/src/store.ts
index f90e809..69334e6 100644
--- a/packages/conversation-store/src/store.ts
+++ b/packages/conversation-store/src/store.ts
@@ -20,6 +20,7 @@ import {
compactThresholdKey,
computerKey,
cwdKey,
+ imageTranscriptionsKey,
metaKey,
metricsKey,
metricsPrefix,
@@ -28,6 +29,7 @@ import {
parseSeq,
reasoningEffortKey,
seqKey,
+ VISION_SETTINGS_KEY,
workspaceKey,
} from "./keys.js";
import { reconcileWithReport } from "./reconcile.js";
@@ -141,6 +143,35 @@ export interface ConversationStore {
/** Set the compact percent (0-100, 0 = manual only). */
readonly setCompactPercent: (conversationId: string, percent: number) => Promise<void>;
/**
+ * Get the per-conversation image transcription cache: a map of image URL →
+ * transcription text. Used by the vision handoff to avoid re-transcribing
+ * old images that were compacted to text on a previous turn. Returns an
+ * empty map when none are cached.
+ */
+ readonly getImageTranscriptions: (conversationId: string) => Promise<ReadonlyMap<string, string>>;
+ /**
+ * Upsert a single image transcription into the per-conversation cache.
+ * Merges with any existing transcriptions (does NOT replace the whole map).
+ */
+ readonly setImageTranscription: (
+ conversationId: string,
+ imageUrl: string,
+ transcription: string,
+ ) => Promise<void>;
+ /**
+ * Get the global vision settings (image compaction limit + compaction model).
+ * The limit defaults to 10 when never set; the compaction model defaults to
+ * null (auto-select). Shared across ALL conversations and vision models.
+ */
+ readonly getVisionSettings: () => Promise<{
+ readonly imageLimit: number;
+ readonly compactionModel: string | null;
+ }>;
+ /** Set the global vision image compaction limit (0 = disabled). */
+ readonly setVisionImageLimit: (limit: number) => Promise<void>;
+ /** Set the global vision compaction model (null = auto-select). */
+ readonly setVisionCompactionModel: (model: string | null) => Promise<void>;
+ /**
* Set the `compactedFrom` field on a conversation's metadata, pointing to
* the archive conversation that holds the pre-compaction history.
*/
@@ -1004,6 +1035,52 @@ export function createConversationStore(
}
},
+ /**
+  * Load the per-conversation image transcription cache (image URL →
+  * transcription text).
+  *
+  * Returns an empty map when nothing is stored, when the stored row is not
+  * valid JSON, or when it parses to anything other than a plain JSON object.
+  * Entries whose value is not a string are dropped, so callers never observe
+  * a value that violates the declared `ReadonlyMap<string, string>` type.
+  */
+ async getImageTranscriptions(conversationId) {
+   const raw = await storage.get(imageTranscriptionsKey(conversationId));
+   if (raw === null) return new Map();
+
+   let parsed: unknown;
+   try {
+     parsed = JSON.parse(raw);
+   } catch {
+     // Unparseable row: behave as if nothing were cached rather than throwing.
+     return new Map();
+   }
+
+   // A JSON string, number, array or null is not a url → text map. Without
+   // this guard Object.entries would turn e.g. "abc" into index/char pairs.
+   if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
+     return new Map();
+   }
+
+   const cache = new Map<string, string>();
+   for (const [url, text] of Object.entries(parsed)) {
+     if (typeof text === "string") cache.set(url, text);
+   }
+   return cache;
+ },
+
+ /**
+  * Upsert one image transcription: reads the cached entries, overlays the
+  * given URL → transcription pair, and writes the whole map back as a JSON
+  * object under the conversation's image-transcriptions key.
+  */
+ async setImageTranscription(conversationId, imageUrl, transcription) {
+   const cached = await this.getImageTranscriptions(conversationId);
+   const snapshot: Record<string, string> = {};
+   cached.forEach((text, url) => {
+     snapshot[url] = text;
+   });
+   // Overwrites in place when the URL is already cached, appends otherwise.
+   snapshot[imageUrl] = transcription;
+   const payload = JSON.stringify(snapshot);
+   await storage.set(imageTranscriptionsKey(conversationId), payload);
+ },
+
+ /**
+  * Read the global vision settings.
+  *
+  * Falls back to the defaults (`imageLimit: 10`, `compactionModel: null`)
+  * when nothing is stored, when the row is not valid JSON, or when it does
+  * not parse to an object. Each field is validated on its own: a non-number
+  * `imageLimit` becomes 10 and a non-string `compactionModel` becomes null,
+  * so the result always matches the declared `number` / `string | null`.
+  */
+ async getVisionSettings() {
+   const raw = await storage.get(VISION_SETTINGS_KEY);
+
+   let parsed: unknown = null;
+   if (raw !== null) {
+     try {
+       parsed = JSON.parse(raw);
+     } catch {
+       // Unparseable row: keep `parsed` null so the defaults below apply.
+     }
+   }
+
+   // Narrowed by the runtime check; field types are still verified below.
+   const row: Record<string, unknown> =
+     typeof parsed === "object" && parsed !== null ? (parsed as Record<string, unknown>) : {};
+   const { imageLimit, compactionModel } = row;
+
+   return {
+     imageLimit: typeof imageLimit === "number" ? imageLimit : 10,
+     compactionModel: typeof compactionModel === "string" ? compactionModel : null,
+   };
+ },
+
+ /**
+  * Persist a new global image limit. The compaction model currently
+  * returned by getVisionSettings is written back unchanged.
+  */
+ async setVisionImageLimit(limit) {
+   const { compactionModel } = await this.getVisionSettings();
+   const payload = JSON.stringify({ imageLimit: limit, compactionModel });
+   await storage.set(VISION_SETTINGS_KEY, payload);
+ },
+
+ /**
+  * Persist a new global compaction model. The image limit currently
+  * returned by getVisionSettings is written back unchanged.
+  */
+ async setVisionCompactionModel(model) {
+   const { imageLimit } = await this.getVisionSettings();
+   const payload = JSON.stringify({ imageLimit, compactionModel: model });
+   await storage.set(VISION_SETTINGS_KEY, payload);
+ },
+
async setCompactedFrom(conversationId, newConversationId) {
const raw = await storage.get(metaKey(conversationId));
const existing = raw !== null ? parseMetaRow(raw) : null;
diff --git a/packages/host-bin/package.json b/packages/host-bin/package.json
index e68251b..7d3b38c 100644
--- a/packages/host-bin/package.json
+++ b/packages/host-bin/package.json
@@ -34,6 +34,7 @@
"@dispatch/surface-loaded-extensions": "workspace:*",
"@dispatch/surface-registry": "workspace:*",
"@dispatch/transport-ws": "workspace:*",
- "@dispatch/system-prompt": "workspace:*"
+ "@dispatch/system-prompt": "workspace:*",
+ "@dispatch/vision-handoff": "workspace:*"
}
}
diff --git a/packages/host-bin/src/main.ts b/packages/host-bin/src/main.ts
index 2ab1118..aa114d5 100644
--- a/packages/host-bin/src/main.ts
+++ b/packages/host-bin/src/main.ts
@@ -44,6 +44,7 @@ import { extension as toolWriteFileExt } from "@dispatch/tool-write-file";
import { extension as toolYoutubeTranscriptExt } from "@dispatch/tool-youtube-transcript";
import { createTransportHttpExtension } from "@dispatch/transport-http";
import { createTransportWsExtension } from "@dispatch/transport-ws";
+import { extension as visionHandoffExt } from "@dispatch/vision-handoff";
import type { ChildHandle } from "./collector-supervisor.js";
import { createCollectorSupervisor } from "./collector-supervisor.js";
import { configMapToAccess, envToConfigMap } from "./config.js";
@@ -206,6 +207,13 @@ async function boot(): Promise<void> {
const extensions: Extension[] = [
...CORE_EXTENSIONS,
createCredentialStoreExtension({ credentials }),
+ // vision-handoff activates AFTER credential-store (it resolves the
+ // credential-store service at activate time to find vision-capable models).
+ // Placed here, not in CORE_EXTENSIONS, so the service is available when it
+ // activates. The session-orchestrator resolves its service LAZILY
+ // (per-turn), so activation order between it and session-orchestrator
+ // doesn't matter.
+ visionHandoffExt,
...externalExtensions,
];
diff --git a/packages/host-bin/tsconfig.json b/packages/host-bin/tsconfig.json
index cb85915..09b87df 100644
--- a/packages/host-bin/tsconfig.json
+++ b/packages/host-bin/tsconfig.json
@@ -63,6 +63,9 @@
},
{
"path": "../transport-ws"
+ },
+ {
+ "path": "../vision-handoff"
}
]
}
diff --git a/packages/kernel/src/contracts/conversation.ts b/packages/kernel/src/contracts/conversation.ts
index f074c52..80da86e 100644
--- a/packages/kernel/src/contracts/conversation.ts
+++ b/packages/kernel/src/contracts/conversation.ts
@@ -12,6 +12,8 @@ export type {
ConversationMeta,
ConversationStatus,
ErrorChunk,
+ ImageChunk,
+ ImageInput,
Role,
StepId,
StepMetrics,
diff --git a/packages/kernel/src/contracts/index.ts b/packages/kernel/src/contracts/index.ts
index 09e0a56..28e0a0b 100644
--- a/packages/kernel/src/contracts/index.ts
+++ b/packages/kernel/src/contracts/index.ts
@@ -19,6 +19,8 @@ export type {
ConversationMeta,
ConversationStatus,
ErrorChunk,
+ ImageChunk,
+ ImageInput,
Role,
StepId,
StepMetrics,
diff --git a/packages/kernel/src/contracts/provider.ts b/packages/kernel/src/contracts/provider.ts
index b6dc8ca..3137073 100644
--- a/packages/kernel/src/contracts/provider.ts
+++ b/packages/kernel/src/contracts/provider.ts
@@ -114,6 +114,16 @@ export interface ModelInfo {
readonly displayName?: string;
/** The model's max context window in tokens (e.g. 200000). Optional — providers that don't report it leave it undefined. */
readonly contextWindow?: number;
+ /**
+ * Whether this model can natively accept image input (vision/multimodal).
+ * When `true`, image chunks in a user message are passed through to the
+ * provider serialized to its image-content format. When `false`/absent, the
+ * orchestrator's vision handoff transcribes images to text (via a
+ * vision-capable model) before the model sees them. Optional — providers
+ * that cannot detect it leave it undefined (treated as non-vision); a
+ * provider that knows a model is vision-capable sets it `true`.
+ */
+ readonly vision?: boolean;
}
/**
diff --git a/packages/openai-stream/src/convert-messages.test.ts b/packages/openai-stream/src/convert-messages.test.ts
index 3520eb5..57c7d81 100644
--- a/packages/openai-stream/src/convert-messages.test.ts
+++ b/packages/openai-stream/src/convert-messages.test.ts
@@ -35,6 +35,100 @@ describe("convertMessages", () => {
expect(result).toEqual([{ role: "user", content: "Hello, world!" }]);
});
+ it("converts a user message with a text + image chunk to a multimodal content array", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "text", text: "What is in this image?" },
+ { type: "image", url: "data:image/png;base64,iVBORw0KGgo=" },
+ ],
+ },
+ ];
+
+ const result = convertMessages(messages);
+ expect(result).toEqual([
+ {
+ role: "user",
+ content: [
+ { type: "text", text: "What is in this image?" },
+ { type: "image_url", image_url: { url: "data:image/png;base64,iVBORw0KGgo=" } },
+ ],
+ },
+ ]);
+ });
+
+ it("converts an image-only user message (no text) to a content array with just the image", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [{ type: "image", url: "https://example.com/cat.png" }],
+ },
+ ];
+
+ const result = convertMessages(messages);
+ expect(result).toEqual([
+ {
+ role: "user",
+ content: [{ type: "image_url", image_url: { url: "https://example.com/cat.png" } }],
+ },
+ ]);
+ });
+
+ it("converts a user message with multiple images interspersed with text", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "text", text: "Compare these:" },
+ { type: "image", url: "data:image/png;base64,aaa" },
+ { type: "text", text: "and" },
+ { type: "image", url: "data:image/jpeg;base64,bbb" },
+ ],
+ },
+ ];
+
+ const result = convertMessages(messages);
+ expect(result).toHaveLength(1);
+ const content = result[0]?.content;
+ expect(Array.isArray(content)).toBe(true);
+ if (Array.isArray(content)) {
+ expect(content).toHaveLength(4);
+ expect(content[0]).toEqual({ type: "text", text: "Compare these:" });
+ expect(content[1]).toEqual({
+ type: "image_url",
+ image_url: { url: "data:image/png;base64,aaa" },
+ });
+ expect(content[2]).toEqual({ type: "text", text: "and" });
+ expect(content[3]).toEqual({
+ type: "image_url",
+ image_url: { url: "data:image/jpeg;base64,bbb" },
+ });
+ }
+ });
+
+ it("skips empty text parts in a multimodal message but keeps images", () => {
+ const messages: ChatMessage[] = [
+ {
+ role: "user",
+ chunks: [
+ { type: "text", text: "" },
+ { type: "image", url: "data:image/png;base64,x" },
+ ],
+ },
+ ];
+
+ const result = convertMessages(messages);
+ const content = result[0]?.content;
+ expect(Array.isArray(content)).toBe(true);
+ if (Array.isArray(content)) {
+ // Empty text part is dropped; only the image remains.
+ expect(content).toEqual([
+ { type: "image_url", image_url: { url: "data:image/png;base64,x" } },
+ ]);
+ }
+ });
+
it("converts an assistant message with text only", () => {
const messages: ChatMessage[] = [
{
diff --git a/packages/openai-stream/src/convert-messages.ts b/packages/openai-stream/src/convert-messages.ts
index e830243..eba3575 100644
--- a/packages/openai-stream/src/convert-messages.ts
+++ b/packages/openai-stream/src/convert-messages.ts
@@ -1,8 +1,28 @@
import type { ChatMessage, Chunk } from "@dispatch/kernel";
+/** A text part within a multimodal OpenAI content array. */
+export interface OpenAITextPart {
+ readonly type: "text";
+ readonly text: string;
+}
+
+/** An image part within a multimodal OpenAI content array (OpenAI vision format). */
+export interface OpenAIImagePart {
+ readonly type: "image_url";
+ readonly image_url: { readonly url: string };
+}
+
+/**
+ * A part of a multimodal message content array. When a message has mixed text
+ * and image chunks, the content is serialized as an array of these parts
+ * (OpenAI's vision format). Plain-text messages keep a string `content` for
+ * byte-stability with providers that only accept strings.
+ */
+export type OpenAIContentPart = OpenAITextPart | OpenAIImagePart;
+
export interface OpenAIMessage {
readonly role: "system" | "user" | "assistant" | "tool";
- readonly content: string | null;
+ readonly content: string | null | readonly OpenAIContentPart[];
readonly tool_calls?: readonly OpenAIToolCall[];
readonly tool_call_id?: string;
}
@@ -49,6 +69,29 @@ function convertSystemMessage(msg: ChatMessage): OpenAIMessage {
}
function convertUserMessage(msg: ChatMessage): OpenAIMessage {
+ // If the message has image chunks, serialize as a multimodal content array
+ // (OpenAI vision format): text parts + image_url parts in chunk order.
+ // Plain text-only messages keep a string `content` for byte-stability with
+ // providers that only accept a string (and to keep prompt-cache prefixes
+ // unchanged for the common no-image case).
+ const hasImage = msg.chunks.some((c) => c.type === "image");
+ if (hasImage) {
+ const parts: OpenAIContentPart[] = [];
+ for (const chunk of msg.chunks) {
+ if (chunk.type === "text") {
+ if (chunk.text.length > 0) {
+ parts.push({ type: "text", text: chunk.text });
+ }
+ } else if (chunk.type === "image") {
+ parts.push({ type: "image_url", image_url: { url: chunk.url } });
+ }
+ // Non-text/non-image chunks (tool-call, thinking, etc.) are not part of a
+ // user message's provider content and are skipped here.
+ }
+ // An image-only message (no text) still needs at least the image part.
+ return { role: "user", content: parts.length > 0 ? parts : "" };
+ }
+
const text = msg.chunks
.filter((c): c is Extract<Chunk, { type: "text" }> => c.type === "text")
.map((c) => c.text)
diff --git a/packages/openai-stream/src/index.ts b/packages/openai-stream/src/index.ts
index bd2f673..3f76b99 100644
--- a/packages/openai-stream/src/index.ts
+++ b/packages/openai-stream/src/index.ts
@@ -1,8 +1,14 @@
-export type { OpenAIMessage, OpenAIToolCall } from "./convert-messages.js";
+export type {
+ OpenAIContentPart,
+ OpenAIImagePart,
+ OpenAIMessage,
+ OpenAITextPart,
+ OpenAIToolCall,
+} from "./convert-messages.js";
export { convertMessages } from "./convert-messages.js";
export type { OpenAITool } from "./convert-tools.js";
export { convertTools } from "./convert-tools.js";
-export { parseModelList } from "./listModels.js";
+export { isVisionModelId, parseModelList } from "./listModels.js";
export { parseSSELines } from "./parse-sse.js";
export type { CreateOpenAICompatProviderOpts } from "./provider.js";
export { createOpenAICompatProvider } from "./provider.js";
diff --git a/packages/openai-stream/src/listModels.test.ts b/packages/openai-stream/src/listModels.test.ts
index c2438bc..2e3b1a3 100644
--- a/packages/openai-stream/src/listModels.test.ts
+++ b/packages/openai-stream/src/listModels.test.ts
@@ -1,7 +1,7 @@
import type { ApiKeyCredentials, ModelInfo, ProviderContract } from "@dispatch/kernel";
import type { FetchLike } from "@dispatch/trace-replay";
import { describe, expect, it, vi } from "vitest";
-import { parseModelList } from "./listModels.js";
+import { isVisionModelId, parseModelList } from "./listModels.js";
import { createOpenAICompatProvider } from "./provider.js";
function makeProvider(fetchFn: FetchLike, apiKey = "sk-test-1234567890abcdef"): ProviderContract {
@@ -35,6 +35,53 @@ describe("listModels — pure mapping (parseModelList)", () => {
const result = parseModelList([]);
expect(result).toEqual([]);
});
+
+ it("extracts contextWindow from common field names", () => {
+ const result = parseModelList([
+ { id: "m1", context_length: 128000 },
+ { id: "m2", context_window: 200000 },
+ { id: "m3", max_context_length: 64000 },
+ { id: "m4", max_tokens: 8000 },
+ ]);
+ expect(result).toEqual([
+ { id: "m1", contextWindow: 128000 },
+ { id: "m2", contextWindow: 200000 },
+ { id: "m3", contextWindow: 64000 },
+ { id: "m4", contextWindow: 8000 },
+ ]);
+ });
+});
+
+describe("listModels — vision capability detection", () => {
+ it("isVisionModelId returns true for umans kimi and qwen model ids", () => {
+ expect(isVisionModelId("umans-kimi-k2.7")).toBe(true);
+ expect(isVisionModelId("Umans-Kimi-K2.7")).toBe(true); // case-insensitive
+ expect(isVisionModelId("umans-qwen3.6-35b-a3b")).toBe(true);
+ });
+
+ it("isVisionModelId returns false for non-vision model ids", () => {
+ expect(isVisionModelId("umans-glm-5.2")).toBe(false);
+ expect(isVisionModelId("umans-coder")).toBe(false);
+ expect(isVisionModelId("umans-flash")).toBe(false);
+ expect(isVisionModelId("kimi-k2.7-code")).toBe(false); // opencode kimi, not umans
+ expect(isVisionModelId("qwen3.7-max")).toBe(false); // opencode qwen, not umans
+ expect(isVisionModelId("deepseek-v4-flash")).toBe(false);
+ });
+
+ it("parseModelList sets vision: true on umans kimi and qwen models only", () => {
+ const result = parseModelList([
+ { id: "umans-kimi-k2.7", context_length: 262144 },
+ { id: "umans-qwen3.6-35b-a3b", context_length: 262144 },
+ { id: "umans-glm-5.2", context_length: 405504 },
+ { id: "umans-coder" },
+ ]);
+ expect(result).toEqual([
+ { id: "umans-kimi-k2.7", contextWindow: 262144, vision: true },
+ { id: "umans-qwen3.6-35b-a3b", contextWindow: 262144, vision: true },
+ { id: "umans-glm-5.2", contextWindow: 405504 },
+ { id: "umans-coder" },
+ ]);
+ });
});
describe("listModels — provider contract", () => {
diff --git a/packages/openai-stream/src/listModels.ts b/packages/openai-stream/src/listModels.ts
index 0e94c43..df116b0 100644
--- a/packages/openai-stream/src/listModels.ts
+++ b/packages/openai-stream/src/listModels.ts
@@ -24,17 +24,39 @@ interface OpenAIModelListResponse {
}
/**
+ * Whether a model id is vision-capable (can natively accept image input).
+ *
+ * The OpenAI-compatible `/models` endpoint does not reliably report image
+ * capabilities, so this is a hardcoded heuristic by model id: the Umans Kimi
+ * (`umans-kimi-k2.7`) and Umans Qwen (`umans-qwen3.6-35b-a3b`) models are
+ * vision-capable; all others are treated as non-vision. This is the single
+ * source of truth — the orchestrator's vision handoff and the `consult_vision`
+ * tool both consult the `ModelInfo.vision` flag this sets, so adding a model
+ * here enables vision everywhere. Pure: id → boolean, no I/O.
+ *
+ * (When an endpoint gains reliable vision reporting, this can be replaced with
+ * a real capability check without changing callers.)
+ */
+export function isVisionModelId(id: string): boolean {
+  // Case-insensitive substring match against the known vision-capable
+  // model families; any id containing neither is treated as non-vision.
+  const normalized = id.toLowerCase();
+  for (const family of ["umans-kimi", "umans-qwen"]) {
+    if (normalized.includes(family)) return true;
+  }
+  return false;
+}
+
+/**
* Pure mapping: raw OpenAI-compatible model list → ModelInfo[].
- * Extracts `contextWindow` from common field names (providers vary).
- * Extracted for direct unit testing with no I/O.
+ * Extracts `contextWindow` from common field names (providers vary) and
+ * detects vision capability via {@link isVisionModelId}. Extracted for direct
+ * unit testing with no I/O.
*/
export function parseModelList(data: readonly OpenAIModelEntry[]): readonly ModelInfo[] {
return data.map((entry) => {
const contextWindow =
entry.context_length ?? entry.context_window ?? entry.max_context_length ?? entry.max_tokens;
+ const vision = isVisionModelId(entry.id);
return {
id: entry.id,
...(contextWindow !== undefined ? { contextWindow } : {}),
+ ...(vision ? { vision } : {}),
};
});
}
diff --git a/packages/session-orchestrator/src/extension.ts b/packages/session-orchestrator/src/extension.ts
index 0cd83ef..783d894 100644
--- a/packages/session-orchestrator/src/extension.ts
+++ b/packages/session-orchestrator/src/extension.ts
@@ -12,6 +12,7 @@ import {
createSessionOrchestrator,
createWarmService,
sessionOrchestratorHandle,
+ visionHandoffLocalHandle,
} from "./orchestrator.js";
import { selectFirstProvider } from "./pure.js";
import { filterRemoteIncompatibleTools, toolsFilter } from "./tools-filter.js";
@@ -107,6 +108,20 @@ export function activate(host: HostAPI): void {
return undefined;
}
},
+ resolveVisionHandoff: () => {
+   // Looks the vision-handoff service up on every call instead of at
+   // activation, so activation order doesn't matter. Yields undefined when
+   // no extension with id "vision-handoff" is listed by the host (images
+   // then pass through unchanged), or when getService throws. Checking the
+   // extension list first avoids the getService throw in the absent case.
+   const isLoaded = host
+     .getExtensions()
+     .some((manifest) => manifest.id === "vision-handoff");
+   if (isLoaded) {
+     try {
+       return host.getService(visionHandoffLocalHandle);
+     } catch {
+       // Treated the same as "not loaded": fall through to undefined.
+     }
+   }
+   return undefined;
+ },
});
host.provideService(sessionOrchestratorHandle, orchestrator);
diff --git a/packages/session-orchestrator/src/orchestrator.ts b/packages/session-orchestrator/src/orchestrator.ts
index 617c079..5c36922 100644
--- a/packages/session-orchestrator/src/orchestrator.ts
+++ b/packages/session-orchestrator/src/orchestrator.ts
@@ -5,6 +5,7 @@ import type {
CompactionResult,
ConversationStatus,
EventHookDescriptor,
+ ImageInput,
Logger,
ModelInfo,
ProviderContract,
@@ -34,11 +35,71 @@ import {
} from "./pure.js";
import type { ToolAssembly } from "./tools-filter.js";
+// --- Vision handoff (lazy, optional) ---
+
+/**
+ * Minimal contract the vision-handoff service satisfies. Defined here (not
+ * imported from the vision-handoff package) so the orchestrator has NO
+ * compile-time dependency on it — the service is resolved lazily at runtime
+ * (like the message-queue / system-prompt services), and the feature degrades
+ * off cleanly when the extension isn't loaded (images pass through unchanged,
+ * which is correct for vision-capable models and a no-op for text-only turns).
+ *
+ * `prepareForProvider` transforms a message list for the provider: if the
+ * active model is vision-capable, messages pass through unchanged; otherwise
+ * image chunks are replaced with numbered placeholders (telling the model to
+ * call `consult_vision`) and the images are registered for tool access.
+ */
+export interface VisionHandoffService {
+ /**
+ * Store images to tmp files and return compact URLs. Each input image's data
+ * URL is saved to a tmp file and replaced with a compact HTTP path so the
+ * persisted conversation store holds a tiny string, not megabytes of base64.
+ * When `saveImageToTmp` is not configured, data URLs pass through unchanged.
+ */
+ readonly storeImages: (
+ conversationId: string,
+ images: readonly ImageInput[],
+ ) => Promise<readonly ImageInput[]>;
+
+ /** Delete all tmp images for a conversation (on close). Best-effort. */
+ readonly purgeConversationImages: (conversationId: string) => Promise<void>;
+
+ readonly prepareForProvider: (
+ messages: readonly ChatMessage[],
+ currentModelName: string | undefined,
+ opts?: {
+ readonly conversationId?: string;
+ readonly imageLimit?: number;
+ readonly signal?: AbortSignal;
+ readonly logger?: Logger;
+ },
+ ) => Promise<readonly ChatMessage[]>;
+}
+
+/**
+ * Local handle for the vision-handoff service, keyed by the same ID the
+ * vision-handoff extension registers under (`"vision-handoff/service"`). Defined
+ * locally (not imported) so the orchestrator has no compile-time dependency on
+ * the vision-handoff package — the service is resolved lazily at runtime, and
+ * the feature degrades off cleanly when the extension isn't loaded.
+ */
+export const visionHandoffLocalHandle: ServiceHandle<VisionHandoffService> =
+ defineService<VisionHandoffService>("vision-handoff/service");
+
// --- Broadcast hub types ---
export interface StartTurnInput {
readonly conversationId: string;
readonly text: string;
+ /**
+ * Images attached to this turn (e.g. user-pasted screenshots). Each is
+ * appended as an `image` chunk on the persisted user message. For a
+ * vision-capable model the images pass through to the provider natively; for
+ * a non-vision model the vision handoff transcribes them to text first.
+ * Optional — omit for a text-only turn.
+ */
+ readonly images?: readonly ImageInput[];
readonly modelName?: string;
readonly cwd?: string;
/**
@@ -77,6 +138,12 @@ export type StartTurnResult =
export interface EnqueueInput {
readonly conversationId: string;
readonly text: string;
+ /**
+ * Images attached (the steering / opening message analog of
+ * `StartTurnInput.images`). Threaded to `startTurn` when the conversation is
+ * idle (the message starts a turn). Additive optional.
+ */
+ readonly images?: readonly ImageInput[];
/** Workspace to stamp on a new conversation. Defaults to `"default"`. */
readonly workspaceId?: string;
/**
@@ -291,6 +358,8 @@ export interface SessionOrchestrator {
workspaceId?: string;
/** Explicit system-prompt override — see {@link StartTurnInput.systemPrompt}. */
systemPrompt?: string;
+ /** Images attached to this turn — see {@link StartTurnInput.images}. */
+ images?: readonly ImageInput[];
}): Promise<void>;
}
@@ -345,6 +414,17 @@ export interface SessionOrchestratorDeps {
* when the stream completes. Lazy so activation order doesn't matter.
*/
readonly resolveConcurrencyLimiter?: () => ConcurrencyLimiter | undefined;
+ /**
+ * Lazily resolves the vision-handoff service, or `undefined` when the
+ * vision-handoff extension isn't loaded. Used to transcribe image chunks to
+ * text for non-vision models before they reach the provider (so a text-only
+ * model can still reason about pasted/code images). When `undefined`, images
+ * pass through unchanged (correct for vision-capable models; a text-only model
+ * would then receive image content its API may reject — the feature degrades
+ * off cleanly for text-only turns since there are no images). Lazy so
+ * activation order doesn't matter; called per-turn.
+ */
+ readonly resolveVisionHandoff?: () => VisionHandoffService | undefined;
/** Apply the per-turn tools filter chain. Injected for testability. */
readonly applyToolsFilter: (assembly: ToolAssembly) => Promise<ToolAssembly>;
/** Base logger (auto-scoped to this extension); childed per turn for span capture. */
@@ -447,6 +527,7 @@ export function createSessionOrchestrator(
reasoningEffortOverride: ReasoningEffort | undefined,
workspaceId: string,
systemPromptOverride: string | undefined,
+ images: readonly ImageInput[] | undefined,
): void {
const turnId = generateTurnId();
const promptStartedAt = deps.now?.() ?? Date.now();
@@ -569,7 +650,18 @@ export function createSessionOrchestrator(
const effectiveModelName = resolveModelName(modelName, storedModel);
const history = await deps.conversationStore.load(conversationId);
- const userMsg = buildUserMessage(text);
+
+ // Store images to tmp files (compact URLs) BEFORE building the user
+ // message so the persisted chunks hold tiny URL references, not
+ // megabytes of base64 data URLs. When the vision-handoff service isn't
+ // loaded, images pass through unchanged (backward compatible).
+ const visionHandoffForStore = deps.resolveVisionHandoff?.();
+ const storedImages =
+ visionHandoffForStore !== undefined && images !== undefined
+ ? await visionHandoffForStore.storeImages(conversationId, images)
+ : images;
+
+ const userMsg = buildUserMessage(text, storedImages);
// Workspace assignment for new conversations happens BEFORE
// effective-cwd resolution (see workspaceSetupPromise above) so
@@ -744,9 +836,35 @@ export function createSessionOrchestrator(
return [{ role: "user", chunks: [{ type: "text", text: steerText }] }];
};
+ // Vision handoff: transform the message list for the provider. When the
+ // active model is vision-capable, images pass through natively (no-op).
+ // When it is NOT vision-capable, image chunks are transcribed to text
+ // descriptions via a vision-capable model — so a text-only model can
+ // still reason about images. The PERSISTED user message keeps the
+ // original image chunks (appended below); only the provider's view is
+ // transcribed. When the vision-handoff service isn't loaded, images pass
+ // through unchanged (correct for vision models; text-only models would
+ // then receive image content their API may reject — degrades off cleanly
+ // for text-only turns with no images).
+ const visionHandoff = deps.resolveVisionHandoff?.();
+ let providerMessages: readonly ChatMessage[] = [...history, userMsg];
+ if (visionHandoff !== undefined) {
+ const visionSettings = await deps.conversationStore.getVisionSettings();
+ providerMessages = await visionHandoff.prepareForProvider(
+ providerMessages,
+ effectiveModelName,
+ {
+ conversationId,
+ imageLimit: visionSettings.imageLimit,
+ signal: controller.signal,
+ ...(turnLogger !== undefined ? { logger: turnLogger } : {}),
+ },
+ );
+ }
+
const opts: RunTurnInput = {
provider,
- messages: [...history, userMsg],
+ messages: providerMessages,
tools: assembled.tools,
dispatch,
emit: emitAndAccumulate,
@@ -852,6 +970,7 @@ export function createSessionOrchestrator(
reasoningEffort,
workspaceId,
systemPrompt,
+ images,
}) {
if (activeTurns.has(conversationId)) {
return { started: false, reason: "already-active" };
@@ -865,18 +984,20 @@ export function createSessionOrchestrator(
reasoningEffort,
workspaceId ?? "default",
systemPrompt,
+ images,
);
const turn = activeTurns.get(conversationId);
const turnId = turn !== undefined ? turn.turnId : "";
return { started: true, turnId };
},
- enqueue({ conversationId, text, workspaceId, computerId }) {
+ enqueue({ conversationId, text, workspaceId, computerId, images }) {
const result = orchestrator.startTurn({
conversationId,
text,
...(workspaceId !== undefined ? { workspaceId } : {}),
...(computerId !== undefined ? { computerId } : {}),
+ ...(images !== undefined ? { images } : {}),
});
if (result.started) {
return { startedTurn: true, queue: [] };
@@ -939,6 +1060,9 @@ export function createSessionOrchestrator(
});
});
void deps.conversationStore.setConversationStatus(conversationId, "closed");
+ // Purge tmp images for this conversation (best-effort, fire-and-forget).
+ const vh = deps.resolveVisionHandoff?.();
+ if (vh !== undefined) void vh.purgeConversationImages(conversationId);
return { abortedTurn };
},
@@ -961,6 +1085,7 @@ export function createSessionOrchestrator(
reasoningEffort,
workspaceId,
systemPrompt,
+ images,
}) {
const turnInput: StartTurnInput = {
conversationId,
@@ -971,6 +1096,7 @@ export function createSessionOrchestrator(
...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
...(workspaceId !== undefined ? { workspaceId } : {}),
...(systemPrompt !== undefined ? { systemPrompt } : {}),
+ ...(images !== undefined ? { images } : {}),
};
const result = orchestrator.startTurn(turnInput);
if (!result.started) {
diff --git a/packages/session-orchestrator/src/pure.test.ts b/packages/session-orchestrator/src/pure.test.ts
index c75cb82..7a574f1 100644
--- a/packages/session-orchestrator/src/pure.test.ts
+++ b/packages/session-orchestrator/src/pure.test.ts
@@ -26,6 +26,39 @@ describe("buildUserMessage", () => {
expect(msg.role).toBe("user");
expect(msg.chunks[0]).toEqual({ type: "text", text: "" });
});
+
+ it("appends image chunks after the text chunk when images are given", () => {
+ const msg = buildUserMessage("look at this", [
+ { url: "data:image/png;base64,aaa" },
+ { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" },
+ ]);
+ expect(msg.chunks).toHaveLength(3);
+ expect(msg.chunks[0]).toEqual({ type: "text", text: "look at this" });
+ expect(msg.chunks[1]).toEqual({ type: "image", url: "data:image/png;base64,aaa" });
+ expect(msg.chunks[2]).toEqual({
+ type: "image",
+ url: "data:image/jpeg;base64,bbb",
+ mimeType: "image/jpeg",
+ });
+ });
+
+ it("builds an image-only message when text is empty", () => {
+ const msg = buildUserMessage("", [{ url: "data:image/png;base64,zzz" }]);
+ expect(msg.chunks).toHaveLength(1);
+ expect(msg.chunks[0]).toEqual({ type: "image", url: "data:image/png;base64,zzz" });
+ });
+
+ it("includes mimeType when provided", () => {
+ const msg = buildUserMessage("hi", [
+ { url: "data:image/webp;base64,x", mimeType: "image/webp" },
+ ]);
+ expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBe("image/webp");
+ });
+
+ it("omits mimeType when not provided", () => {
+ const msg = buildUserMessage("hi", [{ url: "https://example.com/x.png" }]);
+ expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBeUndefined();
+ });
});
describe("selectFirstProvider", () => {
diff --git a/packages/session-orchestrator/src/pure.ts b/packages/session-orchestrator/src/pure.ts
index 2208e8f..0d2068f 100644
--- a/packages/session-orchestrator/src/pure.ts
+++ b/packages/session-orchestrator/src/pure.ts
@@ -1,12 +1,40 @@
import type {
ChatMessage,
+ Chunk,
+ ImageInput,
ProviderContract,
ReasoningEffort,
ToolDispatchPolicy,
} from "@dispatch/kernel";
-export function buildUserMessage(text: string): ChatMessage {
- return { role: "user", chunks: [{ type: "text", text }] };
+/**
+ * Assemble the user message that is persisted for a turn.
+ *
+ * Chunk order: the prompt text first (only when it is non-empty), then one
+ * `image` chunk per entry of `images`, in the order given. An image's
+ * `mimeType` is copied onto its chunk only when the input carries one. When
+ * there is neither text nor any image, the message falls back to a single
+ * empty text chunk so it never has zero chunks.
+ *
+ * Pure: inputs → a ChatMessage, no I/O.
+ */
+export function buildUserMessage(text: string, images?: readonly ImageInput[]): ChatMessage {
+  const textChunks: Chunk[] = text.length > 0 ? [{ type: "text", text }] : [];
+  const imageChunks = (images ?? []).map(
+    (image): Chunk => ({
+      type: "image",
+      url: image.url,
+      ...(image.mimeType !== undefined ? { mimeType: image.mimeType } : {}),
+    }),
+  );
+  const chunks: Chunk[] = [...textChunks, ...imageChunks];
+  if (chunks.length > 0) {
+    return { role: "user", chunks };
+  }
+  // No text and no images: keep the message non-empty with a blank text chunk.
+  return { role: "user", chunks: [{ type: "text", text: "" }] };
+}
// ── Provider-error retry backoff schedule ───────────────────────────────────
diff --git a/packages/transport-contract/src/contract.types.test.ts b/packages/transport-contract/src/contract.types.test.ts
index 9d3d904..34ff544 100644
--- a/packages/transport-contract/src/contract.types.test.ts
+++ b/packages/transport-contract/src/contract.types.test.ts
@@ -20,6 +20,7 @@ import type {
LspServerState,
LspStatusResponse,
McpStatusResponse,
+ ModelsResponse,
SetConversationComputerRequest,
SetCwdRequest,
SetWorkspaceDefaultComputerRequest,
@@ -55,6 +56,18 @@ const _chatWithoutComputer: ChatRequest = {
message: "hello",
};
+// ─── ChatRequest.images (additive optional) ──────────────────────────────────
+
+const _chatWithImages: ChatRequest = {
+ message: "What's in this screenshot?",
+ images: [{ url: "data:image/png;base64,iVBORw0KGgo=", mimeType: "image/png" }],
+};
+
+const _chatWithHttpImage: ChatRequest = {
+ message: "analyze this",
+ images: [{ url: "https://example.com/diagram.png" }],
+};
+
// ─── Computer list / single response ─────────────────────────────────────────
const _computer: Computer = {
@@ -255,6 +268,35 @@ describe("transport-contract types compile and are exported", () => {
expect(_chatWithComputer.computerId).toBe("prod-box");
});
+ // ─── ChatRequest.images (additive optional) ──────────────────────────────
+
+ it("ChatRequest: images is additive optional (omittable)", () => {
+ expect(_chatWithoutComputer.images).toBeUndefined();
+ });
+
+ it("ChatRequest: carries images (data URL) when set", () => {
+ expect(_chatWithImages.images).toHaveLength(1);
+ expect(_chatWithImages.images?.[0]?.url).toContain("base64");
+ expect(_chatWithImages.images?.[0]?.mimeType).toBe("image/png");
+ });
+
+ it("ChatRequest: carries images (http URL, mimeType optional)", () => {
+ expect(_chatWithHttpImage.images?.[0]?.url).toBe("https://example.com/diagram.png");
+ expect(_chatWithHttpImage.images?.[0]?.mimeType).toBeUndefined();
+ });
+
+ it("ModelsResponse: ModelMetadata carries optional vision flag", () => {
+ const resp: ModelsResponse = {
+ models: ["umans/kimi-k2.7", "umans/glm-5.2"],
+ modelInfo: {
+ "umans/kimi-k2.7": { contextWindow: 200000, vision: true },
+ "umans/glm-5.2": { contextWindow: 128000 },
+ },
+ };
+ expect(resp.modelInfo?.["umans/kimi-k2.7"]?.vision).toBe(true);
+ expect(resp.modelInfo?.["umans/glm-5.2"]?.vision).toBeUndefined();
+ });
+
// ─── Computers ───────────────────────────────────────────────────────────
it("ComputerListResponse: carries entries with usage counts", () => {
diff --git a/packages/transport-contract/src/index.ts b/packages/transport-contract/src/index.ts
index 400d9d5..d5f3000 100644
--- a/packages/transport-contract/src/index.ts
+++ b/packages/transport-contract/src/index.ts
@@ -26,6 +26,7 @@ import type {
ComputerEntry,
ConversationMeta,
ConversationStatus,
+ ImageInput,
QueuedMessage,
ReasoningEffort,
StoredChunk,
@@ -41,6 +42,8 @@ export type {
ComputerEntry,
ConversationMeta,
ConversationStatus,
+ ImageChunk,
+ ImageInput,
QueuedMessage,
ReasoningEffort,
StepMetrics,
@@ -68,6 +71,19 @@ export interface ChatRequest {
readonly message: string;
/**
+ * Images attached to this turn (e.g. a user-pasted screenshot). Each entry's
+ * `url` is a base64 data URL (`data:image/…;base64,…`) or an `http(s)://`
+ * URL. The server converts these to `image` chunks on the persisted user
+ * message. For a VISION-capable model (e.g. kimi), the images are passed
+ * through to the provider natively. For a NON-vision model (e.g. glm-5.2),
+ * the server's vision handoff transcribes each image to a text description
+ * (via a vision-capable model) and feeds that text instead — so a text-only
+ * model can still reason about the image's contents. Optional — omit for a
+ * text-only turn (backward compatible).
+ */
+ readonly images?: readonly ImageInput[];
+
+ /**
* The model to use, as a model name in `<credentialName>/<model>` form — one
* of the exact strings returned by `GET /models`. Omit to use the server's
* default credential + model.
@@ -124,6 +140,14 @@ export interface ModelsResponse {
/** Per-model metadata returned alongside the model catalog. */
export interface ModelMetadata {
readonly contextWindow?: number;
+ /**
+ * Whether this model can natively accept image input (vision/multimodal).
+ * When `true`, image chunks in a user message are passed through to the
+ * provider. When `false`/absent, the server's vision handoff transcribes
+ * images to text before the model sees them. A client may use this to show a
+ * vision badge in the model picker. Optional — absent when unknown.
+ */
+ readonly vision?: boolean;
}
/**
@@ -387,6 +411,23 @@ export interface SystemPromptVariablesResponse {
readonly variables: readonly SystemPromptVariable[];
}
+// ─── Vision settings (global) ──────────────────────────────────────────────────
+
+/**
+ * Response of `GET /settings/vision` — the global vision configuration shared
+ * across all conversations and vision models.
+ */
+export interface VisionSettingsResponse {
+ /**
+ * Image limit. The `PUT /settings/vision` handler only accepts a
+ * non-negative integer here. NOTE(review): presumably a cap on how many
+ * images are kept per conversation — confirm against the vision-handoff service.
+ */
+ readonly imageLimit: number;
+ /**
+ * Name of the model used for vision compaction, or `null`.
+ * NOTE(review): `null` presumably means "none configured" — confirm.
+ */
+ readonly compactionModel: string | null;
+}
+
+/** Body of `PUT /settings/vision` — a partial update. */
+export interface SetVisionSettingsRequest {
+ /** New image limit; must be a non-negative integer. Omit to leave unchanged. */
+ readonly imageLimit?: number;
+ /** New compaction model (a string or `null`). Omit to leave unchanged. */
+ readonly compactionModel?: string | null;
+}
+
// ─── Message queue (steering) ─────────────────────────────────────────────────
/**
diff --git a/packages/transport-http/src/app.ts b/packages/transport-http/src/app.ts
index 23f8dde..0fcc8f0 100644
--- a/packages/transport-http/src/app.ts
+++ b/packages/transport-http/src/app.ts
@@ -42,6 +42,7 @@ import type {
ThroughputResponse,
TitleResponse,
UpdateHeartbeatRequest,
+ VisionSettingsResponse,
WarmResponse,
WorkspaceListResponse,
WorkspaceResponse,
@@ -212,6 +213,37 @@ export function createApp(opts: CreateServerOptions): Hono {
app.get("/health", (c) => c.json({ ok: true }));
+ // ── Tmp image serving (vision handoff) ──────────────────────────────────────
+ // Serves the file at `<image dir>/<conversationId>/<imageId>`, where the image
+ // dir is `DISPATCH_IMAGE_DIR` (default `/tmp/dispatch/images`). Both route
+ // params come straight from the request URL, so each one must be a single
+ // plain path segment: a `..` segment (or a separator, should the router hand
+ // one through decoded) would otherwise resolve outside the image dir.
+ app.get("/images/:conversationId/:imageId", async (c) => {
+   const conversationId = c.req.param("conversationId");
+   const imageId = c.req.param("imageId");
+   // True when a route param is not one plain path segment.
+   const isUnsafeSegment = (segment: string): boolean =>
+     segment === "" ||
+     segment === "." ||
+     segment === ".." ||
+     segment.includes("/") ||
+     segment.includes("\\") ||
+     segment.includes("\0");
+   if (isUnsafeSegment(conversationId)) {
+     return c.json({ error: "Invalid conversation ID" }, 400);
+   }
+   // Image IDs additionally may not contain `..` anywhere (pre-existing rule).
+   if (isUnsafeSegment(imageId) || imageId.includes("..")) {
+     return c.json({ error: "Invalid image ID" }, 400);
+   }
+   const imageDir = process.env.DISPATCH_IMAGE_DIR ?? "/tmp/dispatch/images";
+   const { join } = await import("node:path");
+   const { readFile: fsReadFile } = await import("node:fs/promises");
+   const filePath = join(imageDir, conversationId, imageId);
+   // Content type is picked from the file-name suffix; anything unrecognized is
+   // served as opaque bytes.
+   const mimeBySuffix: readonly (readonly [string, string])[] = [
+     [".png", "image/png"],
+     [".jpg", "image/jpeg"],
+     [".jpeg", "image/jpeg"],
+     [".webp", "image/webp"],
+     [".gif", "image/gif"],
+     [".bmp", "image/bmp"],
+   ];
+   const lowerId = imageId.toLowerCase();
+   const mime =
+     mimeBySuffix.find(([suffix]) => lowerId.endsWith(suffix))?.[1] ?? "application/octet-stream";
+   try {
+     const buf = await fsReadFile(filePath);
+     return new Response(buf, { headers: { "Content-Type": mime, "Cache-Control": "no-cache" } });
+   } catch {
+     // Any read failure (missing file, directory, permissions) is reported as 404.
+     return c.json({ error: "Image not found" }, 404);
+   }
+ });
+
app.get("/conversations/:id/metrics", async (c) => {
const conversationId = c.req.param("id");
@@ -306,11 +338,14 @@ export function createApp(opts: CreateServerOptions): Hono {
app.get("/models", async (c) => {
try {
const models = await opts.credentialStore.listCatalog();
- const modelInfo: Record<string, { contextWindow?: number }> = {};
+ const modelInfo: Record<string, { contextWindow?: number; vision?: boolean }> = {};
for (const modelName of models) {
const info = await opts.credentialStore.getModelInfo(modelName);
- if (info?.contextWindow !== undefined) {
- modelInfo[modelName] = { contextWindow: info.contextWindow };
+ if (info?.contextWindow !== undefined || info?.vision === true) {
+ const entry: { contextWindow?: number; vision?: boolean } = {};
+ if (info?.contextWindow !== undefined) entry.contextWindow = info.contextWindow;
+ if (info?.vision === true) entry.vision = true;
+ modelInfo[modelName] = entry;
}
}
const body: ModelsResponse = {
@@ -410,8 +445,16 @@ export function createApp(opts: CreateServerOptions): Hono {
return c.json({ error: result.error }, 400);
}
- const { conversationId, message, model, cwd, computerId, reasoningEffort, workspaceId } =
- result;
+ const {
+ conversationId,
+ message,
+ model,
+ cwd,
+ computerId,
+ reasoningEffort,
+ workspaceId,
+ images,
+ } = result;
log.info("chat: request accepted", {
conversationId,
hasModel: model !== undefined,
@@ -419,6 +462,7 @@ export function createApp(opts: CreateServerOptions): Hono {
hasComputerId: computerId !== undefined,
hasReasoningEffort: reasoningEffort !== undefined,
hasWorkspaceId: workspaceId !== undefined,
+ imageCount: images?.length ?? 0,
});
const events: AgentEvent[] = [];
@@ -469,6 +513,7 @@ export function createApp(opts: CreateServerOptions): Hono {
...(computerId !== undefined ? { computerId } : {}),
...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
...(workspaceId !== undefined ? { workspaceId } : {}),
+ ...(images !== undefined ? { images } : {}),
};
opts.orchestrator
@@ -1671,6 +1716,43 @@ export function createApp(opts: CreateServerOptions): Hono {
return c.json(response, 200);
});
+ // GET /settings/vision — return the global vision settings held by the
+ // conversation store.
+ app.get("/settings/vision", async (c) => {
+   const settings = await opts.conversationStore.getVisionSettings();
+   const body: VisionSettingsResponse = settings;
+   return c.json(body, 200);
+ });
+
+ // PUT /settings/vision — partial update of the global vision settings.
+ // Every supplied field is validated BEFORE anything is persisted, so a request
+ // that is rejected with 400 never leaves a half-applied update behind.
+ // Responds with the settings as stored after the update.
+ app.put("/settings/vision", async (c) => {
+   let body: unknown;
+   try {
+     body = await c.req.json();
+   } catch {
+     return c.json({ error: "Invalid JSON body" }, 400);
+   }
+   // `null` is valid JSON but has no properties — reading a field off it would
+   // throw. Arrays and primitives are not a settings object either.
+   if (body === null || typeof body !== "object" || Array.isArray(body)) {
+     return c.json({ error: "Body must be a JSON object" }, 400);
+   }
+   const obj = body as { imageLimit?: unknown; compactionModel?: unknown };
+
+   let imageLimit: number | undefined;
+   if (obj.imageLimit !== undefined) {
+     if (
+       typeof obj.imageLimit !== "number" ||
+       !Number.isInteger(obj.imageLimit) ||
+       obj.imageLimit < 0
+     ) {
+       return c.json({ error: "imageLimit must be a non-negative integer" }, 400);
+     }
+     imageLimit = obj.imageLimit;
+   }
+
+   // `undefined` = field absent (leave unchanged); `null` is a real value to store.
+   let compactionModel: string | null | undefined;
+   if (obj.compactionModel !== undefined) {
+     if (obj.compactionModel !== null && typeof obj.compactionModel !== "string") {
+       return c.json({ error: "compactionModel must be a string or null" }, 400);
+     }
+     compactionModel = obj.compactionModel;
+   }
+
+   if (imageLimit !== undefined) {
+     await opts.conversationStore.setVisionImageLimit(imageLimit);
+     log.info("vision: image limit set", { imageLimit });
+   }
+   if (compactionModel !== undefined) {
+     await opts.conversationStore.setVisionCompactionModel(compactionModel);
+     log.info("vision: compaction model set", { compactionModel });
+   }
+   const settings = await opts.conversationStore.getVisionSettings();
+   const response: VisionSettingsResponse = settings;
+   return c.json(response, 200);
+ });
+
// ─── Static frontend serving (catch-all, API routes take precedence) ──────
if (opts.webDir !== undefined) {
const webDir = opts.webDir;
diff --git a/packages/transport-http/src/logic.test.ts b/packages/transport-http/src/logic.test.ts
index fc8302e..67632f3 100644
--- a/packages/transport-http/src/logic.test.ts
+++ b/packages/transport-http/src/logic.test.ts
@@ -182,6 +182,69 @@ describe("parseChatBody", () => {
expect(result.reasoningEffort).toBeUndefined();
}
});
+
+ // ── images ──────────────────────────────────────────────────────────────
+
+ it("parses images array with data URLs", () => {
+ const result = parseChatBody(
+ {
+ message: "what is this?",
+ images: [
+ { url: "data:image/png;base64,aaa" },
+ { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" },
+ ],
+ },
+ fakeId,
+ );
+ expect(isParseError(result)).toBe(false);
+ if (!isParseError(result)) {
+ expect(result.images).toHaveLength(2);
+ expect(result.images?.[0]?.url).toBe("data:image/png;base64,aaa");
+ expect(result.images?.[1]?.mimeType).toBe("image/jpeg");
+ }
+ });
+
+ it("parses images with http URLs", () => {
+ const result = parseChatBody(
+ { message: "hi", images: [{ url: "https://example.com/x.png" }] },
+ fakeId,
+ );
+ expect(isParseError(result)).toBe(false);
+ if (!isParseError(result)) {
+ expect(result.images?.[0]?.url).toBe("https://example.com/x.png");
+ }
+ });
+
+ it("returns error when images is not an array", () => {
+ const result = parseChatBody({ message: "hi", images: "not-an-array" }, fakeId);
+ expect(isParseError(result)).toBe(true);
+ });
+
+ it("returns error when an image lacks a url", () => {
+ const result = parseChatBody({ message: "hi", images: [{ mimeType: "image/png" }] }, fakeId);
+ expect(isParseError(result)).toBe(true);
+ });
+
+ it("returns error when an image url is empty", () => {
+ const result = parseChatBody({ message: "hi", images: [{ url: "" }] }, fakeId);
+ expect(isParseError(result)).toBe(true);
+ });
+
+ it("omits images when absent (backward compatible)", () => {
+ const result = parseChatBody({ message: "hi" }, fakeId);
+ expect(isParseError(result)).toBe(false);
+ if (!isParseError(result)) {
+ expect(result.images).toBeUndefined();
+ }
+ });
+
+ it("omits images when the array is empty", () => {
+ const result = parseChatBody({ message: "hi", images: [] }, fakeId);
+ expect(isParseError(result)).toBe(false);
+ if (!isParseError(result)) {
+ expect(result.images).toBeUndefined();
+ }
+ });
});
describe("parseSinceSeq", () => {
diff --git a/packages/transport-http/src/logic.ts b/packages/transport-http/src/logic.ts
index d5f2dea..c97f320 100644
--- a/packages/transport-http/src/logic.ts
+++ b/packages/transport-http/src/logic.ts
@@ -55,6 +55,13 @@ export interface ChatCommand {
readonly computerId?: string;
readonly reasoningEffort?: ReasoningEffort;
readonly workspaceId?: string;
+ /**
+ * Images attached to this turn (data URLs or http URLs). Parsed from the
+ * `ChatRequest.images` field; forwarded to the orchestrator which converts
+ * them to `image` chunks on the user message. Each entry must have a non-empty
+ * string `url`; `mimeType` is optional.
+ */
+ readonly images?: readonly { readonly url: string; readonly mimeType?: string }[];
}
export interface ParseError {
@@ -121,6 +128,33 @@ export function parseChatBody(body: unknown, generateId: () => string): ParseRes
(result as { workspaceId?: string }).workspaceId = obj.workspaceId;
}
+ if (obj.images !== undefined) {
+ if (!Array.isArray(obj.images)) {
+ return { error: "Field 'images' must be an array" };
+ }
+ const images: { url: string; mimeType?: string }[] = [];
+ for (const entry of obj.images) {
+ if (entry === null || typeof entry !== "object") {
+ return { error: "Each image must be an object with a 'url' string" };
+ }
+ const img = entry as { url?: unknown; mimeType?: unknown };
+ if (typeof img.url !== "string" || img.url.length === 0) {
+ return { error: "Each image must have a non-empty string 'url'" };
+ }
+ const parsed: { url: string; mimeType?: string } = { url: img.url };
+ if (img.mimeType !== undefined) {
+ if (typeof img.mimeType !== "string") {
+ return { error: "Field 'mimeType' on an image must be a string" };
+ }
+ parsed.mimeType = img.mimeType;
+ }
+ images.push(parsed);
+ }
+ if (images.length > 0) {
+ (result as { images?: readonly { url: string; mimeType?: string }[] }).images = images;
+ }
+ }
+
return result;
}
diff --git a/packages/transport-ws/src/extension.ts b/packages/transport-ws/src/extension.ts
index 3811ed7..d26712b 100644
--- a/packages/transport-ws/src/extension.ts
+++ b/packages/transport-ws/src/extension.ts
@@ -291,6 +291,7 @@ export function createTransportWsExtension(): Extension {
: {}),
...(result.workspaceId !== undefined ? { workspaceId: result.workspaceId } : {}),
...(result.computerId !== undefined ? { computerId: result.computerId } : {}),
+ ...(result.images !== undefined ? { images: result.images } : {}),
});
if (!startResult.started) {
send(ws, {
diff --git a/packages/transport-ws/src/router.ts b/packages/transport-ws/src/router.ts
index a33aa5a..0caf305 100644
--- a/packages/transport-ws/src/router.ts
+++ b/packages/transport-ws/src/router.ts
@@ -58,6 +58,12 @@ export interface ChatRouteResult {
* conversation → workspace → local chain).
*/
readonly computerId?: string;
+ /**
+ * Images attached to this turn (data URLs or http URLs), forwarded verbatim to
+ * the orchestrator. Absent when the client omits it. Each entry must have a
+ * non-empty string `url`; `mimeType` is optional.
+ */
+ readonly images?: readonly { readonly url: string; readonly mimeType?: string }[];
}
/** A malformed chat.send that should yield a chat.error reply. */
@@ -174,6 +180,36 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError
errorMessage: `chat.send: invalid reasoningEffort "${msg.reasoningEffort}" — must be one of: low, medium, high, xhigh, max`,
};
}
+ // Validate images (if present): each must be an object with a non-empty
+ // string `url` and, when given, a string `mimeType`. The payload is untrusted
+ // JSON, so the runtime type is checked even where the static type already
+ // says `string` — this matches what the HTTP transport's parseChatBody
+ // rejects. An empty array is treated the same as an absent field.
+ let images: readonly { url: string; mimeType?: string }[] | undefined;
+ if (msg.images !== undefined) {
+   if (!Array.isArray(msg.images)) {
+     return {
+       kind: "chat-error",
+       conversationId: msg.conversationId,
+       errorMessage: "chat.send: 'images' must be an array",
+     };
+   }
+   const parsed: { url: string; mimeType?: string }[] = [];
+   for (const entry of msg.images) {
+     if (
+       entry === null ||
+       typeof entry !== "object" ||
+       typeof entry.url !== "string" ||
+       entry.url.length === 0
+     ) {
+       return {
+         kind: "chat-error",
+         conversationId: msg.conversationId,
+         errorMessage: "chat.send: each image must have a non-empty string 'url'",
+       };
+     }
+     if (entry.mimeType !== undefined && typeof entry.mimeType !== "string") {
+       return {
+         kind: "chat-error",
+         conversationId: msg.conversationId,
+         errorMessage: "chat.send: 'mimeType' on an image must be a string",
+       };
+     }
+     const p: { url: string; mimeType?: string } = { url: entry.url };
+     if (entry.mimeType !== undefined) p.mimeType = entry.mimeType;
+     parsed.push(p);
+   }
+   if (parsed.length > 0) images = parsed;
+ }
return {
kind: "chat",
conversationId: msg.conversationId,
@@ -183,6 +219,7 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError
...(msg.reasoningEffort !== undefined ? { reasoningEffort: msg.reasoningEffort } : {}),
...(msg.workspaceId !== undefined ? { workspaceId: msg.workspaceId } : {}),
...(msg.computerId !== undefined ? { computerId: msg.computerId } : {}),
+ ...(images !== undefined ? { images } : {}),
};
}
diff --git a/packages/vision-handoff/package.json b/packages/vision-handoff/package.json
new file mode 100644
index 0000000..b11f7ee
--- /dev/null
+++ b/packages/vision-handoff/package.json
@@ -0,0 +1,14 @@
+{
+ "name": "@dispatch/vision-handoff",
+ "version": "0.0.0",
+ "type": "module",
+ "private": true,
+ "main": "dist/index.js",
+ "types": "dist/index.d.ts",
+ "dependencies": {
+ "@dispatch/conversation-store": "workspace:*",
+ "@dispatch/credential-store": "workspace:*",
+ "@dispatch/kernel": "workspace:*",
+ "@dispatch/openai-stream": "workspace:*"
+ }
+}
diff --git a/packages/vision-handoff/src/extension.ts b/packages/vision-handoff/src/extension.ts
new file mode 100644
index 0000000..08fddca
--- /dev/null
+++ b/packages/vision-handoff/src/extension.ts
@@ -0,0 +1,198 @@
+/**
+ * vision-handoff extension — registers the universal vision handoff service +
+ * the `consult_vision` tool.
+ *
+ * The service performs provider-agnostic vision handoff: when a non-vision model
+ * (e.g. glm-5.2) receives an image, it replaces the image with a numbered
+ * placeholder and registers it for tool access. The `consult_vision` tool opens
+ * a NEW conversation tab with a vision-capable model (e.g. Kimi), attaches the
+ * image + the model's specific question, and returns the conversation ID + the
+ * vision model's answer. Follow-ups go through the dispatch CLI.
+ *
+ * Images are saved to a tmp directory (`/tmp/dispatch/images/<convId>/`) so the
+ * conversation store (SQLite) only holds a compact URL reference — not
+ * megabytes of base64. Tmp files are purged on reboot (ephemeral dir), after
+ * compaction (the transcription replaces the image), and on conversation close.
+ *
+ * Effects (filesystem, orchestrator) live here in the shell, injected into the
+ * service. The pure decisions live in `pure.ts`. No `console.*`; logging via
+ * `host.logger`.
+ */
+
+import { mkdir, readFile, rm, unlink, writeFile } from "node:fs/promises";
+import { extname, isAbsolute, join, resolve as pathResolve } from "node:path";
+import { conversationStoreHandle } from "@dispatch/conversation-store";
+import type { CredentialStore } from "@dispatch/credential-store";
+import { credentialStoreHandle } from "@dispatch/credential-store";
+import type { Extension, HostAPI, Manifest } from "@dispatch/kernel";
+import {
+ createVisionHandoffService,
+ orchestratorLocalHandle,
+ visionHandoffHandle,
+} from "./service.js";
+import { createConsultVisionTool } from "./tool.js";
+
+/**
+ * Extension manifest. Declares one contributed service and the
+ * `consult_vision` tool; `activate` below is where a service and a tool are
+ * actually registered with the host.
+ */
+export const manifest: Manifest = {
+ id: "vision-handoff",
+ name: "Vision Handoff",
+ version: "0.0.0",
+ apiVersion: "^0.1.0",
+ trust: "bundled",
+ activation: "eager",
+ capabilities: { network: true },
+ contributes: { services: ["vision-handoff/service"], tools: ["consult_vision"] },
+};
+
+/**
+ * Root directory for tmp image files; each conversation gets a subdirectory
+ * named after its ID. Overridable through the `DISPATCH_IMAGE_DIR` environment
+ * variable. The transport-http `/images/...` route reads the same variable
+ * with the same default, so the two must stay in agreement.
+ */
+const IMAGE_DIR = process.env.DISPATCH_IMAGE_DIR ?? "/tmp/dispatch/images";
+
+/** MIME types for recognized image extensions. */
+const MIME_BY_EXT: Readonly<Record<string, string>> = {
+ ".png": "image/png",
+ ".jpg": "image/jpeg",
+ ".jpeg": "image/jpeg",
+ ".webp": "image/webp",
+ ".gif": "image/gif",
+ ".bmp": "image/bmp",
+};
+
+/** Reverse: MIME → extension. */
+const EXT_BY_MIME: Readonly<Record<string, string>> = {
+ "image/png": ".png",
+ "image/jpeg": ".jpg",
+ "image/webp": ".webp",
+ "image/gif": ".gif",
+ "image/bmp": ".bmp",
+};
+
+/**
+ * Load an image file and encode it as a base64 data URL.
+ *
+ * A relative `path` is resolved against `cwd` when one is given; otherwise
+ * (absolute path, or no `cwd`) it is resolved on its own. The media type comes
+ * from the file extension via `MIME_BY_EXT`, falling back to `image/png` for an
+ * unrecognized extension. Read errors (e.g. a missing file) are not caught
+ * here — the returned promise rejects. The shell edge — real `node:fs`.
+ */
+async function readFileAsDataUrl(path: string, cwd?: string): Promise<string> {
+  let target: string;
+  if (cwd === undefined || isAbsolute(path)) {
+    target = pathResolve(path);
+  } else {
+    target = pathResolve(cwd, path);
+  }
+  const bytes = await readFile(target);
+  const mediaType = MIME_BY_EXT[extname(target).toLowerCase()] ?? "image/png";
+  const payload = bytes.toString("base64");
+  return `data:${mediaType};base64,${payload}`;
+}
+
+/**
+ * Save a data URL image to a tmp file and return a compact HTTP path.
+ * The compact URL (`/images/<conversationId>/<uuid>.<ext>`) is what gets
+ * persisted in the conversation store — a tiny string, not megabytes of base64.
+ *
+ * The file extension is chosen from, in order: `mimeType` when it is a
+ * recognized image type; the media type declared in the data URL header
+ * (`data:<type>;base64,…`); `.png` as the last resort. Using the header matters
+ * because the extension is what later decides the Content-Type the file is
+ * served and re-encoded with.
+ *
+ * @param conversationId Conversation the image belongs to (names the subdirectory).
+ * @param dataUrl Base64 data URL; everything after the first comma is decoded.
+ * @param mimeType Optional explicit media type, preferred over the header's.
+ * @returns The compact `/images/...` path of the written file.
+ */
+async function saveImageToTmp(
+  conversationId: string,
+  dataUrl: string,
+  mimeType?: string,
+): Promise<string> {
+  // Split `data:<type>[;param…],<payload>` at the first comma.
+  const comma = dataUrl.indexOf(",");
+  const header = comma === -1 ? "" : dataUrl.slice(0, comma);
+  const base64 = comma === -1 ? "" : dataUrl.slice(comma + 1);
+  const declaredMime = header.startsWith("data:")
+    ? header.slice("data:".length).split(";")[0]
+    : undefined;
+  // Table lookup that ignores case/whitespace and never returns an inherited
+  // object property (e.g. for a media type of "constructor").
+  const extFor = (mime: string | undefined): string | undefined => {
+    if (mime === undefined) return undefined;
+    const known: unknown = EXT_BY_MIME[mime.trim().toLowerCase()];
+    return typeof known === "string" ? known : undefined;
+  };
+  const ext = extFor(mimeType) ?? extFor(declaredMime) ?? ".png";
+  const imageId = `${crypto.randomUUID()}${ext}`;
+  const dir = join(IMAGE_DIR, conversationId);
+  await mkdir(dir, { recursive: true });
+  const filePath = join(dir, imageId);
+  await writeFile(filePath, Buffer.from(base64, "base64"));
+  return `/images/${conversationId}/${imageId}`;
+}
+
+/**
+ * Resolve a compact URL (`/images/<convId>/<imageId>`) back to a data URL by
+ * reading the tmp file. Data URLs and HTTP URLs pass through unchanged, as does
+ * anything that is not a well-formed compact URL.
+ *
+ * "Well-formed" means exactly two plain path segments after `/images/`. A URL
+ * with `.`/`..`, an empty segment, a backslash, or extra segments is returned
+ * untouched instead of being joined onto `IMAGE_DIR`, so a crafted URL cannot
+ * make this read a file outside the image directory.
+ *
+ * Rejects when the referenced tmp file cannot be read.
+ */
+async function resolveImageUrl(url: string): Promise<string> {
+  if (url.startsWith("data:") || url.startsWith("http")) return url;
+  if (!url.startsWith("/images/")) return url;
+  const parts = url.split("/"); // ["", "images", convId, imageId]
+  const convId = parts[2];
+  const imageId = parts[3];
+  if (convId === undefined || imageId === undefined) return url;
+  const isPlainSegment = (segment: string): boolean =>
+    segment !== "" && segment !== "." && segment !== ".." && !segment.includes("\\");
+  if (parts.length !== 4 || !isPlainSegment(convId) || !isPlainSegment(imageId)) return url;
+  const filePath = join(IMAGE_DIR, convId, imageId);
+  const buf = await readFile(filePath);
+  const ext = extname(imageId).toLowerCase();
+  const mime = MIME_BY_EXT[ext] ?? "image/png";
+  return `data:${mime};base64,${buf.toString("base64")}`;
+}
+
+/**
+ * Delete a single tmp image file (after compaction — best-effort).
+ *
+ * Only a well-formed compact URL (`/images/<convId>/<imageId>`, two plain
+ * segments) is acted on. Anything else — including URLs with `.`/`..`, an
+ * empty segment, a backslash, or extra segments — is ignored, so a crafted URL
+ * cannot unlink a file outside the image directory. Never rejects.
+ */
+async function deleteTmpImage(compactUrl: string): Promise<void> {
+  if (!compactUrl.startsWith("/images/")) return;
+  const parts = compactUrl.split("/"); // ["", "images", convId, imageId]
+  const convId = parts[2];
+  const imageId = parts[3];
+  if (convId === undefined || imageId === undefined) return;
+  const isPlainSegment = (segment: string): boolean =>
+    segment !== "" && segment !== "." && segment !== ".." && !segment.includes("\\");
+  if (parts.length !== 4 || !isPlainSegment(convId) || !isPlainSegment(imageId)) return;
+  const filePath = join(IMAGE_DIR, convId, imageId);
+  try {
+    await unlink(filePath);
+  } catch {
+    // Best-effort — file may already be deleted.
+  }
+}
+
+/**
+ * Delete all tmp images for a conversation (on close — best-effort).
+ *
+ * The removal is recursive and forced, so the target must be a real
+ * subdirectory of `IMAGE_DIR`: an empty ID would resolve to `IMAGE_DIR` itself
+ * (wiping every conversation's images) and a `..` segment would climb out of
+ * it. Such IDs are ignored. Never rejects.
+ */
+async function deleteConversationImages(conversationId: string): Promise<void> {
+  const segments = conversationId.split(/[\\/]/);
+  if (segments.some((segment) => segment === "" || segment === "." || segment === "..")) {
+    return;
+  }
+  const dir = join(IMAGE_DIR, conversationId);
+  try {
+    await rm(dir, { recursive: true, force: true });
+  } catch {
+    // Best-effort.
+  }
+}
+
+/**
+ * Extension entry point: build the vision handoff service from its effectful
+ * dependencies, publish it under `visionHandoffHandle`, and define the
+ * `consult_vision` tool on top of it.
+ *
+ * When the credential-store service is not available the extension logs a
+ * warning and registers nothing. The conversation store and the orchestrator
+ * are looked up on each call rather than captured here.
+ */
+export async function activate(host: HostAPI): Promise<void> {
+  const credentialStore = host.getService(credentialStoreHandle) as CredentialStore | undefined;
+  if (credentialStore === undefined) {
+    host.logger.warn(
+      "vision-handoff: credential-store service not available. The consult_vision tool and image handoff are disabled.",
+    );
+    return;
+  }
+
+  // Model name → provider instance + provider-side model id, or undefined when
+  // either the credential or its provider cannot be found.
+  const resolveModel = (modelName: string) => {
+    const credential = credentialStore.resolve(modelName);
+    if (credential === undefined) return undefined;
+    const provider = host.getProviders().get(credential.providerId);
+    return provider === undefined ? undefined : { provider, model: credential.model };
+  };
+
+  // The orchestrator is only asked for when its extension is loaded; a failing
+  // lookup is treated the same as "not loaded".
+  const resolveOrchestrator = () => {
+    const orchestratorLoaded = host
+      .getExtensions()
+      .some((loadedManifest) => loadedManifest.id === "session-orchestrator");
+    if (!orchestratorLoaded) return undefined;
+    try {
+      return host.getService(orchestratorLocalHandle);
+    } catch {
+      return undefined;
+    }
+  };
+
+  // Fetched per call, never cached.
+  const conversationStore = () => host.getService(conversationStoreHandle);
+
+  const service = createVisionHandoffService({
+    credentialStore,
+    resolveModel,
+    readFileAsDataUrl,
+    saveImageToTmp,
+    resolveImageUrl,
+    deleteTmpImage,
+    deleteConversationImages,
+    resolveOrchestrator,
+    getImageTranscriptions: async (conversationId: string) =>
+      conversationStore().getImageTranscriptions(conversationId),
+    setImageTranscription: async (conversationId: string, url: string, text: string) => {
+      await conversationStore().setImageTranscription(conversationId, url, text);
+    },
+    setConversationTitle: async (conversationId: string, title: string) => {
+      await conversationStore().setConversationTitle(conversationId, title);
+    },
+    logger: host.logger.child({ extensionId: "vision-handoff" }),
+  });
+
+  host.provideService(visionHandoffHandle, service);
+  host.defineTool(createConsultVisionTool(service));
+  host.logger.info("vision-handoff: registered (consult_vision tool + handoff service)");
+}
+
+export const extension: Extension = { manifest, activate };
diff --git a/packages/vision-handoff/src/index.ts b/packages/vision-handoff/src/index.ts
new file mode 100644
index 0000000..2713346
--- /dev/null
+++ b/packages/vision-handoff/src/index.ts
@@ -0,0 +1,21 @@
+export { extension, manifest } from "./extension.js";
+export {
+ collectTextFromStream,
+ findVisionModelName,
+ formatConsultResult,
+ formatImagePlaceholder,
+ formatNoVisionPlaceholder,
+ isVisionCapable,
+} from "./pure.js";
+export type {
+ OrchestratorForVision,
+ ResolvedVisionModel,
+ VisionHandoffDeps,
+ VisionHandoffService,
+} from "./service.js";
+export {
+ createVisionHandoffService,
+ orchestratorLocalHandle,
+ visionHandoffHandle,
+} from "./service.js";
+export { createConsultVisionTool } from "./tool.js";
diff --git a/packages/vision-handoff/src/pure.test.ts b/packages/vision-handoff/src/pure.test.ts
new file mode 100644
index 0000000..21b1224
--- /dev/null
+++ b/packages/vision-handoff/src/pure.test.ts
@@ -0,0 +1,180 @@
+import type { ModelInfo, ProviderEvent } from "@dispatch/kernel";
+import { describe, expect, it } from "vitest";
+import {
+ collectTextFromStream,
+ findVisionModelName,
+ formatConsultationTitle,
+ formatConsultResult,
+ formatImagePlaceholder,
+ formatNoVisionPlaceholder,
+ isVisionCapable,
+} from "./pure.js";
+
+describe("isVisionCapable", () => {
+  // One heuristic-matching model reused by the explicit-flag cases.
+  const KIMI_NAME = "umans/umans-kimi-k2.7";
+  const KIMI_ID = "umans-kimi-k2.7";
+
+  it("returns true when ModelInfo.vision is true", () => {
+    const capable = isVisionCapable(KIMI_NAME, { id: KIMI_ID, vision: true });
+    expect(capable).toBe(true);
+  });
+
+  it("returns false when ModelInfo.vision is false (overrides name heuristic)", () => {
+    const capable = isVisionCapable(KIMI_NAME, { id: KIMI_ID, vision: false });
+    expect(capable).toBe(false);
+  });
+
+  it("falls back to name heuristic when vision is absent (umans kimi + qwen)", () => {
+    const kimi = isVisionCapable(KIMI_NAME, undefined);
+    expect(kimi).toBe(true);
+    const qwen = isVisionCapable("umans/umans-qwen3.6-35b-a3b", undefined);
+    expect(qwen).toBe(true);
+  });
+
+  it("falls back to name heuristic when vision is absent (non-vision)", () => {
+    const glm = isVisionCapable("umans/umans-glm-5.2", undefined);
+    expect(glm).toBe(false);
+    // ModelInfo is present here but carries no `vision` flag.
+    const coder = isVisionCapable("umans/umans-coder", { id: "umans-coder" });
+    expect(coder).toBe(false);
+  });
+
+  it("returns false for undefined model name", () => {
+    const capable = isVisionCapable(undefined, undefined);
+    expect(capable).toBe(false);
+  });
+});
+
+describe("findVisionModelName", () => {
+  // Fixture: catalog name → the ModelInfo the fake lookup reports for it.
+  const knownModels = new Map<string, ModelInfo>([
+    ["umans/umans-kimi-k2.7", { id: "umans-kimi-k2.7", vision: true }],
+    ["umans/umans-qwen3.6-35b-a3b", { id: "umans-qwen3.6-35b-a3b", vision: true }],
+    ["umans/umans-glm-5.2", { id: "umans-glm-5.2" }],
+    ["umans/llama-vision", { id: "llama-vision", vision: true }],
+  ]);
+  const getInfo = async (name: string): Promise<ModelInfo | undefined> => knownModels.get(name);
+
+  it("finds the first umans kimi model via name heuristic", async () => {
+    const catalog = ["umans/umans-glm-5.2", "umans/umans-kimi-k2.7", "umans/llama-vision"];
+    const found = await findVisionModelName(catalog, getInfo);
+    expect(found).toBe("umans/umans-kimi-k2.7");
+  });
+
+  it("finds a vision model via ModelInfo.vision when name heuristic misses", async () => {
+    const catalog = ["umans/umans-glm-5.2", "umans/llama-vision"];
+    const found = await findVisionModelName(catalog, getInfo);
+    expect(found).toBe("umans/llama-vision");
+  });
+
+  it("skips the excluded model and finds the next vision model", async () => {
+    const catalog = ["umans/umans-kimi-k2.7", "umans/umans-qwen3.6-35b-a3b"];
+    const found = await findVisionModelName(catalog, getInfo, "umans/umans-kimi-k2.7");
+    expect(found).toBe("umans/umans-qwen3.6-35b-a3b");
+  });
+
+  it("returns undefined when no vision model is available", async () => {
+    const found = await findVisionModelName(["umans/umans-glm-5.2"], getInfo);
+    expect(found).toBeUndefined();
+  });
+
+  it("returns undefined for empty catalog", async () => {
+    const found = await findVisionModelName([], getInfo);
+    expect(found).toBeUndefined();
+  });
+});
+
+describe("collectTextFromStream", () => {
+  // Replays a fixed event list as an async stream.
+  async function* replay(events: readonly ProviderEvent[]): AsyncIterable<ProviderEvent> {
+    for (const event of events) {
+      yield event;
+    }
+  }
+
+  // Runs the function under test over a replayed event list.
+  const collect = (events: readonly ProviderEvent[]): Promise<string> =>
+    collectTextFromStream(replay(events));
+
+  it("collects text-delta events into a single string", async () => {
+    const text = await collect([
+      { type: "text-delta", delta: "Hello " },
+      { type: "text-delta", delta: "world!" },
+    ]);
+    expect(text).toBe("Hello world!");
+  });
+
+  it("ignores non-text events", async () => {
+    const text = await collect([
+      { type: "reasoning-delta", delta: "thinking..." },
+      { type: "text-delta", delta: "answer" },
+      { type: "usage", usage: { inputTokens: 5, outputTokens: 1 } },
+      { type: "finish", reason: "stop" },
+    ]);
+    expect(text).toBe("answer");
+  });
+
+  it("throws on an error event", async () => {
+    const pending = collect([
+      { type: "text-delta", delta: "partial" },
+      { type: "error", message: "boom" },
+    ]);
+    await expect(pending).rejects.toThrow("boom");
+  });
+
+  it("returns empty string for an empty stream", async () => {
+    const text = await collect([]);
+    expect(text).toBe("");
+  });
+});
+
+describe("formatImagePlaceholder", () => {
+  it("includes the image ID and mentions consult_vision", () => {
+    const placeholder = formatImagePlaceholder(1);
+    for (const fragment of ["Image 1", "consult_vision", "imageIds=[1]"]) {
+      expect(placeholder).toContain(fragment);
+    }
+  });
+
+  it("increments the ID for each image", () => {
+    const label = formatImagePlaceholder(2);
+    expect(label).toContain("Image 2");
+    const toolHint = formatImagePlaceholder(2);
+    expect(toolHint).toContain("imageIds=[2]");
+  });
+});
+
+describe("formatNoVisionPlaceholder", () => {
+  it("explains the limitation", () => {
+    const placeholder = formatNoVisionPlaceholder();
+    expect(placeholder).toContain("no vision-capable model");
+  });
+});
+
+describe("formatConsultResult", () => {
+  it("includes the conversation ID, the response, and the dispatch CLI hint", () => {
+    const conversationId = "abc-123";
+    const answer = "The error is on line 12.";
+    const formatted = formatConsultResult(conversationId, answer);
+    expect(formatted).toContain(conversationId);
+    expect(formatted).toContain(answer);
+    expect(formatted).toContain("dispatch CLI");
+  });
+
+  it("trims the response", () => {
+    const formatted = formatConsultResult("c1", " spaced ");
+    expect(formatted).toContain("spaced");
+    // The trailing space of the raw response must not survive.
+    expect(formatted).not.toContain("spaced ");
+  });
+});
+
+describe("formatConsultationTitle", () => {
+  const PREFIX = "IMAGE - ";
+  const xs = (count: number): string => "x".repeat(count);
+
+  it("prefixes the question with 'IMAGE - '", () => {
+    const title = formatConsultationTitle("What error is shown?");
+    expect(title).toBe("IMAGE - What error is shown?");
+  });
+
+  it("truncates long questions to 80 chars with an ellipsis (matching the store's TITLE_MAX)", () => {
+    const title = formatConsultationTitle(xs(100));
+    expect(title).toBe(`IMAGE - ${xs(80)}…`);
+    // prefix + 80 kept characters + the one-character ellipsis
+    expect(title.length).toBe(PREFIX.length + 80 + 1);
+  });
+
+  it("does not truncate questions at or under 80 chars", () => {
+    const atLimit = formatConsultationTitle(xs(80));
+    expect(atLimit).toBe(`IMAGE - ${xs(80)}`);
+    const underLimit = formatConsultationTitle(xs(79));
+    expect(underLimit).toBe(`IMAGE - ${xs(79)}`);
+  });
+
+  it("handles an empty question", () => {
+    const title = formatConsultationTitle("");
+    expect(title).toBe("IMAGE - ");
+  });
+});
diff --git a/packages/vision-handoff/src/pure.ts b/packages/vision-handoff/src/pure.ts
new file mode 100644
index 0000000..af3476f
--- /dev/null
+++ b/packages/vision-handoff/src/pure.ts
@@ -0,0 +1,156 @@
+/**
+ * Pure decision helpers for the vision handoff.
+ *
+ * No I/O, no ambient state. The shell (the extension + the service) injects the
+ * effects (credential store lookups, orchestrator, provider streaming). This
+ * module owns only the policy: which model is vision-capable, how to format
+ * image placeholders for non-vision models, and how to format the
+ * consultation tool's result.
+ */
+
+import type { ModelInfo, ProviderEvent } from "@dispatch/kernel";
+import { isVisionModelId } from "@dispatch/openai-stream";
+
+/**
+ * Decide whether a model can accept image input.
+ *
+ * An explicit `ModelInfo.vision` flag — `true` or `false` — always wins. Only
+ * when the flag is absent does the decision fall back to the name heuristic
+ * ({@link isVisionModelId}), which is applied to the part of `modelName` after
+ * the first `/` (or to the whole name when it contains no `/`).
+ *
+ * Pure.
+ *
+ * @param modelName Catalog name in `<credentialName>/<model>` form, or `undefined`.
+ * @param info Resolved `ModelInfo` for the model, when one is available.
+ * @returns `info.vision` when it is defined; `false` when the flag is absent and
+ *   `modelName` is `undefined`; otherwise the heuristic's verdict.
+ */
+export function isVisionCapable(
+  modelName: string | undefined,
+  info: ModelInfo | undefined,
+): boolean {
+  const reported = info?.vision;
+  if (reported !== undefined) {
+    // The capability was stated outright; the name guess is never consulted.
+    return reported;
+  }
+  if (modelName === undefined) {
+    return false;
+  }
+  const separatorAt = modelName.indexOf("/");
+  const modelSegment = separatorAt < 0 ? modelName : modelName.slice(separatorAt + 1);
+  return isVisionModelId(modelSegment);
+}
+
+/**
+ * Find the first vision-capable model name in a catalog.
+ *
+ * Entries are visited in catalog order and each one is judged by
+ * {@link isVisionCapable}, so this function and `isVisionCapable` always agree
+ * about a model: an explicit `ModelInfo.vision` (including an explicit `false`)
+ * is authoritative, and the name heuristic only decides when the flag is
+ * absent. Previously a heuristic match returned before `ModelInfo` was
+ * consulted, so a model explicitly reported as non-vision could still be
+ * selected.
+ *
+ * Lookups are awaited one entry at a time and the scan stops at the first
+ * match, so entries after the match are never looked up.
+ *
+ * @param catalog The full list of model names (`<credentialName>/<model>`).
+ * @param getInfo Async lookup of a model name → ModelInfo (from the credential store).
+ * @param exclude Optional model name to skip (e.g. the current non-vision model).
+ * @returns The first vision-capable name, or `undefined` when none qualifies.
+ */
+export async function findVisionModelName(
+  catalog: readonly string[],
+  getInfo: (modelName: string) => Promise<ModelInfo | undefined>,
+  exclude?: string,
+): Promise<string | undefined> {
+  for (const name of catalog) {
+    // `name` is always a string, so this never matches when `exclude` is omitted.
+    if (name === exclude) continue;
+    const info = await getInfo(name);
+    if (isVisionCapable(name, info)) return name;
+  }
+  return undefined;
+}
+
+/**
+ * Drain a provider event stream and return the concatenated text.
+ *
+ * Only `text-delta` events contribute; their `delta` strings are appended in
+ * arrival order. Every other event type is ignored, except `error`: the first
+ * `error` event aborts the fold by throwing an `Error` carrying the event's
+ * `message`, and any text gathered so far is not returned.
+ *
+ * @param stream The provider's event stream.
+ * @returns The concatenated `text-delta` payloads (`""` for an empty stream).
+ * @throws Error when the stream yields an `error` event.
+ */
+export async function collectTextFromStream(stream: AsyncIterable<ProviderEvent>): Promise<string> {
+  let collected = "";
+  for await (const event of stream) {
+    switch (event.type) {
+      case "text-delta":
+        collected += event.delta;
+        break;
+      case "error":
+        throw new Error(event.message);
+      default:
+        // Reasoning, usage, finish, … carry no text.
+        break;
+    }
+  }
+  return collected;
+}
+
+/**
+ * Build the text that stands in for an `image` chunk when the active model
+ * cannot see images. The text announces the attachment by number and tells the
+ * model to call `consult_vision` with that number and a specific question, so
+ * the model drives the analysis instead of receiving a generic description.
+ *
+ * Pure.
+ *
+ * @param imageId The 1-based ID assigned to this image (used by the tool to
+ *   look up the registered image data).
+ * @returns The bracketed placeholder sentence.
+ */
+export function formatImagePlaceholder(imageId: number): string {
+  const id = String(imageId);
+  const fragments = [
+    `[Image ${id} attached — you cannot view images. Call the`,
+    `consult_vision tool with imageIds=[${id}] and a specific question`,
+    "to analyze it via a vision-capable model.]",
+  ];
+  return fragments.join(" ");
+}
+
+/**
+ * Build the placeholder for the degraded path, where NO vision-capable model
+ * is available and the consultation tool therefore cannot function. The text
+ * states the limitation and how to lift it; it does not mention the tool.
+ *
+ * Pure.
+ *
+ * @returns The bracketed placeholder sentence.
+ */
+export function formatNoVisionPlaceholder(): string {
+  const limitation = "[Image attached — no vision-capable model is available to analyze it.";
+  const remedy =
+    "Install or configure a vision-capable model (e.g. kimi) to enable image analysis.]";
+  return `${limitation} ${remedy}`;
+}
+
+/**
+ * Maximum length, in UTF-16 code units, of the consultation title body
+ * (matching the conversation store's `TITLE_MAX`). The question is truncated to
+ * this before the `"IMAGE - "` prefix is applied so the consultation tab's
+ * title stays in line with the store's own title-derivation limit.
+ */
+const CONSULTATION_TITLE_MAX = 80;
+
+/**
+ * Format the title for a vision consultation conversation tab: `"IMAGE - "`
+ * followed by the question, so the tab is visually distinguishable from normal
+ * conversation tabs.
+ *
+ * A question longer than `CONSULTATION_TITLE_MAX` code units is cut to that
+ * length and suffixed with `…`. If the cut would land between the two halves of
+ * a surrogate pair (e.g. an emoji straddling the limit), the dangling high
+ * surrogate is dropped as well, so the title never contains a lone surrogate.
+ *
+ * Pure.
+ *
+ * @param question The question the model asked the vision model.
+ * @returns The prefixed, possibly truncated, title.
+ */
+export function formatConsultationTitle(question: string): string {
+  if (question.length <= CONSULTATION_TITLE_MAX) {
+    return `IMAGE - ${question}`;
+  }
+  let cut = CONSULTATION_TITLE_MAX;
+  // The unit just before the cut exists because length > CONSULTATION_TITLE_MAX.
+  const lastKept = question.charCodeAt(cut - 1);
+  const isHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
+  if (isHighSurrogate) {
+    // Its low-surrogate partner would be cut off; drop the orphaned half too.
+    cut -= 1;
+  }
+  return `IMAGE - ${question.slice(0, cut)}…`;
+}
+
+/**
+ * Build the string returned by the `consult_vision` tool. It consists of three
+ * paragraphs separated by blank lines: a notice naming the consultation's
+ * conversation ID, the vision model's response (whitespace-trimmed), and a
+ * hint that follow-up questions go through the dispatch CLI using that same
+ * conversation ID.
+ *
+ * Pure.
+ *
+ * @param conversationId The new vision consultation conversation ID.
+ * @param response The vision model's answer to the model's question.
+ * @returns The formatted tool result.
+ */
+export function formatConsultResult(conversationId: string, response: string): string {
+  const paragraphs = [
+    `Vision consultation opened in conversation ${conversationId}.`,
+    `Response: ${response.trim()}`,
+    `To ask follow-up questions about this image, use the dispatch CLI (conversation: ${conversationId}).`,
+  ];
+  return paragraphs.join("\n\n");
+}
diff --git a/packages/vision-handoff/src/service.test.ts b/packages/vision-handoff/src/service.test.ts
new file mode 100644
index 0000000..8c4117e
--- /dev/null
+++ b/packages/vision-handoff/src/service.test.ts
@@ -0,0 +1,375 @@
+import type {
+ AgentEvent,
+ ChatMessage,
+ ModelInfo,
+ ProviderContract,
+ ProviderEvent,
+ ToolContract,
+} from "@dispatch/kernel";
+import { describe, expect, it, vi } from "vitest";
+import { createVisionHandoffService, type VisionHandoffDeps } from "./service.js";
+
+// ── Test doubles (outermost-edge fakes — NOT @dispatch/* mocks) ──────────────
+
+/**
+ * Build a fake provider whose `stream` answers with `describe(url)`, where
+ * `url` is the URL of the first image chunk found in the messages (or `""`
+ * when there is none). The description is computed when `stream` is called;
+ * the returned stream then yields it as one `text-delta` followed by a
+ * `finish` event.
+ */
+function makeVisionProvider(
+  describe: (imageUrl: string) => string,
+  id = "umans",
+): ProviderContract {
+  // URL of the first image chunk across all messages, in message order.
+  const firstImageUrl = (messages: readonly ChatMessage[]): string => {
+    for (const message of messages) {
+      for (const chunk of message.chunks) {
+        if (chunk.type === "image") return chunk.url;
+      }
+    }
+    return "";
+  };
+
+  const stream = vi.fn(
+    (
+      messages: readonly ChatMessage[],
+      _tools: readonly ToolContract[],
+    ): AsyncIterable<ProviderEvent> => {
+      const description = describe(firstImageUrl(messages));
+      async function* events(): AsyncIterable<ProviderEvent> {
+        yield { type: "text-delta", delta: description };
+        yield { type: "finish", reason: "stop" };
+      }
+      return events();
+    },
+  );
+
+  return { id, stream };
+}
+
+/**
+ * Build a `VisionHandoffDeps` fixture backed by `vi.fn` fakes. The catalog
+ * holds two models served by one fake provider: a kimi model flagged
+ * `vision: true` and a glm model with no vision flag. Any field can be
+ * replaced through `overrides`, which are applied last.
+ */
+function makeDeps(overrides: Partial<VisionHandoffDeps> = {}): VisionHandoffDeps {
+  const KIMI = "umans/umans-kimi-k2.7";
+  const GLM = "umans/umans-glm-5.2";
+
+  const visionProvider = makeVisionProvider((url) => `DESCRIPTION of ${url}`);
+  const catalog = [KIMI, GLM];
+  const modelInfo = new Map<string, ModelInfo>([
+    [KIMI, { id: "umans-kimi-k2.7", vision: true }],
+    [GLM, { id: "umans-glm-5.2" }],
+  ]);
+  const resolutions = new Map<string, { providerId: string; model: string }>([
+    [KIMI, { providerId: "umans", model: "umans-kimi-k2.7" }],
+    [GLM, { providerId: "umans", model: "umans-glm-5.2" }],
+  ]);
+
+  return {
+    credentialStore: {
+      listCatalog: vi.fn(async () => catalog),
+      getModelInfo: vi.fn(async (name: string) => modelInfo.get(name)),
+      resolve: vi.fn((name: string) => resolutions.get(name)),
+    },
+    // Both catalog models resolve to the same fake provider; anything else is unknown.
+    resolveModel: vi.fn((name: string) =>
+      resolutions.has(name) ? { provider: visionProvider, model: name.split("/")[1] } : undefined,
+    ),
+    readFileAsDataUrl: vi.fn(async (path: string) => `data:image/png;base64,FILE(${path})`),
+    setConversationTitle: vi.fn(async (_conversationId: string, _title: string) => {}),
+    ...overrides,
+  };
+}
+
+describe("VisionHandoffService.isVisionCapable", () => {
+  it("returns true for kimi (via ModelInfo)", async () => {
+    const service = createVisionHandoffService(makeDeps());
+    const capable = await service.isVisionCapable("umans/umans-kimi-k2.7");
+    expect(capable).toBe(true);
+  });
+
+  it("returns false for glm-5.2", async () => {
+    const service = createVisionHandoffService(makeDeps());
+    const capable = await service.isVisionCapable("umans/umans-glm-5.2");
+    expect(capable).toBe(false);
+  });
+
+  it("returns false for undefined model name", async () => {
+    const service = createVisionHandoffService(makeDeps());
+    const capable = await service.isVisionCapable(undefined);
+    expect(capable).toBe(false);
+  });
+});
+
+describe("VisionHandoffService.resolveVisionModel", () => {
+  it("resolves the kimi model from the catalog", async () => {
+    const service = createVisionHandoffService(makeDeps());
+    const resolved = await service.resolveVisionModel();
+    expect(resolved?.modelName).toBe("umans/umans-kimi-k2.7");
+    expect(resolved?.model).toBe("umans-kimi-k2.7");
+  });
+
+  it("excludes the given model", async () => {
+    const service = createVisionHandoffService(makeDeps());
+    // kimi is the fixture's only vision-flagged model, so excluding it leaves none.
+    const resolved = await service.resolveVisionModel("umans/umans-kimi-k2.7");
+    expect(resolved).toBeUndefined();
+  });
+});
+
+describe("VisionHandoffService.prepareForProvider", () => {
+  const VISION_MODEL = "umans/umans-kimi-k2.7";
+  const TEXT_ONLY_MODEL = "umans/umans-glm-5.2";
+
+  // A user message carrying a single image chunk.
+  const userImage = (url: string): ChatMessage => ({
+    role: "user",
+    chunks: [{ type: "image", url }],
+  });
+
+  // Read the `text` of a chunk the test expects to be a text chunk.
+  const textOf = (chunk: unknown): string => (chunk as { text: string }).text;
+
+  it("passes messages through unchanged when the model is vision-capable", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "What's this?" },
+          { type: "image", url: "data:image/png;base64,abc" },
+        ],
+      },
+    ];
+    const prepared = await svc.prepareForProvider(messages, VISION_MODEL);
+    // Same reference — no copy, no change.
+    expect(prepared).toBe(messages);
+  });
+
+  it("passes messages through unchanged when there are no images", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    const messages: ChatMessage[] = [{ role: "user", chunks: [{ type: "text", text: "hi" }] }];
+    const prepared = await svc.prepareForProvider(messages, TEXT_ONLY_MODEL);
+    expect(prepared).toBe(messages);
+  });
+
+  it("replaces image chunks with numbered placeholders for a non-vision model", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "Describe this" },
+          { type: "image", url: "data:image/png;base64,img1" },
+        ],
+      },
+    ];
+    const prepared = await svc.prepareForProvider(messages, TEXT_ONLY_MODEL, {
+      conversationId: "conv-1",
+    });
+    expect(prepared).toHaveLength(1);
+    const chunks = prepared[0]?.chunks;
+    expect(chunks).toHaveLength(2);
+    // The text chunk is untouched.
+    expect(chunks?.[0]).toEqual({ type: "text", text: "Describe this" });
+    // The image chunk became placeholder text.
+    expect(chunks?.[1]?.type).toBe("text");
+    const placeholder = textOf(chunks?.[1]);
+    expect(placeholder).toContain("Image 1");
+    expect(placeholder).toContain("consult_vision");
+  });
+
+  it("assigns sequential image IDs across multiple messages", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    const messages: ChatMessage[] = [
+      userImage("data:image/png;base64,a"),
+      { role: "assistant", chunks: [{ type: "text", text: "ok" }] },
+      userImage("data:image/png;base64,b"),
+    ];
+    const prepared = await svc.prepareForProvider(messages, TEXT_ONLY_MODEL, {
+      conversationId: "conv-1",
+    });
+    // First image → Image 1.
+    expect(textOf(prepared[0]?.chunks[0])).toContain("Image 1");
+    // The assistant message in between is unchanged.
+    expect(prepared[1]?.chunks[0]?.type).toBe("text");
+    // Second image → Image 2.
+    expect(textOf(prepared[2]?.chunks[0])).toContain("Image 2");
+  });
+
+  it("registers images so getRegisteredImage can look them up", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    const messages: ChatMessage[] = [userImage("data:image/png;base64,registered")];
+    await svc.prepareForProvider(messages, TEXT_ONLY_MODEL, { conversationId: "conv-42" });
+    const registered = svc.getRegisteredImage("conv-42", 1);
+    expect(registered?.url).toBe("data:image/png;base64,registered");
+  });
+
+  it("uses no-vision placeholder when no vision model is available", async () => {
+    const deps = makeDeps();
+    // An empty catalog leaves nothing for the service to hand the image to.
+    (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [userImage("data:image/png;base64,abc")];
+    const prepared = await svc.prepareForProvider(messages, TEXT_ONLY_MODEL, {
+      conversationId: "conv-1",
+    });
+    const placeholder = textOf(prepared[0]?.chunks[0]);
+    expect(placeholder).toContain("no vision-capable model");
+    expect(placeholder).not.toContain("consult_vision");
+  });
+});
+
+describe("VisionHandoffService.consultVision", () => {
+  const PASTED_IMAGE_URL = "data:image/png;base64,img1";
+
+  /**
+   * Fake orchestrator whose `handleMessage` immediately emits `response` as a
+   * single `text-delta` followed by a `done` event for the same turn.
+   */
+  function makeOrchestratorDouble(response: string): {
+    orchestrator: NonNullable<
+      VisionHandoffDeps["resolveOrchestrator"] extends () => infer T ? T : never
+    >;
+    handleMessage: ReturnType<typeof vi.fn>;
+  } {
+    const handleMessage = vi.fn(
+      async (input: {
+        conversationId: string;
+        text: string;
+        onEvent: (event: AgentEvent) => void;
+      }): Promise<void> => {
+        const turn = { conversationId: input.conversationId, turnId: "t1" };
+        input.onEvent({
+          type: "text-delta",
+          conversationId: turn.conversationId,
+          turnId: turn.turnId,
+          delta: response,
+        });
+        input.onEvent({
+          type: "done",
+          conversationId: turn.conversationId,
+          turnId: turn.turnId,
+          reason: "stop",
+        });
+      },
+    );
+    return { orchestrator: { handleMessage }, handleMessage };
+  }
+
+  /** Fixture deps wired to an orchestrator double that answers with `response`. */
+  function depsWithOrchestrator(
+    response: string,
+    overrides: Partial<VisionHandoffDeps> = {},
+  ): { deps: VisionHandoffDeps; handleMessage: ReturnType<typeof vi.fn> } {
+    const { orchestrator, handleMessage } = makeOrchestratorDouble(response);
+    const deps = makeDeps(overrides);
+    deps.resolveOrchestrator = () => orchestrator;
+    return { deps, handleMessage };
+  }
+
+  /** Register the pasted image as image 1 of conv-1 (as prepareForProvider would). */
+  async function registerPastedImage(
+    svc: ReturnType<typeof createVisionHandoffService>,
+  ): Promise<void> {
+    const messages: ChatMessage[] = [
+      { role: "user", chunks: [{ type: "image", url: PASTED_IMAGE_URL }] },
+    ];
+    await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-1" });
+  }
+
+  it("opens a new consultation with a pasted image and returns convId + response", async () => {
+    const { deps, handleMessage } = depsWithOrchestrator("The error is on line 12.");
+    const svc = createVisionHandoffService(deps);
+    await registerPastedImage(svc);
+
+    const result = await svc.consultVision("What error is shown?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+
+    expect("error" in result).toBe(false);
+    if (!("error" in result)) {
+      expect(result.conversationId).toBeTruthy();
+      expect(result.response).toContain("line 12");
+      expect(result.response).toContain(result.conversationId);
+      expect(result.response).toContain("dispatch CLI");
+    }
+    // The orchestrator was driven with the vision model and the registered image.
+    expect(handleMessage).toHaveBeenCalledOnce();
+    const call = handleMessage.mock.calls[0]?.[0];
+    expect(call.modelName).toBe("umans/umans-kimi-k2.7");
+    expect(call.images).toHaveLength(1);
+    expect(call.images?.[0]?.url).toBe(PASTED_IMAGE_URL);
+  });
+
+  it("labels the consultation tab with an 'IMAGE - ' prefixed title", async () => {
+    const { deps } = depsWithOrchestrator("The error is on line 12.");
+    const svc = createVisionHandoffService(deps);
+    await registerPastedImage(svc);
+
+    const result = await svc.consultVision("What error is shown?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+
+    expect("error" in result).toBe(false);
+    // The title was set once: the IMAGE - prefix followed by the question.
+    expect(deps.setConversationTitle).toHaveBeenCalledOnce();
+    const [titleConvId, title] = (deps.setConversationTitle as ReturnType<typeof vi.fn>).mock
+      .calls[0];
+    expect(titleConvId).toBe((result as { conversationId: string }).conversationId);
+    expect(title).toBe("IMAGE - What error is shown?");
+  });
+
+  it("does not call setConversationTitle when it is not provided", async () => {
+    const { deps } = depsWithOrchestrator("response", { setConversationTitle: undefined });
+    const svc = createVisionHandoffService(deps);
+    await registerPastedImage(svc);
+
+    // Must not throw — setConversationTitle is optional.
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+    expect("error" in result).toBe(false);
+  });
+
+  it("opens a consultation with a file path image", async () => {
+    const { deps } = depsWithOrchestrator("It's a diagram.");
+    const svc = createVisionHandoffService(deps);
+
+    const result = await svc.consultVision("What is this diagram?", {
+      conversationId: "conv-1",
+      path: "diagram.png",
+      cwd: "/work",
+    });
+
+    expect("error" in result).toBe(false);
+    expect(deps.readFileAsDataUrl).toHaveBeenCalledWith("diagram.png", "/work");
+  });
+
+  it("returns an error when imageId is not registered", async () => {
+    const { deps } = depsWithOrchestrator("response");
+    const svc = createVisionHandoffService(deps);
+
+    // Nothing was registered, so ID 99 cannot be found.
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+      imageIds: [99],
+    });
+    expect("error" in result).toBe(true);
+    if ("error" in result) {
+      expect(result.error).toContain("Image 99");
+    }
+  });
+
+  it("returns an error when no orchestrator is available", async () => {
+    // Plain fixture deps: no resolveOrchestrator is provided.
+    const svc = createVisionHandoffService(makeDeps());
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+    expect("error" in result).toBe(true);
+  });
+
+  it("returns an error when no vision model is available", async () => {
+    const { deps } = depsWithOrchestrator("response");
+    (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+    const svc = createVisionHandoffService(deps);
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+    expect("error" in result).toBe(true);
+    if ("error" in result) {
+      expect(result.error).toContain("No vision-capable model");
+    }
+  });
+
+  it("returns an error when no image source is provided", async () => {
+    const { deps } = depsWithOrchestrator("response");
+    const svc = createVisionHandoffService(deps);
+    // Neither imageIds nor path is given.
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+    });
+    expect("error" in result).toBe(true);
+  });
+});
diff --git a/packages/vision-handoff/src/service.ts b/packages/vision-handoff/src/service.ts
new file mode 100644
index 0000000..01245df
--- /dev/null
+++ b/packages/vision-handoff/src/service.ts
@@ -0,0 +1,684 @@
+/**
+ * Vision handoff service — the imperative shell that performs the universal,
+ * provider-agnostic vision handoff.
+ *
+ * Two capabilities:
+ * 1. **Provider preparation** (`prepareForProvider`): when a user message carries
+ * images but the active model cannot see them, this replaces each image chunk
+ * with a numbered placeholder (telling the model to call `consult_vision`)
+ * and registers the image data in a per-conversation registry for tool
+ * access. For vision-capable models images flow natively; only when an
+ * `imageLimit` is set and exceeded are the oldest images replaced by text
+ * transcriptions.
+ * 2. **consult_vision tool** (`consultVision`): opens a NEW conversation tab with
+ * a vision-capable model (resolved from the catalog — any provider), attaches
+ * the image(s) + the model's specific question, waits for the response, and
+ * returns the conversation ID + the vision model's answer. The model (e.g.
+ * GLM 5.2) directs the analysis — asking exactly what it needs — instead of
+ * receiving a pre-emptive generic dump. Follow-up questions go through the
+ * dispatch CLI (the conversation ID is the bridge), not another tool call.
+ *
+ * Effects (credential store, orchestrator, filesystem) are injected. The pure
+ * decisions live in `pure.ts`. This shell wires them.
+ */
+
+import type { CredentialStore } from "@dispatch/credential-store";
+import type {
+ AgentEvent,
+ ChatMessage,
+ Chunk,
+ ImageInput,
+ Logger,
+ ModelInfo,
+ ProviderContract,
+} from "@dispatch/kernel";
+import { defineService, type ServiceHandle } from "@dispatch/kernel";
+import {
+ collectTextFromStream,
+ findVisionModelName,
+ formatConsultationTitle,
+ formatConsultResult,
+ formatImagePlaceholder,
+ formatNoVisionPlaceholder,
+ isVisionCapable,
+} from "./pure.js";
+
+/**
+ * Minimal orchestrator interface the service needs to start vision consultation
+ * turns. Defined locally (not imported from session-orchestrator) to avoid a
+ * compile-time dependency — resolved lazily at runtime via a local handle keyed
+ * to the same service ID.
+ */
+ export interface OrchestratorForVision {
+ /**
+ * Run one turn in the given conversation. The vision service awaits the
+ * returned promise and only then reads the text it accumulated via `onEvent`.
+ */
+ readonly handleMessage: (input: {
+ /** Conversation to run the turn in; the service passes a freshly generated ID. */
+ readonly conversationId: string;
+ /** The user text of the turn (the question about the image). */
+ readonly text: string;
+ /** Event sink; the service consumes `text-delta` and `error` events. */
+ readonly onEvent: (event: AgentEvent) => void;
+ /** Catalog name of the model to use (the resolved vision model). */
+ readonly modelName?: string;
+ /** Working directory, forwarded only when the caller supplied one. */
+ readonly cwd?: string;
+ /** Images to attach to the user message. */
+ readonly images?: readonly ImageInput[];
+ /** System prompt for the turn. */
+ readonly systemPrompt?: string;
+ }) => Promise<void>;
+ }
+
+/** Local handle for the session-orchestrator service (same ID, no import dep). */
+export const orchestratorLocalHandle: ServiceHandle<OrchestratorForVision> =
+ defineService<OrchestratorForVision>("session-orchestrator/orchestrator");
+
+/**
+ * Resolved vision model — a provider + its model id, ready to stream from.
+ */
+export interface ResolvedVisionModel {
+ readonly provider: ProviderContract;
+ readonly model: string;
+ readonly modelName: string;
+}
+
+/** A registered image (looked up by the consult_vision tool via imageId). */
+interface RegisteredImage {
+ readonly url: string;
+ readonly mimeType?: string;
+}
+
+/**
+ * Dependencies the service needs — all injected (no ambient state).
+ */
+export interface VisionHandoffDeps {
+ readonly credentialStore: CredentialStore;
+ /** Resolve a `<credentialName>/<model>` → its provider + model id. */
+ readonly resolveModel: (
+ modelName: string,
+ ) => { provider: ProviderContract; model: string } | undefined;
+ /**
+ * Read a file from disk as a base64 data URL. Injected so the shell controls
+ * the filesystem edge. Returns the data URL, or throws on error.
+ */
+ readonly readFileAsDataUrl: (path: string, cwd?: string) => Promise<string>;
+ /**
+ * Lazily resolve the session-orchestrator (for starting vision consultation
+ * turns). Returns `undefined` when not available — `consult_vision` degrades
+ * with an error. Lazy so activation order doesn't matter.
+ */
+ readonly resolveOrchestrator?: () => OrchestratorForVision | undefined;
+ /**
+ * Get the per-conversation cached image transcriptions (imageUrl → text).
+ * Used to avoid re-transcribing old images that were compacted to text on a
+ * previous turn. Optional — when absent, compaction still works but
+ * re-transcribes every turn (no caching).
+ */
+ readonly getImageTranscriptions?: (
+ conversationId: string,
+ ) => Promise<ReadonlyMap<string, string>>;
+ /**
+ * Upsert a single image transcription into the per-conversation cache.
+ * Optional — paired with getImageTranscriptions.
+ */
+ readonly setImageTranscription?: (
+ conversationId: string,
+ imageUrl: string,
+ transcription: string,
+ ) => Promise<void>;
+ /**
+ * Save an image data URL to a tmp file and return a compact URL
+ * (`/images/<conversationId>/<imageId>.<ext>`) that can be persisted in the
+ * conversation store instead of the full data URL (which would be megabytes).
+ * The frontend serves the image via `GET /images/...`; the provider resolves
+ * it back to a data URL via {@link resolveImageUrl} at runtime. When `undefined`,
+ * data URLs pass through unchanged (images persist in SQLite — the large-DB
+ * path, for environments without tmp file support).
+ */
+ readonly saveImageToTmp?: (
+ conversationId: string,
+ dataUrl: string,
+ mimeType?: string,
+ ) => Promise<string>;
+ /**
+ * Resolve a compact URL (`/images/...`) back to a data URL by reading the tmp
+ * file. Data URLs and HTTP URLs pass through unchanged. Paired with
+ * {@link saveImageToTmp}.
+ */
+ readonly resolveImageUrl?: (url: string) => Promise<string>;
+ /**
+ * Delete a tmp image file (after it has been compacted to text — the
+ * transcription is cached, the raw image is no longer needed). Best-effort:
+ * errors are logged, not thrown.
+ */
+ readonly deleteTmpImage?: (compactUrl: string) => Promise<void>;
+ /**
+ * Delete all tmp images for a conversation (on conversation close).
+ * Best-effort.
+ */
+ readonly deleteConversationImages?: (conversationId: string) => Promise<void>;
+ /**
+ * Set the human-readable title of a conversation. Used to label vision
+ * consultation tabs with an `"IMAGE - "` prefix so they're visually
+ * distinguishable from normal conversation tabs. Backed by the conversation
+ * store's `setConversationTitle`. Optional — when absent, consultation tabs
+ * keep their default (question-derived) title.
+ */
+ readonly setConversationTitle?: (conversationId: string, title: string) => Promise<void>;
+ /** Generate a new conversation ID for a consultation. Defaults to crypto.randomUUID. */
+ readonly generateId?: () => string;
+ readonly logger?: Logger;
+}
+
+export interface VisionHandoffService {
+ /**
+ * Whether a given model (by catalog name) is vision-capable. Uses the
+ * credential store's ModelInfo + the name heuristic.
+ */
+ readonly isVisionCapable: (modelName: string | undefined) => Promise<boolean>;
+
+ /**
+ * Store images to tmp files and return compact URLs. Each input image's data
+ * URL is saved to `/tmp/dispatch/images/<conversationId>/<uuid>.<ext>` and
+ * replaced with a compact HTTP path (`/images/<conversationId>/<uuid>.<ext>`)
+ * so the persisted conversation store holds a tiny string, not megabytes of
+ * base64. When `saveImageToTmp` is not configured, data URLs pass through
+ * unchanged (backward compatible).
+ */
+ readonly storeImages: (
+ conversationId: string,
+ images: readonly ImageInput[],
+ ) => Promise<readonly ImageInput[]>;
+
+ /**
+ * Delete all tmp images for a conversation (on close). Best-effort.
+ */
+ readonly purgeConversationImages: (conversationId: string) => Promise<void>;
+
+ /**
+ * Resolve a vision-capable model from the catalog (any provider). Returns
+ * `undefined` when none is available.
+ */
+ readonly resolveVisionModel: (excludeName?: string) => Promise<ResolvedVisionModel | undefined>;
+
+ /**
+ * Transform a message list for the provider: if the active model is
+ * vision-capable, images pass through natively — except that, when
+ * `opts.imageLimit` is set and exceeded, the oldest images are replaced by
+ * text transcriptions.
+ * If NOT vision-capable, replace every `image` chunk with a numbered
+ * placeholder (telling the model to call `consult_vision`) and register the
+ * image data in the per-conversation registry for tool access. The PERSISTED
+ * history is NOT modified — only what the provider sees. Never throws.
+ */
+ readonly prepareForProvider: (
+ messages: readonly ChatMessage[],
+ currentModelName: string | undefined,
+ opts?: {
+ readonly conversationId?: string;
+ readonly imageLimit?: number;
+ readonly signal?: AbortSignal;
+ readonly logger?: Logger;
+ },
+ ) => Promise<readonly ChatMessage[]>;
+
+ /**
+ * Look up a registered image by conversation ID + image ID. Returns
+ * `undefined` when the image isn't registered (e.g. after a server restart).
+ */
+ readonly getRegisteredImage: (
+ conversationId: string,
+ imageId: number,
+ ) => RegisteredImage | undefined;
+
+ /**
+ * Open a NEW vision consultation conversation: attach image(s) + the model's
+ * question to a vision-capable model, wait for the response, and return the
+ * conversation ID + the vision model's answer. The model drives the analysis
+ * — it asks exactly what it needs. Follow-ups go through the dispatch CLI.
+ *
+ * @returns The conversation ID + the vision model's response text, or an
+ * error string (never throws — the tool surfaces it).
+ */
+ readonly consultVision: (
+ question: string,
+ opts: {
+ readonly conversationId: string;
+ readonly imageIds?: readonly number[];
+ readonly path?: string;
+ readonly cwd?: string;
+ readonly signal?: AbortSignal;
+ readonly logger?: Logger;
+ },
+ ) => Promise<
+ { readonly conversationId: string; readonly response: string } | { readonly error: string }
+ >;
+}
+
+export const visionHandoffHandle: ServiceHandle<VisionHandoffService> =
+ defineService<VisionHandoffService>("vision-handoff/service");
+
+ /**
+  * Reports whether at least one message in the list carries an `image` chunk.
+  * Pure: it only reads its argument.
+  */
+ function hasImageChunks(messages: readonly ChatMessage[]): boolean {
+   for (const message of messages) {
+     for (const chunk of message.chunks) {
+       if (chunk.type === "image") return true;
+     }
+   }
+   return false;
+ }
+
+export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHandoffService {
+ const log = deps.logger;
+ const generateId = deps.generateId ?? (() => crypto.randomUUID());
+
+ // Per-conversation image registry: conversationId → (imageId → image data).
+ // Populated by prepareForProvider; consulted by the consult_vision tool.
+ // In-memory only (cleared on restart — the user re-pastes if needed).
+ const imageRegistry = new Map<string, Map<number, RegisteredImage>>();
+
+ async function getInfo(modelName: string): Promise<ModelInfo | undefined> {
+ return deps.credentialStore.getModelInfo(modelName);
+ }
+
+ /**
+  * Pick a vision-capable model name from the credential store's catalog and
+  * resolve it to a provider + model id. Yields `undefined` when the catalog
+  * offers no candidate or the chosen name cannot be resolved.
+  */
+ async function resolveVisionModel(
+   excludeName?: string,
+ ): Promise<ResolvedVisionModel | undefined> {
+   const entries = await deps.credentialStore.listCatalog();
+   const modelName = await findVisionModelName(entries, getInfo, excludeName);
+   if (modelName === undefined) return undefined;
+
+   const target = deps.resolveModel(modelName);
+   return target === undefined
+     ? undefined
+     : { provider: target.provider, model: target.model, modelName };
+ }
+
+ /**
+  * Compact images for a vision-capable model. When the messages hold more
+  * `image` chunks than `opts.imageLimit`, the oldest ones are replaced — in the
+  * returned copy only — by a `[Compacted image]: …` text chunk; the newest
+  * `imageLimit` images are left untouched.
+  *
+  * Transcriptions come from the per-conversation cache when the caching deps
+  * and a conversation ID are available. Anything missing is transcribed through
+  * a vision model from the catalog and written back to the cache. Without a
+  * vision model, or when transcription fails, a bracketed notice stands in for
+  * the description; that notice is not cached, so a later turn can retry.
+  *
+  * Compacted chunks are tracked by POSITION, not by URL: the same image may
+  * appear both among the old images and among the recent ones, and the recent
+  * occurrence must stay native.
+  *
+  * @param messages Provider-bound messages; never mutated.
+  * @param opts `imageLimit` (undefined or <= 0 disables compaction) and
+  *   `conversationId` (enables the transcription cache).
+  * @param currentModelName Currently unused.
+  * @returns `messages` itself when nothing needs compacting, otherwise a new list.
+  */
+ async function compactImagesForVisionModel(
+   messages: readonly ChatMessage[],
+   opts:
+     | {
+         readonly conversationId?: string;
+         readonly imageLimit?: number;
+         readonly signal?: AbortSignal;
+         readonly logger?: Logger;
+       }
+     | undefined,
+   currentModelName: string | undefined,
+ ): Promise<readonly ChatMessage[]> {
+   void currentModelName; // reserved for future model-specific compaction logic
+   const limit = opts?.imageLimit;
+   // No limit or limit <= 0 → pass all images through (compaction disabled).
+   if (limit === undefined || limit <= 0) return messages;
+
+   // Collect all image chunks in order (oldest first, across all messages).
+   const imageEntries: { msgIdx: number; chunkIdx: number; url: string }[] = [];
+   for (const [mi, msg] of messages.entries()) {
+     for (const [ci, chunk] of msg.chunks.entries()) {
+       if (chunk.type === "image") {
+         imageEntries.push({ msgIdx: mi, chunkIdx: ci, url: chunk.url });
+       }
+     }
+   }
+
+   // If within the limit, pass everything through natively.
+   if (imageEntries.length <= limit) return messages;
+
+   // The oldest (imageEntries.length - limit) images need transcription.
+   const cutoff = imageEntries.length - limit;
+   const toTranscribe = imageEntries.slice(0, cutoff);
+   // URLs still shown natively; their backing tmp file must survive.
+   const keptUrls = new Set(imageEntries.slice(cutoff).map((e) => e.url));
+   const positionKey = (msgIdx: number, chunkIdx: number): string => `${msgIdx}:${chunkIdx}`;
+   const compactedPositions = new Set(toTranscribe.map((e) => positionKey(e.msgIdx, e.chunkIdx)));
+
+   // Load cached transcriptions.
+   const convId = opts?.conversationId;
+   const cache =
+     convId !== undefined && deps.getImageTranscriptions !== undefined
+       ? await deps.getImageTranscriptions(convId)
+       : new Map<string, string>();
+
+   // Transcribe any that aren't cached yet (via the vision model). The model
+   // is looked up lazily so a fully cached turn never scans the catalog.
+   const transcriptions = new Map<string, string>(cache);
+   let vision: ResolvedVisionModel | undefined;
+   let visionLookedUp = false;
+   for (const entry of toTranscribe) {
+     if (transcriptions.has(entry.url)) continue;
+     if (!visionLookedUp) {
+       vision = await resolveVisionModel();
+       visionLookedUp = true;
+     }
+     if (vision === undefined) {
+       // No vision model available for transcription — use a placeholder.
+       transcriptions.set(
+         entry.url,
+         "[Image was compacted — no vision model available to transcribe it.]",
+       );
+       continue;
+     }
+     try {
+       const prompt =
+         "Describe this image in detail. Include visible text (transcribe verbatim), " +
+         "key objects, layout, and notable details. This description will replace " +
+         "the image in a conversation history, so be thorough.";
+       const userMessage: ChatMessage = {
+         role: "user",
+         chunks: [
+           { type: "text", text: prompt },
+           { type: "image", url: entry.url },
+         ],
+       };
+       const stream = vision.provider.stream([userMessage], [], {
+         model: vision.model,
+         systemPrompt: "You are a vision assistant. Describe images faithfully and thoroughly.",
+       });
+       const description = (await collectTextFromStream(stream)).trim();
+       const text =
+         description.length > 0 ? description : "[Image transcription produced no output.]";
+       transcriptions.set(entry.url, text);
+       // Cache it in the conversation store (if available).
+       if (convId !== undefined && deps.setImageTranscription !== undefined) {
+         await deps.setImageTranscription(convId, entry.url, text);
+       }
+       // The image has been transcribed to text — delete the tmp file, unless
+       // the same URL is still being sent natively as one of the recent images.
+       if (deps.deleteTmpImage !== undefined && !keptUrls.has(entry.url)) {
+         try {
+           await deps.deleteTmpImage(entry.url);
+         } catch {
+           // Best-effort — don't let cleanup failure break the turn.
+         }
+       }
+     } catch (err) {
+       const msg = err instanceof Error ? err.message : String(err);
+       log?.warn("vision-handoff: image compaction transcription failed", { error: msg });
+       transcriptions.set(entry.url, `[Image transcription failed: ${msg}]`);
+     }
+   }
+
+   // Build the provider messages: replace the compacted positions with text,
+   // keep every other chunk (including recent images) as-is.
+   const result: ChatMessage[] = [];
+   for (const [mi, msg] of messages.entries()) {
+     if (!msg.chunks.some((c) => c.type === "image")) {
+       result.push(msg);
+       continue;
+     }
+     const newChunks: Chunk[] = [];
+     for (const [ci, chunk] of msg.chunks.entries()) {
+       const transcription =
+         chunk.type === "image" && compactedPositions.has(positionKey(mi, ci))
+           ? transcriptions.get(chunk.url)
+           : undefined;
+       if (transcription !== undefined) {
+         newChunks.push({ type: "text", text: `[Compacted image]: ${transcription}` });
+       } else {
+         newChunks.push(chunk); // not compacted (or no text available): keep as-is
+       }
+     }
+     result.push({ role: msg.role, chunks: newChunks });
+   }
+   return result;
+ }
+
+ /**
+  * Return a copy of `messages` in which every `image` chunk's URL has been
+  * passed through the injected `resolveImageUrl` dep. The input list is
+  * returned untouched when that dep is absent or no message carries an image.
+  *
+  * A failed resolution is logged and the chunk is kept with its original URL,
+  * so a single unreadable image cannot make the whole call reject (the caller,
+  * `prepareForProvider`, is documented as never throwing).
+  */
+ async function resolveImageUrlsInMessages(
+   messages: readonly ChatMessage[],
+ ): Promise<readonly ChatMessage[]> {
+   if (deps.resolveImageUrl === undefined) return messages;
+   if (!hasImageChunks(messages)) return messages;
+
+   const result: ChatMessage[] = [];
+   for (const msg of messages) {
+     if (!msg.chunks.some((c) => c.type === "image")) {
+       result.push(msg);
+       continue;
+     }
+     const newChunks: Chunk[] = [];
+     for (const chunk of msg.chunks) {
+       if (chunk.type !== "image") {
+         newChunks.push(chunk);
+         continue;
+       }
+       try {
+         const dataUrl = await deps.resolveImageUrl(chunk.url);
+         newChunks.push({
+           type: "image",
+           url: dataUrl,
+           ...(chunk.mimeType !== undefined ? { mimeType: chunk.mimeType } : {}),
+         });
+       } catch (err) {
+         // The URL itself is not logged: it may be a multi-megabyte data URL.
+         log?.warn("vision-handoff: failed to resolve image URL", {
+           error: err instanceof Error ? err.message : String(err),
+         });
+         newChunks.push(chunk);
+       }
+     }
+     result.push({ role: msg.role, chunks: newChunks });
+   }
+   return result;
+ }
+
+ const service: VisionHandoffService = {
+ /** A missing model name is never vision-capable; otherwise defer to the pure check. */
+ async isVisionCapable(modelName: string | undefined): Promise<boolean> {
+   return modelName === undefined
+     ? false
+     : isVisionCapable(modelName, await getInfo(modelName));
+ },
+
+ /**
+  * Swap each `data:` URL for the URL returned by `saveImageToTmp`; every other
+  * image is passed along as-is. Without that dep the input is returned unchanged.
+  */
+ async storeImages(
+   conversationId: string,
+   images: readonly ImageInput[],
+ ): Promise<readonly ImageInput[]> {
+   if (deps.saveImageToTmp === undefined) return images;
+
+   const stored: ImageInput[] = [];
+   for (const image of images) {
+     if (!image.url.startsWith("data:")) {
+       stored.push(image);
+       continue;
+     }
+     const compactUrl = await deps.saveImageToTmp(conversationId, image.url, image.mimeType);
+     stored.push(
+       image.mimeType !== undefined
+         ? { url: compactUrl, mimeType: image.mimeType }
+         : { url: compactUrl },
+     );
+   }
+   return stored;
+ },
+
+ /**
+  * Release everything held for a conversation: the in-memory image registry
+  * entry (which can hold full image URLs) and, when the dep is configured, the
+  * conversation's tmp image files. Best-effort: a failing file purge is logged,
+  * never thrown.
+  */
+ async purgeConversationImages(conversationId: string): Promise<void> {
+   // Drop the registry entry first so it is released even when no file-purge
+   // dep is configured or the purge below fails.
+   imageRegistry.delete(conversationId);
+   if (deps.deleteConversationImages === undefined) return;
+   try {
+     await deps.deleteConversationImages(conversationId);
+   } catch (err) {
+     log?.warn("vision-handoff: failed to purge conversation images", {
+       conversationId,
+       error: err instanceof Error ? err.message : String(err),
+     });
+   }
+ },
+
+ resolveVisionModel,
+
+ /**
+  * Build the provider's view of `messages`. Vision-capable models get the
+  * (possibly compacted) images; every other model gets a numbered text
+  * placeholder per image, with the image registered for `consult_vision`.
+  */
+ async prepareForProvider(
+   messages: readonly ChatMessage[],
+   currentModelName: string | undefined,
+   opts?: {
+     readonly conversationId?: string;
+     readonly imageLimit?: number;
+     readonly signal?: AbortSignal;
+     readonly logger?: Logger;
+   },
+ ): Promise<readonly ChatMessage[]> {
+   // Nothing to transform when no message carries an image.
+   if (!hasImageChunks(messages)) return messages;
+
+   // Persisted chunks may hold compact URLs; the provider needs them resolved.
+   const providerView = await resolveImageUrlsInMessages(messages);
+
+   let nativeVision = false;
+   if (currentModelName !== undefined) {
+     const info = await getInfo(currentModelName);
+     nativeVision = await isVisionCapable(currentModelName, info);
+   }
+   // Vision-capable model: keep images native, compacting the oldest ones
+   // once the conversation exceeds the image limit.
+   if (nativeVision) {
+     return compactImagesForVisionModel(providerView, opts, currentModelName);
+   }
+
+   // Non-vision model: swap images for placeholders. IDs are 1-based and run
+   // sequentially across all messages.
+   const consultant = await resolveVisionModel();
+   const conversationId = opts?.conversationId;
+   let nextImageId = 0;
+
+   const toPlaceholder = (chunk: Chunk): Chunk => {
+     if (chunk.type !== "image") return chunk;
+     nextImageId += 1;
+     if (conversationId === undefined || consultant === undefined) {
+       // No way to consult: nothing is registered, and the placeholder says so.
+       return { type: "text", text: formatNoVisionPlaceholder() };
+     }
+     let registered = imageRegistry.get(conversationId);
+     if (registered === undefined) {
+       registered = new Map();
+       imageRegistry.set(conversationId, registered);
+     }
+     registered.set(nextImageId, {
+       url: chunk.url,
+       ...(chunk.mimeType !== undefined ? { mimeType: chunk.mimeType } : {}),
+     });
+     return { type: "text", text: formatImagePlaceholder(nextImageId) };
+   };
+
+   return providerView.map((msg) =>
+     msg.chunks.some((c) => c.type === "image")
+       ? { role: msg.role, chunks: msg.chunks.map(toPlaceholder) }
+       : msg,
+   );
+ },
+
+ getRegisteredImage(conversationId: string, imageId: number): RegisteredImage | undefined {
+ return imageRegistry.get(conversationId)?.get(imageId);
+ },
+
+ /**
+  * Run one vision consultation in a brand-new conversation: gather the
+  * requested images (registered IDs and/or a file), start a turn with the
+  * resolved vision model, collect its streamed text, and return it together
+  * with the new conversation's ID.
+  *
+  * Every failure is reported as `{ error }`; the method does not throw.
+  * `opts.signal` cannot be forwarded to the orchestrator (its `handleMessage`
+  * takes no signal), so it is honoured by refusing to START a consultation
+  * once the caller has cancelled.
+  */
+ async consultVision(
+   question: string,
+   opts: {
+     readonly conversationId: string;
+     readonly imageIds?: readonly number[];
+     readonly path?: string;
+     readonly cwd?: string;
+     readonly signal?: AbortSignal;
+     readonly logger?: Logger;
+   },
+ ): Promise<
+   { readonly conversationId: string; readonly response: string } | { readonly error: string }
+ > {
+   const isCancelled = (): boolean => opts.signal?.aborted === true;
+   const cancelledResult = {
+     error: "Vision consultation was cancelled before it started.",
+   } as const;
+   if (isCancelled()) return cancelledResult;
+
+   const orchestrator = deps.resolveOrchestrator?.();
+   if (orchestrator === undefined) {
+     return {
+       error: "The session orchestrator is not available — cannot start a vision consultation.",
+     };
+   }
+
+   const vision = await resolveVisionModel();
+   if (vision === undefined) {
+     return {
+       error:
+         "No vision-capable model is available in the catalog. Install or configure one (e.g. kimi) to enable image analysis.",
+     };
+   }
+
+   // Collect image data URLs to attach.
+   const images: ImageInput[] = [];
+   if (opts.imageIds !== undefined) {
+     for (const id of opts.imageIds) {
+       const img = service.getRegisteredImage(opts.conversationId, id);
+       if (img === undefined) {
+         return {
+           error: `Image ${id} is not registered. It may have been lost after a server restart — ask the user to re-paste the image.`,
+         };
+       }
+       images.push({
+         url: img.url,
+         ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}),
+       });
+     }
+   }
+   if (opts.path !== undefined) {
+     try {
+       const dataUrl = await deps.readFileAsDataUrl(opts.path, opts.cwd);
+       images.push({ url: dataUrl });
+     } catch (err) {
+       const msg = err instanceof Error ? err.message : String(err);
+       return { error: `Failed to read image file "${opts.path}": ${msg}` };
+     }
+   }
+   if (images.length === 0) {
+     return {
+       error:
+         "No image to consult about. Provide imageIds (for pasted images) or path (for a file).",
+     };
+   }
+
+   // Last chance to bail out: past this point a new conversation is created
+   // and a model turn is spent.
+   if (isCancelled()) return cancelledResult;
+
+   // Start a NEW conversation with the vision model.
+   const consultationId = generateId();
+   log?.info("vision-handoff: starting consultation", {
+     consultationId,
+     visionModel: vision.modelName,
+     imageCount: images.length,
+     fromConversation: opts.conversationId,
+   });
+
+   // Label the consultation tab with an "IMAGE - " prefix so it's visually
+   // distinguishable from normal conversation tabs. Set BEFORE the turn
+   // starts so the tab shows the correct title from the first moment (the
+   // store keeps a non-"Untitled" title on first message append).
+   if (deps.setConversationTitle !== undefined) {
+     try {
+       await deps.setConversationTitle(consultationId, formatConsultationTitle(question));
+     } catch (err) {
+       // Best-effort — don't let a title-write failure break the consultation.
+       log?.warn("vision-handoff: failed to set consultation title", {
+         consultationId,
+         error: err instanceof Error ? err.message : String(err),
+       });
+     }
+   }
+
+   // Accumulate the streamed answer; remember the last error event, if any.
+   let responseText = "";
+   let errorMessage = "";
+   try {
+     await orchestrator.handleMessage({
+       conversationId: consultationId,
+       text: question,
+       images,
+       modelName: vision.modelName,
+       ...(opts.cwd !== undefined ? { cwd: opts.cwd } : {}),
+       systemPrompt:
+         "You are a vision assistant. A developer who cannot see images is asking you specific questions about an image they attached. Answer their question precisely and thoroughly.",
+       onEvent: (event: AgentEvent) => {
+         if (event.type === "text-delta") {
+           responseText += event.delta;
+         } else if (event.type === "error") {
+           errorMessage = event.message;
+         }
+       },
+     });
+   } catch (err) {
+     const msg = err instanceof Error ? err.message : String(err);
+     return { error: `Vision consultation failed: ${msg}` };
+   }
+
+   // An error event only fails the call when no usable text was produced.
+   if (errorMessage.length > 0 && responseText.trim().length === 0) {
+     return { error: `Vision consultation failed: ${errorMessage}` };
+   }
+
+   const response = formatConsultResult(consultationId, responseText);
+   return { conversationId: consultationId, response };
+ },
+ };
+
+ return service;
+}
diff --git a/packages/vision-handoff/src/tool.ts b/packages/vision-handoff/src/tool.ts
new file mode 100644
index 0000000..86be2ed
--- /dev/null
+++ b/packages/vision-handoff/src/tool.ts
@@ -0,0 +1,137 @@
+/**
+ * consult_vision tool — lets any model (vision-capable or not) consult a
+ * vision-capable model about an image by opening a NEW conversation tab.
+ *
+ * The tool attaches image(s) + the model's specific question to a vision-capable
+ * model (resolved from the catalog — e.g. Kimi), waits for the response, and
+ * returns the conversation ID + the vision model's answer. The MODEL directs the
+ * analysis — it asks exactly what it needs to know — instead of receiving a
+ * pre-emptive generic dump.
+ *
+ * For images PASTED into the chat, the model references them by `imageIds` (from
+ * the "[Image N attached]" placeholders the orchestrator injected). For image
+ * FILES on disk, the model passes a `path`.
+ *
+ * Follow-up questions are NOT handled by this tool — the model uses the dispatch
+ * CLI to continue the vision conversation (the returned conversation ID is the
+ * bridge; the model can load the `dispatch-cli` skill for the exact commands).
+ */
+
+import type { ToolContract, ToolExecuteContext, ToolResult } from "@dispatch/kernel";
+import type { VisionHandoffService } from "./service.js";
+
+ /**
+  * Validate the raw `imageIds` tool argument.
+  *
+  * Image IDs are the 1-based numbers from the "[Image N attached]" placeholders,
+  * so every entry must be a positive integer. An invalid entry rejects the whole
+  * argument instead of being dropped silently — otherwise the model would
+  * believe it attached images that were never sent.
+  *
+  * @returns The validated IDs, or an error message to surface to the model.
+  */
+ function parseImageIds(raw: unknown): readonly number[] | string {
+   if (!Array.isArray(raw)) return "Error: 'imageIds' must be an array of numbers.";
+   if (raw.length === 0) return "Error: 'imageIds' must contain at least one number.";
+   const ids: number[] = [];
+   for (const entry of raw as readonly unknown[]) {
+     if (typeof entry !== "number" || !Number.isInteger(entry) || entry < 1) {
+       const shown =
+         typeof entry === "number" ? String(entry) : (JSON.stringify(entry) ?? String(entry));
+       return (
+         "Error: 'imageIds' must contain only positive integers (the N from the " +
+         `'[Image N attached]' placeholders); got ${shown}.`
+       );
+     }
+     ids.push(entry);
+   }
+   return ids;
+ }
+
+ /**
+  * Build the `consult_vision` tool around a vision handoff service.
+  *
+  * `execute` validates the arguments (a non-empty `question`, plus `imageIds`
+  * and/or `path`), delegates to `service.consultVision`, and maps the outcome to
+  * a tool result: the response text on success, `isError: true` with the
+  * message otherwise. It does not throw — unexpected exceptions are reported as
+  * error results too. An explicit `null` for an optional argument is treated as
+  * "not provided".
+  */
+ export function createConsultVisionTool(service: VisionHandoffService): ToolContract {
+   return {
+     name: "consult_vision",
+     description:
+       "Consult a vision-capable model (e.g. Kimi) about an image by opening a new " +
+       "conversation tab. Attaches the image(s) + your specific question, waits for " +
+       "the vision model's response, and returns the conversation ID + the answer. " +
+       "Use this when you cannot view an image (e.g. a pasted screenshot or diagram) " +
+       "and need to know what it shows — ask a SPECIFIC question (e.g. 'What error " +
+       "message is on line 12?' rather than 'describe this image'). The conversation " +
+       "ID is returned so follow-up questions can be asked via the dispatch CLI.",
+     parameters: {
+       type: "object",
+       properties: {
+         question: {
+           type: "string",
+           description:
+             "Your specific question about the image. Be precise — the vision model " +
+             "will answer exactly this. E.g. 'What error message is displayed?' or " +
+             "'Compare the layout of these two screenshots.'",
+         },
+         imageIds: {
+           type: "array",
+           items: { type: "number" },
+           description:
+             "The IDs of pasted images to attach (from the '[Image N attached]' " +
+             "placeholders in the conversation). Pass multiple to attach several " +
+             "images to one consultation (e.g. [1, 2] to compare them).",
+         },
+         path: {
+           type: "string",
+           description:
+             "Path to an image FILE on disk to attach (alternative to imageIds for " +
+             "code-referenced images). Relative paths resolve against the cwd.",
+         },
+       },
+       required: ["question"],
+     },
+     concurrencySafe: true,
+     async execute(args: unknown, ctx: ToolExecuteContext): Promise<ToolResult> {
+       const input = args as {
+         question?: unknown;
+         imageIds?: unknown;
+         path?: unknown;
+       } | null;
+
+       const question = input?.question;
+       if (typeof question !== "string" || question.trim().length === 0) {
+         return {
+           content: "Error: 'question' is required and must be a non-empty string.",
+           isError: true,
+         };
+       }
+
+       // imageIds: every entry must be a valid 1-based image ID.
+       const rawImageIds = input?.imageIds ?? undefined;
+       let parsedImageIds: readonly number[] | undefined;
+       if (rawImageIds !== undefined) {
+         const parsed = parseImageIds(rawImageIds);
+         if (typeof parsed === "string") return { content: parsed, isError: true };
+         parsedImageIds = parsed;
+       }
+
+       // path must be a non-empty string if present.
+       const rawPath = input?.path ?? undefined;
+       let parsedPath: string | undefined;
+       if (rawPath !== undefined) {
+         if (typeof rawPath !== "string" || rawPath.trim().length === 0) {
+           return { content: "Error: 'path' must be a non-empty string.", isError: true };
+         }
+         parsedPath = rawPath;
+       }
+
+       // At least one image source is required.
+       if (parsedImageIds === undefined && parsedPath === undefined) {
+         return {
+           content:
+             "Error: provide 'imageIds' (for pasted images) or 'path' (for a file) " +
+             "to attach an image to the consultation.",
+           isError: true,
+         };
+       }
+
+       const span = ctx.log.span("consult_vision.execute", {
+         imageCount: (parsedImageIds?.length ?? 0) + (parsedPath !== undefined ? 1 : 0),
+       });
+       try {
+         const result = await service.consultVision(question, {
+           conversationId: ctx.conversationId ?? "",
+           ...(parsedImageIds !== undefined ? { imageIds: parsedImageIds } : {}),
+           ...(parsedPath !== undefined ? { path: parsedPath } : {}),
+           ...(ctx.cwd !== undefined ? { cwd: ctx.cwd } : {}),
+           signal: ctx.signal,
+           logger: ctx.log,
+         });
+         span.end({ attrs: { ok: !("error" in result) } });
+         if ("error" in result) {
+           return { content: result.error, isError: true };
+         }
+         return { content: result.response };
+       } catch (err: unknown) {
+         span.end({ err });
+         return {
+           content: `Error during vision consultation: ${err instanceof Error ? err.message : String(err)}`,
+           isError: true,
+         };
+       }
+     },
+   };
+ }
diff --git a/packages/vision-handoff/tsconfig.json b/packages/vision-handoff/tsconfig.json
new file mode 100644
index 0000000..b5439aa
--- /dev/null
+++ b/packages/vision-handoff/tsconfig.json
@@ -0,0 +1,12 @@
+{
+ "extends": "../../tsconfig.base.json",
+ "compilerOptions": { "rootDir": "src", "outDir": "dist", "composite": true },
+ "include": ["src/**/*.ts"],
+ "references": [
+ { "path": "../kernel" },
+ { "path": "../wire" },
+ { "path": "../conversation-store" },
+ { "path": "../credential-store" },
+ { "path": "../openai-stream" }
+ ]
+}
diff --git a/packages/wire/src/index.test.ts b/packages/wire/src/index.test.ts
index 3f07e00..81d10c1 100644
--- a/packages/wire/src/index.test.ts
+++ b/packages/wire/src/index.test.ts
@@ -8,7 +8,7 @@
*/
import { describe, expect, it } from "vitest";
-import type { Computer, ComputerEntry, Workspace } from "./index.js";
+import type { Chunk, Computer, ComputerEntry, ImageChunk, ImageInput, Workspace } from "./index.js";
describe("@dispatch/wire — Computer / Workspace shapes", () => {
it("a Computer literal satisfies the Computer type", () => {
@@ -57,3 +57,32 @@ describe("@dispatch/wire — Computer / Workspace shapes", () => {
expect(local.defaultComputerId).toBeNull();
});
});
+
+describe("@dispatch/wire — ImageChunk / ImageInput shapes", () => {
+ it("an ImageChunk carries a data URL and optional mimeType", () => {
+ const c: ImageChunk = {
+ type: "image",
+ url: "data:image/png;base64,iVBORw0KGgo=",
+ mimeType: "image/png",
+ };
+ expect(c.type).toBe("image");
+ expect(c.url).toContain("base64");
+ expect(c.mimeType).toBe("image/png");
+ });
+
+ it("an ImageChunk with only a url is valid (mimeType optional)", () => {
+ const c: ImageChunk = { type: "image", url: "https://example.com/cat.png" };
+ expect(c.mimeType).toBeUndefined();
+ });
+
+ it("ImageInput mirrors ImageChunk's url semantics", () => {
+ const input: ImageInput = { url: "data:image/jpeg;base64,/9j/4AAQ" };
+ expect(input.url).toContain("jpeg");
+ });
+
+ it("ImageChunk is a member of the Chunk union (assignable)", () => {
+ const chunk: Chunk = { type: "image", url: "data:image/png;base64,x" };
+ // Compile-time proof: an ImageChunk satisfies the Chunk union.
+ expect(chunk.type).toBe("image");
+ });
+});
diff --git a/packages/wire/src/index.ts b/packages/wire/src/index.ts
index 6d10e0f..113f684 100644
--- a/packages/wire/src/index.ts
+++ b/packages/wire/src/index.ts
@@ -36,7 +36,8 @@ export type Chunk =
| ToolCallChunk
| ToolResultChunk
| ErrorChunk
- | SystemChunk;
+ | SystemChunk
+ | ImageChunk;
/** A piece of plain text content from the assistant or user. */
export interface TextChunk {
@@ -113,6 +114,46 @@ export interface SystemChunk {
}
/**
+ * An image attached to a message (e.g. a screenshot or photo the user
+ * pasted). Carries a `url` that is EITHER a base64 data URL
+ * (`data:image/png;base64,…`) OR an `http(s)://` URL. Vision-capable models
+ * receive it natively (the provider serializes it to its image-content
+ * format); non-vision models never see it directly — the orchestrator's
+ * **vision handoff** transcribes it to a text description (via a
+ * vision-capable model) and feeds that text instead, so a text-only model can
+ * still reason about the image's contents.
+ *
+ * When a transcription is performed, it is persisted as a separate `text`
+ * chunk alongside the `image` chunk in the SAME user message, so the
+ * description is reused on every later turn (no re-transcription) and a
+ * client renders both the original image and its textual analysis.
+ */
+export interface ImageChunk {
+ readonly type: "image";
+ /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */
+ readonly url: string;
+ /**
+ * Optional MIME type of the image (e.g. `"image/png"`). When omitted, it is
+ * inferred from the data URL; when present, a client can render an icon/label
+ * without parsing the URL. Callers that only have a URL may omit it.
+ */
+ readonly mimeType?: string;
+}
+
+/**
+ * An image a client attaches to a chat message (`ChatRequest.images`). This
+ * is the transport-facing input shape; the orchestrator converts each
+ * `ImageInput` into an `ImageChunk` on the persisted user message. It carries
+ * the same `url` semantics as `ImageChunk.url`.
+ */
+export interface ImageInput {
+ /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */
+ readonly url: string;
+ /** Optional MIME type (e.g. `"image/png"`); inferred from the data URL when absent. */
+ readonly mimeType?: string;
+}
+
+/**
* A chat message: a role plus an ordered sequence of chunks. Messages are the
* unit passed to and from the provider; chunks are the unit persisted and
* rendered.
diff --git a/tsconfig.json b/tsconfig.json
index d31b44a..f97edde 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -41,6 +41,9 @@
"path": "./packages/credential-store"
},
{
+ "path": "./packages/vision-handoff"
+ },
+ {
"path": "./packages/exec-backend"
},
{