Merge branch 'feature/vision-handoff' into dev

# Conflicts:
#	packages/session-orchestrator/src/extension.ts
#	packages/session-orchestrator/src/orchestrator.ts
author: Adam Malczewski <[email protected]> 2026-06-27 20:48:24 +0900
committer: Adam Malczewski <[email protected]> 2026-06-27 20:48:24 +0900
commit: 04356c8678ae8dd1d7ddca2d0460b514116adc2e (patch)
tree: 6c81894ef02d062570b12f4d3a871e58600dcb9c
parent: 3184b10e614ce6249c83aa111368e98f6689f456 (diff)
parent: b24ed99e89bc657e8c98c7cef8608e0c0b7594da (diff)
download: dispatch-04356c8678ae8dd1d7ddca2d0460b514116adc2e.tar.gz
dispatch-04356c8678ae8dd1d7ddca2d0460b514116adc2e.zip
37 files changed, 2707 insertions, 19 deletions
diff --git a/bun.lock b/bun.lock
index 602e42a..493da15 100644
--- a/bun.lock
+++ b/bun.lock
@@ -104,6 +104,7 @@
         "@dispatch/tool-youtube-transcript": "workspace:*",
         "@dispatch/transport-http": "workspace:*",
         "@dispatch/transport-ws": "workspace:*",
+        "@dispatch/vision-handoff": "workspace:*",
       },
     },
     "packages/journal-sink": {
@@ -371,6 +372,16 @@
       "name": "@dispatch/ui-contract",
       "version": "0.2.0",
     },
+    "packages/vision-handoff": {
+      "name": "@dispatch/vision-handoff",
+      "version": "0.0.0",
+      "dependencies": {
+        "@dispatch/conversation-store": "workspace:*",
+        "@dispatch/credential-store": "workspace:*",
+        "@dispatch/kernel": "workspace:*",
+        "@dispatch/openai-stream": "workspace:*",
+      },
+    },
     "packages/wire": {
       "name": "@dispatch/wire",
       "version": "0.12.0",
@@ -473,6 +484,8 @@
 
     "@dispatch/ui-contract": ["@dispatch/ui-contract@workspace:packages/ui-contract"],
 
+    "@dispatch/vision-handoff": ["@dispatch/vision-handoff@workspace:packages/vision-handoff"],
+
     "@dispatch/wire": ["@dispatch/wire@workspace:packages/wire"],
 
     "@esbuild/aix-ppc64": ["@esbuild/[email protected]", "", { "os": "aix", "cpu": "ppc64" }, "sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg=="],
diff --git a/packages/conversation-store/src/keys.ts b/packages/conversation-store/src/keys.ts
index b2c635d..6ec2bc5 100644
--- a/packages/conversation-store/src/keys.ts
+++ b/packages/conversation-store/src/keys.ts
@@ -66,6 +66,14 @@ export function compactThresholdKey(conversationId: string): string {
   return `conv:${conversationId}:compact-percent`;
 }
 
+/** Per-conversation image transcription cache (JSON map of imageUrl → transcription). */
+export function imageTranscriptionsKey(conversationId: string): string {
+  return `conv:${conversationId}:image-transcriptions`;
+}
+
+/** Global vision settings (image compaction limit + compaction model). */
+export const VISION_SETTINGS_KEY = "vision-settings";
+
 export function metaKey(conversationId: string): string {
   return `conv:${conversationId}:meta`;
 }
diff --git a/packages/conversation-store/src/store.ts b/packages/conversation-store/src/store.ts
index f90e809..69334e6 100644
--- a/packages/conversation-store/src/store.ts
+++ b/packages/conversation-store/src/store.ts
@@ -20,6 +20,7 @@ import {
   compactThresholdKey,
   computerKey,
   cwdKey,
+  imageTranscriptionsKey,
   metaKey,
   metricsKey,
   metricsPrefix,
@@ -28,6 +29,7 @@ import {
   parseSeq,
   reasoningEffortKey,
   seqKey,
+  VISION_SETTINGS_KEY,
   workspaceKey,
 } from "./keys.js";
 import { reconcileWithReport } from "./reconcile.js";
@@ -141,6 +143,35 @@ export interface ConversationStore {
   /** Set the compact percent (0-100, 0 = manual only). */
   readonly setCompactPercent: (conversationId: string, percent: number) => Promise<void>;
   /**
+   * Get the per-conversation image transcription cache: a map of image URL →
+   * transcription text. Used by the vision handoff to avoid re-transcribing
+   * old images that were compacted to text on a previous turn. Returns an
+   * empty map when none are cached.
+   */
+  readonly getImageTranscriptions: (conversationId: string) => Promise<ReadonlyMap<string, string>>;
+  /**
+   * Upsert a single image transcription into the per-conversation cache.
+   * Merges with any existing transcriptions (does NOT replace the whole map).
+   */
+  readonly setImageTranscription: (
+    conversationId: string,
+    imageUrl: string,
+    transcription: string,
+  ) => Promise<void>;
+  /**
+   * Get the global vision settings (image compaction limit + compaction model).
+   * The limit defaults to 10 when never set; the compaction model defaults to
+   * null (auto-select). Shared across ALL conversations and vision models.
+   */
+  readonly getVisionSettings: () => Promise<{
+    readonly imageLimit: number;
+    readonly compactionModel: string | null;
+  }>;
+  /** Set the global vision image compaction limit (0 = disabled). */
+  readonly setVisionImageLimit: (limit: number) => Promise<void>;
+  /** Set the global vision compaction model (null = auto-select). */
+  readonly setVisionCompactionModel: (model: string | null) => Promise<void>;
+  /**
    * Set the `compactedFrom` field on a conversation's metadata, pointing to
    * the archive conversation that holds the pre-compaction history.
    */
@@ -1004,6 +1035,52 @@ export function createConversationStore(
       }
     },
 
+    async getImageTranscriptions(conversationId) {
+      const raw = await storage.get(imageTranscriptionsKey(conversationId));
+      if (raw === null) return new Map();
+      try {
+        const obj = JSON.parse(raw) as Record<string, string>;
+        return new Map(Object.entries(obj));
+      } catch {
+        return new Map();
+      }
+    },
+
+    async setImageTranscription(conversationId, imageUrl, transcription) {
+      const existing = await this.getImageTranscriptions(conversationId);
+      const merged = new Map(existing);
+      merged.set(imageUrl, transcription);
+      const obj: Record<string, string> = {};
+      for (const [k, v] of merged) obj[k] = v;
+      await storage.set(imageTranscriptionsKey(conversationId), JSON.stringify(obj));
+    },
+
+    async getVisionSettings() {
+      const raw = await storage.get(VISION_SETTINGS_KEY);
+      if (raw === null) return { imageLimit: 10, compactionModel: null };
+      try {
+        const obj = JSON.parse(raw) as { imageLimit?: number; compactionModel?: string | null };
+        return {
+          imageLimit: typeof obj.imageLimit === "number" ? obj.imageLimit : 10,
+          compactionModel: obj.compactionModel ?? null,
+        };
+      } catch {
+        return { imageLimit: 10, compactionModel: null };
+      }
+    },
+
+    async setVisionImageLimit(limit) {
+      const current = await this.getVisionSettings();
+      const obj = { imageLimit: limit, compactionModel: current.compactionModel };
+      await storage.set(VISION_SETTINGS_KEY, JSON.stringify(obj));
+    },
+
+    async setVisionCompactionModel(model) {
+      const current = await this.getVisionSettings();
+      const obj = { imageLimit: current.imageLimit, compactionModel: model };
+      await storage.set(VISION_SETTINGS_KEY, JSON.stringify(obj));
+    },
+
     async setCompactedFrom(conversationId, newConversationId) {
       const raw = await storage.get(metaKey(conversationId));
       const existing = raw !== null ? parseMetaRow(raw) : null;
diff --git a/packages/host-bin/package.json b/packages/host-bin/package.json
index e68251b..7d3b38c 100644
--- a/packages/host-bin/package.json
+++ b/packages/host-bin/package.json
@@ -34,6 +34,7 @@
     "@dispatch/surface-loaded-extensions": "workspace:*",
     "@dispatch/surface-registry": "workspace:*",
     "@dispatch/transport-ws": "workspace:*",
-    "@dispatch/system-prompt": "workspace:*"
+    "@dispatch/system-prompt": "workspace:*",
+    "@dispatch/vision-handoff": "workspace:*"
   }
 }
diff --git a/packages/host-bin/src/main.ts b/packages/host-bin/src/main.ts
index 2ab1118..aa114d5 100644
--- a/packages/host-bin/src/main.ts
+++ b/packages/host-bin/src/main.ts
@@ -44,6 +44,7 @@ import { extension as toolWriteFileExt } from "@dispatch/tool-write-file";
 import { extension as toolYoutubeTranscriptExt } from "@dispatch/tool-youtube-transcript";
 import { createTransportHttpExtension } from "@dispatch/transport-http";
 import { createTransportWsExtension } from "@dispatch/transport-ws";
+import { extension as visionHandoffExt } from "@dispatch/vision-handoff";
 import type { ChildHandle } from "./collector-supervisor.js";
 import { createCollectorSupervisor } from "./collector-supervisor.js";
 import { configMapToAccess, envToConfigMap } from "./config.js";
@@ -206,6 +207,13 @@ async function boot(): Promise<void> {
   const extensions: Extension[] = [
     ...CORE_EXTENSIONS,
     createCredentialStoreExtension({ credentials }),
+    // vision-handoff activates AFTER credential-store (it resolves the
+    // credential-store service at activate time to find vision-capable models).
+    // Placed here, not in CORE_EXTENSIONS, so the credential-store service is
+    // already available when vision-handoff activates. The session-orchestrator
+    // resolves the vision-handoff service LAZILY (per-turn), so activation order
+    // between vision-handoff and session-orchestrator doesn't matter.
+    visionHandoffExt,
     ...externalExtensions,
   ];
 
diff --git a/packages/host-bin/tsconfig.json b/packages/host-bin/tsconfig.json
index cb85915..09b87df 100644
--- a/packages/host-bin/tsconfig.json
+++ b/packages/host-bin/tsconfig.json
@@ -63,6 +63,9 @@
     },
     {
       "path": "../transport-ws"
+    },
+    {
+      "path": "../vision-handoff"
     }
   ]
 }
diff --git a/packages/kernel/src/contracts/conversation.ts b/packages/kernel/src/contracts/conversation.ts
index f074c52..80da86e 100644
--- a/packages/kernel/src/contracts/conversation.ts
+++ b/packages/kernel/src/contracts/conversation.ts
@@ -12,6 +12,8 @@ export type {
   ConversationMeta,
   ConversationStatus,
   ErrorChunk,
+  ImageChunk,
+  ImageInput,
   Role,
   StepId,
   StepMetrics,
diff --git a/packages/kernel/src/contracts/index.ts b/packages/kernel/src/contracts/index.ts
index 09e0a56..28e0a0b 100644
--- a/packages/kernel/src/contracts/index.ts
+++ b/packages/kernel/src/contracts/index.ts
@@ -19,6 +19,8 @@ export type {
   ConversationMeta,
   ConversationStatus,
   ErrorChunk,
+  ImageChunk,
+  ImageInput,
   Role,
   StepId,
   StepMetrics,
diff --git a/packages/kernel/src/contracts/provider.ts b/packages/kernel/src/contracts/provider.ts
index b6dc8ca..3137073 100644
--- a/packages/kernel/src/contracts/provider.ts
+++ b/packages/kernel/src/contracts/provider.ts
@@ -114,6 +114,16 @@ export interface ModelInfo {
   readonly displayName?: string;
   /** The model's max context window in tokens (e.g. 200000). Optional — providers that don't report it leave it undefined. */
   readonly contextWindow?: number;
+  /**
+   * Whether this model can natively accept image input (vision/multimodal).
+   * When `true`, image chunks in a user message are passed through to the
+   * provider serialized to its image-content format. When `false`/absent, the
+   * orchestrator's vision handoff transcribes images to text (via a
+   * vision-capable model) before the model sees them. Optional — providers
+   * that cannot detect it leave it undefined (treated as non-vision); a
+   * provider that knows a model is vision-capable sets it `true`.
+   */
+  readonly vision?: boolean;
 }
 
 /**
diff --git a/packages/openai-stream/src/convert-messages.test.ts b/packages/openai-stream/src/convert-messages.test.ts
index 3520eb5..57c7d81 100644
--- a/packages/openai-stream/src/convert-messages.test.ts
+++ b/packages/openai-stream/src/convert-messages.test.ts
@@ -35,6 +35,100 @@ describe("convertMessages", () => {
     expect(result).toEqual([{ role: "user", content: "Hello, world!" }]);
   });
 
+  it("converts a user message with a text + image chunk to a multimodal content array", () => {
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "What is in this image?" },
+          { type: "image", url: "data:image/png;base64,iVBORw0KGgo=" },
+        ],
+      },
+    ];
+
+    const result = convertMessages(messages);
+    expect(result).toEqual([
+      {
+        role: "user",
+        content: [
+          { type: "text", text: "What is in this image?" },
+          { type: "image_url", image_url: { url: "data:image/png;base64,iVBORw0KGgo=" } },
+        ],
+      },
+    ]);
+  });
+
+  it("converts an image-only user message (no text) to a content array with just the image", () => {
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [{ type: "image", url: "https://example.com/cat.png" }],
+      },
+    ];
+
+    const result = convertMessages(messages);
+    expect(result).toEqual([
+      {
+        role: "user",
+        content: [{ type: "image_url", image_url: { url: "https://example.com/cat.png" } }],
+      },
+    ]);
+  });
+
+  it("converts a user message with multiple images interspersed with text", () => {
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "Compare these:" },
+          { type: "image", url: "data:image/png;base64,aaa" },
+          { type: "text", text: "and" },
+          { type: "image", url: "data:image/jpeg;base64,bbb" },
+        ],
+      },
+    ];
+
+    const result = convertMessages(messages);
+    expect(result).toHaveLength(1);
+    const content = result[0]?.content;
+    expect(Array.isArray(content)).toBe(true);
+    if (Array.isArray(content)) {
+      expect(content).toHaveLength(4);
+      expect(content[0]).toEqual({ type: "text", text: "Compare these:" });
+      expect(content[1]).toEqual({
+        type: "image_url",
+        image_url: { url: "data:image/png;base64,aaa" },
+      });
+      expect(content[2]).toEqual({ type: "text", text: "and" });
+      expect(content[3]).toEqual({
+        type: "image_url",
+        image_url: { url: "data:image/jpeg;base64,bbb" },
+      });
+    }
+  });
+
+  it("skips empty text parts in a multimodal message but keeps images", () => {
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "" },
+          { type: "image", url: "data:image/png;base64,x" },
+        ],
+      },
+    ];
+
+    const result = convertMessages(messages);
+    const content = result[0]?.content;
+    expect(Array.isArray(content)).toBe(true);
+    if (Array.isArray(content)) {
+      // Empty text part is dropped; only the image remains.
+      expect(content).toEqual([
+        { type: "image_url", image_url: { url: "data:image/png;base64,x" } },
+      ]);
+    }
+  });
+
   it("converts an assistant message with text only", () => {
     const messages: ChatMessage[] = [
       {
diff --git a/packages/openai-stream/src/convert-messages.ts b/packages/openai-stream/src/convert-messages.ts
index e830243..eba3575 100644
--- a/packages/openai-stream/src/convert-messages.ts
+++ b/packages/openai-stream/src/convert-messages.ts
@@ -1,8 +1,28 @@
 import type { ChatMessage, Chunk } from "@dispatch/kernel";
 
+/** A text part within a multimodal OpenAI content array. */
+export interface OpenAITextPart {
+  readonly type: "text";
+  readonly text: string;
+}
+
+/** An image part within a multimodal OpenAI content array (OpenAI vision format). */
+export interface OpenAIImagePart {
+  readonly type: "image_url";
+  readonly image_url: { readonly url: string };
+}
+
+/**
+ * A part of a multimodal message content array. When a message has mixed text
+ * and image chunks, the content is serialized as an array of these parts
+ * (OpenAI's vision format). Plain-text messages keep a string `content` for
+ * byte-stability with providers that only accept strings.
+ */
+export type OpenAIContentPart = OpenAITextPart | OpenAIImagePart;
+
 export interface OpenAIMessage {
   readonly role: "system" | "user" | "assistant" | "tool";
-  readonly content: string | null;
+  readonly content: string | null | readonly OpenAIContentPart[];
   readonly tool_calls?: readonly OpenAIToolCall[];
   readonly tool_call_id?: string;
 }
@@ -49,6 +69,29 @@ function convertSystemMessage(msg: ChatMessage): OpenAIMessage {
 }
 
 function convertUserMessage(msg: ChatMessage): OpenAIMessage {
+  // If the message has image chunks, serialize as a multimodal content array
+  // (OpenAI vision format): text parts + image_url parts in chunk order.
+  // Plain text-only messages keep a string `content` for byte-stability with
+  // providers that only accept a string (and to keep prompt-cache prefixes
+  // unchanged for the common no-image case).
+  const hasImage = msg.chunks.some((c) => c.type === "image");
+  if (hasImage) {
+    const parts: OpenAIContentPart[] = [];
+    for (const chunk of msg.chunks) {
+      if (chunk.type === "text") {
+        if (chunk.text.length > 0) {
+          parts.push({ type: "text", text: chunk.text });
+        }
+      } else if (chunk.type === "image") {
+        parts.push({ type: "image_url", image_url: { url: chunk.url } });
+      }
+      // Non-text/non-image chunks (tool-call, thinking, etc.) are not part of a
+      // user message's provider content and are skipped here.
+    }
+    // hasImage guarantees at least one image part, so the "" fallback is purely defensive.
+    return { role: "user", content: parts.length > 0 ? parts : "" };
+  }
+
   const text = msg.chunks
     .filter((c): c is Extract<Chunk, { type: "text" }> => c.type === "text")
     .map((c) => c.text)
diff --git a/packages/openai-stream/src/index.ts b/packages/openai-stream/src/index.ts
index bd2f673..3f76b99 100644
--- a/packages/openai-stream/src/index.ts
+++ b/packages/openai-stream/src/index.ts
@@ -1,8 +1,14 @@
-export type { OpenAIMessage, OpenAIToolCall } from "./convert-messages.js";
+export type {
+  OpenAIContentPart,
+  OpenAIImagePart,
+  OpenAIMessage,
+  OpenAITextPart,
+  OpenAIToolCall,
+} from "./convert-messages.js";
 export { convertMessages } from "./convert-messages.js";
 export type { OpenAITool } from "./convert-tools.js";
 export { convertTools } from "./convert-tools.js";
-export { parseModelList } from "./listModels.js";
+export { isVisionModelId, parseModelList } from "./listModels.js";
 export { parseSSELines } from "./parse-sse.js";
 export type { CreateOpenAICompatProviderOpts } from "./provider.js";
 export { createOpenAICompatProvider } from "./provider.js";
diff --git a/packages/openai-stream/src/listModels.test.ts b/packages/openai-stream/src/listModels.test.ts
index c2438bc..2e3b1a3 100644
--- a/packages/openai-stream/src/listModels.test.ts
+++ b/packages/openai-stream/src/listModels.test.ts
@@ -1,7 +1,7 @@
 import type { ApiKeyCredentials, ModelInfo, ProviderContract } from "@dispatch/kernel";
 import type { FetchLike } from "@dispatch/trace-replay";
 import { describe, expect, it, vi } from "vitest";
-import { parseModelList } from "./listModels.js";
+import { isVisionModelId, parseModelList } from "./listModels.js";
 import { createOpenAICompatProvider } from "./provider.js";
 
 function makeProvider(fetchFn: FetchLike, apiKey = "sk-test-1234567890abcdef"): ProviderContract {
@@ -35,6 +35,53 @@ describe("listModels — pure mapping (parseModelList)", () => {
     const result = parseModelList([]);
     expect(result).toEqual([]);
   });
+
+  it("extracts contextWindow from common field names", () => {
+    const result = parseModelList([
+      { id: "m1", context_length: 128000 },
+      { id: "m2", context_window: 200000 },
+      { id: "m3", max_context_length: 64000 },
+      { id: "m4", max_tokens: 8000 },
+    ]);
+    expect(result).toEqual([
+      { id: "m1", contextWindow: 128000 },
+      { id: "m2", contextWindow: 200000 },
+      { id: "m3", contextWindow: 64000 },
+      { id: "m4", contextWindow: 8000 },
+    ]);
+  });
+});
+
+describe("listModels — vision capability detection", () => {
+  it("isVisionModelId returns true for umans kimi and qwen model ids", () => {
+    expect(isVisionModelId("umans-kimi-k2.7")).toBe(true);
+    expect(isVisionModelId("Umans-Kimi-K2.7")).toBe(true); // case-insensitive
+    expect(isVisionModelId("umans-qwen3.6-35b-a3b")).toBe(true);
+  });
+
+  it("isVisionModelId returns false for non-vision model ids", () => {
+    expect(isVisionModelId("umans-glm-5.2")).toBe(false);
+    expect(isVisionModelId("umans-coder")).toBe(false);
+    expect(isVisionModelId("umans-flash")).toBe(false);
+    expect(isVisionModelId("kimi-k2.7-code")).toBe(false); // opencode kimi, not umans
+    expect(isVisionModelId("qwen3.7-max")).toBe(false); // opencode qwen, not umans
+    expect(isVisionModelId("deepseek-v4-flash")).toBe(false);
+  });
+
+  it("parseModelList sets vision: true on umans kimi and qwen models only", () => {
+    const result = parseModelList([
+      { id: "umans-kimi-k2.7", context_length: 262144 },
+      { id: "umans-qwen3.6-35b-a3b", context_length: 262144 },
+      { id: "umans-glm-5.2", context_length: 405504 },
+      { id: "umans-coder" },
+    ]);
+    expect(result).toEqual([
+      { id: "umans-kimi-k2.7", contextWindow: 262144, vision: true },
+      { id: "umans-qwen3.6-35b-a3b", contextWindow: 262144, vision: true },
+      { id: "umans-glm-5.2", contextWindow: 405504 },
+      { id: "umans-coder" },
+    ]);
+  });
 });
 
 describe("listModels — provider contract", () => {
diff --git a/packages/openai-stream/src/listModels.ts b/packages/openai-stream/src/listModels.ts
index 0e94c43..df116b0 100644
--- a/packages/openai-stream/src/listModels.ts
+++ b/packages/openai-stream/src/listModels.ts
@@ -24,17 +24,39 @@ interface OpenAIModelListResponse {
 }
 
 /**
+ * Whether a model id is vision-capable (can natively accept image input).
+ *
+ * The OpenAI-compatible `/models` endpoint does not reliably report image
+ * capabilities, so this is a hardcoded, case-insensitive substring heuristic:
+ * any id containing `umans-kimi` or `umans-qwen` (e.g. `umans-kimi-k2.7`,
+ * `umans-qwen3.6-35b-a3b`) is vision-capable; all others are not. This is the single
+ * source of truth — the orchestrator's vision handoff and the `consult_vision`
+ * tool both consult the `ModelInfo.vision` flag this sets, so adding a model
+ * here enables vision everywhere. Pure: id → boolean, no I/O.
+ *
+ * (When an endpoint gains reliable vision reporting, this can be replaced with
+ * a real capability check without changing callers.)
+ */
+export function isVisionModelId(id: string): boolean {
+  const lower = id.toLowerCase();
+  return lower.includes("umans-kimi") || lower.includes("umans-qwen");
+}
+
+/**
  * Pure mapping: raw OpenAI-compatible model list → ModelInfo[].
- * Extracts `contextWindow` from common field names (providers vary).
- * Extracted for direct unit testing with no I/O.
+ * Extracts `contextWindow` from common field names (providers vary) and
+ * detects vision capability via {@link isVisionModelId}. Extracted for direct
+ * unit testing with no I/O.
  */
 export function parseModelList(data: readonly OpenAIModelEntry[]): readonly ModelInfo[] {
   return data.map((entry) => {
     const contextWindow =
       entry.context_length ?? entry.context_window ?? entry.max_context_length ?? entry.max_tokens;
+    const vision = isVisionModelId(entry.id);
     return {
       id: entry.id,
       ...(contextWindow !== undefined ? { contextWindow } : {}),
+      ...(vision ? { vision } : {}),
     };
   });
 }
diff --git a/packages/session-orchestrator/src/extension.ts b/packages/session-orchestrator/src/extension.ts
index 0cd83ef..783d894 100644
--- a/packages/session-orchestrator/src/extension.ts
+++ b/packages/session-orchestrator/src/extension.ts
@@ -12,6 +12,7 @@ import {
   createSessionOrchestrator,
   createWarmService,
   sessionOrchestratorHandle,
+  visionHandoffLocalHandle,
 } from "./orchestrator.js";
 import { selectFirstProvider } from "./pure.js";
 import { filterRemoteIncompatibleTools, toolsFilter } from "./tools-filter.js";
@@ -107,6 +108,20 @@ export function activate(host: HostAPI): void {
         return undefined;
       }
     },
+    resolveVisionHandoff: () => {
+      // Lazily resolve the vision-handoff service. Returns undefined when the
+      // vision-handoff extension isn't loaded (images pass through unchanged —
+      // correct for vision-capable models; the feature degrades off cleanly for
+      // text-only turns). Lazy so activation order doesn't matter; the
+      // activated-manifests guard avoids a getService throw when absent.
+      const loaded = host.getExtensions().some((m) => m.id === "vision-handoff");
+      if (!loaded) return undefined;
+      try {
+        return host.getService(visionHandoffLocalHandle);
+      } catch {
+        return undefined;
+      }
+    },
   });
 
   host.provideService(sessionOrchestratorHandle, orchestrator);
diff --git a/packages/session-orchestrator/src/orchestrator.ts b/packages/session-orchestrator/src/orchestrator.ts
index 617c079..5c36922 100644
--- a/packages/session-orchestrator/src/orchestrator.ts
+++ b/packages/session-orchestrator/src/orchestrator.ts
@@ -5,6 +5,7 @@ import type {
   CompactionResult,
   ConversationStatus,
   EventHookDescriptor,
+  ImageInput,
   Logger,
   ModelInfo,
   ProviderContract,
@@ -34,11 +35,71 @@ import {
 } from "./pure.js";
 import type { ToolAssembly } from "./tools-filter.js";
 
+// --- Vision handoff (lazy, optional) ---
+
+/**
+ * Minimal contract the vision-handoff service satisfies. Defined here (not
+ * imported from the vision-handoff package) so the orchestrator has NO
+ * compile-time dependency on it — the service is resolved lazily at runtime
+ * (like the message-queue / system-prompt services), and the feature degrades
+ * off cleanly when the extension isn't loaded (images pass through unchanged,
+ * which is correct for vision-capable models and a no-op for text-only turns).
+ *
+ * `prepareForProvider` transforms a message list for the provider: if the
+ * active model is vision-capable, messages pass through unchanged; otherwise
+ * image chunks are replaced with numbered placeholders (telling the model to
+ * call `consult_vision`) and the images are registered for tool access.
+ */
+export interface VisionHandoffService {
+  /**
+   * Store images to tmp files and return compact URLs. Each input image's data
+   * URL is saved to a tmp file and replaced with a compact HTTP path so the
+   * persisted conversation store holds a tiny string, not megabytes of base64.
+   * When `saveImageToTmp` is not configured, data URLs pass through unchanged.
+   */
+  readonly storeImages: (
+    conversationId: string,
+    images: readonly ImageInput[],
+  ) => Promise<readonly ImageInput[]>;
+
+  /** Delete all tmp images for a conversation (on close). Best-effort. */
+  readonly purgeConversationImages: (conversationId: string) => Promise<void>;
+
+  readonly prepareForProvider: (
+    messages: readonly ChatMessage[],
+    currentModelName: string | undefined,
+    opts?: {
+      readonly conversationId?: string;
+      readonly imageLimit?: number;
+      readonly signal?: AbortSignal;
+      readonly logger?: Logger;
+    },
+  ) => Promise<readonly ChatMessage[]>;
+}
+
+/**
+ * Local handle for the vision-handoff service, keyed by the same ID the
+ * vision-handoff extension registers under (`"vision-handoff/service"`). Defined
+ * locally (not imported) so the orchestrator has no compile-time dependency on
+ * the vision-handoff package — the service is resolved lazily at runtime, and
+ * the feature degrades off cleanly when the extension isn't loaded.
+ */
+export const visionHandoffLocalHandle: ServiceHandle<VisionHandoffService> =
+  defineService<VisionHandoffService>("vision-handoff/service");
+
 // --- Broadcast hub types ---
 
 export interface StartTurnInput {
   readonly conversationId: string;
   readonly text: string;
+  /**
+   * Images attached to this turn (e.g. user-pasted screenshots). Each is
+   * appended as an `image` chunk on the persisted user message. For a
+   * vision-capable model the images pass through to the provider natively; for
+   * a non-vision model the vision handoff transcribes them to text first.
+   * Optional — omit for a text-only turn.
+   */
+  readonly images?: readonly ImageInput[];
   readonly modelName?: string;
   readonly cwd?: string;
   /**
@@ -77,6 +138,12 @@ export type StartTurnResult =
 export interface EnqueueInput {
   readonly conversationId: string;
   readonly text: string;
+  /**
+   * Images attached (the steering / opening message analog of
+   * `StartTurnInput.images`). Threaded to `startTurn` when the conversation is
+   * idle (the message starts a turn). Additive optional.
+   */
+  readonly images?: readonly ImageInput[];
   /** Workspace to stamp on a new conversation. Defaults to `"default"`. */
   readonly workspaceId?: string;
   /**
@@ -291,6 +358,8 @@ export interface SessionOrchestrator {
     workspaceId?: string;
     /** Explicit system-prompt override — see {@link StartTurnInput.systemPrompt}. */
     systemPrompt?: string;
+    /** Images attached to this turn — see {@link StartTurnInput.images}. */
+    images?: readonly ImageInput[];
   }): Promise<void>;
 }
 
@@ -345,6 +414,17 @@ export interface SessionOrchestratorDeps {
    * when the stream completes. Lazy so activation order doesn't matter.
    */
   readonly resolveConcurrencyLimiter?: () => ConcurrencyLimiter | undefined;
+  /**
+   * Lazily resolves the vision-handoff service, or `undefined` when the
+   * vision-handoff extension isn't loaded. Used to transcribe image chunks to
+   * text for non-vision models before they reach the provider (so a text-only
+   * model can still reason about pasted/code images). When `undefined`, images
+   * pass through unchanged: correct for vision-capable models and a no-op for
+   * turns without images, but a text-only model sent images would then receive
+   * image content its API may reject. Lazy so activation order doesn't matter;
+   * called per-turn.
+   */
+  readonly resolveVisionHandoff?: () => VisionHandoffService | undefined;
   /** Apply the per-turn tools filter chain. Injected for testability. */
   readonly applyToolsFilter: (assembly: ToolAssembly) => Promise<ToolAssembly>;
   /** Base logger (auto-scoped to this extension); childed per turn for span capture. */
@@ -447,6 +527,7 @@ export function createSessionOrchestrator(
     reasoningEffortOverride: ReasoningEffort | undefined,
     workspaceId: string,
     systemPromptOverride: string | undefined,
+    images: readonly ImageInput[] | undefined,
   ): void {
     const turnId = generateTurnId();
     const promptStartedAt = deps.now?.() ?? Date.now();
@@ -569,7 +650,18 @@ export function createSessionOrchestrator(
         const effectiveModelName = resolveModelName(modelName, storedModel);
 
         const history = await deps.conversationStore.load(conversationId);
-        const userMsg = buildUserMessage(text);
+
+        // Store images to tmp files (compact URLs) BEFORE building the user
+        // message so the persisted chunks hold tiny URL references, not
+        // megabytes of base64 data URLs. When the vision-handoff service isn't
+        // loaded, images pass through unchanged (backward compatible).
+        const visionHandoffForStore = deps.resolveVisionHandoff?.();
+        const storedImages =
+          visionHandoffForStore !== undefined && images !== undefined
+            ? await visionHandoffForStore.storeImages(conversationId, images)
+            : images;
+
+        const userMsg = buildUserMessage(text, storedImages);
 
         // Workspace assignment for new conversations happens BEFORE
         // effective-cwd resolution (see workspaceSetupPromise above) so
@@ -744,9 +836,35 @@ export function createSessionOrchestrator(
                 return [{ role: "user", chunks: [{ type: "text", text: steerText }] }];
               };
 
+        // Vision handoff: transform the message list for the provider. When the
+        // active model is vision-capable, images pass through natively (no-op).
+        // When it is NOT vision-capable, image chunks are transcribed to text
+        // descriptions via a vision-capable model — so a text-only model can
+        // still reason about images. The PERSISTED user message keeps the
+        // original image chunks (appended below); only the provider's view is
+        // transcribed. When the vision-handoff service isn't loaded, images pass
+        // through unchanged (correct for vision models; text-only models would
+        // then receive image content their API may reject — degrades off cleanly
+        // for text-only turns with no images).
+        const visionHandoff = deps.resolveVisionHandoff?.();
+        let providerMessages: readonly ChatMessage[] = [...history, userMsg];
+        if (visionHandoff !== undefined) {
+          const visionSettings = await deps.conversationStore.getVisionSettings();
+          providerMessages = await visionHandoff.prepareForProvider(
+            providerMessages,
+            effectiveModelName,
+            {
+              conversationId,
+              imageLimit: visionSettings.imageLimit,
+              signal: controller.signal,
+              ...(turnLogger !== undefined ? { logger: turnLogger } : {}),
+            },
+          );
+        }
+
         const opts: RunTurnInput = {
           provider,
-          messages: [...history, userMsg],
+          messages: providerMessages,
           tools: assembled.tools,
           dispatch,
           emit: emitAndAccumulate,
@@ -852,6 +970,7 @@ export function createSessionOrchestrator(
       reasoningEffort,
       workspaceId,
       systemPrompt,
+      images,
     }) {
       if (activeTurns.has(conversationId)) {
         return { started: false, reason: "already-active" };
@@ -865,18 +984,20 @@ export function createSessionOrchestrator(
         reasoningEffort,
         workspaceId ?? "default",
         systemPrompt,
+        images,
       );
       const turn = activeTurns.get(conversationId);
       const turnId = turn !== undefined ? turn.turnId : "";
       return { started: true, turnId };
     },
 
-    enqueue({ conversationId, text, workspaceId, computerId }) {
+    enqueue({ conversationId, text, workspaceId, computerId, images }) {
       const result = orchestrator.startTurn({
         conversationId,
         text,
         ...(workspaceId !== undefined ? { workspaceId } : {}),
         ...(computerId !== undefined ? { computerId } : {}),
+        ...(images !== undefined ? { images } : {}),
       });
       if (result.started) {
         return { startedTurn: true, queue: [] };
@@ -939,6 +1060,9 @@ export function createSessionOrchestrator(
         });
       });
       void deps.conversationStore.setConversationStatus(conversationId, "closed");
+      // Purge tmp images for this conversation (best-effort, fire-and-forget).
+      const vh = deps.resolveVisionHandoff?.();
+      if (vh !== undefined) void vh.purgeConversationImages(conversationId);
       return { abortedTurn };
     },
 
@@ -961,6 +1085,7 @@ export function createSessionOrchestrator(
       reasoningEffort,
       workspaceId,
       systemPrompt,
+      images,
     }) {
       const turnInput: StartTurnInput = {
         conversationId,
@@ -971,6 +1096,7 @@ export function createSessionOrchestrator(
         ...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
         ...(workspaceId !== undefined ? { workspaceId } : {}),
         ...(systemPrompt !== undefined ? { systemPrompt } : {}),
+        ...(images !== undefined ? { images } : {}),
       };
       const result = orchestrator.startTurn(turnInput);
       if (!result.started) {
diff --git a/packages/session-orchestrator/src/pure.test.ts b/packages/session-orchestrator/src/pure.test.ts
index c75cb82..7a574f1 100644
--- a/packages/session-orchestrator/src/pure.test.ts
+++ b/packages/session-orchestrator/src/pure.test.ts
@@ -26,6 +26,39 @@ describe("buildUserMessage", () => {
     expect(msg.role).toBe("user");
     expect(msg.chunks[0]).toEqual({ type: "text", text: "" });
   });
+
+  it("appends image chunks after the text chunk when images are given", () => {
+    const msg = buildUserMessage("look at this", [
+      { url: "data:image/png;base64,aaa" },
+      { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" },
+    ]);
+    expect(msg.chunks).toHaveLength(3);
+    expect(msg.chunks[0]).toEqual({ type: "text", text: "look at this" });
+    expect(msg.chunks[1]).toEqual({ type: "image", url: "data:image/png;base64,aaa" });
+    expect(msg.chunks[2]).toEqual({
+      type: "image",
+      url: "data:image/jpeg;base64,bbb",
+      mimeType: "image/jpeg",
+    });
+  });
+
+  it("builds an image-only message when text is empty", () => {
+    const msg = buildUserMessage("", [{ url: "data:image/png;base64,zzz" }]);
+    expect(msg.chunks).toHaveLength(1);
+    expect(msg.chunks[0]).toEqual({ type: "image", url: "data:image/png;base64,zzz" });
+  });
+
+  it("includes mimeType when provided", () => {
+    const msg = buildUserMessage("hi", [
+      { url: "data:image/webp;base64,x", mimeType: "image/webp" },
+    ]);
+    expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBe("image/webp");
+  });
+
+  it("omits mimeType when not provided", () => {
+    const msg = buildUserMessage("hi", [{ url: "https://example.com/x.png" }]);
+    expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBeUndefined();
+  });
 });
 
 describe("selectFirstProvider", () => {
diff --git a/packages/session-orchestrator/src/pure.ts b/packages/session-orchestrator/src/pure.ts
index 2208e8f..0d2068f 100644
--- a/packages/session-orchestrator/src/pure.ts
+++ b/packages/session-orchestrator/src/pure.ts
@@ -1,12 +1,40 @@
 import type {
   ChatMessage,
+  Chunk,
+  ImageInput,
   ProviderContract,
   ReasoningEffort,
   ToolDispatchPolicy,
 } from "@dispatch/kernel";
 
-export function buildUserMessage(text: string): ChatMessage {
-  return { role: "user", chunks: [{ type: "text", text }] };
+/**
+ * Assemble the user message that is persisted for a turn.
+ *
+ * Chunk order: the prompt text (only when it is non-empty), followed by one
+ * `image` chunk per entry of `images`, in input order. An image's `mimeType`
+ * key is copied only when the input carries one, so it is absent rather than
+ * set to `undefined`. An image-only message (empty text plus images) therefore
+ * holds just the image chunks. When there would otherwise be no chunks at all
+ * (empty text and no images), a single empty text chunk is emitted so the
+ * message always has at least one chunk.
+ *
+ * Pure: inputs → a ChatMessage, no I/O.
+ */
+export function buildUserMessage(text: string, images?: readonly ImageInput[]): ChatMessage {
+  const imageChunks = (images ?? []).map(
+    (image): Chunk =>
+      image.mimeType === undefined
+        ? { type: "image", url: image.url }
+        : { type: "image", url: image.url, mimeType: image.mimeType },
+  );
+  // Keep the text chunk when there is text, or when nothing else would be left.
+  const keepText = text.length > 0 || imageChunks.length === 0;
+  const textChunks: Chunk[] = keepText ? [{ type: "text", text }] : [];
+  return { role: "user", chunks: [...textChunks, ...imageChunks] };
 }
 
 // ── Provider-error retry backoff schedule ───────────────────────────────────
diff --git a/packages/transport-contract/src/contract.types.test.ts b/packages/transport-contract/src/contract.types.test.ts
index 9d3d904..34ff544 100644
--- a/packages/transport-contract/src/contract.types.test.ts
+++ b/packages/transport-contract/src/contract.types.test.ts
@@ -20,6 +20,7 @@ import type {
   LspServerState,
   LspStatusResponse,
   McpStatusResponse,
+  ModelsResponse,
   SetConversationComputerRequest,
   SetCwdRequest,
   SetWorkspaceDefaultComputerRequest,
@@ -55,6 +56,18 @@ const _chatWithoutComputer: ChatRequest = {
   message: "hello",
 };
 
+// ─── ChatRequest.images (additive optional) ──────────────────────────────────
+
+const _chatWithImages: ChatRequest = {
+  message: "What's in this screenshot?",
+  images: [{ url: "data:image/png;base64,iVBORw0KGgo=", mimeType: "image/png" }],
+};
+
+const _chatWithHttpImage: ChatRequest = {
+  message: "analyze this",
+  images: [{ url: "https://example.com/diagram.png" }],
+};
+
 // ─── Computer list / single response ─────────────────────────────────────────
 
 const _computer: Computer = {
@@ -255,6 +268,35 @@ describe("transport-contract types compile and are exported", () => {
     expect(_chatWithComputer.computerId).toBe("prod-box");
   });
 
+  // ─── ChatRequest.images (additive optional) ──────────────────────────────
+
+  it("ChatRequest: images is additive optional (omittable)", () => {
+    expect(_chatWithoutComputer.images).toBeUndefined();
+  });
+
+  it("ChatRequest: carries images (data URL) when set", () => {
+    expect(_chatWithImages.images).toHaveLength(1);
+    expect(_chatWithImages.images?.[0]?.url).toContain("base64");
+    expect(_chatWithImages.images?.[0]?.mimeType).toBe("image/png");
+  });
+
+  it("ChatRequest: carries images (http URL, mimeType optional)", () => {
+    expect(_chatWithHttpImage.images?.[0]?.url).toBe("https://example.com/diagram.png");
+    expect(_chatWithHttpImage.images?.[0]?.mimeType).toBeUndefined();
+  });
+
+  it("ModelsResponse: ModelMetadata carries optional vision flag", () => {
+    const resp: ModelsResponse = {
+      models: ["umans/kimi-k2.7", "umans/glm-5.2"],
+      modelInfo: {
+        "umans/kimi-k2.7": { contextWindow: 200000, vision: true },
+        "umans/glm-5.2": { contextWindow: 128000 },
+      },
+    };
+    expect(resp.modelInfo?.["umans/kimi-k2.7"]?.vision).toBe(true);
+    expect(resp.modelInfo?.["umans/glm-5.2"]?.vision).toBeUndefined();
+  });
+
   // ─── Computers ───────────────────────────────────────────────────────────
 
   it("ComputerListResponse: carries entries with usage counts", () => {
diff --git a/packages/transport-contract/src/index.ts b/packages/transport-contract/src/index.ts
index 400d9d5..d5f3000 100644
--- a/packages/transport-contract/src/index.ts
+++ b/packages/transport-contract/src/index.ts
@@ -26,6 +26,7 @@ import type {
   ComputerEntry,
   ConversationMeta,
   ConversationStatus,
+  ImageInput,
   QueuedMessage,
   ReasoningEffort,
   StoredChunk,
@@ -41,6 +42,8 @@ export type {
   ComputerEntry,
   ConversationMeta,
   ConversationStatus,
+  ImageChunk,
+  ImageInput,
   QueuedMessage,
   ReasoningEffort,
   StepMetrics,
@@ -68,6 +71,19 @@ export interface ChatRequest {
   readonly message: string;
 
   /**
+   * Images attached to this turn (e.g. a user-pasted screenshot). Each entry's
+   * `url` is a base64 data URL (`data:image/…;base64,…`) or an `http(s)://`
+   * URL. The server converts these to `image` chunks on the persisted user
+   * message. For a VISION-capable model (e.g. kimi), the images are passed
+   * through to the provider natively. For a NON-vision model (e.g. glm-5.2),
+   * the server's vision handoff transcribes each image to a text description
+   * (via a vision-capable model) and feeds that text instead — so a text-only
+   * model can still reason about the image's contents. Optional — omit for a
+   * text-only turn (backward compatible).
+   */
+  readonly images?: readonly ImageInput[];
+
+  /**
    * The model to use, as a model name in `<credentialName>/<model>` form — one
    * of the exact strings returned by `GET /models`. Omit to use the server's
    * default credential + model.
@@ -124,6 +140,14 @@ export interface ModelsResponse {
 /** Per-model metadata returned alongside the model catalog. */
 export interface ModelMetadata {
   readonly contextWindow?: number;
+  /**
+   * Whether this model can natively accept image input (vision/multimodal).
+   * When `true`, image chunks in a user message are passed through to the
+   * provider. When `false`/absent, the server's vision handoff transcribes
+   * images to text before the model sees them. A client may use this to show a
+   * vision badge in the model picker. Optional — absent when unknown.
+   */
+  readonly vision?: boolean;
 }
 
 /**
@@ -387,6 +411,23 @@ export interface SystemPromptVariablesResponse {
   readonly variables: readonly SystemPromptVariable[];
 }
 
+// ─── Vision settings (global) ──────────────────────────────────────────────────
+
+/**
+ * Response of `GET /settings/vision` — the global vision configuration shared
+ * across all conversations and vision models. The same shape is returned by
+ * `PUT /settings/vision` after the update has been applied.
+ */
+export interface VisionSettingsResponse {
+  /**
+   * Global image limit. The HTTP transport only accepts non-negative integers
+   * when this is written; the orchestrator passes the stored value to the
+   * vision handoff on each turn.
+   */
+  readonly imageLimit: number;
+  /**
+   * Model name used for vision compaction, or `null` when none is configured.
+   * NOTE(review): presumably a `<credentialName>/<model>` name as returned by
+   * `GET /models` — confirm against the vision-handoff service.
+   */
+  readonly compactionModel: string | null;
+}
+
+/**
+ * Body of `PUT /settings/vision` — a partial update. A field that is omitted
+ * is left unchanged on the server.
+ */
+export interface SetVisionSettingsRequest {
+  /** New image limit; the HTTP transport rejects anything but a non-negative integer. */
+  readonly imageLimit?: number;
+  /** New compaction model name; the HTTP transport accepts a string or an explicit `null`. */
+  readonly compactionModel?: string | null;
+}
+
 // ─── Message queue (steering) ─────────────────────────────────────────────────
 
 /**
diff --git a/packages/transport-http/src/app.ts b/packages/transport-http/src/app.ts
index 23f8dde..0fcc8f0 100644
--- a/packages/transport-http/src/app.ts
+++ b/packages/transport-http/src/app.ts
@@ -42,6 +42,7 @@ import type {
   ThroughputResponse,
   TitleResponse,
   UpdateHeartbeatRequest,
+  VisionSettingsResponse,
   WarmResponse,
   WorkspaceListResponse,
   WorkspaceResponse,
@@ -212,6 +213,37 @@ export function createApp(opts: CreateServerOptions): Hono {
 
   app.get("/health", (c) => c.json({ ok: true }));
 
+  // ── Tmp image serving (vision handoff) ──────────────────────────────────────
+  // Serves files stored under `<DISPATCH_IMAGE_DIR>/<conversationId>/<imageId>`.
+  // Both path parameters are client-controlled and may arrive percent-decoded
+  // (e.g. `..%2F..%2Fetc`), so each must be a single plain path segment AND the
+  // resolved file must stay inside the image directory; otherwise a crafted
+  // request could read arbitrary files. Responds 400 for a malformed ID, 404
+  // when the file cannot be read, and otherwise the raw bytes with a
+  // Content-Type derived from the file extension.
+  app.get("/images/:conversationId/:imageId", async (c) => {
+    const conversationId = c.req.param("conversationId");
+    const imageId = c.req.param("imageId");
+    const isUnsafeSegment = (segment: string): boolean =>
+      segment.length === 0 ||
+      segment === "." ||
+      segment.includes("..") ||
+      segment.includes("/") ||
+      segment.includes("\\") ||
+      segment.includes("\0");
+    if (isUnsafeSegment(imageId)) {
+      return c.json({ error: "Invalid image ID" }, 400);
+    }
+    if (isUnsafeSegment(conversationId)) {
+      return c.json({ error: "Invalid conversation ID" }, 400);
+    }
+    const imageDir = process.env.DISPATCH_IMAGE_DIR ?? "/tmp/dispatch/images";
+    const { resolve, sep } = await import("node:path");
+    const { readFile: fsReadFile } = await import("node:fs/promises");
+    const root = resolve(imageDir);
+    const filePath = resolve(root, conversationId, imageId);
+    // Defense in depth: even after the segment checks, refuse any path that
+    // does not resolve to a location strictly inside the image directory.
+    const rootPrefix = root.endsWith(sep) ? root : `${root}${sep}`;
+    if (!filePath.startsWith(rootPrefix)) {
+      return c.json({ error: "Invalid image ID" }, 400);
+    }
+    try {
+      const buf = await fsReadFile(filePath);
+      const lowerId = imageId.toLowerCase();
+      const mimeBySuffix: readonly (readonly [string, string])[] = [
+        [".png", "image/png"],
+        [".jpg", "image/jpeg"],
+        [".jpeg", "image/jpeg"],
+        [".webp", "image/webp"],
+        [".gif", "image/gif"],
+        [".bmp", "image/bmp"],
+      ];
+      const mime =
+        mimeBySuffix.find(([suffix]) => lowerId.endsWith(suffix))?.[1] ??
+        "application/octet-stream";
+      return new Response(buf, { headers: { "Content-Type": mime, "Cache-Control": "no-cache" } });
+    } catch {
+      // Missing or unreadable file: report it as not found rather than a 500.
+      return c.json({ error: "Image not found" }, 404);
+    }
+  });
+
   app.get("/conversations/:id/metrics", async (c) => {
     const conversationId = c.req.param("id");
 
@@ -306,11 +338,14 @@ export function createApp(opts: CreateServerOptions): Hono {
   app.get("/models", async (c) => {
     try {
       const models = await opts.credentialStore.listCatalog();
-      const modelInfo: Record<string, { contextWindow?: number }> = {};
+      const modelInfo: Record<string, { contextWindow?: number; vision?: boolean }> = {};
       for (const modelName of models) {
         const info = await opts.credentialStore.getModelInfo(modelName);
-        if (info?.contextWindow !== undefined) {
-          modelInfo[modelName] = { contextWindow: info.contextWindow };
+        if (info?.contextWindow !== undefined || info?.vision === true) {
+          const entry: { contextWindow?: number; vision?: boolean } = {};
+          if (info?.contextWindow !== undefined) entry.contextWindow = info.contextWindow;
+          if (info?.vision === true) entry.vision = true;
+          modelInfo[modelName] = entry;
         }
       }
       const body: ModelsResponse = {
@@ -410,8 +445,16 @@ export function createApp(opts: CreateServerOptions): Hono {
       return c.json({ error: result.error }, 400);
     }
 
-    const { conversationId, message, model, cwd, computerId, reasoningEffort, workspaceId } =
-      result;
+    const {
+      conversationId,
+      message,
+      model,
+      cwd,
+      computerId,
+      reasoningEffort,
+      workspaceId,
+      images,
+    } = result;
     log.info("chat: request accepted", {
       conversationId,
       hasModel: model !== undefined,
@@ -419,6 +462,7 @@ export function createApp(opts: CreateServerOptions): Hono {
       hasComputerId: computerId !== undefined,
       hasReasoningEffort: reasoningEffort !== undefined,
       hasWorkspaceId: workspaceId !== undefined,
+      imageCount: images?.length ?? 0,
     });
 
     const events: AgentEvent[] = [];
@@ -469,6 +513,7 @@ export function createApp(opts: CreateServerOptions): Hono {
       ...(computerId !== undefined ? { computerId } : {}),
       ...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
       ...(workspaceId !== undefined ? { workspaceId } : {}),
+      ...(images !== undefined ? { images } : {}),
     };
 
     opts.orchestrator
@@ -1671,6 +1716,43 @@ export function createApp(opts: CreateServerOptions): Hono {
     return c.json(response, 200);
   });
 
+  // Read the global vision settings straight from the conversation store.
+  app.get("/settings/vision", async (c) => {
+    const body: VisionSettingsResponse = await opts.conversationStore.getVisionSettings();
+    return c.json(body, 200);
+  });
+
+  // Partial update of the global vision settings. Omitted fields are left
+  // untouched. The whole body is validated BEFORE anything is written, so a
+  // request that is rejected with 400 never leaves a half-applied update
+  // behind. Responds with the settings as stored after the update.
+  app.put("/settings/vision", async (c) => {
+    let body: unknown;
+    try {
+      body = await c.req.json();
+    } catch {
+      return c.json({ error: "Invalid JSON body" }, 400);
+    }
+    // `null`, primitives and arrays are valid JSON but not a settings object;
+    // reading a property off `null` would otherwise throw and surface as a 500.
+    if (body === null || typeof body !== "object" || Array.isArray(body)) {
+      return c.json({ error: "Body must be a JSON object" }, 400);
+    }
+    const { imageLimit, compactionModel } = body as {
+      imageLimit?: unknown;
+      compactionModel?: unknown;
+    };
+
+    // ── Validate everything first ──
+    if (
+      imageLimit !== undefined &&
+      (typeof imageLimit !== "number" || !Number.isInteger(imageLimit) || imageLimit < 0)
+    ) {
+      return c.json({ error: "imageLimit must be a non-negative integer" }, 400);
+    }
+    if (
+      compactionModel !== undefined &&
+      compactionModel !== null &&
+      typeof compactionModel !== "string"
+    ) {
+      return c.json({ error: "compactionModel must be a string or null" }, 400);
+    }
+
+    // ── Then apply ──
+    if (typeof imageLimit === "number") {
+      await opts.conversationStore.setVisionImageLimit(imageLimit);
+      log.info("vision: image limit set", { imageLimit });
+    }
+    if (compactionModel === null || typeof compactionModel === "string") {
+      await opts.conversationStore.setVisionCompactionModel(compactionModel);
+      log.info("vision: compaction model set", { compactionModel });
+    }
+    const response: VisionSettingsResponse = await opts.conversationStore.getVisionSettings();
+    return c.json(response, 200);
+  });
+
   // ─── Static frontend serving (catch-all, API routes take precedence) ──────
   if (opts.webDir !== undefined) {
     const webDir = opts.webDir;
diff --git a/packages/transport-http/src/logic.test.ts b/packages/transport-http/src/logic.test.ts
index fc8302e..67632f3 100644
--- a/packages/transport-http/src/logic.test.ts
+++ b/packages/transport-http/src/logic.test.ts
@@ -182,6 +182,69 @@ describe("parseChatBody", () => {
       expect(result.reasoningEffort).toBeUndefined();
     }
   });
+
+  // ── images ──────────────────────────────────────────────────────────────
+
+  it("parses images array with data URLs", () => {
+    const result = parseChatBody(
+      {
+        message: "what is this?",
+        images: [
+          { url: "data:image/png;base64,aaa" },
+          { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" },
+        ],
+      },
+      fakeId,
+    );
+    expect(isParseError(result)).toBe(false);
+    if (!isParseError(result)) {
+      expect(result.images).toHaveLength(2);
+      expect(result.images?.[0]?.url).toBe("data:image/png;base64,aaa");
+      expect(result.images?.[1]?.mimeType).toBe("image/jpeg");
+    }
+  });
+
+  it("parses images with http URLs", () => {
+    const result = parseChatBody(
+      { message: "hi", images: [{ url: "https://example.com/x.png" }] },
+      fakeId,
+    );
+    expect(isParseError(result)).toBe(false);
+    if (!isParseError(result)) {
+      expect(result.images?.[0]?.url).toBe("https://example.com/x.png");
+    }
+  });
+
+  it("returns error when images is not an array", () => {
+    const result = parseChatBody({ message: "hi", images: "not-an-array" }, fakeId);
+    expect(isParseError(result)).toBe(true);
+  });
+
+  it("returns error when an image lacks a url", () => {
+    const result = parseChatBody({ message: "hi", images: [{ mimeType: "image/png" }] }, fakeId);
+    expect(isParseError(result)).toBe(true);
+  });
+
+  it("returns error when an image url is empty", () => {
+    const result = parseChatBody({ message: "hi", images: [{ url: "" }] }, fakeId);
+    expect(isParseError(result)).toBe(true);
+  });
+
+  it("omits images when absent (backward compatible)", () => {
+    const result = parseChatBody({ message: "hi" }, fakeId);
+    expect(isParseError(result)).toBe(false);
+    if (!isParseError(result)) {
+      expect(result.images).toBeUndefined();
+    }
+  });
+
+  it("omits images when the array is empty", () => {
+    const result = parseChatBody({ message: "hi", images: [] }, fakeId);
+    expect(isParseError(result)).toBe(false);
+    if (!isParseError(result)) {
+      expect(result.images).toBeUndefined();
+    }
+  });
 });
 
 describe("parseSinceSeq", () => {
diff --git a/packages/transport-http/src/logic.ts b/packages/transport-http/src/logic.ts
index d5f2dea..c97f320 100644
--- a/packages/transport-http/src/logic.ts
+++ b/packages/transport-http/src/logic.ts
@@ -55,6 +55,13 @@ export interface ChatCommand {
   readonly computerId?: string;
   readonly reasoningEffort?: ReasoningEffort;
   readonly workspaceId?: string;
+  /**
+   * Images attached to this turn (data URLs or http URLs). Parsed from the
+   * `ChatRequest.images` field; forwarded to the orchestrator which converts
+   * them to `image` chunks on the user message. Each entry must have a non-empty
+   * string `url`; `mimeType` is optional.
+   */
+  readonly images?: readonly { readonly url: string; readonly mimeType?: string }[];
 }
 
 export interface ParseError {
@@ -121,6 +128,33 @@ export function parseChatBody(body: unknown, generateId: () => string): ParseRes
     (result as { workspaceId?: string }).workspaceId = obj.workspaceId;
   }
 
+  if (obj.images !== undefined) {
+    if (!Array.isArray(obj.images)) {
+      return { error: "Field 'images' must be an array" };
+    }
+    const images: { url: string; mimeType?: string }[] = [];
+    for (const entry of obj.images) {
+      if (entry === null || typeof entry !== "object") {
+        return { error: "Each image must be an object with a 'url' string" };
+      }
+      const img = entry as { url?: unknown; mimeType?: unknown };
+      if (typeof img.url !== "string" || img.url.length === 0) {
+        return { error: "Each image must have a non-empty string 'url'" };
+      }
+      const parsed: { url: string; mimeType?: string } = { url: img.url };
+      if (img.mimeType !== undefined) {
+        if (typeof img.mimeType !== "string") {
+          return { error: "Field 'mimeType' on an image must be a string" };
+        }
+        parsed.mimeType = img.mimeType;
+      }
+      images.push(parsed);
+    }
+    if (images.length > 0) {
+      (result as { images?: readonly { url: string; mimeType?: string }[] }).images = images;
+    }
+  }
+
   return result;
 }
 
diff --git a/packages/transport-ws/src/extension.ts b/packages/transport-ws/src/extension.ts
index 3811ed7..d26712b 100644
--- a/packages/transport-ws/src/extension.ts
+++ b/packages/transport-ws/src/extension.ts
@@ -291,6 +291,7 @@ export function createTransportWsExtension(): Extension {
                     : {}),
                   ...(result.workspaceId !== undefined ? { workspaceId: result.workspaceId } : {}),
                   ...(result.computerId !== undefined ? { computerId: result.computerId } : {}),
+                  ...(result.images !== undefined ? { images: result.images } : {}),
                 });
                 if (!startResult.started) {
                   send(ws, {
diff --git a/packages/transport-ws/src/router.ts b/packages/transport-ws/src/router.ts
index a33aa5a..0caf305 100644
--- a/packages/transport-ws/src/router.ts
+++ b/packages/transport-ws/src/router.ts
@@ -58,6 +58,12 @@ export interface ChatRouteResult {
    * conversation → workspace → local chain).
    */
   readonly computerId?: string;
+  /**
+   * Images attached to this turn (data URLs or http URLs), forwarded verbatim to
+   * the orchestrator. Absent when the client omits it. Each entry must have a
+   * non-empty string `url`; `mimeType` is optional.
+   */
+  readonly images?: readonly { readonly url: string; readonly mimeType?: string }[];
 }
 
 /** A malformed chat.send that should yield a chat.error reply. */
@@ -174,6 +180,36 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError
       errorMessage: `chat.send: invalid reasoningEffort "${msg.reasoningEffort}" — must be one of: low, medium, high, xhigh, max`,
     };
   }
+  // Validate images (if present). The message comes off the wire, so the
+  // declared types are not trusted: each entry must be an object with a
+  // non-empty string `url`, and `mimeType`, when given, must be a string (the
+  // same rule the HTTP transport applies). An empty array is treated as absent.
+  let images: readonly { url: string; mimeType?: string }[] | undefined;
+  if (msg.images !== undefined) {
+    if (!Array.isArray(msg.images)) {
+      return {
+        kind: "chat-error",
+        conversationId: msg.conversationId,
+        errorMessage: "chat.send: 'images' must be an array",
+      };
+    }
+    const parsed: { url: string; mimeType?: string }[] = [];
+    for (const entry of msg.images) {
+      if (
+        entry === null ||
+        typeof entry !== "object" ||
+        typeof entry.url !== "string" ||
+        entry.url.length === 0
+      ) {
+        return {
+          kind: "chat-error",
+          conversationId: msg.conversationId,
+          errorMessage: "chat.send: each image must have a non-empty string 'url'",
+        };
+      }
+      if (entry.mimeType !== undefined && typeof entry.mimeType !== "string") {
+        return {
+          kind: "chat-error",
+          conversationId: msg.conversationId,
+          errorMessage: "chat.send: 'mimeType' on an image must be a string",
+        };
+      }
+      const p: { url: string; mimeType?: string } = { url: entry.url };
+      if (entry.mimeType !== undefined) p.mimeType = entry.mimeType;
+      parsed.push(p);
+    }
+    if (parsed.length > 0) images = parsed;
+  }
   return {
     kind: "chat",
     conversationId: msg.conversationId,
@@ -183,6 +219,7 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError
     ...(msg.reasoningEffort !== undefined ? { reasoningEffort: msg.reasoningEffort } : {}),
     ...(msg.workspaceId !== undefined ? { workspaceId: msg.workspaceId } : {}),
     ...(msg.computerId !== undefined ? { computerId: msg.computerId } : {}),
+    ...(images !== undefined ? { images } : {}),
   };
 }
 
diff --git a/packages/vision-handoff/package.json b/packages/vision-handoff/package.json
new file mode 100644
index 0000000..b11f7ee
--- /dev/null
+++ b/packages/vision-handoff/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "@dispatch/vision-handoff",
+  "version": "0.0.0",
+  "type": "module",
+  "private": true,
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "dependencies": {
+    "@dispatch/conversation-store": "workspace:*",
+    "@dispatch/credential-store": "workspace:*",
+    "@dispatch/kernel": "workspace:*",
+    "@dispatch/openai-stream": "workspace:*"
+  }
+}
diff --git a/packages/vision-handoff/src/extension.ts b/packages/vision-handoff/src/extension.ts
new file mode 100644
index 0000000..08fddca
--- /dev/null
+++ b/packages/vision-handoff/src/extension.ts
@@ -0,0 +1,198 @@
+/**
+ * vision-handoff extension — registers the universal vision handoff service +
+ * the `consult_vision` tool.
+ *
+ * The service performs provider-agnostic vision handoff: when a non-vision model
+ * (e.g. glm-5.2) receives an image, it replaces the image with a numbered
+ * placeholder and registers it for tool access. The `consult_vision` tool opens
+ * a NEW conversation tab with a vision-capable model (e.g. Kimi), attaches the
+ * image + the model's specific question, and returns the conversation ID + the
+ * vision model's answer. Follow-ups go through the dispatch CLI.
+ *
+ * Images are saved to a tmp directory (`/tmp/dispatch/images/<convId>/`) so the
+ * conversation store (SQLite) only holds a compact URL reference — not
+ * megabytes of base64. Tmp files are purged on reboot (ephemeral dir), after
+ * compaction (the transcription replaces the image), and on conversation close.
+ *
+ * Effects (filesystem, orchestrator) live here in the shell, injected into the
+ * service. The pure decisions live in `pure.ts`. No `console.*`; logging via
+ * `host.logger`.
+ */
+
+import { mkdir, readFile, rm, unlink, writeFile } from "node:fs/promises";
+import { extname, isAbsolute, join, resolve as pathResolve } from "node:path";
+import { conversationStoreHandle } from "@dispatch/conversation-store";
+import type { CredentialStore } from "@dispatch/credential-store";
+import { credentialStoreHandle } from "@dispatch/credential-store";
+import type { Extension, HostAPI, Manifest } from "@dispatch/kernel";
+import {
+  createVisionHandoffService,
+  orchestratorLocalHandle,
+  visionHandoffHandle,
+} from "./service.js";
+import { createConsultVisionTool } from "./tool.js";
+
+/**
+ * Extension manifest. Declares one contributed service
+ * (`vision-handoff/service`) and one tool (`consult_vision`), requests the
+ * `network` capability, and asks for eager activation.
+ */
+export const manifest: Manifest = {
+  id: "vision-handoff",
+  name: "Vision Handoff",
+  version: "0.0.0",
+  apiVersion: "^0.1.0",
+  trust: "bundled",
+  activation: "eager",
+  capabilities: { network: true },
+  contributes: { services: ["vision-handoff/service"], tools: ["consult_vision"] },
+};
+
+/**
+ * Root directory for tmp image files; each conversation gets its own
+ * subdirectory. Overridable through the `DISPATCH_IMAGE_DIR` environment
+ * variable, which is read once when this module is loaded.
+ */
+const IMAGE_DIR = process.env.DISPATCH_IMAGE_DIR ?? "/tmp/dispatch/images";
+
+/** MIME types for recognized image extensions (keys are lowercase and include the dot). */
+const MIME_BY_EXT: Readonly<Record<string, string>> = {
+  ".png": "image/png",
+  ".jpg": "image/jpeg",
+  ".jpeg": "image/jpeg",
+  ".webp": "image/webp",
+  ".gif": "image/gif",
+  ".bmp": "image/bmp",
+};
+
+/** Reverse: MIME → extension. `image/jpeg` always maps to `.jpg`, never `.jpeg`. */
+const EXT_BY_MIME: Readonly<Record<string, string>> = {
+  "image/png": ".png",
+  "image/jpeg": ".jpg",
+  "image/webp": ".webp",
+  "image/gif": ".gif",
+  "image/bmp": ".bmp",
+};
+
+/**
+ * Load an image from disk and encode it as a base64 data URL.
+ *
+ * A relative `path` is resolved against `cwd` when one is given, otherwise
+ * against the process working directory. The MIME type is chosen from the
+ * (case-insensitive) file extension, defaulting to `image/png` for extensions
+ * that are not recognized. Read errors (e.g. a missing file) are not caught
+ * here — the promise rejects and the caller surfaces it. This is the shell
+ * edge: it uses the real `node:fs`.
+ */
+async function readFileAsDataUrl(path: string, cwd?: string): Promise<string> {
+  let absolutePath: string;
+  if (cwd === undefined || isAbsolute(path)) {
+    absolutePath = pathResolve(path);
+  } else {
+    absolutePath = pathResolve(cwd, path);
+  }
+  const contents = await readFile(absolutePath);
+  const mimeType = MIME_BY_EXT[extname(absolutePath).toLowerCase()] ?? "image/png";
+  const payload = contents.toString("base64");
+  return `data:${mimeType};base64,${payload}`;
+}
+
+/**
+ * Save a data URL image to a tmp file and return a compact HTTP path.
+ * The compact URL (`/images/<conversationId>/<uuid>.<ext>`) is what gets
+ * persisted in the conversation store — a tiny string, not megabytes of base64.
+ *
+ * The file extension follows the image's MIME type: the explicit `mimeType`
+ * argument wins; otherwise the type declared in the data URL header
+ * (`data:<mime>;base64,…`) is used, so a JPEG or WebP sent without a
+ * `mimeType` is not stored — and later re-served — as a PNG. Unknown or
+ * missing types fall back to `.png`. The lookup is case-insensitive.
+ *
+ * Rejects when the directory cannot be created or the file cannot be written.
+ */
+async function saveImageToTmp(
+  conversationId: string,
+  dataUrl: string,
+  mimeType?: string,
+): Promise<string> {
+  // Anchored at the start and cut off at the first `;` or `,`, so this only
+  // ever looks at the short header, never at the base64 payload.
+  const declaredMime = /^data:([^;,]+)[;,]/i.exec(dataUrl)?.[1];
+  const mime = (mimeType ?? declaredMime ?? "image/png").trim().toLowerCase();
+  const ext = EXT_BY_MIME[mime] ?? ".png";
+  const imageId = `${crypto.randomUUID()}${ext}`;
+  const dir = join(IMAGE_DIR, conversationId);
+  await mkdir(dir, { recursive: true });
+  const filePath = join(dir, imageId);
+  // Everything after the header's comma is the base64 payload.
+  const base64 = dataUrl.split(",")[1] ?? "";
+  await writeFile(filePath, Buffer.from(base64, "base64"));
+  return `/images/${conversationId}/${imageId}`;
+}
+
+/**
+ * Map a compact URL (`/images/<convId>/<imageId>`) to the tmp file it refers
+ * to. Returns `undefined` when `url` is not a well-formed compact URL.
+ *
+ * Image URLs originate from clients, so the two segments are validated before
+ * they are joined onto `IMAGE_DIR`: an empty segment, `.`/`..`, a backslash or
+ * a NUL byte is refused. Without this, `/images/../x` would address a file
+ * outside the image directory.
+ */
+function tmpImagePathFromCompactUrl(url: string): string | undefined {
+  if (!url.startsWith("/images/")) return undefined;
+  const [, , convId, imageId] = url.split("/"); // ["", "images", convId, imageId]
+  if (convId === undefined || imageId === undefined) return undefined;
+  const isUnsafeSegment = (segment: string): boolean =>
+    segment.length === 0 ||
+    segment === "." ||
+    segment === ".." ||
+    segment.includes("\\") ||
+    segment.includes("\0");
+  if (isUnsafeSegment(convId) || isUnsafeSegment(imageId)) return undefined;
+  return join(IMAGE_DIR, convId, imageId);
+}
+
+/**
+ * Resolve a compact URL (`/images/<convId>/<imageId>`) back to a data URL by
+ * reading the tmp file. Data URLs and HTTP URLs pass through unchanged, as
+ * does anything that is not a well-formed compact URL. The MIME type comes
+ * from the file extension (default `image/png`). Rejects when the tmp file
+ * cannot be read.
+ */
+async function resolveImageUrl(url: string): Promise<string> {
+  if (url.startsWith("data:") || url.startsWith("http")) return url;
+  const filePath = tmpImagePathFromCompactUrl(url);
+  if (filePath === undefined) return url;
+  const buf = await readFile(filePath);
+  const ext = extname(filePath).toLowerCase();
+  const mime = MIME_BY_EXT[ext] ?? "image/png";
+  return `data:${mime};base64,${buf.toString("base64")}`;
+}
+
+/**
+ * Delete a single tmp image file (after compaction — best-effort). URLs that
+ * are not well-formed compact URLs are ignored, so this can never unlink a
+ * file outside the image directory.
+ */
+async function deleteTmpImage(compactUrl: string): Promise<void> {
+  const filePath = tmpImagePathFromCompactUrl(compactUrl);
+  if (filePath === undefined) return;
+  try {
+    await unlink(filePath);
+  } catch {
+    // Best-effort — file may already be deleted.
+  }
+}
+
+/**
+ * Delete all tmp images for a conversation (on close — best-effort).
+ *
+ * This is a recursive, forced delete, so the id is checked first: it must be
+ * a single plain path segment. An empty id would resolve to the image root
+ * itself (wiping every conversation's images) and `..` to its parent; such
+ * ids are ignored instead of being passed to `rm`.
+ */
+async function deleteConversationImages(conversationId: string): Promise<void> {
+  const isSingleSegment =
+    conversationId.length > 0 &&
+    conversationId !== "." &&
+    conversationId !== ".." &&
+    !conversationId.includes("/") &&
+    !conversationId.includes("\\") &&
+    !conversationId.includes("\0");
+  if (!isSingleSegment) return;
+  const dir = join(IMAGE_DIR, conversationId);
+  try {
+    await rm(dir, { recursive: true, force: true });
+  } catch {
+    // Best-effort.
+  }
+}
+
+/**
+ * Extension entry point. Builds the vision handoff service from the real
+ * filesystem helpers in this module plus host-provided services, publishes it
+ * under `visionHandoffHandle`, and registers the `consult_vision` tool.
+ *
+ * When the credential-store service is not available, a warning is logged and
+ * nothing is registered (no service, no tool).
+ *
+ * @param host Host API used to look up services/providers/extensions, to
+ *   register the service and tool, and for logging.
+ */
+export async function activate(host: HostAPI): Promise<void> {
+  const credentialStore = host.getService(credentialStoreHandle) as CredentialStore | undefined;
+  if (credentialStore === undefined) {
+    host.logger.warn(
+      "vision-handoff: credential-store service not available. The consult_vision tool and image handoff are disabled.",
+    );
+    return;
+  }
+
+  // Model name → { provider, model }. Yields undefined when either the
+  // credential store cannot resolve the name or the provider is not registered.
+  const resolveModel = (modelName: string) => {
+    const resolved = credentialStore.resolve(modelName);
+    if (resolved === undefined) return undefined;
+    const provider = host.getProviders().get(resolved.providerId);
+    if (provider === undefined) return undefined;
+    return { provider, model: resolved.model };
+  };
+
+  const service = createVisionHandoffService({
+    credentialStore,
+    resolveModel,
+    readFileAsDataUrl,
+    saveImageToTmp,
+    resolveImageUrl,
+    deleteTmpImage,
+    deleteConversationImages,
+    // Looked up on every call rather than once here, and only when the
+    // session-orchestrator extension is loaded; a failing lookup is reported
+    // as "not available" instead of throwing.
+    resolveOrchestrator: () => {
+      const loaded = host.getExtensions().some((m) => m.id === "session-orchestrator");
+      if (!loaded) return undefined;
+      try {
+        return host.getService(orchestratorLocalHandle);
+      } catch {
+        return undefined;
+      }
+    },
+    // The conversation store is likewise fetched per call, not captured at
+    // activation time.
+    getImageTranscriptions: async (conversationId: string) => {
+      const store = host.getService(conversationStoreHandle);
+      return store.getImageTranscriptions(conversationId);
+    },
+    setImageTranscription: async (conversationId: string, url: string, text: string) => {
+      const store = host.getService(conversationStoreHandle);
+      await store.setImageTranscription(conversationId, url, text);
+    },
+    setConversationTitle: async (conversationId: string, title: string) => {
+      const store = host.getService(conversationStoreHandle);
+      await store.setConversationTitle(conversationId, title);
+    },
+    logger: host.logger.child({ extensionId: "vision-handoff" }),
+  });
+
+  // Publish the service first; the tool is built on top of the same instance.
+  host.provideService(visionHandoffHandle, service);
+  host.defineTool(createConsultVisionTool(service));
+  host.logger.info("vision-handoff: registered (consult_vision tool + handoff service)");
+}
+
+export const extension: Extension = { manifest, activate };
diff --git a/packages/vision-handoff/src/index.ts b/packages/vision-handoff/src/index.ts
new file mode 100644
index 0000000..2713346
--- /dev/null
+++ b/packages/vision-handoff/src/index.ts
@@ -0,0 +1,21 @@
+export { extension, manifest } from "./extension.js";
+export {
+  collectTextFromStream,
+  findVisionModelName,
+  formatConsultResult,
+  formatImagePlaceholder,
+  formatNoVisionPlaceholder,
+  isVisionCapable,
+} from "./pure.js";
+export type {
+  OrchestratorForVision,
+  ResolvedVisionModel,
+  VisionHandoffDeps,
+  VisionHandoffService,
+} from "./service.js";
+export {
+  createVisionHandoffService,
+  orchestratorLocalHandle,
+  visionHandoffHandle,
+} from "./service.js";
+export { createConsultVisionTool } from "./tool.js";
diff --git a/packages/vision-handoff/src/pure.test.ts b/packages/vision-handoff/src/pure.test.ts
new file mode 100644
index 0000000..21b1224
--- /dev/null
+++ b/packages/vision-handoff/src/pure.test.ts
@@ -0,0 +1,180 @@
+import type { ModelInfo, ProviderEvent } from "@dispatch/kernel";
+import { describe, expect, it } from "vitest";
+import {
+  collectTextFromStream,
+  findVisionModelName,
+  formatConsultationTitle,
+  formatConsultResult,
+  formatImagePlaceholder,
+  formatNoVisionPlaceholder,
+  isVisionCapable,
+} from "./pure.js";
+
+// isVisionCapable: an explicit ModelInfo.vision flag decides; without one the
+// name heuristic on the model segment is used.
+describe("isVisionCapable", () => {
+  const KIMI = "umans/umans-kimi-k2.7";
+
+  it("returns true when ModelInfo.vision is true", () => {
+    const flaggedOn: ModelInfo = { id: "umans-kimi-k2.7", vision: true };
+    expect(isVisionCapable(KIMI, flaggedOn)).toBe(true);
+  });
+
+  it("returns false when ModelInfo.vision is false (overrides name heuristic)", () => {
+    const flaggedOff: ModelInfo = { id: "umans-kimi-k2.7", vision: false };
+    expect(isVisionCapable(KIMI, flaggedOff)).toBe(false);
+  });
+
+  it("falls back to name heuristic when vision is absent (umans kimi + qwen)", () => {
+    for (const name of [KIMI, "umans/umans-qwen3.6-35b-a3b"]) {
+      expect(isVisionCapable(name, undefined)).toBe(true);
+    }
+  });
+
+  it("falls back to name heuristic when vision is absent (non-vision)", () => {
+    expect(isVisionCapable("umans/umans-glm-5.2", undefined)).toBe(false);
+    // ModelInfo is present here but carries no vision flag.
+    const unflagged: ModelInfo = { id: "umans-coder" };
+    expect(isVisionCapable("umans/umans-coder", unflagged)).toBe(false);
+  });
+
+  it("returns false for undefined model name", () => {
+    const verdict = isVisionCapable(undefined, undefined);
+    expect(verdict).toBe(false);
+  });
+});
+
+// findVisionModelName: first vision-capable catalog entry, honouring `exclude`.
+describe("findVisionModelName", () => {
+  const KIMI = "umans/umans-kimi-k2.7";
+  const QWEN = "umans/umans-qwen3.6-35b-a3b";
+  const GLM = "umans/umans-glm-5.2";
+  const LLAMA_VISION = "umans/llama-vision";
+
+  // Fixture metadata; names missing from the table resolve to undefined.
+  const infoByName: Record<string, ModelInfo> = {
+    [KIMI]: { id: "umans-kimi-k2.7", vision: true },
+    [QWEN]: { id: "umans-qwen3.6-35b-a3b", vision: true },
+    [GLM]: { id: "umans-glm-5.2" },
+    [LLAMA_VISION]: { id: "llama-vision", vision: true },
+  };
+  const getInfo = async (name: string): Promise<ModelInfo | undefined> => infoByName[name];
+
+  it("finds the first umans kimi model via name heuristic", async () => {
+    const found = await findVisionModelName([GLM, KIMI, LLAMA_VISION], getInfo);
+    expect(found).toBe("umans/umans-kimi-k2.7");
+  });
+
+  it("finds a vision model via ModelInfo.vision when name heuristic misses", async () => {
+    const found = await findVisionModelName([GLM, LLAMA_VISION], getInfo);
+    expect(found).toBe("umans/llama-vision");
+  });
+
+  it("skips the excluded model and finds the next vision model", async () => {
+    const found = await findVisionModelName([KIMI, QWEN], getInfo, KIMI);
+    expect(found).toBe("umans/umans-qwen3.6-35b-a3b");
+  });
+
+  it("returns undefined when no vision model is available", async () => {
+    const found = await findVisionModelName([GLM], getInfo);
+    expect(found).toBeUndefined();
+  });
+
+  it("returns undefined for empty catalog", async () => {
+    const found = await findVisionModelName([], getInfo);
+    expect(found).toBeUndefined();
+  });
+});
+
+// collectTextFromStream: concatenates text deltas, skips other events, throws on error.
+describe("collectTextFromStream", () => {
+  // Replays a fixed list of provider events as an async iterable.
+  async function* replay(events: ProviderEvent[]): AsyncIterable<ProviderEvent> {
+    for (const event of events) {
+      yield event;
+    }
+  }
+
+  it("collects text-delta events into a single string", async () => {
+    const collected = await collectTextFromStream(
+      replay([
+        { type: "text-delta", delta: "Hello " },
+        { type: "text-delta", delta: "world!" },
+      ]),
+    );
+    expect(collected).toBe("Hello world!");
+  });
+
+  it("ignores non-text events", async () => {
+    const mixed: ProviderEvent[] = [
+      { type: "reasoning-delta", delta: "thinking..." },
+      { type: "text-delta", delta: "answer" },
+      { type: "usage", usage: { inputTokens: 5, outputTokens: 1 } },
+      { type: "finish", reason: "stop" },
+    ];
+    const collected = await collectTextFromStream(replay(mixed));
+    expect(collected).toBe("answer");
+  });
+
+  it("throws on an error event", async () => {
+    const failing: ProviderEvent[] = [
+      { type: "text-delta", delta: "partial" },
+      { type: "error", message: "boom" },
+    ];
+    const pending = collectTextFromStream(replay(failing));
+    await expect(pending).rejects.toThrow("boom");
+  });
+
+  it("returns empty string for an empty stream", async () => {
+    const collected = await collectTextFromStream(replay([]));
+    expect(collected).toBe("");
+  });
+});
+
+// formatImagePlaceholder: the placeholder names the image ID and the tool to call.
+describe("formatImagePlaceholder", () => {
+  it("includes the image ID and mentions consult_vision", () => {
+    const placeholder = formatImagePlaceholder(1);
+    for (const fragment of ["Image 1", "consult_vision", "imageIds=[1]"]) {
+      expect(placeholder).toContain(fragment);
+    }
+  });
+
+  it("increments the ID for each image", () => {
+    const second = formatImagePlaceholder(2);
+    expect(second).toContain("Image 2");
+    expect(second).toContain("imageIds=[2]");
+  });
+});
+
+// formatNoVisionPlaceholder: the degraded-path text states why analysis is impossible.
+describe("formatNoVisionPlaceholder", () => {
+  it("explains the limitation", () => {
+    expect(formatNoVisionPlaceholder()).toContain("no vision-capable model");
+  });
+});
+
+// formatConsultResult: the tool result carries the conversation ID, the
+// (trimmed) vision answer and the follow-up hint.
+describe("formatConsultResult", () => {
+  it("includes the conversation ID, the response, and the dispatch CLI hint", () => {
+    const result = formatConsultResult("abc-123", "The error is on line 12.");
+    expect(result).toContain("abc-123");
+    expect(result).toContain("The error is on line 12.");
+    expect(result).toContain("dispatch CLI");
+  });
+
+  it("trims the response", () => {
+    const result = formatConsultResult("c1", "  spaced  ");
+    // Pin the exact rendered line so BOTH the leading and the trailing padding
+    // are proven gone; the substring checks below would also pass for a
+    // trailing-only trim.
+    expect(result).toContain("Response: spaced\n\n");
+    expect(result).toContain("spaced");
+    expect(result).not.toContain("spaced  ");
+  });
+});
+
+// formatConsultationTitle: "IMAGE - " prefix plus the question, capped at 80 chars.
+describe("formatConsultationTitle", () => {
+  const PREFIX = "IMAGE - ";
+
+  it("prefixes the question with 'IMAGE - '", () => {
+    const title = formatConsultationTitle("What error is shown?");
+    expect(title).toBe("IMAGE - What error is shown?");
+  });
+
+  it("truncates long questions to 80 chars with an ellipsis (matching the store's TITLE_MAX)", () => {
+    const title = formatConsultationTitle("x".repeat(100));
+    expect(title).toBe(`IMAGE - ${"x".repeat(80)}…`);
+    // prefix + 80 kept characters + the single ellipsis character
+    expect(title.length).toBe(PREFIX.length + 80 + 1);
+  });
+
+  it("does not truncate questions at or under 80 chars", () => {
+    for (const length of [80, 79]) {
+      const question = "x".repeat(length);
+      expect(formatConsultationTitle(question)).toBe(`IMAGE - ${question}`);
+    }
+  });
+
+  it("handles an empty question", () => {
+    expect(formatConsultationTitle("")).toBe("IMAGE - ");
+  });
+});
diff --git a/packages/vision-handoff/src/pure.ts b/packages/vision-handoff/src/pure.ts
new file mode 100644
index 0000000..af3476f
--- /dev/null
+++ b/packages/vision-handoff/src/pure.ts
@@ -0,0 +1,156 @@
+/**
+ * Pure decision helpers for the vision handoff.
+ *
+ * No I/O, no ambient state. The shell (the extension + the service) injects the
+ * effects (credential store lookups, orchestrator, provider streaming). This
+ * module owns only the policy: which model is vision-capable, how to format
+ * image placeholders for non-vision models, and how to format the
+ * consultation tool's result.
+ */
+
+import type { ModelInfo, ProviderEvent } from "@dispatch/kernel";
+import { isVisionModelId } from "@dispatch/openai-stream";
+
+/**
+ * Decide whether a model can accept images.
+ *
+ * An explicit `ModelInfo.vision` flag — `true` OR `false` — is authoritative, so
+ * a provider that knows a model is non-vision beats the name guess. Only when
+ * the flag is absent does the hardcoded name heuristic
+ * ({@link isVisionModelId}) decide.
+ *
+ * @param modelName  Catalog name in `<credentialName>/<model>` form. The
+ *   heuristic looks at everything after the FIRST `/` (or the whole string when
+ *   there is no `/`). `undefined` yields `false` unless `info` carries a flag.
+ * @param info  Resolved model metadata, if any.
+ * @returns `true` when the model should receive image chunks natively. Pure.
+ */
+export function isVisionCapable(
+  modelName: string | undefined,
+  info: ModelInfo | undefined,
+): boolean {
+  const declared = info?.vision;
+  if (declared !== undefined) return declared;
+
+  if (modelName === undefined) return false;
+
+  // Strip the credential prefix so only the model segment reaches the heuristic.
+  const separatorAt = modelName.indexOf("/");
+  return isVisionModelId(separatorAt < 0 ? modelName : modelName.slice(separatorAt + 1));
+}
+
+/**
+ * Find the first vision-capable model name in a catalog, in catalog order.
+ *
+ * Every candidate is judged by {@link isVisionCapable}, so the rule is the
+ * same one applied to the active model: an explicit `ModelInfo.vision` (true
+ * OR false) is authoritative and the name heuristic only applies when the flag
+ * is absent. In particular, a model whose name looks vision-capable but whose
+ * `ModelInfo` reports `vision: false` is skipped instead of being selected.
+ *
+ * @param catalog  The full list of model names (`<credentialName>/<model>`).
+ * @param getInfo  Async lookup of a model name → ModelInfo (from the credential
+ *   store). Awaited sequentially, once per candidate examined.
+ * @param exclude  Optional model name to skip (e.g. the current non-vision model).
+ * @returns The first qualifying name, or `undefined` when none qualifies.
+ */
+export async function findVisionModelName(
+  catalog: readonly string[],
+  getInfo: (modelName: string) => Promise<ModelInfo | undefined>,
+  exclude?: string,
+): Promise<string | undefined> {
+  for (const name of catalog) {
+    if (exclude !== undefined && name === exclude) continue;
+    // The metadata must be consulted BEFORE trusting the name: returning on the
+    // name heuristic alone would pick models the provider has explicitly marked
+    // as non-vision.
+    const info = await getInfo(name);
+    if (isVisionCapable(name, info)) return name;
+  }
+  return undefined;
+}
+
+/**
+ * Drain a provider event stream and return the concatenated text.
+ *
+ * Only `text-delta` events contribute; every other event type is skipped. An
+ * `error` event aborts the drain by throwing an `Error` carrying the event's
+ * message, leaving the degradation strategy to the caller.
+ *
+ * @param stream  The provider's streamed events.
+ * @returns All `text-delta` payloads joined in arrival order (`""` if none).
+ * @throws Error when the stream yields an `error` event.
+ */
+export async function collectTextFromStream(stream: AsyncIterable<ProviderEvent>): Promise<string> {
+  let collected = "";
+  for await (const event of stream) {
+    switch (event.type) {
+      case "text-delta":
+        collected += event.delta;
+        break;
+      case "error":
+        throw new Error(event.message);
+      default:
+        // reasoning, usage, finish, … carry no answer text
+        break;
+    }
+  }
+  return collected;
+}
+
+/**
+ * Build the text that stands in for an `image` chunk when the active model
+ * cannot see images. It announces the attachment and tells the model to call
+ * `consult_vision` with the image's ID and a specific question, so the model
+ * drives the analysis instead of receiving a generic description up front.
+ *
+ * @param imageId  The 1-based ID assigned to this image (used by the tool to
+ *   look up the registered image data).
+ * @returns The placeholder text. Pure.
+ */
+export function formatImagePlaceholder(imageId: number): string {
+  const id = String(imageId);
+  const parts = [
+    `[Image ${id} attached — you cannot view images. Call the `,
+    `consult_vision tool with imageIds=[${id}] and a specific question `,
+    "to analyze it via a vision-capable model.]",
+  ];
+  return parts.join("");
+}
+
+/**
+ * Build the placeholder for the degraded path: an image is attached but NO
+ * vision-capable model exists, so the consultation tool cannot work either.
+ *
+ * @returns The fixed explanatory text. Pure.
+ */
+export function formatNoVisionPlaceholder(): string {
+  const problem = "[Image attached — no vision-capable model is available to analyze it. ";
+  const remedy =
+    "Install or configure a vision-capable model (e.g. kimi) to enable image analysis.]";
+  return problem + remedy;
+}
+
+/**
+ * Maximum length (in UTF-16 code units) of the consultation title body,
+ * matching the conversation store's `TITLE_MAX`. The question is truncated to
+ * this before the `"IMAGE - "` prefix is applied so the consultation tab's
+ * title stays in line with the store's own title-derivation limit.
+ */
+const CONSULTATION_TITLE_MAX = 80;
+
+/**
+ * Format the title for a vision consultation conversation tab. The title is
+ * `"IMAGE - "` prefixed to the (truncated) question so the tab is visually
+ * distinguishable from normal conversation tabs.
+ *
+ * Questions longer than `CONSULTATION_TITLE_MAX` are cut to that many code
+ * units and suffixed with `…`. If the cut would land between the two halves of
+ * a surrogate pair (e.g. an emoji), the dangling high surrogate is dropped as
+ * well, so the title never contains a broken character.
+ *
+ * Pure.
+ *
+ * @param question  The question the model asked the vision model.
+ * @returns The tab title.
+ */
+export function formatConsultationTitle(question: string): string {
+  if (question.length <= CONSULTATION_TITLE_MAX) return `IMAGE - ${question}`;
+
+  let end = CONSULTATION_TITLE_MAX;
+  const lastKept = question.charCodeAt(end - 1);
+  const firstDropped = question.charCodeAt(end);
+  const splitsSurrogatePair =
+    lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
+  // Keep the pair whole: better one character shorter than a lone surrogate.
+  if (splitsSurrogatePair) end -= 1;
+
+  return `IMAGE - ${question.slice(0, end)}…`;
+}
+
+/**
+ * Render the string the `consult_vision` tool hands back to the calling model.
+ *
+ * Three paragraphs separated by blank lines: which conversation the
+ * consultation lives in, the vision model's answer (whitespace-trimmed), and a
+ * hint that follow-up questions go through the dispatch CLI using that same
+ * conversation ID.
+ *
+ * Pure.
+ *
+ * @param conversationId  The new vision consultation conversation ID.
+ * @param response        The vision model's answer to the model's question.
+ * @returns The formatted tool result.
+ */
+export function formatConsultResult(conversationId: string, response: string): string {
+  const answer = response.trim();
+  const paragraphs = [
+    `Vision consultation opened in conversation ${conversationId}.`,
+    `Response: ${answer}`,
+    "To ask follow-up questions about this image, use the dispatch CLI " +
+      `(conversation: ${conversationId}).`,
+  ];
+  return paragraphs.join("\n\n");
+}
diff --git a/packages/vision-handoff/src/service.test.ts b/packages/vision-handoff/src/service.test.ts
new file mode 100644
index 0000000..8c4117e
--- /dev/null
+++ b/packages/vision-handoff/src/service.test.ts
@@ -0,0 +1,375 @@
+import type {
+  AgentEvent,
+  ChatMessage,
+  ModelInfo,
+  ProviderContract,
+  ProviderEvent,
+  ToolContract,
+} from "@dispatch/kernel";
+import { describe, expect, it, vi } from "vitest";
+import { createVisionHandoffService, type VisionHandoffDeps } from "./service.js";
+
+// ── Test doubles (outermost-edge fakes — NOT @dispatch/* mocks) ──────────────
+
+/**
+ * Build a fake provider for tests. Its `stream` (a `vi.fn`, so calls can be
+ * inspected) looks for the first `image` chunk across all messages, passes that
+ * chunk's URL — or `""` when there is no image — to `describeImage`, and yields
+ * the returned text as one `text-delta` followed by a `finish` event.
+ *
+ * The callback is deliberately NOT called `describe`: that name would shadow
+ * the `describe` imported from vitest inside this function.
+ *
+ * @param describeImage  Maps the image URL to the text the fake should answer.
+ * @param id             Provider id to report (defaults to `"umans"`).
+ */
+function makeVisionProvider(
+  describeImage: (imageUrl: string) => string,
+  id = "umans",
+): ProviderContract {
+  return {
+    id,
+    stream: vi.fn(
+      (
+        messages: readonly ChatMessage[],
+        _tools: readonly ToolContract[],
+      ): AsyncIterable<ProviderEvent> => {
+        const img = messages.flatMap((m) => m.chunks).find((c) => c.type === "image");
+        const url = img && img.type === "image" ? img.url : "";
+        const text = describeImage(url);
+        async function* gen(): AsyncIterable<ProviderEvent> {
+          yield { type: "text-delta", delta: text };
+          yield { type: "finish", reason: "stop" };
+        }
+        return gen();
+      },
+    ),
+  };
+}
+
+/**
+ * Assemble the default `VisionHandoffDeps` used by these tests: a two-model
+ * catalog (kimi flagged `vision: true`, glm without a flag), a fake provider
+ * shared by both models, a stubbed file reader and a no-op title setter.
+ * Anything in `overrides` replaces the matching default.
+ */
+function makeDeps(overrides: Partial<VisionHandoffDeps> = {}): VisionHandoffDeps {
+  const KIMI = "umans/umans-kimi-k2.7";
+  const GLM = "umans/umans-glm-5.2";
+
+  const fakeProvider = makeVisionProvider((url) => `DESCRIPTION of ${url}`);
+  const modelNames = [KIMI, GLM];
+  const modelInfos: Record<string, ModelInfo> = {
+    [KIMI]: { id: "umans-kimi-k2.7", vision: true },
+    [GLM]: { id: "umans-glm-5.2" },
+  };
+
+  return {
+    credentialStore: {
+      listCatalog: vi.fn(async () => modelNames),
+      getModelInfo: vi.fn(async (name: string) => modelInfos[name]),
+      resolve: vi.fn((name: string) => {
+        switch (name) {
+          case KIMI:
+            return { providerId: "umans", model: "umans-kimi-k2.7" };
+          case GLM:
+            return { providerId: "umans", model: "umans-glm-5.2" };
+          default:
+            return undefined;
+        }
+      }),
+    },
+    resolveModel: vi.fn((name: string) => {
+      const known = name === KIMI || name === GLM;
+      return known ? { provider: fakeProvider, model: name.split("/")[1] } : undefined;
+    }),
+    readFileAsDataUrl: vi.fn(async (path: string) => `data:image/png;base64,FILE(${path})`),
+    setConversationTitle: vi.fn(async (_conversationId: string, _title: string) => {}),
+    ...overrides,
+  };
+}
+
+// Service-level isVisionCapable: resolves the ModelInfo itself, hence async.
+describe("VisionHandoffService.isVisionCapable", () => {
+  // Fresh service (and fresh fakes) for every test.
+  const newService = () => createVisionHandoffService(makeDeps());
+
+  it("returns true for kimi (via ModelInfo)", async () => {
+    const capable = await newService().isVisionCapable("umans/umans-kimi-k2.7");
+    expect(capable).toBe(true);
+  });
+
+  it("returns false for glm-5.2", async () => {
+    const capable = await newService().isVisionCapable("umans/umans-glm-5.2");
+    expect(capable).toBe(false);
+  });
+
+  it("returns false for undefined model name", async () => {
+    const capable = await newService().isVisionCapable(undefined);
+    expect(capable).toBe(false);
+  });
+});
+
+// resolveVisionModel: picks the vision model from the catalog, honouring the exclusion.
+describe("VisionHandoffService.resolveVisionModel", () => {
+  const KIMI = "umans/umans-kimi-k2.7";
+
+  it("resolves the kimi model from the catalog", async () => {
+    const service = createVisionHandoffService(makeDeps());
+    const resolved = await service.resolveVisionModel();
+    expect(resolved?.modelName).toBe("umans/umans-kimi-k2.7");
+    expect(resolved?.model).toBe("umans-kimi-k2.7");
+  });
+
+  it("excludes the given model", async () => {
+    const service = createVisionHandoffService(makeDeps());
+    // kimi is the only vision model in the fixture catalog, so nothing is left.
+    const resolved = await service.resolveVisionModel(KIMI);
+    expect(resolved).toBeUndefined();
+  });
+});
+
+// prepareForProvider: messages pass through untouched for vision models or
+// image-free input; for a non-vision model every image chunk becomes a text
+// placeholder and the image is registered under the conversation for later
+// lookup. In the fixture, kimi is the vision model and glm-5.2 is not.
+describe("VisionHandoffService.prepareForProvider", () => {
+  it("passes messages through unchanged when the model is vision-capable", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "What's this?" },
+          { type: "image", url: "data:image/png;base64,abc" },
+        ],
+      },
+    ];
+    const result = await svc.prepareForProvider(messages, "umans/umans-kimi-k2.7");
+    expect(result).toBe(messages); // same reference — no copy, no change
+  });
+
+  it("passes messages through unchanged when there are no images", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [{ role: "user", chunks: [{ type: "text", text: "hi" }] }];
+    // Non-vision model, but nothing to replace — identity is still expected.
+    const result = await svc.prepareForProvider(messages, "umans/umans-glm-5.2");
+    expect(result).toBe(messages);
+  });
+
+  it("replaces image chunks with numbered placeholders for a non-vision model", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "Describe this" },
+          { type: "image", url: "data:image/png;base64,img1" },
+        ],
+      },
+    ];
+    const result = await svc.prepareForProvider(messages, "umans/umans-glm-5.2", {
+      conversationId: "conv-1",
+    });
+    expect(result).toHaveLength(1);
+    const chunks = result[0]?.chunks;
+    expect(chunks).toHaveLength(2);
+    // Text chunk unchanged.
+    expect(chunks?.[0]).toEqual({ type: "text", text: "Describe this" });
+    // Image chunk → placeholder text.
+    expect(chunks?.[1]?.type).toBe("text");
+    const placeholder = (chunks?.[1] as { text: string }).text;
+    expect(placeholder).toContain("Image 1");
+    expect(placeholder).toContain("consult_vision");
+  });
+
+  it("assigns sequential image IDs across multiple messages", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,a" }] },
+      { role: "assistant", chunks: [{ type: "text", text: "ok" }] },
+      { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,b" }] },
+    ];
+    const result = await svc.prepareForProvider(messages, "umans/umans-glm-5.2", {
+      conversationId: "conv-1",
+    });
+    // First image → Image 1, second → Image 2.
+    expect((result[0]?.chunks[0] as { text: string }).text).toContain("Image 1");
+    // Assistant message unchanged.
+    expect(result[1]?.chunks[0]?.type).toBe("text");
+    expect((result[2]?.chunks[0] as { text: string }).text).toContain("Image 2");
+  });
+
+  it("registers images so getRegisteredImage can look them up", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [{ type: "image", url: "data:image/png;base64,registered" }],
+      },
+    ];
+    await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-42" });
+    // Lookup key is (conversationId, 1-based image ID from the placeholder).
+    const img = svc.getRegisteredImage("conv-42", 1);
+    expect(img?.url).toBe("data:image/png;base64,registered");
+  });
+
+  it("uses no-vision placeholder when no vision model is available", async () => {
+    const deps = makeDeps();
+    // Empty catalog → no model can be resolved for a consultation.
+    (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,abc" }] },
+    ];
+    const result = await svc.prepareForProvider(messages, "umans/umans-glm-5.2", {
+      conversationId: "conv-1",
+    });
+    const text = (result[0]?.chunks[0] as { text: string }).text;
+    expect(text).toContain("no vision-capable model");
+    // The degraded placeholder must not point the model at a tool that cannot work.
+    expect(text).not.toContain("consult_vision");
+  });
+});
+
+// consultVision: opens a consultation through the orchestrator using an image
+// taken either from the registry (imageIds) or from a file path, and reports
+// failures as an `{ error }` result rather than by throwing.
+describe("VisionHandoffService.consultVision", () => {
+  // Fake orchestrator: handleMessage immediately emits one text-delta carrying
+  // `response`, then a `done` event, both tagged with the caller's conversationId.
+  function makeOrchestratorDouble(response: string): {
+    orchestrator: NonNullable<
+      VisionHandoffDeps["resolveOrchestrator"] extends () => infer T ? T : never
+    >;
+    handleMessage: ReturnType<typeof vi.fn>;
+  } {
+    const handleMessage = vi.fn(
+      async (input: {
+        conversationId: string;
+        text: string;
+        onEvent: (event: AgentEvent) => void;
+      }): Promise<void> => {
+        input.onEvent({
+          type: "text-delta",
+          conversationId: input.conversationId,
+          turnId: "t1",
+          delta: response,
+        });
+        input.onEvent({
+          type: "done",
+          conversationId: input.conversationId,
+          turnId: "t1",
+          reason: "stop",
+        });
+      },
+    );
+    return { orchestrator: { handleMessage }, handleMessage };
+  }
+
+  it("opens a new consultation with a pasted image and returns convId + response", async () => {
+    const deps = makeDeps();
+    const { orchestrator, handleMessage } = makeOrchestratorDouble("The error is on line 12.");
+    deps.resolveOrchestrator = () => orchestrator;
+    const svc = createVisionHandoffService(deps);
+
+    // Register an image first (as prepareForProvider would).
+    const messages: ChatMessage[] = [
+      { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,img1" }] },
+    ];
+    await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-1" });
+
+    const result = await svc.consultVision("What error is shown?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+
+    expect("error" in result).toBe(false);
+    if (!("error" in result)) {
+      expect(result.conversationId).toBeTruthy();
+      expect(result.response).toContain("line 12");
+      expect(result.response).toContain(result.conversationId);
+      expect(result.response).toContain("dispatch CLI");
+    }
+    // The orchestrator was called with the vision model + the image.
+    expect(handleMessage).toHaveBeenCalledOnce();
+    const call = handleMessage.mock.calls[0]?.[0];
+    expect(call.modelName).toBe("umans/umans-kimi-k2.7");
+    expect(call.images).toHaveLength(1);
+    expect(call.images?.[0]?.url).toBe("data:image/png;base64,img1");
+  });
+
+  it("labels the consultation tab with an 'IMAGE - ' prefixed title", async () => {
+    const deps = makeDeps();
+    const { orchestrator } = makeOrchestratorDouble("The error is on line 12.");
+    deps.resolveOrchestrator = () => orchestrator;
+    const svc = createVisionHandoffService(deps);
+
+    // Register an image first (as prepareForProvider would).
+    const messages: ChatMessage[] = [
+      { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,img1" }] },
+    ];
+    await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-1" });
+
+    const result = await svc.consultVision("What error is shown?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+
+    expect("error" in result).toBe(false);
+    // The title was set with the IMAGE - prefix + the question.
+    expect(deps.setConversationTitle).toHaveBeenCalledOnce();
+    const [titleConvId, title] = (deps.setConversationTitle as ReturnType<typeof vi.fn>).mock
+      .calls[0];
+    // The title targets the NEW consultation conversation, not "conv-1".
+    expect(titleConvId).toBe((result as { conversationId: string }).conversationId);
+    expect(title).toBe("IMAGE - What error is shown?");
+  });
+
+  it("does not call setConversationTitle when it is not provided", async () => {
+    const deps = makeDeps({ setConversationTitle: undefined });
+    const { orchestrator } = makeOrchestratorDouble("response");
+    deps.resolveOrchestrator = () => orchestrator;
+    const svc = createVisionHandoffService(deps);
+
+    const messages: ChatMessage[] = [
+      { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,img1" }] },
+    ];
+    await svc.prepareForProvider(messages, "umans/umans-glm-5.2", { conversationId: "conv-1" });
+
+    // Should NOT throw — setConversationTitle is optional.
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+    expect("error" in result).toBe(false);
+  });
+
+  it("opens a consultation with a file path image", async () => {
+    const deps = makeDeps();
+    const { orchestrator } = makeOrchestratorDouble("It's a diagram.");
+    deps.resolveOrchestrator = () => orchestrator;
+    const svc = createVisionHandoffService(deps);
+
+    // No prior registration: the image comes from `path`, read relative to `cwd`.
+    const result = await svc.consultVision("What is this diagram?", {
+      conversationId: "conv-1",
+      path: "diagram.png",
+      cwd: "/work",
+    });
+
+    expect("error" in result).toBe(false);
+    expect(deps.readFileAsDataUrl).toHaveBeenCalledWith("diagram.png", "/work");
+  });
+
+  it("returns an error when imageId is not registered", async () => {
+    const deps = makeDeps();
+    const { orchestrator } = makeOrchestratorDouble("response");
+    deps.resolveOrchestrator = () => orchestrator;
+    const svc = createVisionHandoffService(deps);
+
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+      imageIds: [99], // not registered
+    });
+    expect("error" in result).toBe(true);
+    if ("error" in result) {
+      expect(result.error).toContain("Image 99");
+    }
+  });
+
+  it("returns an error when no orchestrator is available", async () => {
+    const deps = makeDeps();
+    // No resolveOrchestrator provided.
+    const svc = createVisionHandoffService(deps);
+    // NOTE(review): image 1 is never registered in this test, so the bare
+    // `"error" in result` check cannot tell a missing-orchestrator error from an
+    // unregistered-image error. Consider asserting on the message once the
+    // service's wording/ordering is confirmed.
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+    expect("error" in result).toBe(true);
+  });
+
+  it("returns an error when no vision model is available", async () => {
+    const deps = makeDeps();
+    // Empty catalog → nothing for the service to resolve as a vision model.
+    (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+    const { orchestrator } = makeOrchestratorDouble("response");
+    deps.resolveOrchestrator = () => orchestrator;
+    const svc = createVisionHandoffService(deps);
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+      imageIds: [1],
+    });
+    expect("error" in result).toBe(true);
+    if ("error" in result) {
+      expect(result.error).toContain("No vision-capable model");
+    }
+  });
+
+  it("returns an error when no image source is provided", async () => {
+    const deps = makeDeps();
+    const { orchestrator } = makeOrchestratorDouble("response");
+    deps.resolveOrchestrator = () => orchestrator;
+    const svc = createVisionHandoffService(deps);
+    // Neither imageIds nor path is supplied.
+    const result = await svc.consultVision("What?", {
+      conversationId: "conv-1",
+    });
+    expect("error" in result).toBe(true);
+  });
+});
diff --git a/packages/vision-handoff/src/service.ts b/packages/vision-handoff/src/service.ts
new file mode 100644
index 0000000..01245df
--- /dev/null
+++ b/packages/vision-handoff/src/service.ts
@@ -0,0 +1,684 @@
+/**
+ * Vision handoff service — the imperative shell that performs the universal,
+ * provider-agnostic vision handoff.
+ *
+ * Two capabilities:
+ * 1. **Provider preparation** (`prepareForProvider`): when a user message
+ *    carries images but the active model cannot see them, this replaces each
+ *    image chunk with a numbered placeholder (telling the model to call
+ *    `consult_vision`) and registers the image data in a per-conversation
+ *    registry for tool access. Vision-capable models receive the images
+ *    natively; once a conversation holds more images than the configured
+ *    limit, the oldest ones are replaced by cached text transcriptions.
+ * 2. **consult_vision tool** (`consultVision`): opens a NEW conversation tab with
+ *    a vision-capable model (resolved from the catalog — any provider), attaches
+ *    the image(s) + the model's specific question, waits for the response, and
+ *    returns the conversation ID + the vision model's answer. The model (e.g.
+ *    GLM 5.2) directs the analysis — asking exactly what it needs — instead of
+ *    receiving a pre-emptive generic dump. Follow-up questions go through the
+ *    dispatch CLI (the conversation ID is the bridge), not another tool call.
+ *
+ * Effects (credential store, orchestrator, filesystem) are injected. The pure
+ * decisions live in `pure.ts`. This shell wires them.
+ */
+
+import type { CredentialStore } from "@dispatch/credential-store";
+import type {
+  AgentEvent,
+  ChatMessage,
+  Chunk,
+  ImageInput,
+  Logger,
+  ModelInfo,
+  ProviderContract,
+} from "@dispatch/kernel";
+import { defineService, type ServiceHandle } from "@dispatch/kernel";
+import {
+  collectTextFromStream,
+  findVisionModelName,
+  formatConsultationTitle,
+  formatConsultResult,
+  formatImagePlaceholder,
+  formatNoVisionPlaceholder,
+  isVisionCapable,
+} from "./pure.js";
+
+/**
+ * Minimal orchestrator interface the service needs to start vision consultation
+ * turns. Defined locally (not imported from session-orchestrator) to avoid a
+ * compile-time dependency — resolved lazily at runtime via a local handle keyed
+ * to the same service ID.
+ */
+export interface OrchestratorForVision {
+  /**
+   * Run one turn in `conversationId`. `consultVision` awaits the returned
+   * promise and collects the reply from the `text-delta` and `error` events
+   * delivered to `onEvent`.
+   */
+  readonly handleMessage: (input: {
+    readonly conversationId: string;
+    readonly text: string;
+    readonly onEvent: (event: AgentEvent) => void;
+    readonly modelName?: string;
+    readonly cwd?: string;
+    readonly images?: readonly ImageInput[];
+    readonly systemPrompt?: string;
+  }) => Promise<void>;
+}
+
+/**
+ * Local handle for the session-orchestrator service (same ID, no import dep).
+ * Typed with the narrowed {@link OrchestratorForVision} view rather than the
+ * orchestrator's own interface.
+ */
+export const orchestratorLocalHandle: ServiceHandle<OrchestratorForVision> =
+  defineService<OrchestratorForVision>("session-orchestrator/orchestrator");
+
+/**
+ * Resolved vision model — a provider + its model id, ready to stream from.
+ */
+export interface ResolvedVisionModel {
+  /** Provider returned by `deps.resolveModel` for the chosen catalog entry. */
+  readonly provider: ProviderContract;
+  /** Model id returned by `deps.resolveModel`; passed to `provider.stream`. */
+  readonly model: string;
+  /** Catalog name the model was found under; passed to the orchestrator as `modelName`. */
+  readonly modelName: string;
+}
+
+/** A registered image (looked up by the consult_vision tool via imageId). */
+interface RegisteredImage {
+  /** URL copied from the image chunk at registration time. */
+  readonly url: string;
+  /** MIME type copied from the chunk; omitted when the chunk had none. */
+  readonly mimeType?: string;
+}
+
+/**
+ * Dependencies the service needs — all injected (no ambient state).
+ */
+export interface VisionHandoffDeps {
+  /**
+   * Source of the model catalog (`listCatalog`) and of per-model metadata
+   * (`getModelInfo`) used to decide vision capability.
+   */
+  readonly credentialStore: CredentialStore;
+  /** Resolve a `<credentialName>/<model>` → its provider + model id. */
+  readonly resolveModel: (
+    modelName: string,
+  ) => { provider: ProviderContract; model: string } | undefined;
+  /**
+   * Read a file from disk as a base64 data URL. Injected so the shell controls
+   * the filesystem edge. Returns the data URL, or throws on error.
+   */
+  readonly readFileAsDataUrl: (path: string, cwd?: string) => Promise<string>;
+  /**
+   * Lazily resolve the session-orchestrator (for starting vision consultation
+   * turns). Returns `undefined` when not available — `consult_vision` degrades
+   * with an error. Lazy so activation order doesn't matter.
+   */
+  readonly resolveOrchestrator?: () => OrchestratorForVision | undefined;
+  /**
+   * Get the per-conversation cached image transcriptions (imageUrl → text).
+   * Used to avoid re-transcribing old images that were compacted to text on a
+   * previous turn. Optional — when absent, compaction still works but
+   * re-transcribes every turn (no caching).
+   */
+  readonly getImageTranscriptions?: (
+    conversationId: string,
+  ) => Promise<ReadonlyMap<string, string>>;
+  /**
+   * Upsert a single image transcription into the per-conversation cache.
+   * Optional — paired with getImageTranscriptions.
+   */
+  readonly setImageTranscription?: (
+    conversationId: string,
+    imageUrl: string,
+    transcription: string,
+  ) => Promise<void>;
+  /**
+   * Save an image data URL to a tmp file and return a compact URL
+   * (`/images/<conversationId>/<imageId>.<ext>`) that can be persisted in the
+   * conversation store instead of the full data URL (which would be megabytes).
+   * The frontend serves the image via `GET /images/...`; the provider resolves
+   * it back to a data URL via {@link resolveImageUrl} at runtime. When `undefined`,
+   * data URLs pass through unchanged (images persist in SQLite — the large-DB
+   * path, for environments without tmp file support).
+   */
+  readonly saveImageToTmp?: (
+    conversationId: string,
+    dataUrl: string,
+    mimeType?: string,
+  ) => Promise<string>;
+  /**
+   * Resolve a compact URL (`/images/...`) back to a data URL by reading the tmp
+   * file. Data URLs and HTTP URLs pass through unchanged. Paired with
+   * {@link saveImageToTmp}.
+   */
+  readonly resolveImageUrl?: (url: string) => Promise<string>;
+  /**
+   * Delete a tmp image file (after it has been compacted to text — the
+   * transcription is cached, the raw image is no longer needed). Best-effort:
+   * errors are logged, not thrown.
+   */
+  readonly deleteTmpImage?: (compactUrl: string) => Promise<void>;
+  /**
+   * Delete all tmp images for a conversation (on conversation close).
+   * Best-effort.
+   */
+  readonly deleteConversationImages?: (conversationId: string) => Promise<void>;
+  /**
+   * Set the human-readable title of a conversation. Used to label vision
+   * consultation tabs with an `"IMAGE - "` prefix so they're visually
+   * distinguishable from normal conversation tabs. Backed by the conversation
+   * store's `setConversationTitle`. Optional — when absent, consultation tabs
+   * keep their default (question-derived) title.
+   */
+  readonly setConversationTitle?: (conversationId: string, title: string) => Promise<void>;
+  /** Generate a new conversation ID for a consultation. Defaults to crypto.randomUUID. */
+  readonly generateId?: () => string;
+  /** Optional logger for the service's info/warn messages; logging is skipped when absent. */
+  readonly logger?: Logger;
+}
+
+/**
+ * Public surface of the vision handoff service: capability checks, tmp-image
+ * storage, provider-side message preparation, and the `consult_vision` entry
+ * point used by the tool.
+ */
+export interface VisionHandoffService {
+  /**
+   * Whether a given model (by catalog name) is vision-capable. Uses the
+   * credential store's ModelInfo + the name heuristic. An `undefined` name is
+   * reported as not capable.
+   */
+  readonly isVisionCapable: (modelName: string | undefined) => Promise<boolean>;
+
+  /**
+   * Store images to tmp files and return compact URLs. Each input image's data
+   * URL is saved to `/tmp/dispatch/images/<conversationId>/<uuid>.<ext>` and
+   * replaced with a compact HTTP path (`/images/<conversationId>/<uuid>.<ext>`)
+   * so the persisted conversation store holds a tiny string, not megabytes of
+   * base64. When `saveImageToTmp` is not configured, data URLs pass through
+   * unchanged (backward compatible).
+   */
+  readonly storeImages: (
+    conversationId: string,
+    images: readonly ImageInput[],
+  ) => Promise<readonly ImageInput[]>;
+
+  /**
+   * Delete all tmp images for a conversation (on close). Best-effort.
+   */
+  readonly purgeConversationImages: (conversationId: string) => Promise<void>;
+
+  /**
+   * Resolve a vision-capable model from the catalog (any provider). Returns
+   * `undefined` when none is available.
+   */
+  readonly resolveVisionModel: (excludeName?: string) => Promise<ResolvedVisionModel | undefined>;
+
+  /**
+   * Transform a message list for the provider. Messages without any image
+   * chunk are returned unchanged. Otherwise compact image URLs are first
+   * resolved via `resolveImageUrl` (when configured), then:
+   * - vision-capable model: images pass through natively, except that once
+   *   the messages hold more than `imageLimit` images the oldest ones are
+   *   replaced by text transcriptions;
+   * - non-vision model: every `image` chunk is replaced with a numbered
+   *   placeholder (telling the model to call `consult_vision`) and the image
+   *   data is registered in the per-conversation registry for tool access.
+   *   When no vision model or no `conversationId` is available, a
+   *   "no vision" placeholder is used and nothing is registered.
+   * The PERSISTED history is NOT modified — only what the provider sees.
+   *
+   * NOTE(review): documented as never throwing, but the implementation in
+   * this file does not catch rejections from the injected deps (catalog
+   * lookup, URL resolution, transcription cache) — confirm the contract.
+   */
+  readonly prepareForProvider: (
+    messages: readonly ChatMessage[],
+    currentModelName: string | undefined,
+    opts?: {
+      readonly conversationId?: string;
+      readonly imageLimit?: number;
+      readonly signal?: AbortSignal;
+      readonly logger?: Logger;
+    },
+  ) => Promise<readonly ChatMessage[]>;
+
+  /**
+   * Look up a registered image by conversation ID + image ID. Returns
+   * `undefined` when the image isn't registered (e.g. after a server restart).
+   */
+  readonly getRegisteredImage: (
+    conversationId: string,
+    imageId: number,
+  ) => RegisteredImage | undefined;
+
+  /**
+   * Open a NEW vision consultation conversation: attach image(s) + the model's
+   * question to a vision-capable model, wait for the response, and return the
+   * conversation ID + the vision model's answer. The model drives the analysis
+   * — it asks exactly what it needs. Follow-ups go through the dispatch CLI.
+   *
+   * @returns The conversation ID + the vision model's response text, or an
+   *   error string (never throws — the tool surfaces it).
+   */
+  readonly consultVision: (
+    question: string,
+    opts: {
+      readonly conversationId: string;
+      readonly imageIds?: readonly number[];
+      readonly path?: string;
+      readonly cwd?: string;
+      readonly signal?: AbortSignal;
+      readonly logger?: Logger;
+    },
+  ) => Promise<
+    { readonly conversationId: string; readonly response: string } | { readonly error: string }
+  >;
+}
+
+/** Service handle for the vision handoff service, declared under the ID `"vision-handoff/service"`. */
+export const visionHandoffHandle: ServiceHandle<VisionHandoffService> =
+  defineService<VisionHandoffService>("vision-handoff/service");
+
+/** Reports whether at least one message carries an `image` chunk. Pure. */
+function hasImageChunks(messages: readonly ChatMessage[]): boolean {
+  for (const message of messages) {
+    for (const chunk of message.chunks) {
+      if (chunk.type === "image") return true;
+    }
+  }
+  return false;
+}
+
+export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHandoffService {
+  const log = deps.logger;
+  const generateId = deps.generateId ?? (() => crypto.randomUUID());
+
+  // Per-conversation image registry: conversationId → (imageId → image data).
+  // Populated by prepareForProvider; consulted by the consult_vision tool.
+  // In-memory only (cleared on restart — the user re-pastes if needed).
+  const imageRegistry = new Map<string, Map<number, RegisteredImage>>();
+
+  /** Fetches the credential store's metadata for a catalog model name. */
+  async function getInfo(modelName: string): Promise<ModelInfo | undefined> {
+    const info = await deps.credentialStore.getModelInfo(modelName);
+    return info;
+  }
+
+  /**
+   * Asks `findVisionModelName` for a catalog entry (forwarding `excludeName`)
+   * and resolves it to a provider + model id. Yields `undefined` when no name
+   * is found or the name cannot be resolved by `deps.resolveModel`.
+   */
+  async function resolveVisionModel(
+    excludeName?: string,
+  ): Promise<ResolvedVisionModel | undefined> {
+    const entries = await deps.credentialStore.listCatalog();
+    const modelName = await findVisionModelName(entries, getInfo, excludeName);
+    if (modelName === undefined) return undefined;
+
+    const target = deps.resolveModel(modelName);
+    return target === undefined
+      ? undefined
+      : { provider: target.provider, model: target.model, modelName };
+  }
+
+  /**
+   * Compact images for a vision-capable model. When the messages hold more
+   * image chunks than `opts.imageLimit`, the oldest surplus images are replaced
+   * — in the returned copy only — by `[Compacted image]: <transcription>` text
+   * chunks; the most recent `imageLimit` images stay native. The input messages
+   * are never mutated, so the persisted history is unaffected.
+   *
+   * Transcriptions come from `deps.getImageTranscriptions` when cached,
+   * otherwise from a one-off request to a catalog vision model, and are written
+   * back via `deps.setImageTranscription` when a conversation ID is known.
+   * Without the caching deps it still works but re-transcribes on every call.
+   *
+   * @param messages - Provider-bound messages, oldest first.
+   * @param opts - `imageLimit` (undefined or <= 0 disables compaction) and the
+   *   `conversationId` used for the transcription cache; `signal` and `logger`
+   *   are not consulted here.
+   * @param currentModelName - Currently unused.
+   * @returns `messages` itself when nothing is compacted, otherwise a new array.
+   */
+  async function compactImagesForVisionModel(
+    messages: readonly ChatMessage[],
+    opts:
+      | {
+          readonly conversationId?: string;
+          readonly imageLimit?: number;
+          readonly signal?: AbortSignal;
+          readonly logger?: Logger;
+        }
+      | undefined,
+    currentModelName: string | undefined,
+  ): Promise<readonly ChatMessage[]> {
+    void currentModelName; // reserved for future model-specific compaction logic
+    const limit = opts?.imageLimit;
+    // No limit or limit <= 0 → pass all images through (compaction disabled).
+    if (limit === undefined || limit <= 0) return messages;
+
+    // Collect all image chunks in order (oldest first, across all messages).
+    const imageEntries: { msgIdx: number; chunkIdx: number; url: string }[] = [];
+    for (const [mi, msg] of messages.entries()) {
+      for (const [ci, chunk] of msg.chunks.entries()) {
+        if (chunk.type === "image") {
+          imageEntries.push({ msgIdx: mi, chunkIdx: ci, url: chunk.url });
+        }
+      }
+    }
+
+    // If within the limit, pass everything through natively.
+    if (imageEntries.length <= limit) return messages;
+
+    // The oldest (imageEntries.length - limit) images need transcription.
+    const toTranscribeCount = imageEntries.length - limit;
+    const toTranscribe = imageEntries.slice(0, toTranscribeCount);
+    // URLs still referenced by an image that stays native; their backing tmp
+    // file must not be deleted even if an older copy gets compacted.
+    const keptUrls = new Set(imageEntries.slice(toTranscribeCount).map((e) => e.url));
+
+    // Load cached transcriptions.
+    const convId = opts?.conversationId;
+    const cache =
+      convId !== undefined && deps.getImageTranscriptions !== undefined
+        ? await deps.getImageTranscriptions(convId)
+        : new Map<string, string>();
+
+    // Transcribe any that aren't cached yet (via the vision model). The model is
+    // resolved lazily, at most once, so a fully cached turn costs no catalog
+    // lookup.
+    const transcriptions = new Map<string, string>(cache);
+    let vision: ResolvedVisionModel | undefined;
+    let visionResolved = false;
+    for (const entry of toTranscribe) {
+      if (transcriptions.has(entry.url)) continue;
+      if (!visionResolved) {
+        vision = await resolveVisionModel();
+        visionResolved = true;
+      }
+      if (vision === undefined) {
+        // No vision model available for transcription — use a placeholder.
+        transcriptions.set(
+          entry.url,
+          "[Image was compacted — no vision model available to transcribe it.]",
+        );
+        continue;
+      }
+      try {
+        const prompt =
+          "Describe this image in detail. Include visible text (transcribe verbatim), " +
+          "key objects, layout, and notable details. This description will replace " +
+          "the image in a conversation history, so be thorough.";
+        const userMessage: ChatMessage = {
+          role: "user",
+          chunks: [
+            { type: "text", text: prompt },
+            { type: "image", url: entry.url },
+          ],
+        };
+        const stream = vision.provider.stream([userMessage], [], {
+          model: vision.model,
+          systemPrompt: "You are a vision assistant. Describe images faithfully and thoroughly.",
+        });
+        const description = (await collectTextFromStream(stream)).trim();
+        const text =
+          description.length > 0 ? description : "[Image transcription produced no output.]";
+        transcriptions.set(entry.url, text);
+        // Cache it in the conversation store (if available).
+        if (convId !== undefined && deps.setImageTranscription !== undefined) {
+          await deps.setImageTranscription(convId, entry.url, text);
+        }
+        // The image has been transcribed to text — delete the tmp file
+        // (the transcription is cached, the raw image is no longer needed),
+        // unless a newer, still-native image points at the same URL.
+        // NOTE(review): `prepareForProvider` passes messages whose URLs already
+        // went through `deps.resolveImageUrl`, so when that dep is configured
+        // `entry.url` is presumably a data URL rather than the compact
+        // `/images/...` URL that `deleteTmpImage` (and a small cache key)
+        // expects — confirm the intended order of resolve vs. compaction.
+        if (deps.deleteTmpImage !== undefined && !keptUrls.has(entry.url)) {
+          try {
+            await deps.deleteTmpImage(entry.url);
+          } catch {
+            // Best-effort — don't let cleanup failure break the turn.
+          }
+        }
+      } catch (err) {
+        const msg = err instanceof Error ? err.message : String(err);
+        log?.warn("vision-handoff: image compaction transcription failed", { error: msg });
+        transcriptions.set(entry.url, `[Image transcription failed: ${msg}]`);
+      }
+    }
+
+    // Build the provider messages: replace transcribed images with text,
+    // keep recent images (within the limit) native. Replacement is decided by
+    // chunk POSITION, not URL, so a recent image that happens to share its URL
+    // with an old one is not compacted along with it.
+    const compactedPositions = new Set(toTranscribe.map((e) => `${e.msgIdx}:${e.chunkIdx}`));
+    const result: ChatMessage[] = [];
+    for (const [mi, msg] of messages.entries()) {
+      if (!msg.chunks.some((c) => c.type === "image")) {
+        result.push(msg);
+        continue;
+      }
+      const newChunks: Chunk[] = [];
+      for (const [ci, chunk] of msg.chunks.entries()) {
+        if (chunk.type === "image" && compactedPositions.has(`${mi}:${ci}`)) {
+          const transcription = transcriptions.get(chunk.url);
+          if (transcription !== undefined) {
+            newChunks.push({ type: "text", text: `[Compacted image]: ${transcription}` });
+          } else {
+            newChunks.push(chunk); // fallback: keep the image
+          }
+        } else {
+          newChunks.push(chunk);
+        }
+      }
+      result.push({ role: msg.role, chunks: newChunks });
+    }
+    return result;
+  }
+
+  /**
+   * Return a copy of `messages` in which every image chunk's URL has been
+   * passed through `deps.resolveImageUrl`. Messages without image chunks are
+   * reused as-is; the input array itself is returned when the dep is not
+   * configured or no message carries an image.
+   *
+   * URLs are resolved one at a time, in message/chunk order. A rejection from
+   * `deps.resolveImageUrl` propagates to the caller.
+   */
+  async function resolveImageUrlsInMessages(
+    messages: readonly ChatMessage[],
+  ): Promise<readonly ChatMessage[]> {
+    // Capture the optional dep once so it stays narrowed across the awaits
+    // below without a non-null assertion.
+    const resolveUrl = deps.resolveImageUrl;
+    if (resolveUrl === undefined) return messages;
+    if (!hasImageChunks(messages)) return messages;
+
+    const result: ChatMessage[] = [];
+    for (const msg of messages) {
+      if (!msg.chunks.some((c) => c.type === "image")) {
+        result.push(msg);
+        continue;
+      }
+      const newChunks: Chunk[] = [];
+      for (const chunk of msg.chunks) {
+        if (chunk.type === "image") {
+          const dataUrl = await resolveUrl(chunk.url);
+          newChunks.push({
+            type: "image",
+            url: dataUrl,
+            ...(chunk.mimeType !== undefined ? { mimeType: chunk.mimeType } : {}),
+          });
+        } else {
+          newChunks.push(chunk);
+        }
+      }
+      result.push({ role: msg.role, chunks: newChunks });
+    }
+    return result;
+  }
+
+  const service: VisionHandoffService = {
+    /** An `undefined` name is never capable; otherwise defer to the pure check with the model's metadata. */
+    async isVisionCapable(modelName: string | undefined): Promise<boolean> {
+      return modelName === undefined
+        ? false
+        : isVisionCapable(modelName, await getInfo(modelName));
+    },
+
+    /**
+     * Swap each `data:` URL for the URL returned by `deps.saveImageToTmp`,
+     * preserving order and `mimeType`. Non-`data:` images are passed along
+     * untouched; without the dep the input array is returned as-is.
+     */
+    async storeImages(
+      conversationId: string,
+      images: readonly ImageInput[],
+    ): Promise<readonly ImageInput[]> {
+      if (deps.saveImageToTmp === undefined) return images;
+      const stored: ImageInput[] = [];
+      for (const image of images) {
+        if (!image.url.startsWith("data:")) {
+          stored.push(image);
+          continue;
+        }
+        const url = await deps.saveImageToTmp(conversationId, image.url, image.mimeType);
+        stored.push(image.mimeType === undefined ? { url } : { url, mimeType: image.mimeType });
+      }
+      return stored;
+    },
+
+    /**
+     * Release everything held for a conversation: the in-memory image registry
+     * entry and, when `deps.deleteConversationImages` is configured, its tmp
+     * files. Best-effort — a failing delete is logged as a warning and not
+     * rethrown.
+     */
+    async purgeConversationImages(conversationId: string): Promise<void> {
+      // Drop the registry entry first and unconditionally: it holds the image
+      // payloads in memory and is otherwise never released for the lifetime of
+      // the service. `prepareForProvider` repopulates it on the next turn.
+      imageRegistry.delete(conversationId);
+
+      if (deps.deleteConversationImages === undefined) return;
+      try {
+        await deps.deleteConversationImages(conversationId);
+      } catch (err) {
+        log?.warn("vision-handoff: failed to purge conversation images", {
+          conversationId,
+          error: err instanceof Error ? err.message : String(err),
+        });
+      }
+    },
+
+    resolveVisionModel,
+
+    /**
+     * Build the provider's view of `messages` (the input is never mutated).
+     *
+     * - No image chunks: `messages` is returned as-is.
+     * - Vision-capable `currentModelName`: image URLs are resolved and the
+     *   result is handed to `compactImagesForVisionModel`.
+     * - Otherwise every image chunk becomes a text placeholder. Images are
+     *   numbered 1, 2, 3… in message order; when both a vision model and a
+     *   `conversationId` are available each one is also registered under that
+     *   number so `consultVision` can find it.
+     *
+     * The conversation's registry is rebuilt from scratch on each such call, so
+     * IDs left over from an earlier, longer history cannot resolve to stale
+     * images.
+     */
+    async prepareForProvider(
+      messages: readonly ChatMessage[],
+      currentModelName: string | undefined,
+      opts?: {
+        readonly conversationId?: string;
+        readonly imageLimit?: number;
+        readonly signal?: AbortSignal;
+        readonly logger?: Logger;
+      },
+    ): Promise<readonly ChatMessage[]> {
+      // Fast path: no images anywhere → nothing to do.
+      if (!hasImageChunks(messages)) return messages;
+
+      // Resolve compact URLs (/images/...) → data URLs for the provider.
+      // The persisted chunks store compact URLs (tiny strings); the provider
+      // needs data URLs (read from tmp files at runtime).
+      const resolved = await resolveImageUrlsInMessages(messages);
+
+      const isCapable =
+        currentModelName !== undefined &&
+        (await isVisionCapable(currentModelName, await getInfo(currentModelName)));
+
+      // ── Vision-capable model: image compaction ──────────────────────────
+      // When the conversation has more images than the limit, the oldest images
+      // are transcribed to text (one-time, cached) and stripped from the
+      // provider messages. Recent images (within the limit) stay native.
+      if (isCapable) {
+        return compactImagesForVisionModel(resolved, opts, currentModelName);
+      }
+
+      // ── Non-vision model: placeholders + consult_vision ──────────────────
+      const vision = await resolveVisionModel();
+      const convId = opts?.conversationId;
+      // Numbered placeholders only make sense when the tool can act on them.
+      const canConsult = vision !== undefined && convId !== undefined;
+
+      // Replace each image chunk with a placeholder. Assign sequential 1-based
+      // IDs across all messages and collect the images into a FRESH registry
+      // for this conversation.
+      const freshRegistry = new Map<number, RegisteredImage>();
+      let seqId = 0;
+      const result: ChatMessage[] = [];
+      for (const msg of resolved) {
+        if (!msg.chunks.some((c) => c.type === "image")) {
+          result.push(msg);
+          continue;
+        }
+        const newChunks: Chunk[] = [];
+        for (const chunk of msg.chunks) {
+          if (chunk.type === "image") {
+            seqId++;
+            if (canConsult) {
+              freshRegistry.set(seqId, {
+                url: chunk.url,
+                ...(chunk.mimeType !== undefined ? { mimeType: chunk.mimeType } : {}),
+              });
+            }
+            newChunks.push({
+              type: "text",
+              text: canConsult ? formatImagePlaceholder(seqId) : formatNoVisionPlaceholder(),
+            });
+          } else {
+            newChunks.push(chunk);
+          }
+        }
+        result.push({ role: msg.role, chunks: newChunks });
+      }
+
+      // Publish in one step, replacing whatever an earlier call registered. When
+      // consultation is impossible the previous registry is left untouched.
+      if (convId !== undefined && vision !== undefined) {
+        imageRegistry.set(convId, freshRegistry);
+      }
+      return result;
+    },
+
+    /** Registry lookup; `undefined` when the conversation or the image ID is unknown. */
+    getRegisteredImage(conversationId: string, imageId: number): RegisteredImage | undefined {
+      const forConversation = imageRegistry.get(conversationId);
+      return forConversation === undefined ? undefined : forConversation.get(imageId);
+    },
+
+    /**
+     * Run a one-shot vision consultation in a NEW conversation.
+     *
+     * Steps: resolve the orchestrator and a vision model, gather the images
+     * (`opts.imageIds` from the registry of `opts.conversationId`, `opts.path`
+     * via `deps.readFileAsDataUrl`), optionally title the new conversation,
+     * then run a single orchestrator turn and accumulate its `text-delta`
+     * events.
+     *
+     * Failures are reported as `{ error }` rather than thrown: missing
+     * orchestrator / vision model / images, an unreadable file, a rejected
+     * catalog lookup or turn, an `error` event with no response text, or a
+     * signal that was already aborted on entry. When an `error` event arrives
+     * alongside some response text, the partial text is returned.
+     *
+     * @param question - Text sent to the vision model as the user message.
+     * @param opts - Source conversation, image sources, `cwd`, and an optional
+     *   abort signal (only checked before the consultation starts).
+     * @returns The new conversation's ID plus the formatted response, or an error.
+     */
+    async consultVision(
+      question: string,
+      opts: {
+        readonly conversationId: string;
+        readonly imageIds?: readonly number[];
+        readonly path?: string;
+        readonly cwd?: string;
+        readonly signal?: AbortSignal;
+        readonly logger?: Logger;
+      },
+    ): Promise<
+      { readonly conversationId: string; readonly response: string } | { readonly error: string }
+    > {
+      // Don't open a new conversation for a call that was already cancelled.
+      if (opts.signal?.aborted === true) {
+        return { error: "Vision consultation was cancelled before it started." };
+      }
+
+      const orchestrator = deps.resolveOrchestrator?.();
+      if (orchestrator === undefined) {
+        return {
+          error: "The session orchestrator is not available — cannot start a vision consultation.",
+        };
+      }
+
+      // The catalog lookup goes through injected deps that may reject; keep the
+      // "never throws" contract by converting that into an error result.
+      let vision: ResolvedVisionModel | undefined;
+      try {
+        vision = await resolveVisionModel();
+      } catch (err) {
+        const msg = err instanceof Error ? err.message : String(err);
+        return { error: `Vision consultation failed: ${msg}` };
+      }
+      if (vision === undefined) {
+        return {
+          error:
+            "No vision-capable model is available in the catalog. Install or configure one (e.g. kimi) to enable image analysis.",
+        };
+      }
+
+      // Collect image data URLs to attach.
+      const images: ImageInput[] = [];
+      if (opts.imageIds !== undefined) {
+        for (const id of opts.imageIds) {
+          const img = service.getRegisteredImage(opts.conversationId, id);
+          if (img === undefined) {
+            return {
+              error: `Image ${id} is not registered. It may have been lost after a server restart — ask the user to re-paste the image.`,
+            };
+          }
+          images.push({
+            url: img.url,
+            ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}),
+          });
+        }
+      }
+      if (opts.path !== undefined) {
+        try {
+          const dataUrl = await deps.readFileAsDataUrl(opts.path, opts.cwd);
+          images.push({ url: dataUrl });
+        } catch (err) {
+          const msg = err instanceof Error ? err.message : String(err);
+          return { error: `Failed to read image file "${opts.path}": ${msg}` };
+        }
+      }
+      if (images.length === 0) {
+        return {
+          error:
+            "No image to consult about. Provide imageIds (for pasted images) or path (for a file).",
+        };
+      }
+
+      // Start a NEW conversation with the vision model.
+      const consultationId = generateId();
+      log?.info("vision-handoff: starting consultation", {
+        consultationId,
+        visionModel: vision.modelName,
+        imageCount: images.length,
+        fromConversation: opts.conversationId,
+      });
+
+      // Label the consultation tab with an "IMAGE - " prefix so it's visually
+      // distinguishable from normal conversation tabs. Set BEFORE the turn
+      // starts so the tab shows the correct title from the first moment (the
+      // store keeps a non-"Untitled" title on first message append).
+      if (deps.setConversationTitle !== undefined) {
+        try {
+          await deps.setConversationTitle(consultationId, formatConsultationTitle(question));
+        } catch (err) {
+          // Best-effort — don't let a title-write failure break the consultation.
+          log?.warn("vision-handoff: failed to set consultation title", {
+            consultationId,
+            error: err instanceof Error ? err.message : String(err),
+          });
+        }
+      }
+
+      // Accumulate streamed text; remember the last error event in case the
+      // turn produced no usable text.
+      let responseText = "";
+      let errorMessage = "";
+      try {
+        await orchestrator.handleMessage({
+          conversationId: consultationId,
+          text: question,
+          images,
+          modelName: vision.modelName,
+          ...(opts.cwd !== undefined ? { cwd: opts.cwd } : {}),
+          systemPrompt:
+            "You are a vision assistant. A developer who cannot see images is asking you specific questions about an image they attached. Answer their question precisely and thoroughly.",
+          onEvent: (event: AgentEvent) => {
+            if (event.type === "text-delta") {
+              responseText += event.delta;
+            } else if (event.type === "error") {
+              errorMessage = event.message;
+            }
+          },
+        });
+      } catch (err) {
+        const msg = err instanceof Error ? err.message : String(err);
+        return { error: `Vision consultation failed: ${msg}` };
+      }
+
+      // An error with no text is a failure; an error after some text still
+      // returns the partial answer.
+      if (errorMessage.length > 0 && responseText.trim().length === 0) {
+        return { error: `Vision consultation failed: ${errorMessage}` };
+      }
+
+      const response = formatConsultResult(consultationId, responseText);
+      return { conversationId: consultationId, response };
+    },
+  };
+
+  return service;
+}
diff --git a/packages/vision-handoff/src/tool.ts b/packages/vision-handoff/src/tool.ts
new file mode 100644
index 0000000..86be2ed
--- /dev/null
+++ b/packages/vision-handoff/src/tool.ts
@@ -0,0 +1,137 @@
+/**
+ * consult_vision tool — lets any model (vision-capable or not) consult a
+ * vision-capable model about an image by opening a NEW conversation tab.
+ *
+ * The tool attaches image(s) + the model's specific question to a vision-capable
+ * model (resolved from the catalog — e.g. Kimi), waits for the response, and
+ * returns the conversation ID + the vision model's answer. The MODEL directs the
+ * analysis — it asks exactly what it needs to know — instead of receiving a
+ * pre-emptive generic dump.
+ *
+ * For images PASTED into the chat, the model references them by `imageIds` (from
+ * the "[Image N attached]" placeholders the orchestrator injected). For image
+ * FILES on disk, the model passes a `path`.
+ *
+ * Follow-up questions are NOT handled by this tool — the model uses the dispatch
+ * CLI to continue the vision conversation (the returned conversation ID is the
+ * bridge; the model can load the `dispatch-cli` skill for the exact commands).
+ */
+
+import type { ToolContract, ToolExecuteContext, ToolResult } from "@dispatch/kernel";
+import type { VisionHandoffService } from "./service.js";
+
+/**
+ * Build the `consult_vision` tool around a {@link VisionHandoffService}.
+ *
+ * `execute` validates the raw arguments (`question` required; at least one of
+ * `imageIds` / `path`), forwards them to `service.consultVision` together with
+ * the caller's conversation ID, cwd, signal and logger, and maps the outcome
+ * to a `ToolResult`. Validation problems and service errors are returned with
+ * `isError: true`; the call is wrapped in a `consult_vision.execute` log span.
+ *
+ * `imageIds` must consist solely of positive integers — the service numbers
+ * pasted images 1, 2, 3… — and any other entry is rejected rather than
+ * silently dropped, so the consultation never runs with fewer images than the
+ * model asked for.
+ */
+export function createConsultVisionTool(service: VisionHandoffService): ToolContract {
+  /** Type guard for a usable image ID: a safe integer >= 1. */
+  const isImageId = (value: unknown): value is number =>
+    typeof value === "number" && Number.isSafeInteger(value) && value >= 1;
+
+  return {
+    name: "consult_vision",
+    description:
+      "Consult a vision-capable model (e.g. Kimi) about an image by opening a new " +
+      "conversation tab. Attaches the image(s) + your specific question, waits for " +
+      "the vision model's response, and returns the conversation ID + the answer. " +
+      "Use this when you cannot view an image (e.g. a pasted screenshot or diagram) " +
+      "and need to know what it shows — ask a SPECIFIC question (e.g. 'What error " +
+      "message is on line 12?' rather than 'describe this image'). The conversation " +
+      "ID is returned so follow-up questions can be asked via the dispatch CLI.",
+    parameters: {
+      type: "object",
+      properties: {
+        question: {
+          type: "string",
+          description:
+            "Your specific question about the image. Be precise — the vision model " +
+            "will answer exactly this. E.g. 'What error message is displayed?' or " +
+            "'Compare the layout of these two screenshots.'",
+        },
+        imageIds: {
+          type: "array",
+          items: { type: "number" },
+          description:
+            "The IDs of pasted images to attach (from the '[Image N attached]' " +
+            "placeholders in the conversation). Pass multiple to attach several " +
+            "images to one consultation (e.g. [1, 2] to compare them).",
+        },
+        path: {
+          type: "string",
+          description:
+            "Path to an image FILE on disk to attach (alternative to imageIds for " +
+            "code-referenced images). Relative paths resolve against the cwd.",
+        },
+      },
+      required: ["question"],
+    },
+    concurrencySafe: true,
+    async execute(args: unknown, ctx: ToolExecuteContext): Promise<ToolResult> {
+      const input = args as {
+        question?: unknown;
+        imageIds?: unknown;
+        path?: unknown;
+      } | null;
+
+      const question = input?.question;
+      if (typeof question !== "string" || question.trim().length === 0) {
+        return {
+          content: "Error: 'question' is required and must be a non-empty string.",
+          isError: true,
+        };
+      }
+
+      const imageIds = input?.imageIds;
+      const path = input?.path;
+
+      // Parse imageIds (must be a non-empty array of positive integers if present).
+      let parsedImageIds: number[] | undefined;
+      if (imageIds !== undefined) {
+        if (!Array.isArray(imageIds)) {
+          return { content: "Error: 'imageIds' must be an array of numbers.", isError: true };
+        }
+        const entries: readonly unknown[] = imageIds;
+        if (entries.length === 0) {
+          return { content: "Error: 'imageIds' must contain at least one number.", isError: true };
+        }
+        // Report the offending positions instead of dropping them.
+        const badPositions: number[] = [];
+        for (const [index, entry] of entries.entries()) {
+          if (!isImageId(entry)) badPositions.push(index);
+        }
+        if (badPositions.length > 0) {
+          return {
+            content:
+              "Error: 'imageIds' must contain only positive integers " +
+              `(invalid entries at index ${badPositions.join(", ")}).`,
+            isError: true,
+          };
+        }
+        parsedImageIds = entries.filter(isImageId);
+      }
+
+      // path must be a string if present.
+      let parsedPath: string | undefined;
+      if (path !== undefined) {
+        if (typeof path !== "string" || path.trim().length === 0) {
+          return { content: "Error: 'path' must be a non-empty string.", isError: true };
+        }
+        parsedPath = path;
+      }
+
+      // At least one image source is required.
+      if (parsedImageIds === undefined && parsedPath === undefined) {
+        return {
+          content:
+            "Error: provide 'imageIds' (for pasted images) or 'path' (for a file) " +
+            "to attach an image to the consultation.",
+          isError: true,
+        };
+      }
+
+      const span = ctx.log.span("consult_vision.execute", {
+        imageCount: (parsedImageIds?.length ?? 0) + (parsedPath !== undefined ? 1 : 0),
+      });
+      try {
+        const result = await service.consultVision(question, {
+          conversationId: ctx.conversationId ?? "",
+          ...(parsedImageIds !== undefined ? { imageIds: parsedImageIds } : {}),
+          ...(parsedPath !== undefined ? { path: parsedPath } : {}),
+          ...(ctx.cwd !== undefined ? { cwd: ctx.cwd } : {}),
+          signal: ctx.signal,
+          logger: ctx.log,
+        });
+        span.end({ attrs: { ok: !("error" in result) } });
+        if ("error" in result) {
+          return { content: result.error, isError: true };
+        }
+        return { content: result.response };
+      } catch (err: unknown) {
+        // The service reports failures as `{ error }`; this only guards against
+        // an unexpected throw so the span is always closed.
+        span.end({ err });
+        return {
+          content: `Error during vision consultation: ${err instanceof Error ? err.message : String(err)}`,
+          isError: true,
+        };
+      }
+    },
+  };
+}
diff --git a/packages/vision-handoff/tsconfig.json b/packages/vision-handoff/tsconfig.json
new file mode 100644
index 0000000..b5439aa
--- /dev/null
+++ b/packages/vision-handoff/tsconfig.json
@@ -0,0 +1,12 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": { "rootDir": "src", "outDir": "dist", "composite": true },
+  "include": ["src/**/*.ts"],
+  "references": [
+    { "path": "../kernel" },
+    { "path": "../wire" },
+    { "path": "../conversation-store" },
+    { "path": "../credential-store" },
+    { "path": "../openai-stream" }
+  ]
+}
diff --git a/packages/wire/src/index.test.ts b/packages/wire/src/index.test.ts
index 3f07e00..81d10c1 100644
--- a/packages/wire/src/index.test.ts
+++ b/packages/wire/src/index.test.ts
@@ -8,7 +8,7 @@
  */
 
 import { describe, expect, it } from "vitest";
-import type { Computer, ComputerEntry, Workspace } from "./index.js";
+import type { Chunk, Computer, ComputerEntry, ImageChunk, ImageInput, Workspace } from "./index.js";
 
 describe("@dispatch/wire — Computer / Workspace shapes", () => {
   it("a Computer literal satisfies the Computer type", () => {
@@ -57,3 +57,32 @@ describe("@dispatch/wire — Computer / Workspace shapes", () => {
     expect(local.defaultComputerId).toBeNull();
   });
 });
+
+describe("@dispatch/wire — ImageChunk / ImageInput shapes", () => {
+  it("an ImageChunk carries a data URL and optional mimeType", () => {
+    const c: ImageChunk = {
+      type: "image",
+      url: "data:image/png;base64,iVBORw0KGgo=",
+      mimeType: "image/png",
+    };
+    expect(c.type).toBe("image");
+    expect(c.url).toContain("base64");
+    expect(c.mimeType).toBe("image/png");
+  });
+
+  it("an ImageChunk with only a url is valid (mimeType optional)", () => {
+    const c: ImageChunk = { type: "image", url: "https://example.com/cat.png" };
+    expect(c.mimeType).toBeUndefined();
+  });
+
+  it("ImageInput mirrors ImageChunk's url semantics", () => {
+    const input: ImageInput = { url: "data:image/jpeg;base64,/9j/4AAQ" };
+    expect(input.url).toContain("jpeg");
+  });
+
+  it("ImageChunk is a member of the Chunk union (assignable)", () => {
+    const chunk: Chunk = { type: "image", url: "data:image/png;base64,x" };
+    // The `: Chunk` annotation above is the compile-time proof; the expect below is only a runtime sanity check.
+    expect(chunk.type).toBe("image");
+  });
+});
diff --git a/packages/wire/src/index.ts b/packages/wire/src/index.ts
index 6d10e0f..113f684 100644
--- a/packages/wire/src/index.ts
+++ b/packages/wire/src/index.ts
@@ -36,7 +36,8 @@ export type Chunk =
   | ToolCallChunk
   | ToolResultChunk
   | ErrorChunk
-  | SystemChunk;
+  | SystemChunk
+  | ImageChunk;
 
 /** A piece of plain text content from the assistant or user. */
 export interface TextChunk {
@@ -113,6 +114,46 @@ export interface SystemChunk {
 }
 
 /**
+ * An image attached to a message (e.g. a user-pasted screenshot or
+ * photo). Carries a `url` that is EITHER a base64 data URL
+ * (`data:image/png;base64,…`) OR an `http(s)://` URL. Vision-capable models
+ * receive it natively (the provider serializes it to its image-content
+ * format); non-vision models never see it directly — the orchestrator's
+ * **vision handoff** transcribes it to a text description (via a
+ * vision-capable model) and feeds that text instead, so a text-only model can
+ * still reason about the image's contents.
+ *
+ * When a transcription was performed, it is persisted as a separate `text`
+ * chunk alongside the `image` chunk in the SAME user message, so the
+ * description is reused on every later turn (no re-transcription) and a
+ * client renders both the original image and its textual analysis.
+ */
+export interface ImageChunk {
+  readonly type: "image";
+  /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */
+  readonly url: string;
+  /**
+   * MIME type of the image (e.g. `"image/png"`). Inferred from the data URL
+   * when absent; present so a client can render an icon/label without parsing
+   * the URL. Optional — callers that only have a URL omit it.
+   */
+  readonly mimeType?: string;
+}
+
+/**
+ * An image a client attaches to a chat message (`ChatRequest.images`). The
+ * transport-facing input shape; the orchestrator converts each `ImageInput`
+ * into an `ImageChunk` on the persisted user message. Carries the same `url`
+ * semantics as `ImageChunk.url`.
+ */
+export interface ImageInput {
+  /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */
+  readonly url: string;
+  /** MIME type (e.g. `"image/png"`). Optional — inferred from the data URL when absent. */
+  readonly mimeType?: string;
+}
+
+/**
  * A chat message: a role plus an ordered sequence of chunks. Messages are the
  * unit passed to and from the provider; chunks are the unit persisted and
  * rendered.
diff --git a/tsconfig.json b/tsconfig.json
index d31b44a..f97edde 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -41,6 +41,9 @@
       "path": "./packages/credential-store"
     },
     {
+      "path": "./packages/vision-handoff"
+    },
+    {
       "path": "./packages/exec-backend"
     },
     {
author	Adam Malczewski <[email protected]>	2026-06-27 20:48:24 +0900
committer	Adam Malczewski <[email protected]>	2026-06-27 20:48:24 +0900
commit	04356c8678ae8dd1d7ddca2d0460b514116adc2e (patch)
tree	6c81894ef02d062570b12f4d3a871e58600dcb9c
parent	3184b10e614ce6249c83aa111368e98f6689f456 (diff)
parent	b24ed99e89bc657e8c98c7cef8608e0c0b7594da (diff)
download	dispatch-04356c8678ae8dd1d7ddca2d0460b514116adc2e.tar.gz dispatch-04356c8678ae8dd1d7ddca2d0460b514116adc2e.zip