feat(vision-handoff): implement vision for capable models and universal vision handoff

author: Adam Malczewski <[email protected]> 2026-06-27 03:40:38 +0900
committer: Adam Malczewski <[email protected]> 2026-06-27 03:40:38 +0900
commit: d5633cf6e007eaf8255a44529a638d2466a74ba3 (patch)
tree: 14fe72f5b585eb72c763073b4e7022b914bdbafb
parent: ad9d135e583c99a0d93327115defa43187cde1c3 (diff)
download: dispatch-d5633cf6e007eaf8255a44529a638d2466a74ba3.tar.gz
dispatch-d5633cf6e007eaf8255a44529a638d2466a74ba3.zip
35 files changed, 1727 insertions, 19 deletions
diff --git a/bun.lock b/bun.lock
index 2261ba8..8a913d0 100644
--- a/bun.lock
+++ b/bun.lock
@@ -103,6 +103,7 @@
         "@dispatch/tool-youtube-transcript": "workspace:*",
         "@dispatch/transport-http": "workspace:*",
         "@dispatch/transport-ws": "workspace:*",
+        "@dispatch/vision-handoff": "workspace:*",
       },
     },
     "packages/journal-sink": {
@@ -361,6 +362,15 @@
       "name": "@dispatch/ui-contract",
       "version": "0.2.0",
     },
+    "packages/vision-handoff": {
+      "name": "@dispatch/vision-handoff",
+      "version": "0.0.0",
+      "dependencies": {
+        "@dispatch/credential-store": "workspace:*",
+        "@dispatch/kernel": "workspace:*",
+        "@dispatch/openai-stream": "workspace:*",
+      },
+    },
     "packages/wire": {
       "name": "@dispatch/wire",
       "version": "0.12.0",
@@ -461,6 +471,8 @@
 
     "@dispatch/ui-contract": ["@dispatch/ui-contract@workspace:packages/ui-contract"],
 
+    "@dispatch/vision-handoff": ["@dispatch/vision-handoff@workspace:packages/vision-handoff"],
+
     "@dispatch/wire": ["@dispatch/wire@workspace:packages/wire"],
 
     "@esbuild/aix-ppc64": ["@esbuild/[email protected]", "", { "os": "aix", "cpu": "ppc64" }, "sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg=="],
diff --git a/packages/host-bin/package.json b/packages/host-bin/package.json
index 65ea305..b5ab954 100644
--- a/packages/host-bin/package.json
+++ b/packages/host-bin/package.json
@@ -33,6 +33,7 @@
     "@dispatch/surface-loaded-extensions": "workspace:*",
     "@dispatch/surface-registry": "workspace:*",
     "@dispatch/transport-ws": "workspace:*",
-    "@dispatch/system-prompt": "workspace:*"
+    "@dispatch/system-prompt": "workspace:*",
+    "@dispatch/vision-handoff": "workspace:*"
   }
 }
diff --git a/packages/host-bin/src/main.ts b/packages/host-bin/src/main.ts
index 8633052..a5dabab 100644
--- a/packages/host-bin/src/main.ts
+++ b/packages/host-bin/src/main.ts
@@ -43,6 +43,7 @@ import { extension as toolWriteFileExt } from "@dispatch/tool-write-file";
 import { extension as toolYoutubeTranscriptExt } from "@dispatch/tool-youtube-transcript";
 import { createTransportHttpExtension } from "@dispatch/transport-http";
 import { createTransportWsExtension } from "@dispatch/transport-ws";
+import { extension as visionHandoffExt } from "@dispatch/vision-handoff";
 import type { ChildHandle } from "./collector-supervisor.js";
 import { createCollectorSupervisor } from "./collector-supervisor.js";
 import { configMapToAccess, envToConfigMap } from "./config.js";
@@ -204,6 +205,13 @@ async function boot(): Promise<void> {
   const extensions: Extension[] = [
     ...CORE_EXTENSIONS,
     createCredentialStoreExtension({ credentials }),
+    // vision-handoff activates AFTER credential-store (it resolves the
+    // credential-store service at activate time to find vision-capable models).
+    // Placed here, not in CORE_EXTENSIONS, so the service is available when it
+    // activates. The session-orchestrator resolves the vision-handoff service
+    // LAZILY (per-turn), so activation order between vision-handoff and the
+    // session-orchestrator doesn't matter.
+    visionHandoffExt,
     ...externalExtensions,
   ];
 
diff --git a/packages/host-bin/tsconfig.json b/packages/host-bin/tsconfig.json
index 2b1edf5..305274c 100644
--- a/packages/host-bin/tsconfig.json
+++ b/packages/host-bin/tsconfig.json
@@ -60,6 +60,9 @@
     },
     {
       "path": "../transport-ws"
+    },
+    {
+      "path": "../vision-handoff"
     }
   ]
 }
diff --git a/packages/kernel/src/contracts/conversation.ts b/packages/kernel/src/contracts/conversation.ts
index f074c52..80da86e 100644
--- a/packages/kernel/src/contracts/conversation.ts
+++ b/packages/kernel/src/contracts/conversation.ts
@@ -12,6 +12,8 @@ export type {
   ConversationMeta,
   ConversationStatus,
   ErrorChunk,
+  ImageChunk,
+  ImageInput,
   Role,
   StepId,
   StepMetrics,
diff --git a/packages/kernel/src/contracts/index.ts b/packages/kernel/src/contracts/index.ts
index 09e0a56..28e0a0b 100644
--- a/packages/kernel/src/contracts/index.ts
+++ b/packages/kernel/src/contracts/index.ts
@@ -19,6 +19,8 @@ export type {
   ConversationMeta,
   ConversationStatus,
   ErrorChunk,
+  ImageChunk,
+  ImageInput,
   Role,
   StepId,
   StepMetrics,
diff --git a/packages/kernel/src/contracts/provider.ts b/packages/kernel/src/contracts/provider.ts
index b6dc8ca..3137073 100644
--- a/packages/kernel/src/contracts/provider.ts
+++ b/packages/kernel/src/contracts/provider.ts
@@ -114,6 +114,16 @@ export interface ModelInfo {
   readonly displayName?: string;
   /** The model's max context window in tokens (e.g. 200000). Optional — providers that don't report it leave it undefined. */
   readonly contextWindow?: number;
+  /**
+   * Whether this model can natively accept image input (vision/multimodal).
+   * When `true`, image chunks in a user message are passed through to the
+   * provider serialized to its image-content format. When `false`/absent, the
+   * orchestrator's vision handoff transcribes images to text (via a
+   * vision-capable model) before the model sees them. Optional — providers
+   * that cannot detect it leave it undefined (treated as non-vision); a
+   * provider that knows a model is vision-capable sets it `true`.
+   */
+  readonly vision?: boolean;
 }
 
 /**
diff --git a/packages/openai-stream/src/convert-messages.test.ts b/packages/openai-stream/src/convert-messages.test.ts
index 3520eb5..57c7d81 100644
--- a/packages/openai-stream/src/convert-messages.test.ts
+++ b/packages/openai-stream/src/convert-messages.test.ts
@@ -35,6 +35,100 @@ describe("convertMessages", () => {
     expect(result).toEqual([{ role: "user", content: "Hello, world!" }]);
   });
 
+  it("converts a user message with a text + image chunk to a multimodal content array", () => {
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "What is in this image?" },
+          { type: "image", url: "data:image/png;base64,iVBORw0KGgo=" },
+        ],
+      },
+    ];
+
+    const result = convertMessages(messages);
+    expect(result).toEqual([
+      {
+        role: "user",
+        content: [
+          { type: "text", text: "What is in this image?" },
+          { type: "image_url", image_url: { url: "data:image/png;base64,iVBORw0KGgo=" } },
+        ],
+      },
+    ]);
+  });
+
+  it("converts an image-only user message (no text) to a content array with just the image", () => {
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [{ type: "image", url: "https://example.com/cat.png" }],
+      },
+    ];
+
+    const result = convertMessages(messages);
+    expect(result).toEqual([
+      {
+        role: "user",
+        content: [{ type: "image_url", image_url: { url: "https://example.com/cat.png" } }],
+      },
+    ]);
+  });
+
+  it("converts a user message with multiple images interspersed with text", () => {
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "Compare these:" },
+          { type: "image", url: "data:image/png;base64,aaa" },
+          { type: "text", text: "and" },
+          { type: "image", url: "data:image/jpeg;base64,bbb" },
+        ],
+      },
+    ];
+
+    const result = convertMessages(messages);
+    expect(result).toHaveLength(1);
+    const content = result[0]?.content;
+    expect(Array.isArray(content)).toBe(true);
+    if (Array.isArray(content)) {
+      expect(content).toHaveLength(4);
+      expect(content[0]).toEqual({ type: "text", text: "Compare these:" });
+      expect(content[1]).toEqual({
+        type: "image_url",
+        image_url: { url: "data:image/png;base64,aaa" },
+      });
+      expect(content[2]).toEqual({ type: "text", text: "and" });
+      expect(content[3]).toEqual({
+        type: "image_url",
+        image_url: { url: "data:image/jpeg;base64,bbb" },
+      });
+    }
+  });
+
+  it("skips empty text parts in a multimodal message but keeps images", () => {
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "" },
+          { type: "image", url: "data:image/png;base64,x" },
+        ],
+      },
+    ];
+
+    const result = convertMessages(messages);
+    const content = result[0]?.content;
+    expect(Array.isArray(content)).toBe(true);
+    if (Array.isArray(content)) {
+      // Empty text part is dropped; only the image remains.
+      expect(content).toEqual([
+        { type: "image_url", image_url: { url: "data:image/png;base64,x" } },
+      ]);
+    }
+  });
+
   it("converts an assistant message with text only", () => {
     const messages: ChatMessage[] = [
       {
diff --git a/packages/openai-stream/src/convert-messages.ts b/packages/openai-stream/src/convert-messages.ts
index e830243..eba3575 100644
--- a/packages/openai-stream/src/convert-messages.ts
+++ b/packages/openai-stream/src/convert-messages.ts
@@ -1,8 +1,28 @@
 import type { ChatMessage, Chunk } from "@dispatch/kernel";
 
+/** A text part within a multimodal OpenAI content array. */
+export interface OpenAITextPart {
+  readonly type: "text";
+  readonly text: string;
+}
+
+/** An image part within a multimodal OpenAI content array (OpenAI vision format). */
+export interface OpenAIImagePart {
+  readonly type: "image_url";
+  readonly image_url: { readonly url: string };
+}
+
+/**
+ * A part of a multimodal message content array. When a user message contains
+ * at least one image chunk, its content is serialized as an array of these
+ * parts (OpenAI's vision format). Plain-text messages keep a string `content`
+ * for byte-stability with providers that only accept strings.
+ */
+export type OpenAIContentPart = OpenAITextPart | OpenAIImagePart;
+
 export interface OpenAIMessage {
   readonly role: "system" | "user" | "assistant" | "tool";
-  readonly content: string | null;
+  readonly content: string | null | readonly OpenAIContentPart[];
   readonly tool_calls?: readonly OpenAIToolCall[];
   readonly tool_call_id?: string;
 }
@@ -49,6 +69,29 @@ function convertSystemMessage(msg: ChatMessage): OpenAIMessage {
 }
 
 function convertUserMessage(msg: ChatMessage): OpenAIMessage {
+  // If the message has image chunks, serialize as a multimodal content array
+  // (OpenAI vision format): text parts + image_url parts in chunk order.
+  // Plain text-only messages keep a string `content` for byte-stability with
+  // providers that only accept a string (and to keep prompt-cache prefixes
+  // unchanged for the common no-image case).
+  const hasImage = msg.chunks.some((c) => c.type === "image");
+  if (hasImage) {
+    const parts: OpenAIContentPart[] = [];
+    for (const chunk of msg.chunks) {
+      if (chunk.type === "text") {
+        if (chunk.text.length > 0) {
+          parts.push({ type: "text", text: chunk.text });
+        }
+      } else if (chunk.type === "image") {
+        parts.push({ type: "image_url", image_url: { url: chunk.url } });
+      }
+      // Non-text/non-image chunks (tool-call, thinking, etc.) are not part of a
+      // user message's provider content and are skipped here.
+    }
+    // hasImage guarantees at least one image part, so the "" fallback is only defensive.
+    return { role: "user", content: parts.length > 0 ? parts : "" };
+  }
+
   const text = msg.chunks
     .filter((c): c is Extract<Chunk, { type: "text" }> => c.type === "text")
     .map((c) => c.text)
diff --git a/packages/openai-stream/src/index.ts b/packages/openai-stream/src/index.ts
index bd2f673..3f76b99 100644
--- a/packages/openai-stream/src/index.ts
+++ b/packages/openai-stream/src/index.ts
@@ -1,8 +1,14 @@
-export type { OpenAIMessage, OpenAIToolCall } from "./convert-messages.js";
+export type {
+  OpenAIContentPart,
+  OpenAIImagePart,
+  OpenAIMessage,
+  OpenAITextPart,
+  OpenAIToolCall,
+} from "./convert-messages.js";
 export { convertMessages } from "./convert-messages.js";
 export type { OpenAITool } from "./convert-tools.js";
 export { convertTools } from "./convert-tools.js";
-export { parseModelList } from "./listModels.js";
+export { isVisionModelId, parseModelList } from "./listModels.js";
 export { parseSSELines } from "./parse-sse.js";
 export type { CreateOpenAICompatProviderOpts } from "./provider.js";
 export { createOpenAICompatProvider } from "./provider.js";
diff --git a/packages/openai-stream/src/listModels.test.ts b/packages/openai-stream/src/listModels.test.ts
index c2438bc..3acf46e 100644
--- a/packages/openai-stream/src/listModels.test.ts
+++ b/packages/openai-stream/src/listModels.test.ts
@@ -1,7 +1,7 @@
 import type { ApiKeyCredentials, ModelInfo, ProviderContract } from "@dispatch/kernel";
 import type { FetchLike } from "@dispatch/trace-replay";
 import { describe, expect, it, vi } from "vitest";
-import { parseModelList } from "./listModels.js";
+import { isVisionModelId, parseModelList } from "./listModels.js";
 import { createOpenAICompatProvider } from "./provider.js";
 
 function makeProvider(fetchFn: FetchLike, apiKey = "sk-test-1234567890abcdef"): ProviderContract {
@@ -35,6 +35,48 @@ describe("listModels — pure mapping (parseModelList)", () => {
     const result = parseModelList([]);
     expect(result).toEqual([]);
   });
+
+  it("extracts contextWindow from common field names", () => {
+    const result = parseModelList([
+      { id: "m1", context_length: 128000 },
+      { id: "m2", context_window: 200000 },
+      { id: "m3", max_context_length: 64000 },
+      { id: "m4", max_tokens: 8000 },
+    ]);
+    expect(result).toEqual([
+      { id: "m1", contextWindow: 128000 },
+      { id: "m2", contextWindow: 200000 },
+      { id: "m3", contextWindow: 64000 },
+      { id: "m4", contextWindow: 8000 },
+    ]);
+  });
+});
+
+describe("listModels — vision capability detection", () => {
+  it("isVisionModelId returns true for kimi-family model ids", () => {
+    expect(isVisionModelId("kimi-k2.7")).toBe(true);
+    expect(isVisionModelId("Kimi-K2.7")).toBe(true); // case-insensitive
+    expect(isVisionModelId("moonshot/kimi-k2-thinking")).toBe(true);
+  });
+
+  it("isVisionModelId returns false for non-kimi model ids", () => {
+    expect(isVisionModelId("glm-5.2")).toBe(false);
+    expect(isVisionModelId("deepseek-v4-flash")).toBe(false);
+    expect(isVisionModelId("umans-coder")).toBe(false);
+  });
+
+  it("parseModelList sets vision: true on kimi-family models", () => {
+    const result = parseModelList([
+      { id: "kimi-k2.7", context_length: 200000 },
+      { id: "glm-5.2", context_length: 128000 },
+      { id: "deepseek-v4-flash" },
+    ]);
+    expect(result).toEqual([
+      { id: "kimi-k2.7", contextWindow: 200000, vision: true },
+      { id: "glm-5.2", contextWindow: 128000 },
+      { id: "deepseek-v4-flash" },
+    ]);
+  });
 });
 
 describe("listModels — provider contract", () => {
diff --git a/packages/openai-stream/src/listModels.ts b/packages/openai-stream/src/listModels.ts
index 0e94c43..273fee3 100644
--- a/packages/openai-stream/src/listModels.ts
+++ b/packages/openai-stream/src/listModels.ts
@@ -24,17 +24,39 @@ interface OpenAIModelListResponse {
 }
 
 /**
+ * Whether a model id is vision-capable (can natively accept image input).
+ *
+ * The OpenAI-compatible `/models` endpoint does not reliably report image
+ * capabilities, so this is a hardcoded heuristic by model id: a model whose id
+ * contains "kimi" (e.g. `kimi-k2.7`, `moonshot/kimi-k2.7`) is vision-capable;
+ * all others are treated as non-vision. This is the single source of truth —
+ * the orchestrator's vision handoff and the `read_image` tool both consult the
+ * `ModelInfo.vision` flag this sets, so adding a model here enables vision
+ * everywhere. Pure: id → boolean, no I/O.
+ *
+ * (When an endpoint gains reliable vision reporting, this can be replaced with
+ * a real capability check without changing callers.)
+ */
+export function isVisionModelId(id: string): boolean {
+  const lower = id.toLowerCase();
+  return lower.includes("kimi");
+}
+
+/**
  * Pure mapping: raw OpenAI-compatible model list → ModelInfo[].
- * Extracts `contextWindow` from common field names (providers vary).
- * Extracted for direct unit testing with no I/O.
+ * Extracts `contextWindow` from common field names (providers vary) and
+ * detects vision capability via {@link isVisionModelId}. Extracted for direct
+ * unit testing with no I/O.
  */
 export function parseModelList(data: readonly OpenAIModelEntry[]): readonly ModelInfo[] {
   return data.map((entry) => {
     const contextWindow =
       entry.context_length ?? entry.context_window ?? entry.max_context_length ?? entry.max_tokens;
+    const vision = isVisionModelId(entry.id);
     return {
       id: entry.id,
       ...(contextWindow !== undefined ? { contextWindow } : {}),
+      ...(vision ? { vision } : {}),
     };
   });
 }
diff --git a/packages/session-orchestrator/src/extension.ts b/packages/session-orchestrator/src/extension.ts
index 5afffd8..d080e90 100644
--- a/packages/session-orchestrator/src/extension.ts
+++ b/packages/session-orchestrator/src/extension.ts
@@ -11,6 +11,7 @@ import {
   createSessionOrchestrator,
   createWarmService,
   sessionOrchestratorHandle,
+  visionHandoffLocalHandle,
 } from "./orchestrator.js";
 import { selectFirstProvider } from "./pure.js";
 import { filterRemoteIncompatibleTools, toolsFilter } from "./tools-filter.js";
@@ -93,6 +94,20 @@ export function activate(host: HostAPI): void {
         return undefined;
       }
     },
+    resolveVisionHandoff: () => {
+      // Lazily resolve the vision-handoff service. Returns undefined when the
+      // vision-handoff extension isn't loaded (images pass through unchanged —
+      // correct for vision-capable models; the feature degrades off cleanly for
+      // text-only turns). Lazy so activation order doesn't matter; the
+      // activated-manifests guard avoids a getService throw when absent.
+      const loaded = host.getExtensions().some((m) => m.id === "vision-handoff");
+      if (!loaded) return undefined;
+      try {
+        return host.getService(visionHandoffLocalHandle);
+      } catch {
+        return undefined;
+      }
+    },
   });
 
   host.provideService(sessionOrchestratorHandle, orchestrator);
diff --git a/packages/session-orchestrator/src/orchestrator.ts b/packages/session-orchestrator/src/orchestrator.ts
index 96cd3a3..ac1eaf4 100644
--- a/packages/session-orchestrator/src/orchestrator.ts
+++ b/packages/session-orchestrator/src/orchestrator.ts
@@ -5,6 +5,7 @@ import type {
   CompactionResult,
   ConversationStatus,
   EventHookDescriptor,
+  ImageInput,
   Logger,
   ModelInfo,
   ProviderContract,
@@ -32,11 +33,52 @@ import {
 } from "./pure.js";
 import type { ToolAssembly } from "./tools-filter.js";
 
+// --- Vision handoff (lazy, optional) ---
+
+/**
+ * Minimal contract the vision-handoff service satisfies. Defined here (not
+ * imported from the vision-handoff package) so the orchestrator has NO
+ * compile-time dependency on it — the service is resolved lazily at runtime
+ * (like the message-queue / system-prompt services), and the feature degrades
+ * off cleanly when the extension isn't loaded (images pass through unchanged,
+ * which is correct for vision-capable models and a no-op for text-only turns).
+ *
+ * `transcribeForProvider` transforms a message list for the provider: if the
+ * active model is vision-capable, messages pass through unchanged; otherwise
+ * image chunks are replaced with text descriptions (transcribed via a
+ * vision-capable model). Never throws — degrades to placeholders.
+ */
+export interface VisionHandoffService {
+  readonly transcribeForProvider: (
+    messages: readonly ChatMessage[],
+    currentModelName: string | undefined,
+    opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+  ) => Promise<readonly ChatMessage[]>;
+}
+
+/**
+ * Local handle for the vision-handoff service, keyed by the same ID the
+ * vision-handoff extension registers under (`"vision-handoff/service"`). Defined
+ * locally (not imported) so the orchestrator has no compile-time dependency on
+ * the vision-handoff package — the service is resolved lazily at runtime, and
+ * the feature degrades off cleanly when the extension isn't loaded.
+ */
+export const visionHandoffLocalHandle: ServiceHandle<VisionHandoffService> =
+  defineService<VisionHandoffService>("vision-handoff/service");
+
 // --- Broadcast hub types ---
 
 export interface StartTurnInput {
   readonly conversationId: string;
   readonly text: string;
+  /**
+   * Images attached to this turn (e.g. user-pasted screenshots). Each is
+   * appended as an `image` chunk on the persisted user message. For a
+   * vision-capable model the images pass through to the provider natively; for
+   * a non-vision model the vision handoff transcribes them to text first.
+   * Optional — omit for a text-only turn.
+   */
+  readonly images?: readonly ImageInput[];
   readonly modelName?: string;
   readonly cwd?: string;
   /**
@@ -75,6 +117,12 @@ export type StartTurnResult =
 export interface EnqueueInput {
   readonly conversationId: string;
   readonly text: string;
+  /**
+   * Images attached (the steering / opening message analog of
+   * `StartTurnInput.images`). Threaded to `startTurn` when the conversation is
+   * idle (the message starts a turn). Additive optional.
+   */
+  readonly images?: readonly ImageInput[];
   /** Workspace to stamp on a new conversation. Defaults to `"default"`. */
   readonly workspaceId?: string;
   /**
@@ -289,6 +337,8 @@ export interface SessionOrchestrator {
     workspaceId?: string;
     /** Explicit system-prompt override — see {@link StartTurnInput.systemPrompt}. */
     systemPrompt?: string;
+    /** Images attached to this turn — see {@link StartTurnInput.images}. */
+    images?: readonly ImageInput[];
   }): Promise<void>;
 }
 
@@ -335,6 +385,17 @@ export interface SessionOrchestratorDeps {
    * order doesn't matter.
    */
   readonly resolveSystemPrompt?: () => SystemPromptService | undefined;
+  /**
+   * Lazily resolves the vision-handoff service, or `undefined` when the
+   * vision-handoff extension isn't loaded. Used to transcribe image chunks to
+   * text for non-vision models before they reach the provider (so a text-only
+   * model can still reason about pasted/code images). When `undefined`, images
+   * pass through unchanged (correct for vision-capable models; a text-only model
+   * would then receive image content its API may reject — the feature degrades
+   * off cleanly for text-only turns since there are no images). Lazy so
+   * activation order doesn't matter; called per-turn.
+   */
+  readonly resolveVisionHandoff?: () => VisionHandoffService | undefined;
   /** Apply the per-turn tools filter chain. Injected for testability. */
   readonly applyToolsFilter: (assembly: ToolAssembly) => Promise<ToolAssembly>;
   /** Base logger (auto-scoped to this extension); childed per turn for span capture. */
@@ -437,6 +498,7 @@ export function createSessionOrchestrator(
     reasoningEffortOverride: ReasoningEffort | undefined,
     workspaceId: string,
     systemPromptOverride: string | undefined,
+    images: readonly ImageInput[] | undefined,
   ): void {
     const turnId = generateTurnId();
     const controller = new AbortController();
@@ -558,7 +620,7 @@ export function createSessionOrchestrator(
         const effectiveModelName = resolveModelName(modelName, storedModel);
 
         const history = await deps.conversationStore.load(conversationId);
-        const userMsg = buildUserMessage(text);
+        const userMsg = buildUserMessage(text, images);
 
         // Workspace assignment for new conversations happens BEFORE
         // effective-cwd resolution (see workspaceSetupPromise above) so
@@ -697,9 +759,32 @@ export function createSessionOrchestrator(
                 return [{ role: "user", chunks: [{ type: "text", text: steerText }] }];
               };
 
+        // Vision handoff: transform the message list for the provider. When the
+        // active model is vision-capable, images pass through natively (no-op).
+        // When it is NOT vision-capable, image chunks are transcribed to text
+        // descriptions via a vision-capable model — so a text-only model can
+        // still reason about images. The PERSISTED user message keeps the
+        // original image chunks (appended below); only the provider's view is
+        // transcribed. When the vision-handoff service isn't loaded, images pass
+        // through unchanged (correct for vision models; text-only models would
+        // then receive image content their API may reject — degrades off cleanly
+        // for text-only turns with no images).
+        const visionHandoff = deps.resolveVisionHandoff?.();
+        let providerMessages: readonly ChatMessage[] = [...history, userMsg];
+        if (visionHandoff !== undefined) {
+          providerMessages = await visionHandoff.transcribeForProvider(
+            providerMessages,
+            effectiveModelName,
+            {
+              signal: controller.signal,
+              ...(turnLogger !== undefined ? { logger: turnLogger } : {}),
+            },
+          );
+        }
+
         const opts: RunTurnInput = {
           provider,
-          messages: [...history, userMsg],
+          messages: providerMessages,
           tools: assembled.tools,
           dispatch,
           emit: emitAndAccumulate,
@@ -805,6 +890,7 @@ export function createSessionOrchestrator(
       reasoningEffort,
       workspaceId,
       systemPrompt,
+      images,
     }) {
       if (activeTurns.has(conversationId)) {
         return { started: false, reason: "already-active" };
@@ -818,18 +904,20 @@ export function createSessionOrchestrator(
         reasoningEffort,
         workspaceId ?? "default",
         systemPrompt,
+        images,
       );
       const turn = activeTurns.get(conversationId);
       const turnId = turn !== undefined ? turn.turnId : "";
       return { started: true, turnId };
     },
 
-    enqueue({ conversationId, text, workspaceId, computerId }) {
+    enqueue({ conversationId, text, workspaceId, computerId, images }) {
       const result = orchestrator.startTurn({
         conversationId,
         text,
         ...(workspaceId !== undefined ? { workspaceId } : {}),
         ...(computerId !== undefined ? { computerId } : {}),
+        ...(images !== undefined ? { images } : {}),
       });
       if (result.started) {
         return { startedTurn: true, queue: [] };
@@ -914,6 +1002,7 @@ export function createSessionOrchestrator(
       reasoningEffort,
       workspaceId,
       systemPrompt,
+      images,
     }) {
       const turnInput: StartTurnInput = {
         conversationId,
@@ -924,6 +1013,7 @@ export function createSessionOrchestrator(
         ...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
         ...(workspaceId !== undefined ? { workspaceId } : {}),
         ...(systemPrompt !== undefined ? { systemPrompt } : {}),
+        ...(images !== undefined ? { images } : {}),
       };
       const result = orchestrator.startTurn(turnInput);
       if (!result.started) {
diff --git a/packages/session-orchestrator/src/pure.test.ts b/packages/session-orchestrator/src/pure.test.ts
index c75cb82..7a574f1 100644
--- a/packages/session-orchestrator/src/pure.test.ts
+++ b/packages/session-orchestrator/src/pure.test.ts
@@ -26,6 +26,39 @@ describe("buildUserMessage", () => {
     expect(msg.role).toBe("user");
     expect(msg.chunks[0]).toEqual({ type: "text", text: "" });
   });
+
+  it("appends image chunks after the text chunk when images are given", () => {
+    const msg = buildUserMessage("look at this", [
+      { url: "data:image/png;base64,aaa" },
+      { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" },
+    ]);
+    expect(msg.chunks).toHaveLength(3);
+    expect(msg.chunks[0]).toEqual({ type: "text", text: "look at this" });
+    expect(msg.chunks[1]).toEqual({ type: "image", url: "data:image/png;base64,aaa" });
+    expect(msg.chunks[2]).toEqual({
+      type: "image",
+      url: "data:image/jpeg;base64,bbb",
+      mimeType: "image/jpeg",
+    });
+  });
+
+  it("builds an image-only message when text is empty", () => {
+    const msg = buildUserMessage("", [{ url: "data:image/png;base64,zzz" }]);
+    expect(msg.chunks).toHaveLength(1);
+    expect(msg.chunks[0]).toEqual({ type: "image", url: "data:image/png;base64,zzz" });
+  });
+
+  it("includes mimeType when provided", () => {
+    const msg = buildUserMessage("hi", [
+      { url: "data:image/webp;base64,x", mimeType: "image/webp" },
+    ]);
+    expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBe("image/webp");
+  });
+
+  it("omits mimeType when not provided", () => {
+    const msg = buildUserMessage("hi", [{ url: "https://example.com/x.png" }]);
+    expect((msg.chunks[1] as { mimeType?: string }).mimeType).toBeUndefined();
+  });
 });
 
 describe("selectFirstProvider", () => {
diff --git a/packages/session-orchestrator/src/pure.ts b/packages/session-orchestrator/src/pure.ts
index 2208e8f..0d2068f 100644
--- a/packages/session-orchestrator/src/pure.ts
+++ b/packages/session-orchestrator/src/pure.ts
@@ -1,12 +1,40 @@
 import type {
   ChatMessage,
+  Chunk,
+  ImageInput,
   ProviderContract,
   ReasoningEffort,
   ToolDispatchPolicy,
 } from "@dispatch/kernel";
 
-export function buildUserMessage(text: string): ChatMessage {
-  return { role: "user", chunks: [{ type: "text", text }] };
+/**
+ * Build the persisted user message for a turn. When `images` are provided, each
+ * is appended as an `image` chunk AFTER the text chunk, so the persisted message
+ * carries both the prompt text and the attached images (the frontend renders
+ * the images; vision-capable providers receive them natively; non-vision
+ * providers have them transcribed by the vision handoff before streaming).
+ *
+ * Pure: inputs → a ChatMessage, no I/O.
+ */
+export function buildUserMessage(text: string, images?: readonly ImageInput[]): ChatMessage {
+  const chunks: Chunk[] = [];
+  if (text.length > 0) {
+    chunks.push({ type: "text", text });
+  }
+  if (images !== undefined) {
+    for (const img of images) {
+      chunks.push({
+        type: "image",
+        url: img.url,
+        ...(img.mimeType !== undefined ? { mimeType: img.mimeType } : {}),
+      });
+    }
+  }
+  // Image-only messages are valid; with no text and no images, keep one empty text chunk.
+  if (chunks.length === 0) {
+    chunks.push({ type: "text", text: "" });
+  }
+  return { role: "user", chunks };
 }
 
 // ── Provider-error retry backoff schedule ───────────────────────────────────
diff --git a/packages/transport-contract/src/contract.types.test.ts b/packages/transport-contract/src/contract.types.test.ts
index 9d3d904..34ff544 100644
--- a/packages/transport-contract/src/contract.types.test.ts
+++ b/packages/transport-contract/src/contract.types.test.ts
@@ -20,6 +20,7 @@ import type {
   LspServerState,
   LspStatusResponse,
   McpStatusResponse,
+  ModelsResponse,
   SetConversationComputerRequest,
   SetCwdRequest,
   SetWorkspaceDefaultComputerRequest,
@@ -55,6 +56,18 @@ const _chatWithoutComputer: ChatRequest = {
   message: "hello",
 };
 
+// ─── ChatRequest.images (additive optional) ──────────────────────────────────
+
+const _chatWithImages: ChatRequest = {
+  message: "What's in this screenshot?",
+  images: [{ url: "data:image/png;base64,iVBORw0KGgo=", mimeType: "image/png" }],
+};
+
+const _chatWithHttpImage: ChatRequest = {
+  message: "analyze this",
+  images: [{ url: "https://example.com/diagram.png" }],
+};
+
 // ─── Computer list / single response ─────────────────────────────────────────
 
 const _computer: Computer = {
@@ -255,6 +268,35 @@ describe("transport-contract types compile and are exported", () => {
     expect(_chatWithComputer.computerId).toBe("prod-box");
   });
 
+  // ─── ChatRequest.images (additive optional) ──────────────────────────────
+
+  it("ChatRequest: images is additive optional (omittable)", () => {
+    expect(_chatWithoutComputer.images).toBeUndefined();
+  });
+
+  it("ChatRequest: carries images (data URL) when set", () => {
+    expect(_chatWithImages.images).toHaveLength(1);
+    expect(_chatWithImages.images?.[0]?.url).toContain("base64");
+    expect(_chatWithImages.images?.[0]?.mimeType).toBe("image/png");
+  });
+
+  it("ChatRequest: carries images (http URL, mimeType optional)", () => {
+    expect(_chatWithHttpImage.images?.[0]?.url).toBe("https://example.com/diagram.png");
+    expect(_chatWithHttpImage.images?.[0]?.mimeType).toBeUndefined();
+  });
+
+  it("ModelsResponse: ModelMetadata carries optional vision flag", () => {
+    const resp: ModelsResponse = {
+      models: ["umans/kimi-k2.7", "umans/glm-5.2"],
+      modelInfo: {
+        "umans/kimi-k2.7": { contextWindow: 200000, vision: true },
+        "umans/glm-5.2": { contextWindow: 128000 },
+      },
+    };
+    expect(resp.modelInfo?.["umans/kimi-k2.7"]?.vision).toBe(true);
+    expect(resp.modelInfo?.["umans/glm-5.2"]?.vision).toBeUndefined();
+  });
+
   // ─── Computers ───────────────────────────────────────────────────────────
 
   it("ComputerListResponse: carries entries with usage counts", () => {
diff --git a/packages/transport-contract/src/index.ts b/packages/transport-contract/src/index.ts
index 6a9a29f..0444f29 100644
--- a/packages/transport-contract/src/index.ts
+++ b/packages/transport-contract/src/index.ts
@@ -26,6 +26,7 @@ import type {
   ComputerEntry,
   ConversationMeta,
   ConversationStatus,
+  ImageInput,
   QueuedMessage,
   ReasoningEffort,
   StoredChunk,
@@ -41,6 +42,8 @@ export type {
   ComputerEntry,
   ConversationMeta,
   ConversationStatus,
+  ImageChunk,
+  ImageInput,
   QueuedMessage,
   ReasoningEffort,
   StepMetrics,
@@ -68,6 +71,19 @@ export interface ChatRequest {
   readonly message: string;
 
   /**
+   * Images attached to this turn (e.g. a user-pasted screenshot). Each entry's
+   * `url` is a base64 data URL (`data:image/…;base64,…`) or an `http(s)://`
+   * URL. The server converts these to `image` chunks on the persisted user
+   * message. For a VISION-capable model (e.g. kimi), the images are passed
+   * through to the provider natively. For a NON-vision model (e.g. glm-5.2),
+   * the server's vision handoff transcribes each image to a text description
+   * (via a vision-capable model) and feeds that text instead — so a text-only
+   * model can still reason about the image's contents. Optional — omit for a
+   * text-only turn (backward compatible).
+   */
+  readonly images?: readonly ImageInput[];
+
+  /**
    * The model to use, as a model name in `<credentialName>/<model>` form — one
    * of the exact strings returned by `GET /models`. Omit to use the server's
    * default credential + model.
@@ -124,6 +140,14 @@ export interface ModelsResponse {
 /** Per-model metadata returned alongside the model catalog. */
 export interface ModelMetadata {
   readonly contextWindow?: number;
+  /**
+   * Whether this model can natively accept image input (vision/multimodal).
+   * When `true`, image chunks in a user message are passed through to the
+   * provider. When `false`/absent, the server's vision handoff transcribes
+   * images to text before the model sees them. A client may use this to show a
+   * vision badge in the model picker. Optional — absent when unknown.
+   */
+  readonly vision?: boolean;
 }
 
 /**
diff --git a/packages/transport-http/src/app.ts b/packages/transport-http/src/app.ts
index 4fb295e..a9a23da 100644
--- a/packages/transport-http/src/app.ts
+++ b/packages/transport-http/src/app.ts
@@ -294,11 +294,14 @@ export function createApp(opts: CreateServerOptions): Hono {
   app.get("/models", async (c) => {
     try {
       const models = await opts.credentialStore.listCatalog();
-      const modelInfo: Record<string, { contextWindow?: number }> = {};
+      const modelInfo: Record<string, { contextWindow?: number; vision?: boolean }> = {};
       for (const modelName of models) {
         const info = await opts.credentialStore.getModelInfo(modelName);
-        if (info?.contextWindow !== undefined) {
-          modelInfo[modelName] = { contextWindow: info.contextWindow };
+        if (info?.contextWindow !== undefined || info?.vision === true) {
+          const entry: { contextWindow?: number; vision?: boolean } = {};
+          if (info?.contextWindow !== undefined) entry.contextWindow = info.contextWindow;
+          if (info?.vision === true) entry.vision = true;
+          modelInfo[modelName] = entry;
         }
       }
       const body: ModelsResponse = {
@@ -398,8 +401,16 @@ export function createApp(opts: CreateServerOptions): Hono {
       return c.json({ error: result.error }, 400);
     }
 
-    const { conversationId, message, model, cwd, computerId, reasoningEffort, workspaceId } =
-      result;
+    const {
+      conversationId,
+      message,
+      model,
+      cwd,
+      computerId,
+      reasoningEffort,
+      workspaceId,
+      images,
+    } = result;
     log.info("chat: request accepted", {
       conversationId,
       hasModel: model !== undefined,
@@ -407,6 +418,7 @@ export function createApp(opts: CreateServerOptions): Hono {
       hasComputerId: computerId !== undefined,
       hasReasoningEffort: reasoningEffort !== undefined,
       hasWorkspaceId: workspaceId !== undefined,
+      imageCount: images?.length ?? 0,
     });
 
     const events: AgentEvent[] = [];
@@ -457,6 +469,7 @@ export function createApp(opts: CreateServerOptions): Hono {
       ...(computerId !== undefined ? { computerId } : {}),
       ...(reasoningEffort !== undefined ? { reasoningEffort } : {}),
       ...(workspaceId !== undefined ? { workspaceId } : {}),
+      ...(images !== undefined ? { images } : {}),
     };
 
     opts.orchestrator
diff --git a/packages/transport-http/src/logic.test.ts b/packages/transport-http/src/logic.test.ts
index fc8302e..67632f3 100644
--- a/packages/transport-http/src/logic.test.ts
+++ b/packages/transport-http/src/logic.test.ts
@@ -182,6 +182,69 @@ describe("parseChatBody", () => {
       expect(result.reasoningEffort).toBeUndefined();
     }
   });
+
+  // ── images ──────────────────────────────────────────────────────────────
+
+  it("parses images array with data URLs", () => {
+    const result = parseChatBody(
+      {
+        message: "what is this?",
+        images: [
+          { url: "data:image/png;base64,aaa" },
+          { url: "data:image/jpeg;base64,bbb", mimeType: "image/jpeg" },
+        ],
+      },
+      fakeId,
+    );
+    expect(isParseError(result)).toBe(false);
+    if (!isParseError(result)) {
+      expect(result.images).toHaveLength(2);
+      expect(result.images?.[0]?.url).toBe("data:image/png;base64,aaa");
+      expect(result.images?.[1]?.mimeType).toBe("image/jpeg");
+    }
+  });
+
+  it("parses images with http URLs", () => {
+    const result = parseChatBody(
+      { message: "hi", images: [{ url: "https://example.com/x.png" }] },
+      fakeId,
+    );
+    expect(isParseError(result)).toBe(false);
+    if (!isParseError(result)) {
+      expect(result.images?.[0]?.url).toBe("https://example.com/x.png");
+    }
+  });
+
+  it("returns error when images is not an array", () => {
+    const result = parseChatBody({ message: "hi", images: "not-an-array" }, fakeId);
+    expect(isParseError(result)).toBe(true);
+  });
+
+  it("returns error when an image lacks a url", () => {
+    const result = parseChatBody({ message: "hi", images: [{ mimeType: "image/png" }] }, fakeId);
+    expect(isParseError(result)).toBe(true);
+  });
+
+  it("returns error when an image url is empty", () => {
+    const result = parseChatBody({ message: "hi", images: [{ url: "" }] }, fakeId);
+    expect(isParseError(result)).toBe(true);
+  });
+
+  it("omits images when absent (backward compatible)", () => {
+    const result = parseChatBody({ message: "hi" }, fakeId);
+    expect(isParseError(result)).toBe(false);
+    if (!isParseError(result)) {
+      expect(result.images).toBeUndefined();
+    }
+  });
+
+  it("omits images when the array is empty", () => {
+    const result = parseChatBody({ message: "hi", images: [] }, fakeId);
+    expect(isParseError(result)).toBe(false);
+    if (!isParseError(result)) {
+      expect(result.images).toBeUndefined();
+    }
+  });
 });
 
 describe("parseSinceSeq", () => {
diff --git a/packages/transport-http/src/logic.ts b/packages/transport-http/src/logic.ts
index 97ad426..a928147 100644
--- a/packages/transport-http/src/logic.ts
+++ b/packages/transport-http/src/logic.ts
@@ -55,6 +55,13 @@ export interface ChatCommand {
   readonly computerId?: string;
   readonly reasoningEffort?: ReasoningEffort;
   readonly workspaceId?: string;
+  /**
+   * Images attached to this turn (data URLs or http URLs). Parsed from the
+   * `ChatRequest.images` field; forwarded to the orchestrator which converts
+   * them to `image` chunks on the user message. Each entry must have a non-empty
+   * string `url`; `mimeType` is optional.
+   */
+  readonly images?: readonly { readonly url: string; readonly mimeType?: string }[];
 }
 
 export interface ParseError {
@@ -121,6 +128,33 @@ export function parseChatBody(body: unknown, generateId: () => string): ParseRes
     (result as { workspaceId?: string }).workspaceId = obj.workspaceId;
   }
 
+  if (obj.images !== undefined) {
+    if (!Array.isArray(obj.images)) {
+      return { error: "Field 'images' must be an array" };
+    }
+    const images: { url: string; mimeType?: string }[] = [];
+    for (const entry of obj.images) {
+      if (entry === null || typeof entry !== "object") {
+        return { error: "Each image must be an object with a 'url' string" };
+      }
+      const img = entry as { url?: unknown; mimeType?: unknown };
+      if (typeof img.url !== "string" || img.url.length === 0) {
+        return { error: "Each image must have a non-empty string 'url'" };
+      }
+      const parsed: { url: string; mimeType?: string } = { url: img.url };
+      if (img.mimeType !== undefined) {
+        if (typeof img.mimeType !== "string") {
+          return { error: "Field 'mimeType' on an image must be a string" };
+        }
+        parsed.mimeType = img.mimeType;
+      }
+      images.push(parsed);
+    }
+    if (images.length > 0) {
+      (result as { images?: readonly { url: string; mimeType?: string }[] }).images = images;
+    }
+  }
+
   return result;
 }
 
diff --git a/packages/transport-ws/src/extension.ts b/packages/transport-ws/src/extension.ts
index 3811ed7..d26712b 100644
--- a/packages/transport-ws/src/extension.ts
+++ b/packages/transport-ws/src/extension.ts
@@ -291,6 +291,7 @@ export function createTransportWsExtension(): Extension {
                     : {}),
                   ...(result.workspaceId !== undefined ? { workspaceId: result.workspaceId } : {}),
                   ...(result.computerId !== undefined ? { computerId: result.computerId } : {}),
+                  ...(result.images !== undefined ? { images: result.images } : {}),
                 });
                 if (!startResult.started) {
                   send(ws, {
diff --git a/packages/transport-ws/src/router.ts b/packages/transport-ws/src/router.ts
index a33aa5a..0caf305 100644
--- a/packages/transport-ws/src/router.ts
+++ b/packages/transport-ws/src/router.ts
@@ -58,6 +58,12 @@ export interface ChatRouteResult {
    * conversation → workspace → local chain).
    */
   readonly computerId?: string;
+  /**
+   * Images attached to this turn (data URLs or http URLs), forwarded verbatim to
+   * the orchestrator. Absent when the client omits it. Each entry must have a
+   * non-empty string `url`; `mimeType` is optional.
+   */
+  readonly images?: readonly { readonly url: string; readonly mimeType?: string }[];
 }
 
 /** A malformed chat.send that should yield a chat.error reply. */
@@ -174,6 +180,36 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError
       errorMessage: `chat.send: invalid reasoningEffort "${msg.reasoningEffort}" — must be one of: low, medium, high, xhigh, max`,
     };
   }
+  // Validate images (if present): each must be an object with a non-empty url.
+  let images: readonly { url: string; mimeType?: string }[] | undefined;
+  if (msg.images !== undefined) {
+    if (!Array.isArray(msg.images)) {
+      return {
+        kind: "chat-error",
+        conversationId: msg.conversationId,
+        errorMessage: "chat.send: 'images' must be an array",
+      };
+    }
+    const parsed: { url: string; mimeType?: string }[] = [];
+    for (const entry of msg.images) {
+      if (
+        entry === null ||
+        typeof entry !== "object" ||
+        typeof entry.url !== "string" ||
+        entry.url.length === 0
+      ) {
+        return {
+          kind: "chat-error",
+          conversationId: msg.conversationId,
+          errorMessage: "chat.send: each image must have a non-empty string 'url'",
+        };
+      }
+      const p: { url: string; mimeType?: string } = { url: entry.url };
+      if (entry.mimeType !== undefined) p.mimeType = entry.mimeType;
+      parsed.push(p);
+    }
+    if (parsed.length > 0) images = parsed;
+  }
   return {
     kind: "chat",
     conversationId: msg.conversationId,
@@ -183,6 +219,7 @@ function handleChatSend(msg: ChatSendMessage): ChatRouteResult | ChatRouteError
     ...(msg.reasoningEffort !== undefined ? { reasoningEffort: msg.reasoningEffort } : {}),
     ...(msg.workspaceId !== undefined ? { workspaceId: msg.workspaceId } : {}),
     ...(msg.computerId !== undefined ? { computerId: msg.computerId } : {}),
+    ...(images !== undefined ? { images } : {}),
   };
 }
 
diff --git a/packages/vision-handoff/package.json b/packages/vision-handoff/package.json
new file mode 100644
index 0000000..a88ab49
--- /dev/null
+++ b/packages/vision-handoff/package.json
@@ -0,0 +1,13 @@
+{
+  "name": "@dispatch/vision-handoff",
+  "version": "0.0.0",
+  "type": "module",
+  "private": true,
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "dependencies": {
+    "@dispatch/credential-store": "workspace:*",
+    "@dispatch/kernel": "workspace:*",
+    "@dispatch/openai-stream": "workspace:*"
+  }
+}
diff --git a/packages/vision-handoff/src/extension.ts b/packages/vision-handoff/src/extension.ts
new file mode 100644
index 0000000..aa745b7
--- /dev/null
+++ b/packages/vision-handoff/src/extension.ts
@@ -0,0 +1,106 @@
+/**
+ * vision-handoff extension — registers the universal vision handoff service +
+ * the `read_image` tool.
+ *
+ * The service performs provider-agnostic vision handoff: it resolves a
+ * vision-capable model from the catalog (any provider), streams an image to it
+ * via the standard `ProviderContract.stream` interface, and folds the textual
+ * description back — so a non-vision model (e.g. glm-5.2) can still reason about
+ * images, and any model can analyze image FILES referenced in code.
+ *
+ * Effects (filesystem, fetch) live here in the shell, injected into the service.
+ * The pure decisions live in `pure.ts`. No `console.*`; logging via `host.logger`.
+ */
+
+import { readFile } from "node:fs/promises";
+import { extname, isAbsolute, resolve as pathResolve } from "node:path";
+import type { CredentialStore } from "@dispatch/credential-store";
+import { credentialStoreHandle } from "@dispatch/credential-store";
+import type { Extension, HostAPI, Manifest } from "@dispatch/kernel";
+import { createVisionHandoffService, visionHandoffHandle } from "./service.js";
+import { createReadImageTool } from "./tool.js";
+
+export const manifest: Manifest = {
+  id: "vision-handoff",
+  name: "Vision Handoff",
+  version: "0.0.0",
+  apiVersion: "^0.1.0",
+  trust: "bundled",
+  activation: "eager",
+  capabilities: { network: true },
+  contributes: { services: ["vision-handoff/service"], tools: ["read_image"] },
+};
+
+/** MIME types for recognized image extensions. */
+const MIME_BY_EXT: Readonly<Record<string, string>> = {
+  ".png": "image/png",
+  ".jpg": "image/jpeg",
+  ".jpeg": "image/jpeg",
+  ".webp": "image/webp",
+  ".gif": "image/gif",
+  ".bmp": "image/bmp",
+};
+
+/**
+ * Read an image file as a base64 data URL. A relative `path` resolves against
+ * `cwd` when given (else the process cwd); the MIME type comes from the file
+ * extension (unrecognized → `image/png`). Rejects on a missing file / read error.
+ */
+async function readFileAsDataUrl(path: string, cwd?: string): Promise<string> {
+  const abs = cwd !== undefined && !isAbsolute(path) ? pathResolve(cwd, path) : pathResolve(path);
+  const buf = await readFile(abs);
+  const ext = extname(abs).toLowerCase();
+  const mime = MIME_BY_EXT[ext] ?? "image/png";
+  return `data:${mime};base64,${buf.toString("base64")}`;
+}
+
+/**
+ * Fetch an HTTP(S) image URL and inline it as a base64 data URL, so the vision
+ * model receives the bytes even when its provider cannot fetch remote URLs.
+ * Throws on a non-2xx response. The shell edge — real `globalThis.fetch`.
+ */
+async function fetchUrlAsDataUrl(url: string): Promise<string> {
+  const res = await fetch(url);
+  if (!res.ok) {
+    throw new Error(`Failed to fetch image: HTTP ${res.status}`);
+  }
+  // Keep only the media type: header parameters (`; charset=…`) and whitespace
+  // can break strict data-URL parsers. `||` is deliberate — blank → PNG default.
+  const [mediaType = ""] = (res.headers.get("content-type") ?? "").split(";");
+  const mime = mediaType.trim() || "image/png";
+  // `Buffer` is a global in Bun + Node; same encoder as `readFileAsDataUrl`.
+  const base64 = Buffer.from(await res.arrayBuffer()).toString("base64");
+  return `data:${mime};base64,${base64}`;
+}
+
+export async function activate(host: HostAPI): Promise<void> {
+  const credentialStore = host.getService(credentialStoreHandle) as CredentialStore | undefined;
+  if (credentialStore === undefined) {
+    host.logger.warn(
+      "vision-handoff: credential-store service not available. The read_image tool and image transcription are disabled.",
+    );
+    return;
+  }
+
+  const resolveModel = (modelName: string) => {
+    const resolved = credentialStore.resolve(modelName);
+    if (resolved === undefined) return undefined;
+    const provider = host.getProviders().get(resolved.providerId);
+    if (provider === undefined) return undefined;
+    return { provider, model: resolved.model };
+  };
+
+  const service = createVisionHandoffService({
+    credentialStore,
+    resolveModel,
+    readFileAsDataUrl,
+    fetchUrlAsDataUrl,
+    logger: host.logger.child({ extensionId: "vision-handoff" }),
+  });
+
+  host.provideService(visionHandoffHandle, service);
+  host.defineTool(createReadImageTool(service));
+  host.logger.info("vision-handoff: registered (read_image tool + transcription service)");
+}
+
+export const extension: Extension = { manifest, activate };
diff --git a/packages/vision-handoff/src/index.ts b/packages/vision-handoff/src/index.ts
new file mode 100644
index 0000000..4a13e65
--- /dev/null
+++ b/packages/vision-handoff/src/index.ts
@@ -0,0 +1,19 @@
+export { extension, manifest } from "./extension.js";
+export {
+  buildTranscriptionPrompt,
+  collectTextFromStream,
+  findVisionModelName,
+  formatNoVisionPlaceholder,
+  formatTranscriptionText,
+  isVisionCapable,
+} from "./pure.js";
+export type {
+  ResolvedVisionModel,
+  VisionHandoffDeps,
+  VisionHandoffService,
+} from "./service.js";
+export {
+  createVisionHandoffService,
+  visionHandoffHandle,
+} from "./service.js";
+export { createReadImageTool } from "./tool.js";
diff --git a/packages/vision-handoff/src/pure.test.ts b/packages/vision-handoff/src/pure.test.ts
new file mode 100644
index 0000000..89dac72
--- /dev/null
+++ b/packages/vision-handoff/src/pure.test.ts
@@ -0,0 +1,141 @@
+import type { ModelInfo, ProviderEvent } from "@dispatch/kernel";
+import { describe, expect, it } from "vitest";
+import {
+  buildTranscriptionPrompt,
+  collectTextFromStream,
+  findVisionModelName,
+  formatNoVisionPlaceholder,
+  formatTranscriptionText,
+  isVisionCapable,
+} from "./pure.js";
+
+describe("isVisionCapable", () => {
+  it("returns true when ModelInfo.vision is true", () => {
+    expect(isVisionCapable("umans/kimi-k2.7", { id: "kimi-k2.7", vision: true })).toBe(true);
+  });
+
+  it("returns false when ModelInfo.vision is false (overrides name heuristic)", () => {
+    expect(isVisionCapable("umans/kimi-k2.7", { id: "kimi-k2.7", vision: false })).toBe(false);
+  });
+
+  it("falls back to name heuristic when vision is absent (kimi)", () => {
+    expect(isVisionCapable("umans/kimi-k2.7", undefined)).toBe(true);
+    expect(isVisionCapable("umans/Kimi-K2.7", undefined)).toBe(true); // case-insensitive
+  });
+
+  it("falls back to name heuristic when vision is absent (non-kimi)", () => {
+    expect(isVisionCapable("umans/glm-5.2", undefined)).toBe(false);
+    expect(isVisionCapable("umans/deepseek-v4-flash", { id: "deepseek-v4-flash" })).toBe(false);
+  });
+
+  it("returns false for undefined model name", () => {
+    expect(isVisionCapable(undefined, undefined)).toBe(false);
+  });
+});
+
+describe("findVisionModelName", () => {
+  const getInfo = async (name: string): Promise<ModelInfo | undefined> => {
+    const map: Record<string, ModelInfo> = {
+      "umans/kimi-k2.7": { id: "kimi-k2.7", vision: true },
+      "umans/glm-5.2": { id: "glm-5.2" },
+      "umans/llama-vision": { id: "llama-vision", vision: true },
+    };
+    return map[name];
+  };
+
+  it("finds the first kimi-family model via name heuristic (no async lookup needed)", async () => {
+    const name = await findVisionModelName(
+      ["umans/glm-5.2", "umans/kimi-k2.7", "umans/llama-vision"],
+      getInfo,
+    );
+    expect(name).toBe("umans/kimi-k2.7");
+  });
+
+  it("finds a vision model via ModelInfo.vision when name heuristic misses", async () => {
+    const name = await findVisionModelName(["umans/glm-5.2", "umans/llama-vision"], getInfo);
+    expect(name).toBe("umans/llama-vision");
+  });
+
+  it("skips the excluded model", async () => {
+    const name = await findVisionModelName(
+      ["umans/kimi-k2.7", "umans/llama-vision"],
+      getInfo,
+      "umans/kimi-k2.7",
+    );
+    expect(name).toBe("umans/llama-vision");
+  });
+
+  it("returns undefined when no vision model is available", async () => {
+    const name = await findVisionModelName(["umans/glm-5.2"], getInfo);
+    expect(name).toBeUndefined();
+  });
+
+  it("returns undefined for empty catalog", async () => {
+    const name = await findVisionModelName([], getInfo);
+    expect(name).toBeUndefined();
+  });
+});
+
+describe("collectTextFromStream", () => {
+  async function* stream(events: ProviderEvent[]): AsyncIterable<ProviderEvent> {
+    for (const e of events) yield e;
+  }
+
+  it("collects text-delta events into a single string", async () => {
+    const events: ProviderEvent[] = [
+      { type: "text-delta", delta: "Hello " },
+      { type: "text-delta", delta: "world!" },
+    ];
+    const text = await collectTextFromStream(stream(events));
+    expect(text).toBe("Hello world!");
+  });
+
+  it("ignores non-text events (reasoning, usage, tool-call, finish)", async () => {
+    const events: ProviderEvent[] = [
+      { type: "reasoning-delta", delta: "thinking..." },
+      { type: "text-delta", delta: "answer" },
+      { type: "usage", usage: { inputTokens: 5, outputTokens: 1 } },
+      { type: "finish", reason: "stop" },
+    ];
+    const text = await collectTextFromStream(stream(events));
+    expect(text).toBe("answer");
+  });
+
+  it("throws on an error event", async () => {
+    const events: ProviderEvent[] = [
+      { type: "text-delta", delta: "partial" },
+      { type: "error", message: "boom" },
+    ];
+    await expect(collectTextFromStream(stream(events))).rejects.toThrow("boom");
+  });
+
+  it("returns empty string for an empty stream", async () => {
+    const text = await collectTextFromStream(stream([]));
+    expect(text).toBe("");
+  });
+});
+
+describe("prompt + formatting helpers", () => {
+  it("buildTranscriptionPrompt includes focus when a question is given", () => {
+    const prompt = buildTranscriptionPrompt("What error is shown?");
+    expect(prompt).toContain("Describe this image in detail");
+    expect(prompt).toContain('The user asked: "What error is shown?"');
+  });
+
+  it("buildTranscriptionPrompt omits focus when no question", () => {
+    const prompt = buildTranscriptionPrompt(undefined);
+    expect(prompt).toContain("Describe this image in detail");
+    expect(prompt).not.toContain("The user asked");
+  });
+
+  it("formatTranscriptionText names the vision model", () => {
+    expect(formatTranscriptionText("a red car", "umans/kimi-k2.7")).toBe(
+      "[Image analysis (via umans/kimi-k2.7)]: a red car",
+    );
+  });
+
+  it("formatNoVisionPlaceholder explains the limitation", () => {
+    const text = formatNoVisionPlaceholder();
+    expect(text).toContain("no vision-capable model");
+  });
+});
diff --git a/packages/vision-handoff/src/pure.ts b/packages/vision-handoff/src/pure.ts
new file mode 100644
index 0000000..11eeefc
--- /dev/null
+++ b/packages/vision-handoff/src/pure.ts
@@ -0,0 +1,129 @@
+/**
+ * Pure decision helpers for the vision handoff.
+ *
+ * No I/O, no ambient state. The shell (the extension + the service) injects the
+ * effects (credential store lookups, provider streaming). This module owns only
+ * the policy: which model is vision-capable, how to build a transcription
+ * request, and how to fold a provider's streamed text into a description.
+ */
+
+import type { ModelInfo, ProviderEvent } from "@dispatch/kernel";
+import { isVisionModelId } from "@dispatch/openai-stream";
+
+/**
+ * Whether a model is vision-capable, given its catalog name and (optional)
+ * resolved `ModelInfo`. When `ModelInfo.vision` is present it is authoritative;
+ * otherwise fall back to the hardcoded name heuristic ({@link isVisionModelId}).
+ *
+ * The `modelName` is the `<credentialName>/<model>` catalog form; the heuristic
+ * inspects the model SEGMENT (after the first `/`) so `umans/kimi-k2.7` → the
+ * `kimi-k2.7` segment is checked. Pure.
+ */
+export function isVisionCapable(
+  modelName: string | undefined,
+  info: ModelInfo | undefined,
+): boolean {
+  // When ModelInfo explicitly reports vision (true OR false), it is authoritative
+  // — an explicit false overrides the name heuristic (a provider that KNOWS a
+  // model is non-vision wins over the name guess).
+  if (info?.vision !== undefined) return info.vision;
+  if (modelName === undefined) return false;
+  const slash = modelName.indexOf("/");
+  const modelId = slash >= 0 ? modelName.slice(slash + 1) : modelName;
+  return isVisionModelId(modelId);
+}
+
+/**
+ * Find the first vision-capable model name in a catalog, given a lookup that
+ * resolves a `<credentialName>/<model>` → `ModelInfo`. Returns `undefined` when
+ * no vision-capable model is available (the handoff degrades: images are
+ * replaced with a placeholder note). Pure given the (async) lookup — no
+ * ambient state, no side effects.
+ *
+ * @param catalog  The full list of model names (`<credentialName>/<model>`).
+ * @param getInfo  Async lookup of a model name → ModelInfo (from the credential store).
+ * @param exclude  Optional model name to skip (e.g. the current non-vision model).
+ */
+export async function findVisionModelName(
+  catalog: readonly string[],
+  getInfo: (modelName: string) => Promise<ModelInfo | undefined>,
+  exclude?: string,
+): Promise<string | undefined> {
+  for (const name of catalog) {
+    if (exclude !== undefined && name === exclude) continue;
+    // Fast path: known vision families (kimi) match by name, skipping the async
+    // lookup. NOTE(review): this returns before `getInfo` runs, so an explicit
+    // `vision: false` is not honored here, unlike in `isVisionCapable`.
+    const slash = name.indexOf("/");
+    const modelId = slash >= 0 ? name.slice(slash + 1) : name;
+    if (isVisionModelId(modelId)) return name;
+    const info = await getInfo(name);
+    if (info?.vision === true) return name;
+  }
+  return undefined;
+}
+
+/**
+ * Fold a provider's streamed events into a single text string (the
+ * transcription). Pure given the async iterable — concatenates `text-delta`
+ * events and ignores reasoning, usage, tool-call, and finish events. An `error`
+ * event is NOT ignored: it is rethrown as an Error (discarding any text so far)
+ * so the caller can decide how to degrade (placeholder vs. fail).
+ */
+export async function collectTextFromStream(stream: AsyncIterable<ProviderEvent>): Promise<string> {
+  let text = "";
+  for await (const event of stream) {
+    if (event.type === "text-delta") {
+      text += event.delta;
+    } else if (event.type === "error") {
+      throw new Error(event.message);
+    }
+  }
+  return text;
+}
+
+/**
+ * Build the prompt sent to the vision model to transcribe an image. Kept here
+ * (pure) so the prompt is testable and stable. The prompt asks for a thorough
+ * description so the text-only model has enough detail to reason about the
+ * image's contents. Pure.
+ *
+ * @param userQuestion  The user's own message text (may be empty) — passed so
+ *   the vision model can tailor its description to what the user actually asked.
+ */
+export function buildTranscriptionPrompt(userQuestion: string | undefined): string {
+  const focus =
+    userQuestion && userQuestion.trim().length > 0
+      ? `\n\nThe user asked: "${userQuestion.trim()}". Focus your description on what is relevant to that question, but still describe the whole image.`
+      : "";
+  return (
+    "Describe this image in detail. Include: the overall scene/subject, " +
+    "visible text (transcribe verbatim), key objects, layout, colors, and any " +
+    "notable details a developer or user would need to understand the image." +
+    focus
+  );
+}
+
+/**
+ * Format a single image's transcription as a text chunk string for the
+ * persisted user message. The note names the vision model so the consumer knows
+ * the description's provenance. Pure.
+ */
+export function formatTranscriptionText(
+  description: string,
+  visionModelName: string | undefined,
+): string {
+  const source = visionModelName ?? "vision model";
+  return `[Image analysis (via ${source})]: ${description}`;
+}
+
+/**
+ * Placeholder text used when NO vision-capable model is available (the
+ * degraded path). Pure.
+ */
+export function formatNoVisionPlaceholder(): string {
+  return (
+    "[Image attached — no vision-capable model is available to analyze it. " +
+    "Install or configure a vision-capable model (e.g. kimi) to enable image analysis.]"
+  );
+}
diff --git a/packages/vision-handoff/src/service.test.ts b/packages/vision-handoff/src/service.test.ts
new file mode 100644
index 0000000..fe99d17
--- /dev/null
+++ b/packages/vision-handoff/src/service.test.ts
@@ -0,0 +1,242 @@
+import type {
+  ChatMessage,
+  ModelInfo,
+  ProviderContract,
+  ProviderEvent,
+  ProviderStreamOptions,
+  ToolContract,
+} from "@dispatch/kernel";
+import { describe, expect, it, vi } from "vitest";
+import { createVisionHandoffService, type VisionHandoffDeps } from "./service.js";
+
+// ── Test doubles (outermost-edge fakes — NOT @dispatch/* mocks) ──────────────
+
+/**
+ * Build a fake vision provider whose `stream` answers every request with one
+ * `text-delta` event followed by a `finish` event.
+ *
+ * The fake looks for the first `image` chunk across the incoming messages and
+ * passes its URL (or `""` when there is none) to `describeImage`; the returned
+ * string becomes the delta text. `stream` is a `vi.fn`, so tests can assert on
+ * how many times the provider was invoked.
+ *
+ * @param describeImage  Maps the image URL to the text the fake "model" emits.
+ *   Deliberately NOT named `describe`: that would shadow the vitest `describe`
+ *   imported at the top of this test file.
+ * @param id  Provider id reported by the fake (defaults to `"umans"`).
+ */
+function makeVisionProvider(
+  describeImage: (imageUrl: string) => string,
+  id = "umans",
+): ProviderContract {
+  return {
+    id,
+    stream: vi.fn(
+      (
+        messages: readonly ChatMessage[],
+        _tools: readonly ToolContract[],
+        _opts?: ProviderStreamOptions,
+      ): AsyncIterable<ProviderEvent> => {
+        const img = messages.flatMap((m) => m.chunks).find((c) => c.type === "image");
+        // Re-check the tag so TypeScript narrows `img` to the image variant.
+        const url = img && img.type === "image" ? img.url : "";
+        const text = describeImage(url);
+        async function* gen(): AsyncIterable<ProviderEvent> {
+          yield { type: "text-delta", delta: text };
+          yield { type: "finish", reason: "stop" };
+        }
+        return gen();
+      },
+    ),
+  };
+}
+
+/**
+ * Build a complete `VisionHandoffDeps` made of fakes, for the tests below.
+ *
+ * The defaults model a two-entry catalog: `umans/kimi-k2.7` (its ModelInfo has
+ * `vision: true`) and `umans/glm-5.2` (no vision flag). Both names resolve to
+ * the SAME fake vision provider, which answers `DESCRIPTION of <image url>`.
+ *
+ * @param overrides  Fields that replace the defaults (spread last, so they win).
+ */
+function makeDeps(overrides: Partial<VisionHandoffDeps> = {}): VisionHandoffDeps {
+  const visionProvider = makeVisionProvider((url) => `DESCRIPTION of ${url}`);
+  const catalog = ["umans/kimi-k2.7", "umans/glm-5.2"];
+  const infoMap: Record<string, ModelInfo> = {
+    "umans/kimi-k2.7": { id: "kimi-k2.7", vision: true },
+    "umans/glm-5.2": { id: "glm-5.2" },
+  };
+  return {
+    credentialStore: {
+      listCatalog: vi.fn(async () => catalog),
+      // Unknown names yield `undefined` (plain map lookup).
+      getModelInfo: vi.fn(async (name: string) => infoMap[name]),
+      resolve: vi.fn((name: string) => {
+        if (name === "umans/kimi-k2.7") return { providerId: "umans", model: "kimi-k2.7" };
+        if (name === "umans/glm-5.2") return { providerId: "umans", model: "glm-5.2" };
+        return undefined;
+      }),
+    },
+    // Only the two catalog names resolve; the model id is the part after "/".
+    resolveModel: vi.fn((name: string) =>
+      name === "umans/kimi-k2.7" || name === "umans/glm-5.2"
+        ? { provider: visionProvider, model: name.split("/")[1] }
+        : undefined,
+    ),
+    // Fake filesystem edge: the requested path is embedded in the returned data
+    // URL so a test can see which file was "read".
+    readFileAsDataUrl: vi.fn(async (path: string) => `data:image/png;base64,FILE(${path})`),
+    ...overrides,
+  };
+}
+
+// Capability lookup by catalog name, driven by the ModelInfo fakes that
+// makeDeps() installs (kimi: `vision: true`; glm: no vision flag).
+describe("VisionHandoffService.isVisionCapable", () => {
+  it("returns true for kimi (via ModelInfo)", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    expect(await svc.isVisionCapable("umans/kimi-k2.7")).toBe(true);
+  });
+
+  it("returns false for glm-5.2", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    expect(await svc.isVisionCapable("umans/glm-5.2")).toBe(false);
+  });
+
+  // No model name at all is treated as "not vision-capable".
+  it("returns false for undefined model name", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    expect(await svc.isVisionCapable(undefined)).toBe(false);
+  });
+});
+
+// Picking a vision model out of the fake catalog, with and without an
+// excluded name.
+describe("VisionHandoffService.resolveVisionModel", () => {
+  it("resolves the kimi model from the catalog", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    const vision = await svc.resolveVisionModel();
+    // `modelName` is the catalog name; `model` is the provider-side id.
+    expect(vision?.modelName).toBe("umans/kimi-k2.7");
+    expect(vision?.model).toBe("kimi-k2.7");
+  });
+
+  it("excludes the given model", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    const vision = await svc.resolveVisionModel("umans/kimi-k2.7");
+    // kimi is the only vision model; excluding it → undefined.
+    expect(vision).toBeUndefined();
+  });
+});
+
+// Single-image transcription: the happy path plus the two degraded outcomes
+// (no vision model in the catalog, provider stream reporting an error). In
+// every case the method resolves to a string.
+describe("VisionHandoffService.transcribeImage", () => {
+  it("returns a formatted description from the vision model", async () => {
+    const svc = createVisionHandoffService(makeDeps());
+    const result = await svc.transcribeImage("data:image/png;base64,xxx", "what is this?");
+    // The fake provider echoes the image URL, so the URL shows up in the text.
+    expect(result).toBe(
+      "[Image analysis (via umans/kimi-k2.7)]: DESCRIPTION of data:image/png;base64,xxx",
+    );
+  });
+
+  it("returns a placeholder when no vision model is available", async () => {
+    const deps = makeDeps();
+    // Empty catalog → no vision model.
+    (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+    const svc = createVisionHandoffService(deps);
+    const result = await svc.transcribeImage("data:image/png;base64,xxx", undefined);
+    expect(result).toContain("no vision-capable model");
+  });
+
+  it("returns an error note when the vision stream errors", async () => {
+    // A provider whose stream yields nothing but a single `error` event.
+    const errorProvider: ProviderContract = {
+      id: "umans",
+      stream: vi.fn(async function* (): AsyncIterable<ProviderEvent> {
+        yield { type: "error", message: "vision API down" };
+      }),
+    };
+    // Every name now resolves to the failing provider.
+    const deps = makeDeps({
+      resolveModel: vi.fn(() => ({ provider: errorProvider, model: "kimi-k2.7" })),
+    });
+    const svc = createVisionHandoffService(deps);
+    const result = await svc.transcribeImage("data:image/png;base64,xxx", undefined);
+    expect(result).toContain("Image analysis failed: vision API down");
+  });
+});
+
+// Message-list transformation. In the fake catalog `umans/kimi-k2.7` is the
+// vision-capable model and `umans/glm-5.2` is the text-only one, so passing
+// the former exercises the pass-through path and the latter the transcription
+// path.
+describe("VisionHandoffService.transcribeForProvider", () => {
+  it("passes messages through unchanged when the model is vision-capable", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "What's this?" },
+          { type: "image", url: "data:image/png;base64,abc" },
+        ],
+      },
+    ];
+    const result = await svc.transcribeForProvider(messages, "umans/kimi-k2.7");
+    expect(result).toBe(messages); // same reference — no copy, no transcription
+  });
+
+  it("passes messages through unchanged when there are no images", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [{ role: "user", chunks: [{ type: "text", text: "hi" }] }];
+    const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+    // Identity check again: the no-image fast path returns the input array.
+    expect(result).toBe(messages);
+  });
+
+  it("transcribes image chunks to text for a non-vision model", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "text", text: "Describe this" },
+          { type: "image", url: "data:image/png;base64,img1" },
+        ],
+      },
+    ];
+    const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+    expect(result).toHaveLength(1);
+    const chunks = result[0]?.chunks;
+    // Chunk count and order are preserved; only the image is swapped out.
+    expect(chunks).toHaveLength(2);
+    expect(chunks?.[0]).toEqual({ type: "text", text: "Describe this" });
+    // The image chunk was replaced with a transcribed text chunk.
+    expect(chunks?.[1]?.type).toBe("text");
+    expect((chunks?.[1] as { text: string }).text).toContain("Image analysis");
+    expect((chunks?.[1] as { text: string }).text).toContain("img1");
+  });
+
+  it("caches transcription per unique image URL within a call", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      {
+        role: "user",
+        chunks: [
+          { type: "image", url: "data:image/png;base64,same" },
+          { type: "image", url: "data:image/png;base64,same" },
+        ],
+      },
+    ];
+    const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+    const chunks = result[0]?.chunks;
+    // Both image chunks → text, same description (cached).
+    expect(chunks).toHaveLength(2);
+    expect((chunks?.[0] as { text: string }).text).toBe((chunks?.[1] as { text: string }).text);
+    // The vision provider was called only once (cache hit on the second).
+    // makeDeps() hands out one shared provider fake for every resolvable name,
+    // so resolving here returns the same `stream` mock the service used.
+    const provider = deps.resolveModel("umans/kimi-k2.7")?.provider;
+    expect((provider?.stream as ReturnType<typeof vi.fn>).mock.calls).toHaveLength(1);
+  });
+
+  it("transcribes images in history messages too (non-vision model)", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    // The image sits in an EARLIER user turn, not in the latest message.
+    const messages: ChatMessage[] = [
+      { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,hist" }] },
+      { role: "assistant", chunks: [{ type: "text", text: "got it" }] },
+      { role: "user", chunks: [{ type: "text", text: "and now?" }] },
+    ];
+    const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+    // First message's image chunk is now text.
+    expect(result[0]?.chunks[0]?.type).toBe("text");
+    expect((result[0]?.chunks[0] as { text: string }).text).toContain("Image analysis");
+    // Assistant message unchanged.
+    expect(result[1]?.chunks[0]?.type).toBe("text");
+    // Last user message unchanged.
+    expect(result[2]?.chunks[0]).toEqual({ type: "text", text: "and now?" });
+  });
+
+  it("uses a placeholder when no vision model is available (non-vision model)", async () => {
+    const deps = makeDeps();
+    // Empty catalog → nothing to hand the image off to.
+    (deps.credentialStore.listCatalog as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+    const svc = createVisionHandoffService(deps);
+    const messages: ChatMessage[] = [
+      { role: "user", chunks: [{ type: "image", url: "data:image/png;base64,abc" }] },
+    ];
+    const result = await svc.transcribeForProvider(messages, "umans/glm-5.2");
+    expect((result[0]?.chunks[0] as { text: string }).text).toContain("no vision-capable model");
+  });
+});
+
+// The `read_image` core: the file is read through the injected
+// `readFileAsDataUrl` fake and the resulting data URL is then transcribed.
+describe("VisionHandoffService.readImageFile", () => {
+  it("reads the file and transcribes it", async () => {
+    const deps = makeDeps();
+    const svc = createVisionHandoffService(deps);
+    const result = await svc.readImageFile("screenshot.png", "/work");
+    // Path and cwd are forwarded to the filesystem edge untouched.
+    expect(deps.readFileAsDataUrl).toHaveBeenCalledWith("screenshot.png", "/work");
+    expect(result).toContain("Image analysis");
+    // The fake data URL embeds the path and the fake provider echoes the URL,
+    // so the path is visible in the final description.
+    expect(result).toContain("FILE(screenshot.png)");
+  });
+});
diff --git a/packages/vision-handoff/src/service.ts b/packages/vision-handoff/src/service.ts
new file mode 100644
index 0000000..5e6ad70
--- /dev/null
+++ b/packages/vision-handoff/src/service.ts
@@ -0,0 +1,281 @@
+/**
+ * Vision handoff service — the imperative shell that performs the universal,
+ * provider-agnostic vision handoff.
+ *
+ * Two capabilities:
+ * 1. **Transcription for non-vision models** (`transcribeForProvider`): when a
+ *    user message carries images but the active model cannot see them, this
+ *    calls a vision-capable model (resolved from the catalog — any provider) to
+ *    describe each image, then replaces the image chunks with text. Universal:
+ *    it uses the standard `ProviderContract.stream` interface, never a
+ *    provider-specific vision endpoint.
+ * 2. **`read_image` tool** (`readImageFile`): reads an image FILE from disk and
+ *    transcribes it via a vision-capable model, returning the text description
+ *    — so any model (vision or not) can analyze an image referenced in code.
+ *
+ * Effects (credential store, provider streaming, filesystem, fetch) are
+ * injected. The pure decisions live in `pure.ts`. This shell wires them.
+ */
+
+import type { CredentialStore } from "@dispatch/credential-store";
+import type {
+  ChatMessage,
+  Chunk,
+  Logger,
+  ModelInfo,
+  ProviderContract,
+  ProviderStreamOptions,
+} from "@dispatch/kernel";
+import { defineService, type ServiceHandle } from "@dispatch/kernel";
+import {
+  buildTranscriptionPrompt,
+  collectTextFromStream,
+  findVisionModelName,
+  formatNoVisionPlaceholder,
+  formatTranscriptionText,
+  isVisionCapable,
+} from "./pure.js";
+
+/**
+ * Resolved vision model — a provider + its model id, ready to stream from.
+ * Produced by `resolveVisionModel` from a catalog name.
+ */
+export interface ResolvedVisionModel {
+  /** Provider whose `stream` is called to describe the image. */
+  readonly provider: ProviderContract;
+  /** Provider-side model id, passed as `model` in the stream options. */
+  readonly model: string;
+  /**
+   * Catalog name the model was resolved from. It is the name shown in the
+   * "Image analysis (via …)" note of a transcription.
+   */
+  readonly modelName: string;
+}
+
+/**
+ * Dependencies the service needs — all injected (no ambient state).
+ */
+export interface VisionHandoffDeps {
+  /**
+   * Source of the model catalog (`listCatalog`) and of per-model metadata
+   * (`getModelInfo`), used to decide which models are vision-capable.
+   */
+  readonly credentialStore: CredentialStore;
+  /**
+   * Resolve a `<credentialName>/<model>` → its provider + model id. Returns
+   * `undefined` when the name cannot be resolved.
+   */
+  readonly resolveModel: (
+    modelName: string,
+  ) => { provider: ProviderContract; model: string } | undefined;
+  /**
+   * Read a file from disk as a base64 data URL. Injected so the shell controls
+   * the filesystem edge (and tests inject a fake). Returns the data URL, or
+   * throws on error (the caller surfaces it as a tool error).
+   */
+  readonly readFileAsDataUrl: (path: string, cwd?: string) => Promise<string>;
+  /**
+   * Fetch an HTTP(S) URL to a data URL (for http image sources). Injected so
+   * tests inject a fake. Optional — when absent, HTTP image URLs are passed to
+   * the vision provider as-is (it fetches them).
+   */
+  readonly fetchUrlAsDataUrl?: (url: string) => Promise<string>;
+  /** Optional logger for the service's own warnings; omit to stay silent. */
+  readonly logger?: Logger;
+}
+
+/**
+ * Public surface of the vision handoff service. Every method is async. The
+ * optional `opts` bag on the transcription methods carries a per-call abort
+ * signal and logger.
+ */
+export interface VisionHandoffService {
+  /**
+   * Whether a given model (by catalog name) is vision-capable. Uses the
+   * credential store's ModelInfo + the name heuristic. Async because ModelInfo
+   * may require a listModels round-trip (cached by the credential store).
+   * An `undefined` name is reported as not capable.
+   */
+  readonly isVisionCapable: (modelName: string | undefined) => Promise<boolean>;
+
+  /**
+   * Resolve a vision-capable model from the catalog (any provider). Returns
+   * `undefined` when none is available.
+   *
+   * `excludeName` is a catalog name to leave out of the search.
+   */
+  readonly resolveVisionModel: (excludeName?: string) => Promise<ResolvedVisionModel | undefined>;
+
+  /**
+   * Transcribe a single image URL to a text description via a vision-capable
+   * model. Returns the description, or a placeholder string when no vision
+   * model is available (does NOT throw — callers want graceful degradation).
+   *
+   * Two further degraded results are also plain strings: a
+   * `[Image analysis failed: …]` note when streaming from the vision model
+   * fails, and `[Image analysis produced no output.]` when the model returns
+   * only whitespace.
+   */
+  readonly transcribeImage: (
+    imageUrl: string,
+    userQuestion: string | undefined,
+    opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+  ) => Promise<string>;
+
+  /**
+   * Transform a message list for the provider: if the active model is
+   * vision-capable, return messages unchanged (images pass through natively).
+   * If NOT vision-capable, replace every `image` chunk with a text
+   * description (transcribed via a vision model — once per unique image URL,
+   * cached within the call) so a text-only model can still reason about the
+   * images. Never throws — on failure an image becomes a placeholder note.
+   *
+   * The PERSISTED history is NOT modified by this (the caller persists the
+   * original messages with images); this only transforms what the provider sees.
+   *
+   * When nothing needs transforming, the SAME array instance is returned.
+   */
+  readonly transcribeForProvider: (
+    messages: readonly ChatMessage[],
+    currentModelName: string | undefined,
+    opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+  ) => Promise<readonly ChatMessage[]>;
+
+  /**
+   * Read an image FILE from disk and transcribe it (the `read_image` tool's
+   * core). Returns the description text. Throws on filesystem error (the tool
+   * surfaces it as a tool-error result).
+   *
+   * No user question is supplied for the transcription, so the description is
+   * a general one.
+   */
+  readonly readImageFile: (
+    path: string,
+    cwd: string | undefined,
+    opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+  ) => Promise<string>;
+}
+
+/**
+ * Kernel service handle for the vision handoff service, defined under the id
+ * `"vision-handoff/service"`.
+ * NOTE(review): presumably this is the key other packages use to register and
+ * look the service up — confirm against `defineService` in `@dispatch/kernel`.
+ */
+export const visionHandoffHandle: ServiceHandle<VisionHandoffService> =
+  defineService<VisionHandoffService>("vision-handoff/service");
+
+/**
+ * Report whether at least one chunk of type `"image"` appears anywhere in the
+ * given messages. Pure.
+ */
+function hasImageChunks(messages: readonly ChatMessage[]): boolean {
+  for (const message of messages) {
+    for (const chunk of message.chunks) {
+      if (chunk.type === "image") return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Create the vision handoff service — the imperative shell that wires the pure
+ * helpers from `pure.ts` to the injected effects in `deps`.
+ *
+ * Behavior worth knowing before changing this function:
+ * - The vision model is re-resolved from the catalog on every
+ *   `transcribeImage` call (no model is pinned at construction time).
+ * - `transcribeImage` degrades instead of throwing: a missing vision model, a
+ *   failed catalog lookup, a failed HTTP pre-fetch and a failed provider
+ *   stream all come back as bracketed note strings.
+ * - The caller's `opts.signal` and `opts.logger` are forwarded to the vision
+ *   provider's `stream` call.
+ * - When `deps.fetchUrlAsDataUrl` is injected, `http(s)://` image URLs are
+ *   converted to data URLs before they reach the provider; otherwise they are
+ *   passed through as-is.
+ *
+ * @param deps  Injected effects (credential store, model resolver, file and
+ *   URL readers, optional logger).
+ * @returns The service object; its methods close over `deps`.
+ */
+export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHandoffService {
+  const log = deps.logger;
+  // Matches the `http://` / `https://` sources that `fetchUrlAsDataUrl` handles.
+  const httpUrlPattern = /^https?:\/\//i;
+
+  /** Reduce an unknown thrown value to a printable message. */
+  function describeError(err: unknown): string {
+    return err instanceof Error ? err.message : String(err);
+  }
+
+  /** Look up a model's metadata through the credential store. */
+  async function getInfo(modelName: string): Promise<ModelInfo | undefined> {
+    return deps.credentialStore.getModelInfo(modelName);
+  }
+
+  /**
+   * Pick a vision-capable model from the catalog and resolve it to a provider.
+   * Returns `undefined` when no catalog entry qualifies or when the chosen
+   * name cannot be resolved by `deps.resolveModel`.
+   */
+  async function resolveVisionModel(
+    excludeName?: string,
+  ): Promise<ResolvedVisionModel | undefined> {
+    const catalog = await deps.credentialStore.listCatalog();
+    const name = await findVisionModelName(catalog, getInfo, excludeName);
+    if (name === undefined) return undefined;
+    const resolved = deps.resolveModel(name);
+    if (resolved === undefined) return undefined;
+    return { provider: resolved.provider, model: resolved.model, modelName: name };
+  }
+
+  /**
+   * Map an image URL to the URL handed to the vision provider. HTTP(S) URLs go
+   * through `deps.fetchUrlAsDataUrl` when that dependency was injected; data
+   * URLs — and every URL when the dependency is absent — are returned as-is.
+   */
+  async function toProviderImageUrl(imageUrl: string): Promise<string> {
+    const fetchAsDataUrl = deps.fetchUrlAsDataUrl;
+    if (fetchAsDataUrl === undefined || !httpUrlPattern.test(imageUrl)) return imageUrl;
+    return fetchAsDataUrl(imageUrl);
+  }
+
+  /**
+   * Send one `[prompt, image]` user turn to the vision model and collect the
+   * streamed text. Rejects when the stream cannot be collected; the caller
+   * turns that into a failure note.
+   */
+  async function streamVisionText(
+    vision: ResolvedVisionModel,
+    imageUrl: string,
+    prompt: string,
+    opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+  ): Promise<string> {
+    // Build a single-turn user message: [text prompt, image]. The vision model
+    // receives the image natively via the OpenAI-compatible content array
+    // (convertMessages serializes the image chunk to image_url).
+    const userMessage: ChatMessage = {
+      role: "user",
+      chunks: [
+        { type: "text", text: prompt },
+        { type: "image", url: imageUrl },
+      ],
+    };
+    const providerOpts: ProviderStreamOptions = {
+      model: vision.model,
+      // Low temperature for faithful transcription.
+      temperature: 0,
+      // A short system prompt keeps the vision model focused on describing.
+      systemPrompt:
+        "You are a vision assistant. Describe images faithfully and thoroughly for a developer who cannot see them.",
+    };
+    // Per-call extras are only added when the caller supplied them, so an
+    // explicit `undefined` never overrides a provider default.
+    // NOTE(review): assumes the provider's stream options accept `signal` the
+    // same way they accept `logger` — confirm against `ProviderContract`.
+    const streamOpts: Parameters<ProviderContract["stream"]>[2] = {
+      ...providerOpts,
+      ...(opts?.signal !== undefined ? { signal: opts.signal } : {}),
+      ...(opts?.logger !== undefined ? { logger: opts.logger } : {}),
+    };
+    const stream = vision.provider.stream([userMessage], [], streamOpts);
+    return collectTextFromStream(stream);
+  }
+
+  const service: VisionHandoffService = {
+    async isVisionCapable(modelName: string | undefined): Promise<boolean> {
+      if (modelName === undefined) return false;
+      const info = await getInfo(modelName);
+      return isVisionCapable(modelName, info);
+    },
+
+    resolveVisionModel,
+
+    async transcribeImage(
+      imageUrl: string,
+      userQuestion: string | undefined,
+      opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+    ): Promise<string> {
+      // Model resolution and the optional HTTP pre-fetch sit INSIDE the try so
+      // that every failure degrades to a note string, as the interface promises.
+      try {
+        const vision = await resolveVisionModel();
+        if (vision === undefined) {
+          log?.warn("vision-handoff: no vision-capable model available for transcription");
+          return formatNoVisionPlaceholder();
+        }
+        const prompt = buildTranscriptionPrompt(userQuestion);
+        const providerUrl = await toProviderImageUrl(imageUrl);
+        const description = await streamVisionText(vision, providerUrl, prompt, opts);
+        const trimmed = description.trim();
+        if (trimmed.length === 0) {
+          return "[Image analysis produced no output.]";
+        }
+        return formatTranscriptionText(trimmed, vision.modelName);
+      } catch (err: unknown) {
+        const msg = describeError(err);
+        log?.warn("vision-handoff: transcription failed", { error: msg });
+        return `[Image analysis failed: ${msg}]`;
+      }
+    },
+
+    async transcribeForProvider(
+      messages: readonly ChatMessage[],
+      currentModelName: string | undefined,
+      opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+    ): Promise<readonly ChatMessage[]> {
+      // Fast path: no images anywhere → nothing to do.
+      if (!hasImageChunks(messages)) return messages;
+
+      // If the active model IS vision-capable, pass images through natively.
+      if (currentModelName !== undefined) {
+        try {
+          const info = await getInfo(currentModelName);
+          const capable = await isVisionCapable(currentModelName, info);
+          if (capable) return messages;
+        } catch (err: unknown) {
+          // Keep the "never throws" contract: when the capability lookup
+          // fails, treat the model as text-only and transcribe the images.
+          log?.warn("vision-handoff: capability lookup failed; transcribing images", {
+            error: describeError(err),
+          });
+        }
+      }
+
+      // Non-vision model: transcribe each unique image URL once (cached).
+      // The cache lives for this call only and also holds failure notes, so a
+      // broken image is not retried within the same call.
+      const cache = new Map<string, string>();
+      // All user-authored text in the list, joined, is passed as the
+      // "question" that focuses every description.
+      const userText = messages
+        .filter((m) => m.role === "user")
+        .flatMap((m) => m.chunks)
+        .filter((c): c is { type: "text"; text: string } => c.type === "text")
+        .map((c) => c.text)
+        .join(" ");
+
+      async function transcribeCached(url: string): Promise<string> {
+        const cached = cache.get(url);
+        if (cached !== undefined) return cached;
+        const description = await service.transcribeImage(url, userText, opts);
+        cache.set(url, description);
+        return description;
+      }
+
+      const result: ChatMessage[] = [];
+      for (const msg of messages) {
+        // Messages without images are reused as-is (same object reference).
+        if (!msg.chunks.some((c) => c.type === "image")) {
+          result.push(msg);
+          continue;
+        }
+        // Replace image chunks with transcribed text chunks; keep all else.
+        const newChunks: Chunk[] = [];
+        for (const chunk of msg.chunks) {
+          if (chunk.type === "image") {
+            const description = await transcribeCached(chunk.url);
+            newChunks.push({ type: "text", text: description });
+          } else {
+            newChunks.push(chunk);
+          }
+        }
+        result.push({ role: msg.role, chunks: newChunks });
+      }
+      return result;
+    },
+
+    async readImageFile(
+      path: string,
+      cwd: string | undefined,
+      opts?: { readonly signal?: AbortSignal; readonly logger?: Logger },
+    ): Promise<string> {
+      // A read failure propagates to the caller on purpose (the tool reports it
+      // as a tool error); only the transcription step degrades to a note.
+      const dataUrl = await deps.readFileAsDataUrl(path, cwd);
+      return service.transcribeImage(dataUrl, undefined, opts);
+    },
+  };
+
+  return service;
+}
diff --git a/packages/vision-handoff/src/tool.ts b/packages/vision-handoff/src/tool.ts
new file mode 100644
index 0000000..3995598
--- /dev/null
+++ b/packages/vision-handoff/src/tool.ts
@@ -0,0 +1,68 @@
+/**
+ * read_image tool — lets any model (vision-capable or not) analyze an image
+ * FILE on disk by handing it off to a vision-capable model.
+ *
+ * The tool reads the image file into a base64 data URL, then asks the vision
+ * handoff service to transcribe it (via a vision-capable model resolved from
+ * the catalog) and returns the textual description as the tool result. This is
+ * the universal mechanism: it works regardless of whether the active model has
+ * vision, because the result is plain text the model reasons about.
+ *
+ * For images PASTED into the chat, the orchestrator's auto-transcription handles
+ * them (no tool call needed). This tool is for images REFERENCED IN CODE by path
+ * (e.g. a screenshot, diagram, or mockup the model discovered while reading files).
+ */
+
+import type { ToolContract, ToolExecuteContext, ToolResult } from "@dispatch/kernel";
+import type { VisionHandoffService } from "./service.js";
+
+/**
+ * Build the `read_image` tool on top of a vision handoff service.
+ *
+ * The tool accepts a single `path` argument. It rejects a missing, non-string
+ * or blank path with an error result; otherwise it asks the service to read
+ * and describe the file and returns the description as the tool content. Any
+ * error thrown while doing so is reported as an error result rather than
+ * rethrown.
+ *
+ * @param service  The vision handoff service that reads and transcribes files.
+ * @returns The tool contract, named `read_image` and marked concurrency-safe.
+ */
+export function createReadImageTool(service: VisionHandoffService): ToolContract {
+  const toolDescription =
+    "Read and analyze an image file on disk (PNG, JPEG, WebP, GIF). Returns a " +
+    "detailed textual description of the image's contents — useful when you " +
+    "encounter a screenshot, diagram, UI mockup, or chart referenced in the " +
+    "codebase and need to understand what it shows. The analysis is performed " +
+    "by a vision-capable model, so you can use this even if you cannot " +
+    "directly view images. Pass a file path (relative to the cwd or absolute).";
+  const pathDescription =
+    "Path to the image file to analyze. Relative paths resolve against " +
+    "the conversation's working directory; absolute paths are used as-is.";
+
+  return {
+    name: "read_image",
+    description: toolDescription,
+    parameters: {
+      type: "object",
+      properties: {
+        path: { type: "string", description: pathDescription },
+      },
+      required: ["path"],
+    },
+    concurrencySafe: true,
+    async execute(args: unknown, ctx: ToolExecuteContext): Promise<ToolResult> {
+      // Arguments arrive untyped; pull out `path` and validate it by hand.
+      const path = (args as { path?: unknown } | null)?.path;
+      if (typeof path !== "string" || path.trim().length === 0) {
+        return {
+          content: "Error: 'path' is required and must be a non-empty string.",
+          isError: true,
+        };
+      }
+
+      // The span is opened only for calls that passed validation.
+      const span = ctx.log.span("read_image.execute", { path });
+      try {
+        const handoffOpts = { signal: ctx.signal, logger: ctx.log };
+        const description = await service.readImageFile(path, ctx.cwd, handoffOpts);
+        span.end({ attrs: { descriptionLength: description.length } });
+        return { content: description };
+      } catch (err: unknown) {
+        span.end({ err });
+        const reason = err instanceof Error ? err.message : String(err);
+        return { content: `Error reading image: ${reason}`, isError: true };
+      }
+    },
+  };
+}
diff --git a/packages/vision-handoff/tsconfig.json b/packages/vision-handoff/tsconfig.json
new file mode 100644
index 0000000..ec597fc
--- /dev/null
+++ b/packages/vision-handoff/tsconfig.json
@@ -0,0 +1,11 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": { "rootDir": "src", "outDir": "dist", "composite": true },
+  "include": ["src/**/*.ts"],
+  "references": [
+    { "path": "../kernel" },
+    { "path": "../wire" },
+    { "path": "../credential-store" },
+    { "path": "../openai-stream" }
+  ]
+}
diff --git a/packages/wire/src/index.test.ts b/packages/wire/src/index.test.ts
index 3f07e00..81d10c1 100644
--- a/packages/wire/src/index.test.ts
+++ b/packages/wire/src/index.test.ts
@@ -8,7 +8,7 @@
  */
 
 import { describe, expect, it } from "vitest";
-import type { Computer, ComputerEntry, Workspace } from "./index.js";
+import type { Chunk, Computer, ComputerEntry, ImageChunk, ImageInput, Workspace } from "./index.js";
 
 describe("@dispatch/wire — Computer / Workspace shapes", () => {
   it("a Computer literal satisfies the Computer type", () => {
@@ -57,3 +57,32 @@ describe("@dispatch/wire — Computer / Workspace shapes", () => {
     expect(local.defaultComputerId).toBeNull();
   });
 });
+
+// Shape checks for the image types. The type annotations on the literals are
+// the real test (they must compile); the runtime `expect`s only confirm the
+// literals round-trip.
+describe("@dispatch/wire — ImageChunk / ImageInput shapes", () => {
+  it("an ImageChunk carries a data URL and optional mimeType", () => {
+    const c: ImageChunk = {
+      type: "image",
+      url: "data:image/png;base64,iVBORw0KGgo=",
+      mimeType: "image/png",
+    };
+    expect(c.type).toBe("image");
+    expect(c.url).toContain("base64");
+    expect(c.mimeType).toBe("image/png");
+  });
+
+  it("an ImageChunk with only a url is valid (mimeType optional)", () => {
+    // An http(s) URL is an accepted source too, not just a data URL.
+    const c: ImageChunk = { type: "image", url: "https://example.com/cat.png" };
+    expect(c.mimeType).toBeUndefined();
+  });
+
+  it("ImageInput mirrors ImageChunk's url semantics", () => {
+    // ImageInput has no `type` tag — it is the transport-facing input shape.
+    const input: ImageInput = { url: "data:image/jpeg;base64,/9j/4AAQ" };
+    expect(input.url).toContain("jpeg");
+  });
+
+  it("ImageChunk is a member of the Chunk union (assignable)", () => {
+    const chunk: Chunk = { type: "image", url: "data:image/png;base64,x" };
+    // Compile-time proof: an ImageChunk satisfies the Chunk union.
+    expect(chunk.type).toBe("image");
+  });
+});
diff --git a/packages/wire/src/index.ts b/packages/wire/src/index.ts
index 16b7023..d6ea1c1 100644
--- a/packages/wire/src/index.ts
+++ b/packages/wire/src/index.ts
@@ -36,7 +36,8 @@ export type Chunk =
   | ToolCallChunk
   | ToolResultChunk
   | ErrorChunk
-  | SystemChunk;
+  | SystemChunk
+  | ImageChunk;
 
 /** A piece of plain text content from the assistant or user. */
 export interface TextChunk {
@@ -113,6 +114,46 @@ export interface SystemChunk {
 }
 
 /**
+ * An image attached to a message (e.g. a screenshot or photo the user pasted
+ * into the chat). Carries a `url` that is EITHER a base64 data URL
+ * (`data:image/png;base64,…`) OR an `http(s)://` URL. Vision-capable models
+ * receive it natively (the provider serializes it to its image-content
+ * format); non-vision models never see it directly — the orchestrator's
+ * **vision handoff** transcribes it to a text description (via a
+ * vision-capable model) and feeds that text instead, so a text-only model can
+ * still reason about the image's contents.
+ *
+ * When a transcription was performed, it is persisted as a separate `text`
+ * chunk alongside the `image` chunk in the SAME user message, so the
+ * description is reused on every later turn (no re-transcription) and a
+ * client renders both the original image and its textual analysis.
+ */
+export interface ImageChunk {
+  /** Discriminant tag that identifies this member of the `Chunk` union. */
+  readonly type: "image";
+  /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */
+  readonly url: string;
+  /**
+   * Optional MIME type of the image (e.g. `"image/png"`). Inferred from the
+   * data URL when absent; present so a client can render an icon/label without
+   * parsing the URL. Optional — callers that only have a URL omit it.
+   */
+  readonly mimeType?: string;
+}
+
+/**
+ * An image a client attaches to a chat message (`ChatRequest.images`). The
+ * transport-facing input shape; the orchestrator converts each `ImageInput`
+ * into an `ImageChunk` on the persisted user message. Carries the same `url`
+ * semantics as `ImageChunk.url`.
+ *
+ * Unlike `ImageChunk`, it has no `type` tag: it is a request input, not a
+ * member of the `Chunk` union.
+ */
+export interface ImageInput {
+  /** Image source: a base64 data URL (`data:image/…;base64,…`) or an `http(s)://` URL. */
+  readonly url: string;
+  /** Optional MIME type (e.g. `"image/png"`). Optional — inferred from the data URL when absent. */
+  readonly mimeType?: string;
+}
+
+/**
  * A chat message: a role plus an ordered sequence of chunks. Messages are the
  * unit passed to and from the provider; chunks are the unit persisted and
  * rendered.
diff --git a/tsconfig.json b/tsconfig.json
index e4e833d..fe5ea92 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -41,6 +41,9 @@
       "path": "./packages/credential-store"
     },
     {
+      "path": "./packages/vision-handoff"
+    },
+    {
       "path": "./packages/exec-backend"
     },
     {
author	Adam Malczewski <[email protected]>	2026-06-27 03:40:38 +0900
committer	Adam Malczewski <[email protected]>	2026-06-27 03:40:38 +0900
commit	d5633cf6e007eaf8255a44529a638d2466a74ba3 (patch)
tree	14fe72f5b585eb72c763073b4e7022b914bdbafb
parent	ad9d135e583c99a0d93327115defa43187cde1c3 (diff)
download	dispatch-d5633cf6e007eaf8255a44529a638d2466a74ba3.tar.gz dispatch-d5633cf6e007eaf8255a44529a638d2466a74ba3.zip