From 66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae Mon Sep 17 00:00:00 2001 From: Adam Malczewski Date: Tue, 2 Jun 2026 22:50:11 +0900 Subject: feat(chat): paste-to-attach images/PDFs with model capability check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multimodal image/PDF input to the chat box via clipboard paste, gated by a graceful per-model capability check. UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the draft, so attachments have ORDER relative to typed text and can be referenced positionally. The token is the only handle — deleting it (atomic Backspace/Delete, or selection overlap) detaches the file; an input-reconciliation safety net detaches any attachment whose token is no longer intact. No preview strip. Capability check: resolveModelCapabilities reads models.dev modalities.input (new GET /models/capabilities, mirrors /context-limit). The input blocks Send (no tokens spent) only on a definitive 'no'; unknown capability (catalog offline / unmapped provider) stays permissive. Attachments require a fresh turn — Send is blocked while generating and /chat rejects content mid-turn (409). Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK ImagePart/FilePart content, but never persisted (history keeps the text with [image]/[pdf] markers). Text-only turns serialize byte-identically to before. Limits (Anthropic-aligned, enforced at paste + re-validated server-side): PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total. core: UserContentPart types, models/attachments validator, capability resolver, agent.run+toModelMessages thread ordered content. api: /chat content validation + passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability fetch. +44 tests. 
--- packages/api/src/agent-manager.ts | 11 +++++++ packages/api/src/app.ts | 63 +++++++++++++++++++++++++++++++++++++++ packages/api/src/routes/models.ts | 18 +++++++++++ packages/api/tests/routes.test.ts | 63 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+) (limited to 'packages/api') diff --git a/packages/api/src/agent-manager.ts b/packages/api/src/agent-manager.ts index 2532efa..3b12a80 100644 --- a/packages/api/src/agent-manager.ts +++ b/packages/api/src/agent-manager.ts @@ -65,6 +65,7 @@ import { toAvailableUserAgents, type UsageData, type UsageStats, + type UserContentPart, validateConfig, } from "@dispatch/core"; import type { PermissionManager } from "./permission-manager.js"; @@ -1535,6 +1536,13 @@ export class AgentManager { reasoningEffort?: ReasoningEffort; workingDirectory?: string; queueId?: string; + /** + * Ephemeral ordered multimodal content (image/pdf attachments) for a + * FRESH human turn. Forwarded to `processMessage` → `agent.run` only + * when the tab is idle (a started turn); never carried into the queue + * path (attachments require a fresh turn — the caller guards that). + */ + content?: UserContentPart[]; /** * Who is sending this message. `"human"` (default) is unrestricted * and REFILLS the target's agent-to-agent auto-wake budget. `"agent"` @@ -1606,6 +1614,7 @@ export class AgentManager { opts.reasoningEffort, opts.workingDirectory, agentModels, + opts.content, ).catch((err) => { console.error(`[dispatch] deliverMessage processMessage error for tab ${tabId}:`, err); }); @@ -1620,6 +1629,7 @@ export class AgentManager { reasoningEffort?: ReasoningEffort, workingDirectory?: string, agentModels?: AgentModelEntry[], + content?: UserContentPart[], ): Promise { const tabAgent = this._getOrCreateTabAgent(tabId); @@ -1731,6 +1741,7 @@ export class AgentManager { for await (const event of agent.run(message, { ...(effortForEntry ? 
{ reasoningEffort: effortForEntry } : {}), abortSignal: tabAgent.abortController?.signal, + ...(content ? { content } : {}), })) { // Stop processing if the tab was aborted (closed/stopped). // stopTab() already injected a `cancelled` system chunk into diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts index 84afd2a..2f4e538 100644 --- a/packages/api/src/app.ts +++ b/packages/api/src/app.ts @@ -3,6 +3,8 @@ import { getTab, isReasoningEffort, NotificationDispatcher, + type UserContentPart, + validateUserContent, } from "@dispatch/core"; import { Hono } from "hono"; import { cors } from "hono/cors"; @@ -37,6 +39,41 @@ function sanitizeAgentModels(raw: unknown): AgentModelEntry[] | undefined { return out; } +/** + * Validate and normalise the optional multimodal `content` array from the + * `/chat` body. Each entry is either a `{ type: "text", text }` part or a + * `{ type: "attachment", mediaType, data, name? }` part (base64 payload). + * Returns `undefined` when the input isn't a non-empty array or contains no + * attachment (so the plain-string path is taken — byte-identical to before). + * Shape only: SIZE/TYPE limits are enforced separately by `validateUserContent`. + */ +function sanitizeUserContent(raw: unknown): UserContentPart[] | undefined { + if (!Array.isArray(raw) || raw.length === 0) return undefined; + const out: UserContentPart[] = []; + let hasAttachment = false; + for (const p of raw) { + if (!p || typeof p !== "object") continue; + const part = p as Record; + if (part.type === "text") { + if (typeof part.text === "string") out.push({ type: "text", text: part.text }); + continue; + } + if (part.type === "attachment") { + if (typeof part.mediaType !== "string" || typeof part.data !== "string") continue; + hasAttachment = true; + out.push({ + type: "attachment", + mediaType: part.mediaType, + data: part.data, + ...(typeof part.name === "string" ? 
{ name: part.name } : {}), + }); + } + } + // No attachment → let the plain-text path handle it (avoids needlessly + // switching the model message to array content for a text-only turn). + return hasAttachment ? out : undefined; +} + export const permissionManager = new PermissionManager(); export const agentManager = new AgentManager(permissionManager); @@ -94,6 +131,7 @@ app.post("/chat", async (c) => { const body = await c.req.json<{ tabId?: unknown; message?: unknown; + content?: unknown; keyId?: unknown; modelId?: unknown; agentModels?: unknown; @@ -121,6 +159,30 @@ app.post("/chat", async (c) => { ? body.reasoningEffort : undefined; + // Optional multimodal content (image/pdf attachments). When present, the + // attachments are EPHEMERAL — forwarded to the model for this turn only and + // never persisted (the chunk log keeps just `message`, which the frontend + // has already projected to text with `[image]`/`[pdf]` markers). + const content = sanitizeUserContent(body.content); + if (content) { + // Enforce size/type/count ceilings server-side (defence in depth; the + // frontend also enforces them at paste time). Reject the whole request + // so no tokens are spent on an over-limit payload. + const validation = validateUserContent(content); + if (!validation.ok) { + return c.json({ error: "invalid attachments", details: validation.errors }, 400); + } + // Attachments only attach to a FRESH turn. If the tab is mid-turn the + // message would queue (text-only machinery), silently dropping the + // images. Reject clearly instead so the user can retry once idle. + if (agentManager.getTabStatus(tabId) === "running") { + return c.json( + { error: "cannot attach images while the agent is generating; wait for it to finish" }, + 409, + ); + } + } + // Single routing decision (queue if busy, new turn if idle) shared with the // `send_to_tab` tool via `AgentManager.deliverMessage`. Non-blocking — a // started turn runs in the background. 
@@ -131,6 +193,7 @@ app.post("/chat", async (c) => { ...(reasoningEffort ? { reasoningEffort } : {}), ...(workingDirectory !== undefined ? { workingDirectory } : {}), ...(queueId ? { queueId } : {}), + ...(content ? { content } : {}), }); if (outcome.status === "queued") { diff --git a/packages/api/src/routes/models.ts b/packages/api/src/routes/models.ts index eeb6029..a1700b1 100644 --- a/packages/api/src/routes/models.ts +++ b/packages/api/src/routes/models.ts @@ -20,6 +20,7 @@ import { refreshAccountCredentialsAsync, resolveApiKey, resolveContextLimit, + resolveModelCapabilities, selectHaikuModel, setApiKey, validateAccountCredentials, @@ -180,6 +181,23 @@ modelsRoutes.get("/context-limit", async (c) => { return c.json({ contextLimit }); }); +// Resolve a model's image / PDF INPUT capabilities from the models.dev catalog. +// Returns `{ capabilities: { image, pdf } | null }`. `null` means UNKNOWN — the +// provider is unmapped, the model is absent, the catalog predates the +// `modalities` field, or the catalog is offline. The frontend treats `null` as +// "can't verify" (optimistic allow) and a definitive `{ image: false }` as a +// hard block (no tokens spent). 
+modelsRoutes.get("/capabilities", async (c) => { + const provider = c.req.query("provider"); + const modelId = c.req.query("modelId"); + if (!provider || !modelId) { + return c.json({ error: "provider and modelId query parameters are required" }, 400); + } + + const capabilities = await resolveModelCapabilities(provider, modelId); + return c.json({ capabilities }); +}); + // List available Claude accounts with validated credentials modelsRoutes.get("/claude-accounts", async (c) => { const candidates = resolveClaudeAccounts(); diff --git a/packages/api/tests/routes.test.ts b/packages/api/tests/routes.test.ts index 37c19ca..7cfd8a7 100644 --- a/packages/api/tests/routes.test.ts +++ b/packages/api/tests/routes.test.ts @@ -219,6 +219,16 @@ vi.mock("@dispatch/core", () => ({ typeof value === "string" && ["none", "low", "medium", "high", "xhigh", "max"].includes(value) ); }, + // Lightweight stand-in for the real validator: accept the supported media + // types, reject everything else. Enough to exercise the /chat attachment + // validation branch (the real validator is unit-tested in core). + validateUserContent(content: Array<{ type: string; mediaType?: string }>) { + const accepted = ["image/png", "image/jpeg", "image/webp", "image/gif", "application/pdf"]; + const errors = content + .filter((p) => p.type === "attachment" && !accepted.includes(p.mediaType ?? 
"")) + .map((p) => ({ code: "unsupported-type", mediaType: p.mediaType })); + return { ok: errors.length === 0, errors }; + }, listOpenTabs() { return [...fakeOpenTabs]; }, @@ -449,6 +459,59 @@ describe("POST /chat", () => { expect(await res.json()).toEqual({ status: "ok" }); }); + it("accepts a valid image attachment and starts a turn", async () => { + const res = await app.request("/chat", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + tabId: "tab-img-ok", + message: "look: [image]", + content: [ + { type: "text", text: "look: " }, + { type: "attachment", mediaType: "image/png", data: "QQ==" }, + ], + }), + }); + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ status: "ok" }); + }); + + it("returns 400 for an unsupported attachment media type", async () => { + const res = await app.request("/chat", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + tabId: "tab-img-bad", + message: "look: [image]", + content: [{ type: "attachment", mediaType: "image/svg+xml", data: "QQ==" }], + }), + }); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe("invalid attachments"); + }); + + it("returns 409 when attaching while the agent is generating", async () => { + // Kick off a turn so the tab is running. 
+ await app.request("/chat", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ tabId: "tab-img-busy", message: "first" }), + }); + await new Promise((r) => setTimeout(r, 20)); + + const res = await app.request("/chat", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + tabId: "tab-img-busy", + message: "second [image]", + content: [{ type: "attachment", mediaType: "image/png", data: "QQ==" }], + }), + }); + expect(res.status).toBe(409); + }); + it("returns 400 with empty message", async () => { const res = await app.request("/chat", { method: "POST", -- cgit v1.2.3