From 66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae Mon Sep 17 00:00:00 2001 From: Adam Malczewski Date: Tue, 2 Jun 2026 22:50:11 +0900 Subject: feat(chat): paste-to-attach images/PDFs with model capability check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multimodal image/PDF input to the chat box via clipboard paste, gated by a graceful per-model capability check. UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the draft, so attachments have ORDER relative to typed text and can be referenced positionally. The token is the only handle — deleting it (atomic Backspace/ Delete, or selection overlap) detaches the file; an input-reconciliation safety net detaches any attachment whose token is no longer intact. No preview strip. Capability check: resolveModelCapabilities reads models.dev modalities.input (new GET /models/capabilities, mirrors /context-limit). The input blocks Send (no tokens spent) only on a definitive 'no'; unknown capability (catalog offline / unmapped provider) stays permissive. Attachments require a fresh turn — Send is blocked while generating and /chat rejects content mid-turn (409). Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK ImagePart/FilePart content, but never persisted (history keeps the text with [image]/[pdf] markers). Text-only turns serialize byte-identically to before. Limits (Anthropic-aligned, enforced at paste + re-validated server-side): PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total. core: UserContentPart types, models/attachments validator, capability resolver, agent.run+toModelMessages thread ordered content. api: /chat content validation + passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability fetch. +44 tests. 
--- packages/api/src/app.ts | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'packages/api/src/app.ts') diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts index 84afd2a..2f4e538 100644 --- a/packages/api/src/app.ts +++ b/packages/api/src/app.ts @@ -3,6 +3,8 @@ import { getTab, isReasoningEffort, NotificationDispatcher, + type UserContentPart, + validateUserContent, } from "@dispatch/core"; import { Hono } from "hono"; import { cors } from "hono/cors"; @@ -37,6 +39,41 @@ function sanitizeAgentModels(raw: unknown): AgentModelEntry[] | undefined { return out; } +/** + * Validate and normalise the optional multimodal `content` array from the + * `/chat` body. Each entry is either a `{ type: "text", text }` part or a + * `{ type: "attachment", mediaType, data, name? }` part (base64 payload). + * Returns `undefined` when the input isn't a non-empty array or contains no + * attachment (so the plain-string path is taken — byte-identical to before). + * Shape only: SIZE/TYPE limits are enforced separately by `validateUserContent`. + */ +function sanitizeUserContent(raw: unknown): UserContentPart[] | undefined { + if (!Array.isArray(raw) || raw.length === 0) return undefined; + const out: UserContentPart[] = []; + let hasAttachment = false; + for (const p of raw) { + if (!p || typeof p !== "object") continue; + const part = p as Record<string, unknown>; + if (part.type === "text") { + if (typeof part.text === "string") out.push({ type: "text", text: part.text }); + continue; + } + if (part.type === "attachment") { + if (typeof part.mediaType !== "string" || typeof part.data !== "string") continue; + hasAttachment = true; + out.push({ + type: "attachment", + mediaType: part.mediaType, + data: part.data, + ...(typeof part.name === "string" ? { name: part.name } : {}), + }); + } + } + // No attachment → let the plain-text path handle it (avoids needlessly + // switching the model message to array content for a text-only turn). 
+ return hasAttachment ? out : undefined; +} + export const permissionManager = new PermissionManager(); export const agentManager = new AgentManager(permissionManager); @@ -94,6 +131,7 @@ app.post("/chat", async (c) => { const body = await c.req.json<{ tabId?: unknown; message?: unknown; + content?: unknown; keyId?: unknown; modelId?: unknown; agentModels?: unknown; @@ -121,6 +159,30 @@ app.post("/chat", async (c) => { ? body.reasoningEffort : undefined; + // Optional multimodal content (image/pdf attachments). When present, the + // attachments are EPHEMERAL — forwarded to the model for this turn only and + // never persisted (the chunk log keeps just `message`, which the frontend + // has already projected to text with `[image]`/`[pdf]` markers). + const content = sanitizeUserContent(body.content); + if (content) { + // Enforce size/type/count ceilings server-side (defence in depth; the + // frontend also enforces them at paste time). Reject the whole request + // so no tokens are spent on an over-limit payload. + const validation = validateUserContent(content); + if (!validation.ok) { + return c.json({ error: "invalid attachments", details: validation.errors }, 400); + } + // Attachments only attach to a FRESH turn. If the tab is mid-turn the + // message would queue (text-only machinery), silently dropping the + // images. Reject clearly instead so the user can retry once idle. + if (agentManager.getTabStatus(tabId) === "running") { + return c.json( + { error: "cannot attach images while the agent is generating; wait for it to finish" }, + 409, + ); + } + } + // Single routing decision (queue if busy, new turn if idle) shared with the // `send_to_tab` tool via `AgentManager.deliverMessage`. Non-blocking — a // started turn runs in the background. @@ -131,6 +193,7 @@ app.post("/chat", async (c) => { ...(reasoningEffort ? { reasoningEffort } : {}), ...(workingDirectory !== undefined ? { workingDirectory } : {}), ...(queueId ? { queueId } : {}), + ...(content ? 
{ content } : {}), }); if (outcome.status === "queued") { -- cgit v1.2.3