From 66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae Mon Sep 17 00:00:00 2001 From: Adam Malczewski Date: Tue, 2 Jun 2026 22:50:11 +0900 Subject: feat(chat): paste-to-attach images/PDFs with model capability check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multimodal image/PDF input to the chat box via clipboard paste, gated by a graceful per-model capability check. UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the draft, so attachments have ORDER relative to typed text and can be referenced positionally. The token is the only handle — deleting it (atomic Backspace/Delete, or selection overlap) detaches the file; an input-reconciliation safety net detaches any attachment whose token is no longer intact. No preview strip. Capability check: resolveModelCapabilities reads models.dev modalities.input (new GET /models/capabilities, mirrors /context-limit). The input blocks Send (no tokens spent) only on a definitive 'no'; unknown capability (catalog offline / unmapped provider) stays permissive. Attachments require a fresh turn — Send is blocked while generating and /chat rejects content mid-turn (409). Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK ImagePart/FilePart content, but never persisted (history keeps the text with [image]/[pdf] markers). Text-only turns serialize byte-identically to before. Limits (Anthropic-aligned, enforced at paste + re-validated server-side): PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total. core: UserContentPart types, models/attachments validator, capability resolver, agent.run+toModelMessages thread ordered content. api: /chat content validation + passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability fetch. +44 tests. 
--- packages/api/src/agent-manager.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'packages/api/src/agent-manager.ts') diff --git a/packages/api/src/agent-manager.ts b/packages/api/src/agent-manager.ts index 2532efa..3b12a80 100644 --- a/packages/api/src/agent-manager.ts +++ b/packages/api/src/agent-manager.ts @@ -65,6 +65,7 @@ import { toAvailableUserAgents, type UsageData, type UsageStats, + type UserContentPart, validateConfig, } from "@dispatch/core"; import type { PermissionManager } from "./permission-manager.js"; @@ -1535,6 +1536,13 @@ export class AgentManager { reasoningEffort?: ReasoningEffort; workingDirectory?: string; queueId?: string; + /** + * Ephemeral ordered multimodal content (image/pdf attachments) for a + * FRESH human turn. Forwarded to `processMessage` → `agent.run` only + * when the tab is idle (a started turn); never carried into the queue + * path (attachments require a fresh turn — the caller guards that). + */ + content?: UserContentPart[]; /** * Who is sending this message. `"human"` (default) is unrestricted * and REFILLS the target's agent-to-agent auto-wake budget. `"agent"` @@ -1606,6 +1614,7 @@ export class AgentManager { opts.reasoningEffort, opts.workingDirectory, agentModels, + opts.content, ).catch((err) => { console.error(`[dispatch] deliverMessage processMessage error for tab ${tabId}:`, err); }); @@ -1620,6 +1629,7 @@ export class AgentManager { reasoningEffort?: ReasoningEffort, workingDirectory?: string, agentModels?: AgentModelEntry[], + content?: UserContentPart[], ): Promise { const tabAgent = this._getOrCreateTabAgent(tabId); @@ -1731,6 +1741,7 @@ export class AgentManager { for await (const event of agent.run(message, { ...(effortForEntry ? { reasoningEffort: effortForEntry } : {}), abortSignal: tabAgent.abortController?.signal, + ...(content ? { content } : {}), })) { // Stop processing if the tab was aborted (closed/stopped). 
// stopTab() already injected a `cancelled` system chunk into -- cgit v1.2.3