summaryrefslogtreecommitdiffhomepage
path: root/packages/api
diff options
context:
space:
mode:
authorAdam Malczewski <[email protected]>2026-06-02 22:50:11 +0900
committerAdam Malczewski <[email protected]>2026-06-02 22:50:11 +0900
commit66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae (patch)
treec3e039e09c89231f84dfd16f7bbbf8aedcc2dc7d /packages/api
parent4b45d33c256cf580a53054078be6fd7148fa6302 (diff)
downloaddispatch-66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae.tar.gz
dispatch-66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae.zip
feat(chat): paste-to-attach images/PDFs with model capability check
Add multimodal image/PDF input to the chat box via clipboard paste, gated by a graceful per-model capability check. UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the draft, so attachments have ORDER relative to typed text and can be referenced positionally. The token is the only handle — deleting it (atomic Backspace/Delete, or selection overlap) detaches the file; an input-reconciliation safety net detaches any attachment whose token is no longer intact. No preview strip. Capability check: resolveModelCapabilities reads models.dev modalities.input (new GET /models/capabilities, mirrors /context-limit). The input blocks Send (no tokens spent) only on a definitive 'no'; unknown capability (catalog offline / unmapped provider) stays permissive. Attachments require a fresh turn — Send is blocked while generating and /chat rejects content mid-turn (409). Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK ImagePart/FilePart content, but never persisted (history keeps the text with [image]/[pdf] markers). Text-only turns serialize byte-identically to before. Limits (Anthropic-aligned, enforced at paste + re-validated server-side): PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total. core: UserContentPart types, models/attachments validator, capability resolver, agent.run+toModelMessages thread ordered content. api: /chat content validation + passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability fetch. +44 tests.
Diffstat (limited to 'packages/api')
-rw-r--r--packages/api/src/agent-manager.ts11
-rw-r--r--packages/api/src/app.ts63
-rw-r--r--packages/api/src/routes/models.ts18
-rw-r--r--packages/api/tests/routes.test.ts63
4 files changed, 155 insertions, 0 deletions
diff --git a/packages/api/src/agent-manager.ts b/packages/api/src/agent-manager.ts
index 2532efa..3b12a80 100644
--- a/packages/api/src/agent-manager.ts
+++ b/packages/api/src/agent-manager.ts
@@ -65,6 +65,7 @@ import {
toAvailableUserAgents,
type UsageData,
type UsageStats,
+ type UserContentPart,
validateConfig,
} from "@dispatch/core";
import type { PermissionManager } from "./permission-manager.js";
@@ -1536,6 +1537,13 @@ export class AgentManager {
workingDirectory?: string;
queueId?: string;
/**
+ * Ephemeral ordered multimodal content (image/pdf attachments) for a
+ * FRESH human turn. Forwarded to `processMessage` → `agent.run` only
+ * when the tab is idle (a started turn); never carried into the queue
+ * path (attachments require a fresh turn — the caller guards that).
+ */
+ content?: UserContentPart[];
+ /**
* Who is sending this message. `"human"` (default) is unrestricted
* and REFILLS the target's agent-to-agent auto-wake budget. `"agent"`
* (from the `send_to_tab` tool) is governed by that budget: an
@@ -1606,6 +1614,7 @@ export class AgentManager {
opts.reasoningEffort,
opts.workingDirectory,
agentModels,
+ opts.content,
).catch((err) => {
console.error(`[dispatch] deliverMessage processMessage error for tab ${tabId}:`, err);
});
@@ -1620,6 +1629,7 @@ export class AgentManager {
reasoningEffort?: ReasoningEffort,
workingDirectory?: string,
agentModels?: AgentModelEntry[],
+ content?: UserContentPart[],
): Promise<void> {
const tabAgent = this._getOrCreateTabAgent(tabId);
@@ -1731,6 +1741,7 @@ export class AgentManager {
for await (const event of agent.run(message, {
...(effortForEntry ? { reasoningEffort: effortForEntry } : {}),
abortSignal: tabAgent.abortController?.signal,
+ ...(content ? { content } : {}),
})) {
// Stop processing if the tab was aborted (closed/stopped).
// stopTab() already injected a `cancelled` system chunk into
diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts
index 84afd2a..2f4e538 100644
--- a/packages/api/src/app.ts
+++ b/packages/api/src/app.ts
@@ -3,6 +3,8 @@ import {
getTab,
isReasoningEffort,
NotificationDispatcher,
+ type UserContentPart,
+ validateUserContent,
} from "@dispatch/core";
import { Hono } from "hono";
import { cors } from "hono/cors";
@@ -37,6 +39,41 @@ function sanitizeAgentModels(raw: unknown): AgentModelEntry[] | undefined {
return out;
}
+/**
+ * Validate and normalise the optional multimodal `content` array from the
+ * `/chat` body. Each entry is either a `{ type: "text", text }` part or a
+ * `{ type: "attachment", mediaType, data, name? }` part (base64 payload).
+ * Shape only: SIZE/TYPE limits are enforced separately by `validateUserContent`.
+ *
+ * Malformed entries are skipped silently rather than rejected: non-object
+ * values, parts with an unrecognised `type`, text parts whose `text` is not a
+ * string, and attachment parts whose `mediaType` or `data` is not a string.
+ * Surviving parts keep their input order and are rebuilt from the known
+ * fields only, so any extra properties on an entry are dropped; `name` is
+ * copied only when it is a string.
+ *
+ * @param raw - The untrusted `content` value from the request body.
+ * @returns The normalised parts, or `undefined` when `raw` isn't a non-empty
+ * array or no well-formed attachment survives (so the plain-string path is
+ * taken — byte-identical to before).
+ */
+function sanitizeUserContent(raw: unknown): UserContentPart[] | undefined {
+ if (!Array.isArray(raw) || raw.length === 0) return undefined;
+ const out: UserContentPart[] = [];
+ let hasAttachment = false;
+ for (const p of raw) {
+ if (!p || typeof p !== "object") continue;
+ const part = p as Record<string, unknown>;
+ if (part.type === "text") {
+ if (typeof part.text === "string") out.push({ type: "text", text: part.text });
+ continue;
+ }
+ if (part.type === "attachment") {
+ if (typeof part.mediaType !== "string" || typeof part.data !== "string") continue;
+ hasAttachment = true;
+ out.push({
+ type: "attachment",
+ mediaType: part.mediaType,
+ data: part.data,
+ ...(typeof part.name === "string" ? { name: part.name } : {}),
+ });
+ }
+ }
+ // No attachment → let the plain-text path handle it (avoids needlessly
+ // switching the model message to array content for a text-only turn).
+ return hasAttachment ? out : undefined;
+}
+
export const permissionManager = new PermissionManager();
export const agentManager = new AgentManager(permissionManager);
@@ -94,6 +131,7 @@ app.post("/chat", async (c) => {
const body = await c.req.json<{
tabId?: unknown;
message?: unknown;
+ content?: unknown;
keyId?: unknown;
modelId?: unknown;
agentModels?: unknown;
@@ -121,6 +159,30 @@ app.post("/chat", async (c) => {
? body.reasoningEffort
: undefined;
+ // Optional multimodal content (image/pdf attachments). When present, the
+ // attachments are EPHEMERAL — forwarded to the model for this turn only and
+ // never persisted (the chunk log keeps just `message`, which the frontend
+ // has already projected to text with `[image]`/`[pdf]` markers).
+ const content = sanitizeUserContent(body.content);
+ if (content) {
+ // Enforce size/type/count ceilings server-side (defence in depth; the
+ // frontend also enforces them at paste time). Reject the whole request
+ // so no tokens are spent on an over-limit payload.
+ const validation = validateUserContent(content);
+ if (!validation.ok) {
+ return c.json({ error: "invalid attachments", details: validation.errors }, 400);
+ }
+ // Attachments only attach to a FRESH turn. If the tab is mid-turn the
+ // message would queue (text-only machinery), silently dropping the
+ // images. Reject clearly instead so the user can retry once idle.
+ if (agentManager.getTabStatus(tabId) === "running") {
+ return c.json(
+ { error: "cannot attach images while the agent is generating; wait for it to finish" },
+ 409,
+ );
+ }
+ }
+
// Single routing decision (queue if busy, new turn if idle) shared with the
// `send_to_tab` tool via `AgentManager.deliverMessage`. Non-blocking — a
// started turn runs in the background.
@@ -131,6 +193,7 @@ app.post("/chat", async (c) => {
...(reasoningEffort ? { reasoningEffort } : {}),
...(workingDirectory !== undefined ? { workingDirectory } : {}),
...(queueId ? { queueId } : {}),
+ ...(content ? { content } : {}),
});
if (outcome.status === "queued") {
diff --git a/packages/api/src/routes/models.ts b/packages/api/src/routes/models.ts
index eeb6029..a1700b1 100644
--- a/packages/api/src/routes/models.ts
+++ b/packages/api/src/routes/models.ts
@@ -20,6 +20,7 @@ import {
refreshAccountCredentialsAsync,
resolveApiKey,
resolveContextLimit,
+ resolveModelCapabilities,
selectHaikuModel,
setApiKey,
validateAccountCredentials,
@@ -180,6 +181,23 @@ modelsRoutes.get("/context-limit", async (c) => {
return c.json({ contextLimit });
});
+// Resolve a model's image / PDF INPUT capabilities from the models.dev catalog.
+// Requires non-empty `provider` and `modelId` query parameters (400 otherwise).
+// Returns `{ capabilities: { image, pdf } | null }`. `null` means UNKNOWN — the
+// provider is unmapped, the model is absent, the catalog predates the `modalities`
+// field, or the catalog is offline. The frontend treats `null` as "can't verify"
+// (optimistic allow) and a definitive `{ image: false }` as a hard block (no tokens spent).
+modelsRoutes.get("/capabilities", async (c) => {
+ const provider = c.req.query("provider");
+ const modelId = c.req.query("modelId");
+ if (!provider || !modelId) {
+ return c.json({ error: "provider and modelId query parameters are required" }, 400);
+ }
+
+ const capabilities = await resolveModelCapabilities(provider, modelId);
+ return c.json({ capabilities });
+});
+
// List available Claude accounts with validated credentials
modelsRoutes.get("/claude-accounts", async (c) => {
const candidates = resolveClaudeAccounts();
diff --git a/packages/api/tests/routes.test.ts b/packages/api/tests/routes.test.ts
index 37c19ca..7cfd8a7 100644
--- a/packages/api/tests/routes.test.ts
+++ b/packages/api/tests/routes.test.ts
@@ -219,6 +219,16 @@ vi.mock("@dispatch/core", () => ({
typeof value === "string" && ["none", "low", "medium", "high", "xhigh", "max"].includes(value)
);
},
+ // Lightweight stand-in for the real validator: accept the supported media
+ // types, reject everything else. Enough to exercise the /chat attachment
+ // validation branch (the real validator is unit-tested in core).
+ validateUserContent(content: Array<{ type: string; mediaType?: string }>) {
+ const accepted = ["image/png", "image/jpeg", "image/webp", "image/gif", "application/pdf"];
+ const errors = content
+ .filter((p) => p.type === "attachment" && !accepted.includes(p.mediaType ?? ""))
+ .map((p) => ({ code: "unsupported-type", mediaType: p.mediaType }));
+ return { ok: errors.length === 0, errors };
+ },
listOpenTabs() {
return [...fakeOpenTabs];
},
@@ -449,6 +459,59 @@ describe("POST /chat", () => {
expect(await res.json()).toEqual({ status: "ok" });
});
+ it("accepts a valid image attachment and starts a turn", async () => {
+ const res = await app.request("/chat", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ tabId: "tab-img-ok",
+ message: "look: [image]",
+ content: [
+ { type: "text", text: "look: " },
+ { type: "attachment", mediaType: "image/png", data: "QQ==" },
+ ],
+ }),
+ });
+ expect(res.status).toBe(200);
+ expect(await res.json()).toEqual({ status: "ok" });
+ });
+
+ it("returns 400 for an unsupported attachment media type", async () => {
+ const res = await app.request("/chat", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ tabId: "tab-img-bad",
+ message: "look: [image]",
+ content: [{ type: "attachment", mediaType: "image/svg+xml", data: "QQ==" }],
+ }),
+ });
+ expect(res.status).toBe(400);
+ const body = await res.json();
+ expect(body.error).toBe("invalid attachments");
+ });
+
+ it("returns 409 when attaching while the agent is generating", async () => {
+ // Kick off a turn so the tab is running.
+ await app.request("/chat", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ tabId: "tab-img-busy", message: "first" }),
+ });
+ await new Promise<void>((r) => setTimeout(r, 20));
+
+ const res = await app.request("/chat", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ tabId: "tab-img-busy",
+ message: "second [image]",
+ content: [{ type: "attachment", mediaType: "image/png", data: "QQ==" }],
+ }),
+ });
+ expect(res.status).toBe(409);
+ });
+
it("returns 400 with empty message", async () => {
const res = await app.request("/chat", {
method: "POST",