summaryrefslogtreecommitdiffhomepage
path: root/packages/api/src/app.ts
diff options
context:
space:
mode:
authorAdam Malczewski <[email protected]>2026-06-02 22:50:11 +0900
committerAdam Malczewski <[email protected]>2026-06-02 22:50:11 +0900
commit66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae (patch)
treec3e039e09c89231f84dfd16f7bbbf8aedcc2dc7d /packages/api/src/app.ts
parent4b45d33c256cf580a53054078be6fd7148fa6302 (diff)
downloaddispatch-66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae.tar.gz
dispatch-66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae.zip
feat(chat): paste-to-attach images/PDFs with model capability check
Add multimodal image/PDF input to the chat box via clipboard paste, gated by a graceful per-model capability check. UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the draft, so attachments have ORDER relative to typed text and can be referenced positionally. The token is the only handle — deleting it (atomic Backspace/Delete, or selection overlap) detaches the file; an input-reconciliation safety net detaches any attachment whose token is no longer intact. No preview strip. Capability check: resolveModelCapabilities reads models.dev modalities.input (new GET /models/capabilities, mirrors /context-limit). The input blocks Send (no tokens spent) only on a definitive 'no'; unknown capability (catalog offline / unmapped provider) stays permissive. Attachments require a fresh turn — Send is blocked while generating and /chat rejects content mid-turn (409). Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK ImagePart/FilePart content, but never persisted (history keeps the text with [image]/[pdf] markers). Text-only turns serialize byte-identically to before. Limits (Anthropic-aligned, enforced at paste + re-validated server-side): PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total. core: UserContentPart types, models/attachments validator, capability resolver, agent.run+toModelMessages thread ordered content. api: /chat content validation + passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability fetch. +44 tests.
Diffstat (limited to 'packages/api/src/app.ts')
-rw-r--r--packages/api/src/app.ts63
1 files changed, 63 insertions, 0 deletions
diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts
index 84afd2a..2f4e538 100644
--- a/packages/api/src/app.ts
+++ b/packages/api/src/app.ts
@@ -3,6 +3,8 @@ import {
getTab,
isReasoningEffort,
NotificationDispatcher,
+ type UserContentPart,
+ validateUserContent,
} from "@dispatch/core";
import { Hono } from "hono";
import { cors } from "hono/cors";
@@ -37,6 +39,41 @@ function sanitizeAgentModels(raw: unknown): AgentModelEntry[] | undefined {
return out;
}
+/**
+ * Shape-check the optional multimodal `content` array from the `/chat` body,
+ * keeping `{ type: "text", text }` and `{ type: "attachment", mediaType, data,
+ * name? }` (base64 payload) parts in order and silently skipping malformed ones.
+ * Returns `undefined` when the input isn't a non-empty array or no attachment
+ * was kept (so the plain-string path is taken — byte-identical to before).
+ * Shape only: SIZE/TYPE limits are enforced separately by `validateUserContent`.
+ */
+function sanitizeUserContent(raw: unknown): UserContentPart[] | undefined {
+ if (!Array.isArray(raw) || raw.length === 0) return undefined;
+ const out: UserContentPart[] = [];
+ let hasAttachment = false; // set once a well-formed attachment has been kept
+ for (const p of raw) {
+ if (!p || typeof p !== "object") continue; // skip falsy and non-object entries
+ const part = p as Record<string, unknown>;
+ if (part.type === "text") {
+ if (typeof part.text === "string") out.push({ type: "text", text: part.text }); // non-string `text` is dropped
+ continue;
+ }
+ if (part.type === "attachment") {
+ if (typeof part.mediaType !== "string" || typeof part.data !== "string") continue; // malformed attachment: drop it
+ hasAttachment = true;
+ out.push({
+ type: "attachment",
+ mediaType: part.mediaType,
+ data: part.data,
+ ...(typeof part.name === "string" ? { name: part.name } : {}), // `name` is copied only when it is a string
+ });
+ } // any other `type` value falls through and is ignored
+ }
+ // No attachment → let the plain-text path handle it (avoids needlessly
+ // switching the model message to array content for a text-only turn).
+ return hasAttachment ? out : undefined;
+}
+
export const permissionManager = new PermissionManager();
export const agentManager = new AgentManager(permissionManager);
@@ -94,6 +131,7 @@ app.post("/chat", async (c) => {
const body = await c.req.json<{
tabId?: unknown;
message?: unknown;
+ content?: unknown;
keyId?: unknown;
modelId?: unknown;
agentModels?: unknown;
@@ -121,6 +159,30 @@ app.post("/chat", async (c) => {
? body.reasoningEffort
: undefined;
+ // Optional multimodal content (image/pdf attachments). When present, the
+ // attachments are EPHEMERAL — forwarded to the model for this turn only and
+ // never persisted (the chunk log keeps just `message`, which the frontend
+ // has already projected to text with `[image]`/`[pdf]` markers).
+ const content = sanitizeUserContent(body.content);
+ if (content) {
+ // Enforce size/type/count ceilings server-side (defence in depth; the
+ // frontend also enforces them at paste time). Reject the whole request
+ // so no tokens are spent on an over-limit payload.
+ const validation = validateUserContent(content);
+ if (!validation.ok) {
+ return c.json({ error: "invalid attachments", details: validation.errors }, 400);
+ }
+ // Attachments only attach to a FRESH turn. If the tab is mid-turn the
+ // message would queue (text-only machinery), silently dropping the
+ // images. Reject clearly instead so the user can retry once idle.
+ if (agentManager.getTabStatus(tabId) === "running") {
+ return c.json(
+ { error: "cannot attach images while the agent is generating; wait for it to finish" },
+ 409,
+ );
+ }
+ }
+
// Single routing decision (queue if busy, new turn if idle) shared with the
// `send_to_tab` tool via `AgentManager.deliverMessage`. Non-blocking — a
// started turn runs in the background.
@@ -131,6 +193,7 @@ app.post("/chat", async (c) => {
...(reasoningEffort ? { reasoningEffort } : {}),
...(workingDirectory !== undefined ? { workingDirectory } : {}),
...(queueId ? { queueId } : {}),
+ ...(content ? { content } : {}),
});
if (outcome.status === "queued") {