From 66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae Mon Sep 17 00:00:00 2001
From: Adam Malczewski <github@tradam.dev>
Date: Tue, 2 Jun 2026 22:50:11 +0900
Subject: feat(chat): paste-to-attach images/PDFs with model capability check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add multimodal image/PDF input to the chat box via clipboard paste, gated by a
graceful per-model capability check.

UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the
draft, so attachments have ORDER relative to typed text and can be referenced
positionally. The token is the only handle — deleting it (atomic Backspace/
Delete, or selection overlap) detaches the file; an input-reconciliation safety
net detaches any attachment whose token is no longer intact. No preview strip.

Capability check: resolveModelCapabilities reads models.dev modalities.input
(new GET /models/capabilities, mirrors /context-limit). The input blocks Send
(no tokens spent) only on a definitive 'no'; unknown capability (catalog offline
/ unmapped provider) stays permissive. Attachments require a fresh turn — Send is
blocked while generating and /chat rejects content mid-turn (409).

Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK
ImagePart/FilePart content, but never persisted (history keeps the text with
[image]/[pdf] markers). Text-only turns serialize byte-identically to before.

Limits (Anthropic-aligned, enforced at paste + re-validated server-side):
PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total.

core: UserContentPart types, models/attachments validator, capability resolver,
agent.run+toModelMessages thread ordered content. api: /chat content validation +
passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating,
per-tab staged attachments, App.svelte capability fetch. +44 tests.
---
 packages/api/src/agent-manager.ts                  |  11 +
 packages/api/src/app.ts                            |  63 ++++++
 packages/api/src/routes/models.ts                  |  18 ++
 packages/api/tests/routes.test.ts                  |  63 ++++++
 packages/core/src/agent/agent.ts                   | Bin 57822 -> 60515 bytes
 packages/core/src/index.ts                         |  17 ++
 packages/core/src/models/attachments.ts            | 151 +++++++++++++
 packages/core/src/models/catalog.ts                |  50 +++++
 packages/core/src/models/index.ts                  |  19 ++
 packages/core/src/types/index.ts                   |  49 +++++
 packages/core/tests/agent/agent.test.ts            |  98 +++++++++
 packages/core/tests/models/attachments.test.ts     | 136 ++++++++++++
 packages/core/tests/models/catalog.test.ts         |  75 ++++++-
 packages/frontend/src/App.svelte                   |  55 ++++-
 packages/frontend/src/lib/attachment-tokens.ts     | 234 +++++++++++++++++++++
 .../frontend/src/lib/components/ChatInput.svelte   | 223 +++++++++++++++++++-
 packages/frontend/src/lib/tabs.svelte.ts           |  66 +++++-
 packages/frontend/tests/attachment-tokens.test.ts  | 130 ++++++++++++
 packages/frontend/tests/chat-store.test.ts         |  75 +++++++
 19 files changed, 1510 insertions(+), 23 deletions(-)
 create mode 100644 packages/core/src/models/attachments.ts
 create mode 100644 packages/core/tests/models/attachments.test.ts
 create mode 100644 packages/frontend/src/lib/attachment-tokens.ts
 create mode 100644 packages/frontend/tests/attachment-tokens.test.ts

(limited to 'packages')

diff --git a/packages/api/src/agent-manager.ts b/packages/api/src/agent-manager.ts
index 2532efa..3b12a80 100644
--- a/packages/api/src/agent-manager.ts
+++ b/packages/api/src/agent-manager.ts
@@ -65,6 +65,7 @@ import {
 	toAvailableUserAgents,
 	type UsageData,
 	type UsageStats,
+	type UserContentPart,
 	validateConfig,
 } from "@dispatch/core";
 import type { PermissionManager } from "./permission-manager.js";
@@ -1535,6 +1536,13 @@ export class AgentManager {
 			reasoningEffort?: ReasoningEffort;
 			workingDirectory?: string;
 			queueId?: string;
+			/**
+			 * Ephemeral ordered multimodal content (image/pdf attachments) for a
+			 * FRESH human turn. Forwarded to `processMessage` → `agent.run` only
+			 * when the tab is idle (a started turn); never carried into the queue
+			 * path (attachments require a fresh turn — the caller guards that).
+			 */
+			content?: UserContentPart[];
 			/**
 			 * Who is sending this message. `"human"` (default) is unrestricted
 			 * and REFILLS the target's agent-to-agent auto-wake budget. `"agent"`
@@ -1606,6 +1614,7 @@ export class AgentManager {
 			opts.reasoningEffort,
 			opts.workingDirectory,
 			agentModels,
+			opts.content,
 		).catch((err) => {
 			console.error(`[dispatch] deliverMessage processMessage error for tab ${tabId}:`, err);
 		});
@@ -1620,6 +1629,7 @@ export class AgentManager {
 		reasoningEffort?: ReasoningEffort,
 		workingDirectory?: string,
 		agentModels?: AgentModelEntry[],
+		content?: UserContentPart[],
 	): Promise<void> {
 		const tabAgent = this._getOrCreateTabAgent(tabId);
 
@@ -1731,6 +1741,7 @@ export class AgentManager {
 				for await (const event of agent.run(message, {
 					...(effortForEntry ? { reasoningEffort: effortForEntry } : {}),
 					abortSignal: tabAgent.abortController?.signal,
+					...(content ? { content } : {}),
 				})) {
 					// Stop processing if the tab was aborted (closed/stopped).
 					// stopTab() already injected a `cancelled` system chunk into
diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts
index 84afd2a..2f4e538 100644
--- a/packages/api/src/app.ts
+++ b/packages/api/src/app.ts
@@ -3,6 +3,8 @@ import {
 	getTab,
 	isReasoningEffort,
 	NotificationDispatcher,
+	type UserContentPart,
+	validateUserContent,
 } from "@dispatch/core";
 import { Hono } from "hono";
 import { cors } from "hono/cors";
@@ -37,6 +39,41 @@ function sanitizeAgentModels(raw: unknown): AgentModelEntry[] | undefined {
 	return out;
 }
 
+/**
+ * Shape-check and normalise the optional multimodal `content` array from the
+ * `/chat` body. Kept entries are `{ type: "text", text }` parts and
+ * `{ type: "attachment", mediaType, data, name? }` parts (base64 payload);
+ * non-objects, unknown `type`s and parts whose required fields are not strings
+ * are skipped without error. Returns `undefined` unless at least one attachment
+ * is kept (plain-string path). SIZE/TYPE limits live in `validateUserContent`.
+ */
+function sanitizeUserContent(raw: unknown): UserContentPart[] | undefined {
+	if (!Array.isArray(raw) || raw.length === 0) return undefined;
+	const out: UserContentPart[] = [];
+	let hasAttachment = false;
+	for (const p of raw) {
+		if (!p || typeof p !== "object") continue;
+		const part = p as Record<string, unknown>;
+		if (part.type === "text") {
+			if (typeof part.text === "string") out.push({ type: "text", text: part.text });
+			continue;
+		}
+		if (part.type === "attachment") {
+			if (typeof part.mediaType !== "string" || typeof part.data !== "string") continue;
+			hasAttachment = true;
+			out.push({
+				type: "attachment",
+				mediaType: part.mediaType,
+				data: part.data,
+				...(typeof part.name === "string" ? { name: part.name } : {}),
+			});
+		}
+	}
+	// No attachment → let the plain-text path handle it (avoids needlessly
+	// switching the model message to array content for a text-only turn).
+	return hasAttachment ? out : undefined;
+}
+
 export const permissionManager = new PermissionManager();
 export const agentManager = new AgentManager(permissionManager);
 
@@ -94,6 +131,7 @@ app.post("/chat", async (c) => {
 	const body = await c.req.json<{
 		tabId?: unknown;
 		message?: unknown;
+		content?: unknown;
 		keyId?: unknown;
 		modelId?: unknown;
 		agentModels?: unknown;
@@ -121,6 +159,30 @@ app.post("/chat", async (c) => {
 		? body.reasoningEffort
 		: undefined;
 
+	// Optional multimodal content (image/pdf attachments). When present, the
+	// attachments are EPHEMERAL — forwarded to the model for this turn only and
+	// never persisted (the chunk log keeps just `message`, which the frontend
+	// has already projected to text with `[image]`/`[pdf]` markers).
+	const content = sanitizeUserContent(body.content);
+	if (content) {
+		// Enforce size/type/count ceilings server-side (defence in depth; the
+		// frontend also enforces them at paste time). Reject the whole request
+		// so no tokens are spent on an over-limit payload.
+		const validation = validateUserContent(content);
+		if (!validation.ok) {
+			return c.json({ error: "invalid attachments", details: validation.errors }, 400);
+		}
+		// Attachments only attach to a FRESH turn. If the tab is mid-turn the
+		// message would queue (text-only machinery), silently dropping the
+		// images. Reject clearly instead so the user can retry once idle.
+		if (agentManager.getTabStatus(tabId) === "running") {
+			return c.json(
+				{ error: "cannot attach images while the agent is generating; wait for it to finish" },
+				409,
+			);
+		}
+	}
+
 	// Single routing decision (queue if busy, new turn if idle) shared with the
 	// `send_to_tab` tool via `AgentManager.deliverMessage`. Non-blocking — a
 	// started turn runs in the background.
@@ -131,6 +193,7 @@ app.post("/chat", async (c) => {
 		...(reasoningEffort ? { reasoningEffort } : {}),
 		...(workingDirectory !== undefined ? { workingDirectory } : {}),
 		...(queueId ? { queueId } : {}),
+		...(content ? { content } : {}),
 	});
 
 	if (outcome.status === "queued") {
diff --git a/packages/api/src/routes/models.ts b/packages/api/src/routes/models.ts
index eeb6029..a1700b1 100644
--- a/packages/api/src/routes/models.ts
+++ b/packages/api/src/routes/models.ts
@@ -20,6 +20,7 @@ import {
 	refreshAccountCredentialsAsync,
 	resolveApiKey,
 	resolveContextLimit,
+	resolveModelCapabilities,
 	selectHaikuModel,
 	setApiKey,
 	validateAccountCredentials,
@@ -180,6 +181,23 @@ modelsRoutes.get("/context-limit", async (c) => {
 	return c.json({ contextLimit });
 });
 
+// Resolve a model's image / PDF INPUT capabilities from the models.dev catalog.
+// Responds 400 unless both `provider` and `modelId` query params are given;
+// otherwise `{ capabilities: { image, pdf } | null }`. `null` means UNKNOWN —
+// unmapped provider, absent model, no `modalities` field, or catalog offline.
+// The frontend treats `null` as "can't verify" (optimistic allow) and a
+// definitive `{ image: false }` as a hard block (no tokens spent).
+modelsRoutes.get("/capabilities", async (c) => {
+	const provider = c.req.query("provider");
+	const modelId = c.req.query("modelId");
+	if (!provider || !modelId) {
+		return c.json({ error: "provider and modelId query parameters are required" }, 400);
+	}
+
+	const capabilities = await resolveModelCapabilities(provider, modelId);
+	return c.json({ capabilities });
+});
+
 // List available Claude accounts with validated credentials
 modelsRoutes.get("/claude-accounts", async (c) => {
 	const candidates = resolveClaudeAccounts();
diff --git a/packages/api/tests/routes.test.ts b/packages/api/tests/routes.test.ts
index 37c19ca..7cfd8a7 100644
--- a/packages/api/tests/routes.test.ts
+++ b/packages/api/tests/routes.test.ts
@@ -219,6 +219,16 @@ vi.mock("@dispatch/core", () => ({
 			typeof value === "string" && ["none", "low", "medium", "high", "xhigh", "max"].includes(value)
 		);
 	},
+	// Lightweight stand-in for the real validator: accept the supported media
+	// types, reject everything else. Enough to exercise the /chat attachment
+	// validation branch (the real validator is unit-tested in core).
+	validateUserContent(content: Array<{ type: string; mediaType?: string }>) {
+		const accepted = ["image/png", "image/jpeg", "image/webp", "image/gif", "application/pdf"];
+		const errors = content
+			.filter((p) => p.type === "attachment" && !accepted.includes(p.mediaType ?? ""))
+			.map((p) => ({ code: "unsupported-type", mediaType: p.mediaType }));
+		return { ok: errors.length === 0, errors };
+	},
 	listOpenTabs() {
 		return [...fakeOpenTabs];
 	},
@@ -449,6 +459,59 @@ describe("POST /chat", () => {
 		expect(await res.json()).toEqual({ status: "ok" });
 	});
 
+	it("accepts a valid image attachment and starts a turn", async () => {
+		const res = await app.request("/chat", {
+			method: "POST",
+			headers: { "Content-Type": "application/json" },
+			body: JSON.stringify({
+				tabId: "tab-img-ok",
+				message: "look: [image]",
+				content: [
+					{ type: "text", text: "look: " },
+					{ type: "attachment", mediaType: "image/png", data: "QQ==" },
+				],
+			}),
+		});
+		expect(res.status).toBe(200);
+		expect(await res.json()).toEqual({ status: "ok" });
+	});
+
+	it("returns 400 for an unsupported attachment media type", async () => {
+		const res = await app.request("/chat", {
+			method: "POST",
+			headers: { "Content-Type": "application/json" },
+			body: JSON.stringify({
+				tabId: "tab-img-bad",
+				message: "look: [image]",
+				content: [{ type: "attachment", mediaType: "image/svg+xml", data: "QQ==" }],
+			}),
+		});
+		expect(res.status).toBe(400);
+		const body = await res.json();
+		expect(body.error).toBe("invalid attachments");
+	});
+
+	it("returns 409 when attaching while the agent is generating", async () => {
+		// Kick off a turn so the tab is running.
+		await app.request("/chat", {
+			method: "POST",
+			headers: { "Content-Type": "application/json" },
+			body: JSON.stringify({ tabId: "tab-img-busy", message: "first" }),
+		});
+		await new Promise<void>((r) => setTimeout(r, 20));
+
+		const res = await app.request("/chat", {
+			method: "POST",
+			headers: { "Content-Type": "application/json" },
+			body: JSON.stringify({
+				tabId: "tab-img-busy",
+				message: "second [image]",
+				content: [{ type: "attachment", mediaType: "image/png", data: "QQ==" }],
+			}),
+		});
+		expect(res.status).toBe(409);
+	});
+
 	it("returns 400 with empty message", async () => {
 		const res = await app.request("/chat", {
 			method: "POST",
diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index 4bfa7eb..08b317a 100644
Binary files a/packages/core/src/agent/agent.ts and b/packages/core/src/agent/agent.ts differ
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 08b426f..50012f1 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -82,9 +82,26 @@ export {
 } from "./lsp/index.js";
 // Models
 export {
+	ACCEPTED_ATTACHMENT_MEDIA_TYPES,
+	ACCEPTED_IMAGE_MEDIA_TYPES,
+	ACCEPTED_PDF_MEDIA_TYPE,
+	type AttachmentValidationError,
+	type AttachmentValidationResult,
+	base64ByteLength,
 	getModelsCatalog,
+	hasAttachments,
+	isAcceptedAttachmentMediaType,
+	isImageMediaType,
+	isPdfMediaType,
+	MAX_ATTACHMENTS,
+	MAX_IMAGE_BYTES,
+	MAX_PDF_BYTES,
+	MAX_TOTAL_ATTACHMENT_BYTES,
+	type ModelInputCapabilities,
 	ModelRegistry,
 	resolveContextLimit,
+	resolveModelCapabilities,
+	validateUserContent,
 } from "./models/index.js";
 // Notifications (ntfy.sh)
 export * from "./notifications/index.js";
diff --git a/packages/core/src/models/attachments.ts b/packages/core/src/models/attachments.ts
new file mode 100644
index 0000000..5c98db4
--- /dev/null
+++ b/packages/core/src/models/attachments.ts
@@ -0,0 +1,151 @@
+// Validation + limits for multimodal user attachments (images / PDFs).
+//
+// Kept dependency-free (no DB / `bun:sqlite` import) so both the API layer
+// (`/chat` request validation) and any future caller can share the exact same
+// allowlist and size/count ceilings. The limits mirror Anthropic's documented
+// vision/PDF API constraints (the only image-capable providers Dispatch maps),
+// so a request that passes here won't be rejected by the provider for size.
+
+import type { UserAttachmentPart, UserContentPart } from "../types/index.js";
+
+/** Accepted image media types. */
+export const ACCEPTED_IMAGE_MEDIA_TYPES = [
+	"image/png",
+	"image/jpeg",
+	"image/webp",
+	"image/gif",
+] as const;
+
+/** Accepted document media types. */
+export const ACCEPTED_PDF_MEDIA_TYPE = "application/pdf";
+
+/** Every media type we accept as an attachment. */
+export const ACCEPTED_ATTACHMENT_MEDIA_TYPES = [
+	...ACCEPTED_IMAGE_MEDIA_TYPES,
+	ACCEPTED_PDF_MEDIA_TYPE,
+] as const;
+
+/** Per-image byte ceiling (Anthropic: 5 MB/image). */
+export const MAX_IMAGE_BYTES = 5 * 1024 * 1024;
+
+/** Per-PDF byte ceiling (Anthropic: 32 MB/PDF). */
+export const MAX_PDF_BYTES = 32 * 1024 * 1024;
+
+/** Max attachments per message (Anthropic: 20 images/request). */
+export const MAX_ATTACHMENTS = 20;
+
+/**
+ * Total attachment payload ceiling for a single request (decoded bytes). Bounds
+ * the overall request size even when each individual file is within its limit.
+ */
+export const MAX_TOTAL_ATTACHMENT_BYTES = 32 * 1024 * 1024;
+
+/** Whether a media type is an accepted image type. */
+export function isImageMediaType(mediaType: string): boolean {
+	return (ACCEPTED_IMAGE_MEDIA_TYPES as readonly string[]).includes(mediaType);
+}
+
+/** Whether a media type is the accepted PDF type. */
+export function isPdfMediaType(mediaType: string): boolean {
+	return mediaType === ACCEPTED_PDF_MEDIA_TYPE;
+}
+
+/** Whether a media type is an accepted attachment type at all. */
+export function isAcceptedAttachmentMediaType(mediaType: string): boolean {
+	return (ACCEPTED_ATTACHMENT_MEDIA_TYPES as readonly string[]).includes(mediaType);
+}
+
+/**
+ * Decoded byte length of a base64 string, computed WITHOUT allocating the
+ * decoded buffer. Tolerates a `data:<mediaType>;base64,` prefix and embedded
+ * whitespace. Returns 0 for an empty/whitespace string; never negative.
+ */
+export function base64ByteLength(b64: string): number {
+	// Strip a data-URI prefix if present.
+	const comma = b64.indexOf(",");
+	const body = b64.startsWith("data:") && comma !== -1 ? b64.slice(comma + 1) : b64;
+	let len = 0;
+	let pad = 0;
+	for (let i = 0; i < body.length; i++) {
+		const ch = body.charCodeAt(i);
+		// Skip whitespace (space, \t, \n, \r).
+		if (ch === 32 || ch === 9 || ch === 10 || ch === 13) continue;
+		len++;
+		// 61 is "="; count it as padding only up to the 2 chars base64 allows.
+		if (ch === 61 && pad < 2) pad++;
+	}
+	// 4 chars → 3 bytes, minus padding; clamp so junk like "==" can't go negative.
+	return Math.max(0, Math.floor((len * 3) / 4) - pad);
+}
+
+export type AttachmentValidationError =
+	| { code: "unsupported-type"; mediaType: string }
+	| { code: "image-too-large"; mediaType: string; bytes: number; limit: number }
+	| { code: "pdf-too-large"; bytes: number; limit: number }
+	| { code: "too-many"; count: number; limit: number }
+	| { code: "total-too-large"; bytes: number; limit: number }
+	| { code: "empty"; mediaType: string };
+
+export interface AttachmentValidationResult {
+	ok: boolean;
+	errors: AttachmentValidationError[];
+}
+
+/** Extract just the attachment parts from a mixed content list. */
+function attachmentsOf(content: UserContentPart[]): UserAttachmentPart[] {
+	return content.filter((p): p is UserAttachmentPart => p.type === "attachment");
+}
+
+/**
+ * Validate the attachments in a multimodal user content list against the
+ * media-type allowlist and the size/count ceilings. Pure: never throws,
+ * collects every violation so the caller can report them all at once.
+ *
+ * Text parts are ignored (always valid); an empty list is valid. A payload whose
+ * decoded size is not positive is reported as `empty` and never offsets totals.
+ */
+export function validateUserContent(content: UserContentPart[]): AttachmentValidationResult {
+	const errors: AttachmentValidationError[] = [];
+	const attachments = attachmentsOf(content);
+
+	if (attachments.length > MAX_ATTACHMENTS) {
+		errors.push({ code: "too-many", count: attachments.length, limit: MAX_ATTACHMENTS });
+	}
+
+	let total = 0;
+	for (const att of attachments) {
+		if (!isAcceptedAttachmentMediaType(att.mediaType)) {
+			errors.push({ code: "unsupported-type", mediaType: att.mediaType });
+			continue;
+		}
+		const bytes = base64ByteLength(att.data);
+		if (bytes <= 0) {
+			errors.push({ code: "empty", mediaType: att.mediaType });
+			continue;
+		}
+		total += bytes;
+		if (isPdfMediaType(att.mediaType)) {
+			if (bytes > MAX_PDF_BYTES) {
+				errors.push({ code: "pdf-too-large", bytes, limit: MAX_PDF_BYTES });
+			}
+		} else if (bytes > MAX_IMAGE_BYTES) {
+			errors.push({
+				code: "image-too-large",
+				mediaType: att.mediaType,
+				bytes,
+				limit: MAX_IMAGE_BYTES,
+			});
+		}
+	}
+
+	if (total > MAX_TOTAL_ATTACHMENT_BYTES) {
+		errors.push({ code: "total-too-large", bytes: total, limit: MAX_TOTAL_ATTACHMENT_BYTES });
+	}
+
+	return { ok: errors.length === 0, errors };
+}
+
+/** Convenience: does the content list contain at least one attachment? */
+export function hasAttachments(content: UserContentPart[] | undefined | null): boolean {
+	return !!content && content.some((p) => p.type === "attachment");
+}
diff --git a/packages/core/src/models/catalog.ts b/packages/core/src/models/catalog.ts
index dea4647..ac310b1 100644
--- a/packages/core/src/models/catalog.ts
+++ b/packages/core/src/models/catalog.ts
@@ -18,6 +18,15 @@ interface ModelsDevModel {
 		context?: number;
 		output?: number;
 	};
+	/**
+	 * Input/output modalities the model accepts. We read `input` to decide
+	 * whether the model can take image / pdf attachments. Absent on older
+	 * catalog entries — treated as "unknown" (capability resolves to `null`).
+	 */
+	modalities?: {
+		input?: string[];
+		output?: string[];
+	};
 }
 
 interface ModelsDevProvider {
@@ -172,6 +181,47 @@ export async function resolveContextLimit(
 	return null;
 }
 
+/**
+ * Image / PDF input capabilities for a model, resolved from the models.dev
+ * catalog's `modalities.input` list.
+ */
+export interface ModelInputCapabilities {
+	/** Model accepts image input (vision). */
+	image: boolean;
+	/** Model accepts PDF/document input. */
+	pdf: boolean;
+}
+
+/**
+ * Resolve whether a model accepts image / pdf input for the given Dispatch
+ * provider + model id. Returns `null` when the capability is UNKNOWN — i.e. the
+ * provider is unsupported/unmapped, the model is absent from the catalog, the
+ * entry predates the `modalities` field, or the catalog is unavailable. Callers
+ * should treat `null` as "can't verify" (optimistic allow) rather than a
+ * definitive "no", so a temporary catalog outage never disables a known-good
+ * vision model. An empty `modelId` also resolves to `null` (no catalog read).
+ *
+ * The provider's `PROVIDER_MAP` candidates are tried in order; the first entry
+ * whose `modalities.input` is an array decides the result: `{ image, pdf }` is
+ * then a definitive yes/no for the literal `"image"` / `"pdf"` modality names.
+ */
+export async function resolveModelCapabilities(
+	provider: string,
+	modelId: string,
+): Promise<ModelInputCapabilities | null> {
+	const candidates = PROVIDER_MAP[provider];
+	if (!candidates || !modelId) return null;
+
+	const catalog = await getModelsCatalog();
+	for (const providerId of candidates) {
+		const input = catalog[providerId]?.models?.[modelId]?.modalities?.input;
+		if (Array.isArray(input)) {
+			return { image: input.includes("image"), pdf: input.includes("pdf") };
+		}
+	}
+	return null;
+}
+
 /** Test-only: reset the in-process memo so a test can re-exercise loading. */
 export function __resetCatalogCacheForTests(): void {
 	cached = null;
diff --git a/packages/core/src/models/index.ts b/packages/core/src/models/index.ts
index 2fcd657..15d1ee2 100644
--- a/packages/core/src/models/index.ts
+++ b/packages/core/src/models/index.ts
@@ -1,5 +1,24 @@
+export {
+	ACCEPTED_ATTACHMENT_MEDIA_TYPES,
+	ACCEPTED_IMAGE_MEDIA_TYPES,
+	ACCEPTED_PDF_MEDIA_TYPE,
+	type AttachmentValidationError,
+	type AttachmentValidationResult,
+	base64ByteLength,
+	hasAttachments,
+	isAcceptedAttachmentMediaType,
+	isImageMediaType,
+	isPdfMediaType,
+	MAX_ATTACHMENTS,
+	MAX_IMAGE_BYTES,
+	MAX_PDF_BYTES,
+	MAX_TOTAL_ATTACHMENT_BYTES,
+	validateUserContent,
+} from "./attachments.js";
 export {
 	getModelsCatalog,
+	type ModelInputCapabilities,
 	resolveContextLimit,
+	resolveModelCapabilities,
 } from "./catalog.js";
 export { ModelRegistry } from "./registry.js";
diff --git a/packages/core/src/types/index.ts b/packages/core/src/types/index.ts
index 607b27d..273e074 100644
--- a/packages/core/src/types/index.ts
+++ b/packages/core/src/types/index.ts
@@ -76,8 +76,57 @@ export interface SystemChunk {
 export interface ChatMessage {
 	role: MessageRole;
 	chunks: Chunk[];
+	/**
+	 * Ephemeral ORDERED multimodal content for a user turn (interleaved text +
+	 * image/pdf attachments). Set ONLY transiently on the in-flight user message
+	 * so `toModelMessages` can emit multimodal `ImagePart`/`FilePart` content to
+	 * the provider. Never persisted (the chunk log stores only the text, with
+	 * `[image]`/`[pdf]` markers), so it's absent on history-rebuilt messages.
+	 * When absent, the message is plain text built from its `chunks`.
+	 */
+	content?: UserContentPart[];
 }
 
+// ─── Multimodal user content (image / PDF attachments) ───────────
+//
+// When a user pastes one or more images/PDFs into the chat input, the turn's
+// user message carries an ORDERED list of content parts instead of a plain
+// string. The ordering is meaningful — the user can interleave text and
+// attachments ("here is image A: <A>, here is image B: <B>") and the model
+// sees them in exactly that sequence.
+//
+// These parts are EPHEMERAL: they are forwarded to the model for the turn that
+// produced them but are NOT persisted as raw bytes in the chunk log. History
+// stores only the user's text (with `[image]` / `[pdf]` markers in place of
+// each attachment), so a later reload re-renders the text but never re-sends
+// the binary payload. This keeps the persisted log small and avoids re-billing
+// image tokens on every subsequent turn.
+
+/** A plain-text segment of a multimodal user message. */
+export interface UserTextPart {
+	type: "text";
+	text: string;
+}
+
+/**
+ * A binary attachment (image or PDF) in a multimodal user message. `data` is a
+ * base64-encoded payload (no `data:` URI prefix); `mediaType` is the IANA media
+ * type (e.g. `image/png`, `application/pdf`). `name` is an optional original
+ * filename, used only for PDF `filename` passthrough and diagnostics.
+ */
+export interface UserAttachmentPart {
+	type: "attachment";
+	/** IANA media type, e.g. `image/png`, `image/jpeg`, `application/pdf`. */
+	mediaType: string;
+	/** Base64-encoded bytes WITHOUT a `data:` URI prefix. */
+	data: string;
+	/** Optional original filename (mainly for PDFs). */
+	name?: string;
+}
+
+/** One ordered part of a multimodal user message. */
+export type UserContentPart = UserTextPart | UserAttachmentPart;
+
 // ─── Append-only chunk log (persisted model) ─────────────────────
 //
 // The DB stores a conversation as a flat stream of `ChunkRow`s (see
diff --git a/packages/core/tests/agent/agent.test.ts b/packages/core/tests/agent/agent.test.ts
index d8edec7..f4b33cc 100644
--- a/packages/core/tests/agent/agent.test.ts
+++ b/packages/core/tests/agent/agent.test.ts
@@ -1544,4 +1544,102 @@ describe("anthropicThinkingProviderOptions — adaptive-thinking model detection
 			effort: "xhigh",
 		});
 	});
+
+	describe("multimodal user content", () => {
+		it("emits ordered text + image parts to the model when content is provided", async () => {
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
+			);
+
+			const agent = new Agent(makeConfig());
+			for await (const _ of agent.run("here is image A: [image]", {
+				content: [
+					{ type: "text", text: "here is image A: " },
+					{ type: "attachment", mediaType: "image/png", data: "QQ==" },
+				],
+			})) {
+				// consume
+			}
+
+			const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
+			const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
+			const userMsg = messages.find((m) => m.role === "user");
+			expect(userMsg).toBeDefined();
+			// Multimodal turn → content is an ordered parts array, not a string.
+			expect(Array.isArray(userMsg?.content)).toBe(true);
+			const parts = userMsg?.content as Array<Record<string, unknown>>;
+			expect(parts[0]).toMatchObject({ type: "text", text: "here is image A: " });
+			expect(parts[1]).toMatchObject({ type: "image", mediaType: "image/png" });
+			expect(String(parts[1]?.image)).toBe("data:image/png;base64,QQ==");
+		});
+
+		it("emits a FilePart for a PDF attachment with its filename", async () => {
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
+			);
+
+			const agent = new Agent(makeConfig());
+			for await (const _ of agent.run("see [pdf]", {
+				content: [
+					{ type: "text", text: "see " },
+					{ type: "attachment", mediaType: "application/pdf", data: "QQ==", name: "doc.pdf" },
+				],
+			})) {
+				// consume
+			}
+
+			const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
+			const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
+			const userMsg = messages.find((m) => m.role === "user");
+			const parts = userMsg?.content as Array<Record<string, unknown>>;
+			const filePart = parts.find((p) => p.type === "file");
+			expect(filePart).toMatchObject({
+				type: "file",
+				mediaType: "application/pdf",
+				filename: "doc.pdf",
+			});
+			expect(String(filePart?.data)).toBe("data:application/pdf;base64,QQ==");
+		});
+
+		it("persists the user turn as text only (no content) for history", async () => {
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
+			);
+
+			const agent = new Agent(makeConfig());
+			for await (const _ of agent.run("look: [image]", {
+				content: [
+					{ type: "text", text: "look: " },
+					{ type: "attachment", mediaType: "image/png", data: "QQ==" },
+				],
+			})) {
+				// consume
+			}
+
+			// The in-memory user message keeps the text chunk for the render/persist
+			// path; the ephemeral `content` rides alongside it but isn't a chunk.
+			const userMsg = agent.messages.find((m) => m.role === "user");
+			expect(userMsg?.chunks).toEqual([{ type: "text", text: "look: [image]" }]);
+		});
+
+		it("falls back to a plain string when content has no attachment", async () => {
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
+			);
+
+			const agent = new Agent(makeConfig());
+			for await (const _ of agent.run("plain text", {
+				content: [{ type: "text", text: "plain text" }],
+			})) {
+				// consume
+			}
+
+			const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
+			const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
+			const userMsg = messages.find((m) => m.role === "user");
+			// No attachment → plain string content (byte-identical to text-only path).
+			expect(typeof userMsg?.content).toBe("string");
+			expect(userMsg?.content).toBe("plain text");
+		});
+	});
 });
diff --git a/packages/core/tests/models/attachments.test.ts b/packages/core/tests/models/attachments.test.ts
new file mode 100644
index 0000000..11a9f82
--- /dev/null
+++ b/packages/core/tests/models/attachments.test.ts
@@ -0,0 +1,136 @@
+import { describe, expect, it } from "vitest";
+import {
+	base64ByteLength,
+	isAcceptedAttachmentMediaType,
+	isImageMediaType,
+	isPdfMediaType,
+	MAX_ATTACHMENTS,
+	MAX_IMAGE_BYTES,
+	MAX_PDF_BYTES,
+	MAX_TOTAL_ATTACHMENT_BYTES,
+	validateUserContent,
+} from "../../src/models/attachments.js";
+import type { UserContentPart } from "../../src/types/index.js";
+
+/** Unpadded base64 that decodes to `bytes` rounded UP to a multiple of 3 (so ≥ `bytes`). */
+function base64OfBytes(bytes: number): string {
+	// 4 base64 chars → 3 bytes, so ceil(bytes / 3) whole groups never need padding.
+	const groups = Math.ceil(bytes / 3);
+	return "A".repeat(groups * 4);
+}
+
+function imagePart(data: string, mediaType = "image/png"): UserContentPart {
+	return { type: "attachment", mediaType, data };
+}
+
+describe("media-type predicates", () => {
+	it("classifies image types", () => {
+		expect(isImageMediaType("image/png")).toBe(true);
+		expect(isImageMediaType("image/jpeg")).toBe(true);
+		expect(isImageMediaType("image/webp")).toBe(true);
+		expect(isImageMediaType("image/gif")).toBe(true);
+		expect(isImageMediaType("application/pdf")).toBe(false);
+		expect(isImageMediaType("image/svg+xml")).toBe(false);
+	});
+
+	it("classifies pdf + accepted types", () => {
+		expect(isPdfMediaType("application/pdf")).toBe(true);
+		expect(isPdfMediaType("image/png")).toBe(false);
+		expect(isAcceptedAttachmentMediaType("image/gif")).toBe(true);
+		expect(isAcceptedAttachmentMediaType("application/pdf")).toBe(true);
+		expect(isAcceptedAttachmentMediaType("text/plain")).toBe(false);
+	});
+});
+
+describe("base64ByteLength", () => {
+	it("computes decoded length without padding", () => {
+		// "AAAA" → 3 bytes.
+		expect(base64ByteLength("AAAA")).toBe(3);
+	});
+
+	it("accounts for padding", () => {
+		// "QQ==" → 1 byte ("A").
+		expect(base64ByteLength("QQ==")).toBe(1);
+		// "QUI=" → 2 bytes ("AB").
+		expect(base64ByteLength("QUI=")).toBe(2);
+	});
+
+	it("tolerates a data: URI prefix and whitespace", () => {
+		expect(base64ByteLength("data:image/png;base64,AAAA")).toBe(3);
+		expect(base64ByteLength("AA\nAA")).toBe(3);
+	});
+
+	it("returns 0 for empty input", () => {
+		expect(base64ByteLength("")).toBe(0);
+		expect(base64ByteLength("   ")).toBe(0);
+	});
+});
+
+describe("validateUserContent", () => {
+	it("accepts a small image and ignores text parts", () => {
+		const content: UserContentPart[] = [
+			{ type: "text", text: "hi" },
+			imagePart(base64OfBytes(1024)),
+		];
+		expect(validateUserContent(content)).toEqual({ ok: true, errors: [] });
+	});
+
+	it("accepts an empty / text-only content list", () => {
+		expect(validateUserContent([]).ok).toBe(true);
+		expect(validateUserContent([{ type: "text", text: "no files" }]).ok).toBe(true);
+	});
+
+	it("rejects an unsupported media type", () => {
+		const res = validateUserContent([imagePart(base64OfBytes(10), "image/svg+xml")]);
+		expect(res.ok).toBe(false);
+		expect(res.errors[0]).toMatchObject({ code: "unsupported-type", mediaType: "image/svg+xml" });
+	});
+
+	it("rejects an oversized image but allows a PDF of the same size", () => {
+		const big = base64OfBytes(MAX_IMAGE_BYTES + 3);
+		const imgRes = validateUserContent([imagePart(big, "image/png")]);
+		expect(imgRes.ok).toBe(false);
+		expect(imgRes.errors.some((e) => e.code === "image-too-large")).toBe(true);
+
+		// Same byte size as a PDF is fine (PDF limit is much higher).
+		const pdfRes = validateUserContent([imagePart(big, "application/pdf")]);
+		expect(pdfRes.ok).toBe(true);
+	});
+
+	it("rejects an oversized PDF", () => {
+		const res = validateUserContent([
+			imagePart(base64OfBytes(MAX_PDF_BYTES + 3), "application/pdf"),
+		]);
+		expect(res.ok).toBe(false);
+		expect(res.errors.some((e) => e.code === "pdf-too-large")).toBe(true);
+	});
+
+	it("rejects an empty attachment payload", () => {
+		const res = validateUserContent([imagePart("", "image/png")]);
+		expect(res.ok).toBe(false);
+		expect(res.errors.some((e) => e.code === "empty")).toBe(true);
+	});
+
+	it("rejects too many attachments", () => {
+		const content: UserContentPart[] = Array.from({ length: MAX_ATTACHMENTS + 1 }, () =>
+			imagePart(base64OfBytes(8)),
+		);
+		const res = validateUserContent(content);
+		expect(res.ok).toBe(false);
+		expect(res.errors.some((e) => e.code === "too-many")).toBe(true);
+	});
+
+	it("rejects when the total payload exceeds the request ceiling", () => {
+		// Several individually-legal PDFs that together exceed the total cap.
+		const each = Math.floor(MAX_TOTAL_ATTACHMENT_BYTES / 3);
+		const content: UserContentPart[] = [
+			imagePart(base64OfBytes(each), "application/pdf"),
+			imagePart(base64OfBytes(each), "application/pdf"),
+			imagePart(base64OfBytes(each), "application/pdf"),
+			imagePart(base64OfBytes(each), "application/pdf"),
+		];
+		const res = validateUserContent(content);
+		expect(res.ok).toBe(false);
+		expect(res.errors.some((e) => e.code === "total-too-large")).toBe(true);
+	});
+});
diff --git a/packages/core/tests/models/catalog.test.ts b/packages/core/tests/models/catalog.test.ts
index 51043e6..f4bddc2 100644
--- a/packages/core/tests/models/catalog.test.ts
+++ b/packages/core/tests/models/catalog.test.ts
@@ -4,6 +4,7 @@ import {
 	__resetCatalogCacheForTests,
 	getModelsCatalog,
 	resolveContextLimit,
+	resolveModelCapabilities,
 } from "../../src/models/catalog.js";
 
 const CACHE_PATH = "/tmp/dispatch/models-dev.json";
@@ -13,14 +14,30 @@ const CATALOG = {
 	anthropic: {
 		id: "anthropic",
 		models: {
-			"claude-sonnet-4-5": { limit: { context: 200000, output: 64000 } },
-			"claude-sonnet-4-6": { limit: { context: 1000000, output: 64000 } },
+			"claude-sonnet-4-5": {
+				limit: { context: 200000, output: 64000 },
+				modalities: { input: ["text", "image", "pdf"], output: ["text"] },
+			},
+			"claude-sonnet-4-6": {
+				limit: { context: 1000000, output: 64000 },
+				modalities: { input: ["text", "image", "pdf"], output: ["text"] },
+			},
+			// A text-only model: definitively no image/pdf input.
+			"text-only-model": {
+				limit: { context: 100000, output: 8192 },
+				modalities: { input: ["text"], output: ["text"] },
+			},
+			// An entry predating the modalities field → capability unknown.
+			"legacy-model": { limit: { context: 100000, output: 8192 } },
 		},
 	},
 	opencode: {
 		id: "opencode",
 		models: {
-			"glm-4-6": { limit: { context: 131072, output: 8192 } },
+			"glm-4-6": {
+				limit: { context: 131072, output: 8192 },
+				modalities: { input: ["text", "image"], output: ["text"] },
+			},
 		},
 	},
 };
@@ -156,3 +173,55 @@ describe("getModelsCatalog caching", () => {
 		warn.mockRestore();
 	});
 });
+
+describe("resolveModelCapabilities", () => {
+	it("reports image + pdf for a vision model", async () => {
+		mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("anthropic", "claude-sonnet-4-5")).toEqual({
+			image: true,
+			pdf: true,
+		});
+	});
+
+	it("reports image-only for a model whose modalities omit pdf", async () => {
+		mockFetchOnce(CATALOG);
+		// glm-4-6 lists image but not pdf (resolved via the opencode fallback).
+		expect(await resolveModelCapabilities("opencode-anthropic", "glm-4-6")).toEqual({
+			image: true,
+			pdf: false,
+		});
+	});
+
+	it("reports a definitive no for a text-only model", async () => {
+		mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("anthropic", "text-only-model")).toEqual({
+			image: false,
+			pdf: false,
+		});
+	});
+
+	it("returns null (unknown) for an entry without modalities", async () => {
+		mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("anthropic", "legacy-model")).toBeNull();
+	});
+
+	it("returns null (unknown) for an unknown model id", async () => {
+		mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("anthropic", "no-such-model")).toBeNull();
+	});
+
+	it("returns null for an unsupported provider without hitting the network", async () => {
+		const fetchFn = mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("google", "gemini-2.5-pro")).toBeNull();
+		expect(await resolveModelCapabilities("anthropic", "")).toBeNull();
+		expect(fetchFn).not.toHaveBeenCalled();
+	});
+
+	it("returns null (unknown) when the catalog is offline with no cache", async () => {
+		const fetchFn = vi.fn(() => Promise.reject(new Error("offline")));
+		vi.stubGlobal("fetch", fetchFn);
+		const warn = vi.spyOn(console, "warn").mockImplementation(() => {});
+		expect(await resolveModelCapabilities("anthropic", "claude-sonnet-4-5")).toBeNull();
+		warn.mockRestore();
+	});
+});
diff --git a/packages/frontend/src/App.svelte b/packages/frontend/src/App.svelte
index a0b25b7..ae0718e 100644
--- a/packages/frontend/src/App.svelte
+++ b/packages/frontend/src/App.svelte
@@ -131,6 +131,59 @@ $effect(() => {
 	})();
 });
 
+// ─── Image / PDF capability lookup ─────────────────────────────
+// Resolve whether the active model accepts image/pdf INPUT from models.dev (via
+// the API), so the chat input can block sending an unsupported attachment
+// (no tokens spent) while staying permissive when the capability is unknown.
+// `null` = unknown (catalog offline / unsupported provider) → optimistic allow.
+let imageSupport = $state<{ image: boolean; pdf: boolean } | null>(null);
+const capabilityCache = new Map<string, { image: boolean; pdf: boolean } | null>();
+
+$effect(() => {
+	const tab = tabStore.activeTab;
+	const keyId = tab?.keyId ?? null;
+	const modelId = tab?.modelId ?? null;
+	const provider = keyId ? (modelsData.keys.find((k) => k.id === keyId)?.provider ?? null) : null;
+
+	if (!provider || !modelId) {
+		imageSupport = null;
+		return;
+	}
+
+	const cacheKey = `${provider}/${modelId}`;
+	if (capabilityCache.has(cacheKey)) {
+		imageSupport = capabilityCache.get(cacheKey) ?? null;
+		return;
+	}
+
+	// Clear immediately so a slow/failed fetch can't leave the PREVIOUS model's
+	// capability on screen (which could wrongly block/allow this model).
+	imageSupport = null;
+
+	void (async () => {
+		try {
+			const res = await fetch(
+				`${config.apiBase}/models/capabilities?provider=${encodeURIComponent(provider)}&modelId=${encodeURIComponent(modelId)}`,
+			);
+			if (!res.ok) return;
+			const data = (await res.json()) as {
+				capabilities?: { image: boolean; pdf: boolean } | null;
+			};
+			const caps = data.capabilities ?? null;
+			capabilityCache.set(cacheKey, caps);
+			const current = tabStore.activeTab;
+			const currentProvider = current?.keyId
+				? (modelsData.keys.find((k) => k.id === current.keyId)?.provider ?? null)
+				: null;
+			if (currentProvider === provider && current?.modelId === modelId) {
+				imageSupport = caps;
+			}
+		} catch {
+			// Leave imageSupport as null (unknown → permissive) on network error.
+		}
+	})();
+});
+
 onMount(() => {
 	// Apply persisted theme (or the shared DEFAULT_THEME if nothing is
 	// stored) so the first paint matches what the Settings panel will
@@ -174,7 +227,7 @@ onMount(() => {
 			<div class="flex-1 overflow-hidden">
 				<ChatPanel />
 			</div>
-			<ChatInput {contextLimit} />
+			<ChatInput {contextLimit} {imageSupport} />
 		</div>
 
 		<!-- Right sidebar: overlay on small screens, inline on large -->
diff --git a/packages/frontend/src/lib/attachment-tokens.ts b/packages/frontend/src/lib/attachment-tokens.ts
new file mode 100644
index 0000000..79d4cbc
--- /dev/null
+++ b/packages/frontend/src/lib/attachment-tokens.ts
@@ -0,0 +1,234 @@
+// Inline attachment tokens for the chat input.
+//
+// A pasted image/PDF is represented in the textarea draft as an inline TOKEN
+// (e.g. `【image:a1b2c3】`). The token is ordinary text living inside the draft,
+// so attachments have ORDER relative to typed text and to each other, and the
+// user can reference them positionally ("here is image A: 【image:…】"). The
+// token is also the ONLY handle on an attachment — deleting it (atomic delete,
+// below) detaches the underlying file. There is no separate preview strip.
+//
+// This module is pure (no DOM, no Svelte) so it can be unit-tested directly.
+
+import type { UserContentPart } from "@dispatch/core/src/types/index.js";
+
+export type AttachmentKind = "image" | "pdf";
+
+/** A staged attachment, keyed by its short token id. */
+export interface StagedAttachment {
+	id: string;
+	kind: AttachmentKind;
+	/** IANA media type, e.g. `image/png`, `application/pdf`. */
+	mediaType: string;
+	/** Base64 payload WITHOUT a `data:` URI prefix. */
+	data: string;
+	/** Optional original filename (used for PDFs). */
+	name?: string;
+}
+
+/**
+ * Token grammar: `【<kind>:<id>】` where kind ∈ {image,pdf} and id is 6
+ * lowercase alphanumerics. The CJK corner brackets (U+3010/U+3011) are used as
+ * delimiters because they're visually distinct and virtually never typed by
+ * hand, so a token won't collide with normal prose.
+ */
+export const ATTACHMENT_TOKEN_RE = /【(image|pdf):([a-z0-9]{6})】/g;
+
+/** Build the inline token string for a staged attachment id + kind. */
+export function makeAttachmentToken(kind: AttachmentKind, id: string): string {
+	return `【${kind}:${id}】`;
+}
+
+/** Generate a short, URL-safe token id (6 lowercase alphanumerics). */
+export function generateTokenId(): string {
+	let out = "";
+	const alphabet = "abcdefghijklmnopqrstuvwxyz0123456789";
+	// Prefer crypto.getRandomValues when present; otherwise fall back to Math.random.
+	const cryptoObj = (globalThis as { crypto?: Crypto }).crypto;
+	if (cryptoObj?.getRandomValues) {
+		const buf = new Uint32Array(6);
+		cryptoObj.getRandomValues(buf);
+		for (let i = 0; i < 6; i++) out += alphabet[(buf[i] ?? 0) % alphabet.length];
+		return out;
+	}
+	for (let i = 0; i < 6; i++) out += alphabet[Math.floor(Math.random() * alphabet.length)];
+	return out;
+}
+
+export interface FoundToken {
+	id: string;
+	kind: AttachmentKind;
+	/** Inclusive start index of the token within the text. */
+	start: number;
+	/** Exclusive end index of the token within the text. */
+	end: number;
+}
+
+/** Find all attachment tokens in `text`, in order of appearance. */
+export function findTokens(text: string): FoundToken[] {
+	const out: FoundToken[] = [];
+	// Fresh regex per call so `lastIndex` state never leaks between calls.
+	const re = new RegExp(ATTACHMENT_TOKEN_RE.source, "g");
+	let m: RegExpExecArray | null = re.exec(text);
+	while (m !== null) {
+		out.push({
+			kind: m[1] as AttachmentKind,
+			id: m[2] ?? "",
+			start: m.index,
+			end: m.index + m[0].length,
+		});
+		m = re.exec(text);
+	}
+	return out;
+}
+
+/** The set of attachment ids whose token is still intact in `text`. */
+export function intactTokenIds(text: string): Set<string> {
+	return new Set(findTokens(text).map((t) => t.id));
+}
+
+export interface DeletionResult {
+	/** Text after the deletion. */
+	text: string;
+	/** New caret position (collapsed) after the deletion. */
+	caret: number;
+	/** Ids of attachments whose tokens were removed by this deletion. */
+	removedIds: string[];
+}
+
+/**
+ * Compute the result of a Backspace/Delete keystroke when it interacts with an
+ * attachment token, so the token deletes ATOMICALLY (one keystroke removes the
+ * whole `【…】`). Returns `null` when none of the rules below match — the caller
+ * should then let the browser's default editing behaviour run. NOTE: a
+ * collapsed caret strictly INSIDE a token matches no rule and also yields null.
+ *
+ * Rules:
+ *  - Range selection (`selStart !== selEnd`): expand the range to fully cover
+ *    any token it overlaps, then delete the expanded range. Only acts when at
+ *    least one token actually overlaps (otherwise returns null); `key` is unused.
+ *  - Collapsed + Backspace: if a token ends exactly at the caret, delete it.
+ *  - Collapsed + Delete: if a token starts exactly at the caret, delete it.
+ */
+export function computeTokenDeletion(
+	text: string,
+	selStart: number,
+	selEnd: number,
+	key: "Backspace" | "Delete",
+): DeletionResult | null {
+	const tokens = findTokens(text);
+	if (tokens.length === 0) return null;
+
+	if (selStart !== selEnd) {
+		const lo = Math.min(selStart, selEnd);
+		const hi = Math.max(selStart, selEnd);
+		const overlapping = tokens.filter((t) => t.start < hi && t.end > lo);
+		if (overlapping.length === 0) return null;
+		const delStart = Math.min(lo, ...overlapping.map((t) => t.start));
+		const delEnd = Math.max(hi, ...overlapping.map((t) => t.end));
+		return {
+			text: text.slice(0, delStart) + text.slice(delEnd),
+			caret: delStart,
+			removedIds: overlapping.map((t) => t.id),
+		};
+	}
+
+	// Collapsed caret.
+	if (key === "Backspace") {
+		const tok = tokens.find((t) => t.end === selStart);
+		if (!tok) return null;
+		return {
+			text: text.slice(0, tok.start) + text.slice(tok.end),
+			caret: tok.start,
+			removedIds: [tok.id],
+		};
+	}
+	// Delete (forward).
+	const tok = tokens.find((t) => t.start === selStart);
+	if (!tok) return null;
+	return {
+		text: text.slice(0, tok.start) + text.slice(tok.end),
+		caret: tok.start,
+		removedIds: [tok.id],
+	};
+}
+
+/** Human-readable marker that replaces a token in persisted/display text. */
+export function markerFor(kind: AttachmentKind): string {
+	return kind === "pdf" ? "[pdf]" : "[image]";
+}
+
+export interface ParsedDraft {
+	/**
+	 * Text-only projection of the draft with each attachment token replaced by a
+	 * `[image]` / `[pdf]` marker. This is what gets persisted and rendered in the
+	 * chat history (the raw bytes are never stored).
+	 */
+	displayText: string;
+	/**
+	 * Ordered multimodal content (interleaved text + attachment parts) to send to
+	 * the model, or `null` when the draft has no intact attachment token (the
+	 * caller then sends plain text).
+	 */
+	content: UserContentPart[] | null;
+}
+
+/**
+ * Split a draft (text containing attachment tokens) plus the staged-attachment
+ * map into:
+ *  - `displayText`: tokens swapped for `[image]`/`[pdf]` markers, and
+ *  - `content`: an ordered `UserContentPart[]` interleaving the surrounding text
+ *    with the matching attachment parts.
+ *
+ * A token whose id has no matching staged attachment (e.g. a stray paste of the
+ * token text, or a detached attachment) is treated as plain text in BOTH
+ * outputs — its marker still appears in `displayText`, but it contributes no
+ * attachment part. `content` is `null` when no attachment part is produced.
+ */
+export function parseDraft(draft: string, attachments: Map<string, StagedAttachment>): ParsedDraft {
+	const tokens = findTokens(draft);
+	let displayText = "";
+	const content: UserContentPart[] = [];
+	let textBuf = "";
+	let cursor = 0;
+	let producedAttachment = false;
+
+	const flushText = () => {
+		if (textBuf.length > 0) {
+			content.push({ type: "text", text: textBuf });
+			textBuf = "";
+		}
+	};
+
+	for (const tok of tokens) {
+		const between = draft.slice(cursor, tok.start);
+		textBuf += between;
+		displayText += between;
+		const att = attachments.get(tok.id);
+		if (att) {
+			// displayText (persisted/rendered) gets a `[image]`/`[pdf]` marker;
+			// the multimodal content gets the ACTUAL attachment part instead — no
+			// marker text, since the part itself represents the file to the model.
+			displayText += markerFor(tok.kind);
+			flushText();
+			content.push({
+				type: "attachment",
+				mediaType: att.mediaType,
+				data: att.data,
+				...(att.name ? { name: att.name } : {}),
+			});
+			producedAttachment = true;
+		} else {
+			// Orphan token (no staged attachment) → keep the marker as plain text
+			// in BOTH outputs; it contributes no attachment part.
+			displayText += markerFor(tok.kind);
+			textBuf += markerFor(tok.kind);
+		}
+		cursor = tok.end;
+	}
+	const tail = draft.slice(cursor);
+	textBuf += tail;
+	displayText += tail;
+	flushText();
+
+	return { displayText, content: producedAttachment ? content : null };
+}
diff --git a/packages/frontend/src/lib/components/ChatInput.svelte b/packages/frontend/src/lib/components/ChatInput.svelte
index 079ef4a..4067b78 100644
--- a/packages/frontend/src/lib/components/ChatInput.svelte
+++ b/packages/frontend/src/lib/components/ChatInput.svelte
@@ -1,12 +1,40 @@
 <script lang="ts">
+import {
+	ACCEPTED_PDF_MEDIA_TYPE,
+	isImageMediaType,
+	isPdfMediaType,
+	MAX_ATTACHMENTS,
+	MAX_IMAGE_BYTES,
+	MAX_PDF_BYTES,
+} from "@dispatch/core/src/models/attachments.js";
+import {
+	type AttachmentKind,
+	computeTokenDeletion,
+	generateTokenId,
+	makeAttachmentToken,
+	parseDraft,
+	type StagedAttachment,
+} from "../attachment-tokens.js";
 import { computeContextUsage } from "../context-window.js";
 import { tabStore } from "../tabs.svelte.js";
 
-const { contextLimit = null }: { contextLimit?: number | null } = $props();
+const {
+	contextLimit = null,
+	imageSupport = null,
+}: {
+	contextLimit?: number | null;
+	// Image/PDF INPUT capability for the active model, or `null` when unknown
+	// (catalog offline / unsupported provider) — null means "can't verify"
+	// (optimistic allow), not a hard no.
+	imageSupport?: { image: boolean; pdf: boolean } | null;
+} = $props();
 
 const MAX_LINES = 7;
 
 let inputEl: HTMLTextAreaElement | undefined;
+// Transient error shown when a paste is rejected (bad type / too large / too
+// many). Cleared at the start of the next file paste and on every input event.
+let pasteError = $state<string | null>(null);
 
 const agentStatus = $derived(tabStore.activeTab?.agentStatus ?? "idle");
 const tabId = $derived(tabStore.activeTab?.id ?? "");
@@ -14,13 +42,47 @@ const tabId = $derived(tabStore.activeTab?.id ?? "");
 // switching tabs saves the current draft and restores the target tab's text
 // automatically — drafts are never lost or clobbered by tab switching.
 const inputValue = $derived(tabStore.activeTab?.draft ?? "");
+const attachments = $derived(tabStore.activeTab?.attachments ?? []);
 const cacheStats = $derived(tabStore.activeTab?.cacheStats ?? null);
 
 const isRunning = $derived(agentStatus === "running");
 const hasText = $derived(inputValue.trim().length > 0);
+const hasAttachments = $derived(attachments.length > 0);
 // While generating with an empty box, the primary action is "stop". With text
 // in the box, it stays "send" (the message is queued behind the live turn).
-const showStop = $derived(isRunning && !hasText);
+const showStop = $derived(isRunning && !hasText && !hasAttachments);
+
+// ─── Attachment capability gating ──────────────────────────────
+// A definitive "no" from the catalog (imageSupport.image === false with an
+// image staged, or .pdf === false with a pdf staged) blocks the send so no
+// tokens are spent. Unknown capability (imageSupport === null) is permissive.
+const hasImageAttachment = $derived(attachments.some((a) => a.kind === "image"));
+const hasPdfAttachment = $derived(attachments.some((a) => a.kind === "pdf"));
+const imageBlocked = $derived(
+	hasImageAttachment && imageSupport !== null && imageSupport.image === false,
+);
+const pdfBlocked = $derived(
+	hasPdfAttachment && imageSupport !== null && imageSupport.pdf === false,
+);
+// Attachments require a fresh turn — they can't ride the queue path (which is
+// text-only), so block sending an attachment while the agent is generating.
+const attachmentsWhileRunning = $derived(hasAttachments && isRunning);
+
+const attachmentWarning = $derived.by(() => {
+	if (pasteError) return pasteError; // a rejected paste takes precedence
+	if (attachmentsWhileRunning) // applies to images AND PDFs, so don't say "images"
+		return "Wait for the current response to finish before sending attachments.";
+	if (imageBlocked && pdfBlocked)
+		return "The selected model doesn't support image or PDF input. Remove the attachments to send.";
+	if (imageBlocked)
+		return "The selected model doesn't support image input. Remove the image to send.";
+	if (pdfBlocked) return "The selected model doesn't support PDF input. Remove the PDF to send.";
+	return null;
+});
+
+// Send is blocked (but not the box) when an attachment is definitively
+// unsupported or when attachments are staged mid-generation.
+const sendBlocked = $derived(imageBlocked || pdfBlocked || attachmentsWhileRunning);
 
 const usage = $derived(computeContextUsage(cacheStats, contextLimit));
 const hasUsage = $derived((cacheStats?.last ?? null) !== null);
@@ -77,21 +139,153 @@ $effect(() => {
 
 function handleInput(e: Event) {
 	if (!tabId) return;
+	pasteError = null;
+	// setDraft also reconciles staged attachments against the surviving tokens,
+	// so deleting a token (by any means) detaches its attachment.
 	tabStore.setDraft(tabId, (e.currentTarget as HTMLTextAreaElement).value);
 }
 
+function kindForMediaType(mediaType: string): AttachmentKind | null {
+	if (isImageMediaType(mediaType)) return "image";
+	if (isPdfMediaType(mediaType)) return "pdf";
+	return null;
+}
+
+function readAsBase64(file: File): Promise<string> {
+	return new Promise((resolve, reject) => {
+		const reader = new FileReader();
+		reader.onload = () => {
+			const result = reader.result;
+			if (typeof result !== "string") {
+				reject(new Error("unexpected reader result"));
+				return;
+			}
+			// Strip the `data:<mediaType>;base64,` prefix → bare base64.
+			const comma = result.indexOf(",");
+			resolve(comma === -1 ? result : result.slice(comma + 1));
+		};
+		reader.onerror = () => reject(reader.error ?? new Error("read failed"));
+		reader.readAsDataURL(file);
+	});
+}
+
+/** Insert `insert` at the textarea's caret, returning the new caret offset. */
+function insertAtCaret(insert: string): number {
+	const el = inputEl;
+	const text = inputValue;
+	const start = el?.selectionStart ?? text.length;
+	const end = el?.selectionEnd ?? text.length;
+	const next = text.slice(0, start) + insert + text.slice(end);
+	if (tabId) tabStore.setDraft(tabId, next);
+	return start + insert.length;
+}
+
+async function handlePaste(e: ClipboardEvent) {
+	if (!tabId) return;
+	const items = e.clipboardData?.items;
+	if (!items) return;
+	const files: File[] = [];
+	for (const item of items) {
+		if (item.kind === "file") {
+			const file = item.getAsFile();
+			if (file) files.push(file);
+		}
+	}
+	// No files in the clipboard → let the default text paste happen.
+	if (files.length === 0) return;
+	// Files present: suppress the browser's own paste (filename / image fallback).
+	// NOTE(review): no combined-size check below — confirm the server enforces it.
+	e.preventDefault();
+	pasteError = null;
+
+	for (const file of files) {
+		const kind = kindForMediaType(file.type);
+		if (!kind) {
+			pasteError = `Unsupported file type: ${file.type || "unknown"}. Allowed: PNG, JPEG, WebP, GIF, PDF.`;
+			continue;
+		}
+		const current = tabStore.activeTab?.attachments ?? [];
+		if (current.length >= MAX_ATTACHMENTS) {
+			pasteError = `You can attach at most ${MAX_ATTACHMENTS} files per message.`;
+			break;
+		}
+		const limit = kind === "pdf" ? MAX_PDF_BYTES : MAX_IMAGE_BYTES;
+		if (file.size > limit) {
+			const mb = Math.round(limit / (1024 * 1024));
+			pasteError = `${kind === "pdf" ? "PDF" : "Image"} is too large (max ${mb} MB).`;
+			continue;
+		}
+		try {
+			const data = await readAsBase64(file);
+			const id = generateTokenId();
+			const mediaType = kind === "pdf" ? ACCEPTED_PDF_MEDIA_TYPE : file.type;
+			const staged: StagedAttachment = {
+				id,
+				kind,
+				mediaType,
+				data,
+				...(file.name ? { name: file.name } : {}),
+			};
+			// Stage first, then insert the token (`setDraft` reconciles against staged
+			// attachments). NOTE(review): `tabId` is re-read after the await above, so
+			// a tab switch mid-read would stage into the newly active tab — confirm.
+			tabStore.addAttachment(tabId, staged);
+			const caret = insertAtCaret(makeAttachmentToken(kind, id));
+			// Restore the caret after the value updates.
+			requestAnimationFrame(() => {
+				const el = inputEl;
+				if (el) {
+					el.focus();
+					el.setSelectionRange(caret, caret);
+				}
+			});
+		} catch {
+			pasteError = "Failed to read the pasted file.";
+		}
+	}
+}
+
 function handleKeydown(e: KeyboardEvent) {
 	if (e.key === "Enter" && !e.shiftKey) {
 		e.preventDefault();
 		submit();
+		return;
+	}
+	if ((e.key === "Backspace" || e.key === "Delete") && inputEl && tabId) {
+		// Atomic token delete: a single Backspace/Delete next to (or a selection
+		// overlapping) a `【…】` token removes the whole token in one stroke.
+		const result = computeTokenDeletion(
+			inputValue,
+			inputEl.selectionStart ?? 0,
+			inputEl.selectionEnd ?? 0,
+			e.key,
+		);
+		if (result) {
+			e.preventDefault();
+			tabStore.setDraft(tabId, result.text);
+			requestAnimationFrame(() => {
+				const el = inputEl;
+				if (el) {
+					el.focus();
+					el.setSelectionRange(result.caret, result.caret);
+				}
+			});
+		}
 	}
 }
 
 function submit() {
-	const text = inputValue.trim();
-	if (!text) return;
-	if (tabId) tabStore.setDraft(tabId, "");
-	tabStore.sendMessage(text);
+	if (!tabId) return;
+	const map = new Map(attachments.map((a) => [a.id, a] as const));
+	const { displayText, content } = parseDraft(inputValue, map);
+	const trimmed = displayText.trim();
+	// Nothing to send (no text and no usable attachment).
+	if (!trimmed && !content) return;
+	// Don't send when a staged attachment is unsupported / mid-generation.
+	if (sendBlocked) return;
+	const text = trimmed || displayText;
+	tabStore.setDraft(tabId, "");
+	void tabStore.sendMessage(text, content ?? undefined);
 }
 
 function primaryAction() {
@@ -104,25 +298,36 @@ function primaryAction() {
 </script>
 
 <div class="flex flex-col">
+	{#if attachmentWarning}
+		<div class="px-3 pt-2 text-xs text-warning flex items-start gap-1">
+			<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="w-3.5 h-3.5 mt-0.5 shrink-0" aria-hidden="true">
+				<path d="M10.29 3.86 1.82 18a2 2 0 0 0 1.71 3h16.94a2 2 0 0 0 1.71-3L13.71 3.86a2 2 0 0 0-3.42 0z"></path>
+				<line x1="12" y1="9" x2="12" y2="13"></line>
+				<line x1="12" y1="17" x2="12.01" y2="17"></line>
+			</svg>
+			<span>{attachmentWarning}</span>
+		</div>
+	{/if}
 	<!-- Top bar: expanding textarea + send/stop action -->
 	<div class="flex items-end gap-2 px-3 pt-3 pb-2">
 		<textarea
 			bind:this={inputEl}
 			value={inputValue}
 			rows="1"
-			placeholder="Type a message..."
+			placeholder="Type a message... (paste an image or PDF to attach)"
 			class="textarea textarea-ghost flex-1 resize-none leading-normal !min-h-0 h-auto"
 			onkeydown={handleKeydown}
 			oninput={handleInput}
+			onpaste={handlePaste}
 		></textarea>
 		<!-- Single fixed-width button across all states so the layout never
 		     shifts when it morphs between Send and Stop. -->
 		<button
 			type="button"
 			class="btn w-20 shrink-0 {showStop ? 'btn-error btn-outline' : 'btn-primary'}"
-			disabled={!showStop && !hasText}
+			disabled={!showStop && !hasText && !hasAttachments || sendBlocked}
 			onclick={primaryAction}
-			title={showStop ? "Stop generation" : "Send message"}
+			title={showStop ? "Stop generation" : sendBlocked ? (attachmentWarning ?? "Cannot send") : "Send message"}
 		>
 			{#if showStop}
 				<span class="loading loading-spinner loading-sm"></span>
diff --git a/packages/frontend/src/lib/tabs.svelte.ts b/packages/frontend/src/lib/tabs.svelte.ts
index 9975d7b..e33a0e9 100644
--- a/packages/frontend/src/lib/tabs.svelte.ts
+++ b/packages/frontend/src/lib/tabs.svelte.ts
@@ -11,13 +11,14 @@ import {
 // DB-free; safe in the browser bundle. The flat chunk log is the frontend's
 // source of truth for HISTORY; `groupRowsToMessages` derives render bubbles.
 import { groupRowsToMessages, type MessageRow } from "@dispatch/core/src/chunks/transform.js";
-import type { ChunkRow } from "@dispatch/core/src/types/index.js";
+import type { ChunkRow, UserContentPart } from "@dispatch/core/src/types/index.js";
 import {
 	type AgentModelEntry,
 	DEFAULT_REASONING_EFFORT,
 	isReasoningEffort,
 	type ReasoningEffort,
 } from "@dispatch/core/src/types/index.js";
+import { intactTokenIds, type StagedAttachment } from "./attachment-tokens.js";
 import { config } from "./config.js";
 import { appSettings } from "./settings.svelte.js";
 import type {
@@ -182,6 +183,13 @@ export interface Tab {
 	 * switching tabs. Cleared on send.
 	 */
 	draft: string;
+	/**
+	 * Staged image/PDF attachments for THIS tab's unsent draft (in-memory only —
+	 * never persisted). Each corresponds to an inline `【image:…】`/`【pdf:…】`
+	 * token in `draft`; removing the token detaches the attachment (reconciled on
+	 * every keystroke). Ephemeral: sent to the model for one turn, then cleared.
+	 */
+	attachments: StagedAttachment[];
 	/**
 	 * True once the user has manually renamed this tab (double-click rename).
 	 * Suppresses the first-message auto-title so a chosen name is never
@@ -312,6 +320,7 @@ export function createTabStore() {
 			queuedMessages: [],
 			chunkLimit: appSettings.chunkLimit,
 			draft: "",
+			attachments: [],
 			manualTitle: false,
 			oldestLoadedSeq: null,
 			totalChunks: 0,
@@ -389,6 +398,7 @@ export function createTabStore() {
 				queuedMessages: [],
 				chunkLimit: appSettings.chunkLimit,
 				draft: "",
+				attachments: [],
 				manualTitle: false,
 				oldestLoadedSeq: win.oldestSeq,
 				totalChunks: win.total,
@@ -493,8 +503,31 @@ export function createTabStore() {
 	 * target tab shows its own text. No-op if the tab is gone.
 	 */
 	function setDraft(id: string, text: string): void {
-		if (!getTabById(id)) return;
-		updateTab(id, { draft: text });
+		const target = getTabById(id);
+		if (!target) return;
+		// The inline token is the sole handle on a staged attachment, so every
+		// draft write re-derives which ones are still referenced (covers atomic
+		// delete, edits inside a token, cut, select-all-delete, ...).
+		const liveIds = intactTokenIds(text);
+		const surviving = target.attachments.filter((att) => liveIds.has(att.id));
+		// Only write `attachments` when at least one was actually detached.
+		const detached = surviving.length !== target.attachments.length;
+		updateTab(
+			id,
+			detached ? { draft: text, attachments: surviving } : { draft: text },
+		);
+	}
+
+	/**
+	 * Append `attachment` to tab `id`'s staged list (no-op for an unknown tab).
+	 * The CALLER must also insert the matching `【image:…】`/`【pdf:…】` token
+	 * into the draft: `setDraft` drops attachments whose token is not intact.
+	 */
+	function addAttachment(id: string, attachment: StagedAttachment): void {
+		const owner = getTabById(id);
+		if (!owner) return;
+		const staged = [...owner.attachments, attachment];
+		updateTab(id, { attachments: staged });
 	}
 
 	/**
@@ -929,6 +962,7 @@ export function createTabStore() {
 				queuedMessages: [],
 				chunkLimit: appSettings.chunkLimit,
 				draft: "",
+				attachments: [],
 				manualTitle: false,
 				oldestLoadedSeq: win.oldestSeq,
 				totalChunks: win.total,
@@ -1284,6 +1318,7 @@ export function createTabStore() {
 						queuedMessages: [],
 						chunkLimit: appSettings.chunkLimit,
 						draft: "",
+						attachments: [],
 						manualTitle: false,
 						oldestLoadedSeq: null,
 						totalChunks: 0,
@@ -1604,7 +1639,7 @@ export function createTabStore() {
 		}
 	}
 
-	async function sendMessage(text: string): Promise<void> {
+	async function sendMessage(text: string, content?: UserContentPart[]): Promise<void> {
 		let tab = getActiveTab();
 		if (!tab) return;
 
@@ -1615,8 +1650,11 @@ export function createTabStore() {
 			if (!tab) return;
 		}
 
-		// Fetch content for checked skills and build the message to send
-		let messageToSend = text;
+		// Fetch content for checked skills and build the message to send.
+		// `skillPrefix` (when non-empty) is prepended to BOTH the text projection
+		// that gets persisted/rendered AND the multimodal content array, so an
+		// image turn still carries the activated skills to the model.
+		let skillPrefix = "";
 		const checkedKeys = Object.entries(appSettings.skillChecks)
 			.filter(([, v]) => v)
 			.map(([k]) => k);
@@ -1627,13 +1665,13 @@ export function createTabStore() {
 				const [scope, ...nameParts] = key.split(":");
 				const name = nameParts.join(":");
 				if (!scope || !name) continue;
-				const content = await fetchSkillContent(scope, name);
-				if (content) {
-					skillSections.push(`<skill name="${name}">\n${content}\n</skill>`);
+				const skillContent = await fetchSkillContent(scope, name);
+				if (skillContent) {
+					skillSections.push(`<skill name="${name}">\n${skillContent}\n</skill>`);
 				}
 			}
 			if (skillSections.length > 0) {
-				messageToSend = `[The following skills have been activated for this message]\n\n${skillSections.join("\n\n")}\n\n---\n\n${text}`;
+				skillPrefix = `[The following skills have been activated for this message]\n\n${skillSections.join("\n\n")}\n\n---\n\n`;
 			}
 
 			// Track injected skills on the tab
@@ -1644,6 +1682,12 @@ export function createTabStore() {
 			appSettings.skillChecks = {};
 		}
 
+		const messageToSend = `${skillPrefix}${text}`;
+		// Prepend the skill prefix to the multimodal content as a leading text
+		// part so the model sees the activated skills before the attachments.
+		const contentToSend =
+			content && skillPrefix ? [{ type: "text" as const, text: skillPrefix }, ...content] : content;
+
 		const userMsg: ChatMessage = {
 			id: generateId(),
 			role: "user",
@@ -1720,6 +1764,7 @@ export function createTabStore() {
 				body: JSON.stringify({
 					tabId: tab.id,
 					message: messageToSend,
+					...(contentToSend ? { content: contentToSend } : {}),
 					...(tab.keyId ? { keyId: tab.keyId } : {}),
 					...(tab.modelId ? { modelId: tab.modelId } : {}),
 					...(tab.agentModels ? { agentModels: tab.agentModels } : {}),
@@ -2118,6 +2163,7 @@ export function createTabStore() {
 		renameTab,
 		reorderTabs,
 		setDraft,
+		addAttachment,
 		sendMessage,
 		cancelQueuedMessage,
 		stopGeneration,
diff --git a/packages/frontend/tests/attachment-tokens.test.ts b/packages/frontend/tests/attachment-tokens.test.ts
new file mode 100644
index 0000000..7208cf3
--- /dev/null
+++ b/packages/frontend/tests/attachment-tokens.test.ts
@@ -0,0 +1,130 @@
+import { describe, expect, it } from "vitest";
+import {
+	computeTokenDeletion,
+	findTokens,
+	generateTokenId,
+	intactTokenIds,
+	makeAttachmentToken,
+	markerFor,
+	parseDraft,
+	type StagedAttachment,
+} from "../src/lib/attachment-tokens.js";
+
+const img = (id: string): StagedAttachment => {
+	return { id, kind: "image", mediaType: "image/png", data: "QQ==" };
+};
+const pdf = (id: string): StagedAttachment => {
+	return { id, kind: "pdf", mediaType: "application/pdf", data: "QQ==", name: "doc.pdf" };
+};
+
+describe("token helpers", () => {
+	it("round-trips make/find", () => {
+		const token = makeAttachmentToken("image", "abc123");
+		expect(token).toBe("【image:abc123】");
+		const matches = findTokens(`x ${token} y`);
+		expect(matches).toHaveLength(1);
+		// The two leading characters ("x ") put the token at offset 2.
+		const span = { start: 2, end: 2 + token.length };
+		expect(matches[0]).toMatchObject({ id: "abc123", kind: "image", ...span });
+	});
+
+	it("generates 6-char lowercase-alnum ids", () => {
+		const samples = Array.from({ length: 20 }, () => generateTokenId());
+		expect(samples.filter((id) => !/^[a-z0-9]{6}$/.test(id))).toEqual([]);
+	});
+
+	it("finds multiple tokens in order and reports intact ids", () => {
+		const ids = ["aaaaaa", "bbbbbb"];
+		const text = `a ${makeAttachmentToken("image", "aaaaaa")} b ${makeAttachmentToken("pdf", "bbbbbb")}`;
+		expect(findTokens(text).map(({ id }) => id)).toEqual(ids);
+		expect(intactTokenIds(text)).toEqual(new Set(ids));
+	});
+
+	it("does not treat a partially-broken token as intact", () => {
+		// No closing 】, so the text never forms a complete token.
+		expect(intactTokenIds("【image:aaaaaa").size).toBe(0);
+	});
+});
+
+describe("computeTokenDeletion", () => {
+	const token = makeAttachmentToken("image", "abcabc");
+	const from = 3; // "hi " precedes the token, so it starts at index 3
+	const to = from + token.length;
+	const draft = `hi ${token}!`;
+
+	it("returns null when no tokens exist", () => {
+		expect(computeTokenDeletion("plain", 2, 2, "Backspace")).toBeNull();
+	});
+
+	it("Backspace just after a token removes the whole token atomically", () => {
+		const outcome = computeTokenDeletion(draft, to, to, "Backspace");
+		expect(outcome).not.toBeNull();
+		expect(outcome?.text).toBe("hi !");
+		expect(outcome?.caret).toBe(from);
+		expect(outcome?.removedIds).toEqual(["abcabc"]);
+	});
+
+	it("Delete just before a token removes the whole token atomically", () => {
+		const outcome = computeTokenDeletion(draft, from, from, "Delete");
+		expect(outcome?.text).toBe("hi !");
+		expect(outcome?.caret).toBe(from);
+		expect(outcome?.removedIds).toEqual(["abcabc"]);
+	});
+
+	it("Backspace NOT adjacent to a token returns null (default editing)", () => {
+		// Index 2 is right after "hi"; a space still separates it from the token.
+		expect(computeTokenDeletion(draft, 2, 2, "Backspace")).toBeNull();
+	});
+
+	it("a selection overlapping a token expands to cover the whole token", () => {
+		// Selection runs from index 1 (inside "hi ") to 3 chars into the token.
+		const outcome = computeTokenDeletion(draft, 1, from + 3, "Backspace");
+		expect(outcome).not.toBeNull();
+		// Expected removal is [1, to): only "h" and the trailing "!" remain.
+		expect(outcome?.text).toBe("h!");
+		expect(outcome?.removedIds).toEqual(["abcabc"]);
+	});
+
+	it("a range selection touching no token returns null", () => {
+		expect(computeTokenDeletion(draft, 0, 2, "Backspace")).toBeNull();
+	});
+});
+
+describe("parseDraft", () => {
+	it("returns plain text + null content when there are no attachments", () => {
+		const { displayText, content } = parseDraft("just text", new Map());
+		expect(displayText).toBe("just text");
+		expect(content).toBeNull();
+	});
+
+	it("interleaves text and attachment parts in order", () => {
+		const picture = img("aaaaaa");
+		const doc = pdf("bbbbbb");
+		const staged = new Map([picture, doc].map((att) => [att.id, att] as const));
+		const imageToken = makeAttachmentToken("image", picture.id);
+		const pdfToken = makeAttachmentToken("pdf", doc.id);
+		const { displayText, content } = parseDraft(`A: ${imageToken} B: ${pdfToken} end`, staged);
+
+		// In the display text each token is expected to be replaced by the
+		// marker for its kind.
+		expect(displayText).toBe(`A: ${markerFor("image")} B: ${markerFor("pdf")} end`);
+
+		// Expected parts alternate text / attachment in draft order; attachment
+		// parts carry the staged payload (plus `name` when set) but no `id`.
+		expect(content).toEqual([
+			{ type: "text", text: "A: " },
+			{ type: "attachment", mediaType: "image/png", data: "QQ==" },
+			{ type: "text", text: " B: " },
+			{ type: "attachment", mediaType: "application/pdf", data: "QQ==", name: "doc.pdf" },
+			{ type: "text", text: " end" },
+		]);
+	});
+
+	it("treats an orphan token (no staged attachment) as plain text", () => {
+		// The draft mentions id "zzzzzz" but nothing is staged under that id.
+		const orphan = makeAttachmentToken("image", "zzzzzz");
+		const { displayText, content } = parseDraft(`x ${orphan} y`, new Map());
+		expect(displayText).toBe(`x ${markerFor("image")} y`);
+		// Without a real attachment the draft goes out as a plain-text send.
+		expect(content).toBeNull();
+	});
+});
diff --git a/packages/frontend/tests/chat-store.test.ts b/packages/frontend/tests/chat-store.test.ts
index a0d4ead..8639bff 100644
--- a/packages/frontend/tests/chat-store.test.ts
+++ b/packages/frontend/tests/chat-store.test.ts
@@ -2126,3 +2126,78 @@ describe("tabStore — per-tab chat input draft", () => {
 		expect(store.tabs.every((t) => t.draft === "")).toBe(true);
 	});
 });
+
+describe("tabStore — image/pdf attachments", () => {
+	function imgAttachment(id: string) {
+		return { id, kind: "image" as const, mediaType: "image/png", data: "QQ==" };
+	}
+
+	/**
+	 * Stub global `fetch`: `/chat` resolves `{ status: "ok" }`, all else `{}`.
+	 */
+	function stubFetch() {
+		const fetchMock = vi.fn((url: string) => {
+			const payload = typeof url === "string" && url.endsWith("/chat") ? { status: "ok" } : {};
+			return Promise.resolve({ ok: true, json: () => Promise.resolve(payload) });
+		});
+		vi.stubGlobal("fetch", fetchMock);
+		return fetchMock;
+	}
+
+	/**
+	 * Parsed JSON body of the first `/chat` request seen by `fetchMock`. Asserts
+	 * the request exists so a missing call fails as an expectation, not a TypeError.
+	 */
+	function chatRequestBody(fetchMock: ReturnType<typeof stubFetch>) {
+		const chatCall = fetchMock.mock.calls.find(
+			(c) => typeof c[0] === "string" && (c[0] as string).endsWith("/chat"),
+		);
+		expect(chatCall).toBeDefined();
+		return JSON.parse((chatCall?.[1] as { body: string }).body);
+	}
+
+	it("stages attachments and reconciles them against intact draft tokens", async () => {
+		stubFetch();
+		const store = createTabStore();
+		const a = await store.createNewTab();
+		store.switchTab(a.id);
+
+		store.addAttachment(a.id, imgAttachment("aaaaaa"));
+		// Draft carries the token → attachment survives.
+		store.setDraft(a.id, "look 【image:aaaaaa】");
+		expect(store.activeTab?.attachments.map((x) => x.id)).toEqual(["aaaaaa"]);
+
+		// Remove the token from the draft → attachment is detached.
+		store.setDraft(a.id, "look ");
+		expect(store.activeTab?.attachments).toHaveLength(0);
+	});
+
+	it("sendMessage posts ordered multimodal content with the text projection", async () => {
+		const fetchMock = stubFetch();
+		const store = createTabStore();
+		const a = await store.createNewTab();
+		store.switchTab(a.id);
+
+		await store.sendMessage("here is A: [image]", [
+			{ type: "text", text: "here is A: " },
+			{ type: "attachment", mediaType: "image/png", data: "QQ==" },
+		]);
+
+		const body = chatRequestBody(fetchMock);
+		expect(body.message).toBe("here is A: [image]");
+		expect(body.content).toEqual([
+			{ type: "text", text: "here is A: " },
+			{ type: "attachment", mediaType: "image/png", data: "QQ==" },
+		]);
+	});
+
+	it("sendMessage omits content for a plain-text message", async () => {
+		const fetchMock = stubFetch();
+		const store = createTabStore();
+		await store.createNewTab();
+		await store.sendMessage("just text");
+
+		const body = chatRequestBody(fetchMock);
+		expect(body.content).toBeUndefined();
+	});
+});
-- 
cgit v1.2.3