From 66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae Mon Sep 17 00:00:00 2001
From: Adam Malczewski <github@tradam.dev>
Date: Tue, 2 Jun 2026 22:50:11 +0900
Subject: feat(chat): paste-to-attach images/PDFs with model capability check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add multimodal image/PDF input to the chat box via clipboard paste, gated by a
graceful per-model capability check.

UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the
draft, so attachments have ORDER relative to typed text and can be referenced
positionally. The token is the only handle — deleting it (atomic Backspace/
Delete, or selection overlap) detaches the file; an input-reconciliation safety
net detaches any attachment whose token is no longer intact. No preview strip.

Capability check: resolveModelCapabilities reads models.dev modalities.input
(new GET /models/capabilities, mirrors /context-limit). The input blocks Send
(no tokens spent) only on a definitive 'no'; unknown capability (catalog offline
/ unmapped provider) stays permissive. Attachments require a fresh turn — Send is
blocked while generating and /chat rejects content mid-turn (409).

Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK
ImagePart/FilePart content, but never persisted (history keeps the text with
[image]/[pdf] markers). Text-only turns serialize byte-identically to before.

Limits (Anthropic-aligned, enforced at paste + re-validated server-side):
PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total.

core: UserContentPart types, models/attachments validator, capability resolver,
agent.run+toModelMessages thread ordered content. api: /chat content validation +
passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating,
per-tab staged attachments, App.svelte capability fetch. +44 tests.
---
 packages/core/src/agent/agent.ts               | Bin 57822 -> 60515 bytes
 packages/core/src/index.ts                     |  17 +++
 packages/core/src/models/attachments.ts        | 151 +++++++++++++++++++++++++
 packages/core/src/models/catalog.ts            |  50 ++++++++
 packages/core/src/models/index.ts              |  19 ++++
 packages/core/src/types/index.ts               |  49 ++++++++
 packages/core/tests/agent/agent.test.ts        |  98 ++++++++++++++++
 packages/core/tests/models/attachments.test.ts | 136 ++++++++++++++++++++++
 packages/core/tests/models/catalog.test.ts     |  75 +++++++++++-
 9 files changed, 592 insertions(+), 3 deletions(-)
 create mode 100644 packages/core/src/models/attachments.ts
 create mode 100644 packages/core/tests/models/attachments.test.ts

(limited to 'packages/core')

diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index 4bfa7eb..08b317a 100644
Binary files a/packages/core/src/agent/agent.ts and b/packages/core/src/agent/agent.ts differ
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 08b426f..50012f1 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -82,9 +82,26 @@ export {
 } from "./lsp/index.js";
 // Models
 export {
+	ACCEPTED_ATTACHMENT_MEDIA_TYPES,
+	ACCEPTED_IMAGE_MEDIA_TYPES,
+	ACCEPTED_PDF_MEDIA_TYPE,
+	type AttachmentValidationError,
+	type AttachmentValidationResult,
+	base64ByteLength,
 	getModelsCatalog,
+	hasAttachments,
+	isAcceptedAttachmentMediaType,
+	isImageMediaType,
+	isPdfMediaType,
+	MAX_ATTACHMENTS,
+	MAX_IMAGE_BYTES,
+	MAX_PDF_BYTES,
+	MAX_TOTAL_ATTACHMENT_BYTES,
+	type ModelInputCapabilities,
 	ModelRegistry,
 	resolveContextLimit,
+	resolveModelCapabilities,
+	validateUserContent,
 } from "./models/index.js";
 // Notifications (ntfy.sh)
 export * from "./notifications/index.js";
diff --git a/packages/core/src/models/attachments.ts b/packages/core/src/models/attachments.ts
new file mode 100644
index 0000000..5c98db4
--- /dev/null
+++ b/packages/core/src/models/attachments.ts
@@ -0,0 +1,151 @@
+// Validation + limits for multimodal user attachments (images / PDFs).
+//
+// Kept dependency-free (no DB / `bun:sqlite` import) so both the API layer
+// (`/chat` request validation) and any future caller can share the exact same
+// allowlist and size/count ceilings. The limits mirror Anthropic's documented
+// vision/PDF API constraints (the only image-capable providers Dispatch maps),
+// so a request that passes here won't be rejected by the provider for size.
+
+import type { UserAttachmentPart, UserContentPart } from "../types/index.js";
+
+/** Accepted image media types. */
+export const ACCEPTED_IMAGE_MEDIA_TYPES = [
+	"image/png",
+	"image/jpeg",
+	"image/webp",
+	"image/gif",
+] as const;
+
+/** The single accepted document media type (PDF). */
+export const ACCEPTED_PDF_MEDIA_TYPE = "application/pdf";
+
+/** Every media type we accept as an attachment. */
+export const ACCEPTED_ATTACHMENT_MEDIA_TYPES = [
+	...ACCEPTED_IMAGE_MEDIA_TYPES,
+	ACCEPTED_PDF_MEDIA_TYPE,
+] as const;
+
+/** Per-image byte ceiling (Anthropic: 5 MB/image). */
+export const MAX_IMAGE_BYTES = 5 * 1024 * 1024;
+
+/** Per-PDF byte ceiling (Anthropic: 32 MB/PDF). */
+export const MAX_PDF_BYTES = 32 * 1024 * 1024;
+
+/** Max attachments per message, images and PDFs combined (Anthropic: 20 images/request). */
+export const MAX_ATTACHMENTS = 20;
+
+/**
+ * Total attachment payload ceiling for a single request (decoded bytes). Bounds
+ * the overall request size even when each individual file is within its limit.
+ */
+export const MAX_TOTAL_ATTACHMENT_BYTES = 32 * 1024 * 1024;
+
+/** Whether a media type is an accepted image type. */
+export function isImageMediaType(mediaType: string): boolean {
+	return (ACCEPTED_IMAGE_MEDIA_TYPES as readonly string[]).includes(mediaType);
+}
+
+/** Whether a media type is the accepted PDF type. */
+export function isPdfMediaType(mediaType: string): boolean {
+	return mediaType === ACCEPTED_PDF_MEDIA_TYPE;
+}
+
+/** Whether a media type is an accepted attachment type at all. */
+export function isAcceptedAttachmentMediaType(mediaType: string): boolean {
+	return (ACCEPTED_ATTACHMENT_MEDIA_TYPES as readonly string[]).includes(mediaType);
+}
+
+/**
+ * Decoded byte length of a base64 string, computed WITHOUT allocating the
+ * decoded buffer. Tolerates an optional `data:<mediaType>;base64,` prefix and
+ * any embedded whitespace/newlines. Returns 0 for an empty/whitespace string. The input is not validated as base64: malformed data such as a lone "=" yields a negative length.
+ */
+export function base64ByteLength(b64: string): number {
+	// Strip a data-URI prefix if present.
+	const comma = b64.indexOf(",");
+	const body = b64.startsWith("data:") && comma !== -1 ? b64.slice(comma + 1) : b64;
+	let len = 0;
+	let pad = 0;
+	for (let i = 0; i < body.length; i++) {
+		const ch = body.charCodeAt(i);
+		// Skip whitespace (space, \t, \n, \r).
+		if (ch === 32 || ch === 9 || ch === 10 || ch === 13) continue;
+		len++;
+		if (body[i] === "=") pad++;
+	}
+	if (len === 0) return 0;
+	// 4 base64 chars → 3 bytes, minus padding.
+	return Math.floor((len * 3) / 4) - pad;
+}
+
+export type AttachmentValidationError =
+	| { code: "unsupported-type"; mediaType: string }
+	| { code: "image-too-large"; mediaType: string; bytes: number; limit: number }
+	| { code: "pdf-too-large"; bytes: number; limit: number }
+	| { code: "too-many"; count: number; limit: number }
+	| { code: "total-too-large"; bytes: number; limit: number }
+	| { code: "empty"; mediaType: string };
+
+export interface AttachmentValidationResult {
+	ok: boolean;
+	errors: AttachmentValidationError[];
+}
+
+/** Extract just the attachment parts from a mixed content list. */
+function attachmentsOf(content: UserContentPart[]): UserAttachmentPart[] {
+	return content.filter((p): p is UserAttachmentPart => p.type === "attachment");
+}
+
+/**
+ * Validate the attachments in a multimodal user content list against the
+ * media-type allowlist and the size/count ceilings. Pure: never throws,
+ * collects every violation so the caller can report them all at once.
+ *
+ * Text parts are ignored (always valid). An empty content list is valid (it's
+ * just a text-only message expressed as parts).
+ */
+export function validateUserContent(content: UserContentPart[]): AttachmentValidationResult {
+	const errors: AttachmentValidationError[] = [];
+	const attachments = attachmentsOf(content);
+
+	if (attachments.length > MAX_ATTACHMENTS) {
+		errors.push({ code: "too-many", count: attachments.length, limit: MAX_ATTACHMENTS });
+	}
+
+	let total = 0;
+	for (const att of attachments) {
+		if (!isAcceptedAttachmentMediaType(att.mediaType)) {
+			errors.push({ code: "unsupported-type", mediaType: att.mediaType });
+			continue;
+		}
+		const bytes = base64ByteLength(att.data);
+		total += bytes;
+		if (bytes === 0) {
+			errors.push({ code: "empty", mediaType: att.mediaType });
+			continue;
+		}
+		if (isPdfMediaType(att.mediaType)) {
+			if (bytes > MAX_PDF_BYTES) {
+				errors.push({ code: "pdf-too-large", bytes, limit: MAX_PDF_BYTES });
+			}
+		} else if (bytes > MAX_IMAGE_BYTES) {
+			errors.push({
+				code: "image-too-large",
+				mediaType: att.mediaType,
+				bytes,
+				limit: MAX_IMAGE_BYTES,
+			});
+		}
+	}
+
+	if (total > MAX_TOTAL_ATTACHMENT_BYTES) {
+		errors.push({ code: "total-too-large", bytes: total, limit: MAX_TOTAL_ATTACHMENT_BYTES });
+	}
+
+	return { ok: errors.length === 0, errors };
+}
+
+/** Convenience: does the content list contain at least one attachment? */
+export function hasAttachments(content: UserContentPart[] | undefined | null): boolean {
+	return !!content && content.some((p) => p.type === "attachment");
+}
diff --git a/packages/core/src/models/catalog.ts b/packages/core/src/models/catalog.ts
index dea4647..ac310b1 100644
--- a/packages/core/src/models/catalog.ts
+++ b/packages/core/src/models/catalog.ts
@@ -18,6 +18,15 @@ interface ModelsDevModel {
 		context?: number;
 		output?: number;
 	};
+	/**
+	 * Input/output modalities the model accepts. We read `input` to decide
+	 * whether the model can take image / pdf attachments. Absent on older
+	 * catalog entries — treated as "unknown" (capability resolves to `null`).
+	 */
+	modalities?: {
+		input?: string[];
+		output?: string[];
+	};
 }
 
 interface ModelsDevProvider {
@@ -172,6 +181,47 @@ export async function resolveContextLimit(
 	return null;
 }
 
+/**
+ * Image / PDF input capabilities for a model, resolved from the models.dev
+ * catalog's `modalities.input` list.
+ */
+export interface ModelInputCapabilities {
+	/** Model accepts image input (vision). */
+	image: boolean;
+	/** Model accepts PDF/document input. */
+	pdf: boolean;
+}
+
+/**
+ * Resolve whether a model accepts image / pdf input for the given Dispatch
+ * provider + model id. Returns `null` when the capability is UNKNOWN — i.e. the
+ * provider is unsupported/unmapped, the model is absent from the catalog, the
+ * entry predates the `modalities` field, or the catalog is unavailable. Callers
+ * should treat `null` as "can't verify" (optimistic allow) rather than a
+ * definitive "no", so a temporary catalog outage never disables a known-good
+ * vision model.
+ *
+ * A non-null result means the catalog DID describe the model's input modalities
+ * — `{ image, pdf }` then reflects exactly what it advertises (a definitive
+ * yes/no for each).
+ */
+export async function resolveModelCapabilities(
+	provider: string,
+	modelId: string,
+): Promise<ModelInputCapabilities | null> {
+	const candidates = PROVIDER_MAP[provider];
+	if (!candidates || !modelId) return null;
+
+	const catalog = await getModelsCatalog();
+	for (const providerId of candidates) {
+		const input = catalog[providerId]?.models?.[modelId]?.modalities?.input;
+		if (Array.isArray(input)) {
+			return { image: input.includes("image"), pdf: input.includes("pdf") };
+		}
+	}
+	return null;
+}
+
 /** Test-only: reset the in-process memo so a test can re-exercise loading. */
 export function __resetCatalogCacheForTests(): void {
 	cached = null;
diff --git a/packages/core/src/models/index.ts b/packages/core/src/models/index.ts
index 2fcd657..15d1ee2 100644
--- a/packages/core/src/models/index.ts
+++ b/packages/core/src/models/index.ts
@@ -1,5 +1,24 @@
+export {
+	ACCEPTED_ATTACHMENT_MEDIA_TYPES,
+	ACCEPTED_IMAGE_MEDIA_TYPES,
+	ACCEPTED_PDF_MEDIA_TYPE,
+	type AttachmentValidationError,
+	type AttachmentValidationResult,
+	base64ByteLength,
+	hasAttachments,
+	isAcceptedAttachmentMediaType,
+	isImageMediaType,
+	isPdfMediaType,
+	MAX_ATTACHMENTS,
+	MAX_IMAGE_BYTES,
+	MAX_PDF_BYTES,
+	MAX_TOTAL_ATTACHMENT_BYTES,
+	validateUserContent,
+} from "./attachments.js";
 export {
 	getModelsCatalog,
+	type ModelInputCapabilities,
 	resolveContextLimit,
+	resolveModelCapabilities,
 } from "./catalog.js";
 export { ModelRegistry } from "./registry.js";
diff --git a/packages/core/src/types/index.ts b/packages/core/src/types/index.ts
index 607b27d..273e074 100644
--- a/packages/core/src/types/index.ts
+++ b/packages/core/src/types/index.ts
@@ -76,8 +76,57 @@ export interface SystemChunk {
 export interface ChatMessage {
 	role: MessageRole;
 	chunks: Chunk[];
+	/**
+	 * Ephemeral ORDERED multimodal content for a user turn (interleaved text +
+	 * image/pdf attachments). Set ONLY transiently on the in-flight user message
+	 * so `toModelMessages` can emit multimodal `ImagePart`/`FilePart` content to
+	 * the provider. Never persisted (the chunk log stores only the text, with
+	 * `[image]`/`[pdf]` markers), so it's absent on history-rebuilt messages.
+	 * When absent, the message is plain text built from its `chunks`.
+	 */
+	content?: UserContentPart[];
 }
 
+// ─── Multimodal user content (image / PDF attachments) ───────────
+//
+// When a user pastes one or more images/PDFs into the chat input, the turn's
+// user message carries an ORDERED list of content parts instead of a plain
+// string. The ordering is meaningful — the user can interleave text and
+// attachments ("here is image A: <A>, here is image B: <B>") and the model
+// sees them in exactly that sequence.
+//
+// These parts are EPHEMERAL: they are forwarded to the model for the turn that
+// produced them but are NOT persisted as raw bytes in the chunk log. History
+// stores only the user's text (with `[image]` / `[pdf]` markers in place of
+// each attachment), so a later reload re-renders the text but never re-sends
+// the binary payload. This keeps the persisted log small and avoids re-billing
+// image tokens on every subsequent turn.
+
+/** A plain-text segment of a multimodal user message. */
+export interface UserTextPart {
+	type: "text";
+	text: string;
+}
+
+/**
+ * A binary attachment (image or PDF) in a multimodal user message. `data` is a
+ * base64-encoded payload (no `data:` URI prefix); `mediaType` is the IANA media
+ * type (e.g. `image/png`, `application/pdf`). `name` is an optional original
+ * filename, used only for PDF `filename` passthrough and diagnostics.
+ */
+export interface UserAttachmentPart {
+	type: "attachment";
+	/** IANA media type, e.g. `image/png`, `image/jpeg`, `application/pdf`. */
+	mediaType: string;
+	/** Base64-encoded bytes WITHOUT a `data:` URI prefix. */
+	data: string;
+	/** Optional original filename (mainly for PDFs). */
+	name?: string;
+}
+
+/** One ordered part of a multimodal user message. */
+export type UserContentPart = UserTextPart | UserAttachmentPart;
+
 // ─── Append-only chunk log (persisted model) ─────────────────────
 //
 // The DB stores a conversation as a flat stream of `ChunkRow`s (see
diff --git a/packages/core/tests/agent/agent.test.ts b/packages/core/tests/agent/agent.test.ts
index d8edec7..f4b33cc 100644
--- a/packages/core/tests/agent/agent.test.ts
+++ b/packages/core/tests/agent/agent.test.ts
@@ -1544,4 +1544,102 @@ describe("anthropicThinkingProviderOptions — adaptive-thinking model detection
 			effort: "xhigh",
 		});
 	});
+
+	describe("multimodal user content", () => {
+		it("emits ordered text + image parts to the model when content is provided", async () => {
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
+			);
+
+			const agent = new Agent(makeConfig());
+			for await (const _ of agent.run("here is image A: [image]", {
+				content: [
+					{ type: "text", text: "here is image A: " },
+					{ type: "attachment", mediaType: "image/png", data: "QQ==" },
+				],
+			})) {
+				// consume
+			}
+
+			const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
+			const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
+			const userMsg = messages.find((m) => m.role === "user");
+			expect(userMsg).toBeDefined();
+			// Multimodal turn → content is an ordered parts array, not a string.
+			expect(Array.isArray(userMsg?.content)).toBe(true);
+			const parts = userMsg?.content as Array<Record<string, unknown>>;
+			expect(parts[0]).toMatchObject({ type: "text", text: "here is image A: " });
+			expect(parts[1]).toMatchObject({ type: "image", mediaType: "image/png" });
+			expect(String(parts[1]?.image)).toBe("data:image/png;base64,QQ==");
+		});
+
+		it("emits a FilePart for a PDF attachment with its filename", async () => {
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
+			);
+
+			const agent = new Agent(makeConfig());
+			for await (const _ of agent.run("see [pdf]", {
+				content: [
+					{ type: "text", text: "see " },
+					{ type: "attachment", mediaType: "application/pdf", data: "QQ==", name: "doc.pdf" },
+				],
+			})) {
+				// consume
+			}
+
+			const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
+			const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
+			const userMsg = messages.find((m) => m.role === "user");
+			const parts = userMsg?.content as Array<Record<string, unknown>>;
+			const filePart = parts.find((p) => p.type === "file");
+			expect(filePart).toMatchObject({
+				type: "file",
+				mediaType: "application/pdf",
+				filename: "doc.pdf",
+			});
+			expect(String(filePart?.data)).toBe("data:application/pdf;base64,QQ==");
+		});
+
+		it("persists the user turn as text only (no content) for history", async () => {
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
+			);
+
+			const agent = new Agent(makeConfig());
+			for await (const _ of agent.run("look: [image]", {
+				content: [
+					{ type: "text", text: "look: " },
+					{ type: "attachment", mediaType: "image/png", data: "QQ==" },
+				],
+			})) {
+				// consume
+			}
+
+			// The in-memory user message keeps the text chunk for the render/persist
+			// path; the ephemeral `content` rides alongside it but isn't a chunk.
+			const userMsg = agent.messages.find((m) => m.role === "user");
+			expect(userMsg?.chunks).toEqual([{ type: "text", text: "look: [image]" }]);
+		});
+
+		it("falls back to a plain string when content has no attachment", async () => {
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
+			);
+
+			const agent = new Agent(makeConfig());
+			for await (const _ of agent.run("plain text", {
+				content: [{ type: "text", text: "plain text" }],
+			})) {
+				// consume
+			}
+
+			const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
+			const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
+			const userMsg = messages.find((m) => m.role === "user");
+			// No attachment → plain string content (byte-identical to text-only path).
+			expect(typeof userMsg?.content).toBe("string");
+			expect(userMsg?.content).toBe("plain text");
+		});
+	});
 });
diff --git a/packages/core/tests/models/attachments.test.ts b/packages/core/tests/models/attachments.test.ts
new file mode 100644
index 0000000..11a9f82
--- /dev/null
+++ b/packages/core/tests/models/attachments.test.ts
@@ -0,0 +1,136 @@
+import { describe, expect, it } from "vitest";
+import {
+	base64ByteLength,
+	isAcceptedAttachmentMediaType,
+	isImageMediaType,
+	isPdfMediaType,
+	MAX_ATTACHMENTS,
+	MAX_IMAGE_BYTES,
+	MAX_PDF_BYTES,
+	MAX_TOTAL_ATTACHMENT_BYTES,
+	validateUserContent,
+} from "../../src/models/attachments.js";
+import type { UserContentPart } from "../../src/types/index.js";
+
+/** An unpadded base64 string that decodes to `bytes` bytes, rounded up to the next multiple of 3. */
+function base64OfBytes(bytes: number): string {
+	// 4 base64 chars → 3 bytes. Use a multiple of 3 for clean (unpadded) output.
+	const groups = Math.ceil(bytes / 3);
+	return "A".repeat(groups * 4);
+}
+
+function imagePart(data: string, mediaType = "image/png"): UserContentPart {
+	return { type: "attachment", mediaType, data };
+}
+
+describe("media-type predicates", () => {
+	it("classifies image types", () => {
+		expect(isImageMediaType("image/png")).toBe(true);
+		expect(isImageMediaType("image/jpeg")).toBe(true);
+		expect(isImageMediaType("image/webp")).toBe(true);
+		expect(isImageMediaType("image/gif")).toBe(true);
+		expect(isImageMediaType("application/pdf")).toBe(false);
+		expect(isImageMediaType("image/svg+xml")).toBe(false);
+	});
+
+	it("classifies pdf + accepted types", () => {
+		expect(isPdfMediaType("application/pdf")).toBe(true);
+		expect(isPdfMediaType("image/png")).toBe(false);
+		expect(isAcceptedAttachmentMediaType("image/gif")).toBe(true);
+		expect(isAcceptedAttachmentMediaType("application/pdf")).toBe(true);
+		expect(isAcceptedAttachmentMediaType("text/plain")).toBe(false);
+	});
+});
+
+describe("base64ByteLength", () => {
+	it("computes decoded length without padding", () => {
+		// "AAAA" → 3 bytes.
+		expect(base64ByteLength("AAAA")).toBe(3);
+	});
+
+	it("accounts for padding", () => {
+		// "QQ==" → 1 byte ("A").
+		expect(base64ByteLength("QQ==")).toBe(1);
+		// "QUI=" → 2 bytes ("AB").
+		expect(base64ByteLength("QUI=")).toBe(2);
+	});
+
+	it("tolerates a data: URI prefix and whitespace", () => {
+		expect(base64ByteLength("data:image/png;base64,AAAA")).toBe(3);
+		expect(base64ByteLength("AA\nAA")).toBe(3);
+	});
+
+	it("returns 0 for empty input", () => {
+		expect(base64ByteLength("")).toBe(0);
+		expect(base64ByteLength("   ")).toBe(0);
+	});
+});
+
+describe("validateUserContent", () => {
+	it("accepts a small image and ignores text parts", () => {
+		const content: UserContentPart[] = [
+			{ type: "text", text: "hi" },
+			imagePart(base64OfBytes(1024)),
+		];
+		expect(validateUserContent(content)).toEqual({ ok: true, errors: [] });
+	});
+
+	it("accepts an empty / text-only content list", () => {
+		expect(validateUserContent([]).ok).toBe(true);
+		expect(validateUserContent([{ type: "text", text: "no files" }]).ok).toBe(true);
+	});
+
+	it("rejects an unsupported media type", () => {
+		const res = validateUserContent([imagePart(base64OfBytes(10), "image/svg+xml")]);
+		expect(res.ok).toBe(false);
+		expect(res.errors[0]).toMatchObject({ code: "unsupported-type", mediaType: "image/svg+xml" });
+	});
+
+	it("rejects an oversized image but allows a PDF of the same size", () => {
+		const big = base64OfBytes(MAX_IMAGE_BYTES + 3);
+		const imgRes = validateUserContent([imagePart(big, "image/png")]);
+		expect(imgRes.ok).toBe(false);
+		expect(imgRes.errors.some((e) => e.code === "image-too-large")).toBe(true);
+
+		// Same byte size as a PDF is fine (PDF limit is much higher).
+		const pdfRes = validateUserContent([imagePart(big, "application/pdf")]);
+		expect(pdfRes.ok).toBe(true);
+	});
+
+	it("rejects an oversized PDF", () => {
+		const res = validateUserContent([
+			imagePart(base64OfBytes(MAX_PDF_BYTES + 3), "application/pdf"),
+		]);
+		expect(res.ok).toBe(false);
+		expect(res.errors.some((e) => e.code === "pdf-too-large")).toBe(true);
+	});
+
+	it("rejects an empty attachment payload", () => {
+		const res = validateUserContent([imagePart("", "image/png")]);
+		expect(res.ok).toBe(false);
+		expect(res.errors.some((e) => e.code === "empty")).toBe(true);
+	});
+
+	it("rejects too many attachments", () => {
+		const content: UserContentPart[] = Array.from({ length: MAX_ATTACHMENTS + 1 }, () =>
+			imagePart(base64OfBytes(8)),
+		);
+		const res = validateUserContent(content);
+		expect(res.ok).toBe(false);
+		expect(res.errors.some((e) => e.code === "too-many")).toBe(true);
+	});
+
+	it("rejects when the total payload exceeds the request ceiling", () => {
+		// Several individually-legal PDFs that together exceed the total cap.
+		const each = Math.floor(MAX_TOTAL_ATTACHMENT_BYTES / 3);
+		const content: UserContentPart[] = [
+			imagePart(base64OfBytes(each), "application/pdf"),
+			imagePart(base64OfBytes(each), "application/pdf"),
+			imagePart(base64OfBytes(each), "application/pdf"),
+			imagePart(base64OfBytes(each), "application/pdf"),
+		];
+		const res = validateUserContent(content);
+		expect(res.ok).toBe(false);
+		expect(res.errors.some((e) => e.code === "total-too-large")).toBe(true);
+	});
+});
diff --git a/packages/core/tests/models/catalog.test.ts b/packages/core/tests/models/catalog.test.ts
index 51043e6..f4bddc2 100644
--- a/packages/core/tests/models/catalog.test.ts
+++ b/packages/core/tests/models/catalog.test.ts
@@ -4,6 +4,7 @@ import {
 	__resetCatalogCacheForTests,
 	getModelsCatalog,
 	resolveContextLimit,
+	resolveModelCapabilities,
 } from "../../src/models/catalog.js";
 
 const CACHE_PATH = "/tmp/dispatch/models-dev.json";
@@ -13,14 +14,30 @@ const CATALOG = {
 	anthropic: {
 		id: "anthropic",
 		models: {
-			"claude-sonnet-4-5": { limit: { context: 200000, output: 64000 } },
-			"claude-sonnet-4-6": { limit: { context: 1000000, output: 64000 } },
+			"claude-sonnet-4-5": {
+				limit: { context: 200000, output: 64000 },
+				modalities: { input: ["text", "image", "pdf"], output: ["text"] },
+			},
+			"claude-sonnet-4-6": {
+				limit: { context: 1000000, output: 64000 },
+				modalities: { input: ["text", "image", "pdf"], output: ["text"] },
+			},
+			// A text-only model: definitively no image/pdf input.
+			"text-only-model": {
+				limit: { context: 100000, output: 8192 },
+				modalities: { input: ["text"], output: ["text"] },
+			},
+			// An entry predating the modalities field → capability unknown.
+			"legacy-model": { limit: { context: 100000, output: 8192 } },
 		},
 	},
 	opencode: {
 		id: "opencode",
 		models: {
-			"glm-4-6": { limit: { context: 131072, output: 8192 } },
+			"glm-4-6": {
+				limit: { context: 131072, output: 8192 },
+				modalities: { input: ["text", "image"], output: ["text"] },
+			},
 		},
 	},
 };
@@ -156,3 +173,55 @@ describe("getModelsCatalog caching", () => {
 		warn.mockRestore();
 	});
 });
+
+describe("resolveModelCapabilities", () => {
+	it("reports image + pdf for a vision model", async () => {
+		mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("anthropic", "claude-sonnet-4-5")).toEqual({
+			image: true,
+			pdf: true,
+		});
+	});
+
+	it("reports image-only for a model whose modalities omit pdf", async () => {
+		mockFetchOnce(CATALOG);
+		// glm-4-6 lists image but not pdf (resolved via the opencode fallback).
+		expect(await resolveModelCapabilities("opencode-anthropic", "glm-4-6")).toEqual({
+			image: true,
+			pdf: false,
+		});
+	});
+
+	it("reports a definitive no for a text-only model", async () => {
+		mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("anthropic", "text-only-model")).toEqual({
+			image: false,
+			pdf: false,
+		});
+	});
+
+	it("returns null (unknown) for an entry without modalities", async () => {
+		mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("anthropic", "legacy-model")).toBeNull();
+	});
+
+	it("returns null (unknown) for an unknown model id", async () => {
+		mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("anthropic", "no-such-model")).toBeNull();
+	});
+
+	it("returns null for an unsupported provider without hitting the network", async () => {
+		const fetchFn = mockFetchOnce(CATALOG);
+		expect(await resolveModelCapabilities("google", "gemini-2.5-pro")).toBeNull();
+		expect(await resolveModelCapabilities("anthropic", "")).toBeNull();
+		expect(fetchFn).not.toHaveBeenCalled();
+	});
+
+	it("returns null (unknown) when the catalog is offline with no cache", async () => {
+		const fetchFn = vi.fn(() => Promise.reject(new Error("offline")));
+		vi.stubGlobal("fetch", fetchFn);
+		const warn = vi.spyOn(console, "warn").mockImplementation(() => {});
+		expect(await resolveModelCapabilities("anthropic", "claude-sonnet-4-5")).toBeNull();
+		warn.mockRestore();
+	});
+});
-- 
cgit v1.2.3