From 66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae Mon Sep 17 00:00:00 2001 From: Adam Malczewski Date: Tue, 2 Jun 2026 22:50:11 +0900 Subject: feat(chat): paste-to-attach images/PDFs with model capability check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multimodal image/PDF input to the chat box via clipboard paste, gated by a graceful per-model capability check. UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the draft, so attachments have ORDER relative to typed text and can be referenced positionally. The token is the only handle — deleting it (atomic Backspace/ Delete, or selection overlap) detaches the file; an input-reconciliation safety net detaches any attachment whose token is no longer intact. No preview strip. Capability check: resolveModelCapabilities reads models.dev modalities.input (new GET /models/capabilities, mirrors /context-limit). The input blocks Send (no tokens spent) only on a definitive 'no'; unknown capability (catalog offline / unmapped provider) stays permissive. Attachments require a fresh turn — Send is blocked while generating and /chat rejects content mid-turn (409). Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK ImagePart/FilePart content, but never persisted (history keeps the text with [image]/[pdf] markers). Text-only turns serialize byte-identically to before. Limits (Anthropic-aligned, enforced at paste + re-validated server-side): PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total. core: UserContentPart types, models/attachments validator, capability resolver, agent.run+toModelMessages thread ordered content. api: /chat content validation + passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability fetch. +44 tests. 
--- packages/core/src/agent/agent.ts | Bin 57822 -> 60515 bytes packages/core/src/index.ts | 17 +++ packages/core/src/models/attachments.ts | 151 +++++++++++++++++++++++++ packages/core/src/models/catalog.ts | 50 ++++++++ packages/core/src/models/index.ts | 19 ++++ packages/core/src/types/index.ts | 49 ++++++++ packages/core/tests/agent/agent.test.ts | 98 ++++++++++++++++ packages/core/tests/models/attachments.test.ts | 136 ++++++++++++++++++++++ packages/core/tests/models/catalog.test.ts | 75 +++++++++++- 9 files changed, 592 insertions(+), 3 deletions(-) create mode 100644 packages/core/src/models/attachments.ts create mode 100644 packages/core/tests/models/attachments.test.ts (limited to 'packages/core') diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 4bfa7eb..08b317a 100644 Binary files a/packages/core/src/agent/agent.ts and b/packages/core/src/agent/agent.ts differ diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 08b426f..50012f1 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -82,9 +82,26 @@ export { } from "./lsp/index.js"; // Models export { + ACCEPTED_ATTACHMENT_MEDIA_TYPES, + ACCEPTED_IMAGE_MEDIA_TYPES, + ACCEPTED_PDF_MEDIA_TYPE, + type AttachmentValidationError, + type AttachmentValidationResult, + base64ByteLength, getModelsCatalog, + hasAttachments, + isAcceptedAttachmentMediaType, + isImageMediaType, + isPdfMediaType, + MAX_ATTACHMENTS, + MAX_IMAGE_BYTES, + MAX_PDF_BYTES, + MAX_TOTAL_ATTACHMENT_BYTES, + type ModelInputCapabilities, ModelRegistry, resolveContextLimit, + resolveModelCapabilities, + validateUserContent, } from "./models/index.js"; // Notifications (ntfy.sh) export * from "./notifications/index.js"; diff --git a/packages/core/src/models/attachments.ts b/packages/core/src/models/attachments.ts new file mode 100644 index 0000000..5c98db4 --- /dev/null +++ b/packages/core/src/models/attachments.ts @@ -0,0 +1,151 @@ +// Validation + limits for 
// Validation + limits for multimodal user attachments (images / PDFs).
//
// Kept dependency-free (no DB / `bun:sqlite` import) so both the API layer
// (`/chat` request validation) and any future caller can share the exact same
// allowlist and size/count ceilings. The limits mirror Anthropic's documented
// vision/PDF API constraints (the only image-capable providers Dispatch maps),
// so a request that passes here won't be rejected by the provider for size.

import type { UserAttachmentPart, UserContentPart } from "../types/index.js";

/** Accepted image media types. */
export const ACCEPTED_IMAGE_MEDIA_TYPES = [
  "image/png",
  "image/jpeg",
  "image/webp",
  "image/gif",
] as const;

/** Accepted document media type (PDF is the only document type allowed). */
export const ACCEPTED_PDF_MEDIA_TYPE = "application/pdf";

/** Every media type we accept as an attachment. */
export const ACCEPTED_ATTACHMENT_MEDIA_TYPES = [
  ...ACCEPTED_IMAGE_MEDIA_TYPES,
  ACCEPTED_PDF_MEDIA_TYPE,
] as const;

/** Per-image byte ceiling (Anthropic: 5 MB/image). */
export const MAX_IMAGE_BYTES = 5 * 1024 * 1024;

/** Per-PDF byte ceiling (Anthropic: 32 MB/PDF). */
export const MAX_PDF_BYTES = 32 * 1024 * 1024;

/** Max attachments per message (Anthropic: 20 images/request). */
export const MAX_ATTACHMENTS = 20;

/**
 * Total attachment payload ceiling for a single request (decoded bytes). Bounds
 * the overall request size even when each individual file is within its limit.
 */
export const MAX_TOTAL_ATTACHMENT_BYTES = 32 * 1024 * 1024;

/** Whether a media type is an accepted image type. */
export function isImageMediaType(mediaType: string): boolean {
  return (ACCEPTED_IMAGE_MEDIA_TYPES as readonly string[]).includes(mediaType);
}

/** Whether a media type is the accepted PDF type. */
export function isPdfMediaType(mediaType: string): boolean {
  return mediaType === ACCEPTED_PDF_MEDIA_TYPE;
}

/** Whether a media type is an accepted attachment type at all. */
export function isAcceptedAttachmentMediaType(mediaType: string): boolean {
  return (ACCEPTED_ATTACHMENT_MEDIA_TYPES as readonly string[]).includes(mediaType);
}

/**
 * Decoded byte length of a base64 string, computed WITHOUT allocating the
 * decoded buffer. Tolerates an optional `data:<mime>;base64,` prefix and any
 * embedded whitespace/newlines.
 *
 * Only the TRAILING run of `=` is treated as padding; every other
 * non-whitespace character counts as a data character, and the result is
 * `floor(dataChars * 3 / 4)`. For well-formed base64 this equals the usual
 * "3 bytes per 4 chars, minus padding" figure, and it can never go negative —
 * so a payload stuffed with `=` cannot report a negative size and offset the
 * request-wide total.
 *
 * @param b64 Base64 text, optionally wrapped in a `data:` URI.
 * @returns The decoded size in bytes; 0 for an empty, whitespace-only, or
 *   padding-only string.
 */
export function base64ByteLength(b64: string): number {
  // Strip a data-URI prefix if present.
  const comma = b64.indexOf(",");
  const body = b64.startsWith("data:") && comma !== -1 ? b64.slice(comma + 1) : b64;

  let significant = 0; // non-whitespace characters seen so far
  let trailingPad = 0; // length of the run of '=' ending at the last significant char
  for (let i = 0; i < body.length; i++) {
    const ch = body.charCodeAt(i);
    // Skip whitespace (space, \t, \n, \r).
    if (ch === 32 || ch === 9 || ch === 10 || ch === 13) continue;
    significant++;
    // 61 is '='. Any other character ends the current padding run.
    trailingPad = ch === 61 ? trailingPad + 1 : 0;
  }

  // 4 base64 chars → 3 bytes; a partial final group contributes floor(chars * 3 / 4).
  const dataChars = significant - trailingPad;
  return Math.floor((dataChars * 3) / 4);
}

/** One violation found by {@link validateUserContent}, tagged by `code`. */
export type AttachmentValidationError =
  | { code: "unsupported-type"; mediaType: string }
  | { code: "image-too-large"; mediaType: string; bytes: number; limit: number }
  | { code: "pdf-too-large"; bytes: number; limit: number }
  | { code: "too-many"; count: number; limit: number }
  | { code: "total-too-large"; bytes: number; limit: number }
  | { code: "empty"; mediaType: string };

/** Outcome of validation: `ok` is true exactly when `errors` is empty. */
export interface AttachmentValidationResult {
  ok: boolean;
  errors: AttachmentValidationError[];
}

/** Extract just the attachment parts from a mixed content list. */
function attachmentsOf(content: UserContentPart[]): UserAttachmentPart[] {
  return content.filter((p): p is UserAttachmentPart => p.type === "attachment");
}

/**
 * Validate the attachments in a multimodal user content list against the
 * media-type allowlist and the size/count ceilings. Pure: never throws,
 * collects every violation so the caller can report them all at once.
 *
 * Text parts are ignored (always valid). An empty content list is valid (it's
 * just a text-only message expressed as parts).
 *
 * @param content Ordered parts of the user message.
 * @returns `{ ok, errors }` where `errors` lists every violation found.
 */
export function validateUserContent(content: UserContentPart[]): AttachmentValidationResult {
  const errors: AttachmentValidationError[] = [];
  const attachments = attachmentsOf(content);

  if (attachments.length > MAX_ATTACHMENTS) {
    errors.push({ code: "too-many", count: attachments.length, limit: MAX_ATTACHMENTS });
  }

  let total = 0;
  for (const att of attachments) {
    // Unsupported types are reported once and excluded from the size accounting.
    if (!isAcceptedAttachmentMediaType(att.mediaType)) {
      errors.push({ code: "unsupported-type", mediaType: att.mediaType });
      continue;
    }
    const bytes = base64ByteLength(att.data);
    total += bytes;
    if (bytes === 0) {
      errors.push({ code: "empty", mediaType: att.mediaType });
      continue;
    }
    if (isPdfMediaType(att.mediaType)) {
      if (bytes > MAX_PDF_BYTES) {
        errors.push({ code: "pdf-too-large", bytes, limit: MAX_PDF_BYTES });
      }
    } else if (bytes > MAX_IMAGE_BYTES) {
      errors.push({
        code: "image-too-large",
        mediaType: att.mediaType,
        bytes,
        limit: MAX_IMAGE_BYTES,
      });
    }
  }

  if (total > MAX_TOTAL_ATTACHMENT_BYTES) {
    errors.push({ code: "total-too-large", bytes: total, limit: MAX_TOTAL_ATTACHMENT_BYTES });
  }

  return { ok: errors.length === 0, errors };
}

/** Convenience: does the content list contain at least one attachment? */
export function hasAttachments(content: UserContentPart[] | undefined | null): boolean {
  return !!content && content.some((p) => p.type === "attachment");
}
/**
 * Image / PDF input capabilities for a model, resolved from the models.dev
 * catalog's `modalities.input` list.
 */
export interface ModelInputCapabilities {
  /** Model accepts image input (vision). */
  image: boolean;
  /** Model accepts PDF/document input. */
  pdf: boolean;
}

/**
 * Resolve whether a model accepts image / pdf input for the given Dispatch
 * provider + model id. Returns `null` when the capability is UNKNOWN — i.e. the
 * provider is unsupported/unmapped, the model is absent from the catalog, the
 * entry predates the `modalities` field, or the catalog is unavailable. Callers
 * should treat `null` as "can't verify" (optimistic allow) rather than a
 * definitive "no", so a temporary catalog outage never disables a known-good
 * vision model.
 *
 * A non-null result means the catalog DID describe the model's input modalities
 * — `{ image, pdf }` then reflects exactly what it advertises (a definitive
 * yes/no for each).
 *
 * @param provider Dispatch provider id, looked up in `PROVIDER_MAP`.
 * @param modelId Model id as keyed in the catalog; an empty id resolves to `null`.
 * @returns The advertised capabilities, or `null` when they cannot be determined.
 */
export async function resolveModelCapabilities(
  provider: string,
  modelId: string,
): Promise<ModelInputCapabilities | null> {
  // Bail out before touching the catalog when there is nothing to look up.
  const candidates = PROVIDER_MAP[provider];
  if (!candidates || !modelId) return null;

  const catalog = await getModelsCatalog();
  // The first candidate provider that lists input modalities for this model wins.
  for (const providerId of candidates) {
    const input = catalog[providerId]?.models?.[modelId]?.modalities?.input;
    if (Array.isArray(input)) {
      return { image: input.includes("image"), pdf: input.includes("pdf") };
    }
  }
  return null;
}
*/ export function __resetCatalogCacheForTests(): void { cached = null; diff --git a/packages/core/src/models/index.ts b/packages/core/src/models/index.ts index 2fcd657..15d1ee2 100644 --- a/packages/core/src/models/index.ts +++ b/packages/core/src/models/index.ts @@ -1,5 +1,24 @@ +export { + ACCEPTED_ATTACHMENT_MEDIA_TYPES, + ACCEPTED_IMAGE_MEDIA_TYPES, + ACCEPTED_PDF_MEDIA_TYPE, + type AttachmentValidationError, + type AttachmentValidationResult, + base64ByteLength, + hasAttachments, + isAcceptedAttachmentMediaType, + isImageMediaType, + isPdfMediaType, + MAX_ATTACHMENTS, + MAX_IMAGE_BYTES, + MAX_PDF_BYTES, + MAX_TOTAL_ATTACHMENT_BYTES, + validateUserContent, +} from "./attachments.js"; export { getModelsCatalog, + type ModelInputCapabilities, resolveContextLimit, + resolveModelCapabilities, } from "./catalog.js"; export { ModelRegistry } from "./registry.js"; diff --git a/packages/core/src/types/index.ts b/packages/core/src/types/index.ts index 607b27d..273e074 100644 --- a/packages/core/src/types/index.ts +++ b/packages/core/src/types/index.ts @@ -76,8 +76,57 @@ export interface SystemChunk { export interface ChatMessage { role: MessageRole; chunks: Chunk[]; + /** + * Ephemeral ORDERED multimodal content for a user turn (interleaved text + + * image/pdf attachments). Set ONLY transiently on the in-flight user message + * so `toModelMessages` can emit multimodal `ImagePart`/`FilePart` content to + * the provider. Never persisted (the chunk log stores only the text, with + * `[image]`/`[pdf]` markers), so it's absent on history-rebuilt messages. + * When absent, the message is plain text built from its `chunks`. + */ + content?: UserContentPart[]; } +// ─── Multimodal user content (image / PDF attachments) ─────────── +// +// When a user pastes one or more images/PDFs into the chat input, the turn's +// user message carries an ORDERED list of content parts instead of a plain +// string. 
// The ordering is meaningful — the user can interleave text and attachments
// ("here is image A: <image>, here is image B: <image>") and the model sees
// them in exactly that sequence.
//
// These parts are EPHEMERAL: they are forwarded to the model for the turn that
// produced them but are NOT persisted as raw bytes in the chunk log. History
// stores only the user's text (with `[image]` / `[pdf]` markers in place of
// each attachment), so a later reload re-renders the text but never re-sends
// the binary payload. This keeps the persisted log small and avoids re-billing
// image tokens on every subsequent turn.

/** A plain-text segment of a multimodal user message. */
export interface UserTextPart {
  type: "text";
  text: string;
}

/**
 * A binary attachment (image or PDF) in a multimodal user message. `data` is a
 * base64-encoded payload (no `data:` URI prefix); `mediaType` is the IANA media
 * type (e.g. `image/png`, `application/pdf`). `name` is an optional original
 * filename, used only for PDF `filename` passthrough and diagnostics.
 */
export interface UserAttachmentPart {
  type: "attachment";
  /** IANA media type, e.g. `image/png`, `image/jpeg`, `application/pdf`. */
  mediaType: string;
  /** Base64-encoded bytes WITHOUT a `data:` URI prefix. */
  data: string;
  /** Optional original filename (mainly for PDFs). */
  name?: string;
}

/** One ordered part of a multimodal user message, discriminated by `type`. */
export type UserContentPart = UserTextPart | UserAttachmentPart;
/**
 * Agent-level coverage for multimodal user turns: ordered text/image/PDF parts
 * reach `streamText`, the in-memory user message keeps only its text chunk, and
 * a content list with no attachment degrades to a plain string.
 */
describe("multimodal user content", () => {
  it("emits ordered text + image parts to the model when content is provided", async () => {
    vi.mocked(streamText).mockReturnValue(
      makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
    );

    const agent = new Agent(makeConfig());
    for await (const _ of agent.run("here is image A: [image]", {
      content: [
        { type: "text", text: "here is image A: " },
        { type: "attachment", mediaType: "image/png", data: "QQ==" },
      ],
    })) {
      // consume
    }

    const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
    const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
    const userMsg = messages.find((m) => m.role === "user");
    expect(userMsg).toBeDefined();
    // Multimodal turn → content is an ordered parts array, not a string.
    expect(Array.isArray(userMsg?.content)).toBe(true);
    const parts = userMsg?.content as Array<Record<string, unknown>>;
    expect(parts[0]).toMatchObject({ type: "text", text: "here is image A: " });
    expect(parts[1]).toMatchObject({ type: "image", mediaType: "image/png" });
    // The image payload is forwarded as a data: URI built from mediaType + base64.
    expect(String(parts[1]?.image)).toBe("data:image/png;base64,QQ==");
  });

  it("emits a FilePart for a PDF attachment with its filename", async () => {
    vi.mocked(streamText).mockReturnValue(
      makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
    );

    const agent = new Agent(makeConfig());
    for await (const _ of agent.run("see [pdf]", {
      content: [
        { type: "text", text: "see " },
        { type: "attachment", mediaType: "application/pdf", data: "QQ==", name: "doc.pdf" },
      ],
    })) {
      // consume
    }

    const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
    const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
    const userMsg = messages.find((m) => m.role === "user");
    const parts = userMsg?.content as Array<Record<string, unknown>>;
    const filePart = parts.find((p) => p.type === "file");
    expect(filePart).toMatchObject({
      type: "file",
      mediaType: "application/pdf",
      filename: "doc.pdf",
    });
    expect(String(filePart?.data)).toBe("data:application/pdf;base64,QQ==");
  });

  it("persists the user turn as text only (no content) for history", async () => {
    vi.mocked(streamText).mockReturnValue(
      makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
    );

    const agent = new Agent(makeConfig());
    for await (const _ of agent.run("look: [image]", {
      content: [
        { type: "text", text: "look: " },
        { type: "attachment", mediaType: "image/png", data: "QQ==" },
      ],
    })) {
      // consume
    }

    // The in-memory user message keeps the text chunk for the render/persist
    // path; the ephemeral `content` rides alongside it but isn't a chunk.
    const userMsg = agent.messages.find((m) => m.role === "user");
    expect(userMsg?.chunks).toEqual([{ type: "text", text: "look: [image]" }]);
  });

  it("falls back to a plain string when content has no attachment", async () => {
    vi.mocked(streamText).mockReturnValue(
      makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]),
    );

    const agent = new Agent(makeConfig());
    for await (const _ of agent.run("plain text", {
      content: [{ type: "text", text: "plain text" }],
    })) {
      // consume
    }

    const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
    const messages = callArgs?.messages as Array<{ role: string; content: unknown }>;
    const userMsg = messages.find((m) => m.role === "user");
    // No attachment → plain string content (byte-identical to text-only path).
    expect(typeof userMsg?.content).toBe("string");
    expect(userMsg?.content).toBe("plain text");
  });
});
import { describe, expect, it } from "vitest";
import {
  base64ByteLength,
  isAcceptedAttachmentMediaType,
  isImageMediaType,
  isPdfMediaType,
  MAX_ATTACHMENTS,
  MAX_IMAGE_BYTES,
  MAX_PDF_BYTES,
  MAX_TOTAL_ATTACHMENT_BYTES,
  validateUserContent,
} from "../../src/models/attachments.js";
import type { UserContentPart } from "../../src/types/index.js";

/**
 * Unpadded base64 text built from whole 4-character groups. Each group decodes
 * to 3 bytes, so the decoded size is `bytes` rounded up to a multiple of 3.
 */
function base64OfBytes(bytes: number): string {
  const quads = Math.ceil(bytes / 3);
  return "A".repeat(4 * quads);
}

/** Attachment part carrying `data`; defaults to PNG but accepts any media type. */
function imagePart(data: string, mediaType = "image/png"): UserContentPart {
  return { type: "attachment", mediaType, data };
}

/** Error codes reported for `content`, in the order the validator emitted them. */
function codesFor(content: UserContentPart[]): string[] {
  return validateUserContent(content).errors.map((err) => err.code);
}

describe("media-type predicates", () => {
  it("classifies image types", () => {
    for (const accepted of ["image/png", "image/jpeg", "image/webp", "image/gif"]) {
      expect(isImageMediaType(accepted)).toBe(true);
    }
    for (const rejected of ["application/pdf", "image/svg+xml"]) {
      expect(isImageMediaType(rejected)).toBe(false);
    }
  });

  it("classifies pdf + accepted types", () => {
    expect(isPdfMediaType("application/pdf")).toBe(true);
    expect(isPdfMediaType("image/png")).toBe(false);

    for (const accepted of ["image/gif", "application/pdf"]) {
      expect(isAcceptedAttachmentMediaType(accepted)).toBe(true);
    }
    expect(isAcceptedAttachmentMediaType("text/plain")).toBe(false);
  });
});

describe("base64ByteLength", () => {
  it("computes decoded length without padding", () => {
    // One full group: "AAAA" → 3 bytes.
    expect(base64ByteLength("AAAA")).toBe(3);
  });

  it("accounts for padding", () => {
    // "QQ==" is "A" (1 byte); "QUI=" is "AB" (2 bytes).
    const onePadded = base64ByteLength("QQ==");
    const twoPadded = base64ByteLength("QUI=");
    expect(onePadded).toBe(1);
    expect(twoPadded).toBe(2);
  });

  it("tolerates a data: URI prefix and whitespace", () => {
    expect(base64ByteLength("data:image/png;base64,AAAA")).toBe(3);
    expect(base64ByteLength("AA\nAA")).toBe(3);
  });

  it("returns 0 for empty input", () => {
    for (const blank of ["", " "]) {
      expect(base64ByteLength(blank)).toBe(0);
    }
  });
});

describe("validateUserContent", () => {
  it("accepts a small image and ignores text parts", () => {
    const result = validateUserContent([
      { type: "text", text: "hi" },
      imagePart(base64OfBytes(1024)),
    ]);
    expect(result).toEqual({ ok: true, errors: [] });
  });

  it("accepts an empty / text-only content list", () => {
    const emptyList = validateUserContent([]);
    const textOnly = validateUserContent([{ type: "text", text: "no files" }]);
    expect(emptyList.ok).toBe(true);
    expect(textOnly.ok).toBe(true);
  });

  it("rejects an unsupported media type", () => {
    const { ok, errors } = validateUserContent([imagePart(base64OfBytes(10), "image/svg+xml")]);
    expect(ok).toBe(false);
    expect(errors[0]).toMatchObject({ code: "unsupported-type", mediaType: "image/svg+xml" });
  });

  it("rejects an oversized image but allows a PDF of the same size", () => {
    const payload = base64OfBytes(MAX_IMAGE_BYTES + 3);

    const asImage = [imagePart(payload, "image/png")];
    expect(validateUserContent(asImage).ok).toBe(false);
    expect(codesFor(asImage)).toContain("image-too-large");

    // The identical payload labelled as a PDF sits well under the PDF ceiling.
    const asPdf = [imagePart(payload, "application/pdf")];
    expect(validateUserContent(asPdf).ok).toBe(true);
  });

  it("rejects an oversized PDF", () => {
    const content = [imagePart(base64OfBytes(MAX_PDF_BYTES + 3), "application/pdf")];
    expect(validateUserContent(content).ok).toBe(false);
    expect(codesFor(content)).toContain("pdf-too-large");
  });

  it("rejects an empty attachment payload", () => {
    const content = [imagePart("", "image/png")];
    expect(validateUserContent(content).ok).toBe(false);
    expect(codesFor(content)).toContain("empty");
  });

  it("rejects too many attachments", () => {
    const content: UserContentPart[] = [];
    for (let i = 0; i <= MAX_ATTACHMENTS; i++) {
      content.push(imagePart(base64OfBytes(8)));
    }
    expect(validateUserContent(content).ok).toBe(false);
    expect(codesFor(content)).toContain("too-many");
  });

  it("rejects when the total payload exceeds the request ceiling", () => {
    // Four PDFs, each a third of the cap: individually legal, jointly over it.
    const third = Math.floor(MAX_TOTAL_ATTACHMENT_BYTES / 3);
    const content: UserContentPart[] = Array.from({ length: 4 }, () =>
      imagePart(base64OfBytes(third), "application/pdf"),
    );
    expect(validateUserContent(content).ok).toBe(false);
    expect(codesFor(content)).toContain("total-too-large");
  });
});
/**
 * Capability lookups against the shared `CATALOG` fixture: a definitive
 * `{ image, pdf }` when the entry lists input modalities, and `null` when the
 * answer is unknown.
 */
describe("resolveModelCapabilities", () => {
  it("reports image + pdf for a vision model", async () => {
    mockFetchOnce(CATALOG);
    const caps = await resolveModelCapabilities("anthropic", "claude-sonnet-4-5");
    expect(caps).toEqual({ image: true, pdf: true });
  });

  it("reports image-only for a model whose modalities omit pdf", async () => {
    mockFetchOnce(CATALOG);
    // glm-4-6 lists image but not pdf (resolved via the opencode fallback).
    const caps = await resolveModelCapabilities("opencode-anthropic", "glm-4-6");
    expect(caps).toEqual({ image: true, pdf: false });
  });

  it("reports a definitive no for a text-only model", async () => {
    mockFetchOnce(CATALOG);
    const caps = await resolveModelCapabilities("anthropic", "text-only-model");
    expect(caps).toEqual({ image: false, pdf: false });
  });

  it("returns null (unknown) for an entry without modalities", async () => {
    mockFetchOnce(CATALOG);
    const caps = await resolveModelCapabilities("anthropic", "legacy-model");
    expect(caps).toBeNull();
  });

  it("returns null (unknown) for an unknown model id", async () => {
    mockFetchOnce(CATALOG);
    const caps = await resolveModelCapabilities("anthropic", "no-such-model");
    expect(caps).toBeNull();
  });

  it("returns null for an unsupported provider without hitting the network", async () => {
    const fetchFn = mockFetchOnce(CATALOG);
    const unmappedProvider = await resolveModelCapabilities("google", "gemini-2.5-pro");
    expect(unmappedProvider).toBeNull();
    const blankModelId = await resolveModelCapabilities("anthropic", "");
    expect(blankModelId).toBeNull();
    expect(fetchFn).not.toHaveBeenCalled();
  });

  it("returns null (unknown) when the catalog is offline with no cache", async () => {
    // Every fetch rejects; the warning spy keeps the expected log line quiet.
    const fetchFn = vi.fn(() => Promise.reject(new Error("offline")));
    vi.stubGlobal("fetch", fetchFn);
    const warn = vi.spyOn(console, "warn").mockImplementation(() => {});
    const caps = await resolveModelCapabilities("anthropic", "claude-sonnet-4-5");
    expect(caps).toBeNull();
    warn.mockRestore();
  });
});