diff options
| author | Adam Malczewski <[email protected]> | 2026-06-02 22:50:11 +0900 |
|---|---|---|
| committer | Adam Malczewski <[email protected]> | 2026-06-02 22:50:11 +0900 |
| commit | 66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae (patch) | |
| tree | c3e039e09c89231f84dfd16f7bbbf8aedcc2dc7d /packages/core/src | |
| parent | 4b45d33c256cf580a53054078be6fd7148fa6302 (diff) | |
| download | dispatch-66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae.tar.gz dispatch-66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae.zip | |
feat(chat): paste-to-attach images/PDFs with model capability check
Add multimodal image/PDF input to the chat box via clipboard paste, gated by a
graceful per-model capability check.
UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the
draft, so attachments have ORDER relative to typed text and can be referenced
positionally. The token is the only handle — deleting it (atomic Backspace/
Delete, or selection overlap) detaches the file; an input-reconciliation safety
net detaches any attachment whose token is no longer intact. No preview strip.
Capability check: resolveModelCapabilities reads models.dev modalities.input
(new GET /models/capabilities, mirrors /context-limit). The input blocks Send
(no tokens spent) only on a definitive 'no'; unknown capability (catalog offline
/ unmapped provider) stays permissive. Attachments require a fresh turn — Send is
blocked while generating and /chat rejects content mid-turn (409).
Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK
ImagePart/FilePart content, but never persisted (history keeps the text with
[image]/[pdf] markers). Text-only turns serialize byte-identically to before.
Limits (Anthropic-aligned, enforced at paste + re-validated server-side):
PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total.
core: UserContentPart types, models/attachments validator, capability resolver;
agent.run and toModelMessages thread the ordered content through to the model.
api: /chat content validation + passthrough. frontend: attachment-tokens helper,
ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability
fetch. +44 tests.
Diffstat (limited to 'packages/core/src')
| -rw-r--r-- | packages/core/src/agent/agent.ts | bin | 57822 -> 60515 bytes | |||
| -rw-r--r-- | packages/core/src/index.ts | 17 | ||||
| -rw-r--r-- | packages/core/src/models/attachments.ts | 151 | ||||
| -rw-r--r-- | packages/core/src/models/catalog.ts | 50 | ||||
| -rw-r--r-- | packages/core/src/models/index.ts | 19 | ||||
| -rw-r--r-- | packages/core/src/types/index.ts | 49 |
6 files changed, 286 insertions, 0 deletions
diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts Binary files differindex 4bfa7eb..08b317a 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 08b426f..50012f1 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -82,9 +82,26 @@ export { } from "./lsp/index.js"; // Models export { + ACCEPTED_ATTACHMENT_MEDIA_TYPES, + ACCEPTED_IMAGE_MEDIA_TYPES, + ACCEPTED_PDF_MEDIA_TYPE, + type AttachmentValidationError, + type AttachmentValidationResult, + base64ByteLength, getModelsCatalog, + hasAttachments, + isAcceptedAttachmentMediaType, + isImageMediaType, + isPdfMediaType, + MAX_ATTACHMENTS, + MAX_IMAGE_BYTES, + MAX_PDF_BYTES, + MAX_TOTAL_ATTACHMENT_BYTES, + type ModelInputCapabilities, ModelRegistry, resolveContextLimit, + resolveModelCapabilities, + validateUserContent, } from "./models/index.js"; // Notifications (ntfy.sh) export * from "./notifications/index.js"; diff --git a/packages/core/src/models/attachments.ts b/packages/core/src/models/attachments.ts new file mode 100644 index 0000000..5c98db4 --- /dev/null +++ b/packages/core/src/models/attachments.ts @@ -0,0 +1,151 @@ +// Validation + limits for multimodal user attachments (images / PDFs). +// +// Kept dependency-free (no DB / `bun:sqlite` import) so both the API layer +// (`/chat` request validation) and any future caller can share the exact same +// allowlist and size/count ceilings. The limits mirror Anthropic's documented +// vision/PDF API constraints (the only image-capable providers Dispatch maps), +// so a request that passes here won't be rejected by the provider for size. + +import type { UserAttachmentPart, UserContentPart } from "../types/index.js"; + +/** Accepted image media types. */ +export const ACCEPTED_IMAGE_MEDIA_TYPES = [ + "image/png", + "image/jpeg", + "image/webp", + "image/gif", +] as const; + +/** Accepted document media types. 
*/ +export const ACCEPTED_PDF_MEDIA_TYPE = "application/pdf"; + +/** Every media type we accept as an attachment. */ +export const ACCEPTED_ATTACHMENT_MEDIA_TYPES = [ + ...ACCEPTED_IMAGE_MEDIA_TYPES, + ACCEPTED_PDF_MEDIA_TYPE, +] as const; + +/** Per-image byte ceiling (Anthropic: 5 MB/image). */ +export const MAX_IMAGE_BYTES = 5 * 1024 * 1024; + +/** Per-PDF byte ceiling (Anthropic: 32 MB/PDF). */ +export const MAX_PDF_BYTES = 32 * 1024 * 1024; + +/** Max attachments per message (Anthropic: 20 images/request). */ +export const MAX_ATTACHMENTS = 20; + +/** + * Total attachment payload ceiling for a single request (decoded bytes). Bounds + * the overall request size even when each individual file is within its limit. + */ +export const MAX_TOTAL_ATTACHMENT_BYTES = 32 * 1024 * 1024; + +/** Whether a media type is an accepted image type. */ +export function isImageMediaType(mediaType: string): boolean { + return (ACCEPTED_IMAGE_MEDIA_TYPES as readonly string[]).includes(mediaType); +} + +/** Whether a media type is the accepted PDF type. */ +export function isPdfMediaType(mediaType: string): boolean { + return mediaType === ACCEPTED_PDF_MEDIA_TYPE; +} + +/** Whether a media type is an accepted attachment type at all. */ +export function isAcceptedAttachmentMediaType(mediaType: string): boolean { + return (ACCEPTED_ATTACHMENT_MEDIA_TYPES as readonly string[]).includes(mediaType); +} + +/** + * Decoded byte length of a base64 string, computed WITHOUT allocating the + * decoded buffer. Tolerates an optional `data:<mediaType>;base64,` prefix and + * any embedded whitespace/newlines. Returns 0 for an empty/whitespace string. + */ +export function base64ByteLength(b64: string): number { + // Strip a data-URI prefix if present. + const comma = b64.indexOf(","); + const body = b64.startsWith("data:") && comma !== -1 ? 
b64.slice(comma + 1) : b64; + let len = 0; + let pad = 0; + for (let i = 0; i < body.length; i++) { + const ch = body.charCodeAt(i); + // Skip whitespace (space, \t, \n, \r). + if (ch === 32 || ch === 9 || ch === 10 || ch === 13) continue; + len++; + if (body[i] === "=") pad++; + } + if (len === 0) return 0; + // 4 base64 chars → 3 bytes, minus padding. + return Math.floor((len * 3) / 4) - pad; +} + +export type AttachmentValidationError = + | { code: "unsupported-type"; mediaType: string } + | { code: "image-too-large"; mediaType: string; bytes: number; limit: number } + | { code: "pdf-too-large"; bytes: number; limit: number } + | { code: "too-many"; count: number; limit: number } + | { code: "total-too-large"; bytes: number; limit: number } + | { code: "empty"; mediaType: string }; + +export interface AttachmentValidationResult { + ok: boolean; + errors: AttachmentValidationError[]; +} + +/** Extract just the attachment parts from a mixed content list. */ +function attachmentsOf(content: UserContentPart[]): UserAttachmentPart[] { + return content.filter((p): p is UserAttachmentPart => p.type === "attachment"); +} + +/** + * Validate the attachments in a multimodal user content list against the + * media-type allowlist and the size/count ceilings. Pure: never throws, + * collects every violation so the caller can report them all at once. + * + * Text parts are ignored (always valid). An empty content list is valid (it's + * just a text-only message expressed as parts). 
+ */ +export function validateUserContent(content: UserContentPart[]): AttachmentValidationResult { + const errors: AttachmentValidationError[] = []; + const attachments = attachmentsOf(content); + + if (attachments.length > MAX_ATTACHMENTS) { + errors.push({ code: "too-many", count: attachments.length, limit: MAX_ATTACHMENTS }); + } + + let total = 0; + for (const att of attachments) { + if (!isAcceptedAttachmentMediaType(att.mediaType)) { + errors.push({ code: "unsupported-type", mediaType: att.mediaType }); + continue; + } + const bytes = base64ByteLength(att.data); + total += bytes; + if (bytes === 0) { + errors.push({ code: "empty", mediaType: att.mediaType }); + continue; + } + if (isPdfMediaType(att.mediaType)) { + if (bytes > MAX_PDF_BYTES) { + errors.push({ code: "pdf-too-large", bytes, limit: MAX_PDF_BYTES }); + } + } else if (bytes > MAX_IMAGE_BYTES) { + errors.push({ + code: "image-too-large", + mediaType: att.mediaType, + bytes, + limit: MAX_IMAGE_BYTES, + }); + } + } + + if (total > MAX_TOTAL_ATTACHMENT_BYTES) { + errors.push({ code: "total-too-large", bytes: total, limit: MAX_TOTAL_ATTACHMENT_BYTES }); + } + + return { ok: errors.length === 0, errors }; +} + +/** Convenience: does the content list contain at least one attachment? */ +export function hasAttachments(content: UserContentPart[] | undefined | null): boolean { + return !!content && content.some((p) => p.type === "attachment"); +} diff --git a/packages/core/src/models/catalog.ts b/packages/core/src/models/catalog.ts index dea4647..ac310b1 100644 --- a/packages/core/src/models/catalog.ts +++ b/packages/core/src/models/catalog.ts @@ -18,6 +18,15 @@ interface ModelsDevModel { context?: number; output?: number; }; + /** + * Input/output modalities the model accepts. We read `input` to decide + * whether the model can take image / pdf attachments. Absent on older + * catalog entries — treated as "unknown" (capability resolves to `null`). 
+ */ + modalities?: { + input?: string[]; + output?: string[]; + }; } interface ModelsDevProvider { @@ -172,6 +181,47 @@ export async function resolveContextLimit( return null; } +/** + * Image / PDF input capabilities for a model, resolved from the models.dev + * catalog's `modalities.input` list. + */ +export interface ModelInputCapabilities { + /** Model accepts image input (vision). */ + image: boolean; + /** Model accepts PDF/document input. */ + pdf: boolean; +} + +/** + * Resolve whether a model accepts image / pdf input for the given Dispatch + * provider + model id. Returns `null` when the capability is UNKNOWN — i.e. the + * provider is unsupported/unmapped, the model is absent from the catalog, the + * entry predates the `modalities` field, or the catalog is unavailable. Callers + * should treat `null` as "can't verify" (optimistic allow) rather than a + * definitive "no", so a temporary catalog outage never disables a known-good + * vision model. + * + * A non-null result means the catalog DID describe the model's input modalities + * — `{ image, pdf }` then reflects exactly what it advertises (a definitive + * yes/no for each). + */ +export async function resolveModelCapabilities( + provider: string, + modelId: string, +): Promise<ModelInputCapabilities | null> { + const candidates = PROVIDER_MAP[provider]; + if (!candidates || !modelId) return null; + + const catalog = await getModelsCatalog(); + for (const providerId of candidates) { + const input = catalog[providerId]?.models?.[modelId]?.modalities?.input; + if (Array.isArray(input)) { + return { image: input.includes("image"), pdf: input.includes("pdf") }; + } + } + return null; +} + /** Test-only: reset the in-process memo so a test can re-exercise loading. 
*/ export function __resetCatalogCacheForTests(): void { cached = null; diff --git a/packages/core/src/models/index.ts b/packages/core/src/models/index.ts index 2fcd657..15d1ee2 100644 --- a/packages/core/src/models/index.ts +++ b/packages/core/src/models/index.ts @@ -1,5 +1,24 @@ export { + ACCEPTED_ATTACHMENT_MEDIA_TYPES, + ACCEPTED_IMAGE_MEDIA_TYPES, + ACCEPTED_PDF_MEDIA_TYPE, + type AttachmentValidationError, + type AttachmentValidationResult, + base64ByteLength, + hasAttachments, + isAcceptedAttachmentMediaType, + isImageMediaType, + isPdfMediaType, + MAX_ATTACHMENTS, + MAX_IMAGE_BYTES, + MAX_PDF_BYTES, + MAX_TOTAL_ATTACHMENT_BYTES, + validateUserContent, +} from "./attachments.js"; +export { getModelsCatalog, + type ModelInputCapabilities, resolveContextLimit, + resolveModelCapabilities, } from "./catalog.js"; export { ModelRegistry } from "./registry.js"; diff --git a/packages/core/src/types/index.ts b/packages/core/src/types/index.ts index 607b27d..273e074 100644 --- a/packages/core/src/types/index.ts +++ b/packages/core/src/types/index.ts @@ -76,8 +76,57 @@ export interface SystemChunk { export interface ChatMessage { role: MessageRole; chunks: Chunk[]; + /** + * Ephemeral ORDERED multimodal content for a user turn (interleaved text + + * image/pdf attachments). Set ONLY transiently on the in-flight user message + * so `toModelMessages` can emit multimodal `ImagePart`/`FilePart` content to + * the provider. Never persisted (the chunk log stores only the text, with + * `[image]`/`[pdf]` markers), so it's absent on history-rebuilt messages. + * When absent, the message is plain text built from its `chunks`. + */ + content?: UserContentPart[]; } +// ─── Multimodal user content (image / PDF attachments) ─────────── +// +// When a user pastes one or more images/PDFs into the chat input, the turn's +// user message carries an ORDERED list of content parts instead of a plain +// string. 
The ordering is meaningful — the user can interleave text and +// attachments ("here is image A: <A>, here is image B: <B>") and the model +// sees them in exactly that sequence. +// +// These parts are EPHEMERAL: they are forwarded to the model for the turn that +// produced them but are NOT persisted as raw bytes in the chunk log. History +// stores only the user's text (with `[image]` / `[pdf]` markers in place of +// each attachment), so a later reload re-renders the text but never re-sends +// the binary payload. This keeps the persisted log small and avoids re-billing +// image tokens on every subsequent turn. + +/** A plain-text segment of a multimodal user message. */ +export interface UserTextPart { + type: "text"; + text: string; +} + +/** + * A binary attachment (image or PDF) in a multimodal user message. `data` is a + * base64-encoded payload (no `data:` URI prefix); `mediaType` is the IANA media + * type (e.g. `image/png`, `application/pdf`). `name` is an optional original + * filename, used only for PDF `filename` passthrough and diagnostics. + */ +export interface UserAttachmentPart { + type: "attachment"; + /** IANA media type, e.g. `image/png`, `image/jpeg`, `application/pdf`. */ + mediaType: string; + /** Base64-encoded bytes WITHOUT a `data:` URI prefix. */ + data: string; + /** Optional original filename (mainly for PDFs). */ + name?: string; +} + +/** One ordered part of a multimodal user message. */ +export type UserContentPart = UserTextPart | UserAttachmentPart; + // ─── Append-only chunk log (persisted model) ───────────────────── // // The DB stores a conversation as a flat stream of `ChunkRow`s (see |
