summaryrefslogtreecommitdiffhomepage
path: root/packages/core/src
diff options
context:
space:
mode:
authorAdam Malczewski <[email protected]>2026-06-02 22:50:11 +0900
committerAdam Malczewski <[email protected]>2026-06-02 22:50:11 +0900
commit66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae (patch)
treec3e039e09c89231f84dfd16f7bbbf8aedcc2dc7d /packages/core/src
parent4b45d33c256cf580a53054078be6fd7148fa6302 (diff)
downloaddispatch-66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae.tar.gz
dispatch-66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae.zip
feat(chat): paste-to-attach images/PDFs with model capability check
Add multimodal image/PDF input to the chat box via clipboard paste, gated by a graceful per-model capability check. UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the draft, so attachments have ORDER relative to typed text and can be referenced positionally. The token is the only handle — deleting it (atomic Backspace/Delete, or selection overlap) detaches the file; an input-reconciliation safety net detaches any attachment whose token is no longer intact. No preview strip. Capability check: resolveModelCapabilities reads models.dev modalities.input (new GET /models/capabilities, mirrors /context-limit). The input blocks Send (no tokens spent) only on a definitive 'no'; unknown capability (catalog offline / unmapped provider) stays permissive. Attachments require a fresh turn — Send is blocked while generating and /chat rejects content mid-turn (409). Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK ImagePart/FilePart content, but never persisted (history keeps the text with [image]/[pdf] markers). Text-only turns serialize byte-identically to before. Limits (Anthropic-aligned, enforced at paste + re-validated server-side): PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total. core: UserContentPart types, models/attachments validator, capability resolver, agent.run+toModelMessages thread ordered content. api: /chat content validation + passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability fetch. +44 tests.
Diffstat (limited to 'packages/core/src')
-rw-r--r--packages/core/src/agent/agent.tsbin57822 -> 60515 bytes
-rw-r--r--packages/core/src/index.ts17
-rw-r--r--packages/core/src/models/attachments.ts151
-rw-r--r--packages/core/src/models/catalog.ts50
-rw-r--r--packages/core/src/models/index.ts19
-rw-r--r--packages/core/src/types/index.ts49
6 files changed, 286 insertions, 0 deletions
diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index 4bfa7eb..08b317a 100644
--- a/packages/core/src/agent/agent.ts
+++ b/packages/core/src/agent/agent.ts
Binary files differ
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 08b426f..50012f1 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -82,9 +82,26 @@ export {
} from "./lsp/index.js";
// Models
export {
+ ACCEPTED_ATTACHMENT_MEDIA_TYPES,
+ ACCEPTED_IMAGE_MEDIA_TYPES,
+ ACCEPTED_PDF_MEDIA_TYPE,
+ type AttachmentValidationError,
+ type AttachmentValidationResult,
+ base64ByteLength,
getModelsCatalog,
+ hasAttachments,
+ isAcceptedAttachmentMediaType,
+ isImageMediaType,
+ isPdfMediaType,
+ MAX_ATTACHMENTS,
+ MAX_IMAGE_BYTES,
+ MAX_PDF_BYTES,
+ MAX_TOTAL_ATTACHMENT_BYTES,
+ type ModelInputCapabilities,
ModelRegistry,
resolveContextLimit,
+ resolveModelCapabilities,
+ validateUserContent,
} from "./models/index.js";
// Notifications (ntfy.sh)
export * from "./notifications/index.js";
diff --git a/packages/core/src/models/attachments.ts b/packages/core/src/models/attachments.ts
new file mode 100644
index 0000000..5c98db4
--- /dev/null
+++ b/packages/core/src/models/attachments.ts
@@ -0,0 +1,151 @@
+// Validation + limits for multimodal user attachments (images / PDFs).
+//
+// Kept dependency-free (no DB / `bun:sqlite` import) so both the API layer
+// (`/chat` request validation) and any future caller can share the exact same
+// allowlist and size/count ceilings. The limits mirror Anthropic's documented
+// vision/PDF API constraints (the only image-capable providers Dispatch maps),
+// so a request that passes here won't be rejected by the provider for size.
+
+import type { UserAttachmentPart, UserContentPart } from "../types/index.js";
+
/** Image media types that may be attached to a user message. */
export const ACCEPTED_IMAGE_MEDIA_TYPES = [
  "image/png",
  "image/jpeg",
  "image/webp",
  "image/gif",
] as const;

/** The single document media type that may be attached (PDF). */
export const ACCEPTED_PDF_MEDIA_TYPE = "application/pdf";

/** Full attachment allowlist: every accepted image type followed by PDF. */
export const ACCEPTED_ATTACHMENT_MEDIA_TYPES = [
  ...ACCEPTED_IMAGE_MEDIA_TYPES,
  ACCEPTED_PDF_MEDIA_TYPE,
] as const;

/** Largest decoded size allowed for one image: 5 MiB (Anthropic: 5 MB/image). */
export const MAX_IMAGE_BYTES = 5 * 2 ** 20;

/** Largest decoded size allowed for one PDF: 32 MiB (Anthropic: 32 MB/PDF). */
export const MAX_PDF_BYTES = 32 * 2 ** 20;

/** Most attachments a single message may carry (Anthropic: 20 images/request). */
export const MAX_ATTACHMENTS = 20;

/**
 * Ceiling on the combined decoded size of all attachments in one request
 * (32 MiB). Caps the overall payload even when every individual file is
 * within its own limit.
 */
export const MAX_TOTAL_ATTACHMENT_BYTES = 32 * 2 ** 20;
+
/**
 * Reports whether `mediaType` is on the image allowlist.
 *
 * @param mediaType Media type string to test (compared exactly, case-sensitive).
 * @returns `true` when the string is one of `ACCEPTED_IMAGE_MEDIA_TYPES`.
 */
export function isImageMediaType(mediaType: string): boolean {
  // Widen the literal tuple to plain strings so `includes` accepts any input.
  const allowed: readonly string[] = ACCEPTED_IMAGE_MEDIA_TYPES;
  return allowed.includes(mediaType);
}
+
/**
 * Reports whether `mediaType` is exactly the accepted PDF media type.
 *
 * @param mediaType Media type string to test (compared exactly, case-sensitive).
 * @returns `true` only for `ACCEPTED_PDF_MEDIA_TYPE`.
 */
export function isPdfMediaType(mediaType: string): boolean {
  return ACCEPTED_PDF_MEDIA_TYPE === mediaType;
}
+
/**
 * Reports whether `mediaType` is accepted as an attachment of any kind
 * (image or PDF).
 *
 * @param mediaType Media type string to test (compared exactly, case-sensitive).
 * @returns `true` when the string is one of `ACCEPTED_ATTACHMENT_MEDIA_TYPES`.
 */
export function isAcceptedAttachmentMediaType(mediaType: string): boolean {
  // Widen the literal tuple to plain strings so `includes` accepts any input.
  const allowed: readonly string[] = ACCEPTED_ATTACHMENT_MEDIA_TYPES;
  return allowed.includes(mediaType);
}
+
/**
 * Decoded byte length of a base64 string, computed WITHOUT allocating the
 * decoded buffer (or any intermediate substring).
 *
 * Tolerates an optional `data:<mediaType>;base64,` prefix and any embedded
 * whitespace (space, tab, CR, LF). Padding (`=`) never contributes bytes, so
 * only data characters are counted: every 4 data characters decode to 3
 * bytes, and a trailing group of 2 or 3 characters decodes to 1 or 2 bytes.
 * For well-formed input (padded or unpadded) this is the exact decoded size.
 *
 * The result is never negative. Malformed input made mostly or entirely of
 * `=` yields 0 (or a small upper bound) rather than a negative number, so a
 * caller summing sizes cannot be tricked into under-counting.
 *
 * @param b64 Base64 text, optionally prefixed with a `data:` URI header.
 * @returns Decoded size in bytes; 0 for empty, whitespace-only or
 *   padding-only input.
 */
export function base64ByteLength(b64: string): number {
  // Skip a data-URI header if present; the payload starts after the first comma.
  const comma = b64.indexOf(",");
  const start = b64.startsWith("data:") && comma !== -1 ? comma + 1 : 0;

  let dataChars = 0;
  for (let i = start; i < b64.length; i++) {
    const ch = b64.charCodeAt(i);
    // Ignore whitespace (space, \t, \n, \r).
    if (ch === 32 || ch === 9 || ch === 10 || ch === 13) continue;
    // Ignore padding ('='): it carries no payload bits.
    if (ch === 61) continue;
    dataChars++;
  }

  // 4 data chars → 3 bytes; a partial final group rounds down (2 → 1, 3 → 2).
  return Math.floor((dataChars * 3) / 4);
}
+
/**
 * One attachment-validation failure, discriminated by `code`. Each variant
 * carries the offending value alongside the limit it was checked against,
 * where one applies.
 */
export type AttachmentValidationError =
  /** Media type is not on the attachment allowlist. */
  | { code: "unsupported-type"; mediaType: string }
  /** A single image decodes to more bytes than the per-image limit. */
  | { code: "image-too-large"; mediaType: string; bytes: number; limit: number }
  /** A single PDF decodes to more bytes than the per-PDF limit. */
  | { code: "pdf-too-large"; bytes: number; limit: number }
  /** The message carries more attachments than allowed. */
  | { code: "too-many"; count: number; limit: number }
  /** The combined decoded size of all attachments exceeds the request limit. */
  | { code: "total-too-large"; bytes: number; limit: number }
  /** An accepted-type attachment decodes to zero bytes. */
  | { code: "empty"; mediaType: string };
+
/** Outcome of validating the attachments of one user message. */
export interface AttachmentValidationResult {
  /** `true` exactly when `errors` is empty. */
  ok: boolean;
  /** Every violation found, in the order it was detected. */
  errors: AttachmentValidationError[];
}
+
/**
 * Collects the attachment parts of a mixed content list, preserving their
 * relative order. Text parts are dropped; the input array is not modified.
 */
function attachmentsOf(content: UserContentPart[]): UserAttachmentPart[] {
  const found: UserAttachmentPart[] = [];
  for (const part of content) {
    if (part.type === "attachment") found.push(part);
  }
  return found;
}
+
/**
 * Checks every attachment in a multimodal user content list against the
 * media-type allowlist and the per-file, per-message and aggregate limits.
 *
 * Violations are collected rather than thrown, so the caller can report all
 * of them at once. Text parts are not inspected, and a list without
 * attachments (including an empty list) is valid.
 *
 * Order of reported errors: the count violation first (if any), then
 * per-attachment violations in content order, then the aggregate-size
 * violation last (if any). Attachments with an unsupported media type are
 * not measured and do not count toward the aggregate size.
 *
 * @param content Ordered text/attachment parts of one user message.
 * @returns `ok: true` with no errors, or `ok: false` with every violation.
 */
export function validateUserContent(content: UserContentPart[]): AttachmentValidationResult {
  const problems: AttachmentValidationError[] = [];
  const files = attachmentsOf(content);

  if (files.length > MAX_ATTACHMENTS) {
    problems.push({ code: "too-many", count: files.length, limit: MAX_ATTACHMENTS });
  }

  let totalBytes = 0;
  for (const { mediaType, data } of files) {
    // Unknown types are reported and skipped before any size accounting.
    if (!isAcceptedAttachmentMediaType(mediaType)) {
      problems.push({ code: "unsupported-type", mediaType });
      continue;
    }

    const bytes = base64ByteLength(data);
    totalBytes += bytes;

    if (bytes === 0) {
      problems.push({ code: "empty", mediaType });
    } else if (isPdfMediaType(mediaType)) {
      if (bytes > MAX_PDF_BYTES) {
        problems.push({ code: "pdf-too-large", bytes, limit: MAX_PDF_BYTES });
      }
    } else if (bytes > MAX_IMAGE_BYTES) {
      // Anything accepted that is not a PDF is an image.
      problems.push({ code: "image-too-large", mediaType, bytes, limit: MAX_IMAGE_BYTES });
    }
  }

  if (totalBytes > MAX_TOTAL_ATTACHMENT_BYTES) {
    problems.push({
      code: "total-too-large",
      bytes: totalBytes,
      limit: MAX_TOTAL_ATTACHMENT_BYTES,
    });
  }

  return { ok: problems.length === 0, errors: problems };
}
+
/**
 * Tells whether a content list holds at least one attachment part.
 *
 * @param content Content parts of a user message; may be `undefined` or `null`.
 * @returns `false` for a missing or empty list, or one with only text parts;
 *   `true` as soon as an attachment part is found.
 */
export function hasAttachments(content: UserContentPart[] | undefined | null): boolean {
  if (!content) return false;
  for (const part of content) {
    if (part.type === "attachment") return true;
  }
  return false;
}
diff --git a/packages/core/src/models/catalog.ts b/packages/core/src/models/catalog.ts
index dea4647..ac310b1 100644
--- a/packages/core/src/models/catalog.ts
+++ b/packages/core/src/models/catalog.ts
@@ -18,6 +18,15 @@ interface ModelsDevModel {
context?: number;
output?: number;
};
+ /**
+ * Input/output modalities the model accepts. We read `input` to decide
+ * whether the model can take image / pdf attachments. Absent on older
+ * catalog entries — treated as "unknown" (capability resolves to `null`).
+ */
+ modalities?: {
+ input?: string[];
+ output?: string[];
+ };
}
interface ModelsDevProvider {
@@ -172,6 +181,47 @@ export async function resolveContextLimit(
return null;
}
/**
 * Which attachment kinds a model accepts as input, as derived from the
 * `modalities.input` list of its models.dev catalog entry.
 */
export interface ModelInputCapabilities {
  /** `true` when the model takes image input (vision). */
  image: boolean;
  /** `true` when the model takes PDF/document input. */
  pdf: boolean;
}
+
/**
 * Resolve whether a model accepts image / pdf input for the given Dispatch
 * provider + model id.
 *
 * Returns `null` when the capability is UNKNOWN: the provider is not an own
 * key of `PROVIDER_MAP`, `modelId` is empty, or no candidate catalog provider
 * has an entry for the model with a `modalities.input` array. Callers should
 * treat `null` as "can't verify" (optimistic allow) rather than a definitive
 * "no".
 *
 * A non-null result means the catalog DID list the model's input modalities;
 * `{ image, pdf }` then reports whether `"image"` / `"pdf"` appear in that
 * list. The first candidate provider with a modalities list wins.
 *
 * NOTE(review): behaviour during a catalog outage depends on
 * `getModelsCatalog`, which is not visible here — presumably it yields an
 * empty catalog so this resolves to `null`; confirm.
 *
 * @param provider Dispatch provider id, used as a key into `PROVIDER_MAP`.
 * @param modelId Model id as it appears in the catalog.
 * @returns The model's input capabilities, or `null` when unknown.
 */
export async function resolveModelCapabilities(
  provider: string,
  modelId: string,
): Promise<ModelInputCapabilities | null> {
  // Own-key check: `provider` may be caller-supplied, and a name such as
  // "constructor" or "toString" would otherwise resolve to an inherited
  // Object.prototype member — truthy but not iterable — making the loop below
  // throw instead of reporting "unknown".
  if (!modelId || !Object.prototype.hasOwnProperty.call(PROVIDER_MAP, provider)) return null;

  const candidates = PROVIDER_MAP[provider];
  if (!candidates) return null;

  const catalog = await getModelsCatalog();
  for (const providerId of candidates) {
    const input = catalog[providerId]?.models?.[modelId]?.modalities?.input;
    // Only an actual list is definitive; a missing field means "unknown".
    if (Array.isArray(input)) {
      return { image: input.includes("image"), pdf: input.includes("pdf") };
    }
  }
  return null;
}
+
/** Test-only: reset the in-process memo so a test can re-exercise loading. */
export function __resetCatalogCacheForTests(): void {
cached = null;
diff --git a/packages/core/src/models/index.ts b/packages/core/src/models/index.ts
index 2fcd657..15d1ee2 100644
--- a/packages/core/src/models/index.ts
+++ b/packages/core/src/models/index.ts
@@ -1,5 +1,24 @@
export {
+ ACCEPTED_ATTACHMENT_MEDIA_TYPES,
+ ACCEPTED_IMAGE_MEDIA_TYPES,
+ ACCEPTED_PDF_MEDIA_TYPE,
+ type AttachmentValidationError,
+ type AttachmentValidationResult,
+ base64ByteLength,
+ hasAttachments,
+ isAcceptedAttachmentMediaType,
+ isImageMediaType,
+ isPdfMediaType,
+ MAX_ATTACHMENTS,
+ MAX_IMAGE_BYTES,
+ MAX_PDF_BYTES,
+ MAX_TOTAL_ATTACHMENT_BYTES,
+ validateUserContent,
+} from "./attachments.js";
+export {
getModelsCatalog,
+ type ModelInputCapabilities,
resolveContextLimit,
+ resolveModelCapabilities,
} from "./catalog.js";
export { ModelRegistry } from "./registry.js";
diff --git a/packages/core/src/types/index.ts b/packages/core/src/types/index.ts
index 607b27d..273e074 100644
--- a/packages/core/src/types/index.ts
+++ b/packages/core/src/types/index.ts
@@ -76,8 +76,57 @@ export interface SystemChunk {
export interface ChatMessage {
  role: MessageRole;
  chunks: Chunk[];
  /**
   * Transient, ORDERED multimodal payload for a user turn: text segments
   * interleaved with image/PDF attachments.
   *
   * Present only on the in-flight user message, where `toModelMessages` uses
   * it to emit multimodal `ImagePart`/`FilePart` content for the provider. It
   * is never persisted — the chunk log keeps only the text, with `[image]` /
   * `[pdf]` markers — so messages rebuilt from history do not have it.
   *
   * When absent, the message is plain text built from its `chunks`.
   */
  content?: UserContentPart[];
}
+// ─── Multimodal user content (image / PDF attachments) ───────────
+//
+// When a user pastes one or more images/PDFs into the chat input, the turn's
+// user message carries an ORDERED list of content parts instead of a plain
+// string. The ordering is meaningful — the user can interleave text and
+// attachments ("here is image A: <A>, here is image B: <B>") and the model
+// sees them in exactly that sequence.
+//
+// These parts are EPHEMERAL: they are forwarded to the model for the turn that
+// produced them but are NOT persisted as raw bytes in the chunk log. History
+// stores only the user's text (with `[image]` / `[pdf]` markers in place of
+// each attachment), so a later reload re-renders the text but never re-sends
+// the binary payload. This keeps the persisted log small and avoids re-billing
+// image tokens on every subsequent turn.
+
/** A run of plain text within a multimodal user message. */
export interface UserTextPart {
  /** Discriminator marking this part as text. */
  type: "text";
  /** The text of this segment. */
  text: string;
}
+
/**
 * A binary attachment (image or PDF) within a multimodal user message.
 *
 * The payload travels as bare base64 in `data` (no `data:` URI prefix) and is
 * described by an IANA media type in `mediaType`. `name` optionally carries
 * the original filename, used only for PDF `filename` passthrough and
 * diagnostics.
 */
export interface UserAttachmentPart {
  /** Discriminator marking this part as an attachment. */
  type: "attachment";
  /** IANA media type, e.g. `image/png`, `image/jpeg`, `application/pdf`. */
  mediaType: string;
  /** Base64-encoded bytes WITHOUT a `data:` URI prefix. */
  data: string;
  /** Original filename, when known (mainly for PDFs). */
  name?: string;
}
+
/**
 * A single element of a multimodal user message's ordered content list:
 * either a text segment or an attachment, discriminated by `type`.
 */
export type UserContentPart = UserTextPart | UserAttachmentPart;
+
// ─── Append-only chunk log (persisted model) ─────────────────────
//
// The DB stores a conversation as a flat stream of `ChunkRow`s (see