From 66e5d3b105bfd2b34c6f35876bf33dbb3cb9dcae Mon Sep 17 00:00:00 2001 From: Adam Malczewski Date: Tue, 2 Jun 2026 22:50:11 +0900 Subject: feat(chat): paste-to-attach images/PDFs with model capability check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multimodal image/PDF input to the chat box via clipboard paste, gated by a graceful per-model capability check. UX: a pasted image/PDF inserts an inline token (【image:…】 / 【pdf:…】) into the draft, so attachments have ORDER relative to typed text and can be referenced positionally. The token is the only handle — deleting it (atomic Backspace/ Delete, or selection overlap) detaches the file; an input-reconciliation safety net detaches any attachment whose token is no longer intact. No preview strip. Capability check: resolveModelCapabilities reads models.dev modalities.input (new GET /models/capabilities, mirrors /context-limit). The input blocks Send (no tokens spent) only on a definitive 'no'; unknown capability (catalog offline / unmapped provider) stays permissive. Attachments require a fresh turn — Send is blocked while generating and /chat rejects content mid-turn (409). Attachments are EPHEMERAL: forwarded to the model for the turn via ordered AI SDK ImagePart/FilePart content, but never persisted (history keeps the text with [image]/[pdf] markers). Text-only turns serialize byte-identically to before. Limits (Anthropic-aligned, enforced at paste + re-validated server-side): PNG/JPEG/WebP/GIF/PDF; image ≤5MB, PDF ≤32MB, ≤20 attachments, ≤32MB total. core: UserContentPart types, models/attachments validator, capability resolver, agent.run+toModelMessages thread ordered content. api: /chat content validation + passthrough. frontend: attachment-tokens helper, ChatInput paste/token/gating, per-tab staged attachments, App.svelte capability fetch. +44 tests. 
--- packages/api/src/agent-manager.ts | 11 + packages/api/src/app.ts | 63 ++++++ packages/api/src/routes/models.ts | 18 ++ packages/api/tests/routes.test.ts | 63 ++++++ packages/core/src/agent/agent.ts | Bin 57822 -> 60515 bytes packages/core/src/index.ts | 17 ++ packages/core/src/models/attachments.ts | 151 +++++++++++++ packages/core/src/models/catalog.ts | 50 +++++ packages/core/src/models/index.ts | 19 ++ packages/core/src/types/index.ts | 49 +++++ packages/core/tests/agent/agent.test.ts | 98 +++++++++ packages/core/tests/models/attachments.test.ts | 136 ++++++++++++ packages/core/tests/models/catalog.test.ts | 75 ++++++- packages/frontend/src/App.svelte | 55 ++++- packages/frontend/src/lib/attachment-tokens.ts | 234 +++++++++++++++++++++ .../frontend/src/lib/components/ChatInput.svelte | 223 +++++++++++++++++++- packages/frontend/src/lib/tabs.svelte.ts | 66 +++++- packages/frontend/tests/attachment-tokens.test.ts | 130 ++++++++++++ packages/frontend/tests/chat-store.test.ts | 75 +++++++ 19 files changed, 1510 insertions(+), 23 deletions(-) create mode 100644 packages/core/src/models/attachments.ts create mode 100644 packages/core/tests/models/attachments.test.ts create mode 100644 packages/frontend/src/lib/attachment-tokens.ts create mode 100644 packages/frontend/tests/attachment-tokens.test.ts (limited to 'packages') diff --git a/packages/api/src/agent-manager.ts b/packages/api/src/agent-manager.ts index 2532efa..3b12a80 100644 --- a/packages/api/src/agent-manager.ts +++ b/packages/api/src/agent-manager.ts @@ -65,6 +65,7 @@ import { toAvailableUserAgents, type UsageData, type UsageStats, + type UserContentPart, validateConfig, } from "@dispatch/core"; import type { PermissionManager } from "./permission-manager.js"; @@ -1535,6 +1536,13 @@ export class AgentManager { reasoningEffort?: ReasoningEffort; workingDirectory?: string; queueId?: string; + /** + * Ephemeral ordered multimodal content (image/pdf attachments) for a + * FRESH human turn. 
Forwarded to `processMessage` → `agent.run` only + * when the tab is idle (a started turn); never carried into the queue + * path (attachments require a fresh turn — the caller guards that). + */ + content?: UserContentPart[]; /** * Who is sending this message. `"human"` (default) is unrestricted * and REFILLS the target's agent-to-agent auto-wake budget. `"agent"` @@ -1606,6 +1614,7 @@ export class AgentManager { opts.reasoningEffort, opts.workingDirectory, agentModels, + opts.content, ).catch((err) => { console.error(`[dispatch] deliverMessage processMessage error for tab ${tabId}:`, err); }); @@ -1620,6 +1629,7 @@ export class AgentManager { reasoningEffort?: ReasoningEffort, workingDirectory?: string, agentModels?: AgentModelEntry[], + content?: UserContentPart[], ): Promise { const tabAgent = this._getOrCreateTabAgent(tabId); @@ -1731,6 +1741,7 @@ export class AgentManager { for await (const event of agent.run(message, { ...(effortForEntry ? { reasoningEffort: effortForEntry } : {}), abortSignal: tabAgent.abortController?.signal, + ...(content ? { content } : {}), })) { // Stop processing if the tab was aborted (closed/stopped). // stopTab() already injected a `cancelled` system chunk into diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts index 84afd2a..2f4e538 100644 --- a/packages/api/src/app.ts +++ b/packages/api/src/app.ts @@ -3,6 +3,8 @@ import { getTab, isReasoningEffort, NotificationDispatcher, + type UserContentPart, + validateUserContent, } from "@dispatch/core"; import { Hono } from "hono"; import { cors } from "hono/cors"; @@ -37,6 +39,41 @@ function sanitizeAgentModels(raw: unknown): AgentModelEntry[] | undefined { return out; } +/** + * Validate and normalise the optional multimodal `content` array from the + * `/chat` body. Each entry is either a `{ type: "text", text }` part or a + * `{ type: "attachment", mediaType, data, name? }` part (base64 payload). 
+ * Returns `undefined` when the input isn't a non-empty array or contains no + * attachment (so the plain-string path is taken — byte-identical to before). + * Shape only: SIZE/TYPE limits are enforced separately by `validateUserContent`. + */ +function sanitizeUserContent(raw: unknown): UserContentPart[] | undefined { + if (!Array.isArray(raw) || raw.length === 0) return undefined; + const out: UserContentPart[] = []; + let hasAttachment = false; + for (const p of raw) { + if (!p || typeof p !== "object") continue; + const part = p as Record<string, unknown>; + if (part.type === "text") { + if (typeof part.text === "string") out.push({ type: "text", text: part.text }); + continue; + } + if (part.type === "attachment") { + if (typeof part.mediaType !== "string" || typeof part.data !== "string") continue; + hasAttachment = true; + out.push({ + type: "attachment", + mediaType: part.mediaType, + data: part.data, + ...(typeof part.name === "string" ? { name: part.name } : {}), + }); + } + } + // No attachment → let the plain-text path handle it (avoids needlessly + // switching the model message to array content for a text-only turn). + return hasAttachment ? out : undefined; +} + export const permissionManager = new PermissionManager(); export const agentManager = new AgentManager(permissionManager); @@ -94,6 +131,7 @@ app.post("/chat", async (c) => { const body = await c.req.json<{ tabId?: unknown; message?: unknown; + content?: unknown; keyId?: unknown; modelId?: unknown; agentModels?: unknown; @@ -121,6 +159,30 @@ app.post("/chat", async (c) => { ? body.reasoningEffort : undefined; + // Optional multimodal content (image/pdf attachments). When present, the + // attachments are EPHEMERAL — forwarded to the model for this turn only and + // never persisted (the chunk log keeps just `message`, which the frontend + // has already projected to text with `[image]`/`[pdf]` markers). 
+ const content = sanitizeUserContent(body.content); + if (content) { + // Enforce size/type/count ceilings server-side (defence in depth; the + // frontend also enforces them at paste time). Reject the whole request + // so no tokens are spent on an over-limit payload. + const validation = validateUserContent(content); + if (!validation.ok) { + return c.json({ error: "invalid attachments", details: validation.errors }, 400); + } + // Attachments only attach to a FRESH turn. If the tab is mid-turn the + // message would queue (text-only machinery), silently dropping the + // images. Reject clearly instead so the user can retry once idle. + if (agentManager.getTabStatus(tabId) === "running") { + return c.json( + { error: "cannot attach images while the agent is generating; wait for it to finish" }, + 409, + ); + } + } + // Single routing decision (queue if busy, new turn if idle) shared with the // `send_to_tab` tool via `AgentManager.deliverMessage`. Non-blocking — a // started turn runs in the background. @@ -131,6 +193,7 @@ app.post("/chat", async (c) => { ...(reasoningEffort ? { reasoningEffort } : {}), ...(workingDirectory !== undefined ? { workingDirectory } : {}), ...(queueId ? { queueId } : {}), + ...(content ? { content } : {}), }); if (outcome.status === "queued") { diff --git a/packages/api/src/routes/models.ts b/packages/api/src/routes/models.ts index eeb6029..a1700b1 100644 --- a/packages/api/src/routes/models.ts +++ b/packages/api/src/routes/models.ts @@ -20,6 +20,7 @@ import { refreshAccountCredentialsAsync, resolveApiKey, resolveContextLimit, + resolveModelCapabilities, selectHaikuModel, setApiKey, validateAccountCredentials, @@ -180,6 +181,23 @@ modelsRoutes.get("/context-limit", async (c) => { return c.json({ contextLimit }); }); +// Resolve a model's image / PDF INPUT capabilities from the models.dev catalog. +// Returns `{ capabilities: { image, pdf } | null }`. 
`null` means UNKNOWN — the +// provider is unmapped, the model is absent, the catalog predates the +// `modalities` field, or the catalog is offline. The frontend treats `null` as +// "can't verify" (optimistic allow) and a definitive `{ image: false }` as a +// hard block (no tokens spent). +modelsRoutes.get("/capabilities", async (c) => { + const provider = c.req.query("provider"); + const modelId = c.req.query("modelId"); + if (!provider || !modelId) { + return c.json({ error: "provider and modelId query parameters are required" }, 400); + } + + const capabilities = await resolveModelCapabilities(provider, modelId); + return c.json({ capabilities }); +}); + // List available Claude accounts with validated credentials modelsRoutes.get("/claude-accounts", async (c) => { const candidates = resolveClaudeAccounts(); diff --git a/packages/api/tests/routes.test.ts b/packages/api/tests/routes.test.ts index 37c19ca..7cfd8a7 100644 --- a/packages/api/tests/routes.test.ts +++ b/packages/api/tests/routes.test.ts @@ -219,6 +219,16 @@ vi.mock("@dispatch/core", () => ({ typeof value === "string" && ["none", "low", "medium", "high", "xhigh", "max"].includes(value) ); }, + // Lightweight stand-in for the real validator: accept the supported media + // types, reject everything else. Enough to exercise the /chat attachment + // validation branch (the real validator is unit-tested in core). + validateUserContent(content: Array<{ type: string; mediaType?: string }>) { + const accepted = ["image/png", "image/jpeg", "image/webp", "image/gif", "application/pdf"]; + const errors = content + .filter((p) => p.type === "attachment" && !accepted.includes(p.mediaType ?? 
"")) + .map((p) => ({ code: "unsupported-type", mediaType: p.mediaType })); + return { ok: errors.length === 0, errors }; + }, listOpenTabs() { return [...fakeOpenTabs]; }, @@ -449,6 +459,59 @@ describe("POST /chat", () => { expect(await res.json()).toEqual({ status: "ok" }); }); + it("accepts a valid image attachment and starts a turn", async () => { + const res = await app.request("/chat", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + tabId: "tab-img-ok", + message: "look: [image]", + content: [ + { type: "text", text: "look: " }, + { type: "attachment", mediaType: "image/png", data: "QQ==" }, + ], + }), + }); + expect(res.status).toBe(200); + expect(await res.json()).toEqual({ status: "ok" }); + }); + + it("returns 400 for an unsupported attachment media type", async () => { + const res = await app.request("/chat", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + tabId: "tab-img-bad", + message: "look: [image]", + content: [{ type: "attachment", mediaType: "image/svg+xml", data: "QQ==" }], + }), + }); + expect(res.status).toBe(400); + const body = await res.json(); + expect(body.error).toBe("invalid attachments"); + }); + + it("returns 409 when attaching while the agent is generating", async () => { + // Kick off a turn so the tab is running. 
+ await app.request("/chat", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ tabId: "tab-img-busy", message: "first" }), + }); + await new Promise((r) => setTimeout(r, 20)); + + const res = await app.request("/chat", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + tabId: "tab-img-busy", + message: "second [image]", + content: [{ type: "attachment", mediaType: "image/png", data: "QQ==" }], + }), + }); + expect(res.status).toBe(409); + }); + it("returns 400 with empty message", async () => { const res = await app.request("/chat", { method: "POST", diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 4bfa7eb..08b317a 100644 Binary files a/packages/core/src/agent/agent.ts and b/packages/core/src/agent/agent.ts differ diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 08b426f..50012f1 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -82,9 +82,26 @@ export { } from "./lsp/index.js"; // Models export { + ACCEPTED_ATTACHMENT_MEDIA_TYPES, + ACCEPTED_IMAGE_MEDIA_TYPES, + ACCEPTED_PDF_MEDIA_TYPE, + type AttachmentValidationError, + type AttachmentValidationResult, + base64ByteLength, getModelsCatalog, + hasAttachments, + isAcceptedAttachmentMediaType, + isImageMediaType, + isPdfMediaType, + MAX_ATTACHMENTS, + MAX_IMAGE_BYTES, + MAX_PDF_BYTES, + MAX_TOTAL_ATTACHMENT_BYTES, + type ModelInputCapabilities, ModelRegistry, resolveContextLimit, + resolveModelCapabilities, + validateUserContent, } from "./models/index.js"; // Notifications (ntfy.sh) export * from "./notifications/index.js"; diff --git a/packages/core/src/models/attachments.ts b/packages/core/src/models/attachments.ts new file mode 100644 index 0000000..5c98db4 --- /dev/null +++ b/packages/core/src/models/attachments.ts @@ -0,0 +1,151 @@ +// Validation + limits for multimodal user attachments (images / PDFs). 
+// +// Kept dependency-free (no DB / `bun:sqlite` import) so both the API layer +// (`/chat` request validation) and any future caller can share the exact same +// allowlist and size/count ceilings. The limits mirror Anthropic's documented +// vision/PDF API constraints (the only image-capable providers Dispatch maps), +// so a request that passes here won't be rejected by the provider for size. + +import type { UserAttachmentPart, UserContentPart } from "../types/index.js"; + +/** Accepted image media types. */ +export const ACCEPTED_IMAGE_MEDIA_TYPES = [ + "image/png", + "image/jpeg", + "image/webp", + "image/gif", +] as const; + +/** Accepted document media types. */ +export const ACCEPTED_PDF_MEDIA_TYPE = "application/pdf"; + +/** Every media type we accept as an attachment. */ +export const ACCEPTED_ATTACHMENT_MEDIA_TYPES = [ + ...ACCEPTED_IMAGE_MEDIA_TYPES, + ACCEPTED_PDF_MEDIA_TYPE, +] as const; + +/** Per-image byte ceiling (Anthropic: 5 MB/image). */ +export const MAX_IMAGE_BYTES = 5 * 1024 * 1024; + +/** Per-PDF byte ceiling (Anthropic: 32 MB/PDF). */ +export const MAX_PDF_BYTES = 32 * 1024 * 1024; + +/** Max attachments per message (Anthropic: 20 images/request). */ +export const MAX_ATTACHMENTS = 20; + +/** + * Total attachment payload ceiling for a single request (decoded bytes). Bounds + * the overall request size even when each individual file is within its limit. + */ +export const MAX_TOTAL_ATTACHMENT_BYTES = 32 * 1024 * 1024; + +/** Whether a media type is an accepted image type. */ +export function isImageMediaType(mediaType: string): boolean { + return (ACCEPTED_IMAGE_MEDIA_TYPES as readonly string[]).includes(mediaType); +} + +/** Whether a media type is the accepted PDF type. */ +export function isPdfMediaType(mediaType: string): boolean { + return mediaType === ACCEPTED_PDF_MEDIA_TYPE; +} + +/** Whether a media type is an accepted attachment type at all. 
*/ +export function isAcceptedAttachmentMediaType(mediaType: string): boolean { + return (ACCEPTED_ATTACHMENT_MEDIA_TYPES as readonly string[]).includes(mediaType); +} + +/** + * Decoded byte length of a base64 string, computed WITHOUT allocating the + * decoded buffer. Tolerates an optional `data:<mediaType>;base64,` prefix and + * any embedded whitespace/newlines. Returns 0 for an empty/whitespace string. + */ +export function base64ByteLength(b64: string): number { + // Strip a data-URI prefix if present. + const comma = b64.indexOf(","); + const body = b64.startsWith("data:") && comma !== -1 ? b64.slice(comma + 1) : b64; + let len = 0; + let pad = 0; + for (let i = 0; i < body.length; i++) { + const ch = body.charCodeAt(i); + // Skip whitespace (space, \t, \n, \r). + if (ch === 32 || ch === 9 || ch === 10 || ch === 13) continue; + len++; + if (body[i] === "=") pad++; + } + if (len === 0) return 0; + // 4 base64 chars → 3 bytes, minus padding. + return Math.floor((len * 3) / 4) - pad; +} + +export type AttachmentValidationError = + | { code: "unsupported-type"; mediaType: string } + | { code: "image-too-large"; mediaType: string; bytes: number; limit: number } + | { code: "pdf-too-large"; bytes: number; limit: number } + | { code: "too-many"; count: number; limit: number } + | { code: "total-too-large"; bytes: number; limit: number } + | { code: "empty"; mediaType: string }; + +export interface AttachmentValidationResult { + ok: boolean; + errors: AttachmentValidationError[]; +} + +/** Extract just the attachment parts from a mixed content list. */ +function attachmentsOf(content: UserContentPart[]): UserAttachmentPart[] { + return content.filter((p): p is UserAttachmentPart => p.type === "attachment"); +} + +/** + * Validate the attachments in a multimodal user content list against the + * media-type allowlist and the size/count ceilings. Pure: never throws, + * collects every violation so the caller can report them all at once. 
+ * + * Text parts are ignored (always valid). An empty content list is valid (it's + * just a text-only message expressed as parts). + */ +export function validateUserContent(content: UserContentPart[]): AttachmentValidationResult { + const errors: AttachmentValidationError[] = []; + const attachments = attachmentsOf(content); + + if (attachments.length > MAX_ATTACHMENTS) { + errors.push({ code: "too-many", count: attachments.length, limit: MAX_ATTACHMENTS }); + } + + let total = 0; + for (const att of attachments) { + if (!isAcceptedAttachmentMediaType(att.mediaType)) { + errors.push({ code: "unsupported-type", mediaType: att.mediaType }); + continue; + } + const bytes = base64ByteLength(att.data); + total += bytes; + if (bytes === 0) { + errors.push({ code: "empty", mediaType: att.mediaType }); + continue; + } + if (isPdfMediaType(att.mediaType)) { + if (bytes > MAX_PDF_BYTES) { + errors.push({ code: "pdf-too-large", bytes, limit: MAX_PDF_BYTES }); + } + } else if (bytes > MAX_IMAGE_BYTES) { + errors.push({ + code: "image-too-large", + mediaType: att.mediaType, + bytes, + limit: MAX_IMAGE_BYTES, + }); + } + } + + if (total > MAX_TOTAL_ATTACHMENT_BYTES) { + errors.push({ code: "total-too-large", bytes: total, limit: MAX_TOTAL_ATTACHMENT_BYTES }); + } + + return { ok: errors.length === 0, errors }; +} + +/** Convenience: does the content list contain at least one attachment? */ +export function hasAttachments(content: UserContentPart[] | undefined | null): boolean { + return !!content && content.some((p) => p.type === "attachment"); +} diff --git a/packages/core/src/models/catalog.ts b/packages/core/src/models/catalog.ts index dea4647..ac310b1 100644 --- a/packages/core/src/models/catalog.ts +++ b/packages/core/src/models/catalog.ts @@ -18,6 +18,15 @@ interface ModelsDevModel { context?: number; output?: number; }; + /** + * Input/output modalities the model accepts. We read `input` to decide + * whether the model can take image / pdf attachments. 
Absent on older + * catalog entries — treated as "unknown" (capability resolves to `null`). + */ + modalities?: { + input?: string[]; + output?: string[]; + }; } interface ModelsDevProvider { @@ -172,6 +181,47 @@ export async function resolveContextLimit( return null; } +/** + * Image / PDF input capabilities for a model, resolved from the models.dev + * catalog's `modalities.input` list. + */ +export interface ModelInputCapabilities { + /** Model accepts image input (vision). */ + image: boolean; + /** Model accepts PDF/document input. */ + pdf: boolean; +} + +/** + * Resolve whether a model accepts image / pdf input for the given Dispatch + * provider + model id. Returns `null` when the capability is UNKNOWN — i.e. the + * provider is unsupported/unmapped, the model is absent from the catalog, the + * entry predates the `modalities` field, or the catalog is unavailable. Callers + * should treat `null` as "can't verify" (optimistic allow) rather than a + * definitive "no", so a temporary catalog outage never disables a known-good + * vision model. + * + * A non-null result means the catalog DID describe the model's input modalities + * — `{ image, pdf }` then reflects exactly what it advertises (a definitive + * yes/no for each). + */ +export async function resolveModelCapabilities( + provider: string, + modelId: string, +): Promise<ModelInputCapabilities | null> { + const candidates = PROVIDER_MAP[provider]; + if (!candidates || !modelId) return null; + + const catalog = await getModelsCatalog(); + for (const providerId of candidates) { + const input = catalog[providerId]?.models?.[modelId]?.modalities?.input; + if (Array.isArray(input)) { + return { image: input.includes("image"), pdf: input.includes("pdf") }; + } + } + return null; +} + /** Test-only: reset the in-process memo so a test can re-exercise loading. 
*/ export function __resetCatalogCacheForTests(): void { cached = null; diff --git a/packages/core/src/models/index.ts b/packages/core/src/models/index.ts index 2fcd657..15d1ee2 100644 --- a/packages/core/src/models/index.ts +++ b/packages/core/src/models/index.ts @@ -1,5 +1,24 @@ +export { + ACCEPTED_ATTACHMENT_MEDIA_TYPES, + ACCEPTED_IMAGE_MEDIA_TYPES, + ACCEPTED_PDF_MEDIA_TYPE, + type AttachmentValidationError, + type AttachmentValidationResult, + base64ByteLength, + hasAttachments, + isAcceptedAttachmentMediaType, + isImageMediaType, + isPdfMediaType, + MAX_ATTACHMENTS, + MAX_IMAGE_BYTES, + MAX_PDF_BYTES, + MAX_TOTAL_ATTACHMENT_BYTES, + validateUserContent, +} from "./attachments.js"; export { getModelsCatalog, + type ModelInputCapabilities, resolveContextLimit, + resolveModelCapabilities, } from "./catalog.js"; export { ModelRegistry } from "./registry.js"; diff --git a/packages/core/src/types/index.ts b/packages/core/src/types/index.ts index 607b27d..273e074 100644 --- a/packages/core/src/types/index.ts +++ b/packages/core/src/types/index.ts @@ -76,8 +76,57 @@ export interface SystemChunk { export interface ChatMessage { role: MessageRole; chunks: Chunk[]; + /** + * Ephemeral ORDERED multimodal content for a user turn (interleaved text + + * image/pdf attachments). Set ONLY transiently on the in-flight user message + * so `toModelMessages` can emit multimodal `ImagePart`/`FilePart` content to + * the provider. Never persisted (the chunk log stores only the text, with + * `[image]`/`[pdf]` markers), so it's absent on history-rebuilt messages. + * When absent, the message is plain text built from its `chunks`. + */ + content?: UserContentPart[]; } +// ─── Multimodal user content (image / PDF attachments) ─────────── +// +// When a user pastes one or more images/PDFs into the chat input, the turn's +// user message carries an ORDERED list of content parts instead of a plain +// string. 
The ordering is meaningful — the user can interleave text and +// attachments ("here is image A: <image>, here is image B: <image>") and the model +// sees them in exactly that sequence. +// +// These parts are EPHEMERAL: they are forwarded to the model for the turn that +// produced them but are NOT persisted as raw bytes in the chunk log. History +// stores only the user's text (with `[image]` / `[pdf]` markers in place of +// each attachment), so a later reload re-renders the text but never re-sends +// the binary payload. This keeps the persisted log small and avoids re-billing +// image tokens on every subsequent turn. + +/** A plain-text segment of a multimodal user message. */ +export interface UserTextPart { + type: "text"; + text: string; +} + +/** + * A binary attachment (image or PDF) in a multimodal user message. `data` is a + * base64-encoded payload (no `data:` URI prefix); `mediaType` is the IANA media + * type (e.g. `image/png`, `application/pdf`). `name` is an optional original + * filename, used only for PDF `filename` passthrough and diagnostics. + */ +export interface UserAttachmentPart { + type: "attachment"; + /** IANA media type, e.g. `image/png`, `image/jpeg`, `application/pdf`. */ + mediaType: string; + /** Base64-encoded bytes WITHOUT a `data:` URI prefix. */ + data: string; + /** Optional original filename (mainly for PDFs). */ + name?: string; +} + +/** One ordered part of a multimodal user message. 
*/ +export type UserContentPart = UserTextPart | UserAttachmentPart; + // ─── Append-only chunk log (persisted model) ───────────────────── // // The DB stores a conversation as a flat stream of `ChunkRow`s (see diff --git a/packages/core/tests/agent/agent.test.ts b/packages/core/tests/agent/agent.test.ts index d8edec7..f4b33cc 100644 --- a/packages/core/tests/agent/agent.test.ts +++ b/packages/core/tests/agent/agent.test.ts @@ -1544,4 +1544,102 @@ describe("anthropicThinkingProviderOptions — adaptive-thinking model detection effort: "xhigh", }); }); + + describe("multimodal user content", () => { + it("emits ordered text + image parts to the model when content is provided", async () => { + vi.mocked(streamText).mockReturnValue( + makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]), + ); + + const agent = new Agent(makeConfig()); + for await (const _ of agent.run("here is image A: [image]", { + content: [ + { type: "text", text: "here is image A: " }, + { type: "attachment", mediaType: "image/png", data: "QQ==" }, + ], + })) { + // consume + } + + const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0]; + const messages = callArgs?.messages as Array<{ role: string; content: unknown }>; + const userMsg = messages.find((m) => m.role === "user"); + expect(userMsg).toBeDefined(); + // Multimodal turn → content is an ordered parts array, not a string. 
+ expect(Array.isArray(userMsg?.content)).toBe(true); + const parts = userMsg?.content as Array<Record<string, unknown>>; + expect(parts[0]).toMatchObject({ type: "text", text: "here is image A: " }); + expect(parts[1]).toMatchObject({ type: "image", mediaType: "image/png" }); + expect(String(parts[1]?.image)).toBe("data:image/png;base64,QQ=="); + }); + + it("emits a FilePart for a PDF attachment with its filename", async () => { + vi.mocked(streamText).mockReturnValue( + makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]), + ); + + const agent = new Agent(makeConfig()); + for await (const _ of agent.run("see [pdf]", { + content: [ + { type: "text", text: "see " }, + { type: "attachment", mediaType: "application/pdf", data: "QQ==", name: "doc.pdf" }, + ], + })) { + // consume + } + + const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0]; + const messages = callArgs?.messages as Array<{ role: string; content: unknown }>; + const userMsg = messages.find((m) => m.role === "user"); + const parts = userMsg?.content as Array<Record<string, unknown>>; + const filePart = parts.find((p) => p.type === "file"); + expect(filePart).toMatchObject({ + type: "file", + mediaType: "application/pdf", + filename: "doc.pdf", + }); + expect(String(filePart?.data)).toBe("data:application/pdf;base64,QQ=="); + }); + + it("persists the user turn as text only (no content) for history", async () => { + vi.mocked(streamText).mockReturnValue( + makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]), + ); + + const agent = new Agent(makeConfig()); + for await (const _ of agent.run("look: [image]", { + content: [ + { type: "text", text: "look: " }, + { type: "attachment", mediaType: "image/png", data: "QQ==" }, + ], + })) { + // consume + } + + // The in-memory user message keeps the text chunk for the render/persist + // path; the ephemeral `content` rides alongside it but isn't a chunk. 
+ const userMsg = agent.messages.find((m) => m.role === "user"); + expect(userMsg?.chunks).toEqual([{ type: "text", text: "look: [image]" }]); + }); + + it("falls back to a plain string when content has no attachment", async () => { + vi.mocked(streamText).mockReturnValue( + makeMockStreamResult([{ type: "text-delta", id: "t0", text: "ok" }, finishStop]), + ); + + const agent = new Agent(makeConfig()); + for await (const _ of agent.run("plain text", { + content: [{ type: "text", text: "plain text" }], + })) { + // consume + } + + const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0]; + const messages = callArgs?.messages as Array<{ role: string; content: unknown }>; + const userMsg = messages.find((m) => m.role === "user"); + // No attachment → plain string content (byte-identical to text-only path). + expect(typeof userMsg?.content).toBe("string"); + expect(userMsg?.content).toBe("plain text"); + }); + }); }); diff --git a/packages/core/tests/models/attachments.test.ts b/packages/core/tests/models/attachments.test.ts new file mode 100644 index 0000000..11a9f82 --- /dev/null +++ b/packages/core/tests/models/attachments.test.ts @@ -0,0 +1,136 @@ +import { describe, expect, it } from "vitest"; +import { + base64ByteLength, + isAcceptedAttachmentMediaType, + isImageMediaType, + isPdfMediaType, + MAX_ATTACHMENTS, + MAX_IMAGE_BYTES, + MAX_PDF_BYTES, + MAX_TOTAL_ATTACHMENT_BYTES, + validateUserContent, +} from "../../src/models/attachments.js"; +import type { UserContentPart } from "../../src/types/index.js"; + +/** A base64 string that decodes to exactly `bytes` bytes (no padding chars). */ +function base64OfBytes(bytes: number): string { + // 4 base64 chars → 3 bytes. Use a multiple of 3 for clean (unpadded) output. 
+ const groups = Math.ceil(bytes / 3); + return "A".repeat(groups * 4); +} + +function imagePart(data: string, mediaType = "image/png"): UserContentPart { + return { type: "attachment", mediaType, data }; +} + +describe("media-type predicates", () => { + it("classifies image types", () => { + expect(isImageMediaType("image/png")).toBe(true); + expect(isImageMediaType("image/jpeg")).toBe(true); + expect(isImageMediaType("image/webp")).toBe(true); + expect(isImageMediaType("image/gif")).toBe(true); + expect(isImageMediaType("application/pdf")).toBe(false); + expect(isImageMediaType("image/svg+xml")).toBe(false); + }); + + it("classifies pdf + accepted types", () => { + expect(isPdfMediaType("application/pdf")).toBe(true); + expect(isPdfMediaType("image/png")).toBe(false); + expect(isAcceptedAttachmentMediaType("image/gif")).toBe(true); + expect(isAcceptedAttachmentMediaType("application/pdf")).toBe(true); + expect(isAcceptedAttachmentMediaType("text/plain")).toBe(false); + }); +}); + +describe("base64ByteLength", () => { + it("computes decoded length without padding", () => { + // "AAAA" → 3 bytes. + expect(base64ByteLength("AAAA")).toBe(3); + }); + + it("accounts for padding", () => { + // "QQ==" → 1 byte ("A"). + expect(base64ByteLength("QQ==")).toBe(1); + // "QUI=" → 2 bytes ("AB"). 
+ expect(base64ByteLength("QUI=")).toBe(2); + }); + + it("tolerates a data: URI prefix and whitespace", () => { + expect(base64ByteLength("data:image/png;base64,AAAA")).toBe(3); + expect(base64ByteLength("AA\nAA")).toBe(3); + }); + + it("returns 0 for empty input", () => { + expect(base64ByteLength("")).toBe(0); + expect(base64ByteLength(" ")).toBe(0); + }); +}); + +describe("validateUserContent", () => { + it("accepts a small image and ignores text parts", () => { + const content: UserContentPart[] = [ + { type: "text", text: "hi" }, + imagePart(base64OfBytes(1024)), + ]; + expect(validateUserContent(content)).toEqual({ ok: true, errors: [] }); + }); + + it("accepts an empty / text-only content list", () => { + expect(validateUserContent([]).ok).toBe(true); + expect(validateUserContent([{ type: "text", text: "no files" }]).ok).toBe(true); + }); + + it("rejects an unsupported media type", () => { + const res = validateUserContent([imagePart(base64OfBytes(10), "image/svg+xml")]); + expect(res.ok).toBe(false); + expect(res.errors[0]).toMatchObject({ code: "unsupported-type", mediaType: "image/svg+xml" }); + }); + + it("rejects an oversized image but allows a PDF of the same size", () => { + const big = base64OfBytes(MAX_IMAGE_BYTES + 3); + const imgRes = validateUserContent([imagePart(big, "image/png")]); + expect(imgRes.ok).toBe(false); + expect(imgRes.errors.some((e) => e.code === "image-too-large")).toBe(true); + + // Same byte size as a PDF is fine (PDF limit is much higher). 
+ const pdfRes = validateUserContent([imagePart(big, "application/pdf")]); + expect(pdfRes.ok).toBe(true); + }); + + it("rejects an oversized PDF", () => { + const res = validateUserContent([ + imagePart(base64OfBytes(MAX_PDF_BYTES + 3), "application/pdf"), + ]); + expect(res.ok).toBe(false); + expect(res.errors.some((e) => e.code === "pdf-too-large")).toBe(true); + }); + + it("rejects an empty attachment payload", () => { + const res = validateUserContent([imagePart("", "image/png")]); + expect(res.ok).toBe(false); + expect(res.errors.some((e) => e.code === "empty")).toBe(true); + }); + + it("rejects too many attachments", () => { + const content: UserContentPart[] = Array.from({ length: MAX_ATTACHMENTS + 1 }, () => + imagePart(base64OfBytes(8)), + ); + const res = validateUserContent(content); + expect(res.ok).toBe(false); + expect(res.errors.some((e) => e.code === "too-many")).toBe(true); + }); + + it("rejects when the total payload exceeds the request ceiling", () => { + // Several individually-legal PDFs that together exceed the total cap. 
+ const each = Math.floor(MAX_TOTAL_ATTACHMENT_BYTES / 3); + const content: UserContentPart[] = [ + imagePart(base64OfBytes(each), "application/pdf"), + imagePart(base64OfBytes(each), "application/pdf"), + imagePart(base64OfBytes(each), "application/pdf"), + imagePart(base64OfBytes(each), "application/pdf"), + ]; + const res = validateUserContent(content); + expect(res.ok).toBe(false); + expect(res.errors.some((e) => e.code === "total-too-large")).toBe(true); + }); +}); diff --git a/packages/core/tests/models/catalog.test.ts b/packages/core/tests/models/catalog.test.ts index 51043e6..f4bddc2 100644 --- a/packages/core/tests/models/catalog.test.ts +++ b/packages/core/tests/models/catalog.test.ts @@ -4,6 +4,7 @@ import { __resetCatalogCacheForTests, getModelsCatalog, resolveContextLimit, + resolveModelCapabilities, } from "../../src/models/catalog.js"; const CACHE_PATH = "/tmp/dispatch/models-dev.json"; @@ -13,14 +14,30 @@ const CATALOG = { anthropic: { id: "anthropic", models: { - "claude-sonnet-4-5": { limit: { context: 200000, output: 64000 } }, - "claude-sonnet-4-6": { limit: { context: 1000000, output: 64000 } }, + "claude-sonnet-4-5": { + limit: { context: 200000, output: 64000 }, + modalities: { input: ["text", "image", "pdf"], output: ["text"] }, + }, + "claude-sonnet-4-6": { + limit: { context: 1000000, output: 64000 }, + modalities: { input: ["text", "image", "pdf"], output: ["text"] }, + }, + // A text-only model: definitively no image/pdf input. + "text-only-model": { + limit: { context: 100000, output: 8192 }, + modalities: { input: ["text"], output: ["text"] }, + }, + // An entry predating the modalities field → capability unknown. 
+ "legacy-model": { limit: { context: 100000, output: 8192 } }, }, }, opencode: { id: "opencode", models: { - "glm-4-6": { limit: { context: 131072, output: 8192 } }, + "glm-4-6": { + limit: { context: 131072, output: 8192 }, + modalities: { input: ["text", "image"], output: ["text"] }, + }, }, }, }; @@ -156,3 +173,55 @@ describe("getModelsCatalog caching", () => { warn.mockRestore(); }); }); + +describe("resolveModelCapabilities", () => { + it("reports image + pdf for a vision model", async () => { + mockFetchOnce(CATALOG); + expect(await resolveModelCapabilities("anthropic", "claude-sonnet-4-5")).toEqual({ + image: true, + pdf: true, + }); + }); + + it("reports image-only for a model whose modalities omit pdf", async () => { + mockFetchOnce(CATALOG); + // glm-4-6 lists image but not pdf (resolved via the opencode fallback). + expect(await resolveModelCapabilities("opencode-anthropic", "glm-4-6")).toEqual({ + image: true, + pdf: false, + }); + }); + + it("reports a definitive no for a text-only model", async () => { + mockFetchOnce(CATALOG); + expect(await resolveModelCapabilities("anthropic", "text-only-model")).toEqual({ + image: false, + pdf: false, + }); + }); + + it("returns null (unknown) for an entry without modalities", async () => { + mockFetchOnce(CATALOG); + expect(await resolveModelCapabilities("anthropic", "legacy-model")).toBeNull(); + }); + + it("returns null (unknown) for an unknown model id", async () => { + mockFetchOnce(CATALOG); + expect(await resolveModelCapabilities("anthropic", "no-such-model")).toBeNull(); + }); + + it("returns null for an unsupported provider without hitting the network", async () => { + const fetchFn = mockFetchOnce(CATALOG); + expect(await resolveModelCapabilities("google", "gemini-2.5-pro")).toBeNull(); + expect(await resolveModelCapabilities("anthropic", "")).toBeNull(); + expect(fetchFn).not.toHaveBeenCalled(); + }); + + it("returns null (unknown) when the catalog is offline with no cache", async () => { + const 
fetchFn = vi.fn(() => Promise.reject(new Error("offline"))); + vi.stubGlobal("fetch", fetchFn); + const warn = vi.spyOn(console, "warn").mockImplementation(() => {}); + expect(await resolveModelCapabilities("anthropic", "claude-sonnet-4-5")).toBeNull(); + warn.mockRestore(); + }); +}); diff --git a/packages/frontend/src/App.svelte b/packages/frontend/src/App.svelte index a0b25b7..ae0718e 100644 --- a/packages/frontend/src/App.svelte +++ b/packages/frontend/src/App.svelte @@ -131,6 +131,59 @@ $effect(() => { })(); }); +// ─── Image / PDF capability lookup ───────────────────────────── +// Resolve whether the active model accepts image/pdf INPUT from models.dev (via +// the API), so the chat input can block sending an unsupported attachment +// (no tokens spent) while staying permissive when the capability is unknown. +// `null` = unknown (catalog offline / unsupported provider) → optimistic allow. +let imageSupport = $state<{ image: boolean; pdf: boolean } | null>(null); +const capabilityCache = new Map(); + +$effect(() => { + const tab = tabStore.activeTab; + const keyId = tab?.keyId ?? null; + const modelId = tab?.modelId ?? null; + const provider = keyId ? (modelsData.keys.find((k) => k.id === keyId)?.provider ?? null) : null; + + if (!provider || !modelId) { + imageSupport = null; + return; + } + + const cacheKey = `${provider}/${modelId}`; + if (capabilityCache.has(cacheKey)) { + imageSupport = capabilityCache.get(cacheKey) ?? null; + return; + } + + // Clear immediately so a slow/failed fetch can't leave the PREVIOUS model's + // capability on screen (which could wrongly block/allow this model). + imageSupport = null; + + void (async () => { + try { + const res = await fetch( + `${config.apiBase}/models/capabilities?provider=${encodeURIComponent(provider)}&modelId=${encodeURIComponent(modelId)}`, + ); + if (!res.ok) return; + const data = (await res.json()) as { + capabilities?: { image: boolean; pdf: boolean } | null; + }; + const caps = data.capabilities ?? 
null; + capabilityCache.set(cacheKey, caps); + const current = tabStore.activeTab; + const currentProvider = current?.keyId + ? (modelsData.keys.find((k) => k.id === current.keyId)?.provider ?? null) + : null; + if (currentProvider === provider && current?.modelId === modelId) { + imageSupport = caps; + } + } catch { + // Leave imageSupport as null (unknown → permissive) on network error. + } + })(); +}); + onMount(() => { // Apply persisted theme (or the shared DEFAULT_THEME if nothing is // stored) so the first paint matches what the Settings panel will @@ -174,7 +227,7 @@ onMount(() => {
- + diff --git a/packages/frontend/src/lib/attachment-tokens.ts b/packages/frontend/src/lib/attachment-tokens.ts new file mode 100644 index 0000000..79d4cbc --- /dev/null +++ b/packages/frontend/src/lib/attachment-tokens.ts @@ -0,0 +1,234 @@ +// Inline attachment tokens for the chat input. +// +// A pasted image/PDF is represented in the textarea draft as an inline TOKEN +// (e.g. `【image:a1b2c3】`). The token is ordinary text living inside the draft, +// so attachments have ORDER relative to typed text and to each other, and the +// user can reference them positionally ("here is image A: 【image:…】"). The +// token is also the ONLY handle on an attachment — deleting it (atomic delete, +// below) detaches the underlying file. There is no separate preview strip. +// +// This module is pure (no DOM, no Svelte) so it can be unit-tested directly. + +import type { UserContentPart } from "@dispatch/core/src/types/index.js"; + +export type AttachmentKind = "image" | "pdf"; + +/** A staged attachment, keyed by its short token id. */ +export interface StagedAttachment { + id: string; + kind: AttachmentKind; + /** IANA media type, e.g. `image/png`, `application/pdf`. */ + mediaType: string; + /** Base64 payload WITHOUT a `data:` URI prefix. */ + data: string; + /** Optional original filename (used for PDFs). */ + name?: string; +} + +/** + * Token grammar: `【:】` where kind ∈ {image,pdf} and id is 6 + * lowercase alphanumerics. The CJK corner brackets (U+3010/U+3011) are used as + * delimiters because they're visually distinct and virtually never typed by + * hand, so a token won't collide with normal prose. + */ +export const ATTACHMENT_TOKEN_RE = /【(image|pdf):([a-z0-9]{6})】/g; + +/** Build the inline token string for a staged attachment id + kind. */ +export function makeAttachmentToken(kind: AttachmentKind, id: string): string { + return `【${kind}:${id}】`; +} + +/** Generate a short, URL-safe token id (6 lowercase alphanumerics). 
*/ +export function generateTokenId(): string { + let out = ""; + const alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"; + // crypto.getRandomValues is available in browsers and modern Node/Bun. + const cryptoObj = (globalThis as { crypto?: Crypto }).crypto; + if (cryptoObj?.getRandomValues) { + const buf = new Uint32Array(6); + cryptoObj.getRandomValues(buf); + for (let i = 0; i < 6; i++) out += alphabet[(buf[i] ?? 0) % alphabet.length]; + return out; + } + for (let i = 0; i < 6; i++) out += alphabet[Math.floor(Math.random() * alphabet.length)]; + return out; +} + +export interface FoundToken { + id: string; + kind: AttachmentKind; + /** Inclusive start index of the token within the text. */ + start: number; + /** Exclusive end index of the token within the text. */ + end: number; +} + +/** Find all attachment tokens in `text`, in order of appearance. */ +export function findTokens(text: string): FoundToken[] { + const out: FoundToken[] = []; + // Fresh regex per call so `lastIndex` state never leaks between calls. + const re = new RegExp(ATTACHMENT_TOKEN_RE.source, "g"); + let m: RegExpExecArray | null = re.exec(text); + while (m !== null) { + out.push({ + kind: m[1] as AttachmentKind, + id: m[2] ?? "", + start: m.index, + end: m.index + m[0].length, + }); + m = re.exec(text); + } + return out; +} + +/** The set of attachment ids whose token is still intact in `text`. */ +export function intactTokenIds(text: string): Set { + return new Set(findTokens(text).map((t) => t.id)); +} + +export interface DeletionResult { + /** Text after the deletion. */ + text: string; + /** New caret position (collapsed) after the deletion. */ + caret: number; + /** Ids of attachments whose tokens were removed by this deletion. */ + removedIds: string[]; +} + +/** + * Compute the result of a Backspace/Delete keystroke when it interacts with an + * attachment token, so a token deletes ATOMICALLY (one keystroke removes the + * whole `【…】`, never a single bracket). 
Returns `null` when the keystroke does + * NOT touch a token — the caller should then let the browser's default editing + * behaviour run. + * + * Rules: + * - Range selection (`selStart !== selEnd`): expand the range to fully cover + * any token it overlaps, then delete the expanded range. Only acts when at + * least one token actually overlaps (otherwise returns null). + * - Collapsed + Backspace: if a token ends exactly at the caret, delete it. + * - Collapsed + Delete: if a token starts exactly at the caret, delete it. + */ +export function computeTokenDeletion( + text: string, + selStart: number, + selEnd: number, + key: "Backspace" | "Delete", +): DeletionResult | null { + const tokens = findTokens(text); + if (tokens.length === 0) return null; + + if (selStart !== selEnd) { + const lo = Math.min(selStart, selEnd); + const hi = Math.max(selStart, selEnd); + const overlapping = tokens.filter((t) => t.start < hi && t.end > lo); + if (overlapping.length === 0) return null; + const delStart = Math.min(lo, ...overlapping.map((t) => t.start)); + const delEnd = Math.max(hi, ...overlapping.map((t) => t.end)); + return { + text: text.slice(0, delStart) + text.slice(delEnd), + caret: delStart, + removedIds: overlapping.map((t) => t.id), + }; + } + + // Collapsed caret. + if (key === "Backspace") { + const tok = tokens.find((t) => t.end === selStart); + if (!tok) return null; + return { + text: text.slice(0, tok.start) + text.slice(tok.end), + caret: tok.start, + removedIds: [tok.id], + }; + } + // Delete (forward). + const tok = tokens.find((t) => t.start === selStart); + if (!tok) return null; + return { + text: text.slice(0, tok.start) + text.slice(tok.end), + caret: tok.start, + removedIds: [tok.id], + }; +} + +/** Human-readable marker that replaces a token in persisted/display text. */ +export function markerFor(kind: AttachmentKind): string { + return kind === "pdf" ? 
"[pdf]" : "[image]"; +} + +export interface ParsedDraft { + /** + * Text-only projection of the draft with each attachment token replaced by a + * `[image]` / `[pdf]` marker. This is what gets persisted and rendered in the + * chat history (the raw bytes are never stored). + */ + displayText: string; + /** + * Ordered multimodal content (interleaved text + attachment parts) to send to + * the model, or `null` when the draft has no intact attachment token (the + * caller then sends plain text). + */ + content: UserContentPart[] | null; +} + +/** + * Split a draft (text containing attachment tokens) plus the staged-attachment + * map into: + * - `displayText`: tokens swapped for `[image]`/`[pdf]` markers, and + * - `content`: an ordered `UserContentPart[]` interleaving the surrounding text + * with the matching attachment parts. + * + * A token whose id has no matching staged attachment (e.g. a stray paste of the + * token text, or a detached attachment) is treated as plain text in BOTH + * outputs — its marker still appears in `displayText`, but it contributes no + * attachment part. `content` is `null` when no attachment part is produced. + */ +export function parseDraft(draft: string, attachments: Map): ParsedDraft { + const tokens = findTokens(draft); + let displayText = ""; + const content: UserContentPart[] = []; + let textBuf = ""; + let cursor = 0; + let producedAttachment = false; + + const flushText = () => { + if (textBuf.length > 0) { + content.push({ type: "text", text: textBuf }); + textBuf = ""; + } + }; + + for (const tok of tokens) { + const between = draft.slice(cursor, tok.start); + textBuf += between; + displayText += between; + const att = attachments.get(tok.id); + if (att) { + // displayText (persisted/rendered) gets a `[image]`/`[pdf]` marker; + // the multimodal content gets the ACTUAL attachment part instead — no + // marker text, since the part itself represents the file to the model. 
+ displayText += markerFor(tok.kind); + flushText(); + content.push({ + type: "attachment", + mediaType: att.mediaType, + data: att.data, + ...(att.name ? { name: att.name } : {}), + }); + producedAttachment = true; + } else { + // Orphan token (no staged attachment) → keep the marker as plain text + // in BOTH outputs; it contributes no attachment part. + displayText += markerFor(tok.kind); + textBuf += markerFor(tok.kind); + } + cursor = tok.end; + } + const tail = draft.slice(cursor); + textBuf += tail; + displayText += tail; + flushText(); + + return { displayText, content: producedAttachment ? content : null }; +} diff --git a/packages/frontend/src/lib/components/ChatInput.svelte b/packages/frontend/src/lib/components/ChatInput.svelte index 079ef4a..4067b78 100644 --- a/packages/frontend/src/lib/components/ChatInput.svelte +++ b/packages/frontend/src/lib/components/ChatInput.svelte @@ -1,12 +1,40 @@
+ {#if attachmentWarning} +
+ + {attachmentWarning} +
+ {/if}