diff options
Diffstat (limited to 'packages/api/src')
| -rw-r--r-- | packages/api/src/agent-manager.ts | 11 | ||||
| -rw-r--r-- | packages/api/src/app.ts | 63 | ||||
| -rw-r--r-- | packages/api/src/routes/models.ts | 18 |
3 files changed, 92 insertions, 0 deletions
diff --git a/packages/api/src/agent-manager.ts b/packages/api/src/agent-manager.ts index 2532efa..3b12a80 100644 --- a/packages/api/src/agent-manager.ts +++ b/packages/api/src/agent-manager.ts @@ -65,6 +65,7 @@ import { toAvailableUserAgents, type UsageData, type UsageStats, + type UserContentPart, validateConfig, } from "@dispatch/core"; import type { PermissionManager } from "./permission-manager.js"; @@ -1536,6 +1537,13 @@ export class AgentManager { workingDirectory?: string; queueId?: string; /** + * Ephemeral ordered multimodal content (image/pdf attachments) for a + * FRESH human turn. Forwarded to `processMessage` → `agent.run` only + * when the tab is idle (a started turn); never carried into the queue + * path (attachments require a fresh turn — the caller guards that). + */ + content?: UserContentPart[]; + /** * Who is sending this message. `"human"` (default) is unrestricted * and REFILLS the target's agent-to-agent auto-wake budget. `"agent"` * (from the `send_to_tab` tool) is governed by that budget: an @@ -1606,6 +1614,7 @@ export class AgentManager { opts.reasoningEffort, opts.workingDirectory, agentModels, + opts.content, ).catch((err) => { console.error(`[dispatch] deliverMessage processMessage error for tab ${tabId}:`, err); }); @@ -1620,6 +1629,7 @@ export class AgentManager { reasoningEffort?: ReasoningEffort, workingDirectory?: string, agentModels?: AgentModelEntry[], + content?: UserContentPart[], ): Promise<void> { const tabAgent = this._getOrCreateTabAgent(tabId); @@ -1731,6 +1741,7 @@ export class AgentManager { for await (const event of agent.run(message, { ...(effortForEntry ? { reasoningEffort: effortForEntry } : {}), abortSignal: tabAgent.abortController?.signal, + ...(content ? { content } : {}), })) { // Stop processing if the tab was aborted (closed/stopped). // stopTab() already injected a `cancelled` system chunk into diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts index 84afd2a..2f4e538 100644 --- a/packages/api/src/app.ts +++ b/packages/api/src/app.ts @@ -3,6 +3,8 @@ import { getTab, isReasoningEffort, NotificationDispatcher, + type UserContentPart, + validateUserContent, } from "@dispatch/core"; import { Hono } from "hono"; import { cors } from "hono/cors"; @@ -37,6 +39,41 @@ function sanitizeAgentModels(raw: unknown): AgentModelEntry[] | undefined { return out; } +/** + * Validate and normalise the optional multimodal `content` array from the + * `/chat` body. Each entry is either a `{ type: "text", text }` part or a + * `{ type: "attachment", mediaType, data, name? }` part (base64 payload). + * Returns `undefined` when the input isn't a non-empty array or contains no + * attachment (so the plain-string path is taken — byte-identical to before). + * Shape only: SIZE/TYPE limits are enforced separately by `validateUserContent`. + */ +function sanitizeUserContent(raw: unknown): UserContentPart[] | undefined { + if (!Array.isArray(raw) || raw.length === 0) return undefined; + const out: UserContentPart[] = []; + let hasAttachment = false; + for (const p of raw) { + if (!p || typeof p !== "object") continue; + const part = p as Record<string, unknown>; + if (part.type === "text") { + if (typeof part.text === "string") out.push({ type: "text", text: part.text }); + continue; + } + if (part.type === "attachment") { + if (typeof part.mediaType !== "string" || typeof part.data !== "string") continue; + hasAttachment = true; + out.push({ + type: "attachment", + mediaType: part.mediaType, + data: part.data, + ...(typeof part.name === "string" ? { name: part.name } : {}), + }); + } + } + // No attachment → let the plain-text path handle it (avoids needlessly + // switching the model message to array content for a text-only turn). + return hasAttachment ? out : undefined; +} + export const permissionManager = new PermissionManager(); export const agentManager = new AgentManager(permissionManager); @@ -94,6 +131,7 @@ app.post("/chat", async (c) => { const body = await c.req.json<{ tabId?: unknown; message?: unknown; + content?: unknown; keyId?: unknown; modelId?: unknown; agentModels?: unknown; @@ -121,6 +159,30 @@ app.post("/chat", async (c) => { ? body.reasoningEffort : undefined; + // Optional multimodal content (image/pdf attachments). When present, the + // attachments are EPHEMERAL — forwarded to the model for this turn only and + // never persisted (the chunk log keeps just `message`, which the frontend + // has already projected to text with `[image]`/`[pdf]` markers). + const content = sanitizeUserContent(body.content); + if (content) { + // Enforce size/type/count ceilings server-side (defence in depth; the + // frontend also enforces them at paste time). Reject the whole request + // so no tokens are spent on an over-limit payload. + const validation = validateUserContent(content); + if (!validation.ok) { + return c.json({ error: "invalid attachments", details: validation.errors }, 400); + } + // Attachments only attach to a FRESH turn. If the tab is mid-turn the + // message would queue (text-only machinery), silently dropping the + // images. Reject clearly instead so the user can retry once idle. + if (agentManager.getTabStatus(tabId) === "running") { + return c.json( + { error: "cannot attach images while the agent is generating; wait for it to finish" }, + 409, + ); + } + } + // Single routing decision (queue if busy, new turn if idle) shared with the // `send_to_tab` tool via `AgentManager.deliverMessage`. Non-blocking — a // started turn runs in the background. @@ -131,6 +193,7 @@ app.post("/chat", async (c) => { ...(reasoningEffort ? { reasoningEffort } : {}), ...(workingDirectory !== undefined ? { workingDirectory } : {}), ...(queueId ? { queueId } : {}), + ...(content ? { content } : {}), }); if (outcome.status === "queued") { diff --git a/packages/api/src/routes/models.ts b/packages/api/src/routes/models.ts index eeb6029..a1700b1 100644 --- a/packages/api/src/routes/models.ts +++ b/packages/api/src/routes/models.ts @@ -20,6 +20,7 @@ import { refreshAccountCredentialsAsync, resolveApiKey, resolveContextLimit, + resolveModelCapabilities, selectHaikuModel, setApiKey, validateAccountCredentials, @@ -180,6 +181,23 @@ modelsRoutes.get("/context-limit", async (c) => { return c.json({ contextLimit }); }); +// Resolve a model's image / PDF INPUT capabilities from the models.dev catalog. +// Returns `{ capabilities: { image, pdf } | null }`. `null` means UNKNOWN — the +// provider is unmapped, the model is absent, the catalog predates the +// `modalities` field, or the catalog is offline. The frontend treats `null` as +// "can't verify" (optimistic allow) and a definitive `{ image: false }` as a +// hard block (no tokens spent). +modelsRoutes.get("/capabilities", async (c) => { + const provider = c.req.query("provider"); + const modelId = c.req.query("modelId"); + if (!provider || !modelId) { + return c.json({ error: "provider and modelId query parameters are required" }, 400); + } + + const capabilities = await resolveModelCapabilities(provider, modelId); + return c.json({ capabilities }); +}); + // List available Claude accounts with validated credentials modelsRoutes.get("/claude-accounts", async (c) => { const candidates = resolveClaudeAccounts(); |
