fix: warm the SAME Anthropic message-cache bucket as real turns

Root cause of the 'first warmup misses' and 'switch to chat misses' bugs: Anthropic keys the MESSAGE-level prompt cache on `tool_choice` AND on the extended-thinking parameters (both rows in their cache-invalidation table mark the messages cache as invalidated on change). The original warmCache() sent toolChoice:'none' and NO thinking providerOptions, while real turns send toolChoice:'auto' plus the thinking config for the resolved effort.

So warming and chat wrote TWO different message-cache buckets:
- warmup #1 missed (no warm-only bucket existed yet), and every later warmup hit its own bucket;
- the next real chat message read the OTHER bucket → miss.

Fix: extract a shared buildStreamOptions() that produces the cache-affecting params (toolChoice + thinking providerOptions + maxOutputTokens). Both run() and warmCache() now call it with the SAME resolved reasoning effort, so the warming replay refreshes the exact cache the next real message reads. The trivial probe turn is still appended AFTER the last cache breakpoint, so it never disturbs the cached prefix.

Threaded the per-tab reasoning effort (resolved per-model -> per-tab selector -> default, mirroring processMessage) from the frontend resolver through POST /chat/warm to warmCacheForTab to warmCache.

Tests: updated the warmCache toolChoice test to assert that it MATCHES a real turn, added an invariant test that drives run() and warmCache() and asserts identical cache-affecting params, and added an assertion for effort forwarding in the frontend store.

check / test (780) / frontend build / typecheck all green.
author: Adam Malczewski <[email protected]> 2026-06-03 14:49:04 +0900
committer: Adam Malczewski <[email protected]> 2026-06-03 14:49:04 +0900
commit: 656aad2752991ff32e98fed270fa330355650c17 (patch)
tree: ef6ca33b8bf988f6790c19eb021432c34e7ae798
parent: e87e6b39285c8001045d1ebdac873b182c0f7868 (diff)
download: dispatch-656aad2752991ff32e98fed270fa330355650c17.tar.gz
dispatch-656aad2752991ff32e98fed270fa330355650c17.zip
7 files changed, 92 insertions, 7 deletions
diff --git a/packages/api/src/agent-manager.ts b/packages/api/src/agent-manager.ts
index 109dd33..0a6f3c6 100644
--- a/packages/api/src/agent-manager.ts
+++ b/packages/api/src/agent-manager.ts
@@ -1040,7 +1040,12 @@ export class AgentManager {
 	 */
 	async warmCacheForTab(
 		tabId: string,
-		opts: { keyId?: string; modelId?: string; agentModels?: AgentModelEntry[] } = {},
+		opts: {
+			keyId?: string;
+			modelId?: string;
+			agentModels?: AgentModelEntry[];
+			reasoningEffort?: ReasoningEffort;
+		} = {},
 	): Promise<{ ok: true; usage: UsageData } | { ok: false; error: string }> {
 		if (this.getTabStatus(tabId) === "running") {
 			return { ok: false, error: "tab is generating" };
@@ -1060,6 +1065,13 @@ export class AgentManager {
 				primary?.model_id || opts.modelId,
 			);
 
+			// Resolve the SAME reasoning effort the next real turn would use:
+			// per-model (agent definition) → per-tab selector → Agent default.
+			// This drives the thinking providerOptions, which is an Anthropic
+			// message-cache key — warming MUST match it or it warms a different
+			// cache bucket than the real turn reads (the 0%-on-switch bug).
+			const effort = primary?.effort ?? opts.reasoningEffort;
+
 			// Rebuild the genuine history exactly as `getOrCreateAgentForTab`'s
 			// pre-population does, but keep the FULL history (no trailing-user
 			// trim): warming replays the complete cached prefix as-is.
@@ -1071,7 +1083,9 @@ export class AgentManager {
 				history = [...agent.messages];
 			}
 
-			const usage = await agent.warmCache(history);
+			const usage = await agent.warmCache(history, {
+				...(effort ? { reasoningEffort: effort } : {}),
+			});
 			return { ok: true, usage };
 		} catch (err) {
 			return { ok: false, error: err instanceof Error ? err.message : String(err) };
diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts
index a957da7..72188ff 100644
--- a/packages/api/src/app.ts
+++ b/packages/api/src/app.ts
@@ -239,6 +239,7 @@ app.post("/chat/warm", async (c) => {
 		keyId?: unknown;
 		modelId?: unknown;
 		agentModels?: unknown;
+		reasoningEffort?: unknown;
 	}>();
 	const { tabId } = body;
 	if (typeof tabId !== "string" || tabId.trim() === "") {
@@ -247,11 +248,17 @@ app.post("/chat/warm", async (c) => {
 	const keyId = typeof body.keyId === "string" ? body.keyId : undefined;
 	const modelId = typeof body.modelId === "string" ? body.modelId : undefined;
 	const agentModels = sanitizeAgentModels(body.agentModels);
+	// Same effort the real turn would use — a message-cache key, so warming must
+	// match it to refresh the SAME bucket the next real message reads.
+	const reasoningEffort = isReasoningEffort(body.reasoningEffort)
+		? body.reasoningEffort
+		: undefined;
 
 	const result = await agentManager.warmCacheForTab(tabId, {
 		...(keyId ? { keyId } : {}),
 		...(modelId ? { modelId } : {}),
 		...(agentModels ? { agentModels } : {}),
+		...(reasoningEffort ? { reasoningEffort } : {}),
 	});
 	if (!result.ok) {
 		// "tab is generating" is an expected race (not a server fault) → 409.
diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
index d0a3bb9..2e2dbb2 100644
--- a/packages/core/src/agent/agent.ts
+++ b/packages/core/src/agent/agent.ts
diff --git a/packages/core/tests/agent/agent.test.ts b/packages/core/tests/agent/agent.test.ts
index 86a7a5b..797aea2 100644
--- a/packages/core/tests/agent/agent.test.ts
+++ b/packages/core/tests/agent/agent.test.ts
@@ -1709,7 +1709,12 @@ describe("anthropicThinkingProviderOptions — adaptive-thinking model detection
 			expect(userMsgs).toHaveLength(2);
 		});
 
-		it("sends Anthropic cache_control breakpoints + toolChoice none", async () => {
+		it("sends Anthropic cache_control breakpoints with the SAME toolChoice/thinking as a real turn", async () => {
+			// Anthropic keys the MESSAGE cache on `tool_choice` AND the extended-
+			// thinking parameters. If warming sent a different value than a real
+			// turn, it would warm a DIFFERENT message-cache bucket and the user's
+			// next real message would still miss. So warming MUST mirror run():
+			// toolChoice "auto" + the thinking providerOptions for the effort.
 			vi.mocked(streamText).mockReturnValue(
 				makeWarmStream({ inputTokens: 10, cacheReadTokens: 5, cacheWriteTokens: 0 }),
 			);
@@ -1717,7 +1722,9 @@ describe("anthropicThinkingProviderOptions — adaptive-thinking model detection
 			await agent.warmCache(history);
 
 			const callArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
-			expect(callArgs?.toolChoice).toBe("none");
+			expect(callArgs?.toolChoice).toBe("auto");
+			// Thinking providerOptions present (effort defaults to "max").
+			expect(callArgs?.providerOptions?.anthropic).toBeDefined();
 			const messages = callArgs?.messages as Array<{
 				role: string;
 				providerOptions?: { anthropic?: { cacheControl?: unknown } };
@@ -1728,6 +1735,38 @@ describe("anthropicThinkingProviderOptions — adaptive-thinking model detection
 			expect(hasBreakpoint).toBe(true);
 		});
 
+		it("warming and a real turn send IDENTICAL cache-affecting params (same bucket)", async () => {
+			// The core invariant of the whole feature: warmCache() and run() must
+			// produce the same toolChoice + thinking providerOptions + maxOutputTokens
+			// so the warming replay refreshes the EXACT cache the next real message
+			// reads. Drive both and compare the cache-key inputs streamText receives.
+			const cfg = makeConfig({ provider: "anthropic" });
+
+			// 1) Real turn for the same history + the probe text as the user msg.
+			const realAgent = new Agent(cfg);
+			realAgent.messages.push(...history.map((m) => ({ ...m })));
+			vi.mocked(streamText).mockReturnValue(
+				makeMockStreamResult([{ type: "text-delta", id: "t0", text: "." }, finishStop]),
+			);
+			for await (const _ of realAgent.run("reply with just a .")) {
+				// consume
+			}
+			const realArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
+
+			// 2) Warming replay for the same history.
+			const warmAgent = new Agent(cfg);
+			vi.mocked(streamText).mockReturnValue(
+				makeWarmStream({ inputTokens: 10, cacheReadTokens: 5, cacheWriteTokens: 0 }),
+			);
+			await warmAgent.warmCache(history);
+			const warmArgs = vi.mocked(streamText).mock.calls.at(-1)?.[0];
+
+			// The cache-affecting parameters must be byte-identical.
+			expect(warmArgs?.toolChoice).toEqual(realArgs?.toolChoice);
+			expect(warmArgs?.maxOutputTokens).toEqual(realArgs?.maxOutputTokens);
+			expect(warmArgs?.providerOptions).toEqual(realArgs?.providerOptions);
+		});
+
 		it("does NOT mutate the agent's own message history", async () => {
 			vi.mocked(streamText).mockReturnValue(
 				makeWarmStream({ inputTokens: 10, cacheReadTokens: 5, cacheWriteTokens: 0 }),
diff --git a/packages/frontend/src/lib/cache-warming.svelte.ts b/packages/frontend/src/lib/cache-warming.svelte.ts
index cda3fd1..0253c08 100644
--- a/packages/frontend/src/lib/cache-warming.svelte.ts
+++ b/packages/frontend/src/lib/cache-warming.svelte.ts
@@ -41,6 +41,12 @@ export interface WarmRequestParams {
 	keyId: string | null;
 	modelId: string | null;
 	agentModels: AgentModelEntry[] | null;
+	/**
+	 * The SAME reasoning effort the next real turn would use. It drives the
+	 * Anthropic thinking providerOptions, which is a message-cache key — warming
+	 * must match it so it refreshes the bucket the real message reads.
+	 */
+	reasoningEffort: string | null;
 }
 
 /** Reactive, per-tab warming UI state (read by the Chat Settings debug strip). */
@@ -177,6 +183,7 @@ export function createCacheWarmingStore() {
 					...(params?.keyId ? { keyId: params.keyId } : {}),
 					...(params?.modelId ? { modelId: params.modelId } : {}),
 					...(params?.agentModels ? { agentModels: params.agentModels } : {}),
+					...(params?.reasoningEffort ? { reasoningEffort: params.reasoningEffort } : {}),
 				}),
 			});
 			// A newer cancel/fire superseded this request — drop its result so it
diff --git a/packages/frontend/src/lib/tabs.svelte.ts b/packages/frontend/src/lib/tabs.svelte.ts
index a0125ef..ca04e62 100644
--- a/packages/frontend/src/lib/tabs.svelte.ts
+++ b/packages/frontend/src/lib/tabs.svelte.ts
@@ -251,7 +251,12 @@ export function createTabStore() {
 	cacheWarming.setRequestResolver((tabId) => {
 		const t = getTabById(tabId);
 		if (!t) return null;
-		return { keyId: t.keyId, modelId: t.modelId, agentModels: t.agentModels };
+		return {
+			keyId: t.keyId,
+			modelId: t.modelId,
+			agentModels: t.agentModels,
+			reasoningEffort: t.reasoningEffort,
+		};
 	});
 
 	$effect.root(() => {
diff --git a/packages/frontend/tests/cache-warming.test.ts b/packages/frontend/tests/cache-warming.test.ts
index 012efb1..583d563 100644
--- a/packages/frontend/tests/cache-warming.test.ts
+++ b/packages/frontend/tests/cache-warming.test.ts
@@ -98,7 +98,12 @@ describe("firing cadence", () => {
 		const fetchMock = makeFetchOk({ inputTokens: 1000, cacheReadTokens: 900 });
 		vi.stubGlobal("fetch", fetchMock);
 
-		store.setRequestResolver(() => ({ keyId: "k", modelId: "m", agentModels: null }));
+		store.setRequestResolver(() => ({
+			keyId: "k",
+			modelId: "m",
+			agentModels: null,
+			reasoningEffort: "high",
+		}));
 		store.setEnabled("tab-1", true);
 
 		await vi.advanceTimersByTimeAsync(WARM_INTERVAL_MS);
@@ -108,7 +113,15 @@ describe("firing cadence", () => {
 		const [url, opts] = (fetchMock as unknown as { mock: { calls: unknown[][] } }).mock
 			.calls[0] as [string, { body: string }];
 		expect(url).toContain("/chat/warm");
-		expect(JSON.parse(opts.body)).toMatchObject({ tabId: "tab-1", keyId: "k", modelId: "m" });
+		// The request forwards the SAME effort the real turn uses — it's an
+		// Anthropic message-cache key, so warming must match it to refresh the
+		// bucket the next real message reads.
+		expect(JSON.parse(opts.body)).toMatchObject({
+			tabId: "tab-1",
+			keyId: "k",
+			modelId: "m",
+			reasoningEffort: "high",
+		});
 
 		const s = store.stateFor("tab-1");
 		expect(s.lastPct).toBe(90); // 900 / 1000
author	Adam Malczewski <[email protected]>	2026-06-03 14:49:04 +0900
committer	Adam Malczewski <[email protected]>	2026-06-03 14:49:04 +0900
commit	656aad2752991ff32e98fed270fa330355650c17 (patch)
tree	ef6ca33b8bf988f6790c19eb021432c34e7ae798
parent	e87e6b39285c8001045d1ebdac873b182c0f7868 (diff)
download	dispatch-656aad2752991ff32e98fed270fa330355650c17.tar.gz dispatch-656aad2752991ff32e98fed270fa330355650c17.zip