From 656aad2752991ff32e98fed270fa330355650c17 Mon Sep 17 00:00:00 2001
From: Adam Malczewski <github@tradam.dev>
Date: Wed, 3 Jun 2026 14:49:04 +0900
Subject: fix: warm the SAME Anthropic message-cache bucket as real turns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause of the 'first warmup misses' + 'switch to chat misses' bugs:
Anthropic keys the MESSAGE-level prompt cache on `tool_choice` AND the
extended-thinking parameters (both rows in their cache-invalidation table mark
the messages cache as invalidated on change). The original warmCache() sent
toolChoice:'none' and NO thinking providerOptions, while real turns send
toolChoice:'auto' + thinking config for the effort. So warming and chat wrote
TWO different message-cache buckets:
  - warmup #1 missed (no warm-only bucket existed yet), while every later
    warmup hit its own bucket;
  - the next real chat message read the OTHER bucket → miss.

Fix: extract a shared buildStreamOptions() that produces the cache-affecting
params (toolChoice + thinking providerOptions + maxOutputTokens). Both run()
and warmCache() now call it with the SAME resolved reasoning effort, so the
warming replay refreshes the exact cache the next real message reads. The
trivial probe turn is still appended AFTER the last cache breakpoint, so it
never disturbs the cached prefix.

Threaded the per-tab reasoning effort (per-model -> per-tab selector -> default,
mirroring processMessage) from the frontend resolver through POST /chat/warm to
warmCacheForTab to warmCache.

Tests: updated the warmCache toolChoice test to assert it MATCHES a real turn,
added an invariant test driving run() and warmCache() and asserting identical
cache-affecting params, and asserted effort forwarding in the frontend store.
check / test (780) / frontend build / typecheck all green.
---
 packages/api/src/agent-manager.ts | 18 ++++++++++++++++--
 packages/api/src/app.ts           |  7 +++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'packages/api/src')

diff --git a/packages/api/src/agent-manager.ts b/packages/api/src/agent-manager.ts
index 109dd33..0a6f3c6 100644
--- a/packages/api/src/agent-manager.ts
+++ b/packages/api/src/agent-manager.ts
@@ -1040,7 +1040,12 @@ export class AgentManager {
 	 */
 	async warmCacheForTab(
 		tabId: string,
-		opts: { keyId?: string; modelId?: string; agentModels?: AgentModelEntry[] } = {},
+		opts: {
+			keyId?: string;
+			modelId?: string;
+			agentModels?: AgentModelEntry[];
+			reasoningEffort?: ReasoningEffort;
+		} = {},
 	): Promise<{ ok: true; usage: UsageData } | { ok: false; error: string }> {
 		if (this.getTabStatus(tabId) === "running") {
 			return { ok: false, error: "tab is generating" };
@@ -1060,6 +1065,13 @@ export class AgentManager {
 				primary?.model_id || opts.modelId,
 			);
 
+			// Resolve the SAME reasoning effort the next real turn would use:
+			// per-model (agent definition) → per-tab selector → Agent default.
+			// This drives the thinking providerOptions, which is an Anthropic
+			// message-cache key — warming MUST match it or it warms a different
+			// cache bucket than the real turn reads (the 0%-on-switch bug).
+			const effort = primary?.effort ?? opts.reasoningEffort;
+
 			// Rebuild the genuine history exactly as `getOrCreateAgentForTab`'s
 			// pre-population does, but keep the FULL history (no trailing-user
 			// trim): warming replays the complete cached prefix as-is.
@@ -1071,7 +1083,9 @@ export class AgentManager {
 				history = [...agent.messages];
 			}
 
-			const usage = await agent.warmCache(history);
+			const usage = await agent.warmCache(history, {
+				...(effort ? { reasoningEffort: effort } : {}),
+			});
 			return { ok: true, usage };
 		} catch (err) {
 			return { ok: false, error: err instanceof Error ? err.message : String(err) };
diff --git a/packages/api/src/app.ts b/packages/api/src/app.ts
index a957da7..72188ff 100644
--- a/packages/api/src/app.ts
+++ b/packages/api/src/app.ts
@@ -239,6 +239,7 @@ app.post("/chat/warm", async (c) => {
 		keyId?: unknown;
 		modelId?: unknown;
 		agentModels?: unknown;
+		reasoningEffort?: unknown;
 	}>();
 	const { tabId } = body;
 	if (typeof tabId !== "string" || tabId.trim() === "") {
@@ -247,11 +248,17 @@ app.post("/chat/warm", async (c) => {
 	const keyId = typeof body.keyId === "string" ? body.keyId : undefined;
 	const modelId = typeof body.modelId === "string" ? body.modelId : undefined;
 	const agentModels = sanitizeAgentModels(body.agentModels);
+	// Same effort the real turn would use — a message-cache key, so warming must
+	// match it to refresh the SAME bucket the next real message reads.
+	const reasoningEffort = isReasoningEffort(body.reasoningEffort)
+		? body.reasoningEffort
+		: undefined;
 
 	const result = await agentManager.warmCacheForTab(tabId, {
 		...(keyId ? { keyId } : {}),
 		...(modelId ? { modelId } : {}),
 		...(agentModels ? { agentModels } : {}),
+		...(reasoningEffort ? { reasoningEffort } : {}),
 	});
 	if (!result.ok) {
 		// "tab is generating" is an expected race (not a server fault) → 409.
-- 
cgit v1.2.3