fix(vision): tell vision agents not to use tools, just describe images directly

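Kimi was trying to use Python tools to analyze images rather than just describing them. Updated both vision system prompts (consult_vision and image compaction) to explicitly instruct the model not to use any tools — just use its vision to see the image and describe it directly. The prompt that answers a developer's specific questions about an attached image still permits tool use when specifically asked to; the plain image-description prompt forbids tools unconditionally.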
author: Adam Malczewski <[email protected]> 2026-06-28 21:24:18 +0900
committer: Adam Malczewski <[email protected]> 2026-06-28 21:24:18 +0900
commit: 6dd9ea9b935e5011c16faed6c869c976cf5ff172 (patch)
tree: 4702e1ba6a58cbf630831deb24bcd1d6d261a75c
parent: 7f1381c4452846e5a2689d868ab0ee2bc90042c9 (diff)
download: dispatch-6dd9ea9b935e5011c16faed6c869c976cf5ff172.tar.gz
dispatch-6dd9ea9b935e5011c16faed6c869c976cf5ff172.zip
1 file changed, 7 insertions, 2 deletions
diff --git a/packages/vision-handoff/src/service.ts b/packages/vision-handoff/src/service.ts
index 01245df..397d81a 100644
--- a/packages/vision-handoff/src/service.ts
+++ b/packages/vision-handoff/src/service.ts
@@ -354,7 +354,9 @@ export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHando
         };
         const stream = vision.provider.stream([userMessage], [], {
           model: vision.model,
-          systemPrompt: "You are a vision assistant. Describe images faithfully and thoroughly.",
+          systemPrompt:
+            "You are a vision assistant. Describe images faithfully and thoroughly. " +
+            "Do not use any tools — just use your vision to see the image and describe it directly.",
         });
         const description = (await collectTextFromStream(stream)).trim();
         const text =
@@ -657,7 +659,10 @@ export function createVisionHandoffService(deps: VisionHandoffDeps): VisionHando
           modelName: vision.modelName,
           ...(opts.cwd !== undefined ? { cwd: opts.cwd } : {}),
           systemPrompt:
-            "You are a vision assistant. A developer who cannot see images is asking you specific questions about an image they attached. Answer their question precisely and thoroughly.",
+            "You are a vision assistant. A developer who cannot see images is asking you specific " +
+            "questions about an image they attached. Answer their question precisely and thoroughly. " +
+            "Do not use any tools unless specifically asked to — just use your vision to see the " +
+            "image and describe it directly.",
           onEvent: (event: AgentEvent) => {
             if (event.type === "text-delta") {
               responseText += event.delta;
author	Adam Malczewski <[email protected]>	2026-06-28 21:24:18 +0900
committer	Adam Malczewski <[email protected]>	2026-06-28 21:24:18 +0900
commit	6dd9ea9b935e5011c16faed6c869c976cf5ff172 (patch)
tree	4702e1ba6a58cbf630831deb24bcd1d6d261a75c
parent	7f1381c4452846e5a2689d868ab0ee2bc90042c9 (diff)
download	dispatch-6dd9ea9b935e5011c16faed6c869c976cf5ff172.tar.gz dispatch-6dd9ea9b935e5011c16faed6c869c976cf5ff172.zip