diff options
| author | Frank <[email protected]> | 2025-12-28 20:00:47 -0500 |
|---|---|---|
| committer | Frank <[email protected]> | 2025-12-28 20:00:49 -0500 |
| commit | e0bb96a9f9edf925c445607bc3e19742ba14af8c (patch) | |
| tree | 2a5b2eb6e24aeba4e1fc59dd8686c20c5348aa29 | |
| parent | 82e5d6d458297af9fe60768eee49bc48f975f09e (diff) | |
| download | opencode-e0bb96a9f9edf925c445607bc3e19742ba14af8c.tar.gz opencode-e0bb96a9f9edf925c445607bc3e19742ba14af8c.zip | |
wip: bench
| -rw-r--r-- | packages/console/app/src/routes/bench/[id].tsx | 365 | ||||
| -rw-r--r-- | packages/console/app/src/routes/bench/index.tsx | 209 |
2 files changed, 383 insertions, 191 deletions
diff --git a/packages/console/app/src/routes/bench/[id].tsx b/packages/console/app/src/routes/bench/[id].tsx new file mode 100644 index 000000000..4586eef9b --- /dev/null +++ b/packages/console/app/src/routes/bench/[id].tsx @@ -0,0 +1,365 @@ +import { Title } from "@solidjs/meta" +import { createAsync, query, useParams } from "@solidjs/router" +import { createSignal, For, Show } from "solid-js" +import { Database, desc, eq } from "@opencode-ai/console-core/drizzle/index.js" +import { BenchmarkTable } from "@opencode-ai/console-core/schema/benchmark.sql.js" + +interface TaskSource { + repo: string + from: string + to: string +} + +interface Judge { + score: number + rationale: string + judge: string +} + +interface ScoreDetail { + criterion: string + weight: number + average: number + variance?: number + judges?: Judge[] +} + +interface RunUsage { + input: number + output: number + cost: number +} + +interface Run { + task: string + model: string + agent: string + score: { + final: number + base: number + penalty: number + } + scoreDetails: ScoreDetail[] + usage?: RunUsage + duration?: number +} + +interface Prompt { + commit: string + prompt: string +} + +interface AverageUsage { + input: number + output: number + cost: number +} + +interface Task { + averageScore: number + averageDuration?: number + averageUsage?: AverageUsage + model?: string + agent?: string + summary?: string + runs?: Run[] + task: { + id: string + source: TaskSource + prompts?: Prompt[] + } +} + +interface BenchmarkResult { + averageScore: number + tasks: Task[] +} + +async function getTaskDetail(benchmarkId: string, taskId: string) { + "use server" + const rows = await Database.use((tx) => + tx.select().from(BenchmarkTable).where(eq(BenchmarkTable.id, benchmarkId)).limit(1), + ) + if (!rows[0]) return null + const parsed = JSON.parse(rows[0].result) as BenchmarkResult + const task = parsed.tasks.find((t) => t.task.id === taskId) + return task ?? null +} + +const queryTaskDetail = query(getTaskDetail, "benchmark.task.detail") + +function formatDuration(ms: number): string { + const seconds = Math.floor(ms / 1000) + const minutes = Math.floor(seconds / 60) + const remainingSeconds = seconds % 60 + if (minutes > 0) { + return `${minutes}m ${remainingSeconds}s` + } + return `${remainingSeconds}s` +} + +export default function BenchDetail() { + const params = useParams() + const [benchmarkId, taskId] = (params.id ?? "").split(":") + const task = createAsync(() => queryTaskDetail(benchmarkId, taskId)) + + return ( + <main data-page="bench-detail"> + <Title>Benchmark - {taskId}</Title> + <div style={{ padding: "1rem" }}> + <Show when={task()} fallback={<p>Task not found</p>}> + <div style={{ "margin-bottom": "1rem" }}> + <div> + <strong>Agent: </strong> + {task()?.agent ?? "N/A"} + </div> + <div> + <strong>Model: </strong> + {task()?.model ?? "N/A"} + </div> + <div> + <strong>Task: </strong> + {task()!.task.id} + </div> + </div> + + <div style={{ "margin-bottom": "1rem" }}> + <div> + <strong>Repo: </strong> + <a + href={`https://github.com/${task()!.task.source.repo}`} + target="_blank" + rel="noopener noreferrer" + style={{ color: "#0066cc" }} + > + {task()!.task.source.repo} + </a> + </div> + <div> + <strong>From: </strong> + <a + href={`https://github.com/${task()!.task.source.repo}/commit/${task()!.task.source.from}`} + target="_blank" + rel="noopener noreferrer" + style={{ color: "#0066cc" }} + > + {task()!.task.source.from.slice(0, 7)} + </a> + </div> + <div> + <strong>To: </strong> + <a + href={`https://github.com/${task()!.task.source.repo}/commit/${task()!.task.source.to}`} + target="_blank" + rel="noopener noreferrer" + style={{ color: "#0066cc" }} + > + {task()!.task.source.to.slice(0, 7)} + </a> + </div> + </div> + + <Show when={task()?.task.prompts && task()!.task.prompts!.length > 0}> + <div style={{ "margin-bottom": "1rem" }}> + <strong>Prompt:</strong> + <For each={task()!.task.prompts}> + {(p) => ( + <div style={{ "margin-top": "0.5rem" }}> + <div style={{ "font-size": "0.875rem", color: "#666" }}>Commit: {p.commit.slice(0, 7)}</div> + <p style={{ "margin-top": "0.25rem", "white-space": "pre-wrap" }}>{p.prompt}</p> + </div> + )} + </For> + </div> + </Show> + + <hr style={{ margin: "1rem 0", border: "none", "border-top": "1px solid #ccc" }} /> + + <div style={{ "margin-bottom": "1rem" }}> + <div> + <strong>Average Duration: </strong> + {task()?.averageDuration ? formatDuration(task()!.averageDuration!) : "N/A"} + </div> + <div> + <strong>Average Score: </strong> + {task()?.averageScore?.toFixed(3) ?? "N/A"} + </div> + <div> + <strong>Average Cost: </strong> + {task()?.averageUsage?.cost ? `$${task()!.averageUsage!.cost.toFixed(4)}` : "N/A"} + </div> + </div> + + <Show when={task()?.summary}> + <div style={{ "margin-bottom": "1rem" }}> + <strong>Summary:</strong> + <p style={{ "margin-top": "0.5rem", "white-space": "pre-wrap" }}>{task()!.summary}</p> + </div> + </Show> + + <Show when={task()?.runs && task()!.runs!.length > 0}> + <div style={{ "margin-bottom": "1rem" }}> + <strong>Runs:</strong> + <table style={{ "margin-top": "0.5rem", "border-collapse": "collapse", width: "100%" }}> + <thead> + <tr> + <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Run</th> + <th + style={{ + border: "1px solid #ccc", + padding: "0.5rem", + "text-align": "left", + "white-space": "nowrap", + }} + > + Score (Base - Penalty) + </th> + <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Cost</th> + <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Duration</th> + <For each={task()!.runs![0]?.scoreDetails}> + {(detail) => ( + <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}> + {detail.criterion} ({detail.weight}) + </th> + )} + </For> + </tr> + </thead> + <tbody> + <For each={task()!.runs}> + {(run, index) => ( + <tr> + <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>{index() + 1}</td> + <td style={{ border: "1px solid #ccc", padding: "0.5rem", "white-space": "nowrap" }}> + {run.score.final.toFixed(3)} ({run.score.base.toFixed(3)} - {run.score.penalty.toFixed(3)}) + </td> + <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}> + {run.usage?.cost ? `$${run.usage.cost.toFixed(4)}` : "N/A"} + </td> + <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}> + {run.duration ? formatDuration(run.duration) : "N/A"} + </td> + <For each={run.scoreDetails}> + {(detail) => ( + <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}> + <For each={detail.judges}> + {(judge) => ( + <span + style={{ + color: judge.score === 1 ? "green" : judge.score === 0 ? "red" : "inherit", + "margin-right": "0.25rem", + }} + > + {judge.score === 1 ? "✓" : judge.score === 0 ? "✗" : judge.score} + </span> + )} + </For> + </td> + )} + </For> + </tr> + )} + </For> + </tbody> + </table> + <For each={task()!.runs}> + {(run, index) => ( + <div style={{ "margin-top": "1rem" }}> + <h3 style={{ margin: "0 0 0.5rem 0" }}>Run {index() + 1}</h3> + <div> + <strong>Score: </strong> + {run.score.final.toFixed(3)} (Base: {run.score.base.toFixed(3)} - Penalty:{" "} + {run.score.penalty.toFixed(3)}) + </div> + <For each={run.scoreDetails}> + {(detail) => ( + <div style={{ "margin-top": "1rem", "padding-left": "1rem", "border-left": "2px solid #ccc" }}> + <div> + {detail.criterion} (weight: {detail.weight}){" "} + <For each={detail.judges}> + {(judge) => ( + <span + style={{ + color: judge.score === 1 ? "green" : judge.score === 0 ? "red" : "inherit", + "margin-right": "0.25rem", + }} + > + {judge.score === 1 ? "✓" : judge.score === 0 ? "✗" : judge.score} + </span> + )} + </For> + </div> + <Show when={detail.judges && detail.judges.length > 0}> + <For each={detail.judges}> + {(judge) => { + const [expanded, setExpanded] = createSignal(false) + return ( + <div style={{ "margin-top": "0.5rem", "padding-left": "1rem" }}> + <div + style={{ "font-size": "0.875rem", cursor: "pointer" }} + onClick={() => setExpanded(!expanded())} + > + <span style={{ "margin-right": "0.5rem" }}>{expanded() ? "▼" : "▶"}</span> + <span + style={{ + color: judge.score === 1 ? "green" : judge.score === 0 ? "red" : "inherit", + }} + > + {judge.score === 1 ? "✓" : judge.score === 0 ? "✗" : judge.score} + </span>{" "} + {judge.judge} + </div> + <Show when={expanded()}> + <p + style={{ + margin: "0.25rem 0 0 0", + "white-space": "pre-wrap", + "font-size": "0.875rem", + }} + > + {judge.rationale} + </p> + </Show> + </div> + ) + }} + </For> + </Show> + </div> + )} + </For> + </div> + )} + </For> + </div> + </Show> + + {(() => { + const [jsonExpanded, setJsonExpanded] = createSignal(false) + return ( + <div style={{ "margin-top": "1rem" }}> + <button + style={{ + cursor: "pointer", + padding: "0.75rem 1.5rem", + "font-size": "1rem", + background: "#f0f0f0", + border: "1px solid #ccc", + "border-radius": "4px", + }} + onClick={() => setJsonExpanded(!jsonExpanded())} + > + <span style={{ "margin-right": "0.5rem" }}>{jsonExpanded() ? "▼" : "▶"}</span> + Raw JSON + </button> + <Show when={jsonExpanded()}> + <pre>{JSON.stringify(task(), null, 2)}</pre> + </Show> + </div> + ) + })()} + </Show> + </div> + </main> + ) +} diff --git a/packages/console/app/src/routes/bench/index.tsx b/packages/console/app/src/routes/bench/index.tsx index 6339c8017..9b8d0b8f2 100644 --- a/packages/console/app/src/routes/bench/index.tsx +++ b/packages/console/app/src/routes/bench/index.tsx @@ -1,52 +1,12 @@ import { Title } from "@solidjs/meta" -import { createAsync, query } from "@solidjs/router" -import { createMemo, createSignal, For, Show } from "solid-js" +import { A, createAsync, query } from "@solidjs/router" +import { createMemo, For, Show } from "solid-js" import { Database, desc } from "@opencode-ai/console-core/drizzle/index.js" import { BenchmarkTable } from "@opencode-ai/console-core/schema/benchmark.sql.js" -interface TaskSource { - repo: string - from: string - to: string -} - -interface ScoreDetail { - criterion: string - weight: number - average: number -} - -interface Run { - task: string - model: string - agent: string - score: { - final: number - base: number - penalty: number - } - scoreDetails: ScoreDetail[] -} - -interface Prompt { - commit: string - prompt: string -} - -interface Task { - averageScore: number - summary?: string - runs?: Run[] - task: { - id: string - source: TaskSource - prompts?: Prompt[] - } -} - interface BenchmarkResult { averageScore: number - tasks: Task[] + tasks: { averageScore: number; task: { id: string } }[] } async function getBenchmarks() { @@ -57,17 +17,15 @@ async function getBenchmarks() { return rows.map((row) => { const parsed = JSON.parse(row.result) as BenchmarkResult const taskScores: Record<string, number> = {} - const taskData: Record<string, Task> = {} for (const t of parsed.tasks) { taskScores[t.task.id] = t.averageScore - taskData[t.task.id] = t } return { + id: row.id, agent: row.agent, model: row.model, averageScore: parsed.averageScore, taskScores, - taskData, } }) } @@ -76,7 +34,6 @@ const queryBenchmarks = query(getBenchmarks, "benchmarks.list") export default function Bench() { const benchmarks = createAsync(() => queryBenchmarks()) - const [modalTask, setModalTask] = createSignal<Task | null>(null) const taskIds = createMemo(() => { const ids = new Set<string>() @@ -89,34 +46,32 @@ export default function Bench() { }) return ( - <main data-page="bench"> + <main data-page="bench" style={{ padding: "2rem" }}> <Title>Benchmark</Title> - <table> + <h1 style={{ "margin-bottom": "1.5rem" }}>Benchmarks</h1> + <table style={{ "border-collapse": "collapse", width: "100%" }}> <thead> <tr> - <th>Agent</th> - <th>Model</th> - <th>Final Score</th> - <For each={taskIds()}>{(id) => <th>{id}</th>}</For> + <th style={{ "text-align": "left", padding: "0.75rem" }}>Agent</th> + <th style={{ "text-align": "left", padding: "0.75rem" }}>Model</th> + <th style={{ "text-align": "left", padding: "0.75rem" }}>Score</th> + <For each={taskIds()}>{(id) => <th style={{ "text-align": "left", padding: "0.75rem" }}>{id}</th>}</For> </tr> </thead> <tbody> <For each={benchmarks()}> {(row) => ( <tr> - <td>{row.agent}</td> - <td>{row.model}</td> - <td>{row.averageScore.toFixed(3)}</td> + <td style={{ padding: "0.75rem" }}>{row.agent}</td> + <td style={{ padding: "0.75rem" }}>{row.model}</td> + <td style={{ padding: "0.75rem" }}>{row.averageScore.toFixed(3)}</td> <For each={taskIds()}> {(id) => ( - <td> - <Show when={row.taskData[id]} fallback={row.taskScores[id]?.toFixed(3) ?? ""}> - <span - style={{ cursor: "pointer", "text-decoration": "underline" }} - onClick={() => setModalTask(row.taskData[id])} - > + <td style={{ padding: "0.75rem" }}> + <Show when={row.taskScores[id] !== undefined} fallback=""> + <A href={`/bench/${row.id}:${id}`} style={{ color: "#0066cc" }}> {row.taskScores[id]?.toFixed(3)} - </span> + </A> </Show> </td> )} @@ -126,134 +81,6 @@ export default function Bench() { </For> </tbody> </table> - - <Show when={modalTask()}> - <div - data-component="modal-overlay" - style={{ - position: "fixed", - inset: "0", - background: "rgba(0, 0, 0, 0.5)", - display: "flex", - "align-items": "center", - "justify-content": "center", - "z-index": "1000", - }} - onClick={() => setModalTask(null)} - > - <div - data-component="modal" - style={{ - background: "var(--color-background, #fff)", - padding: "1rem", - "border-radius": "8px", - "max-width": "80vw", - "max-height": "80vh", - overflow: "auto", - }} - onClick={(e) => e.stopPropagation()} - > - <div style={{ "margin-bottom": "1rem", color: "#000" }}> - <div> - <strong>Repo: </strong> - <a - href={`https://github.com/${modalTask()!.task.source.repo}`} - target="_blank" - rel="noopener noreferrer" - style={{ color: "#0066cc" }} - > - {modalTask()!.task.source.repo} - </a> - </div> - <div> - <strong>From: </strong> - <a - href={`https://github.com/${modalTask()!.task.source.repo}/commit/${modalTask()!.task.source.from}`} - target="_blank" - rel="noopener noreferrer" - style={{ color: "#0066cc" }} - > - {modalTask()!.task.source.from.slice(0, 7)} - </a> - </div> - <div> - <strong>To: </strong> - <a - href={`https://github.com/${modalTask()!.task.source.repo}/commit/${modalTask()!.task.source.to}`} - target="_blank" - rel="noopener noreferrer" - style={{ color: "#0066cc" }} - > - {modalTask()!.task.source.to.slice(0, 7)} - </a> - </div> - </div> - <Show when={modalTask()?.task.prompts && modalTask()!.task.prompts!.length > 0}> - <div style={{ "margin-bottom": "1rem", color: "#000" }}> - <strong>Prompt:</strong> - <For each={modalTask()!.task.prompts}> - {(p) => ( - <div style={{ "margin-top": "0.5rem" }}> - <div style={{ "font-size": "0.875rem", color: "#666" }}>Commit: {p.commit.slice(0, 7)}</div> - <p style={{ "margin-top": "0.25rem", "white-space": "pre-wrap" }}>{p.prompt}</p> - </div> - )} - </For> - </div> - </Show> - <Show when={modalTask()?.runs && modalTask()!.runs!.length > 0}> - <div style={{ "margin-bottom": "1rem", color: "#000" }}> - <strong>Runs:</strong> - <table style={{ "margin-top": "0.5rem", "border-collapse": "collapse", width: "100%" }}> - <thead> - <tr> - <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Run</th> - <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Final</th> - <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Base</th> - <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Penalty</th> - <For each={modalTask()!.runs![0]?.scoreDetails}> - {(detail) => ( - <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}> - {detail.criterion} ({detail.weight}) - </th> - )} - </For> - </tr> - </thead> - <tbody> - <For each={modalTask()!.runs}> - {(run, index) => ( - <tr> - <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>{index() + 1}</td> - <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>{run.score.final.toFixed(3)}</td> - <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>{run.score.base.toFixed(3)}</td> - <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}> - {run.score.penalty.toFixed(3)} - </td> - <For each={run.scoreDetails}> - {(detail) => ( - <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}> - {detail.average.toFixed(3)} - </td> - )} - </For> - </tr> - )} - </For> - </tbody> - </table> - </div> - </Show> - <Show when={modalTask()?.summary}> - <div style={{ "margin-bottom": "1rem", color: "#000" }}> - <strong>Summary:</strong> - <p style={{ "margin-top": "0.5rem", "white-space": "pre-wrap" }}>{modalTask()!.summary}</p> - </div> - </Show> - <pre style={{ color: "#000" }}>{JSON.stringify(modalTask(), null, 2)}</pre> - </div> - </div> - </Show> </main> ) } |
