summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorFrank <[email protected]>2025-12-28 20:00:47 -0500
committerFrank <[email protected]>2025-12-28 20:00:49 -0500
commite0bb96a9f9edf925c445607bc3e19742ba14af8c (patch)
tree2a5b2eb6e24aeba4e1fc59dd8686c20c5348aa29
parent82e5d6d458297af9fe60768eee49bc48f975f09e (diff)
downloadopencode-e0bb96a9f9edf925c445607bc3e19742ba14af8c.tar.gz
opencode-e0bb96a9f9edf925c445607bc3e19742ba14af8c.zip
wip: bench
-rw-r--r--packages/console/app/src/routes/bench/[id].tsx365
-rw-r--r--packages/console/app/src/routes/bench/index.tsx209
2 files changed, 383 insertions, 191 deletions
diff --git a/packages/console/app/src/routes/bench/[id].tsx b/packages/console/app/src/routes/bench/[id].tsx
new file mode 100644
index 000000000..4586eef9b
--- /dev/null
+++ b/packages/console/app/src/routes/bench/[id].tsx
@@ -0,0 +1,365 @@
+import { Title } from "@solidjs/meta"
+import { createAsync, query, useParams } from "@solidjs/router"
+import { createSignal, For, Show } from "solid-js"
+import { Database, desc, eq } from "@opencode-ai/console-core/drizzle/index.js"
+import { BenchmarkTable } from "@opencode-ai/console-core/schema/benchmark.sql.js"
+
+/**
+ * Origin of a benchmark task. The detail page links `repo` as
+ * `https://github.com/<repo>` and links `from` / `to` as commits under it.
+ */
+interface TaskSource {
+ /** Repository path appended to `https://github.com/` when building links. */
+ repo: string
+ /** Commit reference behind the "From" link; only its first 7 characters are displayed. */
+ from: string
+ /** Commit reference behind the "To" link; only its first 7 characters are displayed. */
+ to: string
+}
+
+/** One judge's verdict on a single scoring criterion of a run. */
+interface Judge {
+ /** Rendered as a green check when exactly 1, a red cross when exactly 0, and as the raw number otherwise. */
+ score: number
+ /** Free-text explanation, shown (whitespace preserved) when the judge row is expanded. */
+ rationale: string
+ /** Label shown next to the verdict; presumably the judge's name or model id (TODO confirm against the producer). */
+ judge: string
+}
+
+/** Result of one scoring criterion for a run. */
+interface ScoreDetail {
+ /** Criterion name; rendered together with `weight` as a runs-table column header. */
+ criterion: string
+ /** Rendered in parentheses after the criterion name. */
+ weight: number
+ /** Not read by the detail page in this file. */
+ average: number
+ /** Optional; not read by the detail page in this file. */
+ variance?: number
+ /** Individual verdicts; one mark per entry is rendered for the criterion. */
+ judges?: Judge[]
+}
+
+/** Resource usage recorded for a single run. */
+interface RunUsage {
+ /** Not rendered by the detail page; presumably an input token count (TODO confirm). */
+ input: number
+ /** Not rendered by the detail page; presumably an output token count (TODO confirm). */
+ output: number
+ /** Rendered with a `$` prefix and four decimal places. */
+ cost: number
+}
+
+/** One execution of a task, with its scores and optional usage and timing. */
+interface Run {
+ /** Not read by the detail page in this file. */
+ task: string
+ /** Not read by the detail page in this file. */
+ model: string
+ /** Not read by the detail page in this file. */
+ agent: string
+ /** Rendered as "final (base - penalty)", each value to three decimal places. */
+ score: {
+ final: number
+ base: number
+ penalty: number
+ }
+ /** Per-criterion results; the first run's entries also supply the runs-table column headers. */
+ scoreDetails: ScoreDetail[]
+ usage?: RunUsage
+ /** Passed to `formatDuration`, which interprets it as milliseconds. */
+ duration?: number
+}
+
+/** A prompt shown on the task detail page together with the commit it is associated with. */
+interface Prompt {
+ /** Commit reference; only its first 7 characters are displayed. */
+ commit: string
+ /** Prompt text, rendered with whitespace preserved. */
+ prompt: string
+}
+
+/** Usage figures attached to a task as a whole; has the same fields as `RunUsage`. */
+interface AverageUsage {
+ /** Not rendered by the detail page. */
+ input: number
+ /** Not rendered by the detail page. */
+ output: number
+ /** Rendered as "Average Cost" with a `$` prefix and four decimal places. */
+ cost: number
+}
+
+/** One entry of `BenchmarkResult.tasks`: a task definition plus its aggregated results. */
+interface Task {
+ /** Rendered as "Average Score" to three decimal places. */
+ averageScore: number
+ /** Passed to `formatDuration`, which interprets it as milliseconds. */
+ averageDuration?: number
+ averageUsage?: AverageUsage
+ /** Rendered as "Model", or "N/A" when absent. */
+ model?: string
+ /** Rendered as "Agent", or "N/A" when absent. */
+ agent?: string
+ /** Free-text summary, rendered with whitespace preserved when present. */
+ summary?: string
+ runs?: Run[]
+ task: {
+ /** Compared against the task id taken from the route to select this entry. */
+ id: string
+ source: TaskSource
+ prompts?: Prompt[]
+ }
+}
+
+/**
+ * Shape assumed for the JSON stored in a benchmark row's `result` column.
+ * The parsed value is cast to this type, not validated against it.
+ */
+interface BenchmarkResult {
+ /** Not read by the detail page in this file. */
+ averageScore: number
+ /** Searched by `task.id` to find the task to display. */
+ tasks: Task[]
+}
+
+/**
+ * Loads a single task's results out of a stored benchmark row.
+ *
+ * Marked with the "use server" directive, so the body is intended to run on the server.
+ *
+ * @param benchmarkId Primary key of the `BenchmarkTable` row to read.
+ * @param taskId Value matched against `task.id` of each entry in the stored result.
+ * @returns The matching task entry, or `null` when either id is empty, the row
+ *   does not exist, the stored result has no `tasks` array, or no entry matches.
+ * @throws Whatever `JSON.parse` throws when the stored result is not valid JSON.
+ */
+async function getTaskDetail(benchmarkId: string, taskId: string) {
+  "use server"
+  // An id with a missing half can never match a task; skip the database round-trip.
+  if (!benchmarkId || !taskId) return null
+
+  const rows = await Database.use((tx) =>
+    tx.select().from(BenchmarkTable).where(eq(BenchmarkTable.id, benchmarkId)).limit(1),
+  )
+  const row = rows[0]
+  if (!row) return null
+
+  // The stored JSON is not schema-checked when read, so confirm the one
+  // structural fact this function relies on before dereferencing it.
+  const parsed: unknown = JSON.parse(row.result)
+  const tasks = (parsed as Partial<BenchmarkResult> | null)?.tasks
+  if (!Array.isArray(tasks)) return null
+
+  return tasks.find((entry) => entry?.task?.id === taskId) ?? null
+}
+
+// Router `query` wrapper around getTaskDetail, registered under the key
+// "benchmark.task.detail"; the page component calls this wrapper, not the raw function.
+const queryTaskDetail = query(getTaskDetail, "benchmark.task.detail")
+
+/**
+ * Renders a millisecond duration as whole minutes and seconds.
+ *
+ * Sub-second remainders are dropped. Minutes are not rolled over into hours,
+ * and the minutes part is omitted entirely when it is not positive.
+ *
+ * @param ms Duration in milliseconds.
+ * @returns `"<m>m <s>s"` when at least one full minute has elapsed, otherwise `"<s>s"`.
+ */
+function formatDuration(ms: number): string {
+  const totalSeconds = Math.floor(ms / 1000)
+  const wholeMinutes = Math.floor(totalSeconds / 60)
+  const leftoverSeconds = totalSeconds % 60
+  return wholeMinutes > 0 ? `${wholeMinutes}m ${leftoverSeconds}s` : `${leftoverSeconds}s`
+}
+
+/** Inline style shared by every body cell of the runs table. */
+const runCellStyle = { border: "1px solid #ccc", padding: "0.5rem" } as const
+/** Body-cell style for cells whose content must stay on one line. */
+const runCellNowrapStyle = { ...runCellStyle, "white-space": "nowrap" } as const
+/** Inline style shared by every header cell of the runs table. */
+const runHeaderStyle = { ...runCellStyle, "text-align": "left" } as const
+/** Header-cell style for headers whose label must stay on one line. */
+const runHeaderNowrapStyle = { ...runHeaderStyle, "white-space": "nowrap" } as const
+/** Inline style shared by the external GitHub links. */
+const externalLinkStyle = { color: "#0066cc" } as const
+
+/**
+ * Splits a `<benchmarkId>:<taskId>` route parameter at the FIRST colon only,
+ * so a task id that itself contains a colon is kept whole.
+ * Returns an empty task id when there is no colon at all.
+ */
+function splitRouteId(raw: string): [string, string] {
+  const sep = raw.indexOf(":")
+  return sep === -1 ? [raw, ""] : [raw.slice(0, sep), raw.slice(sep + 1)]
+}
+
+/** Formats a cost as `$x.xxxx`, or "N/A" when the value is absent; zero is a real value and is shown. */
+function formatCost(cost: number | null | undefined): string {
+  return cost == null ? "N/A" : `$${cost.toFixed(4)}`
+}
+
+/** Formats a millisecond duration via `formatDuration`, or "N/A" when the value is absent; zero is shown as "0s". */
+function formatOptionalDuration(ms: number | null | undefined): string {
+  return ms == null ? "N/A" : formatDuration(ms)
+}
+
+/** Text color for a judge verdict: green for exactly 1, red for exactly 0, inherited otherwise. */
+function judgeColor(score: number): string {
+  return score === 1 ? "green" : score === 0 ? "red" : "inherit"
+}
+
+/** Glyph for a judge verdict: a check for exactly 1, a cross for exactly 0, the raw score otherwise. */
+function judgeMark(score: number): string | number {
+  return score === 1 ? "✓" : score === 0 ? "✗" : score
+}
+
+/** Renders one colored verdict mark per judge, in order. Renders nothing when there are no judges. */
+function JudgeMarks(props: { judges?: Judge[] }) {
+  return (
+    <For each={props.judges}>
+      {(judge) => (
+        <span style={{ color: judgeColor(judge.score), "margin-right": "0.25rem" }}>{judgeMark(judge.score)}</span>
+      )}
+    </For>
+  )
+}
+
+/** One judge's verdict line; clicking it toggles the judge's rationale below it. */
+function JudgeRationale(props: { judge: Judge }) {
+  const [expanded, setExpanded] = createSignal(false)
+  return (
+    <div style={{ "margin-top": "0.5rem", "padding-left": "1rem" }}>
+      <div style={{ "font-size": "0.875rem", cursor: "pointer" }} onClick={() => setExpanded(!expanded())}>
+        <span style={{ "margin-right": "0.5rem" }}>{expanded() ? "▼" : "▶"}</span>
+        <span style={{ color: judgeColor(props.judge.score) }}>{judgeMark(props.judge.score)}</span>{" "}
+        {props.judge.judge}
+      </div>
+      <Show when={expanded()}>
+        <p style={{ margin: "0.25rem 0 0 0", "white-space": "pre-wrap", "font-size": "0.875rem" }}>
+          {props.judge.rationale}
+        </p>
+      </Show>
+    </div>
+  )
+}
+
+/**
+ * Detail page for one task of one benchmark.
+ *
+ * The route parameter `id` has the form `<benchmarkId>:<taskId>`. The page
+ * shows the task's source links, prompts, aggregate figures, summary, a
+ * per-run score table, each judge's rationale, and the raw task JSON.
+ */
+export default function BenchDetail() {
+  const params = useParams()
+  // Read `params.id` inside accessors rather than destructuring it once at
+  // setup, so the query re-runs if the router changes the parameter while
+  // this component stays mounted.
+  const routeIds = () => splitRouteId(params.id ?? "")
+  const taskId = () => routeIds()[1]
+  const task = createAsync(() => queryTaskDetail(routeIds()[0], taskId()))
+  const [jsonExpanded, setJsonExpanded] = createSignal(false)
+
+  return (
+    <main data-page="bench-detail">
+      <Title>Benchmark - {taskId()}</Title>
+      <div style={{ padding: "1rem" }}>
+        <Show when={task()} fallback={<p>Task not found</p>}>
+          {(t) => (
+            <>
+              <div style={{ "margin-bottom": "1rem" }}>
+                <div>
+                  <strong>Agent: </strong>
+                  {t().agent ?? "N/A"}
+                </div>
+                <div>
+                  <strong>Model: </strong>
+                  {t().model ?? "N/A"}
+                </div>
+                <div>
+                  <strong>Task: </strong>
+                  {t().task.id}
+                </div>
+              </div>
+
+              <div style={{ "margin-bottom": "1rem" }}>
+                <div>
+                  <strong>Repo: </strong>
+                  <a
+                    href={`https://github.com/${t().task.source.repo}`}
+                    target="_blank"
+                    rel="noopener noreferrer"
+                    style={externalLinkStyle}
+                  >
+                    {t().task.source.repo}
+                  </a>
+                </div>
+                <div>
+                  <strong>From: </strong>
+                  <a
+                    href={`https://github.com/${t().task.source.repo}/commit/${t().task.source.from}`}
+                    target="_blank"
+                    rel="noopener noreferrer"
+                    style={externalLinkStyle}
+                  >
+                    {t().task.source.from.slice(0, 7)}
+                  </a>
+                </div>
+                <div>
+                  <strong>To: </strong>
+                  <a
+                    href={`https://github.com/${t().task.source.repo}/commit/${t().task.source.to}`}
+                    target="_blank"
+                    rel="noopener noreferrer"
+                    style={externalLinkStyle}
+                  >
+                    {t().task.source.to.slice(0, 7)}
+                  </a>
+                </div>
+              </div>
+
+              <Show when={(t().task.prompts?.length ?? 0) > 0}>
+                <div style={{ "margin-bottom": "1rem" }}>
+                  <strong>Prompt:</strong>
+                  <For each={t().task.prompts}>
+                    {(p) => (
+                      <div style={{ "margin-top": "0.5rem" }}>
+                        <div style={{ "font-size": "0.875rem", color: "#666" }}>Commit: {p.commit.slice(0, 7)}</div>
+                        <p style={{ "margin-top": "0.25rem", "white-space": "pre-wrap" }}>{p.prompt}</p>
+                      </div>
+                    )}
+                  </For>
+                </div>
+              </Show>
+
+              <hr style={{ margin: "1rem 0", border: "none", "border-top": "1px solid #ccc" }} />
+
+              <div style={{ "margin-bottom": "1rem" }}>
+                <div>
+                  <strong>Average Duration: </strong>
+                  {formatOptionalDuration(t().averageDuration)}
+                </div>
+                <div>
+                  <strong>Average Score: </strong>
+                  {t().averageScore?.toFixed(3) ?? "N/A"}
+                </div>
+                <div>
+                  <strong>Average Cost: </strong>
+                  {formatCost(t().averageUsage?.cost)}
+                </div>
+              </div>
+
+              <Show when={t().summary}>
+                <div style={{ "margin-bottom": "1rem" }}>
+                  <strong>Summary:</strong>
+                  <p style={{ "margin-top": "0.5rem", "white-space": "pre-wrap" }}>{t().summary}</p>
+                </div>
+              </Show>
+
+              <Show when={(t().runs?.length ?? 0) > 0}>
+                <div style={{ "margin-bottom": "1rem" }}>
+                  <strong>Runs:</strong>
+                  <table style={{ "margin-top": "0.5rem", "border-collapse": "collapse", width: "100%" }}>
+                    <thead>
+                      <tr>
+                        <th style={runHeaderStyle}>Run</th>
+                        <th style={runHeaderNowrapStyle}>Score (Base - Penalty)</th>
+                        <th style={runHeaderStyle}>Cost</th>
+                        <th style={runHeaderStyle}>Duration</th>
+                        {/* Criterion columns are taken from the first run's score details. */}
+                        <For each={t().runs?.[0]?.scoreDetails}>
+                          {(detail) => (
+                            <th style={runHeaderStyle}>
+                              {detail.criterion} ({detail.weight})
+                            </th>
+                          )}
+                        </For>
+                      </tr>
+                    </thead>
+                    <tbody>
+                      <For each={t().runs}>
+                        {(run, index) => (
+                          <tr>
+                            <td style={runCellStyle}>{index() + 1}</td>
+                            <td style={runCellNowrapStyle}>
+                              {run.score.final.toFixed(3)} ({run.score.base.toFixed(3)} - {run.score.penalty.toFixed(3)})
+                            </td>
+                            <td style={runCellStyle}>{formatCost(run.usage?.cost)}</td>
+                            <td style={runCellStyle}>{formatOptionalDuration(run.duration)}</td>
+                            <For each={run.scoreDetails}>
+                              {(detail) => (
+                                <td style={runCellStyle}>
+                                  <JudgeMarks judges={detail.judges} />
+                                </td>
+                              )}
+                            </For>
+                          </tr>
+                        )}
+                      </For>
+                    </tbody>
+                  </table>
+                  <For each={t().runs}>
+                    {(run, index) => (
+                      <div style={{ "margin-top": "1rem" }}>
+                        <h3 style={{ margin: "0 0 0.5rem 0" }}>Run {index() + 1}</h3>
+                        <div>
+                          <strong>Score: </strong>
+                          {run.score.final.toFixed(3)} (Base: {run.score.base.toFixed(3)} - Penalty:{" "}
+                          {run.score.penalty.toFixed(3)})
+                        </div>
+                        <For each={run.scoreDetails}>
+                          {(detail) => (
+                            <div style={{ "margin-top": "1rem", "padding-left": "1rem", "border-left": "2px solid #ccc" }}>
+                              <div>
+                                {detail.criterion} (weight: {detail.weight}){" "}
+                                <JudgeMarks judges={detail.judges} />
+                              </div>
+                              <For each={detail.judges}>{(judge) => <JudgeRationale judge={judge} />}</For>
+                            </div>
+                          )}
+                        </For>
+                      </div>
+                    )}
+                  </For>
+                </div>
+              </Show>
+
+              <div style={{ "margin-top": "1rem" }}>
+                <button
+                  style={{
+                    cursor: "pointer",
+                    padding: "0.75rem 1.5rem",
+                    "font-size": "1rem",
+                    background: "#f0f0f0",
+                    border: "1px solid #ccc",
+                    "border-radius": "4px",
+                  }}
+                  onClick={() => setJsonExpanded(!jsonExpanded())}
+                >
+                  <span style={{ "margin-right": "0.5rem" }}>{jsonExpanded() ? "▼" : "▶"}</span>
+                  Raw JSON
+                </button>
+                <Show when={jsonExpanded()}>
+                  <pre>{JSON.stringify(t(), null, 2)}</pre>
+                </Show>
+              </div>
+            </>
+          )}
+        </Show>
+      </div>
+    </main>
+  )
+}
diff --git a/packages/console/app/src/routes/bench/index.tsx b/packages/console/app/src/routes/bench/index.tsx
index 6339c8017..9b8d0b8f2 100644
--- a/packages/console/app/src/routes/bench/index.tsx
+++ b/packages/console/app/src/routes/bench/index.tsx
@@ -1,52 +1,12 @@
import { Title } from "@solidjs/meta"
-import { createAsync, query } from "@solidjs/router"
-import { createMemo, createSignal, For, Show } from "solid-js"
+import { A, createAsync, query } from "@solidjs/router"
+import { createMemo, For, Show } from "solid-js"
import { Database, desc } from "@opencode-ai/console-core/drizzle/index.js"
import { BenchmarkTable } from "@opencode-ai/console-core/schema/benchmark.sql.js"
-interface TaskSource {
- repo: string
- from: string
- to: string
-}
-
-interface ScoreDetail {
- criterion: string
- weight: number
- average: number
-}
-
-interface Run {
- task: string
- model: string
- agent: string
- score: {
- final: number
- base: number
- penalty: number
- }
- scoreDetails: ScoreDetail[]
-}
-
-interface Prompt {
- commit: string
- prompt: string
-}
-
-interface Task {
- averageScore: number
- summary?: string
- runs?: Run[]
- task: {
- id: string
- source: TaskSource
- prompts?: Prompt[]
- }
-}
-
interface BenchmarkResult {
averageScore: number
- tasks: Task[]
+ tasks: { averageScore: number; task: { id: string } }[]
}
async function getBenchmarks() {
@@ -57,17 +17,15 @@ async function getBenchmarks() {
return rows.map((row) => {
const parsed = JSON.parse(row.result) as BenchmarkResult
const taskScores: Record<string, number> = {}
- const taskData: Record<string, Task> = {}
for (const t of parsed.tasks) {
taskScores[t.task.id] = t.averageScore
- taskData[t.task.id] = t
}
return {
+ id: row.id,
agent: row.agent,
model: row.model,
averageScore: parsed.averageScore,
taskScores,
- taskData,
}
})
}
@@ -76,7 +34,6 @@ const queryBenchmarks = query(getBenchmarks, "benchmarks.list")
export default function Bench() {
const benchmarks = createAsync(() => queryBenchmarks())
- const [modalTask, setModalTask] = createSignal<Task | null>(null)
const taskIds = createMemo(() => {
const ids = new Set<string>()
@@ -89,34 +46,32 @@ export default function Bench() {
})
return (
- <main data-page="bench">
+ <main data-page="bench" style={{ padding: "2rem" }}>
<Title>Benchmark</Title>
- <table>
+ <h1 style={{ "margin-bottom": "1.5rem" }}>Benchmarks</h1>
+ <table style={{ "border-collapse": "collapse", width: "100%" }}>
<thead>
<tr>
- <th>Agent</th>
- <th>Model</th>
- <th>Final Score</th>
- <For each={taskIds()}>{(id) => <th>{id}</th>}</For>
+ <th style={{ "text-align": "left", padding: "0.75rem" }}>Agent</th>
+ <th style={{ "text-align": "left", padding: "0.75rem" }}>Model</th>
+ <th style={{ "text-align": "left", padding: "0.75rem" }}>Score</th>
+ <For each={taskIds()}>{(id) => <th style={{ "text-align": "left", padding: "0.75rem" }}>{id}</th>}</For>
</tr>
</thead>
<tbody>
<For each={benchmarks()}>
{(row) => (
<tr>
- <td>{row.agent}</td>
- <td>{row.model}</td>
- <td>{row.averageScore.toFixed(3)}</td>
+ <td style={{ padding: "0.75rem" }}>{row.agent}</td>
+ <td style={{ padding: "0.75rem" }}>{row.model}</td>
+ <td style={{ padding: "0.75rem" }}>{row.averageScore.toFixed(3)}</td>
<For each={taskIds()}>
{(id) => (
- <td>
- <Show when={row.taskData[id]} fallback={row.taskScores[id]?.toFixed(3) ?? ""}>
- <span
- style={{ cursor: "pointer", "text-decoration": "underline" }}
- onClick={() => setModalTask(row.taskData[id])}
- >
+ <td style={{ padding: "0.75rem" }}>
+ <Show when={row.taskScores[id] !== undefined} fallback="">
+ <A href={`/bench/${row.id}:${id}`} style={{ color: "#0066cc" }}>
{row.taskScores[id]?.toFixed(3)}
- </span>
+ </A>
</Show>
</td>
)}
@@ -126,134 +81,6 @@ export default function Bench() {
</For>
</tbody>
</table>
-
- <Show when={modalTask()}>
- <div
- data-component="modal-overlay"
- style={{
- position: "fixed",
- inset: "0",
- background: "rgba(0, 0, 0, 0.5)",
- display: "flex",
- "align-items": "center",
- "justify-content": "center",
- "z-index": "1000",
- }}
- onClick={() => setModalTask(null)}
- >
- <div
- data-component="modal"
- style={{
- background: "var(--color-background, #fff)",
- padding: "1rem",
- "border-radius": "8px",
- "max-width": "80vw",
- "max-height": "80vh",
- overflow: "auto",
- }}
- onClick={(e) => e.stopPropagation()}
- >
- <div style={{ "margin-bottom": "1rem", color: "#000" }}>
- <div>
- <strong>Repo: </strong>
- <a
- href={`https://github.com/${modalTask()!.task.source.repo}`}
- target="_blank"
- rel="noopener noreferrer"
- style={{ color: "#0066cc" }}
- >
- {modalTask()!.task.source.repo}
- </a>
- </div>
- <div>
- <strong>From: </strong>
- <a
- href={`https://github.com/${modalTask()!.task.source.repo}/commit/${modalTask()!.task.source.from}`}
- target="_blank"
- rel="noopener noreferrer"
- style={{ color: "#0066cc" }}
- >
- {modalTask()!.task.source.from.slice(0, 7)}
- </a>
- </div>
- <div>
- <strong>To: </strong>
- <a
- href={`https://github.com/${modalTask()!.task.source.repo}/commit/${modalTask()!.task.source.to}`}
- target="_blank"
- rel="noopener noreferrer"
- style={{ color: "#0066cc" }}
- >
- {modalTask()!.task.source.to.slice(0, 7)}
- </a>
- </div>
- </div>
- <Show when={modalTask()?.task.prompts && modalTask()!.task.prompts!.length > 0}>
- <div style={{ "margin-bottom": "1rem", color: "#000" }}>
- <strong>Prompt:</strong>
- <For each={modalTask()!.task.prompts}>
- {(p) => (
- <div style={{ "margin-top": "0.5rem" }}>
- <div style={{ "font-size": "0.875rem", color: "#666" }}>Commit: {p.commit.slice(0, 7)}</div>
- <p style={{ "margin-top": "0.25rem", "white-space": "pre-wrap" }}>{p.prompt}</p>
- </div>
- )}
- </For>
- </div>
- </Show>
- <Show when={modalTask()?.runs && modalTask()!.runs!.length > 0}>
- <div style={{ "margin-bottom": "1rem", color: "#000" }}>
- <strong>Runs:</strong>
- <table style={{ "margin-top": "0.5rem", "border-collapse": "collapse", width: "100%" }}>
- <thead>
- <tr>
- <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Run</th>
- <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Final</th>
- <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Base</th>
- <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>Penalty</th>
- <For each={modalTask()!.runs![0]?.scoreDetails}>
- {(detail) => (
- <th style={{ border: "1px solid #ccc", padding: "0.5rem", "text-align": "left" }}>
- {detail.criterion} ({detail.weight})
- </th>
- )}
- </For>
- </tr>
- </thead>
- <tbody>
- <For each={modalTask()!.runs}>
- {(run, index) => (
- <tr>
- <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>{index() + 1}</td>
- <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>{run.score.final.toFixed(3)}</td>
- <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>{run.score.base.toFixed(3)}</td>
- <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>
- {run.score.penalty.toFixed(3)}
- </td>
- <For each={run.scoreDetails}>
- {(detail) => (
- <td style={{ border: "1px solid #ccc", padding: "0.5rem" }}>
- {detail.average.toFixed(3)}
- </td>
- )}
- </For>
- </tr>
- )}
- </For>
- </tbody>
- </table>
- </div>
- </Show>
- <Show when={modalTask()?.summary}>
- <div style={{ "margin-bottom": "1rem", color: "#000" }}>
- <strong>Summary:</strong>
- <p style={{ "margin-top": "0.5rem", "white-space": "pre-wrap" }}>{modalTask()!.summary}</p>
- </div>
- </Show>
- <pre style={{ color: "#000" }}>{JSON.stringify(modalTask(), null, 2)}</pre>
- </div>
- </div>
- </Show>
</main>
)
}