From 3036e57d33eec4b68a60a1222daa232bd7906d99 Mon Sep 17 00:00:00 2001 From: Khush Patel Date: Fri, 5 Jun 2026 11:40:10 +0530 Subject: [PATCH] =?UTF-8?q?feat(agentos):=20Agent=20Simulation=20Engine=20?= =?UTF-8?q?=E2=80=94=20multi-judge=20evals=20+=20overview=20dashboard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an eval/simulation framework to the AgentOS Console for grading registered agents against test suites, with results surfaced in an observability-style dashboard. Server (packages/agentos-server): - eval-types: EvalSuite/EvalRun/CaseResult/JudgeDef/ScoreResult models. - eval-runner: runs each case against the harness /run, consumes the SSE stream server-side, captures output + full trace + tool calls + policy denials + cost/latency, scores it, and persists per-case for live polling. - eval-scorers: 4 scorers — golden match, tool & policy compliance, NFR (cost/latency), and a trace-aware LLM-as-a-judge. Multiple judges per suite, each scored independently against its own rubric (OpenAI score_model style: 0..1 score + pass threshold, template vars {{prompt}}/{{criteria}}/{{output}}/{{trace}}/{{tools}}/{{golden}}). A case passes only when every enabled scorer + judge passes. - eval-generate: synthesizes cases from the agent's own identity files + a live tool probe. - routes/evals: suite CRUD, run trigger, run readback, case generation. - mongo/index: eval_suites + eval_runs collections; router mounted. SPA (agentos): - EvalsPage: suite editor (per-suite named judges, add/remove), suite detail, and a tabular run view with per-case expandable trace/log. - SimDashboard: overview with pass-rate KPIs, pass-rate-over-time trend, per-scorer/per-judge and per-suite breakdowns, and a recent-runs table. Co-Authored-By: Claude Opus 4.8 --- agentos/src/App.tsx | 5 +- agentos/src/api.ts | 105 +++ agentos/src/components/EvalsPage.tsx | 797 +++++++++++++++++++ agentos/src/components/SimDashboard.tsx | 343 ++++++++ packages/agentos-server/src/eval-generate.ts | 145 ++++ packages/agentos-server/src/eval-runner.ts | 303 +++++++ packages/agentos-server/src/eval-scorers.ts | 192 +++++ packages/agentos-server/src/eval-types.ts | 129 +++ packages/agentos-server/src/index.ts | 2 + packages/agentos-server/src/mongo.ts | 9 + packages/agentos-server/src/routes/evals.ts | 186 +++++ 11 files changed, 2215 insertions(+), 1 deletion(-) create mode 100644 agentos/src/components/EvalsPage.tsx create mode 100644 agentos/src/components/SimDashboard.tsx create mode 100644 packages/agentos-server/src/eval-generate.ts create mode 100644 packages/agentos-server/src/eval-runner.ts create mode 100644 packages/agentos-server/src/eval-scorers.ts create mode 100644 packages/agentos-server/src/eval-types.ts create mode 100644 packages/agentos-server/src/routes/evals.ts diff --git a/agentos/src/App.tsx b/agentos/src/App.tsx index 2c650c0..eccaa09 100644 --- a/agentos/src/App.tsx +++ b/agentos/src/App.tsx @@ -1,7 +1,8 @@ -import { Home as HomeIcon, Activity, Shield, Boxes } from "lucide-react"; +import { Home as HomeIcon, Activity, Shield, Boxes, FlaskConical } from "lucide-react"; import { NavLink, Navigate, Outlet, Route, Routes, useLocation, useNavigate } from "react-router-dom"; import { HomePage } from "./components/HomePage.tsx"; import { PoliciesPage } from "./components/PoliciesPage.tsx"; +import { EvalsPage } from "./components/EvalsPage.tsx"; import { ObservabilityTab } from "./components/observability/ObservabilityTab.tsx"; import { RegistryPage } from "./components/RegistryPage.tsx"; import { AgentDashboard } from "./components/AgentDashboard.tsx"; @@ -18,6 +19,7 @@ export default function App() { } /> } /> } /> + } /> } /> } /> @@ -49,6 +51,7 @@ function Layout() { +
diff --git a/agentos/src/api.ts b/agentos/src/api.ts index 29c122a..ae023c5 100644 --- a/agentos/src/api.ts +++ b/agentos/src/api.ts @@ -244,6 +244,95 @@ async function reqJSON(method: string, path: string, body?: unknown): Promise return r.json() as Promise; } +// ── Evals ────────────────────────────────────────────────────────────────── +export interface GoldenExpectation { + mode: "exact" | "contains" | "regex"; + value: string; +} +export interface EvalCase { + id: string; + prompt: string; + criteria?: string; + golden?: GoldenExpectation; + expectedTools?: string[]; + forbiddenTools?: string[]; + maxCostUsd?: number; + maxLatencyMs?: number; +} +export interface ScorerConfig { + taskSuccess: boolean; + toolCompliance: boolean; + golden: boolean; + nfr: boolean; +} +export interface EvalSuite { + _id: string; + name: string; + description?: string; + agentName: string; + cases: EvalCase[]; + scorers: ScorerConfig; + judges?: JudgeDef[]; + // legacy single-judge fields (read-only back-compat) + judgeModel?: string; + judgePrompt?: string; + judgePassThreshold?: number; + passThreshold?: number; + createdAt?: string; + updatedAt?: string; +} +export interface JudgeDef { + id: string; + name: string; + rubric?: string; + model?: string; + passThreshold?: number; +} +export type EvalSuiteInput = Omit; +export interface ScoreResult { + scorer: "taskSuccess" | "toolCompliance" | "golden" | "nfr"; + label?: string; + passed: boolean; + score?: number; + detail?: string; +} +export interface PolicyDenial { + tool: string; + reason: string; +} +export interface EvalTraceEntry { + type: "thinking" | "text" | "tool_use" | "tool_result"; + text?: string; + tool?: string; + input?: unknown; + isError?: boolean; +} +export interface CaseResult { + caseId: string; + prompt: string; + output: string; + toolCalls: string[]; + policyDenials: PolicyDenial[]; + transcript: EvalTraceEntry[]; + costUsd: number; + latencyMs: number; + scores: ScoreResult[]; + passed: boolean; + error?: string; +} +export interface EvalRun { + _id: string; + suiteId: string; + suiteName: string; + agentName: string; + status: "running" | "completed" | "failed"; + startedAt: string; + completedAt?: string; + results: CaseResult[]; + summary: { total: number; passed: number; passRate: number; gatePassed?: boolean }; + error?: string; +} + export const api = { agents: () => getJSON<{ agents: Agent[] }>("/agents").then((d) => d.agents), registerAgent: (input: RegisterAgentInput) => @@ -303,4 +392,20 @@ export const api = { reqJSON("PUT", `/opa-policies/${encodeURIComponent(id)}`, body), deleteOpaPolicy: (id: string) => reqJSON<{ success?: boolean }>("DELETE", `/opa-policies/${encodeURIComponent(id)}`), + + // Evals — suite CRUD + run trigger + run readback. + evals: { + listSuites: () => getJSON<{ suites: EvalSuite[] }>("/evals/suites").then((d) => d.suites), + getSuite: (id: string) => getJSON(`/evals/suites/${encodeURIComponent(id)}`), + createSuite: (body: EvalSuiteInput) => postJSON("/evals/suites", body), + updateSuite: (id: string, body: EvalSuiteInput) => + reqJSON("PUT", `/evals/suites/${encodeURIComponent(id)}`, body), + deleteSuite: (id: string) => reqJSON<{ ok: boolean }>("DELETE", `/evals/suites/${encodeURIComponent(id)}`), + runSuite: (id: string) => postJSON<{ runId: string }>(`/evals/suites/${encodeURIComponent(id)}/run`, {}), + generateCases: (agentName: string, count: number, focus?: string) => + postJSON<{ cases: EvalCase[] }>("/evals/generate", { agentName, count, focus }).then((d) => d.cases), + listRuns: (suiteId?: string) => + getJSON<{ runs: EvalRun[] }>(`/evals/runs${suiteId ? `?suite=${encodeURIComponent(suiteId)}` : ""}`).then((d) => d.runs), + getRun: (id: string) => getJSON(`/evals/runs/${encodeURIComponent(id)}`), + }, }; diff --git a/agentos/src/components/EvalsPage.tsx b/agentos/src/components/EvalsPage.tsx new file mode 100644 index 0000000..461f7d5 --- /dev/null +++ b/agentos/src/components/EvalsPage.tsx @@ -0,0 +1,797 @@ +import { useEffect, useRef, useState } from "react"; +import { FlaskConical, Play, Plus, Trash2, ChevronLeft, ChevronRight, CheckCircle2, XCircle, Loader2, Sparkles, LayoutDashboard } from "lucide-react"; +import { + api, + type EvalSuite, + type EvalSuiteInput, + type EvalRun, + type EvalCase, + type CaseResult, + type EvalTraceEntry, + type ScorerConfig, + type ScoreResult, + type JudgeDef, +} from "../api.ts"; +import { useAgents } from "../context/AgentsContext.tsx"; +import { cn } from "../lib/cn.ts"; +import { SimDashboard } from "./SimDashboard.tsx"; + +type RightView = { kind: "empty" } | { kind: "dashboard" } | { kind: "suite"; id: string } | { kind: "edit"; suite: EvalSuite | null } | { kind: "run"; id: string }; + +export function EvalsPage() { + const [suites, setSuites] = useState([]); + const [loading, setLoading] = useState(true); + const [err, setErr] = useState(null); + const [view, setView] = useState({ kind: "dashboard" }); + + const load = () => { + setLoading(true); + setErr(null); + api.evals.listSuites().then(setSuites).catch((e) => setErr(String(e))).finally(() => setLoading(false)); + }; + useEffect(load, []); + + return ( +
+ {/* Left: suite list */} +
+
+
+ + Agent Simulation Engine +
+ +
+
+ +
Suites
+ {loading &&
Loading…
} + {err &&
{err}
} + {!loading && suites.length === 0 && ( +
+ No suites yet. Create one to start simulating an agent. +
+ )} + {suites.map((s) => { + const active = (view.kind === "suite" && view.id === s._id) || (view.kind === "edit" && view.suite?._id === s._id); + return ( + + ); + })} +
+
+ + {/* Right: editor / detail / run */} +
+ {view.kind === "empty" && ( +
+ Select a suite, or create one to simulate an agent. +
+ )} + {view.kind === "dashboard" && setView({ kind: "run", id: runId })} />} + {view.kind === "edit" && ( + setView(view.suite ? { kind: "suite", id: view.suite._id } : { kind: "empty" })} + onSaved={(s) => { + load(); + setView({ kind: "suite", id: s._id }); + }} + /> + )} + {view.kind === "suite" && ( + setView({ kind: "edit", suite: s })} + onDeleted={() => { + load(); + setView({ kind: "empty" }); + }} + onOpenRun={(runId) => setView({ kind: "run", id: runId })} + /> + )} + {view.kind === "run" && setView({ kind: "suite", id: suiteId })} />} +
+
+ ); +} + +// ── Suite detail (read) — run + recent runs ───────────────────────────────── +function SuiteDetail({ + id, + onEdit, + onDeleted, + onOpenRun, +}: { + id: string; + onEdit: (s: EvalSuite) => void; + onDeleted: () => void; + onOpenRun: (runId: string) => void; +}) { + const [suite, setSuite] = useState(null); + const [runs, setRuns] = useState([]); + const [busy, setBusy] = useState(false); + const [err, setErr] = useState(null); + + const reload = () => { + api.evals.getSuite(id).then(setSuite).catch((e) => setErr(String(e))); + api.evals.listRuns(id).then(setRuns).catch(() => {}); + }; + useEffect(reload, [id]); + + if (err) return
{err}
; + if (!suite) return
Loading…
; + + const enabledScorers = (Object.entries(suite.scorers) as [keyof ScorerConfig, boolean][]) + .filter(([, on]) => on) + .map(([k]) => SCORER_LABEL[k]); + + const run = async () => { + setBusy(true); + setErr(null); + try { + const { runId } = await api.evals.runSuite(suite._id); + onOpenRun(runId); + } catch (e) { + setErr(String(e)); + } finally { + setBusy(false); + } + }; + + const del = async () => { + if (!confirm(`Delete suite "${suite.name}" and its runs?`)) return; + await api.evals.deleteSuite(suite._id); + onDeleted(); + }; + + return ( +
+
+
+

{suite.name}

+ {suite.description &&

{suite.description}

} +
+ agent: {suite.agentName} · {suite.cases.length} cases · scorers: {enabledScorers.join(", ") || "none"} + {suite.passThreshold !== undefined && ` · gate ≥ ${(suite.passThreshold * 100).toFixed(0)}%`} +
+
+
+ + +
+
+ + {/* Cases preview */} +
+

Cases

+
+ {suite.cases.map((c, i) => ( +
+ {i + 1}. + {c.prompt} +
+ ))} +
+
+ + {/* Runs */} +
+
+

Runs

+ +
+ {runs.length === 0 &&
No runs yet.
} +
+ {runs.map((r) => ( + + ))} +
+
+ +
+ +
+
+ ); +} + +// ── Run view — live results ───────────────────────────────────────────────── +function RunView({ id, onBack }: { id: string; onBack: (suiteId: string) => void }) { + const [run, setRun] = useState(null); + const [err, setErr] = useState(null); + const [openId, setOpenId] = useState(null); + const timer = useRef | null>(null); + + useEffect(() => { + let cancelled = false; + const poll = () => { + api.evals + .getRun(id) + .then((r) => { + if (cancelled) return; + setRun(r); + if (r.status === "running") timer.current = setTimeout(poll, 1500); + }) + .catch((e) => !cancelled && setErr(String(e))); + }; + poll(); + return () => { + cancelled = true; + if (timer.current) clearTimeout(timer.current); + }; + }, [id]); + + if (err) return
{err}
; + if (!run) return
Loading…
; + + const pct = (run.summary.passRate * 100).toFixed(0); + + return ( +
+ + + {/* Summary */} +
+
+ +
+
+ {run.summary.passed}/{run.summary.total} passed ({pct}%) +
+
+ {run.status === "running" ? "running…" : run.status} · agent {run.agentName} + {run.summary.gatePassed !== undefined && ( + + {" "}· gate {run.summary.gatePassed ? "PASS" : "FAIL"} + + )} +
+
+
+
+
+
+ {run.error &&
{run.error}
} +
+ + {/* Results table */} +
+ + + + + + + + + + + + + {run.results.map((c, i) => ( + setOpenId(openId === c.caseId ? null : c.caseId)} /> + ))} + {run.status === "running" && run.results.length < run.summary.total && ( + + + + )} + +
#CaseScoresCostLatency
+ + running case {run.results.length + 1} of {run.summary.total}… + +
+
+
+ ); +} + +function CaseRows({ c, i, open, onToggle }: { c: CaseResult; i: number; open: boolean; onToggle: () => void }) { + return ( + <> + + + + + {i + 1} + +
+ {c.passed ? : } + {c.prompt} +
+ + +
+ {c.scores.map((s, i) => ( + + ))} + {c.scores.length === 0 && } +
+ + ${c.costUsd.toFixed(4)} + {Math.round(c.latencyMs)}ms + + {open && ( + + + + + + )} + + ); +} + +function CaseDetail({ c }: { c: CaseResult }) { + return ( +
+ {/* Scores with reasons */} +
+ {c.scores.map((s, i) => ( +
+ {s.passed ? "✓" : "✕"} + {scoreLabel(s)} + {s.detail} +
+ ))} +
+ + {c.error && {c.error}} + +
+ +
{c.output || "(empty)"}
+
+
+ {c.toolCalls.length > 0 && ( + + {c.toolCalls.join(", ")} + + )} + {c.policyDenials.length > 0 && ( + +
+ {c.policyDenials.map((d, i) => ( +
{d.tool} — {d.reason}
+ ))} +
+
+ )} +
+
+ + {/* Trace / log */} + {c.transcript && c.transcript.length > 0 && ( + +
+ {c.transcript.map((e, i) => ( + + ))} +
+
+ )} +
+ ); +} + +function TraceStep({ e }: { e: EvalTraceEntry }) { + const meta: Record = { + thinking: { label: "thinking", cls: "text-muted-foreground" }, + text: { label: "assistant", cls: "text-foreground" }, + tool_use: { label: "tool", cls: "text-primary" }, + tool_result: { label: "result", cls: e.isError ? "text-destructive" : "text-emerald-500" }, + }; + const m = meta[e.type]; + return ( +
+ {m.label} + {e.type === "tool_use" ? ( + + {e.tool} + {e.input !== undefined && ( +
{JSON.stringify(e.input, null, 2)}
+ )} +
+ ) : ( + {e.text} + )} +
+ ); +} + +function MiniScore({ s }: { s: ScoreResult }) { + return ( + + {s.passed ? "✓" : "✕"} + + ); +} + +// ── Suite editor ──────────────────────────────────────────────────────────── +const SCORER_LABEL: Record = { + taskSuccess: "Task success (LLM-judge)", + toolCompliance: "Tool & policy compliance", + golden: "Golden match", + nfr: "NFRs (cost/latency)", +}; + +/** Display name for a score row — the judge's name for LLM-judge scores + * (so multiple judges show separately), else the scorer's label. */ +function scoreLabel(s: ScoreResult): string { + if (s.scorer === "taskSuccess" && s.label) return `Judge · ${s.label}`; + return SCORER_LABEL[s.scorer]; +} + +function freshCase(i: number): EvalCase { + return { id: `case-${Date.now()}-${i}`, prompt: "" }; +} + +/** Initial judges for the editor: the suite's judges[], else a single judge + * migrated from the legacy single-judge fields, else one empty default. */ +function initialJudges(s: EvalSuite | null | undefined): JudgeDef[] { + if (s?.judges?.length) return s.judges.map((j) => ({ ...j })); + if (s && (s.judgePrompt || s.judgeModel || s.judgePassThreshold !== undefined)) { + return [ + { + id: "j1", + name: "Task success", + ...(s.judgePrompt ? { rubric: s.judgePrompt } : {}), + ...(s.judgeModel ? { model: s.judgeModel } : {}), + ...(s.judgePassThreshold !== undefined ? { passThreshold: s.judgePassThreshold } : {}), + }, + ]; + } + return [{ id: "j1", name: "Task success" }]; +} + +function SuiteEditor({ + initial, + onCancel, + onSaved, +}: { + initial: EvalSuite | null; + onCancel: () => void; + onSaved: (s: EvalSuite) => void; +}) { + const { agents } = useAgents(); + const [name, setName] = useState(initial?.name ?? ""); + const [description, setDescription] = useState(initial?.description ?? ""); + const [agentName, setAgentName] = useState(initial?.agentName ?? ""); + const [scorers, setScorers] = useState( + initial?.scorers ?? { taskSuccess: true, toolCompliance: false, golden: false, nfr: false }, + ); + const [judges, setJudges] = useState(() => initialJudges(initial)); + const updJudge = (idx: number, patch: Partial) => + setJudges((p) => p.map((j, i) => (i === idx ? { ...j, ...patch } : j))); + const addJudge = () => + setJudges((p) => [...p, { id: `j-${p.length + 1}-${p.reduce((n, j) => n + j.id.length, 0)}`, name: `Judge ${p.length + 1}` }]); + const removeJudge = (idx: number) => setJudges((p) => p.filter((_, i) => i !== idx)); + const [passThreshold, setPassThreshold] = useState( + initial?.passThreshold !== undefined ? String(Math.round(initial.passThreshold * 100)) : "", + ); + const [cases, setCases] = useState(initial?.cases?.length ? initial.cases : [freshCase(0)]); + const [saving, setSaving] = useState(false); + const [err, setErr] = useState(null); + const [genBusy, setGenBusy] = useState(false); + const [genCount, setGenCount] = useState("5"); + const [genFocus, setGenFocus] = useState(""); + + const updCase = (idx: number, patch: Partial) => + setCases((p) => p.map((c, i) => (i === idx ? { ...c, ...patch } : c))); + + const generate = async () => { + if (!agentName) return setErr("Pick an agent first — cases are generated from it."); + setGenBusy(true); + setErr(null); + try { + const n = Math.max(1, Math.min(20, Number(genCount) || 5)); + const gen = await api.evals.generateCases(agentName, n, genFocus.trim() || undefined); + // Drop the empty starter case, then append the generated ones. + setCases((p) => [...p.filter((c) => c.prompt.trim()), ...gen]); + } catch (e) { + setErr(String(e)); + } finally { + setGenBusy(false); + } + }; + + const save = async () => { + if (!name.trim()) return setErr("Name is required"); + if (!agentName) return setErr("Pick an agent"); + setSaving(true); + setErr(null); + const body: EvalSuiteInput = { + name: name.trim(), + description: description.trim() || undefined, + agentName, + scorers, + judges: judges.map((j) => ({ + id: j.id, + name: j.name.trim() || "Judge", + ...(j.rubric?.trim() ? { rubric: j.rubric.trim() } : {}), + ...(j.model?.trim() ? { model: j.model.trim() } : {}), + ...(j.passThreshold !== undefined && !Number.isNaN(j.passThreshold) + ? { passThreshold: Math.max(0, Math.min(1, j.passThreshold)) } + : {}), + })), + passThreshold: passThreshold.trim() ? Math.max(0, Math.min(100, Number(passThreshold))) / 100 : undefined, + cases: cases.filter((c) => c.prompt.trim()), + }; + try { + const saved = initial ? await api.evals.updateSuite(initial._id, body) : await api.evals.createSuite(body); + onSaved(saved); + } catch (e) { + setErr(String(e)); + } finally { + setSaving(false); + } + }; + + return ( +
+

{initial ? "Edit suite" : "New simulation suite"}

+ {err &&
{err}
} + +
+ + setName(e.target.value)} className={inputCls} placeholder="e.g. Customer-support smoke" /> + + + setDescription(e.target.value)} className={inputCls} /> + + + + +
+ +
+ {(Object.keys(SCORER_LABEL) as (keyof ScorerConfig)[]).map((k) => ( +
+ {SCORER_LABEL[k]} + setScorers((p) => ({ ...p, [k]: v }))} /> +
+ ))} + + setPassThreshold(e.target.value)} className={inputCls} placeholder="e.g. 80 — run is green only if pass-rate ≥ this" /> + +
+ + {scorers.taskSuccess && ( +
+ Add judge} + > +

+ Each judge scores every case from the agent's whole trace (tool calls + steps) and output, returns a 0–1 score, and is shown separately in results. Custom rubric vars:{" "} + {"{{prompt}} {{criteria}} {{output}} {{trace}} {{tools}} {{golden}}"}. +

+ {judges.map((j, i) => ( +
+
+ updJudge(i, { name: e.target.value })} className={cn(inputCls, "flex-1")} placeholder="Judge name (e.g. Correctness, Safety, Tone)" /> + updJudge(i, { model: e.target.value || undefined })} className={cn(inputCls, "w-40")} placeholder="model (optional)" /> + updJudge(i, { passThreshold: e.target.value ? Number(e.target.value) : undefined })} + className={cn(inputCls, "w-16 text-center")} + placeholder="0.5" + title="pass threshold 0–1" + /> + {judges.length > 1 && ( + + )} +
+