diff --git a/.github/workflows/runledger.yml b/.github/workflows/runledger.yml new file mode 100644 index 0000000..807298b --- /dev/null +++ b/.github/workflows/runledger.yml @@ -0,0 +1,24 @@ +name: runledger-gate +on: + pull_request: + +jobs: + evals: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install RunLedger + run: | + python -m pip install --upgrade pip + python -m pip install runledger + - name: Run deterministic evals (replay) + run: runledger run evals/runledger --mode replay --baseline baselines/runledger-demo.json + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: runledger-artifacts + path: runledger_out/** diff --git a/.gitignore b/.gitignore index 3a1d45d..cc00bfd 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ yarn.lock test-injection/ notepad.md oauth-success.html +runledger_out/ diff --git a/README.md b/README.md index 9a2402a..a4f38cb 100644 --- a/README.md +++ b/README.md @@ -954,3 +954,14 @@ I have no affiliation with any project or model mentioned here. This is purely p - Fun fact: That PR was discovered and fixed thanks to OhMyOpenCode's Librarian, Explore, and Oracle setup. *Special thanks to [@junhoyeo](https://github.com/junhoyeo) for this amazing hero image.* + +## RunLedger CI gate + +This repo includes a deterministic CI gate for tool-using agents: + +```bash +runledger run evals/runledger --mode replay --baseline baselines/runledger-demo.json +``` + +It replays recorded tool calls and fails the PR on schema/tool/budget regressions. 
+ diff --git a/baselines/runledger-demo.json b/baselines/runledger-demo.json new file mode 100644 index 0000000..56bb421 --- /dev/null +++ b/baselines/runledger-demo.json @@ -0,0 +1,113 @@ +{ + "aggregates": { + "cases_error": 0, + "cases_fail": 0, + "cases_pass": 1, + "cases_total": 1, + "metrics": { + "cost_usd": { + "max": null, + "mean": null, + "min": null, + "p50": null, + "p95": null + }, + "steps": { + "max": null, + "mean": null, + "min": null, + "p50": null, + "p95": null + }, + "tokens_in": { + "max": null, + "mean": null, + "min": null, + "p50": null, + "p95": null + }, + "tokens_out": { + "max": null, + "mean": null, + "min": null, + "p50": null, + "p95": null + }, + "tool_calls": { + "max": 1.0, + "mean": 1.0, + "min": 1.0, + "p50": 1.0, + "p95": 1.0 + }, + "tool_errors": { + "max": 0.0, + "mean": 0.0, + "min": 0.0, + "p50": 0.0, + "p95": 0.0 + }, + "wall_ms": { + "max": 51.0, + "mean": 51.0, + "min": 51.0, + "p50": 51.0, + "p95": 51.0 + } + }, + "pass_rate": 1.0 + }, + "cases": [ + { + "assertions": { + "failed": 0, + "total": 2 + }, + "cost_usd": null, + "failed_assertions": null, + "failure_reason": null, + "id": "t1", + "replay": { + "cassette_path": "evals/runledger/cassettes/t1.jsonl", + "cassette_sha256": "3ca88ba1cde6952e1927e83ba82cc13948f96157d200ef01a7a15b5e586883e5" + }, + "status": "pass", + "steps": null, + "tokens_in": null, + "tokens_out": null, + "tool_calls": 1, + "tool_calls_by_name": { + "search_docs": 1 + }, + "tool_errors": 0, + "tool_errors_by_name": {}, + "wall_ms": 51 + } + ], + "generated_at": "2025-12-19T12:11:00.657302Z", + "policy_snapshot": { + "thresholds": { + "min_pass_rate": 1.0 + } + }, + "run": { + "ci": null, + "exit_status": "success", + "git_sha": null, + "mode": "replay", + "run_id": "20251219-121100-485aac" + }, + "runledger_version": "0.1.0", + "schema_version": 1, + "suite": { + "agent_command": [ + "python", + "evals/runledger/agent/agent.py" + ], + "cases_total": 1, + "name": "runledger-demo", + 
"""Minimal demo agent used by the RunLedger replay suite.

Reads newline-delimited JSON messages from stdin and answers on stdout:

* ``task_start``  -> emits one ``tool_call`` to ``search_docs`` whose query is
  the ticket text taken from the message's ``input`` object.
* ``tool_result`` -> emits a fixed ``final_output`` and stops reading.

Messages of any other type are ignored.
"""
import json
import sys


def send(payload):
    """Write *payload* to stdout as one JSON line and flush immediately.

    Args:
        payload: Any JSON-serializable object.
    """
    sys.stdout.write(json.dumps(payload) + "\n")
    # Flush after every message; presumably the harness reads our stdout
    # through a pipe and waits for each line - TODO confirm.
    sys.stdout.flush()


def main():
    """Run the stdin/stdout message loop until a ``tool_result`` arrives or EOF.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    for raw_line in sys.stdin:
        line = raw_line.strip()
        if not line:
            continue
        msg = json.loads(line)
        msg_type = msg.get("type")
        if msg_type == "task_start":
            # "input" may be absent, null, or a non-object value; the previous
            # `msg.get("input", {}).get(...)` only handled the absent case and
            # raised AttributeError on the others. Fall back to an empty ticket.
            task_input = msg.get("input")
            ticket = task_input.get("ticket", "") if isinstance(task_input, dict) else ""
            send({"type": "tool_call", "name": "search_docs", "call_id": "c1", "args": {"q": ticket}})
        elif msg_type == "tool_result":
            send({"type": "final_output", "output": {"category": "account", "reply": "Reset password instructions sent."}})
            break


if __name__ == "__main__":
    main()
a/evals/runledger/suite.yaml b/evals/runledger/suite.yaml new file mode 100644 index 0000000..3a39fec --- /dev/null +++ b/evals/runledger/suite.yaml @@ -0,0 +1,18 @@ +suite_name: runledger-demo +agent_command: +- python +- agent/agent.py +mode: replay +cases_path: cases +tool_registry: +- search_docs +assertions: +- type: json_schema + schema_path: schema.json +budgets: + max_wall_ms: 20000 + max_tool_calls: 1 + max_tool_errors: 0 +regression: + min_pass_rate: 1.0 +baseline_path: ../../baselines/runledger-demo.json