This commit is contained in:
Tejas Kumar
2026-04-02 11:24:46 +02:00
parent 7fad7f0f97
commit c85b0b5cd5
2 changed files with 120 additions and 3 deletions
+115 -1
View File
@@ -5,12 +5,120 @@ import { defaultGuardrails } from "./4-guardrails.js";
import { runLoop } from "./5-loop.js";
import type { LoopResult } from "./5-loop.js";
/**
 * Outcome of checking one harness attempt with a verifier function.
 */
export type VerifyResult = {
/** True when the attempt satisfied the verifier. */
passed: boolean;
/** Human-readable explanation; `runHarness` reports it as the answer when `passed` is false. */
reason: string;
/** When truthy on a failed check, `runHarness` returns immediately instead of retrying. */
fatal?: boolean;
};
/**
 * Result of a single harness attempt: everything in `LoopResult` plus the
 * `task` and `model` strings.
 */
export type HarnessExecutionResult = LoopResult & {
// NOTE(review): presumably the task text the attempt was run with; confirm in runHarnessAttempt.
task: string;
// NOTE(review): presumably the model identifier the attempt was run with; confirm in runHarnessAttempt.
model: string;
};
/**
 * Optional settings accepted by `runHarness`.
 */
export type HarnessOptions = {
/** Called with each attempt's result; when omitted, the first attempt is returned unverified. */
verify?: (result: HarnessExecutionResult) => VerifyResult;
/** Upper bound on attempts; `runHarness` falls back to 1 when this is not set. */
maxAttempts?: number;
};
/**
 * What `runHarness` resolves to: the returned attempt's result plus retry
 * and verification bookkeeping.
 */
export type HarnessResult = HarnessExecutionResult & {
/** 1-based number of the attempt whose result is being returned. */
attempts: number;
/** Verifier output for the returned attempt, or null when no verifier was supplied. */
verification: VerifyResult | null;
};
/**
 * Runs `task` against `model`, optionally verifying each attempt and retrying
 * while verification fails.
 *
 * An attempt's result is returned as soon as one of these holds:
 * - no `verify` callback was supplied,
 * - the verifier reports `passed`,
 * - the verifier reports a `fatal` failure,
 * - the attempt limit has been reached.
 *
 * When the returned attempt failed verification, `answer` is replaced by the
 * verifier's `reason`.
 *
 * @param task - Task text forwarded to each attempt.
 * @param model - Model identifier forwarded to each attempt.
 * @param options - Optional verifier and attempt limit. `maxAttempts` defaults
 *   to 1; fractional values are rounded down, and values below 1 or NaN are
 *   treated as 1 so that at least one attempt always runs.
 * @returns The result of the last attempt that ran, with the attempt count and
 *   the verifier output (or null when no verifier was supplied).
 */
export async function runHarness(
  task: string,
  model: string,
  options: HarnessOptions = {}
): Promise<HarnessResult> {
  // Normalise the limit: previously 0, negatives, NaN or fractions such as 2.5
  // never matched `attempt === maxAttempts`, so the loop fell through and threw
  // even after attempts had produced results.
  const requestedAttempts = options.maxAttempts ?? 1;
  const maxAttempts = Number.isNaN(requestedAttempts)
    ? 1
    : Math.max(1, Math.floor(requestedAttempts));

  for (let attempt = 1; ; attempt++) {
    const result = await runHarnessAttempt(task, model);
    const verification = options.verify ? options.verify(result) : null;

    // Surface the verifier's explanation instead of the agent's answer on failure.
    const answer =
      verification && !verification.passed ? verification.reason : result.answer;
    const latestResult: HarnessResult = { ...result, answer, attempts: attempt, verification };

    const shouldStop =
      !verification || verification.passed || verification.fatal || attempt >= maxAttempts;
    if (shouldStop) {
      return latestResult;
    }

    console.log(`\nAttempt ${attempt} failed - retrying (${attempt + 1}/${maxAttempts})...\n`);
  }
}
/**
 * Verifier that scans a run's tool events for evidence of a Hacker News upvote.
 *
 * Checks are applied in this order, each over every tool event in the trace:
 * 1. A `browser_click` whose serialized args contain `up_` and whose result
 *    text after "now at " ends in `news.ycombinator.com/` or
 *    `news.ycombinator.com/news` -> passed.
 * 2. A `harness_auto_login` event whose result starts with
 *    "Harness failed to handle login at " -> fatal failure.
 * 3. Any event other than `harness_auto_login` whose result contains a URL
 *    that `isLoginUrl` accepts -> fatal failure.
 * Otherwise a non-fatal failure is returned.
 *
 * @param result - The attempt whose trace is inspected.
 * @returns The verification verdict and its reason.
 */
export function verifySuccessfulUpvote(result: HarnessExecutionResult): VerifyResult {
  // All three checks look at the same flattened list of tool events.
  const toolEvents = result.trace.flatMap((iteration) => iteration.toolEvents);

  // Text following the first "now at " marker, or undefined when the marker is absent.
  const landingTextOf = (text: string) => text.split("now at ")[1];

  const upvoteClick = toolEvents.find((event) => {
    if (event.tool !== "browser_click") {
      return false;
    }
    if (!/up_/.test(JSON.stringify(event.args))) {
      return false;
    }
    const landedAt = landingTextOf(event.result)?.trim() ?? "";
    return /news\.ycombinator\.com\/(news)?$/.test(landedAt);
  });
  if (upvoteClick) {
    return {
      passed: true,
      reason: `Upvote click confirmed - landed on ${landingTextOf(upvoteClick.result)}`,
    };
  }

  const loginFailure = toolEvents.find((event) => {
    const isAutoLogin = event.tool === "harness_auto_login";
    return isAutoLogin && event.result.startsWith("Harness failed to handle login at ");
  });
  if (loginFailure) {
    return { passed: false, reason: loginFailure.result, fatal: true };
  }

  const loginRedirect = toolEvents.find((event) => {
    if (event.tool === "harness_auto_login") {
      return false;
    }
    return isLoginUrl(extractUrl(event.result));
  });
  if (loginRedirect) {
    const redirectUrl = extractUrl(loginRedirect.result);
    return {
      passed: false,
      reason: `Hit login screen instead of completing the upvote (${redirectUrl})`,
      fatal: true,
    };
  }

  return { passed: false, reason: "No successful upvote click found in trace" };
}
/**
 * Pulls the first http(s) URL out of a piece of text.
 *
 * @param result - Text to search.
 * @returns The first run of non-whitespace characters starting with
 *   `http://` or `https://`, or null when there is none.
 */
function extractUrl(result: string): string | null {
  const firstUrl = result.match(/https?:\/\/\S+/)?.[0];
  return firstUrl ?? null;
}
/**
 * Tells whether a URL contains one of the path fragments this file treats as
 * a login screen ("/login" or "/vote").
 *
 * @param url - URL to test; null and the empty string are never login URLs.
 * @returns True when `url` contains "/login" or "/vote".
 */
function isLoginUrl(url: string | null): boolean {
  if (!url) {
    return false;
  }
  return ["/login", "/vote"].some((fragment) => url.includes(fragment));
}
async function runHarnessAttempt(
task: string,
model: string
): Promise<HarnessExecutionResult> {
@@ -27,7 +135,7 @@ export async function runHarness(
}
}
export function printHarnessResult(result: HarnessExecutionResult): void {
export function printHarnessResult(result: HarnessResult): void {
console.log("\n--- Agent trace ---\n");
for (const iteration of result.trace) {
@@ -50,4 +158,10 @@ export function printHarnessResult(result: HarnessExecutionResult): void {
console.log("--- Result ---\n");
console.log(result.answer);
console.log(`\nStopped by: ${result.stoppedBy} after ${result.iterations} iteration(s)`);
console.log(`Attempts: ${result.attempts}`);
if (result.verification) {
const status = result.verification.passed ? "PASS" : "FAIL";
console.log(`Verify: ${status} - ${result.verification.reason}`);
}
}
+5 -2
View File
@@ -1,4 +1,4 @@
import { printHarnessResult, runHarness } from "./6-harness.js";
import { printHarnessResult, runHarness, verifySuccessfulUpvote } from "./6-harness.js";
// try a shitty model
const MODEL = "openai/gpt-3.5-turbo-0613";
@@ -15,5 +15,8 @@ Click its upvote arrow using the exact selector: a[id="up_STORYID"] (replace STO
console.log(`Model: ${MODEL}`);
console.log(`Task: upvote on Hacker News\n`);
const result = await runHarness(TASK, MODEL);
const result = await runHarness(TASK, MODEL, {
verify: verifySuccessfulUpvote,
maxAttempts: 3,
});
printHarnessResult(result);