This commit is contained in:
Tejas Kumar
2026-04-02 11:06:18 +02:00
parent b29905a3ac
commit eeb5f0ff9d
5 changed files with 21 additions and 278 deletions
-14
View File
@@ -13,17 +13,3 @@ export function createContext(task: string): ChatCompletionMessageParam[] {
{ role: "user", content: task }, { role: "user", content: task },
]; ];
} }
/**
 * Shrinks an over-long conversation to at most `maxMessages` entries.
 *
 * The first two messages (system prompt and original user task) are always
 * kept; whatever budget remains is filled with the most recent messages.
 * `tool` messages left at the front of the kept window are dropped as well:
 * the assistant message that issued their tool calls has been trimmed away,
 * and the chat completions API rejects tool results that do not follow one.
 *
 * @param messages - Full conversation, oldest first. Not mutated.
 * @param maxMessages - Upper bound on the returned length. The two pinned
 *   messages are kept even when the bound is smaller than two.
 * @returns `messages` itself when it already fits, otherwise a new array.
 */
export function trimContext(
  messages: ChatCompletionMessageParam[],
  maxMessages: number
): ChatCompletionMessageParam[] {
  if (messages.length <= maxMessages) return messages;

  // slice() instead of destructuring so a one-message context never yields `undefined`.
  const pinned = messages.slice(0, 2);
  const budget = maxMessages - pinned.length;

  // slice(-0) would return every message, so an empty budget is handled explicitly.
  const recent = budget > 0 ? messages.slice(2).slice(-budget) : [];

  // Skip tool results whose originating assistant message fell outside the window.
  const firstUsable = recent.findIndex((message) => message.role !== "tool");
  const tail = firstUsable === -1 ? [] : recent.slice(firstUsable);

  return [...pinned, ...tail];
}
-58
View File
@@ -1,58 +0,0 @@
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
// Snapshot of loop state handed to each guardrail check.
type GuardrailInput = {
// Iteration count supplied by the caller of the guardrail.
iterations: number;
// Conversation messages at the time of the check.
messages: ChatCompletionMessageParam[];
};
// Verdict of a check: `ok: true` passes; `ok: false` carries the reason for stopping.
export type GuardrailResult = { ok: true } | { ok: false; reason: string };
// A guardrail is a synchronous function from loop state to a verdict.
export type GuardrailFn = (input: GuardrailInput) => GuardrailResult;
// ── Individual guardrails ─────────────────────
// Builds a guardrail that fails once `limit` iterations have been reached,
// so a run cannot loop forever.
const maxIterations = (limit: number): GuardrailFn => {
  return ({ iterations }) => {
    if (iterations >= limit) {
      return { ok: false, reason: `Guardrail: reached iteration limit (${limit})` };
    }
    return { ok: true };
  };
};
// Builds a guardrail that fails when the conversation holds more than
// `limit` messages.
const maxMessages = (limit: number): GuardrailFn => {
  return ({ messages }) => {
    if (messages.length > limit) {
      return { ok: false, reason: `Guardrail: context too large (${messages.length} messages)` };
    }
    return { ok: true };
  };
};
// ── Compose into one fn ───────────────────────
/**
 * Merges several guardrails into one.
 *
 * Checks run in the order given and evaluation stops at the first failing
 * verdict, which is returned as-is. If every check passes (or none were
 * supplied) the combined guardrail passes.
 */
export function combineGuardrails(...fns: GuardrailFn[]): GuardrailFn {
  return (input) => {
    let failure: GuardrailResult | undefined;
    // some() stops iterating as soon as a check reports a failure.
    fns.some((guard) => {
      const verdict = guard(input);
      if (!verdict.ok) failure = verdict;
      return failure !== undefined;
    });
    return failure ?? { ok: true };
  };
}
/**
 * Builds a guardrail that stops the run once an upvote has been recorded.
 *
 * `getUpvotedStory` is polled on every check; while it returns `null` the
 * guardrail passes. Once it returns a story, the guardrail fails with a reason
 * that names the story by title and rank when both are truthy, otherwise by id.
 */
export const stopAfterUpvote = (
  getUpvotedStory: () => { id: string; title?: string; rank?: number } | null
): GuardrailFn => {
  return () => {
    const upvoted = getUpvotedStory();
    if (!upvoted) return { ok: true };

    const { id, title, rank } = upvoted;
    let label = `story ID ${id}`;
    if (title && rank) {
      label = `"${title}" (rank ${rank})`;
    }
    return { ok: false, reason: `Successfully upvoted ${label}` };
  };
};
/** Baseline guardrail set: at most 15 iterations and at most 50 messages in context. */
export const defaultGuardrails: GuardrailFn = combineGuardrails(maxIterations(15), maxMessages(50));
+2 -34
View File
@@ -1,7 +1,5 @@
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions"; import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
import { client } from "./2-model.js"; import { client } from "./2-model.js";
import { trimContext } from "./3-context.js";
import type { GuardrailFn } from "./4-guardrails.js";
import type { ToolRegistry } from "./1-tools.js"; import type { ToolRegistry } from "./1-tools.js";
const MAX_CONTEXT_MESSAGES = 20; const MAX_CONTEXT_MESSAGES = 20;
@@ -19,7 +17,6 @@ export type LoopIteration = {
outcome: "tool_calls" | "answer"; outcome: "tool_calls" | "answer";
toolEvents: ToolEvent[]; // empty if outcome is "answer" toolEvents: ToolEvent[]; // empty if outcome is "answer"
contextSize: number; // how many messages were in context for this call contextSize: number; // how many messages were in context for this call
contextTrimmed: boolean; // true if we dropped old messages before this call
}; };
export type LoopResult = { export type LoopResult = {
@@ -29,33 +26,17 @@ export type LoopResult = {
stoppedBy: "model" | "guardrail" | "success"; stoppedBy: "model" | "guardrail" | "success";
}; };
export type LoginHandler = () => Promise<ToolEvent | null>;
export async function runLoop( export async function runLoop(
model: string, model: string,
messages: ChatCompletionMessageParam[], messages: ChatCompletionMessageParam[],
guardrail: GuardrailFn,
tools: ToolRegistry, // injected by the harness, not imported globally tools: ToolRegistry, // injected by the harness, not imported globally
loginHandler?: LoginHandler // optional callback to handle login redirects
): Promise<LoopResult> { ): Promise<LoopResult> {
const trace: LoopIteration[] = []; const trace: LoopIteration[] = [];
while (true) { while (true) {
const iterationIndex = trace.length + 1; const iterationIndex = trace.length + 1;
// ── Context management ────────────────────
const beforeTrim = messages.length;
messages = trimContext(messages, MAX_CONTEXT_MESSAGES);
const contextTrimmed = messages.length < beforeTrim;
// ── Guardrails check ──────────────────────
const check = guardrail({ iterations: trace.length, messages });
if (!check.ok) {
// Check if this is a success completion (reason starts with "Successfully")
const stoppedBy = check.reason.startsWith("Successfully") ? "success" : "guardrail";
return { answer: check.reason, iterations: trace.length, trace, stoppedBy };
}
// ── Model call ──────────────────────────── // ── Model call ────────────────────────────
process.stdout.write(`[iter ${iterationIndex}] calling model... `); process.stdout.write(`[iter ${iterationIndex}] calling model... `);
const response = await client.chat.completions.create({ const response = await client.chat.completions.create({
@@ -72,7 +53,7 @@ export async function runLoop(
// ── Final answer ────────────────────────── // ── Final answer ──────────────────────────
if (choice.finish_reason === "stop") { if (choice.finish_reason === "stop") {
trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize, contextTrimmed }); trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize });
return { return {
answer: choice.message.content ?? "(no response)", answer: choice.message.content ?? "(no response)",
iterations: trace.length, iterations: trace.length,
@@ -104,20 +85,7 @@ export async function runLoop(
messages.push({ role: "tool", tool_call_id: call.id, content: result }); messages.push({ role: "tool", tool_call_id: call.id, content: result });
} }
// ── Check for login redirect after tool execution ─── trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize });
if (loginHandler) {
const loginEvent = await loginHandler();
if (loginEvent) {
toolEvents.push(loginEvent);
// Add a system message to inform the agent that login was handled
messages.push({
role: "user",
content: "Authentication completed by harness. You are now logged in. Navigate back to https://news.ycombinator.com and complete your upvote task.",
});
}
}
trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize, contextTrimmed });
} }
} }
} }
-169
View File
@@ -1,169 +0,0 @@
import { BrowserSession } from "./browser.js";
import { createTools } from "./1-tools.js";
import { createContext } from "./3-context.js";
import { combineGuardrails, defaultGuardrails, stopAfterUpvote } from "./4-guardrails.js";
import { runLoop } from "./5-loop.js";
import type { LoopResult, ToolEvent } from "./5-loop.js";
// Verdict produced by a verifier for one finished attempt.
export type VerifyResult = {
// Whether the attempt satisfied the verifier.
passed: boolean;
// Human-readable explanation of the verdict.
reason: string;
};
// Loop outcome of a single attempt, tagged with the task and model that produced it.
export type HarnessExecutionResult = LoopResult & {
task: string;
model: string;
};
// Optional knobs for runHarness.
export type HarnessOptions = {
// Called with each attempt's result to decide whether it succeeded.
verify?: (result: HarnessExecutionResult) => VerifyResult;
// Upper bound on attempts; runHarness falls back to 1 when omitted.
maxAttempts?: number;
};
// Final harness output: the returned attempt plus retry bookkeeping.
export type HarnessResult = HarnessExecutionResult & {
// 1-based number of the attempt whose result is being returned.
attempts: number;
// Verifier verdict for that attempt, or null when no verifier was supplied.
verification: VerifyResult | null;
};
/**
 * Runs `task` against `model`, retrying until the verifier accepts an attempt
 * or the attempt budget is exhausted.
 *
 * When no verifier is supplied there is no failure signal to retry on, so the
 * first attempt is returned as-is (with `verification: null`).
 *
 * @param task - Task prompt handed to each attempt.
 * @param model - Model identifier handed to each attempt.
 * @param options - Optional verifier and attempt budget (`maxAttempts`, default 1).
 * @returns The accepted attempt, or the last attempt when none was accepted.
 * @throws RangeError when `maxAttempts` is not a positive integer. A fractional
 *   budget would otherwise never reach its final attempt and lose the result.
 */
export async function runHarness(
  task: string,
  model: string,
  options: HarnessOptions = {}
): Promise<HarnessResult> {
  const maxAttempts = options.maxAttempts ?? 1;
  if (!Number.isInteger(maxAttempts) || maxAttempts < 1) {
    throw new RangeError(`maxAttempts must be a positive integer, got ${maxAttempts}`);
  }

  for (let attempt = 1; ; attempt++) {
    const result = await runHarnessAttempt(task, model);
    const verification = options.verify ? options.verify(result) : null;
    const outcome: HarnessResult = { ...result, attempts: attempt, verification };

    // No verifier means nothing can be judged as failed, so never retry in that case.
    const accepted = verification === null || verification.passed;
    if (accepted || attempt >= maxAttempts) {
      return outcome;
    }

    console.log(`\nAttempt ${attempt} failed — retrying (${attempt + 1}/${maxAttempts})...\n`);
  }
}
/**
 * Scans the trace for evidence that an upvote click went through.
 *
 * An event counts when it is a `browser_click` whose serialized args contain
 * `up_` and whose result text, after the marker "now at ", ends on the Hacker
 * News front page URL (with or without the `news` path segment). The first
 * such event in trace order decides the verdict.
 */
export function verifySuccessfulUpvote(result: HarnessExecutionResult): VerifyResult {
  const frontPage = /news\.ycombinator\.com\/(news)?$/;
  const landingOf = (text: string): string | undefined => text.split("now at ")[1];

  for (const iteration of result.trace) {
    for (const event of iteration.toolEvents) {
      if (event.tool !== "browser_click") continue;
      if (!/up_/.test(JSON.stringify(event.args))) continue;
      if (!frontPage.test(landingOf(event.result)?.trim() ?? "")) continue;

      return {
        passed: true,
        reason: `Upvote click confirmed — landed on ${landingOf(event.result)}`,
      };
    }
  }

  return {
    passed: false,
    reason: "No successful upvote click found in trace (all arrows may be hidden, or login failed)",
  };
}
/**
 * Pretty-prints a harness run to stdout: one entry per loop iteration
 * (tool calls with truncated results, or the final answer), followed by the
 * answer, stop reason, attempt count and, when present, the verifier verdict.
 *
 * @param result - Harness output to render. Not mutated.
 */
export function printHarnessResult(result: HarnessResult): void {
  console.log("\n─── Agent trace ───────────────────────────\n");

  for (const iteration of result.trace) {
    const trimNote = iteration.contextTrimmed ? " ✂ context trimmed" : "";
    const ctx = `[ctx: ${iteration.contextSize} msgs${trimNote}]`;

    if (iteration.outcome === "tool_calls") {
      console.log(`[iter ${iteration.index}] ${iteration.toolEvents.length} tool call(s) ${ctx}`);
      for (const event of iteration.toolEvents) {
        console.log(`${event.tool}(${JSON.stringify(event.args)})`);
        // Tool results can be whole page dumps; show only the first 120 characters.
        const ellipsis = event.result.length > 120 ? "…" : "";
        console.log(` ${event.result.slice(0, 120)}${ellipsis}`);
      }
    } else {
      console.log(`[iter ${iteration.index}] answered ${ctx}`);
    }
    console.log();
  }

  console.log("─── Result ────────────────────────────────\n");
  console.log(result.answer);
  console.log(`\nStopped by: ${result.stoppedBy} after ${result.iterations} iteration(s)`);
  console.log(`Attempts: ${result.attempts}`);

  if (result.verification) {
    const { passed, reason } = result.verification;
    // Separate the marker from the reason; previously they were printed fused together.
    console.log(`Verify: ${passed ? "✓ PASS" : "✗ FAIL"} — ${reason}`);
  }
}
// Runs one end-to-end attempt: opens a browser session, builds a fresh context
// and tool set, drives the agent loop, and closes the session afterwards.
// Returns the loop result tagged with the task and model.
async function runHarnessAttempt(
task: string,
model: string
): Promise<HarnessExecutionResult> {
// Open the environment — each run gets its own isolated browser page
const session = new BrowserSession();
await session.open();
try {
const messages = createContext(task); // fresh context for this task
// Track upvoted story
// Set by the onUpvoteSuccess hook below and read by the stopAfterUpvote guardrail.
let upvotedStory: { id: string; title?: string; rank?: number } | null = null;
// Latest story list reported by the tools; used to enrich the upvoted story.
// NOTE(review): typed as any[] — the element shape (id/title/rank) is only implied by usage here.
let storiesData: any[] = [];
// Create tools with hooks to track upvote success and story data
const tools = createTools(session, {
onUpvoteSuccess: (storyId) => {
// Attach title and rank when the story is in the last loaded list; otherwise keep just the id.
const story = storiesData.find(s => s.id === storyId);
upvotedStory = story
? { id: storyId, title: story.title, rank: story.rank }
: { id: storyId };
console.log(`\n[harness] Upvote successful for story ID ${storyId} — forcing completion\n`);
},
onStoriesLoaded: (stories) => {
storiesData = stories;
},
});
// Login handler checks for redirects after each tool execution
// Resolves to a synthetic tool event when a login was performed, or null otherwise.
const loginHandler = async (): Promise<ToolEvent | null> => {
const currentUrl = await session.getUrl();
// NOTE(review): any URL containing "login" or "vote" is treated as a login page — confirm this is not too broad.
const isLoginPage = currentUrl.includes("login") || currentUrl.includes("vote");
if (!isLoginPage) return null;
console.log("\n[harness] Login redirect detected — handling automatically...");
try {
// NOTE(review): credentials are hard-coded in source — consider moving them to configuration.
await session.fill("input[name='acct']", "tejasthrowaway");
await session.fill("input[name='pw']", "tejasthrowaway");
await session.click("input[type='submit']");
console.log("[harness] Login completed — agent can continue\n");
return {
tool: "harness_auto_login",
args: {},
result: `Harness automatically handled login at ${currentUrl}. You are now authenticated and back at ${await session.getUrl()}.`,
};
} catch (err) {
// Best effort: a failed login is logged and reported as "no login handled".
console.log(`[harness] Login failed: ${err instanceof Error ? err.message : String(err)}\n`);
return null;
}
};
// Combine default guardrails with upvote completion check
// The upvote check is listed first, so it is evaluated before the default limits.
const guardrails = combineGuardrails(
stopAfterUpvote(() => upvotedStory),
defaultGuardrails
);
const result = await runLoop(model, messages, guardrails, tools, loginHandler);
return { task, model, ...result };
} finally {
// Always close the environment — even if the loop threw
await session.close();
}
}
+19 -3
View File
@@ -1,4 +1,7 @@
import { printHarnessResult, runHarness, verifySuccessfulUpvote } from "./6-harness.js"; import { createTools } from "./1-tools.js";
import { createContext } from "./3-context.js";
import { runLoop } from "./5-loop.js";
import { BrowserSession } from "./browser.js";
// try a shitty model // try a shitty model
const MODEL = "openai/gpt-3.5-turbo-0613"; const MODEL = "openai/gpt-3.5-turbo-0613";
@@ -15,5 +18,18 @@ Click its upvote arrow using the exact selector: a[id="up_STORYID"] (replace STO
console.log(`Model: ${MODEL}`); console.log(`Model: ${MODEL}`);
console.log(`Task: upvote on Hacker News\n`); console.log(`Task: upvote on Hacker News\n`);
const result = await runHarness(TASK, MODEL, { verify: verifySuccessfulUpvote, maxAttempts: 3 }); const session = new BrowserSession();
printHarnessResult(result);
try {
await session.open();
const tools = createTools(session);
const messages = createContext(TASK);
const result = await runLoop(MODEL, messages, tools);
console.log(`\nAnswer: ${result.answer}`);
console.log(`Stopped by: ${result.stoppedBy}`);
console.log(`Iterations: ${result.iterations}`);
} finally {
await session.close();
}