diff --git a/agent/3-context.ts b/agent/3-context.ts index ac51d0f..5d2f0b8 100644 --- a/agent/3-context.ts +++ b/agent/3-context.ts @@ -13,17 +13,3 @@ export function createContext(task: string): ChatCompletionMessageParam[] { { role: "user", content: task }, ]; } - -// Drop old tool messages if context grows too large. -// Always keep: the system prompt and the original user task. -export function trimContext( - messages: ChatCompletionMessageParam[], - maxMessages: number -): ChatCompletionMessageParam[] { - if (messages.length <= maxMessages) return messages; - - const [system, user] = messages; - const rest = messages.slice(2); - const trimmed = rest.slice(rest.length - (maxMessages - 2)); - return [system, user, ...trimmed]; -} diff --git a/agent/4-guardrails.ts b/agent/4-guardrails.ts index 565ac1d..e69de29 100644 --- a/agent/4-guardrails.ts +++ b/agent/4-guardrails.ts @@ -1,58 +0,0 @@ -import type { ChatCompletionMessageParam } from "openai/resources/chat/completions"; - -type GuardrailInput = { - iterations: number; - messages: ChatCompletionMessageParam[]; -}; - -export type GuardrailResult = { ok: true } | { ok: false; reason: string }; -export type GuardrailFn = (input: GuardrailInput) => GuardrailResult; - -// ── Individual guardrails ───────────────────── - -// Stop after too many iterations — prevents infinite loops -const maxIterations = - (limit: number): GuardrailFn => - ({ iterations }) => - iterations >= limit - ? { ok: false, reason: `Guardrail: reached iteration limit (${limit})` } - : { ok: true }; - -// Stop if context has ballooned unexpectedly -const maxMessages = - (limit: number): GuardrailFn => - ({ messages }) => - messages.length > limit - ? { ok: false, reason: `Guardrail: context too large (${messages.length} messages)` } - : { ok: true }; - -// ── Compose into one fn ─────────────────────── - -export function combineGuardrails(...fns: GuardrailFn[]): GuardrailFn { - return (input) => { - for (const check of fns) { - const result = check(input); - if (!result.ok) return result; - } - return { ok: true }; - }; -} - -// Stop after successful upvote -export const stopAfterUpvote = - (getUpvotedStory: () => { id: string; title?: string; rank?: number } | null): GuardrailFn => - () => { - const story = getUpvotedStory(); - if (story) { - const storyInfo = story.title && story.rank - ? `"${story.title}" (rank ${story.rank})` - : `story ID ${story.id}`; - return { ok: false, reason: `Successfully upvoted ${storyInfo}` }; - } - return { ok: true }; - }; - -export const defaultGuardrails = combineGuardrails( - maxIterations(15), - maxMessages(50) -); diff --git a/agent/5-loop.ts b/agent/5-loop.ts index 67f8cfa..d896031 100644 --- a/agent/5-loop.ts +++ b/agent/5-loop.ts @@ -1,7 +1,5 @@ import type { ChatCompletionMessageParam } from "openai/resources/chat/completions"; import { client } from "./2-model.js"; -import { trimContext } from "./3-context.js"; -import type { GuardrailFn } from "./4-guardrails.js"; import type { ToolRegistry } from "./1-tools.js"; const MAX_CONTEXT_MESSAGES = 20; @@ -19,7 +17,6 @@ export type LoopIteration = { outcome: "tool_calls" | "answer"; toolEvents: ToolEvent[]; // empty if outcome is "answer" contextSize: number; // how many messages were in context for this call - contextTrimmed: boolean; // true if we dropped old messages before this call }; export type LoopResult = { @@ -29,33 +26,17 @@ export type LoopResult = { stoppedBy: "model" | "guardrail" | "success"; }; -export type LoginHandler = () => Promise; export async function runLoop( model: string, messages: ChatCompletionMessageParam[], - guardrail: GuardrailFn, tools: ToolRegistry, // injected by the harness, not imported globally - loginHandler?: LoginHandler // optional callback to handle login redirects ): Promise { const trace: LoopIteration[] = []; while (true) { const iterationIndex = trace.length + 1; - // ── Context management ──────────────────── - const beforeTrim = messages.length; - messages = trimContext(messages, MAX_CONTEXT_MESSAGES); - const contextTrimmed = messages.length < beforeTrim; - - // ── Guardrails check ────────────────────── - const check = guardrail({ iterations: trace.length, messages }); - if (!check.ok) { - // Check if this is a success completion (reason starts with "Successfully") - const stoppedBy = check.reason.startsWith("Successfully") ? "success" : "guardrail"; - return { answer: check.reason, iterations: trace.length, trace, stoppedBy }; - } - // ── Model call ──────────────────────────── process.stdout.write(`[iter ${iterationIndex}] calling model... `); const response = await client.chat.completions.create({ @@ -72,7 +53,7 @@ export async function runLoop( // ── Final answer ────────────────────────── if (choice.finish_reason === "stop") { - trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize, contextTrimmed }); + trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize }); return { answer: choice.message.content ?? "(no response)", iterations: trace.length, @@ -104,20 +85,7 @@ export async function runLoop( messages.push({ role: "tool", tool_call_id: call.id, content: result }); } - // ── Check for login redirect after tool execution ─── - if (loginHandler) { - const loginEvent = await loginHandler(); - if (loginEvent) { - toolEvents.push(loginEvent); - // Add a system message to inform the agent that login was handled - messages.push({ - role: "user", - content: "Authentication completed by harness. You are now logged in. Navigate back to https://news.ycombinator.com and complete your upvote task.", - }); - } - } - - trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize, contextTrimmed }); + trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize }); } } } diff --git a/agent/6-harness.ts b/agent/6-harness.ts index 0975809..e69de29 100644 --- a/agent/6-harness.ts +++ b/agent/6-harness.ts @@ -1,169 +0,0 @@ -import { BrowserSession } from "./browser.js"; -import { createTools } from "./1-tools.js"; -import { createContext } from "./3-context.js"; -import { combineGuardrails, defaultGuardrails, stopAfterUpvote } from "./4-guardrails.js"; -import { runLoop } from "./5-loop.js"; -import type { LoopResult, ToolEvent } from "./5-loop.js"; - -export type VerifyResult = { - passed: boolean; - reason: string; -}; - -export type HarnessExecutionResult = LoopResult & { - task: string; - model: string; -}; - -export type HarnessOptions = { - verify?: (result: HarnessExecutionResult) => VerifyResult; - maxAttempts?: number; -}; - -export type HarnessResult = HarnessExecutionResult & { - attempts: number; - verification: VerifyResult | null; -}; - -export async function runHarness( - task: string, - model: string, - options: HarnessOptions = {} -): Promise { - const maxAttempts = options.maxAttempts ?? 1; - let latestResult: HarnessResult | null = null; - - for (let attempt = 1; attempt <= maxAttempts; attempt++) { - const result = await runHarnessAttempt(task, model); - const verification = options.verify ? options.verify(result) : null; - - latestResult = { ...result, attempts: attempt, verification }; - - if (verification?.passed || attempt === maxAttempts) { - return latestResult; - } - - console.log(`\nAttempt ${attempt} failed — retrying (${attempt + 1}/${maxAttempts})...\n`); - } - - throw new Error("Harness finished without producing a result"); -} - -export function verifySuccessfulUpvote(result: HarnessExecutionResult): VerifyResult { - const successfulUpvote = result.trace - .flatMap((iter) => iter.toolEvents) - .find( - (e) => - e.tool === "browser_click" && - /up_/.test(JSON.stringify(e.args)) && - /news\.ycombinator\.com\/(news)?$/.test(e.result.split("now at ")[1]?.trim() ?? "") - ); - - return { - passed: !!successfulUpvote, - reason: successfulUpvote - ? `Upvote click confirmed — landed on ${successfulUpvote.result.split("now at ")[1]}` - : "No successful upvote click found in trace (all arrows may be hidden, or login failed)", - }; -} - -export function printHarnessResult(result: HarnessResult): void { - console.log("\n─── Agent trace ───────────────────────────\n"); - - for (const iteration of result.trace) { - const trimNote = iteration.contextTrimmed ? " ✂ context trimmed" : ""; - const ctx = `[ctx: ${iteration.contextSize} msgs${trimNote}]`; - - if (iteration.outcome === "tool_calls") { - console.log(`[iter ${iteration.index}] ${iteration.toolEvents.length} tool call(s) ${ctx}`); - for (const event of iteration.toolEvents) { - console.log(` → ${event.tool}(${JSON.stringify(event.args)})`); - console.log(` ${event.result.slice(0, 120)}${event.result.length > 120 ? "…" : ""}`); - } - } else { - console.log(`[iter ${iteration.index}] answered ${ctx}`); - } - console.log(); - } - - console.log("─── Result ────────────────────────────────\n"); - console.log(result.answer); - console.log(`\nStopped by: ${result.stoppedBy} after ${result.iterations} iteration(s)`); - console.log(`Attempts: ${result.attempts}`); - - if (result.verification) { - const { passed, reason } = result.verification; - console.log(`Verify: ${passed ? "✓ PASS" : "✗ FAIL"} — ${reason}`); - } -} - -async function runHarnessAttempt( - task: string, - model: string -): Promise { - // Open the environment — each run gets its own isolated browser page - const session = new BrowserSession(); - await session.open(); - - try { - const messages = createContext(task); // fresh context for this task - - // Track upvoted story - let upvotedStory: { id: string; title?: string; rank?: number } | null = null; - let storiesData: any[] = []; - - // Create tools with hooks to track upvote success and story data - const tools = createTools(session, { - onUpvoteSuccess: (storyId) => { - const story = storiesData.find(s => s.id === storyId); - upvotedStory = story - ? { id: storyId, title: story.title, rank: story.rank } - : { id: storyId }; - console.log(`\n[harness] Upvote successful for story ID ${storyId} — forcing completion\n`); - }, - onStoriesLoaded: (stories) => { - storiesData = stories; - }, - }); - - // Login handler checks for redirects after each tool execution - const loginHandler = async (): Promise => { - const currentUrl = await session.getUrl(); - const isLoginPage = currentUrl.includes("login") || currentUrl.includes("vote"); - - if (!isLoginPage) return null; - - console.log("\n[harness] Login redirect detected — handling automatically..."); - - try { - await session.fill("input[name='acct']", "tejasthrowaway"); - await session.fill("input[name='pw']", "tejasthrowaway"); - await session.click("input[type='submit']"); - - console.log("[harness] Login completed — agent can continue\n"); - - return { - tool: "harness_auto_login", - args: {}, - result: `Harness automatically handled login at ${currentUrl}. You are now authenticated and back at ${await session.getUrl()}.`, - }; - } catch (err) { - console.log(`[harness] Login failed: ${err instanceof Error ? err.message : String(err)}\n`); - return null; - } - }; - - // Combine default guardrails with upvote completion check - const guardrails = combineGuardrails( - stopAfterUpvote(() => upvotedStory), - defaultGuardrails - ); - - const result = await runLoop(model, messages, guardrails, tools, loginHandler); - - return { task, model, ...result }; - } finally { - // Always close the environment — even if the loop threw - await session.close(); - } -} diff --git a/agent/7-index.ts b/agent/7-index.ts index 099cfcb..7a97e8d 100644 --- a/agent/7-index.ts +++ b/agent/7-index.ts @@ -1,4 +1,7 @@ -import { printHarnessResult, runHarness, verifySuccessfulUpvote } from "./6-harness.js"; +import { createTools } from "./1-tools.js"; +import { createContext } from "./3-context.js"; +import { runLoop } from "./5-loop.js"; +import { BrowserSession } from "./browser.js"; // try a shitty model const MODEL = "openai/gpt-3.5-turbo-0613"; @@ -15,5 +18,18 @@ Click its upvote arrow using the exact selector: a[id="up_STORYID"] (replace STO console.log(`Model: ${MODEL}`); console.log(`Task: upvote on Hacker News\n`); -const result = await runHarness(TASK, MODEL, { verify: verifySuccessfulUpvote, maxAttempts: 3 }); -printHarnessResult(result); +const session = new BrowserSession(); + +try { + await session.open(); + + const tools = createTools(session); + const messages = createContext(TASK); + const result = await runLoop(MODEL, messages, tools); + + console.log(`\nAnswer: ${result.answer}`); + console.log(`Stopped by: ${result.stoppedBy}`); + console.log(`Iterations: ${result.iterations}`); +} finally { + await session.close(); +}