Simplify

2026-06-13 19:20:06 +00:00 · 2026-04-02 11:06:18 +02:00
parent b29905a3ac
commit eeb5f0ff9d
5 changed files with 21 additions and 278 deletions
@@ -13,17 +13,3 @@ export function createContext(task: string): ChatCompletionMessageParam[] {
    { role: "user", content: task },
  ];
 }
 /**
  * Bound the context size by dropping the oldest messages that follow the
  * first two entries.
  *
  * The first two messages (the system prompt and the original user task) are
  * always kept, so the result can still hold two messages even when
  * `maxMessages` is smaller than two.
  *
  * @param messages    Conversation so far; never mutated.
  * @param maxMessages Upper bound on the number of messages to keep.
  * @returns `messages` itself when it already fits (or has nothing droppable),
  *          otherwise a new array: the two pinned messages plus the most
  *          recent remainder.
  */
 export function trimContext(
  messages: ChatCompletionMessageParam[],
  maxMessages: number
 ): ChatCompletionMessageParam[] {
  if (messages.length <= maxMessages) return messages;

  const pinnedCount = 2;
  // Nothing beyond the pinned prefix exists, so there is nothing to drop.
  // Returning early also avoids destructuring a missing entry into `undefined`.
  if (messages.length <= pinnedCount) return messages;

  const pinned = messages.slice(0, pinnedCount);
  const rest = messages.slice(pinnedCount);
  const budget = Math.max(0, maxMessages - pinnedCount);
  const recent = rest.slice(rest.length - budget);

  // A window that starts with `tool` messages has lost the assistant message
  // carrying the matching `tool_calls`; the chat completions API rejects such
  // orphaned tool results, so drop them from the front of the window.
  const firstKept = recent.findIndex((message) => message.role !== "tool");
  const window = firstKept === -1 ? [] : recent.slice(firstKept);

  return [...pinned, ...window];
 }
@@ -1,58 +0,0 @@
 import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
 // Snapshot of loop state handed to every guardrail check.
 type GuardrailInput = {
  // Number of loop iterations counted so far.
  iterations: number;
  // The conversation context at the time of the check.
  messages: ChatCompletionMessageParam[];
 };
 // Outcome of a guardrail check: either "keep going" (`ok: true`) or
 // "stop", with a human-readable `reason`.
 export type GuardrailResult = { ok: true } | { ok: false; reason: string };
 // A guardrail is a function from the loop snapshot to a verdict.
 export type GuardrailFn = (input: GuardrailInput) => GuardrailResult;
 // ── Individual guardrails ─────────────────────
 // Guardrail factory: the returned check fails once the iteration count has
 // reached `limit`, which keeps a misbehaving agent from looping forever.
 const maxIterations = (limit: number): GuardrailFn =>
  function checkIterationCount(input) {
    if (input.iterations >= limit) {
      return { ok: false, reason: `Guardrail: reached iteration limit (${limit})` };
    }
    return { ok: true };
  };
 // Guardrail factory: the returned check fails when the context holds more
 // than `limit` messages, i.e. it has grown beyond what was expected.
 const maxMessages = (limit: number): GuardrailFn =>
  function checkContextSize(input) {
    if (input.messages.length > limit) {
      return {
        ok: false,
        reason: `Guardrail: context too large (${input.messages.length} messages)`,
      };
    }
    return { ok: true };
  };
 // ── Compose into one fn ───────────────────────
 /**
  * Compose several guardrails into one.
  *
  * The checks run in the order given and evaluation stops at the first one
  * that fails; that failing result is returned unchanged. When every check
  * passes (or none were supplied) the combined guardrail reports `{ ok: true }`.
  */
 export function combineGuardrails(...fns: GuardrailFn[]): GuardrailFn {
  return (input) => {
    let verdict: ReturnType<GuardrailFn> = { ok: true };
    // `some` short-circuits on the first failing check, so later guardrails
    // are never invoked once one has tripped.
    fns.some((guard) => {
      const outcome = guard(input);
      if (outcome.ok) return false;
      verdict = outcome;
      return true;
    });
    return verdict;
  };
 }
 // Guardrail factory: stops the loop as soon as `getUpvotedStory` reports a
 // story. The reason names the story by title and rank when both are present
 // (and truthy), otherwise by its ID.
 export const stopAfterUpvote = (
  getUpvotedStory: () => { id: string; title?: string; rank?: number } | null
 ): GuardrailFn => {
  return () => {
    const upvoted = getUpvotedStory();
    if (!upvoted) {
      return { ok: true };
    }

    let description: string;
    if (upvoted.title && upvoted.rank) {
      description = `"${upvoted.title}" (rank ${upvoted.rank})`;
    } else {
      description = `story ID ${upvoted.id}`;
    }
    return { ok: false, reason: `Successfully upvoted ${description}` };
  };
 };
 // Baseline stop conditions: an iteration limit of 15 combined with a context
 // limit of 50 messages.
 export const defaultGuardrails: GuardrailFn = combineGuardrails(
  ...[maxIterations(15), maxMessages(50)]
 );
@@ -1,7 +1,5 @@
 import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
 import { client } from "./2-model.js";
 import { trimContext } from "./3-context.js";
 import type { GuardrailFn } from "./4-guardrails.js";
 import type { ToolRegistry } from "./1-tools.js";
 const MAX_CONTEXT_MESSAGES = 20;
@@ -19,7 +17,6 @@ export type LoopIteration = {
  outcome: "tool_calls" | "answer";
  toolEvents: ToolEvent[];    // empty if outcome is "answer"
  contextSize: number;        // how many messages were in context for this call
  contextTrimmed: boolean;    // true if we dropped old messages before this call
 };
 export type LoopResult = {
@@ -29,33 +26,17 @@ export type LoopResult = {
  stoppedBy: "model" | "guardrail" | "success";
 };
 export type LoginHandler = () => Promise<ToolEvent | null>;
 export async function runLoop(
  model: string,
  messages: ChatCompletionMessageParam[],
  guardrail: GuardrailFn,
  tools: ToolRegistry,           // injected by the harness, not imported globally
  loginHandler?: LoginHandler    // optional callback to handle login redirects
 ): Promise<LoopResult> {
  const trace: LoopIteration[] = [];
  while (true) {
    const iterationIndex = trace.length + 1;
    // ── Context management ────────────────────
    const beforeTrim = messages.length;
    messages = trimContext(messages, MAX_CONTEXT_MESSAGES);
    const contextTrimmed = messages.length < beforeTrim;
    // ── Guardrails check ──────────────────────
    const check = guardrail({ iterations: trace.length, messages });
    if (!check.ok) {
      // Check if this is a success completion (reason starts with "Successfully")
      const stoppedBy = check.reason.startsWith("Successfully") ? "success" : "guardrail";
      return { answer: check.reason, iterations: trace.length, trace, stoppedBy };
    }
    // ── Model call ────────────────────────────
    process.stdout.write(`[iter ${iterationIndex}] calling model... `);
    const response = await client.chat.completions.create({
@@ -72,7 +53,7 @@ export async function runLoop(
    // ── Final answer ──────────────────────────
    if (choice.finish_reason === "stop") {
-      trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize, contextTrimmed });
+      trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize });
      return {
        answer: choice.message.content ?? "(no response)",
        iterations: trace.length,
@@ -104,20 +85,7 @@ export async function runLoop(
        messages.push({ role: "tool", tool_call_id: call.id, content: result });
      }
-      // ── Check for login redirect after tool execution ───
+      trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize });
      if (loginHandler) {
        const loginEvent = await loginHandler();
        if (loginEvent) {
          toolEvents.push(loginEvent);
          // Add a system message to inform the agent that login was handled
          messages.push({
            role: "user",
            content: "Authentication completed by harness. You are now logged in. Navigate back to https://news.ycombinator.com and complete your upvote task.",
          });
        }
      }
      trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize, contextTrimmed });
    }
  }
 }
@@ -1,169 +0,0 @@
 import { BrowserSession } from "./browser.js";
 import { createTools } from "./1-tools.js";
 import { createContext } from "./3-context.js";
 import { combineGuardrails, defaultGuardrails, stopAfterUpvote } from "./4-guardrails.js";
 import { runLoop } from "./5-loop.js";
 import type { LoopResult, ToolEvent } from "./5-loop.js";
 // Verdict produced by a verification callback.
 export type VerifyResult = {
  // Whether the run is considered to have achieved the task.
  passed: boolean;
  // Human-readable explanation of the verdict.
  reason: string;
 };
 // Result of a single harness attempt: the loop result plus the task and
 // model it was run with.
 export type HarnessExecutionResult = LoopResult & {
  task: string;
  model: string;
 };
 export type HarnessOptions = {
  // Optional check applied to each attempt's result.
  verify?: (result: HarnessExecutionResult) => VerifyResult;
  // Upper bound on the number of attempts; runHarness defaults this to 1.
  maxAttempts?: number;
 };
 // What runHarness returns: the attempt's result, annotated with retry info.
 export type HarnessResult = HarnessExecutionResult & {
  // 1-based number of the attempt that produced this result.
  attempts: number;
  // Verdict for this attempt, or null when no `verify` callback was supplied.
  verification: VerifyResult | null;
 };
 /**
  * Run the task through the agent, retrying while verification fails.
  *
  * Each attempt is a fresh `runHarnessAttempt`. When `options.verify` is
  * supplied it is applied to the attempt's result; a passing verdict, or
  * reaching `options.maxAttempts` (default 1), ends the run. Without a
  * verifier there is no way to judge an attempt as failed, so the first
  * attempt's result is returned and no retry takes place.
  *
  * @param task    Natural-language task handed to the agent.
  * @param model   Model identifier forwarded to the loop.
  * @param options Optional verifier and attempt limit.
  * @returns The result of the last attempt made, with its attempt number and
  *          verification verdict (null when no verifier was given).
  * @throws Error when no attempt was made at all (e.g. `maxAttempts` < 1).
  */
 export async function runHarness(
  task: string,
  model: string,
  options: HarnessOptions = {}
 ): Promise<HarnessResult> {
  const maxAttempts = options.maxAttempts ?? 1;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const result = await runHarnessAttempt(task, model);
    const verification = options.verify ? options.verify(result) : null;
    const outcome: HarnessResult = { ...result, attempts: attempt, verification };

    // Retrying only makes sense when a verifier actually reported a failure.
    const nothingToRetry = !options.verify || verification?.passed;
    if (nothingToRetry || attempt === maxAttempts) {
      return outcome;
    }
    console.log(`\nAttempt ${attempt} failed — retrying (${attempt + 1}/${maxAttempts})...\n`);
  }
  throw new Error("Harness finished without producing a result");
 }
 /**
  * Scan the trace for evidence that an upvote click went through.
  *
  * A tool event counts as a confirmed upvote when it is a `browser_click`,
  * its serialized arguments mention `up_`, and the text after "now at " in its
  * result (trimmed) ends at the Hacker News front page (`/` or `/news`).
  * The first such event, in trace order, decides the verdict.
  *
  * @param result Execution result whose trace is inspected.
  * @returns A passing verdict quoting the landing URL, or a failing verdict
  *          when no event matches.
  */
 export function verifySuccessfulUpvote(result: HarnessExecutionResult): VerifyResult {
  const upvoteSelector = /up_/;
  const frontPage = /news\.ycombinator\.com\/(news)?$/;

  for (const iteration of result.trace) {
    for (const event of iteration.toolEvents) {
      if (event.tool !== "browser_click") continue;
      if (!upvoteSelector.test(JSON.stringify(event.args))) continue;

      const landedOn = event.result.split("now at ")[1]?.trim() ?? "";
      if (!frontPage.test(landedOn)) continue;

      return {
        passed: true,
        // The message quotes the untrimmed remainder of the tool result.
        reason: `Upvote click confirmed — landed on ${event.result.split("now at ")[1]}`,
      };
    }
  }

  return {
    passed: false,
    reason: "No successful upvote click found in trace (all arrows may be hidden, or login failed)",
  };
 }
 /**
  * Print a human-readable report of a harness run to the console.
  *
  * The report lists every iteration of the trace (tool calls with their
  * arguments and a result preview capped at 120 characters, or "answered"),
  * followed by the final answer, the stop reason, the attempt count and, when
  * present, the verification verdict.
  */
 export function printHarnessResult(result: HarnessResult): void {
  const previewLimit = 120;

  console.log("\n─── Agent trace ───────────────────────────\n");
  result.trace.forEach((step) => {
    const trimmedSuffix = step.contextTrimmed ? " ✂ context trimmed" : "";
    const ctx = `[ctx: ${step.contextSize} msgs${trimmedSuffix}]`;

    if (step.outcome !== "tool_calls") {
      console.log(`[iter ${step.index}] answered  ${ctx}`);
    } else {
      console.log(`[iter ${step.index}] ${step.toolEvents.length} tool call(s)  ${ctx}`);
      step.toolEvents.forEach((call) => {
        console.log(`           → ${call.tool}(${JSON.stringify(call.args)})`);
        // Long tool output is cut to the preview limit and marked with an ellipsis.
        const ellipsis = call.result.length > previewLimit ? "…" : "";
        console.log(`             ${call.result.slice(0, previewLimit)}${ellipsis}`);
      });
    }
    console.log();
  });

  console.log("─── Result ────────────────────────────────\n");
  console.log(result.answer);
  console.log(`\nStopped by: ${result.stoppedBy} after ${result.iterations} iteration(s)`);
  console.log(`Attempts:   ${result.attempts}`);

  const verification = result.verification;
  if (verification) {
    const verdict = verification.passed ? "✓ PASS" : "✗ FAIL";
    console.log(`Verify:     ${verdict} — ${verification.reason}`);
  }
 }
 /**
  * Run one attempt of the task in a freshly opened browser session.
  *
  * Builds a new context for the task, creates tools bound to the session with
  * hooks that record the upvoted story, installs a login handler, combines
  * the upvote stop condition with the default guardrails and runs the loop.
  * The session is closed afterwards whether the loop returned or threw.
  *
  * @param task  Task text used to seed the context.
  * @param model Model identifier forwarded to runLoop.
  * @returns The loop result together with the task and model.
  */
 async function runHarnessAttempt(
  task: string,
  model: string
 ): Promise<HarnessExecutionResult> {
  // Open the environment — each run gets its own isolated browser page
  const session = new BrowserSession();
  await session.open();
  try {
    const messages = createContext(task);         // fresh context for this task
    // Track upvoted story
    // Stays null until the onUpvoteSuccess hook fires; read by the
    // stopAfterUpvote guardrail through the closure below.
    let upvotedStory: { id: string; title?: string; rank?: number } | null = null;
    // Most recent story list reported by the tools; used to look up the
    // title and rank of the upvoted story.
    let storiesData: any[] = [];
    // Create tools with hooks to track upvote success and story data
    const tools = createTools(session, {
      onUpvoteSuccess: (storyId) => {
        // Enrich the record with title/rank when the story is in the last
        // loaded list; otherwise only the ID is known.
        const story = storiesData.find(s => s.id === storyId);
        upvotedStory = story
          ? { id: storyId, title: story.title, rank: story.rank }
          : { id: storyId };
        console.log(`\n[harness] Upvote successful for story ID ${storyId} — forcing completion\n`);
      },
      onStoriesLoaded: (stories) => {
        storiesData = stories;
      },
    });
    // Login handler checks for redirects after each tool execution
    // Returns a synthetic tool event when it logged in, or null when no login
    // page was detected or the login attempt threw.
    const loginHandler = async (): Promise<ToolEvent | null> => {
      const currentUrl = await session.getUrl();
      // Any URL containing "login" or "vote" is treated as a login page.
      // NOTE(review): presumably an unauthenticated vote lands on a "vote" URL
      // showing the login form — confirm, since this is a substring match.
      const isLoginPage = currentUrl.includes("login") || currentUrl.includes("vote");
      if (!isLoginPage) return null;
      console.log("\n[harness] Login redirect detected — handling automatically...");
      try {
        // NOTE(review): account name and password are hard-coded here —
        // consider supplying them through configuration instead of source.
        await session.fill("input[name='acct']", "tejasthrowaway");
        await session.fill("input[name='pw']", "tejasthrowaway");
        await session.click("input[type='submit']");
        console.log("[harness] Login completed — agent can continue\n");
        return {
          tool: "harness_auto_login",
          args: {},
          result: `Harness automatically handled login at ${currentUrl}. You are now authenticated and back at ${await session.getUrl()}.`,
        };
      } catch (err) {
        // Best-effort: a failed login is logged and reported as "no event".
        console.log(`[harness] Login failed: ${err instanceof Error ? err.message : String(err)}\n`);
        return null;
      }
    };
    // Combine default guardrails with upvote completion check
    // The upvote check is listed first, so it is evaluated before the
    // iteration and context-size limits.
    const guardrails = combineGuardrails(
      stopAfterUpvote(() => upvotedStory),
      defaultGuardrails
    );
    const result = await runLoop(model, messages, guardrails, tools, loginHandler);
    return { task, model, ...result };
  } finally {
    // Always close the environment — even if the loop threw
    await session.close();
  }
 }
@@ -1,4 +1,7 @@
-import { printHarnessResult, runHarness, verifySuccessfulUpvote } from "./6-harness.js";
+import { createTools } from "./1-tools.js";
 import { createContext } from "./3-context.js";
 import { runLoop } from "./5-loop.js";
 import { BrowserSession } from "./browser.js";
 // try a weak model
 const MODEL = "openai/gpt-3.5-turbo-0613";
@@ -15,5 +18,18 @@ Click its upvote arrow using the exact selector: a[id="up_STORYID"] (replace STO
 console.log(`Model: ${MODEL}`);
 console.log(`Task:  upvote on Hacker News\n`);
-const result = await runHarness(TASK, MODEL, { verify: verifySuccessfulUpvote, maxAttempts: 3 });
+const session = new BrowserSession();
-printHarnessResult(result);
+
 try {
  await session.open();
  const tools = createTools(session);
  const messages = createContext(TASK);
  const result = await runLoop(MODEL, messages, tools);
  console.log(`\nAnswer: ${result.answer}`);
  console.log(`Stopped by: ${result.stoppedBy}`);
  console.log(`Iterations: ${result.iterations}`);
 } finally {
  await session.close();
 }