mirror of
https://github.com/TejasQ/basically-ai-harness.git
synced 2026-06-13 19:20:06 +00:00
Simplify
This commit is contained in:
@@ -13,17 +13,3 @@ export function createContext(task: string): ChatCompletionMessageParam[] {
|
||||
{ role: "user", content: task },
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Caps the conversation at `maxMessages` entries by discarding the oldest
 * messages that follow the first two.
 *
 * The first two messages are always kept (per this file's convention they are
 * the system prompt and the original user task). Everything after them is
 * treated as history, of which only the most recent entries survive.
 *
 * Any `tool` messages left at the start of the surviving history are dropped
 * as well: their originating assistant `tool_calls` message was cut, and a
 * tool result without its request is not a valid conversation. The returned
 * array can therefore be shorter than `maxMessages`.
 *
 * @param messages    Full conversation so far. Never mutated.
 * @param maxMessages Upper bound on the number of messages to return.
 * @returns `messages` itself when it already fits, otherwise a new array.
 */
export function trimContext(
  messages: ChatCompletionMessageParam[],
  maxMessages: number
): ChatCompletionMessageParam[] {
  if (messages.length <= maxMessages) return messages;

  // slice() instead of destructuring so a conversation with fewer than two
  // messages never yields `undefined` entries.
  const pinned = messages.slice(0, 2);
  const history = messages.slice(2);

  const budget = Math.max(0, maxMessages - pinned.length);
  const recent = budget > 0 ? history.slice(-budget) : [];

  // Skip leading tool results whose assistant request fell outside the window.
  const firstUsable = recent.findIndex((message) => message.role !== "tool");
  const safeTail = firstUsable === -1 ? [] : recent.slice(firstUsable);

  return [...pinned, ...safeTail];
}
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
|
||||
|
||||
/** Snapshot of the loop state that is handed to every guardrail check. */
interface GuardrailInput {
  /** Iteration count supplied by whoever invokes the guardrail. */
  iterations: number;
  /** Messages currently in the conversation context. */
  messages: ChatCompletionMessageParam[];
}

/** Verdict of a guardrail: either "keep going" or "stop" with an explanation. */
export type GuardrailResult =
  | { ok: true }
  | { ok: false; reason: string };

/** A single check that inspects the loop state and returns a verdict. */
export type GuardrailFn = (input: GuardrailInput) => GuardrailResult;
|
||||
|
||||
// ── Individual guardrails ─────────────────────
|
||||
|
||||
/**
 * Builds a guardrail that fails once the iteration count has reached `limit`,
 * so a run cannot loop forever.
 */
const maxIterations = (limit: number): GuardrailFn => {
  return (input) => {
    if (input.iterations >= limit) {
      return { ok: false, reason: `Guardrail: reached iteration limit (${limit})` };
    }
    return { ok: true };
  };
};
|
||||
|
||||
/**
 * Builds a guardrail that fails when the context holds more than `limit`
 * messages, i.e. when it has grown unexpectedly large.
 */
const maxMessages = (limit: number): GuardrailFn => {
  return (input) => {
    const count = input.messages.length;
    if (count > limit) {
      return { ok: false, reason: `Guardrail: context too large (${count} messages)` };
    }
    return { ok: true };
  };
};
|
||||
|
||||
// ── Compose into one fn ───────────────────────
|
||||
|
||||
/**
 * Merges several guardrails into one.
 *
 * The combined guardrail runs the checks in the order given and returns the
 * first failing verdict; checks after a failure are not invoked. When every
 * check passes (or none were supplied) it returns `{ ok: true }`.
 */
export function combineGuardrails(...fns: GuardrailFn[]): GuardrailFn {
  return (input) => {
    let failure: GuardrailResult | undefined;

    // some() stops at the first callback that returns true, i.e. the first failure.
    fns.some((guard) => {
      const verdict = guard(input);
      if (!verdict.ok) failure = verdict;
      return failure !== undefined;
    });

    return failure ?? { ok: true };
  };
}
|
||||
|
||||
/**
 * Builds a guardrail that stops the run as soon as an upvote has been recorded.
 *
 * @param getUpvotedStory Called on every check; returns the upvoted story, or
 *   `null` while nothing has been upvoted yet.
 * @returns A guardrail that passes while `getUpvotedStory()` is `null` and
 *   otherwise fails with a reason describing the story.
 */
export const stopAfterUpvote =
  (getUpvotedStory: () => { id: string; title?: string; rank?: number } | null): GuardrailFn =>
  () => {
    const story = getUpvotedStory();
    if (!story) return { ok: true };

    // Compare rank against null/undefined rather than by truthiness so that a
    // rank of 0 still counts as "known".
    const hasDetails = Boolean(story.title) && story.rank != null;
    const storyInfo = hasDetails
      ? `"${story.title}" (rank ${story.rank})`
      : `story ID ${story.id}`;

    // Keep the "Successfully" prefix: the loop in this project classifies a
    // stop as "success" when the reason starts with that word.
    return { ok: false, reason: `Successfully upvoted ${storyInfo}` };
  };
|
||||
|
||||
// Baseline checks: at most 15 iterations and at most 50 messages in context.
// NOTE(review): the loop shown in this change trims the context to 20 messages
// before it consults the guardrail, so the 50-message check looks unreachable
// there — confirm whether it is still needed.
const defaultChecks: GuardrailFn[] = [maxIterations(15), maxMessages(50)];

/** The baseline checks combined into a single guardrail. */
export const defaultGuardrails = combineGuardrails(...defaultChecks);
|
||||
|
||||
+2
-34
@@ -1,7 +1,5 @@
|
||||
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
|
||||
import { client } from "./2-model.js";
|
||||
import { trimContext } from "./3-context.js";
|
||||
import type { GuardrailFn } from "./4-guardrails.js";
|
||||
import type { ToolRegistry } from "./1-tools.js";
|
||||
|
||||
const MAX_CONTEXT_MESSAGES = 20;
|
||||
@@ -19,7 +17,6 @@ export type LoopIteration = {
|
||||
outcome: "tool_calls" | "answer";
|
||||
toolEvents: ToolEvent[]; // empty if outcome is "answer"
|
||||
contextSize: number; // how many messages were in context for this call
|
||||
contextTrimmed: boolean; // true if we dropped old messages before this call
|
||||
};
|
||||
|
||||
export type LoopResult = {
|
||||
@@ -29,33 +26,17 @@ export type LoopResult = {
|
||||
stoppedBy: "model" | "guardrail" | "success";
|
||||
};
|
||||
|
||||
export type LoginHandler = () => Promise<ToolEvent | null>;
|
||||
|
||||
export async function runLoop(
|
||||
model: string,
|
||||
messages: ChatCompletionMessageParam[],
|
||||
guardrail: GuardrailFn,
|
||||
tools: ToolRegistry, // injected by the harness, not imported globally
|
||||
loginHandler?: LoginHandler // optional callback to handle login redirects
|
||||
): Promise<LoopResult> {
|
||||
const trace: LoopIteration[] = [];
|
||||
|
||||
while (true) {
|
||||
const iterationIndex = trace.length + 1;
|
||||
|
||||
// ── Context management ────────────────────
|
||||
const beforeTrim = messages.length;
|
||||
messages = trimContext(messages, MAX_CONTEXT_MESSAGES);
|
||||
const contextTrimmed = messages.length < beforeTrim;
|
||||
|
||||
// ── Guardrails check ──────────────────────
|
||||
const check = guardrail({ iterations: trace.length, messages });
|
||||
if (!check.ok) {
|
||||
// Check if this is a success completion (reason starts with "Successfully")
|
||||
const stoppedBy = check.reason.startsWith("Successfully") ? "success" : "guardrail";
|
||||
return { answer: check.reason, iterations: trace.length, trace, stoppedBy };
|
||||
}
|
||||
|
||||
// ── Model call ────────────────────────────
|
||||
process.stdout.write(`[iter ${iterationIndex}] calling model... `);
|
||||
const response = await client.chat.completions.create({
|
||||
@@ -72,7 +53,7 @@ export async function runLoop(
|
||||
|
||||
// ── Final answer ──────────────────────────
|
||||
if (choice.finish_reason === "stop") {
|
||||
trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize, contextTrimmed });
|
||||
trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize });
|
||||
return {
|
||||
answer: choice.message.content ?? "(no response)",
|
||||
iterations: trace.length,
|
||||
@@ -104,20 +85,7 @@ export async function runLoop(
|
||||
messages.push({ role: "tool", tool_call_id: call.id, content: result });
|
||||
}
|
||||
|
||||
// ── Check for login redirect after tool execution ───
|
||||
if (loginHandler) {
|
||||
const loginEvent = await loginHandler();
|
||||
if (loginEvent) {
|
||||
toolEvents.push(loginEvent);
|
||||
// Add a user-role message to inform the agent that login was handled
|
||||
messages.push({
|
||||
role: "user",
|
||||
content: "Authentication completed by harness. You are now logged in. Navigate back to https://news.ycombinator.com and complete your upvote task.",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize, contextTrimmed });
|
||||
trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,169 +0,0 @@
|
||||
import { BrowserSession } from "./browser.js";
|
||||
import { createTools } from "./1-tools.js";
|
||||
import { createContext } from "./3-context.js";
|
||||
import { combineGuardrails, defaultGuardrails, stopAfterUpvote } from "./4-guardrails.js";
|
||||
import { runLoop } from "./5-loop.js";
|
||||
import type { LoopResult, ToolEvent } from "./5-loop.js";
|
||||
|
||||
/** Outcome of checking a finished run against a success criterion. */
export interface VerifyResult {
  passed: boolean;
  /** Human-readable explanation of the verdict. */
  reason: string;
}

/** Result of one loop run, tagged with the task and model that produced it. */
export type HarnessExecutionResult = LoopResult & {
  task: string;
  model: string;
};

/** Optional knobs for `runHarness`. */
export interface HarnessOptions {
  /** Judges a finished run; when omitted, `verification` is reported as `null`. */
  verify?: (result: HarnessExecutionResult) => VerifyResult;
  /** Upper bound on attempts; `runHarness` falls back to 1 when omitted. */
  maxAttempts?: number;
}

/** What `runHarness` resolves with. */
export type HarnessResult = HarnessExecutionResult & {
  /** 1-based number of the attempt that produced this result. */
  attempts: number;
  /** Verdict of `options.verify`, or `null` when no verifier was supplied. */
  verification: VerifyResult | null;
};
|
||||
|
||||
/**
 * Runs `task` against `model`, optionally verifying the outcome and retrying.
 *
 * Each attempt is a fresh `runHarnessAttempt`. A retry happens only when a
 * `verify` callback was supplied, it reported failure, and attempts remain.
 * Without a verifier there is no failure signal, so the first attempt's result
 * is returned as-is instead of re-running the agent.
 *
 * @param task    Task description given to the agent.
 * @param model   Model identifier passed through to the loop.
 * @param options Optional verifier and attempt limit (defaults to 1 attempt).
 * @returns The result of the last attempt made, with its attempt number and
 *   the verifier's verdict (`null` when no verifier was supplied).
 * @throws Error when no attempt could be made at all (e.g. `maxAttempts` < 1).
 */
export async function runHarness(
  task: string,
  model: string,
  options: HarnessOptions = {}
): Promise<HarnessResult> {
  const maxAttempts = options.maxAttempts ?? 1;
  let latestResult: HarnessResult | null = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const result = await runHarnessAttempt(task, model);
    const verification = options.verify ? options.verify(result) : null;

    latestResult = { ...result, attempts: attempt, verification };

    // No verifier means nothing can "fail"; a passing verdict means we are done.
    if (verification === null || verification.passed) {
      return latestResult;
    }

    // Only announce a retry when one will actually happen.
    if (attempt + 1 > maxAttempts) break;

    console.log(`\nAttempt ${attempt} failed — retrying (${attempt + 1}/${maxAttempts})...\n`);
  }

  // Covers the final failed attempt, including a non-integer maxAttempts where
  // `attempt === maxAttempts` would never hold.
  if (latestResult) return latestResult;

  throw new Error("Harness finished without producing a result");
}
|
||||
|
||||
/**
 * Checks a run's trace for evidence that an upvote click went through.
 *
 * A tool event counts as a successful upvote when it is a `browser_click`
 * whose arguments mention `up_` and whose result text, after the marker
 * `"now at "`, is a Hacker News front-page URL (`…/` or `…/news`).
 *
 * @param result Finished run whose trace is inspected.
 * @returns `passed: true` with the landing URL when such a click exists,
 *   otherwise `passed: false` with an explanation.
 */
export function verifySuccessfulUpvote(result: HarnessExecutionResult): VerifyResult {
  // Single place that extracts the URL, so the match and the reported reason
  // always agree (both use the trimmed value).
  const landingUrlOf = (toolResult: string): string =>
    toolResult.split("now at ")[1]?.trim() ?? "";

  const successfulUpvote = result.trace
    .flatMap((iter) => iter.toolEvents)
    .find(
      (e) =>
        e.tool === "browser_click" &&
        /up_/.test(JSON.stringify(e.args)) &&
        /news\.ycombinator\.com\/(news)?$/.test(landingUrlOf(e.result))
    );

  if (!successfulUpvote) {
    return {
      passed: false,
      reason: "No successful upvote click found in trace (all arrows may be hidden, or login failed)",
    };
  }

  return {
    passed: true,
    reason: `Upvote click confirmed — landed on ${landingUrlOf(successfulUpvote.result)}`,
  };
}
|
||||
|
||||
/**
 * Writes a human-readable report of a harness run to stdout: one entry per
 * loop iteration (with its tool calls, each result cut to 120 characters),
 * followed by the final answer, stop cause, attempt count and, when present,
 * the verification verdict.
 */
export function printHarnessResult(result: HarnessResult): void {
  console.log("\n─── Agent trace ───────────────────────────\n");

  result.trace.forEach((step) => {
    const ctx = `[ctx: ${step.contextSize} msgs${step.contextTrimmed ? " ✂ context trimmed" : ""}]`;

    if (step.outcome !== "tool_calls") {
      console.log(`[iter ${step.index}] answered ${ctx}`);
    } else {
      console.log(`[iter ${step.index}] ${step.toolEvents.length} tool call(s) ${ctx}`);
      for (const { tool, args, result: output } of step.toolEvents) {
        // Long tool output is truncated and marked with an ellipsis.
        const preview = output.length > 120 ? `${output.slice(0, 120)}…` : output;
        console.log(` → ${tool}(${JSON.stringify(args)})`);
        console.log(` ${preview}`);
      }
    }

    console.log();
  });

  console.log("─── Result ────────────────────────────────\n");
  console.log(result.answer);
  console.log(`\nStopped by: ${result.stoppedBy} after ${result.iterations} iteration(s)`);
  console.log(`Attempts: ${result.attempts}`);

  const verdict = result.verification;
  if (verdict) {
    console.log(`Verify: ${verdict.passed ? "✓ PASS" : "✗ FAIL"} — ${verdict.reason}`);
  }
}
|
||||
|
||||
/**
 * Runs the task once in its own browser session and returns the loop result
 * tagged with the task and model.
 *
 * Wires three things into the loop:
 *  - tools whose hooks record the loaded stories and the upvoted story,
 *  - a login handler that fills in the login form when the browser lands on a
 *    login/vote URL after a tool call,
 *  - guardrails that stop the run as soon as an upvote was recorded, in
 *    addition to the default limits.
 *
 * Login credentials are read from `HN_USERNAME` / `HN_PASSWORD`; when unset
 * they fall back to the throwaway account this code used previously.
 *
 * The session is closed in `finally`, even if the loop throws.
 */
async function runHarnessAttempt(
  task: string,
  model: string
): Promise<HarnessExecutionResult> {
  // Open the environment — each run gets its own isolated browser page
  const session = new BrowserSession();
  await session.open();

  try {
    const messages = createContext(task); // fresh context for this task

    // Track upvoted story
    let upvotedStory: { id: string; title?: string; rank?: number } | null = null;
    // NOTE(review): element shape is dictated by createTools' hook types, which
    // are not visible here — tighten once confirmed.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let storiesData: any[] = [];

    // Create tools with hooks to track upvote success and story data
    const tools = createTools(session, {
      onUpvoteSuccess: (storyId) => {
        const story = storiesData.find((s) => s.id === storyId);
        upvotedStory = story
          ? { id: storyId, title: story.title, rank: story.rank }
          : { id: storyId };
        console.log(`\n[harness] Upvote successful for story ID ${storyId} — forcing completion\n`);
      },
      onStoriesLoaded: (stories) => {
        storiesData = stories;
      },
    });

    // SECURITY: keep credentials out of source; the literals are only a
    // backward-compatible fallback for the existing throwaway account.
    const loginUser = process.env.HN_USERNAME ?? "tejasthrowaway";
    const loginPassword = process.env.HN_PASSWORD ?? "tejasthrowaway";

    // Login handler checks for redirects after each tool execution
    const loginHandler = async (): Promise<ToolEvent | null> => {
      const currentUrl = await session.getUrl();
      const isLoginPage = currentUrl.includes("login") || currentUrl.includes("vote");

      if (!isLoginPage) return null;

      console.log("\n[harness] Login redirect detected — handling automatically...");

      try {
        await session.fill("input[name='acct']", loginUser);
        await session.fill("input[name='pw']", loginPassword);
        await session.click("input[type='submit']");

        console.log("[harness] Login completed — agent can continue\n");

        return {
          tool: "harness_auto_login",
          args: {},
          result: `Harness automatically handled login at ${currentUrl}. You are now authenticated and back at ${await session.getUrl()}.`,
        };
      } catch (err) {
        // Best effort: a failed login is logged and the agent carries on.
        console.log(`[harness] Login failed: ${err instanceof Error ? err.message : String(err)}\n`);
        return null;
      }
    };

    // Combine default guardrails with upvote completion check; the upvote
    // check goes first so a recorded upvote wins over the default limits.
    const guardrails = combineGuardrails(
      stopAfterUpvote(() => upvotedStory),
      defaultGuardrails
    );

    const result = await runLoop(model, messages, guardrails, tools, loginHandler);

    return { task, model, ...result };
  } finally {
    // Always close the environment — even if the loop threw
    await session.close();
  }
}
|
||||
|
||||
+19
-3
@@ -1,4 +1,7 @@
|
||||
import { printHarnessResult, runHarness, verifySuccessfulUpvote } from "./6-harness.js";
|
||||
import { createTools } from "./1-tools.js";
|
||||
import { createContext } from "./3-context.js";
|
||||
import { runLoop } from "./5-loop.js";
|
||||
import { BrowserSession } from "./browser.js";
|
||||
|
||||
// Try a deliberately weak model
|
||||
const MODEL = "openai/gpt-3.5-turbo-0613";
|
||||
@@ -15,5 +18,18 @@ Click its upvote arrow using the exact selector: a[id="up_STORYID"] (replace STO
|
||||
console.log(`Model: ${MODEL}`);
|
||||
console.log(`Task: upvote on Hacker News\n`);
|
||||
|
||||
const result = await runHarness(TASK, MODEL, { verify: verifySuccessfulUpvote, maxAttempts: 3 });
|
||||
printHarnessResult(result);
|
||||
// Run the task once in a single browser session and print the outcome.
const session = new BrowserSession();

try {
  await session.open();

  const toolbox = createTools(session);
  const conversation = createContext(TASK);
  const { answer, stoppedBy, iterations } = await runLoop(MODEL, conversation, toolbox);

  console.log(`\nAnswer: ${answer}`);
  console.log(`Stopped by: ${stoppedBy}`);
  console.log(`Iterations: ${iterations}`);
} finally {
  // Close the session whether or not the run succeeded.
  await session.close();
}
|
||||
|
||||
Reference in New Issue
Block a user