mirror of
https://github.com/TejasQ/basically-ai-harness.git
synced 2026-06-13 19:20:06 +00:00
Simplify
This commit is contained in:
@@ -13,17 +13,3 @@ export function createContext(task: string): ChatCompletionMessageParam[] {
|
|||||||
{ role: "user", content: task },
|
{ role: "user", content: task },
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Caps the conversation at roughly `maxMessages` entries by dropping the oldest
 * messages that follow the first two.
 *
 * The first two messages (the system prompt and the original user task) are always
 * kept, so the result can exceed `maxMessages` when `maxMessages` is below 2.
 *
 * @param messages    Full conversation; never mutated.
 * @param maxMessages Upper bound on the number of messages to keep.
 * @returns The same array instance when it already fits, otherwise a new array made
 *          of the first two messages followed by the most recent ones.
 */
export function trimContext(
  messages: ChatCompletionMessageParam[],
  maxMessages: number
): ChatCompletionMessageParam[] {
  if (messages.length <= maxMessages) return messages;

  // slice() instead of destructuring: a one-element input must not yield `undefined`.
  const pinned = messages.slice(0, 2);
  const tailBudget = Math.max(0, maxMessages - 2);
  let start = Math.max(pinned.length, messages.length - tailBudget);

  // A `tool` message is only valid directly after the assistant message that issued
  // the matching tool call. If the cut removed that assistant message, the orphaned
  // tool results at the front of the window must go too, or the API rejects the request.
  while (start < messages.length && messages[start]?.role === "tool") {
    start += 1;
  }

  return [...pinned, ...messages.slice(start)];
}
|
|
||||||
|
|||||||
@@ -1,58 +0,0 @@
|
|||||||
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
|
|
||||||
|
|
||||||
/** Loop state handed to a guardrail each time it is evaluated. */
interface GuardrailInput {
  /** Iteration count reported by the loop. */
  iterations: number;
  /** Messages currently held in the conversation context. */
  messages: ChatCompletionMessageParam[];
}

/** Guardrail verdict: either a pass, or a stop carrying a human-readable reason. */
export type GuardrailResult =
  | { ok: true }
  | { ok: false; reason: string };

/** A guardrail is a synchronous check over the current loop state. */
export type GuardrailFn = (input: GuardrailInput) => GuardrailResult;
|
|
||||||
|
|
||||||
// ── Individual guardrails ─────────────────────
|
|
||||||
|
|
||||||
/**
 * Builds a guardrail that stops the run once `iterations` has reached `limit`,
 * so a model that never produces a final answer cannot loop forever.
 *
 * @param limit Iteration count at which the guardrail starts failing.
 */
const maxIterations = (limit: number): GuardrailFn => {
  return ({ iterations }) => {
    if (iterations >= limit) {
      return { ok: false, reason: `Guardrail: reached iteration limit (${limit})` };
    }
    return { ok: true };
  };
};
|
|
||||||
|
|
||||||
/**
 * Builds a guardrail that stops the run when the context holds more than `limit`
 * messages, i.e. when the conversation has grown unexpectedly large.
 *
 * @param limit Largest message count that is still accepted.
 */
const maxMessages = (limit: number): GuardrailFn => {
  return ({ messages }) => {
    if (messages.length > limit) {
      return { ok: false, reason: `Guardrail: context too large (${messages.length} messages)` };
    }
    return { ok: true };
  };
};
|
|
||||||
|
|
||||||
// ── Compose into one fn ───────────────────────
|
|
||||||
|
|
||||||
/**
 * Folds several guardrails into one. Checks run in the order given and evaluation
 * stops at the first one that fails; its result is returned unchanged. When every
 * check passes (or none were supplied) the combined guardrail passes.
 *
 * @param fns Guardrails to evaluate, in priority order.
 */
export function combineGuardrails(...fns: GuardrailFn[]): GuardrailFn {
  return (input) => {
    let failure: ReturnType<GuardrailFn> | undefined;

    // `some` short-circuits, so checks after the first failure are never invoked.
    fns.some((check) => {
      const verdict = check(input);
      if (verdict.ok) return false;
      failure = verdict;
      return true;
    });

    return failure ?? { ok: true };
  };
}
|
|
||||||
|
|
||||||
/**
 * Builds a guardrail that ends the run as soon as an upvote has been recorded.
 *
 * The stop reason always begins with "Successfully upvoted". It names the story by
 * title and rank when both are truthy, and by ID otherwise.
 *
 * @param getUpvotedStory Called on every check; returns the upvoted story, or `null`
 *                        while nothing has been upvoted yet.
 */
export const stopAfterUpvote = (
  getUpvotedStory: () => { id: string; title?: string; rank?: number } | null
): GuardrailFn => {
  return () => {
    const story = getUpvotedStory();
    if (!story) {
      return { ok: true };
    }

    let storyInfo = `story ID ${story.id}`;
    if (story.title && story.rank) {
      storyInfo = `"${story.title}" (rank ${story.rank})`;
    }

    return { ok: false, reason: `Successfully upvoted ${storyInfo}` };
  };
};
|
|
||||||
|
|
||||||
/**
 * Baseline safety net: stop at 15 iterations, or when the context holds more than
 * 50 messages. The iteration check is evaluated first.
 */
export const defaultGuardrails: GuardrailFn = combineGuardrails(maxIterations(15), maxMessages(50));
|
|
||||||
|
|||||||
+2
-34
@@ -1,7 +1,5 @@
|
|||||||
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
|
import type { ChatCompletionMessageParam } from "openai/resources/chat/completions";
|
||||||
import { client } from "./2-model.js";
|
import { client } from "./2-model.js";
|
||||||
import { trimContext } from "./3-context.js";
|
|
||||||
import type { GuardrailFn } from "./4-guardrails.js";
|
|
||||||
import type { ToolRegistry } from "./1-tools.js";
|
import type { ToolRegistry } from "./1-tools.js";
|
||||||
|
|
||||||
const MAX_CONTEXT_MESSAGES = 20;
|
const MAX_CONTEXT_MESSAGES = 20;
|
||||||
@@ -19,7 +17,6 @@ export type LoopIteration = {
|
|||||||
outcome: "tool_calls" | "answer";
|
outcome: "tool_calls" | "answer";
|
||||||
toolEvents: ToolEvent[]; // empty if outcome is "answer"
|
toolEvents: ToolEvent[]; // empty if outcome is "answer"
|
||||||
contextSize: number; // how many messages were in context for this call
|
contextSize: number; // how many messages were in context for this call
|
||||||
contextTrimmed: boolean; // true if we dropped old messages before this call
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export type LoopResult = {
|
export type LoopResult = {
|
||||||
@@ -29,33 +26,17 @@ export type LoopResult = {
|
|||||||
stoppedBy: "model" | "guardrail" | "success";
|
stoppedBy: "model" | "guardrail" | "success";
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Optional hook that `runLoop` awaits after an iteration's tool calls have run.
 * Resolving to a `ToolEvent` adds that event to the iteration's tool events and makes
 * the loop append a user message saying authentication was completed; resolving to
 * `null` leaves the iteration untouched.
 */
export type LoginHandler = () => Promise<ToolEvent | null>;
|
|
||||||
|
|
||||||
export async function runLoop(
|
export async function runLoop(
|
||||||
model: string,
|
model: string,
|
||||||
messages: ChatCompletionMessageParam[],
|
messages: ChatCompletionMessageParam[],
|
||||||
guardrail: GuardrailFn,
|
|
||||||
tools: ToolRegistry, // injected by the harness, not imported globally
|
tools: ToolRegistry, // injected by the harness, not imported globally
|
||||||
loginHandler?: LoginHandler // optional callback to handle login redirects
|
|
||||||
): Promise<LoopResult> {
|
): Promise<LoopResult> {
|
||||||
const trace: LoopIteration[] = [];
|
const trace: LoopIteration[] = [];
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
const iterationIndex = trace.length + 1;
|
const iterationIndex = trace.length + 1;
|
||||||
|
|
||||||
// ── Context management ────────────────────
|
|
||||||
const beforeTrim = messages.length;
|
|
||||||
messages = trimContext(messages, MAX_CONTEXT_MESSAGES);
|
|
||||||
const contextTrimmed = messages.length < beforeTrim;
|
|
||||||
|
|
||||||
// ── Guardrails check ──────────────────────
|
|
||||||
const check = guardrail({ iterations: trace.length, messages });
|
|
||||||
if (!check.ok) {
|
|
||||||
// Check if this is a success completion (reason starts with "Successfully")
|
|
||||||
const stoppedBy = check.reason.startsWith("Successfully") ? "success" : "guardrail";
|
|
||||||
return { answer: check.reason, iterations: trace.length, trace, stoppedBy };
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Model call ────────────────────────────
|
// ── Model call ────────────────────────────
|
||||||
process.stdout.write(`[iter ${iterationIndex}] calling model... `);
|
process.stdout.write(`[iter ${iterationIndex}] calling model... `);
|
||||||
const response = await client.chat.completions.create({
|
const response = await client.chat.completions.create({
|
||||||
@@ -72,7 +53,7 @@ export async function runLoop(
|
|||||||
|
|
||||||
// ── Final answer ──────────────────────────
|
// ── Final answer ──────────────────────────
|
||||||
if (choice.finish_reason === "stop") {
|
if (choice.finish_reason === "stop") {
|
||||||
trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize, contextTrimmed });
|
trace.push({ index: iterationIndex, outcome: "answer", toolEvents: [], contextSize });
|
||||||
return {
|
return {
|
||||||
answer: choice.message.content ?? "(no response)",
|
answer: choice.message.content ?? "(no response)",
|
||||||
iterations: trace.length,
|
iterations: trace.length,
|
||||||
@@ -104,20 +85,7 @@ export async function runLoop(
|
|||||||
messages.push({ role: "tool", tool_call_id: call.id, content: result });
|
messages.push({ role: "tool", tool_call_id: call.id, content: result });
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Check for login redirect after tool execution ───
|
trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize });
|
||||||
if (loginHandler) {
|
|
||||||
const loginEvent = await loginHandler();
|
|
||||||
if (loginEvent) {
|
|
||||||
toolEvents.push(loginEvent);
|
|
||||||
// Add a system message to inform the agent that login was handled
|
|
||||||
messages.push({
|
|
||||||
role: "user",
|
|
||||||
content: "Authentication completed by harness. You are now logged in. Navigate back to https://news.ycombinator.com and complete your upvote task.",
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
trace.push({ index: iterationIndex, outcome: "tool_calls", toolEvents, contextSize, contextTrimmed });
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,169 +0,0 @@
|
|||||||
import { BrowserSession } from "./browser.js";
|
|
||||||
import { createTools } from "./1-tools.js";
|
|
||||||
import { createContext } from "./3-context.js";
|
|
||||||
import { combineGuardrails, defaultGuardrails, stopAfterUpvote } from "./4-guardrails.js";
|
|
||||||
import { runLoop } from "./5-loop.js";
|
|
||||||
import type { LoopResult, ToolEvent } from "./5-loop.js";
|
|
||||||
|
|
||||||
/** Outcome of checking a finished run against the task's success criterion. */
export interface VerifyResult {
  /** Whether the run met the criterion. */
  passed: boolean;
  /** Human-readable explanation of the verdict. */
  reason: string;
}

/** Result of a single harness attempt: the loop result plus what was asked of which model. */
export interface HarnessExecutionResult extends LoopResult {
  task: string;
  model: string;
}

/** Optional behaviour for `runHarness`. */
export interface HarnessOptions {
  /** Judges each attempt; a passing verdict ends the retry loop. */
  verify?: (result: HarnessExecutionResult) => VerifyResult;
  /** Upper bound on attempts; `runHarness` falls back to 1 when omitted. */
  maxAttempts?: number;
}

/** Final harness result: the last attempt, how many attempts ran, and its verdict (if verified). */
export interface HarnessResult extends HarnessExecutionResult {
  attempts: number;
  verification: VerifyResult | null;
}
|
|
||||||
|
|
||||||
/**
 * Runs the task against `model`, retrying with a fresh attempt until the verifier
 * passes or the attempt budget is used up.
 *
 * @param task    Task text handed to each attempt.
 * @param model   Model identifier handed to each attempt.
 * @param options `verify` judges each attempt; `maxAttempts` bounds the retries
 *                (default 1). `maxAttempts` is rounded down to a whole number and
 *                raised to at least 1, so fractional, zero, negative or NaN values
 *                still yield a result instead of an error.
 * @returns The last attempt's result with the attempt count and its verification
 *          (`null` when no verifier was supplied).
 */
export async function runHarness(
  task: string,
  model: string,
  options: HarnessOptions = {}
): Promise<HarnessResult> {
  // Normalise the budget: the original `attempt === maxAttempts` exit test never
  // fired for values such as 2.5, and values below 1 skipped the loop entirely.
  const requested = options.maxAttempts ?? 1;
  const maxAttempts = Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested));

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const result = await runHarnessAttempt(task, model);
    const verification = options.verify ? options.verify(result) : null;

    if (verification?.passed || attempt >= maxAttempts) {
      return { ...result, attempts: attempt, verification };
    }

    console.log(`\nAttempt ${attempt} failed — retrying (${attempt + 1}/${maxAttempts})...\n`);
  }

  // Unreachable because maxAttempts >= 1; kept so every code path visibly ends.
  throw new Error("Harness finished without producing a result");
}
|
|
||||||
|
|
||||||
/**
 * Scans the trace for evidence that an upvote arrow was clicked and the browser
 * ended up back on the Hacker News front page.
 *
 * A tool event counts when it is a `browser_click`, its serialized arguments contain
 * `up_`, and the text after "now at " in its result (trimmed) ends with
 * `news.ycombinator.com/` or `news.ycombinator.com/news`. The first such event wins.
 *
 * @param result A finished harness attempt.
 * @returns `passed: true` with the landing text when a matching click is found,
 *          otherwise `passed: false` with a generic explanation.
 */
export function verifySuccessfulUpvote(result: HarnessExecutionResult): VerifyResult {
  const frontPage = /news\.ycombinator\.com\/(news)?$/;
  // Raw text following the "now at " marker; undefined when the marker is absent.
  const landedOn = (toolResult: string): string | undefined => toolResult.split("now at ")[1];

  for (const iteration of result.trace) {
    for (const event of iteration.toolEvents) {
      if (event.tool !== "browser_click") continue;
      if (!/up_/.test(JSON.stringify(event.args))) continue;
      if (!frontPage.test(landedOn(event.result)?.trim() ?? "")) continue;

      return {
        passed: true,
        reason: `Upvote click confirmed — landed on ${landedOn(event.result)}`,
      };
    }
  }

  return {
    passed: false,
    reason: "No successful upvote click found in trace (all arrows may be hidden, or login failed)",
  };
}
|
|
||||||
|
|
||||||
/**
 * Prints a human-readable report of a harness run to stdout: one entry per loop
 * iteration (with its tool calls, each result cut to 120 characters), followed by
 * the final answer, the stop reason, the attempt count and, when present, the
 * verification verdict.
 *
 * @param result The harness result to report.
 */
export function printHarnessResult(result: HarnessResult): void {
  const previewLimit = 120;

  console.log("\n─── Agent trace ───────────────────────────\n");

  for (const step of result.trace) {
    // Context summary shared by both kinds of entry.
    let ctx = `[ctx: ${step.contextSize} msgs`;
    if (step.contextTrimmed) {
      ctx += " ✂ context trimmed";
    }
    ctx += "]";

    if (step.outcome !== "tool_calls") {
      console.log(`[iter ${step.index}] answered ${ctx}`);
    } else {
      console.log(`[iter ${step.index}] ${step.toolEvents.length} tool call(s) ${ctx}`);
      for (const call of step.toolEvents) {
        console.log(` → ${call.tool}(${JSON.stringify(call.args)})`);
        // Long tool output is previewed only; the ellipsis marks the cut.
        const preview = call.result.slice(0, previewLimit);
        console.log(` ${preview}${call.result.length > previewLimit ? "…" : ""}`);
      }
    }

    // Blank line between iterations.
    console.log();
  }

  console.log("─── Result ────────────────────────────────\n");
  console.log(result.answer);
  console.log(`\nStopped by: ${result.stoppedBy} after ${result.iterations} iteration(s)`);
  console.log(`Attempts: ${result.attempts}`);

  const verdict = result.verification;
  if (verdict) {
    console.log(`Verify: ${verdict.passed ? "✓ PASS" : "✗ FAIL"} — ${verdict.reason}`);
  }
}
|
|
||||||
|
|
||||||
/**
 * Performs one complete attempt at the task in its own browser session.
 *
 * Wires together a fresh message context, tools bound to the session (with hooks
 * that remember the loaded stories and the upvoted one), an automatic login handler,
 * and guardrails that end the run once an upvote succeeded. The session is closed
 * even when the loop throws.
 *
 * @param task  Task text used to seed the conversation.
 * @param model Model identifier passed to the loop.
 * @returns The loop result extended with `task` and `model`.
 */
async function runHarnessAttempt(
  task: string,
  model: string
): Promise<HarnessExecutionResult> {
  type UpvotedStory = { id: string; title?: string; rank?: number };

  // Every attempt gets its own browser session.
  const session = new BrowserSession();
  await session.open();

  try {
    // Start from a fresh conversation for this attempt.
    const messages = createContext(task);

    // State shared between the tool hooks and the upvote guardrail.
    let upvotedStory: UpvotedStory | null = null;
    let storiesData: any[] = [];

    const tools = createTools(session, {
      onUpvoteSuccess: (storyId) => {
        // Enrich the record with title/rank when the story was seen in a listing.
        const known = storiesData.find((candidate) => candidate.id === storyId);
        if (known) {
          upvotedStory = { id: storyId, title: known.title, rank: known.rank };
        } else {
          upvotedStory = { id: storyId };
        }
        console.log(`\n[harness] Upvote successful for story ID ${storyId} — forcing completion\n`);
      },
      onStoriesLoaded: (stories) => {
        storiesData = stories;
      },
    });

    // Invoked by the loop after tool execution: logs in when the page URL looks like
    // a login or vote redirect, otherwise reports nothing.
    const autoLogin = async (): Promise<ToolEvent | null> => {
      const urlBefore = await session.getUrl();
      const redirected = urlBefore.includes("login") || urlBefore.includes("vote");
      if (!redirected) {
        return null;
      }

      console.log("\n[harness] Login redirect detected — handling automatically...");

      try {
        // NOTE(review): credentials are hard-coded in source; consider moving them
        // to configuration before this is used with a real account.
        await session.fill("input[name='acct']", "tejasthrowaway");
        await session.fill("input[name='pw']", "tejasthrowaway");
        await session.click("input[type='submit']");

        console.log("[harness] Login completed — agent can continue\n");

        const urlAfter = await session.getUrl();
        return {
          tool: "harness_auto_login",
          args: {},
          result: `Harness automatically handled login at ${urlBefore}. You are now authenticated and back at ${urlAfter}.`,
        };
      } catch (err) {
        // Best effort: a failed login is logged and reported as "nothing handled".
        const detail = err instanceof Error ? err.message : String(err);
        console.log(`[harness] Login failed: ${detail}\n`);
        return null;
      }
    };

    // The upvote check runs before the default limits.
    const guardrails = combineGuardrails(
      stopAfterUpvote(() => upvotedStory),
      defaultGuardrails
    );

    const loopResult = await runLoop(model, messages, guardrails, tools, autoLogin);

    return { task, model, ...loopResult };
  } finally {
    // Release the browser regardless of how the attempt ended.
    await session.close();
  }
}
|
|
||||||
|
|||||||
+19
-3
@@ -1,4 +1,7 @@
|
|||||||
import { printHarnessResult, runHarness, verifySuccessfulUpvote } from "./6-harness.js";
|
import { createTools } from "./1-tools.js";
|
||||||
|
import { createContext } from "./3-context.js";
|
||||||
|
import { runLoop } from "./5-loop.js";
|
||||||
|
import { BrowserSession } from "./browser.js";
|
||||||
|
|
||||||
// try a shitty model
|
// try a shitty model
|
||||||
const MODEL = "openai/gpt-3.5-turbo-0613";
|
const MODEL = "openai/gpt-3.5-turbo-0613";
|
||||||
@@ -15,5 +18,18 @@ Click its upvote arrow using the exact selector: a[id="up_STORYID"] (replace STO
|
|||||||
console.log(`Model: ${MODEL}`);
|
console.log(`Model: ${MODEL}`);
|
||||||
console.log(`Task: upvote on Hacker News\n`);
|
console.log(`Task: upvote on Hacker News\n`);
|
||||||
|
|
||||||
const result = await runHarness(TASK, MODEL, { verify: verifySuccessfulUpvote, maxAttempts: 3 });
|
const session = new BrowserSession();
|
||||||
printHarnessResult(result);
|
|
||||||
|
try {
|
||||||
|
await session.open();
|
||||||
|
|
||||||
|
const tools = createTools(session);
|
||||||
|
const messages = createContext(TASK);
|
||||||
|
const result = await runLoop(MODEL, messages, tools);
|
||||||
|
|
||||||
|
console.log(`\nAnswer: ${result.answer}`);
|
||||||
|
console.log(`Stopped by: ${result.stoppedBy}`);
|
||||||
|
console.log(`Iterations: ${result.iterations}`);
|
||||||
|
} finally {
|
||||||
|
await session.close();
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user