// Mirror of https://github.com/TejasQ/basically-ai-harness.git — synced 2026-06-14 03:30:26 +00:00 (79 lines, 2.4 KiB, TypeScript at sync time)
// ─────────────────────────────────────────────
// PART 1: The dataset
//
// A fixed set of test cases.
// Each one has an input we send to the model
// and an expected output we judge it against.
//
// These cases are designed to trigger common
// hallucinations — the "obvious" answer is often
// wrong, which exposes weaker models quickly.
// ─────────────────────────────────────────────

/**
 * A single evaluation case: an input sent to the model together with the
 * expected output the model's answer is judged against.
 */
export type TestCase = {
  /** Identifier for the case, e.g. "geo-australia-capital". */
  id: string;
  /** The input sent to the model. */
  input: string;
  /** The correct answer. */
  expected: string;
  /** The wrong answer most models give. */
  trap?: string;
  /** Optional labels for the case, e.g. "geography" or "recency". */
  tags?: string[];
};

/**
 * The fixed set of test cases.
 *
 * Every case pairs a question with its correct answer (`expected`) and the
 * wrong answer models commonly give (`trap`), and carries one or more tags.
 */
export const dataset: TestCase[] = [
  {
    id: "geo-australia-capital",
    input: "What is the capital of Australia?",
    expected: "Canberra",
    trap: "Sydney", // most models confidently say Sydney
    tags: ["geography"],
  },
  {
    id: "geo-brazil-capital",
    input: "What is the capital of Brazil?",
    expected: "Brasília",
    trap: "Rio de Janeiro", // or São Paulo
    tags: ["geography"],
  },
  {
    id: "geo-most-lakes",
    input: "Which country has the most natural lakes in the world?",
    expected: "Canada",
    trap: "Russia", // or USA — both common wrong answers
    tags: ["geography"],
  },
  {
    id: "bio-octopus-hearts",
    input: "How many hearts does an octopus have?",
    expected: "3",
    trap: "1", // models assume one heart like most animals
    tags: ["biology"],
  },
  {
    id: "bio-spider-legs",
    input: "How many legs does a spider have?",
    expected: "8",
    trap: "6", // models sometimes confuse spiders with insects
    tags: ["biology"],
  },
  {
    id: "astro-mars-moons",
    input: "How many moons does Mars have?",
    expected: "2",
    trap: "1", // or 0 — models often guess wrong
    tags: ["astronomy"],
  },
  {
    id: "geo-populous-2024",
    input: "What is the most populous country in the world as of 2024?",
    expected: "India",
    trap: "China", // India surpassed China in 2023 — tests recency
    tags: ["geography", "recency"],
  },
  {
    id: "sci-salt-boiling",
    input: "Does adding salt to water raise or lower its boiling point?",
    expected: "raise",
    trap: "lower", // counterintuitive — boiling point elevation
    tags: ["science"],
  },
];