Files
basically-ai-harness/eval/1-dataset.ts
T

79 lines
2.4 KiB
TypeScript
Raw Normal View History

2026-04-02 10:30:35 +02:00
// ─────────────────────────────────────────────
// PART 1: The dataset
//
// A fixed set of test cases.
// Each one has an input we send to the model
// and an expected output we judge it against.
//
// These cases are designed to trigger common
// hallucinations — the "obvious" answer is often
// wrong, which exposes weaker models quickly.
// ─────────────────────────────────────────────
/**
 * A single evaluation case: one prompt plus the answer it is judged against.
 */
export type TestCase = {
  /** Identifier for the case (the entries in this file use kebab-case slugs). */
  id: string;
  /** The prompt text sent to the model. */
  input: string;
  /** The correct answer. */
  expected: string;
  /** The wrong answer models most commonly give, when one is known. */
  trap?: string;
  /** Optional labels for the case, e.g. a subject area such as "geography". */
  tags?: string[];
};
/**
 * The fixed list of evaluation cases.
 *
 * Every entry pairs a question with its correct answer and records the
 * tempting-but-wrong answer as `trap`.
 */
export const dataset: TestCase[] = [
  {
    id: "geo-australia-capital",
    input: "What is the capital of Australia?",
    expected: "Canberra",
    // The largest city is the usual confident mistake.
    trap: "Sydney",
    tags: ["geography"],
  },
  {
    id: "geo-brazil-capital",
    input: "What is the capital of Brazil?",
    expected: "Brasília",
    // São Paulo is the other frequent wrong answer.
    trap: "Rio de Janeiro",
    tags: ["geography"],
  },
  {
    id: "geo-most-lakes",
    input: "Which country has the most natural lakes in the world?",
    expected: "Canada",
    // The USA is an equally common wrong answer.
    trap: "Russia",
    tags: ["geography"],
  },
  {
    id: "bio-octopus-hearts",
    input: "How many hearts does an octopus have?",
    expected: "3",
    // Single heart assumed by analogy with most animals.
    trap: "1",
    tags: ["biology"],
  },
  {
    id: "bio-spider-legs",
    input: "How many legs does a spider have?",
    expected: "8",
    // Insect leg count; spiders occasionally get mixed up with insects.
    trap: "6",
    tags: ["biology"],
  },
  {
    id: "astro-mars-moons",
    input: "How many moons does Mars have?",
    expected: "2",
    // 0 is another frequent wrong guess.
    trap: "1",
    tags: ["astronomy"],
  },
  {
    id: "geo-populous-2024",
    input: "What is the most populous country in the world as of 2024?",
    expected: "India",
    // Recency check: India overtook China in 2023.
    trap: "China",
    tags: ["geography", "recency"],
  },
  {
    id: "sci-salt-boiling",
    input: "Does adding salt to water raise or lower its boiling point?",
    expected: "raise",
    // Counterintuitive: dissolved salt causes boiling point elevation.
    trap: "lower",
    tags: ["science"],
  },
];