♻️ refactor(cli): merge external eval commands into unified tree with --external flag

Remove the separate `eval ext` namespace. Overlapping commands (`dataset get`, `run get`) now take an `--external` flag, and external-only commands are integrated directly into the tree.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
💄 style(cli): rename eval irun to run since external moved to ext namespace
2026-06-14 03:30:19 +00:00 · 2026-03-12 01:36:18 +08:00 · 2026-03-12 01:26:18 +08:00 · 2026-03-12 01:07:10 +08:00
10 changed files with 1715 additions and 319 deletions
@@ -1,6 +1,6 @@
 {
  "name": "@lobehub/cli",
-  "version": "0.0.1-canary.11",
+  "version": "0.0.1-canary.12",
  "type": "module",
  "bin": {
    "lh": "./dist/index.js",
@@ -16,6 +16,13 @@ let _client: TrpcClient | undefined;
 let _toolsClient: ToolsTrpcClient | undefined;

 async function getAuthAndServer() {
+  // LOBEHUB_JWT + LOBEHUB_SERVER env vars (used by server-side sandbox execution)
+  const envJwt = process.env.LOBEHUB_JWT;
+  if (envJwt) {
+    const serverUrl = process.env.LOBEHUB_SERVER || OFFICIAL_SERVER_URL;
+    return { accessToken: envJwt, serverUrl: serverUrl.replace(/\/$/, '') };
+  }
+
  const result = await getValidToken();
  if (!result) {
    log.error("No authentication found. Run 'lh login' first.");
@@ -29,6 +29,18 @@ function parseJwtSub(token: string): string | undefined {
 * Exits the process if no token can be resolved.
 */
 export async function resolveToken(options: ResolveTokenOptions): Promise<ResolvedAuth> {
+  // LOBEHUB_JWT env var takes highest priority (used by server-side sandbox execution)
+  const envJwt = process.env.LOBEHUB_JWT;
+  if (envJwt) {
+    const userId = parseJwtSub(envJwt);
+    if (!userId) {
+      log.error('Could not extract userId from LOBEHUB_JWT.');
+      process.exit(1);
+    }
+    log.debug('Using LOBEHUB_JWT from environment');
+    return { token: envJwt, userId };
+  }
+
  // Explicit token takes priority
  if (options.token) {
    const userId = parseJwtSub(options.token);
@@ -8,12 +8,20 @@ const { mockTrpcClient } = vi.hoisted(() => ({
  mockTrpcClient: {
    agent: {
      createAgent: { mutate: vi.fn() },
+      createAgentFiles: { mutate: vi.fn() },
+      createAgentKnowledgeBase: { mutate: vi.fn() },
+      deleteAgentFile: { mutate: vi.fn() },
+      deleteAgentKnowledgeBase: { mutate: vi.fn() },
      duplicateAgent: { mutate: vi.fn() },
      getAgentConfigById: { query: vi.fn() },
      getBuiltinAgent: { query: vi.fn() },
+      getKnowledgeBasesAndFiles: { query: vi.fn() },
      queryAgents: { query: vi.fn() },
      removeAgent: { mutate: vi.fn() },
+      toggleFile: { mutate: vi.fn() },
+      toggleKnowledgeBase: { mutate: vi.fn() },
      updateAgentConfig: { mutate: vi.fn() },
+      updateAgentPinned: { mutate: vi.fn() },
    },
    aiAgent: {
      execAgent: { mutate: vi.fn() },
@@ -403,6 +411,158 @@ describe('agent command', () => {
    });
  });

+  // `pin` and `unpin` share the `updateAgentPinned` mutation; only the
+  // `pinned` flag in the payload differs.
+  describe('pin/unpin', () => {
+    it('should pin an agent', async () => {
+      const { mutate } = mockTrpcClient.agent.updateAgentPinned;
+      mutate.mockResolvedValue({});
+
+      const cli = createProgram();
+      const argv = ['node', 'test', 'agent', 'pin', 'a1'];
+      await cli.parseAsync(argv);
+
+      expect(mutate).toHaveBeenCalledWith({ id: 'a1', pinned: true });
+    });
+
+    it('should unpin an agent', async () => {
+      const { mutate } = mockTrpcClient.agent.updateAgentPinned;
+      mutate.mockResolvedValue({});
+
+      const cli = createProgram();
+      const argv = ['node', 'test', 'agent', 'unpin', 'a1'];
+      await cli.parseAsync(argv);
+
+      expect(mutate).toHaveBeenCalledWith({ id: 'a1', pinned: false });
+    });
+  });
+
+  // Covers the `agent kb-files` listing: the query payload and the message
+  // printed when the query resolves to an empty list.
+  describe('kb-files', () => {
+    it('should list kb and files', async () => {
+      const { query } = mockTrpcClient.agent.getKnowledgeBasesAndFiles;
+      const listing = [{ enabled: true, id: 'f1', name: 'file.txt', type: 'file' }];
+      query.mockResolvedValue(listing);
+
+      const cli = createProgram();
+      await cli.parseAsync(['node', 'test', 'agent', 'kb-files', 'a1']);
+
+      expect(query).toHaveBeenCalledWith({ agentId: 'a1' });
+    });
+
+    it('should show empty message', async () => {
+      const { query } = mockTrpcClient.agent.getKnowledgeBasesAndFiles;
+      query.mockResolvedValue([]);
+
+      const cli = createProgram();
+      await cli.parseAsync(['node', 'test', 'agent', 'kb-files', 'a1']);
+
+      expect(consoleSpy).toHaveBeenCalledWith('No knowledge bases or files found.');
+    });
+  });
+
+  // Checks that the comma-separated `--file-ids` value reaches the mutation
+  // as an array of IDs.
+  describe('add-file', () => {
+    it('should add files to agent', async () => {
+      const { mutate } = mockTrpcClient.agent.createAgentFiles;
+      mutate.mockResolvedValue({});
+
+      const cli = createProgram();
+      const argv = ['node', 'test', 'agent', 'add-file', 'a1', '--file-ids', 'f1,f2'];
+      await cli.parseAsync(argv);
+
+      const expectedPayload = expect.objectContaining({ agentId: 'a1', fileIds: ['f1', 'f2'] });
+      expect(mutate).toHaveBeenCalledWith(expectedPayload);
+    });
+  });
+
+  // Checks the payload sent to `deleteAgentFile` by `agent remove-file`.
+  describe('remove-file', () => {
+    it('should remove a file from agent', async () => {
+      const { mutate } = mockTrpcClient.agent.deleteAgentFile;
+      mutate.mockResolvedValue({});
+
+      const cli = createProgram();
+      const argv = ['node', 'test', 'agent', 'remove-file', 'a1', '--file-id', 'f1'];
+      await cli.parseAsync(argv);
+
+      expect(mutate).toHaveBeenCalledWith({ agentId: 'a1', fileId: 'f1' });
+    });
+  });
+
+  // Checks that `--enable` is forwarded to `toggleFile` as `enabled: true`.
+  describe('toggle-file', () => {
+    it('should toggle file with enable', async () => {
+      const { mutate } = mockTrpcClient.agent.toggleFile;
+      mutate.mockResolvedValue({});
+
+      const cli = createProgram();
+      const argv = ['node', 'test', 'agent', 'toggle-file', 'a1', '--file-id', 'f1', '--enable'];
+      await cli.parseAsync(argv);
+
+      expect(mutate).toHaveBeenCalledWith({ agentId: 'a1', enabled: true, fileId: 'f1' });
+    });
+  });
+
+  // Checks that `--kb-id` is forwarded to `createAgentKnowledgeBase` as
+  // `knowledgeBaseId`.
+  describe('add-kb', () => {
+    it('should add kb to agent', async () => {
+      const { mutate } = mockTrpcClient.agent.createAgentKnowledgeBase;
+      mutate.mockResolvedValue({});
+
+      const cli = createProgram();
+      const argv = ['node', 'test', 'agent', 'add-kb', 'a1', '--kb-id', 'kb1'];
+      await cli.parseAsync(argv);
+
+      const expectedPayload = expect.objectContaining({ agentId: 'a1', knowledgeBaseId: 'kb1' });
+      expect(mutate).toHaveBeenCalledWith(expectedPayload);
+    });
+  });
+
+  // Checks the payload sent to `deleteAgentKnowledgeBase` by `agent remove-kb`.
+  describe('remove-kb', () => {
+    it('should remove kb from agent', async () => {
+      const { mutate } = mockTrpcClient.agent.deleteAgentKnowledgeBase;
+      mutate.mockResolvedValue({});
+
+      const cli = createProgram();
+      const argv = ['node', 'test', 'agent', 'remove-kb', 'a1', '--kb-id', 'kb1'];
+      await cli.parseAsync(argv);
+
+      expect(mutate).toHaveBeenCalledWith({ agentId: 'a1', knowledgeBaseId: 'kb1' });
+    });
+  });
+
+  // Checks that `--disable` is forwarded to `toggleKnowledgeBase` as
+  // `enabled: false`.
+  describe('toggle-kb', () => {
+    it('should toggle kb with disable', async () => {
+      const { mutate } = mockTrpcClient.agent.toggleKnowledgeBase;
+      mutate.mockResolvedValue({});
+
+      const cli = createProgram();
+      const argv = ['node', 'test', 'agent', 'toggle-kb', 'a1', '--kb-id', 'kb1', '--disable'];
+      await cli.parseAsync(argv);
+
+      expect(mutate).toHaveBeenCalledWith({
+        agentId: 'a1',
+        enabled: false,
+        knowledgeBaseId: 'kb1',
+      });
+    });
+  });
+
  describe('status', () => {
    it('should display operation status', async () => {
      mockTrpcClient.aiAgent.getOperationStatus.query.mockResolvedValue({
@@ -316,6 +316,204 @@ export function registerAgentCommand(program: Command) {
      },
    );

+  // ── pin / unpin ─────────────────────────────────────
+
+  // Registers `agent pin <agentId>`: sends `pinned: true` for the given id to
+  // the `agent.updateAgentPinned` mutation, then prints a confirmation line.
+  agent
+    .command('pin <agentId>')
+    .description('Pin an agent')
+    .action(async (agentId: string) => {
+      const trpc = await getTrpcClient();
+      const payload = { id: agentId, pinned: true };
+      await trpc.agent.updateAgentPinned.mutate(payload);
+
+      const tick = pc.green('✓');
+      console.log(`${tick} Pinned agent ${pc.bold(agentId)}`);
+    });
+
+  // Registers `agent unpin <agentId>`: sends `pinned: false` for the given id
+  // to the `agent.updateAgentPinned` mutation, then prints a confirmation line.
+  agent
+    .command('unpin <agentId>')
+    .description('Unpin an agent')
+    .action(async (agentId: string) => {
+      const trpc = await getTrpcClient();
+      const payload = { id: agentId, pinned: false };
+      await trpc.agent.updateAgentPinned.mutate(payload);
+
+      const tick = pc.green('✓');
+      console.log(`${tick} Unpinned agent ${pc.bold(agentId)}`);
+    });
+
+  // ── kb-files ───────────────────────────────────────
+
+  // Registers `agent kb-files [agentId]`: resolves the agent from the positional
+  // id or `--slug` via `resolveAgentId`, queries its knowledge bases and files,
+  // then hands the result to `outputJson` (when `--json` is present) or renders
+  // it with `printTable`.
+  agent
+    .command('kb-files [agentId]')
+    .description('List knowledge bases and files associated with an agent')
+    .option('-s, --slug <slug>', 'Agent slug')
+    .option('--json [fields]', 'Output JSON, optionally specify fields (comma-separated)')
+    .action(
+      async (
+        agentIdArg: string | undefined,
+        options: { json?: string | boolean; slug?: string },
+      ) => {
+        const { json, slug } = options;
+        const trpc = await getTrpcClient();
+        const agentId = await resolveAgentId(trpc, { agentId: agentIdArg, slug });
+        const result = await trpc.agent.getKnowledgeBasesAndFiles.query({ agentId });
+
+        // Only a string value carries a field list; any other defined value
+        // means "JSON output with no field filter".
+        if (json !== undefined) {
+          outputJson(result, typeof json === 'string' ? json : undefined);
+          return;
+        }
+
+        // Treat a non-array response the same as an empty listing.
+        const entries = Array.isArray(result) ? result : [];
+        if (entries.length === 0) {
+          console.log('No knowledge bases or files found.');
+          return;
+        }
+
+        const rows = entries.map((entry: any) => {
+          const id = entry.id || '';
+          const name = truncate(entry.name || '', 40);
+          const type = entry.type || '';
+          const status = entry.enabled ? 'enabled' : 'disabled';
+          return [id, name, type, status];
+        });
+
+        printTable(rows, ['ID', 'NAME', 'TYPE', 'STATUS']);
+      },
+    );
+
+  // ── add-file ───────────────────────────────────────
+
+  // Registers `agent add-file [agentId]`: attaches the files listed in
+  // `--file-ids` to the agent resolved from the positional id or `--slug`.
+  // `--enabled` is forwarded to the mutation only when the flag was given.
+  agent
+    .command('add-file [agentId]')
+    .description('Associate files with an agent')
+    .option('-s, --slug <slug>', 'Agent slug')
+    .requiredOption('--file-ids <ids>', 'Comma-separated file IDs')
+    .option('--enabled', 'Enable files immediately')
+    .action(
+      async (
+        agentIdArg: string | undefined,
+        options: { enabled?: boolean; fileIds: string; slug?: string },
+      ) => {
+        // Drop blank entries so input such as "f1,,f2" or a trailing comma
+        // never sends an empty file ID to the server.
+        const fileIds = options.fileIds
+          .split(',')
+          .map((s) => s.trim())
+          .filter((s) => s.length > 0);
+
+        // Validate before any network call; an all-blank list has nothing to add.
+        if (fileIds.length === 0) {
+          console.error(`${pc.red('✗')} --file-ids must contain at least one file ID`);
+          process.exitCode = 1;
+          return;
+        }
+
+        const client = await getTrpcClient();
+        const agentId = await resolveAgentId(client, { agentId: agentIdArg, slug: options.slug });
+
+        const input: Record<string, any> = { agentId, fileIds };
+        if (options.enabled !== undefined) input.enabled = options.enabled;
+
+        await client.agent.createAgentFiles.mutate(input as any);
+        console.log(
+          `${pc.green('✓')} Added ${fileIds.length} file(s) to agent ${pc.bold(agentId)}`,
+        );
+      },
+    );
+
+  // ── remove-file ────────────────────────────────────
+
+  // Registers `agent remove-file [agentId]`: detaches the file given by
+  // `--file-id` from the agent resolved from the positional id or `--slug`.
+  agent
+    .command('remove-file [agentId]')
+    .description('Remove a file from an agent')
+    .option('-s, --slug <slug>', 'Agent slug')
+    .requiredOption('--file-id <id>', 'File ID to remove')
+    .action(async (agentIdArg: string | undefined, options: { fileId: string; slug?: string }) => {
+      const { fileId, slug } = options;
+      const trpc = await getTrpcClient();
+      const agentId = await resolveAgentId(trpc, { agentId: agentIdArg, slug });
+      await trpc.agent.deleteAgentFile.mutate({ agentId, fileId });
+
+      const tick = pc.green('✓');
+      console.log(`${tick} Removed file ${pc.bold(fileId)} from agent ${pc.bold(agentId)}`);
+    });
+
+  // ── toggle-file ────────────────────────────────────
+
+  // Registers `agent toggle-file [agentId]`: sends the requested enabled state
+  // for one file to the `agent.toggleFile` mutation. The agent comes from the
+  // positional id or `--slug` via `resolveAgentId`.
+  agent
+    .command('toggle-file [agentId]')
+    .description('Toggle a file on/off for an agent')
+    .option('-s, --slug <slug>', 'Agent slug')
+    .requiredOption('--file-id <id>', 'File ID')
+    .option('--enable', 'Enable the file')
+    .option('--disable', 'Disable the file')
+    .action(
+      async (
+        agentIdArg: string | undefined,
+        options: { disable?: boolean; enable?: boolean; fileId: string; slug?: string },
+      ) => {
+        // Reject contradictory flags instead of letting `--enable` silently win.
+        if (options.enable && options.disable) {
+          console.error(`${pc.red('✗')} --enable and --disable cannot be used together`);
+          process.exitCode = 1;
+          return;
+        }
+
+        // NOTE(review): with neither flag, `enabled` is sent as undefined —
+        // presumably the server then flips the current state; confirm against the router.
+        const enabled = options.enable ? true : options.disable ? false : undefined;
+        const client = await getTrpcClient();
+        const agentId = await resolveAgentId(client, { agentId: agentIdArg, slug: options.slug });
+        await client.agent.toggleFile.mutate({ agentId, enabled, fileId: options.fileId });
+        console.log(
+          `${pc.green('✓')} Toggled file ${pc.bold(options.fileId)} for agent ${pc.bold(agentId)}`,
+        );
+      },
+    );
+
+  // ── add-kb ─────────────────────────────────────────
+
+  // Registers `agent add-kb [agentId]`: attaches the knowledge base given by
+  // `--kb-id` to the agent resolved from the positional id or `--slug`.
+  // `--enabled` is forwarded to the mutation only when the flag was given.
+  agent
+    .command('add-kb [agentId]')
+    .description('Associate a knowledge base with an agent')
+    .option('-s, --slug <slug>', 'Agent slug')
+    .requiredOption('--kb-id <id>', 'Knowledge base ID')
+    .option('--enabled', 'Enable immediately')
+    .action(
+      async (
+        agentIdArg: string | undefined,
+        options: { enabled?: boolean; kbId: string; slug?: string },
+      ) => {
+        const { enabled, kbId, slug } = options;
+        const trpc = await getTrpcClient();
+        const agentId = await resolveAgentId(trpc, { agentId: agentIdArg, slug });
+
+        // Leave `enabled` out of the payload entirely unless the flag was passed.
+        const payload: Record<string, any> = {
+          agentId,
+          knowledgeBaseId: kbId,
+          ...(enabled !== undefined ? { enabled } : {}),
+        };
+
+        await trpc.agent.createAgentKnowledgeBase.mutate(payload as any);
+
+        const tick = pc.green('✓');
+        console.log(`${tick} Added knowledge base ${pc.bold(kbId)} to agent ${pc.bold(agentId)}`);
+      },
+    );
+
+  // ── remove-kb ──────────────────────────────────────
+
+  // Registers `agent remove-kb [agentId]`: detaches the knowledge base given by
+  // `--kb-id` from the agent resolved from the positional id or `--slug`.
+  agent
+    .command('remove-kb [agentId]')
+    .description('Remove a knowledge base from an agent')
+    .option('-s, --slug <slug>', 'Agent slug')
+    .requiredOption('--kb-id <id>', 'Knowledge base ID')
+    .action(async (agentIdArg: string | undefined, options: { kbId: string; slug?: string }) => {
+      const { kbId, slug } = options;
+      const trpc = await getTrpcClient();
+      const agentId = await resolveAgentId(trpc, { agentId: agentIdArg, slug });
+
+      const payload = { agentId, knowledgeBaseId: kbId };
+      await trpc.agent.deleteAgentKnowledgeBase.mutate(payload);
+
+      const tick = pc.green('✓');
+      console.log(`${tick} Removed knowledge base ${pc.bold(kbId)} from agent ${pc.bold(agentId)}`);
+    });
+
+  // ── toggle-kb ──────────────────────────────────────
+
+  // Registers `agent toggle-kb [agentId]`: sends the requested enabled state
+  // for one knowledge base to the `agent.toggleKnowledgeBase` mutation. The
+  // agent comes from the positional id or `--slug` via `resolveAgentId`.
+  agent
+    .command('toggle-kb [agentId]')
+    .description('Toggle a knowledge base on/off for an agent')
+    .option('-s, --slug <slug>', 'Agent slug')
+    .requiredOption('--kb-id <id>', 'Knowledge base ID')
+    .option('--enable', 'Enable the knowledge base')
+    .option('--disable', 'Disable the knowledge base')
+    .action(
+      async (
+        agentIdArg: string | undefined,
+        options: { disable?: boolean; enable?: boolean; kbId: string; slug?: string },
+      ) => {
+        // Reject contradictory flags instead of letting `--enable` silently win.
+        if (options.enable && options.disable) {
+          console.error(`${pc.red('✗')} --enable and --disable cannot be used together`);
+          process.exitCode = 1;
+          return;
+        }
+
+        // NOTE(review): with neither flag, `enabled` is sent as undefined —
+        // presumably the server then flips the current state; confirm against the router.
+        const enabled = options.enable ? true : options.disable ? false : undefined;
+        const client = await getTrpcClient();
+        const agentId = await resolveAgentId(client, { agentId: agentIdArg, slug: options.slug });
+        await client.agent.toggleKnowledgeBase.mutate({
+          agentId,
+          enabled,
+          knowledgeBaseId: options.kbId,
+        });
+        console.log(
+          `${pc.green('✓')} Toggled knowledge base ${pc.bold(options.kbId)} for agent ${pc.bold(agentId)}`,
+        );
+      },
+    );
+
  // ── status ──────────────────────────────────────────

  agent
@@ -3,6 +3,32 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

 const { mockTrpcClient } = vi.hoisted(() => ({
  mockTrpcClient: {
+    agentEval: {
+      abortRun: { mutate: vi.fn() },
+      createBenchmark: { mutate: vi.fn() },
+      createDataset: { mutate: vi.fn() },
+      createRun: { mutate: vi.fn() },
+      createTestCase: { mutate: vi.fn() },
+      deleteBenchmark: { mutate: vi.fn() },
+      deleteDataset: { mutate: vi.fn() },
+      deleteRun: { mutate: vi.fn() },
+      deleteTestCase: { mutate: vi.fn() },
+      getBenchmark: { query: vi.fn() },
+      getDataset: { query: vi.fn() },
+      getRunDetails: { query: vi.fn() },
+      getRunProgress: { query: vi.fn() },
+      getRunResults: { query: vi.fn() },
+      getTestCase: { query: vi.fn() },
+      listBenchmarks: { query: vi.fn() },
+      listDatasets: { query: vi.fn() },
+      listRuns: { query: vi.fn() },
+      listTestCases: { query: vi.fn() },
+      retryRunErrors: { mutate: vi.fn() },
+      startRun: { mutate: vi.fn() },
+      updateBenchmark: { mutate: vi.fn() },
+      updateDataset: { mutate: vi.fn() },
+      updateTestCase: { mutate: vi.fn() },
+    },
    agentEvalExternal: {
      datasetGet: { query: vi.fn() },
      messagesList: { query: vi.fn() },
@@ -48,9 +74,11 @@ describe('eval command', () => {
    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
    logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});

-    for (const method of Object.values(mockTrpcClient.agentEvalExternal)) {
-      for (const fn of Object.values(method)) {
-        (fn as ReturnType<typeof vi.fn>).mockReset();
+    for (const ns of Object.values(mockTrpcClient)) {
+      for (const method of Object.values(ns as Record<string, any>)) {
+        for (const fn of Object.values(method as Record<string, any>)) {
+          (fn as ReturnType<typeof vi.fn>).mockReset();
+        }
      }
    }
  });
@@ -68,218 +96,505 @@ describe('eval command', () => {
    return program;
  };

-  it('should call runGet and output json envelope', async () => {
-    mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
-      config: { k: 1 },
-      datasetId: 'dataset-1',
-      id: 'run-1',
+  // ============================================
+  // Benchmark tests
+  // ============================================
+  describe('benchmark', () => {
+    it('should list benchmarks', async () => {
+      mockTrpcClient.agentEval.listBenchmarks.query.mockResolvedValue([
+        { id: 'b1', name: 'Bench 1' },
+      ]);
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'benchmark', 'list', '--json']);
+
+      expect(mockTrpcClient.agentEval.listBenchmarks.query).toHaveBeenCalled();
    });

-    const program = createProgram();
-    await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']);
+    it('should create a benchmark', async () => {
+      mockTrpcClient.agentEval.createBenchmark.mutate.mockResolvedValue({ id: 'b1' });

-    expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'benchmark',
+        'create',
+        '--identifier',
+        'test-bench',
+        '-n',
+        'Test Bench',
+        '--json',
+      ]);

-    const payload = JSON.parse(logSpy.mock.calls[0][0]);
-    expect(payload).toEqual({
-      data: {
+      expect(mockTrpcClient.agentEval.createBenchmark.mutate).toHaveBeenCalledWith(
+        expect.objectContaining({ identifier: 'test-bench', name: 'Test Bench' }),
+      );
+    });
+
+    it('should delete a benchmark', async () => {
+      mockTrpcClient.agentEval.deleteBenchmark.mutate.mockResolvedValue({ success: true });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'benchmark', 'delete', '--id', 'b1']);
+
+      expect(mockTrpcClient.agentEval.deleteBenchmark.mutate).toHaveBeenCalledWith({ id: 'b1' });
+    });
+  });
+
+  // ============================================
+  // Dataset tests
+  // ============================================
+  describe('dataset', () => {
+    it('should list datasets', async () => {
+      mockTrpcClient.agentEval.listDatasets.query.mockResolvedValue([{ id: 'd1', name: 'DS 1' }]);
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'dataset', 'list', '--json']);
+
+      expect(mockTrpcClient.agentEval.listDatasets.query).toHaveBeenCalled();
+    });
+
+    it('should get dataset via internal API', async () => {
+      mockTrpcClient.agentEval.getDataset.query.mockResolvedValue({ id: 'd1' });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'dataset', 'get', '--id', 'd1', '--json']);
+
+      expect(mockTrpcClient.agentEval.getDataset.query).toHaveBeenCalledWith({ id: 'd1' });
+    });
+
+    it('should get dataset via external API with --external', async () => {
+      mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
+        id: 'dataset-1',
+        metadata: { preset: 'deepsearchqa' },
+      });
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'dataset',
+        'get',
+        '--id',
+        'dataset-1',
+        '--external',
+        '--json',
+      ]);
+
+      expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
+        datasetId: 'dataset-1',
+      });
+    });
+
+    it('should create a dataset', async () => {
+      mockTrpcClient.agentEval.createDataset.mutate.mockResolvedValue({ id: 'd1' });
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'dataset',
+        'create',
+        '--benchmark-id',
+        'b1',
+        '--identifier',
+        'ds1',
+        '-n',
+        'Dataset 1',
+        '--json',
+      ]);
+
+      expect(mockTrpcClient.agentEval.createDataset.mutate).toHaveBeenCalledWith(
+        expect.objectContaining({ benchmarkId: 'b1', identifier: 'ds1', name: 'Dataset 1' }),
+      );
+    });
+  });
+
+  // ============================================
+  // TestCase tests
+  // ============================================
+  describe('testcase', () => {
+    it('should list test cases', async () => {
+      mockTrpcClient.agentEval.listTestCases.query.mockResolvedValue({ data: [], total: 0 });
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'testcase',
+        'list',
+        '--dataset-id',
+        'd1',
+        '--json',
+      ]);
+
+      expect(mockTrpcClient.agentEval.listTestCases.query).toHaveBeenCalledWith(
+        expect.objectContaining({ datasetId: 'd1' }),
+      );
+    });
+
+    it('should create a test case', async () => {
+      mockTrpcClient.agentEval.createTestCase.mutate.mockResolvedValue({ id: 'tc1' });
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'testcase',
+        'create',
+        '--dataset-id',
+        'd1',
+        '--input',
+        'What is 2+2?',
+        '--expected',
+        '4',
+      ]);
+
+      expect(mockTrpcClient.agentEval.createTestCase.mutate).toHaveBeenCalledWith(
+        expect.objectContaining({
+          content: expect.objectContaining({ expected: '4', input: 'What is 2+2?' }),
+          datasetId: 'd1',
+        }),
+      );
+    });
+
+    it('should delete a test case', async () => {
+      mockTrpcClient.agentEval.deleteTestCase.mutate.mockResolvedValue({ success: true });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'testcase', 'delete', '--id', 'tc1']);
+
+      expect(mockTrpcClient.agentEval.deleteTestCase.mutate).toHaveBeenCalledWith({ id: 'tc1' });
+    });
+
+    it('should count test cases via external API', async () => {
+      mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'testcase',
+        'count',
+        '--dataset-id',
+        'dataset-1',
+        '--json',
+      ]);
+
+      expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
+        datasetId: 'dataset-1',
+      });
+    });
+  });
+
+  // ============================================
+  // Run tests
+  // ============================================
+  describe('run', () => {
+    it('should list runs', async () => {
+      mockTrpcClient.agentEval.listRuns.query.mockResolvedValue({ data: [], total: 0 });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'run', 'list', '--json']);
+
+      expect(mockTrpcClient.agentEval.listRuns.query).toHaveBeenCalled();
+    });
+
+    it('should get run via internal API', async () => {
+      mockTrpcClient.agentEval.getRunDetails.query.mockResolvedValue({ id: 'r1' });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--id', 'r1', '--json']);
+
+      expect(mockTrpcClient.agentEval.getRunDetails.query).toHaveBeenCalledWith({ id: 'r1' });
+    });
+
+    it('should get run via external API with --external', async () => {
+      mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
        config: { k: 1 },
        datasetId: 'dataset-1',
        id: 'run-1',
-      },
-      error: null,
-      ok: true,
-      version: 'v1',
+      });
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'run',
+        'get',
+        '--id',
+        'run-1',
+        '--external',
+        '--json',
+      ]);
+
+      expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({
+        runId: 'run-1',
+      });
+
+      const payload = JSON.parse(logSpy.mock.calls[0][0]);
+      expect(payload).toEqual({
+        data: { config: { k: 1 }, datasetId: 'dataset-1', id: 'run-1' },
+        error: null,
+        ok: true,
+        version: 'v1',
+      });
+    });
+
+    it('should create a run', async () => {
+      mockTrpcClient.agentEval.createRun.mutate.mockResolvedValue({ id: 'r1' });
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'run',
+        'create',
+        '--dataset-id',
+        'd1',
+        '-n',
+        'Run 1',
+        '--json',
+      ]);
+
+      expect(mockTrpcClient.agentEval.createRun.mutate).toHaveBeenCalledWith(
+        expect.objectContaining({ datasetId: 'd1', name: 'Run 1' }),
+      );
+    });
+
+    it('should start a run', async () => {
+      mockTrpcClient.agentEval.startRun.mutate.mockResolvedValue({ success: true, runId: 'r1' });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'run', 'start', '--id', 'r1']);
+
+      expect(mockTrpcClient.agentEval.startRun.mutate).toHaveBeenCalledWith(
+        expect.objectContaining({ id: 'r1' }),
+      );
+    });
+
+    it('should abort a run', async () => {
+      mockTrpcClient.agentEval.abortRun.mutate.mockResolvedValue({ success: true });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'run', 'abort', '--id', 'r1']);
+
+      expect(mockTrpcClient.agentEval.abortRun.mutate).toHaveBeenCalledWith({ id: 'r1' });
+    });
+
+    it('should get run progress', async () => {
+      mockTrpcClient.agentEval.getRunProgress.query.mockResolvedValue({ status: 'running' });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'run', 'progress', '--id', 'r1', '--json']);
+
+      expect(mockTrpcClient.agentEval.getRunProgress.query).toHaveBeenCalledWith({ id: 'r1' });
+    });
+
+    it('should get run results', async () => {
+      mockTrpcClient.agentEval.getRunResults.query.mockResolvedValue({
+        results: [],
+        runId: 'r1',
+        total: 0,
+      });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'run', 'results', '--id', 'r1', '--json']);
+
+      expect(mockTrpcClient.agentEval.getRunResults.query).toHaveBeenCalledWith({ id: 'r1' });
+    });
+
+    it('should delete a run', async () => {
+      mockTrpcClient.agentEval.deleteRun.mutate.mockResolvedValue({ success: true });
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'run', 'delete', '--id', 'r1']);
+
+      expect(mockTrpcClient.agentEval.deleteRun.mutate).toHaveBeenCalledWith({ id: 'r1' });
+    });
+
+    it('should set run status via external API', async () => {
+      mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
+        runId: 'run-1',
+        status: 'completed',
+        success: true,
+      });
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'run',
+        'set-status',
+        '--id',
+        'run-1',
+        '--status',
+        'completed',
+      ]);
+
+      expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
+        runId: 'run-1',
+        status: 'completed',
+      });
+      expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
    });
  });

-  it('should call datasetGet and output json envelope', async () => {
-    mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
-      id: 'dataset-1',
-      metadata: { preset: 'deepsearchqa' },
+  // ============================================
+  // Run-Topic tests (external eval API)
+  // ============================================
+  describe('run-topic', () => {
+    it('should list run topics', async () => {
+      mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'run-topic',
+        'list',
+        '--run-id',
+        'run-1',
+        '--only-external',
+        '--json',
+      ]);
+
+      expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
+        onlyExternal: true,
+        runId: 'run-1',
+      });
    });

-    const program = createProgram();
-    await program.parseAsync([
-      'node',
-      'test',
-      'eval',
-      'dataset',
-      'get',
-      '--dataset-id',
-      'dataset-1',
-      '--json',
-    ]);
+    it('should report run-topic result', async () => {
+      mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
+        success: true,
+      });

-    expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
-      datasetId: 'dataset-1',
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'run-topic',
+        'report-result',
+        '--run-id',
+        'run-1',
+        '--topic-id',
+        'topic-1',
+        '--thread-id',
+        'thread-1',
+        '--score',
+        '0.91',
+        '--correct',
+        'true',
+        '--result-json',
+        '{"grade":"A"}',
+        '--json',
+      ]);
+
+      expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
+        correct: true,
+        result: { grade: 'A' },
+        runId: 'run-1',
+        score: 0.91,
+        threadId: 'thread-1',
+        topicId: 'topic-1',
+      });
    });
  });

-  it('should pass onlyExternal to runTopicsList', async () => {
-    mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);
+  // ============================================
+  // Eval thread/message tests (external eval API)
+  // ============================================
+  describe('eval thread', () => {
+    it('should list threads by topic', async () => { // `eval thread list` forwards --topic-id to the external router
+      mockTrpcClient.agentEvalExternal.threadsList.query.mockResolvedValue([]);

-    const program = createProgram();
-    await program.parseAsync([
-      'node',
-      'test',
-      'eval',
-      'run-topics',
-      'list',
-      '--run-id',
-      'run-1',
-      '--only-external',
-      '--json',
-    ]);
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'thread',
+        'list',
+        '--topic-id',
+        'topic-1',
+        '--json',
+      ]);

-    expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
-      onlyExternal: true,
-      runId: 'run-1',
+      expect(mockTrpcClient.agentEvalExternal.threadsList.query).toHaveBeenCalledWith({
+        topicId: 'topic-1',
+      });
    });
  });

-  it('should pass topicId and threadId to messagesList', async () => {
-    mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);
+  describe('eval message', () => {
+    it('should list messages by topic and thread', async () => { // `eval message list` forwards both IDs
+      mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);

-    const program = createProgram();
-    await program.parseAsync([
-      'node',
-      'test',
-      'eval',
-      'messages',
-      'list',
-      '--topic-id',
-      'topic-1',
-      '--thread-id',
-      'thread-1',
-      '--json',
-    ]);
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'eval',
+        'message',
+        'list',
+        '--topic-id',
+        'topic-1',
+        '--thread-id',
+        'thread-1',
+        '--json',
+      ]);

-    expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
-      threadId: 'thread-1',
-      topicId: 'topic-1',
+      expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
+        threadId: 'thread-1',
+        topicId: 'topic-1',
+      });
    });
  });

-  it('should parse and report run-topic result', async () => {
-    mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
-      success: true,
+  // ============================================
+  // Error handling
+  // ============================================
+  describe('error handling', () => {
+    it('should output json error envelope when command fails', async () => { // failure path with --json
+      const error = Object.assign(new Error('Run not found'), {
+        data: { code: 'NOT_FOUND' }, // expected to surface as `error.code` in the envelope below
+      });
+      mockTrpcClient.agentEval.getRunDetails.query.mockRejectedValue(error); // `run get` without --external uses the internal router
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--id', 'run-404', '--json']);
+
+      const payload = JSON.parse(logSpy.mock.calls[0][0]); // first argument of the first logSpy call must be valid JSON
+      expect(payload).toEqual({
+        data: null,
+        error: { code: 'NOT_FOUND', message: 'Run not found' },
+        ok: false,
+        version: 'v1',
+      });
+      expect(exitSpy).toHaveBeenCalledWith(1);
    });

-    const program = createProgram();
-    await program.parseAsync([
-      'node',
-      'test',
-      'eval',
-      'run-topic',
-      'report-result',
-      '--run-id',
-      'run-1',
-      '--topic-id',
-      'topic-1',
-      '--thread-id',
-      'thread-1',
-      '--score',
-      '0.91',
-      '--correct',
-      'true',
-      '--result-json',
-      '{"grade":"A"}',
-      '--json',
-    ]);
+    it('should log plain error without --json', async () => { // failure path without --json: message goes to log.error
+      mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));

-    expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
-      correct: true,
-      result: { grade: 'A' },
-      runId: 'run-1',
-      score: 0.91,
-      threadId: 'thread-1',
-      topicId: 'topic-1',
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'eval', 'thread', 'list', '--topic-id', 'topic-1']);
+
+      expect(log.error).toHaveBeenCalledWith('boom');
+      expect(exitSpy).toHaveBeenCalledWith(1);
    });
  });
-
-  it('should update run status', async () => {
-    mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
-      runId: 'run-1',
-      status: 'completed',
-      success: true,
-    });
-
-    const program = createProgram();
-    await program.parseAsync([
-      'node',
-      'test',
-      'eval',
-      'run',
-      'set-status',
-      '--run-id',
-      'run-1',
-      '--status',
-      'completed',
-    ]);
-
-    expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
-      runId: 'run-1',
-      status: 'completed',
-    });
-    expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
-  });
-
-  it('should output json error envelope when command fails', async () => {
-    const error = Object.assign(new Error('Run not found'), {
-      data: { code: 'NOT_FOUND' },
-    });
-    mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);
-
-    const program = createProgram();
-    await program.parseAsync([
-      'node',
-      'test',
-      'eval',
-      'run',
-      'get',
-      '--run-id',
-      'run-404',
-      '--json',
-    ]);
-
-    const payload = JSON.parse(logSpy.mock.calls[0][0]);
-    expect(payload).toEqual({
-      data: null,
-      error: { code: 'NOT_FOUND', message: 'Run not found' },
-      ok: false,
-      version: 'v1',
-    });
-    expect(exitSpy).toHaveBeenCalledWith(1);
-  });
-
-  it('should query test case count', async () => {
-    mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });
-
-    const program = createProgram();
-    await program.parseAsync([
-      'node',
-      'test',
-      'eval',
-      'test-cases',
-      'count',
-      '--dataset-id',
-      'dataset-1',
-      '--json',
-    ]);
-
-    expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
-      datasetId: 'dataset-1',
-    });
-  });
-
-  it('should log plain error without --json', async () => {
-    mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));
-
-    const program = createProgram();
-    await program.parseAsync(['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1']);
-
-    expect(log.error).toHaveBeenCalledWith('boom');
-    expect(exitSpy).toHaveBeenCalledWith(1);
-  });
 });
@@ -23,46 +23,6 @@ interface JsonOption {
  json?: boolean;
 }

-interface RunGetOptions extends JsonOption {
-  runId: string;
-}
-
-interface RunSetStatusOptions extends JsonOption {
-  runId: string;
-  status: 'completed' | 'external';
-}
-
-interface DatasetGetOptions extends JsonOption {
-  datasetId: string;
-}
-
-interface RunTopicsListOptions extends JsonOption {
-  onlyExternal?: boolean;
-  runId: string;
-}
-
-interface ThreadsListOptions extends JsonOption {
-  topicId: string;
-}
-
-interface MessagesListOptions extends JsonOption {
-  threadId?: string;
-  topicId: string;
-}
-
-interface TestCasesCountOptions extends JsonOption {
-  datasetId: string;
-}
-
-interface RunTopicReportResultOptions extends JsonOption {
-  correct: boolean;
-  resultJson: Record<string, unknown>;
-  runId: string;
-  score: number;
-  threadId?: string;
-  topicId: string;
-}
-
 const printJson = (data: unknown) => {
  console.log(JSON.stringify(data, null, 2));
 };
@@ -180,65 +140,587 @@ const executeCommand = async (
 };

 export function registerEvalCommand(program: Command) {
-  const evalCmd = program.command('eval').description('Manage external evaluation workflows');
+  const evalCmd = program.command('eval').description('Manage evaluation workflows');

+  // ============================================
+  // Benchmark Operations
+  // ============================================
+  const benchmarkCmd = evalCmd.command('benchmark').description('Manage evaluation benchmarks');
+
+  // `eval benchmark list`: system benchmarks are included unless explicitly excluded.
+  // `--include-system` alone could never switch the filter off (an absent flag also resolved
+  // to true), so a negating `--no-include-system` is registered next to it. Because the
+  // positive flag is declared first, the option value stays undefined when neither is passed.
+  benchmarkCmd
+    .command('list')
+    .description('List benchmarks')
+    .option('--include-system', 'Include system benchmarks')
+    .option('--no-include-system', 'Exclude system benchmarks')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: JsonOption & { includeSystem?: boolean }) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEval.listBenchmarks.query({
+          // undefined (no flag given) keeps the previous default of true
+          includeSystem: options.includeSystem ?? true,
+        });
+      }),
+    );
+
+  // `eval benchmark get`: look up one benchmark by its ID.
+  benchmarkCmd
+    .command('get')
+    .description('Get benchmark details')
+    .requiredOption('--id <id>', 'Benchmark ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const fetchBenchmark = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.getBenchmark.query({ id: opts.id });
+      };
+      return executeCommand(opts, fetchBenchmark);
+    });
+
+  // `eval benchmark create`: identifier and name are mandatory; description and reference
+  // URL are added to the payload only when a non-empty value was supplied.
+  benchmarkCmd
+    .command('create')
+    .description('Create a benchmark')
+    .requiredOption('--identifier <identifier>', 'Unique identifier')
+    .requiredOption('-n, --name <name>', 'Benchmark name')
+    .option('-d, --description <desc>', 'Description')
+    .option('--reference-url <url>', 'Reference URL')
+    .option('--json', 'Output JSON envelope')
+    .action(
+      async (
+        opts: JsonOption & {
+          description?: string;
+          identifier: string;
+          name: string;
+          referenceUrl?: string;
+        },
+      ) => {
+        const createBenchmark = async () => {
+          const trpc = await getTrpcClient();
+          const payload: Record<string, any> = {
+            identifier: opts.identifier,
+            name: opts.name,
+            ...(opts.description ? { description: opts.description } : {}),
+            ...(opts.referenceUrl ? { referenceUrl: opts.referenceUrl } : {}),
+          };
+          return trpc.agentEval.createBenchmark.mutate(payload as any);
+        };
+        // Third argument is the success message handed to executeCommand.
+        return executeCommand(opts, createBenchmark, `Created benchmark ${pc.bold(opts.name)}`);
+      },
+    );
+
+  // `eval benchmark update`: patch only the fields that were passed on the command line.
+  // Presence is tested with `!== undefined` rather than truthiness so that an explicitly
+  // empty value (e.g. `--description ""`) is forwarded to the server instead of being
+  // silently dropped while the command still reports success.
+  benchmarkCmd
+    .command('update')
+    .description('Update a benchmark')
+    .requiredOption('--id <id>', 'Benchmark ID')
+    .option('-n, --name <name>', 'New name')
+    .option('-d, --description <desc>', 'New description')
+    .option('--reference-url <url>', 'New reference URL')
+    .option('--json', 'Output JSON envelope')
+    .action(
+      async (
+        options: JsonOption & {
+          description?: string;
+          id: string;
+          name?: string;
+          referenceUrl?: string;
+        },
+      ) =>
+        executeCommand(
+          options,
+          async () => {
+            const client = await getTrpcClient();
+            const input: Record<string, any> = { id: options.id };
+            if (options.name !== undefined) input.name = options.name;
+            if (options.description !== undefined) input.description = options.description;
+            if (options.referenceUrl !== undefined) input.referenceUrl = options.referenceUrl;
+            return client.agentEval.updateBenchmark.mutate(input as any);
+          },
+          `Updated benchmark ${pc.bold(options.id)}`,
+        ),
+    );
+
+  // `eval benchmark delete`: remove a benchmark by ID.
+  benchmarkCmd
+    .command('delete')
+    .description('Delete a benchmark')
+    .requiredOption('--id <id>', 'Benchmark ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const removeBenchmark = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.deleteBenchmark.mutate({ id: opts.id });
+      };
+      return executeCommand(opts, removeBenchmark, `Deleted benchmark ${pc.bold(opts.id)}`);
+    });
+
+  // ============================================
+  // Dataset Operations
+  // ============================================
+  const datasetCmd = evalCmd.command('dataset').description('Manage evaluation datasets');
+
+  // `eval dataset list`: without --benchmark-id the query is issued with no input at all.
+  datasetCmd
+    .command('list')
+    .description('List datasets')
+    .option('--benchmark-id <id>', 'Filter by benchmark ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { benchmarkId?: string }) => {
+      const listDatasets = async () => {
+        const trpc = await getTrpcClient();
+        const filter = opts.benchmarkId ? { benchmarkId: opts.benchmarkId } : undefined;
+        return trpc.agentEval.listDatasets.query(filter);
+      };
+      return executeCommand(opts, listDatasets);
+    });
+
+  // `eval dataset get`: --external switches to the external eval router, which takes the
+  // same ID under the `datasetId` key instead of `id`.
+  datasetCmd
+    .command('get')
+    .description('Get dataset details (use --external for external eval API)')
+    .requiredOption('--id <id>', 'Dataset ID')
+    .option('--external', 'Use external evaluation API')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { external?: boolean; id: string }) => {
+      const fetchDataset = async () => {
+        const trpc = await getTrpcClient();
+        return opts.external
+          ? trpc.agentEvalExternal.datasetGet.query({ datasetId: opts.id })
+          : trpc.agentEval.getDataset.query({ id: opts.id });
+      };
+      return executeCommand(opts, fetchDataset);
+    });
+
+  // `eval dataset create`: benchmark ID, identifier and name are mandatory; description and
+  // eval mode are added to the payload only when a non-empty value was supplied.
+  datasetCmd
+    .command('create')
+    .description('Create a dataset')
+    .requiredOption('--benchmark-id <id>', 'Benchmark ID')
+    .requiredOption('--identifier <identifier>', 'Unique identifier')
+    .requiredOption('-n, --name <name>', 'Dataset name')
+    .option('-d, --description <desc>', 'Description')
+    .option('--eval-mode <mode>', 'Evaluation mode')
+    .option('--json', 'Output JSON envelope')
+    .action(
+      async (
+        opts: JsonOption & {
+          benchmarkId: string;
+          description?: string;
+          evalMode?: string;
+          identifier: string;
+          name: string;
+        },
+      ) => {
+        const createDataset = async () => {
+          const trpc = await getTrpcClient();
+          const payload: Record<string, any> = {
+            benchmarkId: opts.benchmarkId,
+            identifier: opts.identifier,
+            name: opts.name,
+            ...(opts.description ? { description: opts.description } : {}),
+            ...(opts.evalMode ? { evalMode: opts.evalMode } : {}),
+          };
+          return trpc.agentEval.createDataset.mutate(payload as any);
+        };
+        return executeCommand(opts, createDataset, `Created dataset ${pc.bold(opts.name)}`);
+      },
+    );
+
+  // `eval dataset update`: patch only the fields that were passed on the command line.
+  // Presence is tested with `!== undefined` rather than truthiness so that an explicitly
+  // empty value (e.g. `--description ""`) is forwarded to the server instead of being
+  // silently dropped while the command still reports success.
+  datasetCmd
+    .command('update')
+    .description('Update a dataset')
+    .requiredOption('--id <id>', 'Dataset ID')
+    .option('-n, --name <name>', 'New name')
+    .option('-d, --description <desc>', 'New description')
+    .option('--eval-mode <mode>', 'New evaluation mode')
+    .option('--json', 'Output JSON envelope')
+    .action(
+      async (
+        options: JsonOption & {
+          description?: string;
+          evalMode?: string;
+          id: string;
+          name?: string;
+        },
+      ) =>
+        executeCommand(
+          options,
+          async () => {
+            const client = await getTrpcClient();
+            const input: Record<string, any> = { id: options.id };
+            if (options.name !== undefined) input.name = options.name;
+            if (options.description !== undefined) input.description = options.description;
+            if (options.evalMode !== undefined) input.evalMode = options.evalMode;
+            return client.agentEval.updateDataset.mutate(input as any);
+          },
+          `Updated dataset ${pc.bold(options.id)}`,
+        ),
+    );
+
+  // `eval dataset delete`: remove a dataset by ID.
+  datasetCmd
+    .command('delete')
+    .description('Delete a dataset')
+    .requiredOption('--id <id>', 'Dataset ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const removeDataset = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.deleteDataset.mutate({ id: opts.id });
+      };
+      return executeCommand(opts, removeDataset, `Deleted dataset ${pc.bold(opts.id)}`);
+    });
+
+  // ============================================
+  // TestCase Operations
+  // ============================================
+  const testcaseCmd = evalCmd.command('testcase').description('Manage evaluation test cases');
+
+  // `eval testcase list`: paged listing; --limit / --offset arrive as strings and are
+  // parsed as base-10 integers before being sent.
+  testcaseCmd
+    .command('list')
+    .description('List test cases')
+    .requiredOption('--dataset-id <id>', 'Dataset ID')
+    .option('-L, --limit <n>', 'Page size', '50')
+    .option('--offset <n>', 'Offset', '0')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { datasetId: string; limit?: string; offset?: string }) => {
+      const listTestCases = async () => {
+        const trpc = await getTrpcClient();
+        const limit = Number.parseInt(opts.limit || '50', 10);
+        const offset = Number.parseInt(opts.offset || '0', 10);
+        return trpc.agentEval.listTestCases.query({ datasetId: opts.datasetId, limit, offset });
+      };
+      return executeCommand(opts, listTestCases);
+    });
+
+  // `eval testcase get`: look up one test case by its ID.
+  testcaseCmd
+    .command('get')
+    .description('Get test case details')
+    .requiredOption('--id <id>', 'Test case ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const fetchTestCase = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.getTestCase.query({ id: opts.id });
+      };
+      return executeCommand(opts, fetchTestCase);
+    });
+
+  // `eval testcase create`: input text, expected output and category are grouped under
+  // `content`; dataset ID and the optional numeric sort order sit beside it.
+  testcaseCmd
+    .command('create')
+    .description('Create a test case')
+    .requiredOption('--dataset-id <id>', 'Dataset ID')
+    .requiredOption('--input <text>', 'Input text')
+    .option('--expected <text>', 'Expected output')
+    .option('--category <cat>', 'Category')
+    .option('--sort-order <n>', 'Sort order')
+    .option('--json', 'Output JSON envelope')
+    .action(
+      async (
+        opts: JsonOption & {
+          category?: string;
+          datasetId: string;
+          expected?: string;
+          input: string;
+          sortOrder?: string;
+        },
+      ) => {
+        const createTestCase = async () => {
+          const trpc = await getTrpcClient();
+          const content: Record<string, any> = {
+            input: opts.input,
+            ...(opts.expected ? { expected: opts.expected } : {}),
+            ...(opts.category ? { category: opts.category } : {}),
+          };
+          const payload: Record<string, any> = {
+            content,
+            datasetId: opts.datasetId,
+            ...(opts.sortOrder ? { sortOrder: Number.parseInt(opts.sortOrder, 10) } : {}),
+          };
+          return trpc.agentEval.createTestCase.mutate(payload as any);
+        };
+        return executeCommand(opts, createTestCase, 'Created test case');
+      },
+    );
+
+  // `eval testcase update`: patch only what was passed on the command line. Text fields are
+  // tested with `!== undefined` rather than truthiness so that an explicitly empty value
+  // (e.g. `--expected ""`) is forwarded to the server instead of being silently dropped
+  // while the command still reports success. `content` is sent only when at least one of
+  // its fields was given.
+  testcaseCmd
+    .command('update')
+    .description('Update a test case')
+    .requiredOption('--id <id>', 'Test case ID')
+    .option('--input <text>', 'New input text')
+    .option('--expected <text>', 'New expected output')
+    .option('--category <cat>', 'New category')
+    .option('--sort-order <n>', 'New sort order')
+    .option('--json', 'Output JSON envelope')
+    .action(
+      async (
+        options: JsonOption & {
+          category?: string;
+          expected?: string;
+          id: string;
+          input?: string;
+          sortOrder?: string;
+        },
+      ) =>
+        executeCommand(
+          options,
+          async () => {
+            const client = await getTrpcClient();
+            const input: Record<string, any> = { id: options.id };
+            const content: Record<string, any> = {};
+            if (options.input !== undefined) content.input = options.input;
+            if (options.expected !== undefined) content.expected = options.expected;
+            if (options.category !== undefined) content.category = options.category;
+            if (Object.keys(content).length > 0) input.content = content;
+            // Kept as a truthiness check: an empty string has no integer value to send.
+            if (options.sortOrder) input.sortOrder = Number.parseInt(options.sortOrder, 10);
+            return client.agentEval.updateTestCase.mutate(input as any);
+          },
+          `Updated test case ${pc.bold(options.id)}`,
+        ),
+    );
+
+  // `eval testcase delete`: remove a test case by ID.
+  testcaseCmd
+    .command('delete')
+    .description('Delete a test case')
+    .requiredOption('--id <id>', 'Test case ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const removeTestCase = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.deleteTestCase.mutate({ id: opts.id });
+      };
+      return executeCommand(opts, removeTestCase, `Deleted test case ${pc.bold(opts.id)}`);
+    });
+
+  // `eval testcase count`: served by the external eval router, not `agentEval`.
+  testcaseCmd
+    .command('count')
+    .description('Count test cases by dataset (external eval API)')
+    .requiredOption('--dataset-id <id>', 'Dataset ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { datasetId: string }) => {
+      const countTestCases = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEvalExternal.testCasesCount.query({ datasetId: opts.datasetId });
+      };
+      return executeCommand(opts, countTestCases);
+    });
+
+  // ============================================
+  // Run Operations
+  // ============================================
  const runCmd = evalCmd.command('run').description('Manage evaluation runs');

  runCmd
-    .command('get')
-    .description('Get run information')
-    .requiredOption('--run-id <id>', 'Run ID')
+    .command('list') // `eval run list`: every filter is optional
+    .description('List evaluation runs')
+    .option('--benchmark-id <id>', 'Filter by benchmark ID')
+    .option('--dataset-id <id>', 'Filter by dataset ID')
+    .option('--status <status>', 'Filter by status')
+    .option('-L, --limit <n>', 'Page size', '50')
+    .option('--offset <n>', 'Offset', '0')
    .option('--json', 'Output JSON envelope')
-    .action(async (options: RunGetOptions) =>
+    .action(
+      async (
+        options: JsonOption & {
+          benchmarkId?: string;
+          datasetId?: string;
+          limit?: string;
+          offset?: string;
+          status?: string;
+        },
+      ) =>
+        executeCommand(options, async () => {
+          const client = await getTrpcClient();
+          const input: Record<string, any> = {};
+          if (options.benchmarkId) input.benchmarkId = options.benchmarkId; // unset or empty filters are left out of the query
+          if (options.datasetId) input.datasetId = options.datasetId;
+          if (options.status) input.status = options.status;
+          input.limit = Number.parseInt(options.limit || '50', 10); // option values are strings; parsed as base-10 integers
+          input.offset = Number.parseInt(options.offset || '0', 10);
+          return client.agentEval.listRuns.query(input as any);
+        }),
+    );
+
+  runCmd
+    .command('get') // `eval run get`: internal router by default, external router with --external
+    .description('Get run details (use --external for external eval API)')
+    .requiredOption('--id <id>', 'Run ID')
+    .option('--external', 'Use external evaluation API')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: JsonOption & { external?: boolean; id: string }) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
-        return client.agentEvalExternal.runGet.query({ runId: options.runId });
+        if (options.external) {
+          return client.agentEvalExternal.runGet.query({ runId: options.id }); // external router takes the ID as `runId`
+        }
+        return client.agentEval.getRunDetails.query({ id: options.id }); // internal router takes the ID as `id`
+      }),
+    );
+
+  // `eval run create`: --agent-id is sent as `targetAgentId`; the numeric tuning flags are
+  // parsed as base-10 integers and grouped under `config`, which is omitted when none is set.
+  runCmd
+    .command('create')
+    .description('Create an evaluation run')
+    .requiredOption('--dataset-id <id>', 'Dataset ID')
+    .option('--agent-id <id>', 'Target agent ID')
+    .option('-n, --name <name>', 'Run name')
+    .option('--k <n>', 'Number of runs per test case (1-10)')
+    .option('--max-concurrency <n>', 'Max concurrency (1-10)')
+    .option('--max-steps <n>', 'Max steps (1-1000)')
+    .option('--timeout <ms>', 'Timeout in ms (60000-3600000)')
+    .option('--json', 'Output JSON envelope')
+    .action(
+      async (
+        opts: JsonOption & {
+          agentId?: string;
+          datasetId: string;
+          k?: string;
+          maxConcurrency?: string;
+          maxSteps?: string;
+          name?: string;
+          timeout?: string;
+        },
+      ) => {
+        const createRun = async () => {
+          const trpc = await getTrpcClient();
+          const payload: Record<string, any> = {
+            datasetId: opts.datasetId,
+            ...(opts.agentId ? { targetAgentId: opts.agentId } : {}),
+            ...(opts.name ? { name: opts.name } : {}),
+          };
+
+          // Listed in the order the keys should appear in `config`.
+          const numericSettings: Array<[string, string | undefined]> = [
+            ['k', opts.k],
+            ['maxConcurrency', opts.maxConcurrency],
+            ['maxSteps', opts.maxSteps],
+            ['timeout', opts.timeout],
+          ];
+          const config: Record<string, any> = {};
+          for (const [key, raw] of numericSettings) {
+            if (raw) config[key] = Number.parseInt(raw, 10);
+          }
+          if (Object.keys(config).length > 0) payload.config = config;
+
+          return trpc.agentEval.createRun.mutate(payload as any);
+        };
+        return executeCommand(opts, createRun, 'Created evaluation run');
+      },
+    );
+
+  // `eval run delete`: remove an evaluation run by ID.
+  runCmd
+    .command('delete')
+    .description('Delete an evaluation run')
+    .requiredOption('--id <id>', 'Run ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const removeRun = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.deleteRun.mutate({ id: opts.id });
+      };
+      return executeCommand(opts, removeRun, `Deleted run ${pc.bold(opts.id)}`);
+    });
+
+  // `eval run start`: --force is passed through unchanged (undefined when not given).
+  runCmd
+    .command('start')
+    .description('Start an evaluation run')
+    .requiredOption('--id <id>', 'Run ID')
+    .option('--force', 'Force restart even if already running')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { force?: boolean; id: string }) => {
+      const startRun = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.startRun.mutate({ id: opts.id, force: opts.force });
+      };
+      return executeCommand(opts, startRun, `Started run ${pc.bold(opts.id)}`);
+    });
+
+  // `eval run abort`: request that a run be aborted, by ID.
+  runCmd
+    .command('abort')
+    .description('Abort a running evaluation')
+    .requiredOption('--id <id>', 'Run ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const abortRun = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.abortRun.mutate({ id: opts.id });
+      };
+      return executeCommand(opts, abortRun, `Aborted run ${pc.bold(opts.id)}`);
+    });
+
+  // `eval run retry-errors`: ask the server to retry the failed cases of a run.
+  runCmd
+    .command('retry-errors')
+    .description('Retry failed test cases in a run')
+    .requiredOption('--id <id>', 'Run ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const retryErrors = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.retryRunErrors.mutate({ id: opts.id });
+      };
+      return executeCommand(opts, retryErrors, `Retrying errors for run ${pc.bold(opts.id)}`);
+    });
+
+  // `eval run progress`: read-only progress query for one run.
+  runCmd
+    .command('progress')
+    .description('Get run progress')
+    .requiredOption('--id <id>', 'Run ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (opts: JsonOption & { id: string }) => {
+      const fetchProgress = async () => {
+        const trpc = await getTrpcClient();
+        return trpc.agentEval.getRunProgress.query({ id: opts.id });
+      };
+      return executeCommand(opts, fetchProgress);
+    });
+
+  runCmd
+    .command('results') // `eval run results`: read-only results query for one run
+    .description('Get run results')
+    .requiredOption('--id <id>', 'Run ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: JsonOption & { id: string }) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEval.getRunResults.query({ id: options.id });
      }),
    );

  runCmd
    .command('set-status')
-    .description('Set run status (external API supports completed or external)')
-    .requiredOption('--run-id <id>', 'Run ID')
+    .description('Set run status (external eval API, supports completed or external)')
+    .requiredOption('--id <id>', 'Run ID') // replaces the former --run-id flag
    .requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
    .option('--json', 'Output JSON envelope')
-    .action(async (options: RunSetStatusOptions) =>
+    .action(async (options: JsonOption & { id: string; status: 'completed' | 'external' }) =>
      executeCommand(
        options,
        async () => {
          const client = await getTrpcClient();
          return client.agentEvalExternal.runSetStatus.mutate({
-            runId: options.runId,
+            runId: options.id, // the external router still names this field `runId`
            status: options.status,
          });
        },
-        `Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`,
+        `Run ${pc.bold(options.id)} status updated to ${pc.bold(options.status)}`,
      ),
    );

-  evalCmd
-    .command('dataset')
-    .description('Manage evaluation datasets')
-    .command('get')
-    .description('Get dataset information')
-    .requiredOption('--dataset-id <id>', 'Dataset ID')
-    .option('--json', 'Output JSON envelope')
-    .action(async (options: DatasetGetOptions) =>
-      executeCommand(options, async () => {
-        const client = await getTrpcClient();
-        return client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId });
-      }),
-    );
+  // ============================================
+  // Run-Topic Operations (external eval API)
+  // ============================================
+  const runTopicCmd = evalCmd.command('run-topic').description('Manage evaluation run topics');

-  evalCmd
-    .command('run-topics')
-    .description('Manage run topics')
+  runTopicCmd
    .command('list')
    .description('List topics in a run')
    .requiredOption('--run-id <id>', 'Run ID')
    .option('--only-external', 'Only return topics pending external evaluation')
    .option('--json', 'Output JSON envelope')
-    .action(async (options: RunTopicsListOptions) =>
+    .action(async (options: JsonOption & { onlyExternal?: boolean; runId: string }) =>
      executeCommand(options, async () => {
        const client = await getTrpcClient();
        return client.agentEvalExternal.runTopicsList.query({
@@ -248,55 +730,7 @@ export function registerEvalCommand(program: Command) {
      }),
    );

-  evalCmd
-    .command('threads')
-    .description('Manage evaluation threads')
-    .command('list')
-    .description('List threads by topic')
-    .requiredOption('--topic-id <id>', 'Topic ID')
-    .option('--json', 'Output JSON envelope')
-    .action(async (options: ThreadsListOptions) =>
-      executeCommand(options, async () => {
-        const client = await getTrpcClient();
-        return client.agentEvalExternal.threadsList.query({ topicId: options.topicId });
-      }),
-    );
-
-  evalCmd
-    .command('messages')
-    .description('Manage evaluation messages')
-    .command('list')
-    .description('List messages by topic and optional thread')
-    .requiredOption('--topic-id <id>', 'Topic ID')
-    .option('--thread-id <id>', 'Thread ID')
-    .option('--json', 'Output JSON envelope')
-    .action(async (options: MessagesListOptions) =>
-      executeCommand(options, async () => {
-        const client = await getTrpcClient();
-        return client.agentEvalExternal.messagesList.query({
-          threadId: options.threadId,
-          topicId: options.topicId,
-        });
-      }),
-    );
-
-  evalCmd
-    .command('test-cases')
-    .description('Manage evaluation test cases')
-    .command('count')
-    .description('Count test cases by dataset')
-    .requiredOption('--dataset-id <id>', 'Dataset ID')
-    .option('--json', 'Output JSON envelope')
-    .action(async (options: TestCasesCountOptions) =>
-      executeCommand(options, async () => {
-        const client = await getTrpcClient();
-        return client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
-      }),
-    );
-
-  evalCmd
-    .command('run-topic')
-    .description('Manage evaluation run-topic reporting')
+  runTopicCmd
    .command('report-result')
    .description('Report one evaluation result for a run topic')
    .requiredOption('--run-id <id>', 'Run ID')
@@ -306,21 +740,69 @@ export function registerEvalCommand(program: Command) {
    .requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
    .requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
    .option('--json', 'Output JSON envelope')
-    .action(async (options: RunTopicReportResultOptions) =>
-      executeCommand(
-        options,
-        async () => {
-          const client = await getTrpcClient();
-          return client.agentEvalExternal.runTopicReportResult.mutate({
-            correct: options.correct,
-            result: options.resultJson,
-            runId: options.runId,
-            score: options.score,
-            threadId: options.threadId,
-            topicId: options.topicId,
-          });
+    .action(
+      async (
+        options: JsonOption & {
+          correct: boolean;
+          resultJson: Record<string, unknown>;
+          runId: string;
+          score: number;
+          threadId?: string;
+          topicId: string;
        },
-        `Reported result for topic ${pc.bold(options.topicId)}`,
-      ),
+      ) =>
+        executeCommand(
+          options,
+          async () => {
+            const client = await getTrpcClient();
+            return client.agentEvalExternal.runTopicReportResult.mutate({
+              correct: options.correct,
+              result: options.resultJson,
+              runId: options.runId,
+              score: options.score,
+              threadId: options.threadId,
+              topicId: options.topicId,
+            });
+          },
+          `Reported result for topic ${pc.bold(options.topicId)}`,
+        ),
+    );
+
+  // ============================================
+  // Eval Thread Operations (external eval API)
+  // ============================================
+  evalCmd
+    .command('thread')
+    .description('Manage evaluation threads')
+    .command('list')
+    .description('List threads by topic')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: JsonOption & { topicId: string }) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.threadsList.query({ topicId: options.topicId });
+      }),
+    );
+
+  // ============================================
+  // Eval Message Operations (external eval API)
+  // ============================================
+  evalCmd
+    .command('message')
+    .description('Manage evaluation messages')
+    .command('list')
+    .description('List messages by topic and optional thread')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--thread-id <id>', 'Thread ID')
+    .option('--json', 'Output JSON envelope')
+    .action(async (options: JsonOption & { threadId?: string; topicId: string }) =>
+      executeCommand(options, async () => {
+        const client = await getTrpcClient();
+        return client.agentEvalExternal.messagesList.query({
+          threadId: options.threadId,
+          topicId: options.topicId,
+        });
+      }),
    );
 }
@@ -0,0 +1,121 @@
+import { Command } from 'commander';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+import { registerThreadCommand } from './thread';
+
+const { mockTrpcClient } = vi.hoisted(() => ({ // fake tRPC client; each procedure below is a bare vi.fn()
+  mockTrpcClient: {
+    thread: {
+      getThread: { query: vi.fn() },
+      getThreads: { query: vi.fn() },
+      removeThread: { mutate: vi.fn() },
+    },
+  },
+}));
+
+const { getTrpcClient: mockGetTrpcClient } = vi.hoisted(() => ({ // built with vi.hoisted because the vi.mock factory below references it
+  getTrpcClient: vi.fn(),
+}));
+
+vi.mock('../api/client', () => ({ getTrpcClient: mockGetTrpcClient })); // the module's getTrpcClient becomes the mock above
+vi.mock('../utils/logger', () => ({ // logger functions replaced by vi.fn() stubs
+  log: { debug: vi.fn(), error: vi.fn(), info: vi.fn(), warn: vi.fn() },
+  setVerbose: vi.fn(),
+}));
+
+describe('thread command', () => { // drives list, list-all and delete through a Commander program built by createProgram()
+  let exitSpy: ReturnType<typeof vi.spyOn>;
+  let consoleSpy: ReturnType<typeof vi.spyOn>;
+
+  beforeEach(() => {
+    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any); // process.exit becomes a no-op
+    consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); // silence console.log while recording its calls
+    mockGetTrpcClient.mockResolvedValue(mockTrpcClient); // getTrpcClient() resolves to the fake client
+    for (const method of Object.values(mockTrpcClient.thread)) { // reset every query/mutate mock before each test
+      for (const fn of Object.values(method)) {
+        (fn as ReturnType<typeof vi.fn>).mockReset();
+      }
+    }
+  });
+
+  afterEach(() => {
+    exitSpy.mockRestore();
+    consoleSpy.mockRestore();
+  });
+
+  function createProgram() { // fresh Command per test with the thread commands registered on it
+    const program = new Command();
+    program.exitOverride(); // Commander throws instead of calling process.exit
+    registerThreadCommand(program);
+    return program;
+  }
+
+  describe('list', () => {
+    it('should list threads by topic', async () => {
+      mockTrpcClient.thread.getThreads.query.mockResolvedValue([
+        { id: 't1', title: 'Thread 1', type: 'standalone' },
+      ]);
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'thread', 'list', '--topic-id', 'topic1']);
+
+      expect(mockTrpcClient.thread.getThreads.query).toHaveBeenCalledWith({ topicId: 'topic1' });
+    });
+
+    it('should show empty message when no threads', async () => {
+      mockTrpcClient.thread.getThreads.query.mockResolvedValue([]);
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'thread', 'list', '--topic-id', 'topic1']);
+
+      expect(consoleSpy).toHaveBeenCalledWith('No threads found.'); // exact text printed for an empty result
+    });
+  });
+
+  describe('list-all', () => {
+    it('should list all threads', async () => {
+      mockTrpcClient.thread.getThread.query.mockResolvedValue([ // list-all is backed by getThread (singular), not getThreads
+        { id: 't1', title: 'Thread 1', type: 'standalone' },
+      ]);
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'thread', 'list-all']);
+
+      expect(mockTrpcClient.thread.getThread.query).toHaveBeenCalled();
+    });
+  });
+
+  describe('delete', () => {
+    it('should delete a thread', async () => {
+      mockTrpcClient.thread.removeThread.mutate.mockResolvedValue({});
+
+      const program = createProgram();
+      await program.parseAsync(['node', 'test', 'thread', 'delete', 't1', '--yes']); // --yes: no confirmation prompt
+
+      expect(mockTrpcClient.thread.removeThread.mutate).toHaveBeenCalledWith({
+        id: 't1',
+        removeChildren: undefined, // --remove-children was not passed
+      });
+    });
+
+    it('should delete with remove-children flag', async () => {
+      mockTrpcClient.thread.removeThread.mutate.mockResolvedValue({});
+
+      const program = createProgram();
+      await program.parseAsync([
+        'node',
+        'test',
+        'thread',
+        'delete',
+        't1',
+        '--remove-children',
+        '--yes',
+      ]);
+
+      expect(mockTrpcClient.thread.removeThread.mutate).toHaveBeenCalledWith({
+        id: 't1',
+        removeChildren: true,
+      });
+    });
+  });
+});
@@ -0,0 +1,99 @@
+import type { Command } from 'commander';
+import pc from 'picocolors';
+
+import { getTrpcClient } from '../api/client';
+import { confirm, outputJson, printTable, timeAgo, truncate } from '../utils/format';
+
+export function registerThreadCommand(program: Command) { // adds the `thread` command group (list, list-all, delete) to `program`
+  const thread = program.command('thread').description('Manage message threads');
+
+  // ── list: threads under one topic (--topic-id is required) ──
+
+  thread
+    .command('list')
+    .description('List threads by topic')
+    .requiredOption('--topic-id <id>', 'Topic ID')
+    .option('--json [fields]', 'Output JSON, optionally specify fields (comma-separated)')
+    .action(async (options: { json?: string | boolean; topicId: string }) => {
+      const client = await getTrpcClient();
+      const result = await client.thread.getThreads.query({ topicId: options.topicId });
+      const items = Array.isArray(result) ? result : []; // a non-array result is treated as "no threads"
+
+      if (options.json !== undefined) { // --json was passed, with or without a field list
+        const fields = typeof options.json === 'string' ? options.json : undefined; // only a string value is forwarded as the field list
+        outputJson(items, fields);
+        return;
+      }
+
+      if (items.length === 0) {
+        console.log('No threads found.');
+        return;
+      }
+
+      const rows = items.map((t: any) => [ // one table row per thread; falsy fields fall back to '' (title to 'Untitled')
+        t.id || '',
+        truncate(t.title || 'Untitled', 50),
+        t.type || '',
+        t.updatedAt ? timeAgo(t.updatedAt) : '',
+      ]);
+
+      printTable(rows, ['ID', 'TITLE', 'TYPE', 'UPDATED']);
+    });
+
+  // ── list-all: same output as list plus a TOPIC column; takes no filter ──
+
+  thread
+    .command('list-all')
+    .description('List all threads for the current user')
+    .option('--json [fields]', 'Output JSON, optionally specify fields (comma-separated)')
+    .action(async (options: { json?: string | boolean }) => {
+      const client = await getTrpcClient();
+      const result = await client.thread.getThread.query(); // called without arguments
+      const items = Array.isArray(result) ? result : []; // a non-array result is treated as "no threads"
+
+      if (options.json !== undefined) { // --json was passed, with or without a field list
+        const fields = typeof options.json === 'string' ? options.json : undefined; // only a string value is forwarded as the field list
+        outputJson(items, fields);
+        return;
+      }
+
+      if (items.length === 0) {
+        console.log('No threads found.');
+        return;
+      }
+
+      const rows = items.map((t: any) => [ // one table row per thread; falsy fields fall back to '' (title to 'Untitled')
+        t.id || '',
+        truncate(t.title || 'Untitled', 50),
+        t.type || '',
+        t.topicId || '',
+        t.updatedAt ? timeAgo(t.updatedAt) : '',
+      ]);
+
+      printTable(rows, ['ID', 'TITLE', 'TYPE', 'TOPIC', 'UPDATED']);
+    });
+
+  // ── delete: asks for confirmation unless --yes is passed ──
+
+  thread
+    .command('delete <id>')
+    .description('Delete a thread')
+    .option('--remove-children', 'Also remove child messages')
+    .option('--yes', 'Skip confirmation prompt')
+    .action(async (id: string, options: { removeChildren?: boolean; yes?: boolean }) => {
+      if (!options.yes) { // prompt only when --yes was not given
+        const confirmed = await confirm('Are you sure you want to delete this thread?');
+        if (!confirmed) {
+          console.log('Cancelled.');
+          return; // declined: leave before any API call is made
+        }
+      }
+
+      const client = await getTrpcClient();
+      await client.thread.removeThread.mutate({
+        id,
+        removeChildren: options.removeChildren, // forwarded as-is; undefined when the flag is absent
+      });
+      console.log(`${pc.green('✓')} Deleted thread ${pc.bold(id)}`);
+    });
+}
@@ -25,6 +25,7 @@ import { registerSearchCommand } from './commands/search';
 import { registerSessionGroupCommand } from './commands/session-group';
 import { registerSkillCommand } from './commands/skill';
 import { registerStatusCommand } from './commands/status';
+import { registerThreadCommand } from './commands/thread';
 import { registerTopicCommand } from './commands/topic';

 const require = createRequire(import.meta.url);
@@ -54,6 +55,7 @@ registerGenerateCommand(program);
 registerFileCommand(program);
 registerSkillCommand(program);
 registerSessionGroupCommand(program);
+registerThreadCommand(program);
 registerTopicCommand(program);
 registerMessageCommand(program);
 registerModelCommand(program);