Compare commits

...

3 Commits

Author SHA1 Message Date
arvinxx 4a2c304eb5 ♻️ refactor(cli): merge external eval commands into unified tree with --external flag
Remove separate `eval ext` namespace; use `--external` flag on overlapping commands
(dataset get, run get) and integrate external-only commands directly into the tree.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 01:36:18 +08:00
arvinxx 2a336ddf20 💄 style(cli): rename eval irun to run since external moved to ext namespace
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 01:26:18 +08:00
arvinxx a17cbf9bb7 feat(cli): CLI Phase 5 - agent KB/file/pin, thread management, eval expansion
- Add agent subcommands: pin/unpin, kb-files, add-file/remove-file/toggle-file, add-kb/remove-kb/toggle-kb
- Create thread command with list/list-all/delete subcommands
- Expand eval with internal benchmark/dataset/testcase/irun management
- Move existing external eval commands under `eval ext` namespace
- Add comprehensive unit tests for all new functionality

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 01:07:10 +08:00
10 changed files with 1715 additions and 319 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@lobehub/cli",
"version": "0.0.1-canary.11",
"version": "0.0.1-canary.12",
"type": "module",
"bin": {
"lh": "./dist/index.js",
+7
View File
@@ -16,6 +16,13 @@ let _client: TrpcClient | undefined;
let _toolsClient: ToolsTrpcClient | undefined;
async function getAuthAndServer() {
// LOBEHUB_JWT + LOBEHUB_SERVER env vars (used by server-side sandbox execution)
const envJwt = process.env.LOBEHUB_JWT;
if (envJwt) {
const serverUrl = process.env.LOBEHUB_SERVER || OFFICIAL_SERVER_URL;
return { accessToken: envJwt, serverUrl: serverUrl.replace(/\/$/, '') };
}
const result = await getValidToken();
if (!result) {
log.error("No authentication found. Run 'lh login' first.");
+12
View File
@@ -29,6 +29,18 @@ function parseJwtSub(token: string): string | undefined {
* Exits the process if no token can be resolved.
*/
export async function resolveToken(options: ResolveTokenOptions): Promise<ResolvedAuth> {
// LOBEHUB_JWT env var takes highest priority (used by server-side sandbox execution)
const envJwt = process.env.LOBEHUB_JWT;
if (envJwt) {
const userId = parseJwtSub(envJwt);
if (!userId) {
log.error('Could not extract userId from LOBEHUB_JWT.');
process.exit(1);
}
log.debug('Using LOBEHUB_JWT from environment');
return { token: envJwt, userId };
}
// Otherwise, an explicitly provided token is used next (LOBEHUB_JWT above wins over it)
if (options.token) {
const userId = parseJwtSub(options.token);
+160
View File
@@ -8,12 +8,20 @@ const { mockTrpcClient } = vi.hoisted(() => ({
mockTrpcClient: {
agent: {
createAgent: { mutate: vi.fn() },
createAgentFiles: { mutate: vi.fn() },
createAgentKnowledgeBase: { mutate: vi.fn() },
deleteAgentFile: { mutate: vi.fn() },
deleteAgentKnowledgeBase: { mutate: vi.fn() },
duplicateAgent: { mutate: vi.fn() },
getAgentConfigById: { query: vi.fn() },
getBuiltinAgent: { query: vi.fn() },
getKnowledgeBasesAndFiles: { query: vi.fn() },
queryAgents: { query: vi.fn() },
removeAgent: { mutate: vi.fn() },
toggleFile: { mutate: vi.fn() },
toggleKnowledgeBase: { mutate: vi.fn() },
updateAgentConfig: { mutate: vi.fn() },
updateAgentPinned: { mutate: vi.fn() },
},
aiAgent: {
execAgent: { mutate: vi.fn() },
@@ -403,6 +411,158 @@ describe('agent command', () => {
});
});
describe('pin/unpin', () => {
  // Runs `agent <subcommand> a1` and verifies the `pinned` flag sent to
  // agent.updateAgentPinned. Both subcommands share this mutation.
  const expectPinnedCall = async (subcommand: 'pin' | 'unpin', pinned: boolean) => {
    const { mutate } = mockTrpcClient.agent.updateAgentPinned;
    mutate.mockResolvedValue({});

    await createProgram().parseAsync(['node', 'test', 'agent', subcommand, 'a1']);

    expect(mutate).toHaveBeenCalledWith({ id: 'a1', pinned });
  };

  it('should pin an agent', () => expectPinnedCall('pin', true));

  it('should unpin an agent', () => expectPinnedCall('unpin', false));
});
describe('kb-files', () => {
  // Shared argv for `agent kb-files a1`.
  const argv = ['node', 'test', 'agent', 'kb-files', 'a1'];

  it('should list kb and files', async () => {
    const { query } = mockTrpcClient.agent.getKnowledgeBasesAndFiles;
    const serverItems = [{ enabled: true, id: 'f1', name: 'file.txt', type: 'file' }];
    query.mockResolvedValue(serverItems);

    await createProgram().parseAsync(argv);

    // The positional agent id must be forwarded as `agentId`.
    expect(query).toHaveBeenCalledWith({ agentId: 'a1' });
  });

  it('should show empty message', async () => {
    mockTrpcClient.agent.getKnowledgeBasesAndFiles.query.mockResolvedValue([]);

    await createProgram().parseAsync(argv);

    // An empty result prints a hint instead of an empty table.
    expect(consoleSpy).toHaveBeenCalledWith('No knowledge bases or files found.');
  });
});
describe('add-file', () => {
  it('should add files to agent', async () => {
    const { mutate } = mockTrpcClient.agent.createAgentFiles;
    mutate.mockResolvedValue({});

    const argv = ['node', 'test', 'agent', 'add-file', 'a1', '--file-ids', 'f1,f2'];
    await createProgram().parseAsync(argv);

    // The comma-separated option value must arrive as an array of ids.
    const expectedInput = expect.objectContaining({ agentId: 'a1', fileIds: ['f1', 'f2'] });
    expect(mutate).toHaveBeenCalledWith(expectedInput);
  });
});
describe('remove-file', () => {
  it('should remove a file from agent', async () => {
    const { mutate } = mockTrpcClient.agent.deleteAgentFile;
    mutate.mockResolvedValue({});

    const argv = ['node', 'test', 'agent', 'remove-file', 'a1', '--file-id', 'f1'];
    await createProgram().parseAsync(argv);

    // Exact match: only the agent id and the file id are sent.
    expect(mutate).toHaveBeenCalledWith({ agentId: 'a1', fileId: 'f1' });
  });
});
describe('toggle-file', () => {
  it('should toggle file with enable', async () => {
    const { mutate } = mockTrpcClient.agent.toggleFile;
    mutate.mockResolvedValue({});

    const argv = ['node', 'test', 'agent', 'toggle-file', 'a1', '--file-id', 'f1', '--enable'];
    await createProgram().parseAsync(argv);

    // `--enable` must translate to `enabled: true` in the mutation input.
    expect(mutate).toHaveBeenCalledWith({ agentId: 'a1', enabled: true, fileId: 'f1' });
  });
});
describe('add-kb', () => {
  it('should add kb to agent', async () => {
    const { mutate } = mockTrpcClient.agent.createAgentKnowledgeBase;
    mutate.mockResolvedValue({});

    const argv = ['node', 'test', 'agent', 'add-kb', 'a1', '--kb-id', 'kb1'];
    await createProgram().parseAsync(argv);

    // `--kb-id` is forwarded under the `knowledgeBaseId` key.
    const expectedInput = expect.objectContaining({ agentId: 'a1', knowledgeBaseId: 'kb1' });
    expect(mutate).toHaveBeenCalledWith(expectedInput);
  });
});
describe('remove-kb', () => {
  it('should remove kb from agent', async () => {
    const { mutate } = mockTrpcClient.agent.deleteAgentKnowledgeBase;
    mutate.mockResolvedValue({});

    const argv = ['node', 'test', 'agent', 'remove-kb', 'a1', '--kb-id', 'kb1'];
    await createProgram().parseAsync(argv);

    // Exact match: only the agent id and the knowledge base id are sent.
    expect(mutate).toHaveBeenCalledWith({ agentId: 'a1', knowledgeBaseId: 'kb1' });
  });
});
describe('toggle-kb', () => {
  it('should toggle kb with disable', async () => {
    const { mutate } = mockTrpcClient.agent.toggleKnowledgeBase;
    mutate.mockResolvedValue({});

    const argv = ['node', 'test', 'agent', 'toggle-kb', 'a1', '--kb-id', 'kb1', '--disable'];
    await createProgram().parseAsync(argv);

    // `--disable` must translate to `enabled: false` in the mutation input.
    expect(mutate).toHaveBeenCalledWith({
      agentId: 'a1',
      enabled: false,
      knowledgeBaseId: 'kb1',
    });
  });
});
describe('status', () => {
it('should display operation status', async () => {
mockTrpcClient.aiAgent.getOperationStatus.query.mockResolvedValue({
+198
View File
@@ -316,6 +316,204 @@ export function registerAgentCommand(program: Command) {
},
);
// ── pin / unpin ─────────────────────────────────────
agent
  .command('pin <agentId>')
  .description('Pin an agent')
  .action(async (agentId: string) => {
    // Set the `pinned` flag to true through the agent.updateAgentPinned mutation.
    const trpc = await getTrpcClient();
    const payload = { id: agentId, pinned: true };
    await trpc.agent.updateAgentPinned.mutate(payload);

    const checkmark = pc.green('✓');
    console.log(`${checkmark} Pinned agent ${pc.bold(agentId)}`);
  });
agent
  .command('unpin <agentId>')
  .description('Unpin an agent')
  .action(async (agentId: string) => {
    // Set the `pinned` flag to false through the agent.updateAgentPinned mutation.
    const trpc = await getTrpcClient();
    const payload = { id: agentId, pinned: false };
    await trpc.agent.updateAgentPinned.mutate(payload);

    const checkmark = pc.green('✓');
    console.log(`${checkmark} Unpinned agent ${pc.bold(agentId)}`);
  });
// ── kb-files ───────────────────────────────────────
agent
  .command('kb-files [agentId]')
  .description('List knowledge bases and files associated with an agent')
  .option('-s, --slug <slug>', 'Agent slug')
  .option('--json [fields]', 'Output JSON, optionally specify fields (comma-separated)')
  .action(
    async (
      agentIdArg: string | undefined,
      options: { json?: string | boolean; slug?: string },
    ) => {
      // The agent may be addressed by positional id or by --slug;
      // resolveAgentId turns either into a concrete id.
      const trpc = await getTrpcClient();
      const agentId = await resolveAgentId(trpc, { agentId: agentIdArg, slug: options.slug });
      const items = await trpc.agent.getKnowledgeBasesAndFiles.query({ agentId });

      // --json (bare, or with a comma-separated field list) bypasses the table.
      const { json } = options;
      if (json !== undefined) {
        outputJson(items, typeof json === 'string' ? json : undefined);
        return;
      }

      // A non-array response is handled the same way as an empty list.
      const entries = Array.isArray(items) ? items : [];
      if (entries.length === 0) {
        console.log('No knowledge bases or files found.');
        return;
      }

      const header = ['ID', 'NAME', 'TYPE', 'STATUS'];
      const rows = entries.map(({ id, name, type, enabled }: any) => [
        id || '',
        truncate(name || '', 40),
        type || '',
        enabled ? 'enabled' : 'disabled',
      ]);
      printTable(rows, header);
    },
  );
// ── add-file ───────────────────────────────────────
agent
  .command('add-file [agentId]')
  .description('Associate files with an agent')
  .option('-s, --slug <slug>', 'Agent slug')
  .requiredOption('--file-ids <ids>', 'Comma-separated file IDs')
  .option('--enabled', 'Enable files immediately')
  .action(
    async (
      agentIdArg: string | undefined,
      options: { enabled?: boolean; fileIds: string; slug?: string },
    ) => {
      // Parse the comma-separated list, dropping blank entries so that input
      // such as "f1,,f2" or "f1," never sends an empty id to the server.
      const fileIds = options.fileIds
        .split(',')
        .map((id) => id.trim())
        .filter((id) => id.length > 0);

      // Fail before any network call when nothing usable was supplied.
      if (fileIds.length === 0) {
        console.error('No valid file IDs provided to --file-ids.');
        process.exit(1);
        // Explicit return: process.exit may be stubbed (e.g. in tests).
        return;
      }

      const client = await getTrpcClient();
      const agentId = await resolveAgentId(client, { agentId: agentIdArg, slug: options.slug });

      // `enabled` is only included when the flag was passed, leaving the
      // default to the server otherwise.
      const input: Record<string, any> = { agentId, fileIds };
      if (options.enabled !== undefined) input.enabled = options.enabled;

      await client.agent.createAgentFiles.mutate(input as any);
      console.log(
        `${pc.green('✓')} Added ${fileIds.length} file(s) to agent ${pc.bold(agentId)}`,
      );
    },
  );
// ── remove-file ────────────────────────────────────
agent
  .command('remove-file [agentId]')
  .description('Remove a file from an agent')
  .option('-s, --slug <slug>', 'Agent slug')
  .requiredOption('--file-id <id>', 'File ID to remove')
  .action(async (agentIdArg: string | undefined, options: { fileId: string; slug?: string }) => {
    const { fileId, slug } = options;

    // Resolve the target agent (positional id or --slug), then remove the file from it.
    const trpc = await getTrpcClient();
    const agentId = await resolveAgentId(trpc, { agentId: agentIdArg, slug });
    await trpc.agent.deleteAgentFile.mutate({ agentId, fileId });

    const checkmark = pc.green('✓');
    console.log(`${checkmark} Removed file ${pc.bold(fileId)} from agent ${pc.bold(agentId)}`);
  });
// ── toggle-file ────────────────────────────────────
agent
  .command('toggle-file [agentId]')
  .description('Toggle a file on/off for an agent')
  .option('-s, --slug <slug>', 'Agent slug')
  .requiredOption('--file-id <id>', 'File ID')
  .option('--enable', 'Enable the file')
  .option('--disable', 'Disable the file')
  .action(
    async (
      agentIdArg: string | undefined,
      options: { disable?: boolean; enable?: boolean; fileId: string; slug?: string },
    ) => {
      // --enable and --disable contradict each other. Refuse the combination
      // instead of silently letting --enable win.
      if (options.enable && options.disable) {
        console.error('Options --enable and --disable cannot be used together.');
        process.exit(1);
        // Explicit return: process.exit may be stubbed (e.g. in tests).
        return;
      }

      // With neither flag, `enabled` is sent as undefined.
      // NOTE(review): how the server treats an undefined `enabled` is not
      // visible from this file — confirm it really toggles.
      const enabled = options.enable ? true : options.disable ? false : undefined;

      const client = await getTrpcClient();
      const agentId = await resolveAgentId(client, { agentId: agentIdArg, slug: options.slug });
      await client.agent.toggleFile.mutate({ agentId, enabled, fileId: options.fileId });
      console.log(
        `${pc.green('✓')} Toggled file ${pc.bold(options.fileId)} for agent ${pc.bold(agentId)}`,
      );
    },
  );
// ── add-kb ─────────────────────────────────────────
agent
  .command('add-kb [agentId]')
  .description('Associate a knowledge base with an agent')
  .option('-s, --slug <slug>', 'Agent slug')
  .requiredOption('--kb-id <id>', 'Knowledge base ID')
  .option('--enabled', 'Enable immediately')
  .action(
    async (
      agentIdArg: string | undefined,
      options: { enabled?: boolean; kbId: string; slug?: string },
    ) => {
      const { enabled, kbId, slug } = options;

      const trpc = await getTrpcClient();
      const agentId = await resolveAgentId(trpc, { agentId: agentIdArg, slug });

      // `enabled` is only part of the payload when the flag was actually passed.
      const payload: Record<string, any> = {
        agentId,
        knowledgeBaseId: kbId,
        ...(enabled === undefined ? {} : { enabled }),
      };
      await trpc.agent.createAgentKnowledgeBase.mutate(payload as any);

      const checkmark = pc.green('✓');
      console.log(
        `${checkmark} Added knowledge base ${pc.bold(kbId)} to agent ${pc.bold(agentId)}`,
      );
    },
  );
// ── remove-kb ──────────────────────────────────────
agent
  .command('remove-kb [agentId]')
  .description('Remove a knowledge base from an agent')
  .option('-s, --slug <slug>', 'Agent slug')
  .requiredOption('--kb-id <id>', 'Knowledge base ID')
  .action(async (agentIdArg: string | undefined, options: { kbId: string; slug?: string }) => {
    const { kbId, slug } = options;

    // Resolve the target agent (positional id or --slug), then remove the knowledge base from it.
    const trpc = await getTrpcClient();
    const agentId = await resolveAgentId(trpc, { agentId: agentIdArg, slug });
    await trpc.agent.deleteAgentKnowledgeBase.mutate({ agentId, knowledgeBaseId: kbId });

    const checkmark = pc.green('✓');
    console.log(
      `${checkmark} Removed knowledge base ${pc.bold(kbId)} from agent ${pc.bold(agentId)}`,
    );
  });
// ── toggle-kb ──────────────────────────────────────
agent
  .command('toggle-kb [agentId]')
  .description('Toggle a knowledge base on/off for an agent')
  .option('-s, --slug <slug>', 'Agent slug')
  .requiredOption('--kb-id <id>', 'Knowledge base ID')
  .option('--enable', 'Enable the knowledge base')
  .option('--disable', 'Disable the knowledge base')
  .action(
    async (
      agentIdArg: string | undefined,
      options: { disable?: boolean; enable?: boolean; kbId: string; slug?: string },
    ) => {
      // --enable and --disable contradict each other. Refuse the combination
      // instead of silently letting --enable win.
      if (options.enable && options.disable) {
        console.error('Options --enable and --disable cannot be used together.');
        process.exit(1);
        // Explicit return: process.exit may be stubbed (e.g. in tests).
        return;
      }

      // With neither flag, `enabled` is sent as undefined.
      // NOTE(review): how the server treats an undefined `enabled` is not
      // visible from this file — confirm it really toggles.
      const enabled = options.enable ? true : options.disable ? false : undefined;

      const client = await getTrpcClient();
      const agentId = await resolveAgentId(client, { agentId: agentIdArg, slug: options.slug });
      await client.agent.toggleKnowledgeBase.mutate({
        agentId,
        enabled,
        knowledgeBaseId: options.kbId,
      });
      console.log(
        `${pc.green('✓')} Toggled knowledge base ${pc.bold(options.kbId)} for agent ${pc.bold(agentId)}`,
      );
    },
  );
// ── status ──────────────────────────────────────────
agent
+501 -186
View File
@@ -3,6 +3,32 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
const { mockTrpcClient } = vi.hoisted(() => ({
mockTrpcClient: {
agentEval: {
abortRun: { mutate: vi.fn() },
createBenchmark: { mutate: vi.fn() },
createDataset: { mutate: vi.fn() },
createRun: { mutate: vi.fn() },
createTestCase: { mutate: vi.fn() },
deleteBenchmark: { mutate: vi.fn() },
deleteDataset: { mutate: vi.fn() },
deleteRun: { mutate: vi.fn() },
deleteTestCase: { mutate: vi.fn() },
getBenchmark: { query: vi.fn() },
getDataset: { query: vi.fn() },
getRunDetails: { query: vi.fn() },
getRunProgress: { query: vi.fn() },
getRunResults: { query: vi.fn() },
getTestCase: { query: vi.fn() },
listBenchmarks: { query: vi.fn() },
listDatasets: { query: vi.fn() },
listRuns: { query: vi.fn() },
listTestCases: { query: vi.fn() },
retryRunErrors: { mutate: vi.fn() },
startRun: { mutate: vi.fn() },
updateBenchmark: { mutate: vi.fn() },
updateDataset: { mutate: vi.fn() },
updateTestCase: { mutate: vi.fn() },
},
agentEvalExternal: {
datasetGet: { query: vi.fn() },
messagesList: { query: vi.fn() },
@@ -48,9 +74,11 @@ describe('eval command', () => {
exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
logSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
for (const method of Object.values(mockTrpcClient.agentEvalExternal)) {
for (const fn of Object.values(method)) {
(fn as ReturnType<typeof vi.fn>).mockReset();
for (const ns of Object.values(mockTrpcClient)) {
for (const method of Object.values(ns as Record<string, any>)) {
for (const fn of Object.values(method as Record<string, any>)) {
(fn as ReturnType<typeof vi.fn>).mockReset();
}
}
}
});
@@ -68,218 +96,505 @@ describe('eval command', () => {
return program;
};
it('should call runGet and output json envelope', async () => {
mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
config: { k: 1 },
datasetId: 'dataset-1',
id: 'run-1',
// ============================================
// Benchmark tests
// ============================================
describe('benchmark', () => {
it('should list benchmarks', async () => {
mockTrpcClient.agentEval.listBenchmarks.query.mockResolvedValue([
{ id: 'b1', name: 'Bench 1' },
]);
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'benchmark', 'list', '--json']);
expect(mockTrpcClient.agentEval.listBenchmarks.query).toHaveBeenCalled();
});
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--run-id', 'run-1', '--json']);
it('should create a benchmark', async () => {
mockTrpcClient.agentEval.createBenchmark.mutate.mockResolvedValue({ id: 'b1' });
expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({ runId: 'run-1' });
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'benchmark',
'create',
'--identifier',
'test-bench',
'-n',
'Test Bench',
'--json',
]);
const payload = JSON.parse(logSpy.mock.calls[0][0]);
expect(payload).toEqual({
data: {
expect(mockTrpcClient.agentEval.createBenchmark.mutate).toHaveBeenCalledWith(
expect.objectContaining({ identifier: 'test-bench', name: 'Test Bench' }),
);
});
it('should delete a benchmark', async () => {
mockTrpcClient.agentEval.deleteBenchmark.mutate.mockResolvedValue({ success: true });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'benchmark', 'delete', '--id', 'b1']);
expect(mockTrpcClient.agentEval.deleteBenchmark.mutate).toHaveBeenCalledWith({ id: 'b1' });
});
});
// ============================================
// Dataset tests
// ============================================
describe('dataset', () => {
it('should list datasets', async () => {
mockTrpcClient.agentEval.listDatasets.query.mockResolvedValue([{ id: 'd1', name: 'DS 1' }]);
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'dataset', 'list', '--json']);
expect(mockTrpcClient.agentEval.listDatasets.query).toHaveBeenCalled();
});
it('should get dataset via internal API', async () => {
mockTrpcClient.agentEval.getDataset.query.mockResolvedValue({ id: 'd1' });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'dataset', 'get', '--id', 'd1', '--json']);
expect(mockTrpcClient.agentEval.getDataset.query).toHaveBeenCalledWith({ id: 'd1' });
});
it('should get dataset via external API with --external', async () => {
mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
id: 'dataset-1',
metadata: { preset: 'deepsearchqa' },
});
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'dataset',
'get',
'--id',
'dataset-1',
'--external',
'--json',
]);
expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
datasetId: 'dataset-1',
});
});
it('should create a dataset', async () => {
mockTrpcClient.agentEval.createDataset.mutate.mockResolvedValue({ id: 'd1' });
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'dataset',
'create',
'--benchmark-id',
'b1',
'--identifier',
'ds1',
'-n',
'Dataset 1',
'--json',
]);
expect(mockTrpcClient.agentEval.createDataset.mutate).toHaveBeenCalledWith(
expect.objectContaining({ benchmarkId: 'b1', identifier: 'ds1', name: 'Dataset 1' }),
);
});
});
// ============================================
// TestCase tests
// ============================================
describe('testcase', () => {
it('should list test cases', async () => {
mockTrpcClient.agentEval.listTestCases.query.mockResolvedValue({ data: [], total: 0 });
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'testcase',
'list',
'--dataset-id',
'd1',
'--json',
]);
expect(mockTrpcClient.agentEval.listTestCases.query).toHaveBeenCalledWith(
expect.objectContaining({ datasetId: 'd1' }),
);
});
it('should create a test case', async () => {
mockTrpcClient.agentEval.createTestCase.mutate.mockResolvedValue({ id: 'tc1' });
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'testcase',
'create',
'--dataset-id',
'd1',
'--input',
'What is 2+2?',
'--expected',
'4',
]);
expect(mockTrpcClient.agentEval.createTestCase.mutate).toHaveBeenCalledWith(
expect.objectContaining({
content: expect.objectContaining({ expected: '4', input: 'What is 2+2?' }),
datasetId: 'd1',
}),
);
});
it('should delete a test case', async () => {
mockTrpcClient.agentEval.deleteTestCase.mutate.mockResolvedValue({ success: true });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'testcase', 'delete', '--id', 'tc1']);
expect(mockTrpcClient.agentEval.deleteTestCase.mutate).toHaveBeenCalledWith({ id: 'tc1' });
});
it('should count test cases via external API', async () => {
mockTrpcClient.agentEvalExternal.testCasesCount.query.mockResolvedValue({ count: 12 });
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'testcase',
'count',
'--dataset-id',
'dataset-1',
'--json',
]);
expect(mockTrpcClient.agentEvalExternal.testCasesCount.query).toHaveBeenCalledWith({
datasetId: 'dataset-1',
});
});
});
// ============================================
// Run tests
// ============================================
describe('run', () => {
it('should list runs', async () => {
mockTrpcClient.agentEval.listRuns.query.mockResolvedValue({ data: [], total: 0 });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'list', '--json']);
expect(mockTrpcClient.agentEval.listRuns.query).toHaveBeenCalled();
});
it('should get run via internal API', async () => {
mockTrpcClient.agentEval.getRunDetails.query.mockResolvedValue({ id: 'r1' });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--id', 'r1', '--json']);
expect(mockTrpcClient.agentEval.getRunDetails.query).toHaveBeenCalledWith({ id: 'r1' });
});
it('should get run via external API with --external', async () => {
mockTrpcClient.agentEvalExternal.runGet.query.mockResolvedValue({
config: { k: 1 },
datasetId: 'dataset-1',
id: 'run-1',
},
error: null,
ok: true,
version: 'v1',
});
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run',
'get',
'--id',
'run-1',
'--external',
'--json',
]);
expect(mockTrpcClient.agentEvalExternal.runGet.query).toHaveBeenCalledWith({
runId: 'run-1',
});
const payload = JSON.parse(logSpy.mock.calls[0][0]);
expect(payload).toEqual({
data: { config: { k: 1 }, datasetId: 'dataset-1', id: 'run-1' },
error: null,
ok: true,
version: 'v1',
});
});
it('should create a run', async () => {
mockTrpcClient.agentEval.createRun.mutate.mockResolvedValue({ id: 'r1' });
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run',
'create',
'--dataset-id',
'd1',
'-n',
'Run 1',
'--json',
]);
expect(mockTrpcClient.agentEval.createRun.mutate).toHaveBeenCalledWith(
expect.objectContaining({ datasetId: 'd1', name: 'Run 1' }),
);
});
it('should start a run', async () => {
mockTrpcClient.agentEval.startRun.mutate.mockResolvedValue({ success: true, runId: 'r1' });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'start', '--id', 'r1']);
expect(mockTrpcClient.agentEval.startRun.mutate).toHaveBeenCalledWith(
expect.objectContaining({ id: 'r1' }),
);
});
it('should abort a run', async () => {
mockTrpcClient.agentEval.abortRun.mutate.mockResolvedValue({ success: true });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'abort', '--id', 'r1']);
expect(mockTrpcClient.agentEval.abortRun.mutate).toHaveBeenCalledWith({ id: 'r1' });
});
it('should get run progress', async () => {
mockTrpcClient.agentEval.getRunProgress.query.mockResolvedValue({ status: 'running' });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'progress', '--id', 'r1', '--json']);
expect(mockTrpcClient.agentEval.getRunProgress.query).toHaveBeenCalledWith({ id: 'r1' });
});
it('should get run results', async () => {
mockTrpcClient.agentEval.getRunResults.query.mockResolvedValue({
results: [],
runId: 'r1',
total: 0,
});
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'results', '--id', 'r1', '--json']);
expect(mockTrpcClient.agentEval.getRunResults.query).toHaveBeenCalledWith({ id: 'r1' });
});
it('should delete a run', async () => {
mockTrpcClient.agentEval.deleteRun.mutate.mockResolvedValue({ success: true });
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'delete', '--id', 'r1']);
expect(mockTrpcClient.agentEval.deleteRun.mutate).toHaveBeenCalledWith({ id: 'r1' });
});
it('should set run status via external API', async () => {
mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
runId: 'run-1',
status: 'completed',
success: true,
});
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run',
'set-status',
'--id',
'run-1',
'--status',
'completed',
]);
expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
runId: 'run-1',
status: 'completed',
});
expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
});
});
it('should call datasetGet and output json envelope', async () => {
mockTrpcClient.agentEvalExternal.datasetGet.query.mockResolvedValue({
id: 'dataset-1',
metadata: { preset: 'deepsearchqa' },
// ============================================
// Run-Topic tests (external eval API)
// ============================================
describe('run-topic', () => {
it('should list run topics', async () => {
mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run-topic',
'list',
'--run-id',
'run-1',
'--only-external',
'--json',
]);
expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
onlyExternal: true,
runId: 'run-1',
});
});
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'dataset',
'get',
'--dataset-id',
'dataset-1',
'--json',
]);
it('should report run-topic result', async () => {
mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
success: true,
});
expect(mockTrpcClient.agentEvalExternal.datasetGet.query).toHaveBeenCalledWith({
datasetId: 'dataset-1',
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run-topic',
'report-result',
'--run-id',
'run-1',
'--topic-id',
'topic-1',
'--thread-id',
'thread-1',
'--score',
'0.91',
'--correct',
'true',
'--result-json',
'{"grade":"A"}',
'--json',
]);
expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
correct: true,
result: { grade: 'A' },
runId: 'run-1',
score: 0.91,
threadId: 'thread-1',
topicId: 'topic-1',
});
});
});
it('should pass onlyExternal to runTopicsList', async () => {
mockTrpcClient.agentEvalExternal.runTopicsList.query.mockResolvedValue([]);
// ============================================
// Eval thread/message tests (external eval API)
// ============================================
describe('eval thread', () => {
it('should list threads by topic', async () => {
mockTrpcClient.agentEvalExternal.threadsList.query.mockResolvedValue([]);
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run-topics',
'list',
'--run-id',
'run-1',
'--only-external',
'--json',
]);
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'thread',
'list',
'--topic-id',
'topic-1',
'--json',
]);
expect(mockTrpcClient.agentEvalExternal.runTopicsList.query).toHaveBeenCalledWith({
onlyExternal: true,
runId: 'run-1',
expect(mockTrpcClient.agentEvalExternal.threadsList.query).toHaveBeenCalledWith({
topicId: 'topic-1',
});
});
});
it('should pass topicId and threadId to messagesList', async () => {
mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);
describe('eval message', () => {
it('should list messages by topic and thread', async () => {
mockTrpcClient.agentEvalExternal.messagesList.query.mockResolvedValue([]);
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'messages',
'list',
'--topic-id',
'topic-1',
'--thread-id',
'thread-1',
'--json',
]);
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'message',
'list',
'--topic-id',
'topic-1',
'--thread-id',
'thread-1',
'--json',
]);
expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
threadId: 'thread-1',
topicId: 'topic-1',
expect(mockTrpcClient.agentEvalExternal.messagesList.query).toHaveBeenCalledWith({
threadId: 'thread-1',
topicId: 'topic-1',
});
});
});
it('should parse and report run-topic result', async () => {
mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate.mockResolvedValue({
success: true,
// ============================================
// Error handling
// ============================================
describe('error handling', () => {
it('should output json error envelope when command fails', async () => {
const error = Object.assign(new Error('Run not found'), {
data: { code: 'NOT_FOUND' },
});
mockTrpcClient.agentEval.getRunDetails.query.mockRejectedValue(error);
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'run', 'get', '--id', 'run-404', '--json']);
const payload = JSON.parse(logSpy.mock.calls[0][0]);
expect(payload).toEqual({
data: null,
error: { code: 'NOT_FOUND', message: 'Run not found' },
ok: false,
version: 'v1',
});
expect(exitSpy).toHaveBeenCalledWith(1);
});
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run-topic',
'report-result',
'--run-id',
'run-1',
'--topic-id',
'topic-1',
'--thread-id',
'thread-1',
'--score',
'0.91',
'--correct',
'true',
'--result-json',
'{"grade":"A"}',
'--json',
]);
it('should log plain error without --json', async () => {
mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(new Error('boom'));
expect(mockTrpcClient.agentEvalExternal.runTopicReportResult.mutate).toHaveBeenCalledWith({
correct: true,
result: { grade: 'A' },
runId: 'run-1',
score: 0.91,
threadId: 'thread-1',
topicId: 'topic-1',
const program = createProgram();
await program.parseAsync(['node', 'test', 'eval', 'thread', 'list', '--topic-id', 'topic-1']);
expect(log.error).toHaveBeenCalledWith('boom');
expect(exitSpy).toHaveBeenCalledWith(1);
});
});
it('should update run status', async () => {
mockTrpcClient.agentEvalExternal.runSetStatus.mutate.mockResolvedValue({
runId: 'run-1',
status: 'completed',
success: true,
});
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run',
'set-status',
'--run-id',
'run-1',
'--status',
'completed',
]);
expect(mockTrpcClient.agentEvalExternal.runSetStatus.mutate).toHaveBeenCalledWith({
runId: 'run-1',
status: 'completed',
});
expect(logSpy).toHaveBeenCalledWith(expect.stringContaining('status updated to'));
});
it('should output json error envelope when command fails', async () => {
const error = Object.assign(new Error('Run not found'), {
data: { code: 'NOT_FOUND' },
});
mockTrpcClient.agentEvalExternal.runGet.query.mockRejectedValue(error);
const program = createProgram();
await program.parseAsync([
'node',
'test',
'eval',
'run',
'get',
'--run-id',
'run-404',
'--json',
]);
const payload = JSON.parse(logSpy.mock.calls[0][0]);
expect(payload).toEqual({
data: null,
error: { code: 'NOT_FOUND', message: 'Run not found' },
ok: false,
version: 'v1',
});
expect(exitSpy).toHaveBeenCalledWith(1);
});
// `eval test-cases count` should pass the dataset id through to the external
// eval API's testCasesCount query.
it('should query test case count', async () => {
  const countQuery = mockTrpcClient.agentEvalExternal.testCasesCount.query;
  countQuery.mockResolvedValue({ count: 12 });

  const argv = [
    'node',
    'test',
    'eval',
    'test-cases',
    'count',
    '--dataset-id',
    'dataset-1',
    '--json',
  ];
  const cli = createProgram();
  await cli.parseAsync(argv);

  expect(countQuery).toHaveBeenCalledWith({ datasetId: 'dataset-1' });
});
// Without `--json`, a rejected tRPC call should be reported through
// `log.error` with the raw message and the process should exit with 1.
it('should log plain error without --json', async () => {
  const failure = new Error('boom');
  mockTrpcClient.agentEvalExternal.threadsList.query.mockRejectedValue(failure);

  const argv = ['node', 'test', 'eval', 'threads', 'list', '--topic-id', 'topic-1'];
  const cli = createProgram();
  await cli.parseAsync(argv);

  expect(log.error).toHaveBeenCalledWith('boom');
  expect(exitSpy).toHaveBeenCalledWith(1);
});
});
+614 -132
View File
@@ -23,46 +23,6 @@ interface JsonOption {
json?: boolean;
}
/** Options for `eval run get`: the run to look up, plus the shared `--json` flag. */
interface RunGetOptions extends JsonOption {
runId: string;
}
/** Options for `eval run set-status`; only the two listed statuses are accepted. */
interface RunSetStatusOptions extends JsonOption {
runId: string;
status: 'completed' | 'external';
}
/** Options for `eval dataset get`. */
interface DatasetGetOptions extends JsonOption {
datasetId: string;
}
/** Options for listing the topics of a run; `onlyExternal` comes from `--only-external`. */
interface RunTopicsListOptions extends JsonOption {
onlyExternal?: boolean;
runId: string;
}
/** Options for listing evaluation threads by topic. */
interface ThreadsListOptions extends JsonOption {
topicId: string;
}
/** Options for listing evaluation messages by topic, optionally narrowed to one thread. */
interface MessagesListOptions extends JsonOption {
threadId?: string;
topicId: string;
}
/** Options for counting the test cases of a dataset. */
interface TestCasesCountOptions extends JsonOption {
datasetId: string;
}
/**
 * Options for `eval run-topic report-result`. `correct` and `resultJson` are
 * already parsed by the option parsers registered on the command
 * (`parseBoolean`, `parseResultJson`), so they arrive as typed values rather
 * than raw strings.
 */
interface RunTopicReportResultOptions extends JsonOption {
correct: boolean;
resultJson: Record<string, unknown>;
runId: string;
score: number;
threadId?: string;
topicId: string;
}
/** Writes `data` to stdout as pretty-printed JSON (2-space indentation). */
const printJson = (data: unknown) => {
  const serialized = JSON.stringify(data, null, 2);
  console.log(serialized);
};
@@ -180,65 +140,587 @@ const executeCommand = async (
};
export function registerEvalCommand(program: Command) {
const evalCmd = program.command('eval').description('Manage external evaluation workflows');
const evalCmd = program.command('eval').description('Manage evaluation workflows');
// ============================================
// Benchmark Operations
// ============================================
const benchmarkCmd = evalCmd.command('benchmark').description('Manage evaluation benchmarks');
// `eval benchmark list`: lists benchmarks through the internal eval API.
//
// System benchmarks are included by default. Previously `--include-system`
// was the only switch and the value was read as `includeSystem ?? true`, so
// the option could never produce `false` and had no effect. Registering the
// negatable `--no-include-system` alongside it keeps the default (neither flag
// given -> undefined -> true) while giving callers a way to exclude them.
benchmarkCmd
  .command('list')
  .description('List benchmarks')
  .option('--include-system', 'Include system benchmarks')
  .option('--no-include-system', 'Exclude system benchmarks')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { includeSystem?: boolean }) =>
    executeCommand(options, async () => {
      const client = await getTrpcClient();
      return client.agentEval.listBenchmarks.query({
        // undefined means neither flag was passed; keep the include-by-default behavior.
        includeSystem: options.includeSystem ?? true,
      });
    }),
  );
// `eval benchmark get`: fetches a single benchmark by id via the internal eval API.
benchmarkCmd
  .command('get')
  .description('Get benchmark details')
  .requiredOption('--id <id>', 'Benchmark ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const fetchBenchmark = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.getBenchmark.query({ id: options.id });
    };
    return executeCommand(options, fetchBenchmark);
  });
// `eval benchmark create`: creates a benchmark. Optional fields are only put
// on the payload when a non-empty value was supplied.
benchmarkCmd
  .command('create')
  .description('Create a benchmark')
  .requiredOption('--identifier <identifier>', 'Unique identifier')
  .requiredOption('-n, --name <name>', 'Benchmark name')
  .option('-d, --description <desc>', 'Description')
  .option('--reference-url <url>', 'Reference URL')
  .option('--json', 'Output JSON envelope')
  .action(
    async (
      options: JsonOption & {
        description?: string;
        identifier: string;
        name: string;
        referenceUrl?: string;
      },
    ) => {
      const { description, identifier, name, referenceUrl } = options;
      const createBenchmark = async () => {
        const trpc = await getTrpcClient();
        const payload: Record<string, any> = {
          identifier,
          name,
          ...(description ? { description } : {}),
          ...(referenceUrl ? { referenceUrl } : {}),
        };
        return trpc.agentEval.createBenchmark.mutate(payload as any);
      };
      return executeCommand(options, createBenchmark, `Created benchmark ${pc.bold(name)}`);
    },
  );
// `eval benchmark update`: patches a benchmark. Only fields given a non-empty
// value on the command line are sent; the id is always included.
benchmarkCmd
  .command('update')
  .description('Update a benchmark')
  .requiredOption('--id <id>', 'Benchmark ID')
  .option('-n, --name <name>', 'New name')
  .option('-d, --description <desc>', 'New description')
  .option('--reference-url <url>', 'New reference URL')
  .option('--json', 'Output JSON envelope')
  .action(
    async (
      options: JsonOption & {
        description?: string;
        id: string;
        name?: string;
        referenceUrl?: string;
      },
    ) => {
      const { description, id, name, referenceUrl } = options;
      const updateBenchmark = async () => {
        const trpc = await getTrpcClient();
        const patch: Record<string, any> = {
          id,
          ...(name ? { name } : {}),
          ...(description ? { description } : {}),
          ...(referenceUrl ? { referenceUrl } : {}),
        };
        return trpc.agentEval.updateBenchmark.mutate(patch as any);
      };
      return executeCommand(options, updateBenchmark, `Updated benchmark ${pc.bold(id)}`);
    },
  );
// `eval benchmark delete`: removes a benchmark by id via the internal eval API.
benchmarkCmd
  .command('delete')
  .description('Delete a benchmark')
  .requiredOption('--id <id>', 'Benchmark ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const { id } = options;
    const removeBenchmark = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.deleteBenchmark.mutate({ id });
    };
    return executeCommand(options, removeBenchmark, `Deleted benchmark ${pc.bold(id)}`);
  });
// ============================================
// Dataset Operations
// ============================================
const datasetCmd = evalCmd.command('dataset').description('Manage evaluation datasets');
// `eval dataset list`: lists datasets, optionally filtered by benchmark. When
// no benchmark id is given the query is called with `undefined` (no filter).
datasetCmd
  .command('list')
  .description('List datasets')
  .option('--benchmark-id <id>', 'Filter by benchmark ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { benchmarkId?: string }) => {
    const listDatasets = async () => {
      const trpc = await getTrpcClient();
      const filter = options.benchmarkId ? { benchmarkId: options.benchmarkId } : undefined;
      return trpc.agentEval.listDatasets.query(filter);
    };
    return executeCommand(options, listDatasets);
  });
// `eval dataset get`: fetches a dataset by id. `--external` routes the lookup
// to the external eval router (which names the key `datasetId`); otherwise the
// internal router is used.
datasetCmd
  .command('get')
  .description('Get dataset details (use --external for external eval API)')
  .requiredOption('--id <id>', 'Dataset ID')
  .option('--external', 'Use external evaluation API')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { external?: boolean; id: string }) => {
    const fetchDataset = async () => {
      const trpc = await getTrpcClient();
      return options.external
        ? trpc.agentEvalExternal.datasetGet.query({ datasetId: options.id })
        : trpc.agentEval.getDataset.query({ id: options.id });
    };
    return executeCommand(options, fetchDataset);
  });
// `eval dataset create`: creates a dataset under a benchmark. Description and
// eval mode are only sent when a non-empty value was supplied.
datasetCmd
  .command('create')
  .description('Create a dataset')
  .requiredOption('--benchmark-id <id>', 'Benchmark ID')
  .requiredOption('--identifier <identifier>', 'Unique identifier')
  .requiredOption('-n, --name <name>', 'Dataset name')
  .option('-d, --description <desc>', 'Description')
  .option('--eval-mode <mode>', 'Evaluation mode')
  .option('--json', 'Output JSON envelope')
  .action(
    async (
      options: JsonOption & {
        benchmarkId: string;
        description?: string;
        evalMode?: string;
        identifier: string;
        name: string;
      },
    ) => {
      const { benchmarkId, description, evalMode, identifier, name } = options;
      const createDataset = async () => {
        const trpc = await getTrpcClient();
        const payload: Record<string, any> = {
          benchmarkId,
          identifier,
          name,
          ...(description ? { description } : {}),
          ...(evalMode ? { evalMode } : {}),
        };
        return trpc.agentEval.createDataset.mutate(payload as any);
      };
      return executeCommand(options, createDataset, `Created dataset ${pc.bold(name)}`);
    },
  );
// `eval dataset update`: patches a dataset. Only fields given a non-empty
// value on the command line are sent; the id is always included.
datasetCmd
  .command('update')
  .description('Update a dataset')
  .requiredOption('--id <id>', 'Dataset ID')
  .option('-n, --name <name>', 'New name')
  .option('-d, --description <desc>', 'New description')
  .option('--eval-mode <mode>', 'New evaluation mode')
  .option('--json', 'Output JSON envelope')
  .action(
    async (
      options: JsonOption & {
        description?: string;
        evalMode?: string;
        id: string;
        name?: string;
      },
    ) => {
      const { description, evalMode, id, name } = options;
      const updateDataset = async () => {
        const trpc = await getTrpcClient();
        const patch: Record<string, any> = {
          id,
          ...(name ? { name } : {}),
          ...(description ? { description } : {}),
          ...(evalMode ? { evalMode } : {}),
        };
        return trpc.agentEval.updateDataset.mutate(patch as any);
      };
      return executeCommand(options, updateDataset, `Updated dataset ${pc.bold(id)}`);
    },
  );
// `eval dataset delete`: removes a dataset by id via the internal eval API.
datasetCmd
  .command('delete')
  .description('Delete a dataset')
  .requiredOption('--id <id>', 'Dataset ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const { id } = options;
    const removeDataset = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.deleteDataset.mutate({ id });
    };
    return executeCommand(options, removeDataset, `Deleted dataset ${pc.bold(id)}`);
  });
// ============================================
// TestCase Operations
// ============================================
const testcaseCmd = evalCmd.command('testcase').description('Manage evaluation test cases');
// `eval testcase list`: pages through the test cases of one dataset.
//
// --limit / --offset arrive as strings. They are validated inside the
// executeCommand callback, so a bad value (e.g. `--limit abc`) is rejected
// with a clear message instead of being parsed to NaN and sent to the API.
// Throwing inside the callback means the failure is handled the same way as
// any other rejection raised while the command runs.
testcaseCmd
  .command('list')
  .description('List test cases')
  .requiredOption('--dataset-id <id>', 'Dataset ID')
  .option('-L, --limit <n>', 'Page size', '50')
  .option('--offset <n>', 'Offset', '0')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { datasetId: string; limit?: string; offset?: string }) =>
    executeCommand(options, async () => {
      // Accept only plain non-negative integers; parseInt alone would let
      // "abc" through as NaN and "10abc" through as 10.
      const toPageNumber = (raw: string | undefined, fallback: string, flag: string): number => {
        const text = (raw || fallback).trim();
        if (!/^\d+$/.test(text)) {
          throw new Error(`Invalid value for ${flag}: "${text}" (expected a non-negative integer)`);
        }
        return Number.parseInt(text, 10);
      };

      const limit = toPageNumber(options.limit, '50', '--limit');
      const offset = toPageNumber(options.offset, '0', '--offset');

      const client = await getTrpcClient();
      return client.agentEval.listTestCases.query({
        datasetId: options.datasetId,
        limit,
        offset,
      });
    }),
  );
// `eval testcase get`: fetches a single test case by id via the internal eval API.
testcaseCmd
  .command('get')
  .description('Get test case details')
  .requiredOption('--id <id>', 'Test case ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const fetchTestCase = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.getTestCase.query({ id: options.id });
    };
    return executeCommand(options, fetchTestCase);
  });
// `eval testcase create`: creates a test case in a dataset. The input text,
// expected output and category are grouped under `content`; the sort order
// (when given) is parsed as a base-10 integer and sent next to it.
testcaseCmd
  .command('create')
  .description('Create a test case')
  .requiredOption('--dataset-id <id>', 'Dataset ID')
  .requiredOption('--input <text>', 'Input text')
  .option('--expected <text>', 'Expected output')
  .option('--category <cat>', 'Category')
  .option('--sort-order <n>', 'Sort order')
  .option('--json', 'Output JSON envelope')
  .action(
    async (
      options: JsonOption & {
        category?: string;
        datasetId: string;
        expected?: string;
        input: string;
        sortOrder?: string;
      },
    ) => {
      const { category, datasetId, expected, sortOrder } = options;
      const createTestCase = async () => {
        const trpc = await getTrpcClient();
        const content: Record<string, any> = {
          input: options.input,
          ...(expected ? { expected } : {}),
          ...(category ? { category } : {}),
        };
        const payload: Record<string, any> = {
          content,
          datasetId,
          ...(sortOrder ? { sortOrder: Number.parseInt(sortOrder, 10) } : {}),
        };
        return trpc.agentEval.createTestCase.mutate(payload as any);
      };
      return executeCommand(options, createTestCase, 'Created test case');
    },
  );
// `eval testcase update`: patches a test case. `content` is only attached when
// at least one of input / expected / category was supplied; the sort order
// (when given) is parsed as a base-10 integer.
testcaseCmd
  .command('update')
  .description('Update a test case')
  .requiredOption('--id <id>', 'Test case ID')
  .option('--input <text>', 'New input text')
  .option('--expected <text>', 'New expected output')
  .option('--category <cat>', 'New category')
  .option('--sort-order <n>', 'New sort order')
  .option('--json', 'Output JSON envelope')
  .action(
    async (
      options: JsonOption & {
        category?: string;
        expected?: string;
        id: string;
        input?: string;
        sortOrder?: string;
      },
    ) => {
      const { category, expected, id, sortOrder } = options;
      const updateTestCase = async () => {
        const trpc = await getTrpcClient();
        const content: Record<string, any> = {
          ...(options.input ? { input: options.input } : {}),
          ...(expected ? { expected } : {}),
          ...(category ? { category } : {}),
        };
        const hasContent = Object.keys(content).length > 0;
        const patch: Record<string, any> = {
          id,
          ...(hasContent ? { content } : {}),
          ...(sortOrder ? { sortOrder: Number.parseInt(sortOrder, 10) } : {}),
        };
        return trpc.agentEval.updateTestCase.mutate(patch as any);
      };
      return executeCommand(options, updateTestCase, `Updated test case ${pc.bold(id)}`);
    },
  );
// `eval testcase delete`: removes a test case by id via the internal eval API.
testcaseCmd
  .command('delete')
  .description('Delete a test case')
  .requiredOption('--id <id>', 'Test case ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const { id } = options;
    const removeTestCase = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.deleteTestCase.mutate({ id });
    };
    return executeCommand(options, removeTestCase, `Deleted test case ${pc.bold(id)}`);
  });
// `eval testcase count`: counts the test cases of a dataset. Unlike the other
// testcase subcommands this one goes through the external eval router.
testcaseCmd
  .command('count')
  .description('Count test cases by dataset (external eval API)')
  .requiredOption('--dataset-id <id>', 'Dataset ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { datasetId: string }) => {
    const countTestCases = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
    };
    return executeCommand(options, countTestCases);
  });
// ============================================
// Run Operations
// ============================================
const runCmd = evalCmd.command('run').description('Manage evaluation runs');
runCmd
.command('get')
.description('Get run information')
.requiredOption('--run-id <id>', 'Run ID')
.command('list')
.description('List evaluation runs')
.option('--benchmark-id <id>', 'Filter by benchmark ID')
.option('--dataset-id <id>', 'Filter by dataset ID')
.option('--status <status>', 'Filter by status')
.option('-L, --limit <n>', 'Page size', '50')
.option('--offset <n>', 'Offset', '0')
.option('--json', 'Output JSON envelope')
.action(async (options: RunGetOptions) =>
.action(
async (
options: JsonOption & {
benchmarkId?: string;
datasetId?: string;
limit?: string;
offset?: string;
status?: string;
},
) =>
executeCommand(options, async () => {
const client = await getTrpcClient();
const input: Record<string, any> = {};
if (options.benchmarkId) input.benchmarkId = options.benchmarkId;
if (options.datasetId) input.datasetId = options.datasetId;
if (options.status) input.status = options.status;
input.limit = Number.parseInt(options.limit || '50', 10);
input.offset = Number.parseInt(options.offset || '0', 10);
return client.agentEval.listRuns.query(input as any);
}),
);
runCmd
.command('get')
.description('Get run details (use --external for external eval API)')
.requiredOption('--id <id>', 'Run ID')
.option('--external', 'Use external evaluation API')
.option('--json', 'Output JSON envelope')
.action(async (options: JsonOption & { external?: boolean; id: string }) =>
executeCommand(options, async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.runGet.query({ runId: options.runId });
if (options.external) {
return client.agentEvalExternal.runGet.query({ runId: options.id });
}
return client.agentEval.getRunDetails.query({ id: options.id });
}),
);
// `eval run create`: creates an evaluation run for a dataset.
//
// The numeric tuning flags (--k, --max-concurrency, --max-steps, --timeout)
// arrive as strings and are grouped under `config`, which is only attached
// when at least one of them was given. Each value is validated as a plain
// integer before being sent: parseInt alone turns `--k abc` into NaN and
// silently truncates `--k 3x` to 3. The ranges quoted in the help text are
// not re-checked here.
runCmd
  .command('create')
  .description('Create an evaluation run')
  .requiredOption('--dataset-id <id>', 'Dataset ID')
  .option('--agent-id <id>', 'Target agent ID')
  .option('-n, --name <name>', 'Run name')
  .option('--k <n>', 'Number of runs per test case (1-10)')
  .option('--max-concurrency <n>', 'Max concurrency (1-10)')
  .option('--max-steps <n>', 'Max steps (1-1000)')
  .option('--timeout <ms>', 'Timeout in ms (60000-3600000)')
  .option('--json', 'Output JSON envelope')
  .action(
    async (
      options: JsonOption & {
        agentId?: string;
        datasetId: string;
        k?: string;
        maxConcurrency?: string;
        maxSteps?: string;
        name?: string;
        timeout?: string;
      },
    ) =>
      executeCommand(
        options,
        async () => {
          // Throwing here (inside the callback) lets executeCommand handle the
          // failure like any other rejection raised while the command runs.
          const toInteger = (raw: string, flag: string): number => {
            const text = raw.trim();
            if (!/^\d+$/.test(text)) {
              throw new Error(`Invalid value for ${flag}: "${raw}" (expected an integer)`);
            }
            return Number.parseInt(text, 10);
          };

          const config: Record<string, any> = {};
          if (options.k) config.k = toInteger(options.k, '--k');
          if (options.maxConcurrency) {
            config.maxConcurrency = toInteger(options.maxConcurrency, '--max-concurrency');
          }
          if (options.maxSteps) config.maxSteps = toInteger(options.maxSteps, '--max-steps');
          if (options.timeout) config.timeout = toInteger(options.timeout, '--timeout');

          const input: Record<string, any> = { datasetId: options.datasetId };
          // The CLI flag is --agent-id, but the API field is `targetAgentId`.
          if (options.agentId) input.targetAgentId = options.agentId;
          if (options.name) input.name = options.name;
          if (Object.keys(config).length > 0) input.config = config;

          const client = await getTrpcClient();
          return client.agentEval.createRun.mutate(input as any);
        },
        'Created evaluation run',
      ),
  );
// `eval run delete`: removes an evaluation run by id via the internal eval API.
runCmd
  .command('delete')
  .description('Delete an evaluation run')
  .requiredOption('--id <id>', 'Run ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const { id } = options;
    const removeRun = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.deleteRun.mutate({ id });
    };
    return executeCommand(options, removeRun, `Deleted run ${pc.bold(id)}`);
  });
// `eval run start`: starts a run; `--force` is forwarded as-is (undefined when
// the flag is absent).
runCmd
  .command('start')
  .description('Start an evaluation run')
  .requiredOption('--id <id>', 'Run ID')
  .option('--force', 'Force restart even if already running')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { force?: boolean; id: string }) => {
    const { force, id } = options;
    const startRun = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.startRun.mutate({ id, force });
    };
    return executeCommand(options, startRun, `Started run ${pc.bold(id)}`);
  });
// `eval run abort`: aborts a run by id via the internal eval API.
runCmd
  .command('abort')
  .description('Abort a running evaluation')
  .requiredOption('--id <id>', 'Run ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const { id } = options;
    const abortRun = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.abortRun.mutate({ id });
    };
    return executeCommand(options, abortRun, `Aborted run ${pc.bold(id)}`);
  });
// `eval run retry-errors`: asks the internal eval API to retry the errored
// test cases of a run.
runCmd
  .command('retry-errors')
  .description('Retry failed test cases in a run')
  .requiredOption('--id <id>', 'Run ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const { id } = options;
    const retryErrors = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.retryRunErrors.mutate({ id });
    };
    return executeCommand(options, retryErrors, `Retrying errors for run ${pc.bold(id)}`);
  });
// `eval run progress`: reads the progress of a run via the internal eval API.
runCmd
  .command('progress')
  .description('Get run progress')
  .requiredOption('--id <id>', 'Run ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const fetchProgress = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.getRunProgress.query({ id: options.id });
    };
    return executeCommand(options, fetchProgress);
  });
// `eval run results`: reads the results of a run via the internal eval API.
runCmd
  .command('results')
  .description('Get run results')
  .requiredOption('--id <id>', 'Run ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { id: string }) => {
    const fetchResults = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEval.getRunResults.query({ id: options.id });
    };
    return executeCommand(options, fetchResults);
  });
runCmd
.command('set-status')
.description('Set run status (external API supports completed or external)')
.requiredOption('--run-id <id>', 'Run ID')
.description('Set run status (external eval API, supports completed or external)')
.requiredOption('--id <id>', 'Run ID')
.requiredOption('--status <status>', 'Status (completed | external)', parseRunStatus)
.option('--json', 'Output JSON envelope')
.action(async (options: RunSetStatusOptions) =>
.action(async (options: JsonOption & { id: string; status: 'completed' | 'external' }) =>
executeCommand(
options,
async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.runSetStatus.mutate({
runId: options.runId,
runId: options.id,
status: options.status,
});
},
`Run ${pc.bold(options.runId)} status updated to ${pc.bold(options.status)}`,
`Run ${pc.bold(options.id)} status updated to ${pc.bold(options.status)}`,
),
);
evalCmd
.command('dataset')
.description('Manage evaluation datasets')
.command('get')
.description('Get dataset information')
.requiredOption('--dataset-id <id>', 'Dataset ID')
.option('--json', 'Output JSON envelope')
.action(async (options: DatasetGetOptions) =>
executeCommand(options, async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.datasetGet.query({ datasetId: options.datasetId });
}),
);
// ============================================
// Run-Topic Operations (external eval API)
// ============================================
const runTopicCmd = evalCmd.command('run-topic').description('Manage evaluation run topics');
evalCmd
.command('run-topics')
.description('Manage run topics')
runTopicCmd
.command('list')
.description('List topics in a run')
.requiredOption('--run-id <id>', 'Run ID')
.option('--only-external', 'Only return topics pending external evaluation')
.option('--json', 'Output JSON envelope')
.action(async (options: RunTopicsListOptions) =>
.action(async (options: JsonOption & { onlyExternal?: boolean; runId: string }) =>
executeCommand(options, async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.runTopicsList.query({
@@ -248,55 +730,7 @@ export function registerEvalCommand(program: Command) {
}),
);
evalCmd
.command('threads')
.description('Manage evaluation threads')
.command('list')
.description('List threads by topic')
.requiredOption('--topic-id <id>', 'Topic ID')
.option('--json', 'Output JSON envelope')
.action(async (options: ThreadsListOptions) =>
executeCommand(options, async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.threadsList.query({ topicId: options.topicId });
}),
);
evalCmd
.command('messages')
.description('Manage evaluation messages')
.command('list')
.description('List messages by topic and optional thread')
.requiredOption('--topic-id <id>', 'Topic ID')
.option('--thread-id <id>', 'Thread ID')
.option('--json', 'Output JSON envelope')
.action(async (options: MessagesListOptions) =>
executeCommand(options, async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.messagesList.query({
threadId: options.threadId,
topicId: options.topicId,
});
}),
);
evalCmd
.command('test-cases')
.description('Manage evaluation test cases')
.command('count')
.description('Count test cases by dataset')
.requiredOption('--dataset-id <id>', 'Dataset ID')
.option('--json', 'Output JSON envelope')
.action(async (options: TestCasesCountOptions) =>
executeCommand(options, async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.testCasesCount.query({ datasetId: options.datasetId });
}),
);
evalCmd
.command('run-topic')
.description('Manage evaluation run-topic reporting')
runTopicCmd
.command('report-result')
.description('Report one evaluation result for a run topic')
.requiredOption('--run-id <id>', 'Run ID')
@@ -306,21 +740,69 @@ export function registerEvalCommand(program: Command) {
.requiredOption('--correct <boolean>', 'Whether the result is correct', parseBoolean)
.requiredOption('--result-json <json>', 'Raw evaluation result JSON object', parseResultJson)
.option('--json', 'Output JSON envelope')
.action(async (options: RunTopicReportResultOptions) =>
executeCommand(
options,
async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.runTopicReportResult.mutate({
correct: options.correct,
result: options.resultJson,
runId: options.runId,
score: options.score,
threadId: options.threadId,
topicId: options.topicId,
});
.action(
async (
options: JsonOption & {
correct: boolean;
resultJson: Record<string, unknown>;
runId: string;
score: number;
threadId?: string;
topicId: string;
},
`Reported result for topic ${pc.bold(options.topicId)}`,
),
) =>
executeCommand(
options,
async () => {
const client = await getTrpcClient();
return client.agentEvalExternal.runTopicReportResult.mutate({
correct: options.correct,
result: options.resultJson,
runId: options.runId,
score: options.score,
threadId: options.threadId,
topicId: options.topicId,
});
},
`Reported result for topic ${pc.bold(options.topicId)}`,
),
);
// ============================================
// Eval Thread Operations (external eval API)
// ============================================
// `eval thread list`: lists the threads of a topic through the external eval router.
const evalThreadCmd = evalCmd.command('thread').description('Manage evaluation threads');

evalThreadCmd
  .command('list')
  .description('List threads by topic')
  .requiredOption('--topic-id <id>', 'Topic ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { topicId: string }) => {
    const listThreads = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEvalExternal.threadsList.query({ topicId: options.topicId });
    };
    return executeCommand(options, listThreads);
  });
// ============================================
// Eval Message Operations (external eval API)
// ============================================
// `eval message list`: lists the messages of a topic through the external
// eval router; `--thread-id` is forwarded as-is (undefined when absent).
const evalMessageCmd = evalCmd.command('message').description('Manage evaluation messages');

evalMessageCmd
  .command('list')
  .description('List messages by topic and optional thread')
  .requiredOption('--topic-id <id>', 'Topic ID')
  .option('--thread-id <id>', 'Thread ID')
  .option('--json', 'Output JSON envelope')
  .action(async (options: JsonOption & { threadId?: string; topicId: string }) => {
    const { threadId, topicId } = options;
    const listMessages = async () => {
      const trpc = await getTrpcClient();
      return trpc.agentEvalExternal.messagesList.query({ threadId, topicId });
    };
    return executeCommand(options, listMessages);
  });
}
+121
View File
@@ -0,0 +1,121 @@
import { Command } from 'commander';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { registerThreadCommand } from './thread';
// Fake tRPC client exposing only the `thread` procedures the command calls.
// Created via vi.hoisted so the object already exists when the vi.mock
// factories below (which vitest hoists to the top of the file) reference it.
const { mockTrpcClient } = vi.hoisted(() => ({
mockTrpcClient: {
thread: {
getThread: { query: vi.fn() },
getThreads: { query: vi.fn() },
removeThread: { mutate: vi.fn() },
},
},
}));
// Stand-in for `getTrpcClient`; each test's beforeEach makes it resolve to
// `mockTrpcClient`.
const { getTrpcClient: mockGetTrpcClient } = vi.hoisted(() => ({
getTrpcClient: vi.fn(),
}));
// Replace the real API client module so no network or auth lookup happens.
vi.mock('../api/client', () => ({ getTrpcClient: mockGetTrpcClient }));
// Replace the logger module with no-op spies.
vi.mock('../utils/logger', () => ({
log: { debug: vi.fn(), error: vi.fn(), info: vi.fn(), warn: vi.fn() },
setVerbose: vi.fn(),
}));
// Unit tests for the top-level `thread` command: each case builds a fresh
// commander program, runs one subcommand against the mocked tRPC client and
// checks the procedure call (or the console output).
describe('thread command', () => {
  let exitSpy: ReturnType<typeof vi.spyOn>;
  let consoleSpy: ReturnType<typeof vi.spyOn>;

  /** Registers the thread command on a new program and parses `thread <args>`. */
  const runThreadCli = async (...args: string[]) => {
    const cli = new Command();
    cli.exitOverride();
    registerThreadCommand(cli);
    await cli.parseAsync(['node', 'test', 'thread', ...args]);
  };

  beforeEach(() => {
    exitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);
    consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
    mockGetTrpcClient.mockResolvedValue(mockTrpcClient);

    // Reset every query/mutate stub so calls and return values do not leak
    // from one test into the next.
    Object.values(mockTrpcClient.thread)
      .flatMap((procedure) => Object.values(procedure))
      .forEach((stub) => {
        (stub as ReturnType<typeof vi.fn>).mockReset();
      });
  });

  afterEach(() => {
    exitSpy.mockRestore();
    consoleSpy.mockRestore();
  });

  describe('list', () => {
    it('should list threads by topic', async () => {
      const getThreads = mockTrpcClient.thread.getThreads.query;
      getThreads.mockResolvedValue([{ id: 't1', title: 'Thread 1', type: 'standalone' }]);

      await runThreadCli('list', '--topic-id', 'topic1');

      expect(getThreads).toHaveBeenCalledWith({ topicId: 'topic1' });
    });

    it('should show empty message when no threads', async () => {
      mockTrpcClient.thread.getThreads.query.mockResolvedValue([]);

      await runThreadCli('list', '--topic-id', 'topic1');

      expect(consoleSpy).toHaveBeenCalledWith('No threads found.');
    });
  });

  describe('list-all', () => {
    it('should list all threads', async () => {
      const getThread = mockTrpcClient.thread.getThread.query;
      getThread.mockResolvedValue([{ id: 't1', title: 'Thread 1', type: 'standalone' }]);

      await runThreadCli('list-all');

      expect(getThread).toHaveBeenCalled();
    });
  });

  describe('delete', () => {
    it('should delete a thread', async () => {
      const removeThread = mockTrpcClient.thread.removeThread.mutate;
      removeThread.mockResolvedValue({});

      await runThreadCli('delete', 't1', '--yes');

      expect(removeThread).toHaveBeenCalledWith({ id: 't1', removeChildren: undefined });
    });

    it('should delete with remove-children flag', async () => {
      const removeThread = mockTrpcClient.thread.removeThread.mutate;
      removeThread.mockResolvedValue({});

      await runThreadCli('delete', 't1', '--remove-children', '--yes');

      expect(removeThread).toHaveBeenCalledWith({ id: 't1', removeChildren: true });
    });
  });
});
+99
View File
@@ -0,0 +1,99 @@
import type { Command } from 'commander';
import pc from 'picocolors';
import { getTrpcClient } from '../api/client';
import { confirm, outputJson, printTable, timeAgo, truncate } from '../utils/format';
/**
 * Prints a thread listing either as JSON or as a table.
 *
 * @param result - Raw query result; anything that is not an array is treated as empty.
 * @param json - Value of `--json`: `undefined` selects table output, a string
 *   is passed to `outputJson` as the field list, `true` outputs all fields.
 * @param withTopic - When true, a TOPIC column is inserted before UPDATED.
 */
function renderThreadListing(
  result: unknown,
  json: string | boolean | undefined,
  withTopic: boolean,
): void {
  const threads = Array.isArray(result) ? result : [];

  if (json !== undefined) {
    outputJson(threads, typeof json === 'string' ? json : undefined);
    return;
  }

  if (threads.length === 0) {
    console.log('No threads found.');
    return;
  }

  const headers = withTopic
    ? ['ID', 'TITLE', 'TYPE', 'TOPIC', 'UPDATED']
    : ['ID', 'TITLE', 'TYPE', 'UPDATED'];

  const rows = threads.map((t: any) => {
    const cells = [t.id || '', truncate(t.title || 'Untitled', 50), t.type || ''];
    if (withTopic) cells.push(t.topicId || '');
    cells.push(t.updatedAt ? timeAgo(t.updatedAt) : '');
    return cells;
  });

  printTable(rows, headers);
}

/**
 * Registers the top-level `thread` command with its `list`, `list-all` and
 * `delete` subcommands on the given commander program.
 */
export function registerThreadCommand(program: Command) {
  const threadCmd = program.command('thread').description('Manage message threads');

  // ── list ──────────────────────────────────────────────
  threadCmd
    .command('list')
    .description('List threads by topic')
    .requiredOption('--topic-id <id>', 'Topic ID')
    .option('--json [fields]', 'Output JSON, optionally specify fields (comma-separated)')
    .action(async (options: { json?: string | boolean; topicId: string }) => {
      const trpc = await getTrpcClient();
      const result = await trpc.thread.getThreads.query({ topicId: options.topicId });
      renderThreadListing(result, options.json, false);
    });

  // ── list-all ──────────────────────────────────────────
  threadCmd
    .command('list-all')
    .description('List all threads for the current user')
    .option('--json [fields]', 'Output JSON, optionally specify fields (comma-separated)')
    .action(async (options: { json?: string | boolean }) => {
      const trpc = await getTrpcClient();
      const result = await trpc.thread.getThread.query();
      renderThreadListing(result, options.json, true);
    });

  // ── delete ────────────────────────────────────────────
  threadCmd
    .command('delete <id>')
    .description('Delete a thread')
    .option('--remove-children', 'Also remove child messages')
    .option('--yes', 'Skip confirmation prompt')
    .action(async (id: string, options: { removeChildren?: boolean; yes?: boolean }) => {
      // Ask before deleting unless --yes was passed.
      const proceed =
        options.yes || (await confirm('Are you sure you want to delete this thread?'));
      if (!proceed) {
        console.log('Cancelled.');
        return;
      }

      const trpc = await getTrpcClient();
      await trpc.thread.removeThread.mutate({
        id,
        removeChildren: options.removeChildren,
      });
      console.log(`${pc.green('✓')} Deleted thread ${pc.bold(id)}`);
    });
}
+2
View File
@@ -25,6 +25,7 @@ import { registerSearchCommand } from './commands/search';
import { registerSessionGroupCommand } from './commands/session-group';
import { registerSkillCommand } from './commands/skill';
import { registerStatusCommand } from './commands/status';
import { registerThreadCommand } from './commands/thread';
import { registerTopicCommand } from './commands/topic';
const require = createRequire(import.meta.url);
@@ -54,6 +55,7 @@ registerGenerateCommand(program);
registerFileCommand(program);
registerSkillCommand(program);
registerSessionGroupCommand(program);
registerThreadCommand(program);
registerTopicCommand(program);
registerMessageCommand(program);
registerModelCommand(program);