feat(agent-runtime): persist ERROR_CODE_SPECS classification on operation errors (#15273)

*  feat(agent-runtime): persist ERROR_CODE_SPECS classification on operation errors

Look up the runtime error's spec in `ERROR_CODE_SPECS` at the single catch
chokepoint and merge `attribution` / `category` / `severity` / `httpStatus`
/ `retryable` / `countAsFailure` / `numericId` onto the normalized
`ChatMessageError`. The enriched object flows through to all three
downstream sinks — `agent_operations.error` JSONB, S3 trace snapshot,
and the agent-gateway WS push — without each consumer having to re-run
pattern matching.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

*  feat(agent-runtime): enrich inner-step error path too

Model-runtime failures caught inside `runtime.step()` resolve normally with
`newState.status = 'error'` instead of throwing, so the prior commit's outer
`executeStep` catch never sees common provider errors like
`InvalidProviderAPIKey` / `InsufficientQuota`. Those were reaching
`agent_operations.error` JSONB and the success-path trace snapshot raw —
without `attribution` / `category` / `severity` / …

Run `formatErrorForState` on `stepResult.newState.error` immediately after
`runtime.step()` returns, before the state is saved to Redis, hooks are
dispatched, or the trace is finalized. Made the helper idempotent (recognizes
already-normalized `ChatMessageError` shape) so a second pass through the
outer catch can't collapse it back to `AgentRuntimeError`. Success-path
`traceRecorder.finalize` now forwards the classification fields too.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Arvin Xu
2026-05-28 02:25:16 +08:00
committed by GitHub
parent 1ae8498fc7
commit 8c0e66b633
5 changed files with 291 additions and 53 deletions
+33
View File
@@ -4,18 +4,51 @@ import type { ILobeAgentRuntimeErrorType } from '../../agentRuntime';
import type { ErrorType } from '../../fetch';
import type { IToolErrorType } from '../../tool/error';
/**
* Orthogonal to `type`: `type` says *what* the error is, the classification fields below
* say *how to react to it*. Sourced from `ERROR_CODE_SPECS` in `model-runtime`
* at the point where a thrown error is normalized into `ChatMessageError`, so
* downstream consumers (DB JSONB, S3 snapshot, gateway WS push, dashboards)
* don't have to redo the classification themselves.
*
* All fields are optional — codes not registered in `ERROR_CODE_SPECS` (or
* fallback shapes like `InternalServerError`) will not carry them.
*/
/** Who owns the fix for an error: one of `user`, `provider`, `harness` or `system`. */
export type ChatMessageErrorAttribution = 'user' | 'provider' | 'harness' | 'system';
/** Severity level attached to a classified error: one of `info`, `warning`, `error` or `critical`. */
export type ChatMessageErrorSeverity = 'info' | 'warning' | 'error' | 'critical';
/**
* Chat message error object.
*
* Only `type` is required. `message` and `body` carry optional detail, and the
* remaining optional fields are classification metadata describing how to
* react to the error.
*/
export interface ChatMessageError {
/** Who owns the fix — surfaces user-vs-harness split on dashboards. */
attribution?: ChatMessageErrorAttribution;
/** Additional detail attached to the error; deliberately untyped. */
body?: any;
/** Semantic bucket for slicing (auth / quota / capacity / …). */
category?: string;
/** Whether this counts toward operational failure metrics. */
countAsFailure?: boolean;
/** HTTP status the runtime returned (or would return) for this error. */
httpStatus?: number;
/** Human-readable description of the error. */
message?: string;
/** Stable `E<numericId>` reference for docs / support tickets. */
numericId?: number;
/** Transport-level retryability hint. */
retryable?: boolean;
/** Severity level of the error. */
severity?: ChatMessageErrorSeverity;
/** Error code identifying *what* the error is. */
type: ErrorType | IToolErrorType | ILobeAgentRuntimeErrorType;
}
/**
* Zod schema with the same field names as `ChatMessageError`.
*
* Every field except `type` is optional; `type` accepts either a string or a
* number. The `attribution` and `severity` enums list the same literals as
* `ChatMessageErrorAttribution` and `ChatMessageErrorSeverity`.
*/
export const ChatMessageErrorSchema = z.object({
attribution: z.enum(['user', 'provider', 'harness', 'system']).optional(),
body: z.any().optional(),
category: z.string().optional(),
countAsFailure: z.boolean().optional(),
httpStatus: z.number().optional(),
message: z.string().optional(),
numericId: z.number().optional(),
retryable: z.boolean().optional(),
severity: z.enum(['info', 'warning', 'error', 'critical']).optional(),
type: z.union([z.string(), z.number()]),
});
@@ -0,0 +1,119 @@
import { AgentRuntimeErrorType, ChatErrorType } from '@lobechat/types';
import { describe, expect, it } from 'vitest';
import { formatErrorForState } from './formatErrorForState';
// Unit tests for `formatErrorForState`. The first group pins how each accepted
// input shape is normalized into `{ type, message, body }`; the second group
// pins which classification fields are attached for specific error codes.
describe('formatErrorForState', () => {
// Shape normalization only — no classification fields are asserted here.
describe('input normalization', () => {
it('handles ChatCompletionErrorPayload — extracts errorType and message', () => {
const result = formatErrorForState({
error: { detail: 'Unauthorized' },
errorType: AgentRuntimeErrorType.InvalidProviderAPIKey,
message: 'Invalid API key',
provider: 'openai',
});
expect(result.type).toBe(AgentRuntimeErrorType.InvalidProviderAPIKey);
expect(result.message).toBe('Invalid API key');
// The payload's nested `error` becomes `body`.
expect(result.body).toEqual({ detail: 'Unauthorized' });
});
it('wraps standard Error as InternalServerError', () => {
const result = formatErrorForState(new TypeError('boom'));
expect(result.type).toBe(ChatErrorType.InternalServerError);
expect(result.message).toBe('boom');
// Only the error's `name` is kept in `body`.
expect(result.body).toEqual({ name: 'TypeError' });
});
it('falls back to AgentRuntimeError for unknown thrown values', () => {
const result = formatErrorForState('plain string failure');
expect(result.type).toBe(AgentRuntimeErrorType.AgentRuntimeError);
expect(result.message).toBe('plain string failure');
});
});
// Classification metadata expected for specific error codes.
describe('ERROR_CODE_SPECS enrichment', () => {
it('attaches classification fields when the errorType is registered in the spec table', () => {
const result = formatErrorForState({
errorType: AgentRuntimeErrorType.InsufficientQuota,
message: 'balance exhausted',
});
expect(result).toMatchObject({
attribution: 'user',
category: 'quota',
countAsFailure: false,
httpStatus: 429,
numericId: 2001,
retryable: false,
severity: 'warning',
});
});
it('marks provider-side rate limits as retryable with provider attribution', () => {
const result = formatErrorForState({
errorType: AgentRuntimeErrorType.RateLimitExceeded,
message: 'RPM exceeded',
});
expect(result.attribution).toBe('provider');
expect(result.category).toBe('capacity');
expect(result.retryable).toBe(true);
expect(result.countAsFailure).toBe(false);
});
it('resolves the QuotaLimitReached → RateLimitExceeded alias', () => {
const result = formatErrorForState({
errorType: AgentRuntimeErrorType.QuotaLimitReached,
message: 'rate limited',
});
// The original code is kept as `type`; only the classification is expected
// to match the rate-limit case above.
expect(result.type).toBe(AgentRuntimeErrorType.QuotaLimitReached);
expect(result.attribution).toBe('provider');
expect(result.category).toBe('capacity');
});
it('is idempotent on an already-normalized ChatMessageError', () => {
const once = formatErrorForState({
errorType: AgentRuntimeErrorType.InvalidProviderAPIKey,
message: 'bad key',
});
const twice = formatErrorForState(once);
// Re-running the helper must not collapse to AgentRuntimeError or strip
// classification — both are real risks if the early-return branch is
// missing, because the success-path inner-step write can run through
// here a second time when the outer service touches state.error again.
expect(twice.type).toBe(AgentRuntimeErrorType.InvalidProviderAPIKey);
expect(twice.attribution).toBe('user');
expect(twice.category).toBe('auth');
expect(twice.message).toBe('bad key');
});
it('enriches a partial ChatMessageError that only carries type + message', () => {
// Input uses `type` (not `errorType`), i.e. the already-normalized shape.
const result = formatErrorForState({
message: 'balance exhausted',
type: AgentRuntimeErrorType.InsufficientQuota,
});
expect(result.attribution).toBe('user');
expect(result.category).toBe('quota');
expect(result.httpStatus).toBe(429);
});
it('leaves classification fields unset for codes outside the spec table', () => {
const result = formatErrorForState(new Error('infra blew up'));
expect(result.type).toBe(ChatErrorType.InternalServerError);
expect(result.attribution).toBeUndefined();
expect(result.category).toBeUndefined();
expect(result.severity).toBeUndefined();
expect(result.httpStatus).toBeUndefined();
expect(result.retryable).toBeUndefined();
expect(result.countAsFailure).toBeUndefined();
expect(result.numericId).toBeUndefined();
});
});
});
@@ -0,0 +1,91 @@
import { getErrorCodeSpec } from '@lobechat/model-runtime';
import { AgentRuntimeErrorType, ChatErrorType, type ChatMessageError } from '@lobechat/types';
/**
 * Decorate a normalized `ChatMessageError` with the classification metadata
 * that `getErrorCodeSpec` returns for its `type`.
 *
 * When the lookup finds nothing (fallback codes such as `InternalServerError`,
 * or numeric `ChatErrorType` values) the input object is handed back untouched,
 * so every classification field stays optional. Otherwise a shallow copy is
 * returned with the seven classification fields taken from the spec.
 *
 * Centralizing this means downstream consumers (`agent_operations.error`
 * JSONB, S3 trace snapshots, agent-gateway WS push, dashboards) all receive
 * the same shape without re-running pattern matching themselves.
 */
const enrichWithSpec = (formatted: ChatMessageError): ChatMessageError => {
  // The lookup is keyed by string codes, but `ChatMessageError['type']` also
  // admits numeric `ChatErrorType` values. Stringify before the lookup; numeric
  // codes simply miss and fall through to the untouched return below.
  const matchedSpec = getErrorCodeSpec(String(formatted.type));

  if (matchedSpec) {
    const { attribution, category, countAsFailure, httpStatus, numericId, retryable, severity } =
      matchedSpec;

    return {
      ...formatted,
      attribution,
      category,
      countAsFailure,
      httpStatus,
      numericId,
      retryable,
      severity,
    };
  }

  return formatted;
};
/** True when `value` can serve as a `ChatMessageError['type']` at runtime. */
const isErrorCode = (value: unknown): value is string | number =>
  typeof value === 'string' || typeof value === 'number';

/**
 * Build a readable message for a value that matched none of the known shapes.
 * Primitives keep their `String(...)` form. Objects use their own non-empty
 * string `message` if they have one, otherwise their JSON form, so the result
 * is not the uninformative `"[object Object]"`.
 */
const describeUnknownError = (value: unknown): string => {
  if (typeof value !== 'object' || value === null) return String(value);

  const { message } = value as { message?: unknown };
  if (typeof message === 'string' && message) return message;

  try {
    // `JSON.stringify` can return `undefined` at runtime (e.g. `toJSON` returning undefined).
    return JSON.stringify(value) ?? String(value);
  } catch {
    // Circular references or BigInt members make `JSON.stringify` throw.
    return String(value);
  }
};

/**
 * Normalize an arbitrary thrown value into `ChatMessageError`, then attach
 * classification metadata through `enrichWithSpec` so the resulting object
 * is self-describing for everything downstream of the runtime catch block.
 *
 * Input shapes, checked in this order:
 *
 * 1. `ChatCompletionErrorPayload` — what `model-runtime` throws on LLM
 *    failures: `{ errorType, error, provider?, message? }`. `errorType` must
 *    be a string or number; otherwise the value falls through to the later
 *    paths instead of producing an error whose `type` is `undefined`.
 * 2. Already-normalized `ChatMessageError` (`{ type, message?, body? }`) —
 *    passed to `enrichWithSpec` as-is, never re-wrapped, so the helper is safe
 *    to call twice (the inner `runtime.step()` non-throwing error path and the
 *    outer `executeStep` catch can both run through here).
 * 3. Standard `Error` instance — wrapped as `InternalServerError`, keeping
 *    only its `name` in `body`.
 * 4. Anything else — reported as `AgentRuntimeError` with the raw value as
 *    `body`.
 *
 * @param error - The caught or stored value; may be anything.
 * @returns A `ChatMessageError` whose `type` is always a string or number.
 */
export const formatErrorForState = (error: unknown): ChatMessageError => {
  if (typeof error === 'object' && error !== null) {
    const candidate = error as {
      error?: unknown;
      errorType?: unknown;
      message?: string;
      type?: unknown;
    };

    // Path 1: ChatCompletionErrorPayload. Takes precedence over the `Error`
    // check so an Error instance carrying a valid `errorType` keeps its code.
    if (isErrorCode(candidate.errorType)) {
      return enrichWithSpec({
        // `||` (not `??`) is kept on purpose: a falsy nested `error` falls back
        // to the whole payload, and an empty message falls back to the code.
        body: candidate.error || error,
        message: candidate.message || String(candidate.errorType),
        type: candidate.errorType as ChatMessageError['type'],
      });
    }

    // Path 2: already-normalized ChatMessageError shape — has a usable `type`
    // but no usable `errorType`, and isn't a thrown Error instance. Common when
    // the inner runtime.step() catch has already stored a partial
    // ChatMessageError in `newState.error` and the outer service is topping it up.
    if (!(error instanceof Error) && isErrorCode(candidate.type)) {
      return enrichWithSpec(error as ChatMessageError);
    }

    // Path 3: standard Error instance.
    if (error instanceof Error) {
      return enrichWithSpec({
        body: { name: error.name },
        message: error.message,
        type: ChatErrorType.InternalServerError,
      });
    }
  }

  // Path 4: primitives, null/undefined, and objects of unknown shape.
  return enrichWithSpec({
    body: error,
    message: describeUnknownError(error),
    type: AgentRuntimeErrorType.AgentRuntimeError,
  });
};
@@ -19,13 +19,7 @@ import {
invokeAgentSpanName,
tracer as agentRuntimeTracer,
} from '@lobechat/observability-otel/modules/agent-runtime';
import {
AgentRuntimeErrorType,
ChatErrorType,
type ChatMessageError,
type ExecSubAgentTaskParams,
type UIChatMessage,
} from '@lobechat/types';
import { type ExecSubAgentTaskParams, type UIChatMessage } from '@lobechat/types';
import debug from 'debug';
import urlJoin from 'url-join';
@@ -34,6 +28,7 @@ import { type LobeChatDatabase } from '@/database/type';
import { appEnv } from '@/envs/app';
import { type AgentRuntimeCoordinatorOptions } from '@/server/modules/AgentRuntime';
import { AgentRuntimeCoordinator, createStreamEventManager } from '@/server/modules/AgentRuntime';
import { formatErrorForState } from '@/server/modules/AgentRuntime/formatErrorForState';
import {
createRuntimeExecutors,
type RuntimeExecutorContext,
@@ -73,43 +68,6 @@ if (process.env.VERCEL) {
const log = debug('lobe-server:agent-runtime-service');
/**
 * Converts any caught value into the `ChatMessageError` structure.
 *
 * - Objects with an `errorType` key (the ChatCompletionErrorPayload format from
 *   LLM errors, e.g. `{ errorType: 'InvalidProviderAPIKey', error: { ... }, provider: 'openai' }`)
 *   keep that code as `type`.
 * - Standard `Error` objects become `InternalServerError`, with only their
 *   `name` kept in `body`.
 * - Everything else becomes `AgentRuntimeError`, with the raw value as `body`.
 */
function formatErrorForState(error: unknown): ChatMessageError {
  if (typeof error === 'object' && error !== null) {
    // The payload check runs first, so an Error carrying `errorType` is treated as a payload.
    if ('errorType' in error) {
      const {
        error: cause,
        errorType,
        message,
      } = error as {
        error?: unknown;
        errorType: ChatMessageError['type'];
        message?: string;
      };

      return {
        body: cause || error,
        message: message || String(errorType),
        type: errorType,
      };
    }

    if (error instanceof Error) {
      const { name, message } = error;

      return {
        body: { name },
        message,
        type: ChatErrorType.InternalServerError,
      };
    }
  }

  // Fallback for unknown error types
  return {
    body: error,
    message: String(error),
    type: AgentRuntimeErrorType.AgentRuntimeError,
  };
}
const toAgentSignalSnapshotEvents = (
emission: Awaited<ReturnType<typeof emitAgentSignalSourceEvent>> | undefined,
) => {
@@ -775,6 +733,15 @@ export class AgentRuntimeService {
const startAt = Date.now();
const stepResult = await runtime.step(currentState, currentContext);
// Inner runtime.step() catches model-runtime exceptions and stuffs the
// raw error into newState.error without re-throwing — so the outer
// catch at the bottom of this method never sees them. Normalize +
// classify here so the raw error doesn't reach Redis state, the
// success-path trace finalize, or `persistCompletion`'s JSONB write.
if (stepResult.newState.error) {
stepResult.newState.error = formatErrorForState(stepResult.newState.error);
}
// Check if the operation was interrupted while the step was executing
// (e.g., user clicked abort during a long LLM call)
const latestState = await this.coordinator.loadAgentState(operationId);
@@ -999,19 +966,23 @@ export class AgentRuntimeService {
// Finalize tracing snapshot. The error catch below uses the same
// recorder so propagated failures still write the canonical S3
// snapshot instead of orphaning the partial one.
const newStateError = stepResult.newState.error;
await this.traceRecorder.finalize(operationId, {
appendEventsToLastStep: completionSignalEvents,
completionReason: reason,
error: stepResult.newState.error
error: newStateError
? {
attribution: newStateError.attribution,
category: newStateError.category,
countAsFailure: newStateError.countAsFailure,
httpStatus: newStateError.httpStatus,
message:
this.completionLifecycle.extractErrorMessage(stepResult.newState.error) ??
JSON.stringify(stepResult.newState.error),
type: String(
stepResult.newState.error.type ??
stepResult.newState.error.errorType ??
'unknown',
),
this.completionLifecycle.extractErrorMessage(newStateError) ??
JSON.stringify(newStateError),
numericId: newStateError.numericId,
retryable: newStateError.retryable,
severity: newStateError.severity,
type: String(newStateError.type ?? newStateError.errorType ?? 'unknown'),
}
: undefined,
state: stepResult.newState,
@@ -1111,7 +1082,14 @@ export class AgentRuntimeService {
await this.traceRecorder.finalize(operationId, {
completionReason: 'error',
error: {
attribution: formattedError.attribution,
category: formattedError.category,
countAsFailure: formattedError.countAsFailure,
httpStatus: formattedError.httpStatus,
message: formattedError.message ?? String(formattedError.type),
numericId: formattedError.numericId,
retryable: formattedError.retryable,
severity: formattedError.severity,
type: String(formattedError.type),
},
failedStep: { startedAt: stepStartAt, stepIndex },
@@ -1,4 +1,5 @@
import type { ISnapshotStore, StepSnapshot } from '@lobechat/agent-tracing';
import type { ChatMessageErrorAttribution, ChatMessageErrorSeverity } from '@lobechat/types';
import debug from 'debug';
import type { StepCompletionReason, StepPresentationData } from './types';
@@ -46,7 +47,23 @@ export interface FinalizeParams {
*/
appendEventsToLastStep?: SignalEvent[];
completionReason: StepCompletionReason;
error?: { message: string; type: string };
/**
* Top-level error on the persisted snapshot. The classification fields
* (`attribution`, `category`, `severity`, …) mirror `ChatMessageError` and
* are sourced from `ERROR_CODE_SPECS` at the runtime catch site; unknown
* codes simply omit them.
*/
error?: {
attribution?: ChatMessageErrorAttribution;
category?: string;
countAsFailure?: boolean;
httpStatus?: number;
message: string;
numericId?: number;
retryable?: boolean;
severity?: ChatMessageErrorSeverity;
type: string;
};
/**
* Synthetic step record for the error path. The real failing step never
* reached `appendStep` because the executor threw before the partial push,