mirror of
https://github.com/lobehub/lobe-chat.git
synced 2026-06-14 03:30:19 +00:00
✨ feat(agent-runtime): persist ERROR_CODE_SPECS classification on operation errors (#15273)
* ✨ feat(agent-runtime): persist ERROR_CODE_SPECS classification on operation errors Look up the runtime error's spec in `ERROR_CODE_SPECS` at the single catch chokepoint and merge `attribution` / `category` / `severity` / `httpStatus` / `retryable` / `countAsFailure` / `numericId` onto the normalized `ChatMessageError`. The enriched object flows through to all three downstream sinks — `agent_operations.error` JSONB, S3 trace snapshot, and the agent-gateway WS push — without each consumer having to re-run pattern matching. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * ✨ feat(agent-runtime): enrich inner-step error path too Model-runtime failures caught inside `runtime.step()` resolve normally with `newState.status = 'error'` instead of throwing, so the prior commit's outer `executeStep` catch never sees common provider errors like `InvalidProviderAPIKey` / `InsufficientQuota`. Those were reaching `agent_operations.error` JSONB and the success-path trace snapshot raw — without `attribution` / `category` / `severity` / … Run `formatErrorForState` on `stepResult.newState.error` immediately after `runtime.step()` returns, before the state is saved to Redis, hooks are dispatched, or the trace is finalized. Made the helper idempotent (recognizes already-normalized `ChatMessageError` shape) so a second pass through the outer catch can't collapse it back to `AgentRuntimeError`. Success-path `traceRecorder.finalize` now forwards the classification fields too. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -4,18 +4,51 @@ import type { ILobeAgentRuntimeErrorType } from '../../agentRuntime';
|
||||
import type { ErrorType } from '../../fetch';
|
||||
import type { IToolErrorType } from '../../tool/error';
|
||||
|
||||
/**
|
||||
* Orthogonal to `type`: `type` says *what* the error is, the classification fields below
|
||||
* say *how to react to it*. Sourced from `ERROR_CODE_SPECS` in `model-runtime`
|
||||
* at the point where a thrown error is normalized into `ChatMessageError`, so
|
||||
* downstream consumers (DB JSONB, S3 snapshot, gateway WS push, dashboards)
|
||||
* don't have to redo the classification themselves.
|
||||
*
|
||||
* All fields are optional — codes not registered in `ERROR_CODE_SPECS` (or
|
||||
* fallback shapes like `InternalServerError`) will not carry them.
|
||||
*/
|
||||
export type ChatMessageErrorAttribution = 'user' | 'provider' | 'harness' | 'system';
|
||||
export type ChatMessageErrorSeverity = 'info' | 'warning' | 'error' | 'critical';
|
||||
|
||||
/**
 * Normalized error attached to a chat message.
 *
 * `type` identifies the error. The optional classification fields
 * (`attribution`, `category`, `countAsFailure`, `httpStatus`, `numericId`,
 * `retryable`, `severity`) describe how to react to it; every one of them may
 * be absent.
 */
export interface ChatMessageError {
  /** Who owns the fix — lets dashboards split user-caused from harness-caused errors. */
  attribution?: ChatMessageErrorAttribution;

  /** Free-form payload carried with the error; its shape is not constrained here. */
  body?: any;

  /** Semantic bucket used for slicing (auth / quota / capacity / …). */
  category?: string;

  /** Whether the error counts toward operational failure metrics. */
  countAsFailure?: boolean;

  /** HTTP status the runtime returned (or would return) for this error. */
  httpStatus?: number;

  /** Human-readable description of the error. */
  message?: string;

  /** Stable `E<numericId>` reference for docs / support tickets. */
  numericId?: number;

  /** Transport-level retryability hint. */
  retryable?: boolean;

  /** Severity level assigned to the error. */
  severity?: ChatMessageErrorSeverity;

  /** Error code saying what the error is. */
  type: ErrorType | IToolErrorType | ILobeAgentRuntimeErrorType;
}
|
||||
|
||||
/**
 * Runtime (zod) validator for the chat message error shape: `type` is
 * required and may be a string or a number; every other field is optional.
 */
export const ChatMessageErrorSchema = z.object({
  // Classification: who owns the fix.
  attribution: z.enum(['user', 'provider', 'harness', 'system']).optional(),

  // Arbitrary payload — accepted without validation.
  body: z.any().optional(),

  category: z.string().optional(),
  countAsFailure: z.boolean().optional(),
  httpStatus: z.number().optional(),
  message: z.string().optional(),
  numericId: z.number().optional(),
  retryable: z.boolean().optional(),

  // Classification: how serious the error is.
  severity: z.enum(['info', 'warning', 'error', 'critical']).optional(),

  // Error code; both string and numeric codes are accepted.
  type: z.union([z.string(), z.number()]),
});
|
||||
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
import { AgentRuntimeErrorType, ChatErrorType } from '@lobechat/types';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
|
||||
import { formatErrorForState } from './formatErrorForState';
|
||||
|
||||
describe('formatErrorForState', () => {
  describe('input normalization', () => {
    it('handles ChatCompletionErrorPayload — extracts errorType and message', () => {
      // Shape thrown by the model runtime on LLM failures.
      const providerPayload = {
        error: { detail: 'Unauthorized' },
        errorType: AgentRuntimeErrorType.InvalidProviderAPIKey,
        message: 'Invalid API key',
        provider: 'openai',
      };

      const formatted = formatErrorForState(providerPayload);

      expect(formatted.type).toBe(AgentRuntimeErrorType.InvalidProviderAPIKey);
      expect(formatted.message).toBe('Invalid API key');
      expect(formatted.body).toEqual({ detail: 'Unauthorized' });
    });

    it('wraps standard Error as InternalServerError', () => {
      const thrown = new TypeError('boom');

      const formatted = formatErrorForState(thrown);

      expect(formatted.type).toBe(ChatErrorType.InternalServerError);
      expect(formatted.message).toBe('boom');
      expect(formatted.body).toEqual({ name: 'TypeError' });
    });

    it('falls back to AgentRuntimeError for unknown thrown values', () => {
      const formatted = formatErrorForState('plain string failure');

      expect(formatted.type).toBe(AgentRuntimeErrorType.AgentRuntimeError);
      expect(formatted.message).toBe('plain string failure');
    });
  });

  describe('ERROR_CODE_SPECS enrichment', () => {
    it('attaches classification fields when the errorType is registered in the spec table', () => {
      const formatted = formatErrorForState({
        errorType: AgentRuntimeErrorType.InsufficientQuota,
        message: 'balance exhausted',
      });

      const expectedClassification = {
        attribution: 'user',
        category: 'quota',
        countAsFailure: false,
        httpStatus: 429,
        numericId: 2001,
        retryable: false,
        severity: 'warning',
      };
      expect(formatted).toMatchObject(expectedClassification);
    });

    it('marks provider-side rate limits as retryable with provider attribution', () => {
      const formatted = formatErrorForState({
        errorType: AgentRuntimeErrorType.RateLimitExceeded,
        message: 'RPM exceeded',
      });

      expect(formatted.attribution).toBe('provider');
      expect(formatted.category).toBe('capacity');
      expect(formatted.retryable).toBe(true);
      expect(formatted.countAsFailure).toBe(false);
    });

    it('resolves the QuotaLimitReached → RateLimitExceeded alias', () => {
      const formatted = formatErrorForState({
        errorType: AgentRuntimeErrorType.QuotaLimitReached,
        message: 'rate limited',
      });

      // The original code is kept as `type`; only the classification comes
      // from the aliased entry.
      expect(formatted.type).toBe(AgentRuntimeErrorType.QuotaLimitReached);
      expect(formatted.attribution).toBe('provider');
      expect(formatted.category).toBe('capacity');
    });

    it('is idempotent on an already-normalized ChatMessageError', () => {
      const firstPass = formatErrorForState({
        errorType: AgentRuntimeErrorType.InvalidProviderAPIKey,
        message: 'bad key',
      });
      const secondPass = formatErrorForState(firstPass);

      // A second run must neither collapse the error to AgentRuntimeError nor
      // drop its classification. Both could happen without the
      // already-normalized branch, since the inner-step write on the success
      // path may pass through the helper again when the outer service touches
      // state.error.
      expect(secondPass.type).toBe(AgentRuntimeErrorType.InvalidProviderAPIKey);
      expect(secondPass.attribution).toBe('user');
      expect(secondPass.category).toBe('auth');
      expect(secondPass.message).toBe('bad key');
    });

    it('enriches a partial ChatMessageError that only carries type + message', () => {
      const partial = {
        message: 'balance exhausted',
        type: AgentRuntimeErrorType.InsufficientQuota,
      };

      const formatted = formatErrorForState(partial);

      expect(formatted.attribution).toBe('user');
      expect(formatted.category).toBe('quota');
      expect(formatted.httpStatus).toBe(429);
    });

    it('leaves classification fields unset for codes outside the spec table', () => {
      const formatted = formatErrorForState(new Error('infra blew up'));

      expect(formatted.type).toBe(ChatErrorType.InternalServerError);

      const classificationFields = [
        'attribution',
        'category',
        'severity',
        'httpStatus',
        'retryable',
        'countAsFailure',
        'numericId',
      ] as const;
      for (const field of classificationFields) {
        expect(formatted[field]).toBeUndefined();
      }
    });
  });
});
|
||||
@@ -0,0 +1,91 @@
|
||||
import { getErrorCodeSpec } from '@lobechat/model-runtime';
|
||||
import { AgentRuntimeErrorType, ChatErrorType, type ChatMessageError } from '@lobechat/types';
|
||||
|
||||
/**
 * Copy the classification metadata registered for an error code onto an
 * already-normalized `ChatMessageError`.
 *
 * When `getErrorCodeSpec` has no entry for the code (fallbacks such as
 * `InternalServerError`, or numeric `ChatErrorType` values) the input object
 * is returned as-is, so all classification fields remain optional.
 *
 * Doing this in a single helper gives every downstream consumer
 * (`agent_operations.error` JSONB, S3 trace snapshots, agent-gateway WS push,
 * dashboards) the same shape without repeating the lookup.
 *
 * @param formatted - The normalized error to classify.
 * @returns `formatted` itself when no spec matches, otherwise a new object
 *   with the spec's classification fields merged over it.
 */
const enrichWithSpec = (formatted: ChatMessageError): ChatMessageError => {
  // The lookup is keyed by `ILobeAgentRuntimeErrorType` strings, while
  // `ChatMessageError['type']` also admits numeric `ChatErrorType` values.
  // Stringify the code; numeric codes simply miss and pass through untouched.
  const matchedSpec = getErrorCodeSpec(String(formatted.type));

  if (!matchedSpec) return formatted;

  const { attribution, category, countAsFailure, httpStatus, numericId, retryable, severity } =
    matchedSpec;

  // Spec values are spread last so they win over any same-named field
  // already present on the input.
  return {
    ...formatted,
    attribution,
    category,
    countAsFailure,
    httpStatus,
    numericId,
    retryable,
    severity,
  };
};
|
||||
|
||||
/**
 * Produce a readable message for a thrown value that matched none of the
 * structured shapes handled by `formatErrorForState`.
 *
 * Primitives, and objects with a meaningful string conversion (arrays,
 * objects with a custom `toString`), keep their `String(...)` form. Plain
 * objects would stringify to the uninformative `[object Object]`, so for
 * those a string `message` property is preferred, then a JSON rendering.
 */
const describeUnknownError = (error: unknown): string => {
  if (error === null || typeof error !== 'object') return String(error);

  let coerced: string | undefined;
  try {
    coerced = String(error);
  } catch {
    // Objects that cannot be converted to a primitive (e.g. created with
    // `Object.create(null)`) make `String()` throw — fall through instead.
    coerced = undefined;
  }
  if (coerced !== undefined && coerced !== '[object Object]') return coerced;

  const { message } = error as { message?: unknown };
  if (typeof message === 'string' && message !== '') return message;

  try {
    // `JSON.stringify` yields `undefined` when `toJSON` returns undefined.
    return JSON.stringify(error) ?? '[object Object]';
  } catch {
    // Circular structures and BigInt members make `JSON.stringify` throw.
    return '[object Object]';
  }
};

/**
 * Normalize an arbitrary thrown value into `ChatMessageError`, then attach
 * classification metadata via `enrichWithSpec` so the result is
 * self-describing for everything downstream of the runtime catch block.
 *
 * Input shapes, checked in this order:
 *
 * 1. `ChatCompletionErrorPayload` — what `model-runtime` throws on LLM
 *    failures: `{ errorType, error, provider?, message? }`.
 * 2. Already-normalized `ChatMessageError` (`{ type, message?, body? }`) —
 *    only re-enriched, so the helper is safe to call twice (the inner
 *    `runtime.step()` non-throwing error path and the outer `executeStep`
 *    catch can both run through here without double-wrapping).
 * 3. Standard `Error` instance — wrapped as `InternalServerError`.
 * 4. Anything else — reported as `AgentRuntimeError`. The message is the
 *    value's string form, except that plain objects (which would read
 *    `[object Object]`) use their `message` property or a JSON rendering.
 *
 * @param error - The thrown or stored value to normalize.
 * @returns A `ChatMessageError`, carrying classification fields when the
 *   error code has a registered spec.
 */
export const formatErrorForState = (error: unknown): ChatMessageError => {
  // Path 1: model-runtime payload, recognized by its `errorType` key.
  if (error && typeof error === 'object' && 'errorType' in error) {
    const payload = error as {
      error?: unknown;
      errorType: ChatMessageError['type'];
      message?: string;
    };
    return enrichWithSpec({
      // Falls back to the whole payload when `error` is absent or falsy.
      body: payload.error || error,
      // Falls back to the error code when no (non-empty) message was given.
      message: payload.message || String(payload.errorType),
      type: payload.errorType,
    });
  }

  // Path 2: already-normalized ChatMessageError shape — has `type` but not
  // `errorType`, and isn't a thrown Error instance. Common when the inner
  // runtime.step() catch has already stuffed a partial ChatMessageError into
  // `newState.error` and the outer service is just topping it up.
  if (
    error &&
    typeof error === 'object' &&
    !(error instanceof Error) &&
    'type' in error &&
    (typeof (error as { type: unknown }).type === 'string' ||
      typeof (error as { type: unknown }).type === 'number')
  ) {
    return enrichWithSpec(error as ChatMessageError);
  }

  // Path 3: a genuine Error instance. Only its name is kept in the body.
  if (error instanceof Error) {
    return enrichWithSpec({
      body: { name: error.name },
      message: error.message,
      type: ChatErrorType.InternalServerError,
    });
  }

  // Path 4: anything else (strings, numbers, null, unstructured objects).
  return enrichWithSpec({
    body: error,
    message: describeUnknownError(error),
    type: AgentRuntimeErrorType.AgentRuntimeError,
  });
};
|
||||
@@ -19,13 +19,7 @@ import {
|
||||
invokeAgentSpanName,
|
||||
tracer as agentRuntimeTracer,
|
||||
} from '@lobechat/observability-otel/modules/agent-runtime';
|
||||
import {
|
||||
AgentRuntimeErrorType,
|
||||
ChatErrorType,
|
||||
type ChatMessageError,
|
||||
type ExecSubAgentTaskParams,
|
||||
type UIChatMessage,
|
||||
} from '@lobechat/types';
|
||||
import { type ExecSubAgentTaskParams, type UIChatMessage } from '@lobechat/types';
|
||||
import debug from 'debug';
|
||||
import urlJoin from 'url-join';
|
||||
|
||||
@@ -34,6 +28,7 @@ import { type LobeChatDatabase } from '@/database/type';
|
||||
import { appEnv } from '@/envs/app';
|
||||
import { type AgentRuntimeCoordinatorOptions } from '@/server/modules/AgentRuntime';
|
||||
import { AgentRuntimeCoordinator, createStreamEventManager } from '@/server/modules/AgentRuntime';
|
||||
import { formatErrorForState } from '@/server/modules/AgentRuntime/formatErrorForState';
|
||||
import {
|
||||
createRuntimeExecutors,
|
||||
type RuntimeExecutorContext,
|
||||
@@ -73,43 +68,6 @@ if (process.env.VERCEL) {
|
||||
|
||||
const log = debug('lobe-server:agent-runtime-service');
|
||||
|
||||
/**
 * Convert an arbitrary thrown value into the `ChatMessageError` structure.
 *
 * Recognizes, in order: an LLM error payload carrying `errorType`, a standard
 * `Error` instance, and finally any other value.
 */
function formatErrorForState(error: unknown): ChatMessageError {
  // LLM failures arrive as ChatCompletionErrorPayload,
  // e.g., { errorType: 'InvalidProviderAPIKey', error: { ... }, provider: 'openai' }
  if (error && typeof error === 'object' && 'errorType' in error) {
    const {
      error: innerError,
      errorType,
      message,
    } = error as {
      error?: unknown;
      errorType: ChatMessageError['type'];
      message?: string;
    };

    const llmError: ChatMessageError = {
      body: innerError || error,
      message: message || String(errorType),
      type: errorType,
    };
    return llmError;
  }

  // Standard Error objects become InternalServerError; only the name is kept in the body
  if (error instanceof Error) {
    const { message, name } = error;
    return { body: { name }, message, type: ChatErrorType.InternalServerError };
  }

  // Anything else is reported as a generic agent runtime error
  return {
    body: error,
    message: String(error),
    type: AgentRuntimeErrorType.AgentRuntimeError,
  };
}
|
||||
|
||||
const toAgentSignalSnapshotEvents = (
|
||||
emission: Awaited<ReturnType<typeof emitAgentSignalSourceEvent>> | undefined,
|
||||
) => {
|
||||
@@ -775,6 +733,15 @@ export class AgentRuntimeService {
|
||||
const startAt = Date.now();
|
||||
const stepResult = await runtime.step(currentState, currentContext);
|
||||
|
||||
// Inner runtime.step() catches model-runtime exceptions and stuffs the
|
||||
// raw error into newState.error without re-throwing — so the outer
|
||||
// catch at the bottom of this method never sees them. Normalize +
|
||||
// classify here so the raw error doesn't reach Redis state, the
|
||||
// success-path trace finalize, or `persistCompletion`'s JSONB write.
|
||||
if (stepResult.newState.error) {
|
||||
stepResult.newState.error = formatErrorForState(stepResult.newState.error);
|
||||
}
|
||||
|
||||
// Check if the operation was interrupted while the step was executing
|
||||
// (e.g., user clicked abort during a long LLM call)
|
||||
const latestState = await this.coordinator.loadAgentState(operationId);
|
||||
@@ -999,19 +966,23 @@ export class AgentRuntimeService {
|
||||
// Finalize tracing snapshot. The error catch below uses the same
|
||||
// recorder so propagated failures still write the canonical S3
|
||||
// snapshot instead of orphaning the partial.
|
||||
const newStateError = stepResult.newState.error;
|
||||
await this.traceRecorder.finalize(operationId, {
|
||||
appendEventsToLastStep: completionSignalEvents,
|
||||
completionReason: reason,
|
||||
error: stepResult.newState.error
|
||||
error: newStateError
|
||||
? {
|
||||
attribution: newStateError.attribution,
|
||||
category: newStateError.category,
|
||||
countAsFailure: newStateError.countAsFailure,
|
||||
httpStatus: newStateError.httpStatus,
|
||||
message:
|
||||
this.completionLifecycle.extractErrorMessage(stepResult.newState.error) ??
|
||||
JSON.stringify(stepResult.newState.error),
|
||||
type: String(
|
||||
stepResult.newState.error.type ??
|
||||
stepResult.newState.error.errorType ??
|
||||
'unknown',
|
||||
),
|
||||
this.completionLifecycle.extractErrorMessage(newStateError) ??
|
||||
JSON.stringify(newStateError),
|
||||
numericId: newStateError.numericId,
|
||||
retryable: newStateError.retryable,
|
||||
severity: newStateError.severity,
|
||||
type: String(newStateError.type ?? newStateError.errorType ?? 'unknown'),
|
||||
}
|
||||
: undefined,
|
||||
state: stepResult.newState,
|
||||
@@ -1111,7 +1082,14 @@ export class AgentRuntimeService {
|
||||
await this.traceRecorder.finalize(operationId, {
|
||||
completionReason: 'error',
|
||||
error: {
|
||||
attribution: formattedError.attribution,
|
||||
category: formattedError.category,
|
||||
countAsFailure: formattedError.countAsFailure,
|
||||
httpStatus: formattedError.httpStatus,
|
||||
message: formattedError.message ?? String(formattedError.type),
|
||||
numericId: formattedError.numericId,
|
||||
retryable: formattedError.retryable,
|
||||
severity: formattedError.severity,
|
||||
type: String(formattedError.type),
|
||||
},
|
||||
failedStep: { startedAt: stepStartAt, stepIndex },
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import type { ISnapshotStore, StepSnapshot } from '@lobechat/agent-tracing';
|
||||
import type { ChatMessageErrorAttribution, ChatMessageErrorSeverity } from '@lobechat/types';
|
||||
import debug from 'debug';
|
||||
|
||||
import type { StepCompletionReason, StepPresentationData } from './types';
|
||||
@@ -46,7 +47,23 @@ export interface FinalizeParams {
|
||||
*/
|
||||
appendEventsToLastStep?: SignalEvent[];
|
||||
completionReason: StepCompletionReason;
|
||||
error?: { message: string; type: string };
|
||||
/**
|
||||
* Top-level error on the persisted snapshot. The classification fields
|
||||
* (`attribution`, `category`, `severity`, …) mirror `ChatMessageError` and
|
||||
* are sourced from `ERROR_CODE_SPECS` at the runtime catch site; unknown
|
||||
* codes simply omit them.
|
||||
*/
|
||||
error?: {
|
||||
attribution?: ChatMessageErrorAttribution;
|
||||
category?: string;
|
||||
countAsFailure?: boolean;
|
||||
httpStatus?: number;
|
||||
message: string;
|
||||
numericId?: number;
|
||||
retryable?: boolean;
|
||||
severity?: ChatMessageErrorSeverity;
|
||||
type: string;
|
||||
};
|
||||
/**
|
||||
* Synthetic step record for the error path. The real failing step never
|
||||
* reached `appendStep` because the executor threw before the partial push,
|
||||
|
||||
Reference in New Issue
Block a user