✨ feat(agent-runtime): persist ERROR_CODE_SPECS classification on operation errors (#15273)

* ✨ feat(agent-runtime): persist ERROR_CODE_SPECS classification on operation errors Look up the runtime error's spec in `ERROR_CODE_SPECS` at the single catch chokepoint and merge `attribution` / `category` / `severity` / `httpStatus` / `retryable` / `countAsFailure` / `numericId` onto the normalized `ChatMessageError`. The enriched object flows through to all three downstream sinks — `agent_operations.error` JSONB, S3 trace snapshot, and the agent-gateway WS push — without each consumer having to re-run pattern matching. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * ✨ feat(agent-runtime): enrich inner-step error path too Model-runtime failures caught inside `runtime.step()` resolve normally with `newState.status = 'error'` instead of throwing, so the prior commit's outer `executeStep` catch never sees common provider errors like `InvalidProviderAPIKey` / `InsufficientQuota`. Those were reaching `agent_operations.error` JSONB and the success-path trace snapshot raw — without `attribution` / `category` / `severity` / … Run `formatErrorForState` on `stepResult.newState.error` immediately after `runtime.step()` returns, before the state is saved to Redis, hooks are dispatched, or the trace is finalized. Made the helper idempotent (recognizes already-normalized `ChatMessageError` shape) so a second pass through the outer catch can't collapse it back to `AgentRuntimeError`. Success-path `traceRecorder.finalize` now forwards the classification fields too. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-14 03:30:19 +00:00 · 2026-05-28 02:25:16 +08:00
parent 1ae8498fc7
commit 8c0e66b633
5 changed files with 291 additions and 53 deletions
@@ -4,18 +4,51 @@ import type { ILobeAgentRuntimeErrorType } from '../../agentRuntime';
 import type { ErrorType } from '../../fetch';
 import type { IToolErrorType } from '../../tool/error';

+/**
+ * Orthogonal to `type`: `type` says *what* the error is, the classification
+ * fields say *how to react to it*. Sourced from `ERROR_CODE_SPECS` in `model-runtime`
+ * at the point where a thrown error is normalized into `ChatMessageError`, so
+ * downstream consumers (DB JSONB, S3 snapshot, gateway WS push, dashboards)
+ * don't have to redo the classification themselves.
+ *
+ * All fields are optional — codes not registered in `ERROR_CODE_SPECS` (or
+ * fallback shapes like `InternalServerError`) will not carry them.
+ */
+export type ChatMessageErrorAttribution = 'user' | 'provider' | 'harness' | 'system';
+export type ChatMessageErrorSeverity = 'info' | 'warning' | 'error' | 'critical';
+
 /**
 * Chat message error object
 */
 export interface ChatMessageError {
+  /** Who owns the fix — surfaces user-vs-harness split on dashboards. */
+  attribution?: ChatMessageErrorAttribution;
  body?: any;
+  /** Semantic bucket for slicing (auth / quota / capacity / …). */
+  category?: string;
+  /** Whether this counts toward operational failure metrics. */
+  countAsFailure?: boolean;
+  /** HTTP status the runtime returned (or would return) for this error. */
+  httpStatus?: number;
  message?: string;
+  /** Stable `E<numericId>` reference for docs / support tickets. */
+  numericId?: number;
+  /** Transport-level retryability hint. */
+  retryable?: boolean;
+  severity?: ChatMessageErrorSeverity;
  type: ErrorType | IToolErrorType | ILobeAgentRuntimeErrorType;
 }

 export const ChatMessageErrorSchema = z.object({
+  attribution: z.enum(['user', 'provider', 'harness', 'system']).optional(),
  body: z.any().optional(),
+  category: z.string().optional(),
+  countAsFailure: z.boolean().optional(),
+  httpStatus: z.number().optional(),
  message: z.string().optional(),
+  numericId: z.number().optional(),
+  retryable: z.boolean().optional(),
+  severity: z.enum(['info', 'warning', 'error', 'critical']).optional(),
  type: z.union([z.string(), z.number()]),
 });

@@ -0,0 +1,119 @@
+import { AgentRuntimeErrorType, ChatErrorType } from '@lobechat/types';
+import { describe, expect, it } from 'vitest';
+
+import { formatErrorForState } from './formatErrorForState';
+
+describe('formatErrorForState', () => {
+  describe('input normalization', () => {
+    it('handles ChatCompletionErrorPayload — extracts errorType and message', () => {
+      const result = formatErrorForState({
+        error: { detail: 'Unauthorized' },
+        errorType: AgentRuntimeErrorType.InvalidProviderAPIKey,
+        message: 'Invalid API key',
+        provider: 'openai',
+      });
+
+      expect(result.type).toBe(AgentRuntimeErrorType.InvalidProviderAPIKey);
+      expect(result.message).toBe('Invalid API key');
+      expect(result.body).toEqual({ detail: 'Unauthorized' });
+    });
+
+    it('wraps standard Error as InternalServerError', () => {
+      const result = formatErrorForState(new TypeError('boom'));
+
+      expect(result.type).toBe(ChatErrorType.InternalServerError);
+      expect(result.message).toBe('boom');
+      expect(result.body).toEqual({ name: 'TypeError' });
+    });
+
+    it('falls back to AgentRuntimeError for unknown thrown values', () => {
+      const result = formatErrorForState('plain string failure');
+
+      expect(result.type).toBe(AgentRuntimeErrorType.AgentRuntimeError);
+      expect(result.message).toBe('plain string failure');
+    });
+  });
+
+  describe('ERROR_CODE_SPECS enrichment', () => {
+    it('attaches classification fields when the errorType is registered in the spec table', () => {
+      const result = formatErrorForState({
+        errorType: AgentRuntimeErrorType.InsufficientQuota,
+        message: 'balance exhausted',
+      });
+
+      expect(result).toMatchObject({
+        attribution: 'user',
+        category: 'quota',
+        countAsFailure: false,
+        httpStatus: 429,
+        numericId: 2001,
+        retryable: false,
+        severity: 'warning',
+      });
+    });
+
+    it('marks provider-side rate limits as retryable with provider attribution', () => {
+      const result = formatErrorForState({
+        errorType: AgentRuntimeErrorType.RateLimitExceeded,
+        message: 'RPM exceeded',
+      });
+
+      expect(result.attribution).toBe('provider');
+      expect(result.category).toBe('capacity');
+      expect(result.retryable).toBe(true);
+      expect(result.countAsFailure).toBe(false);
+    });
+
+    it('resolves the QuotaLimitReached → RateLimitExceeded alias', () => {
+      const result = formatErrorForState({
+        errorType: AgentRuntimeErrorType.QuotaLimitReached,
+        message: 'rate limited',
+      });
+
+      expect(result.type).toBe(AgentRuntimeErrorType.QuotaLimitReached);
+      expect(result.attribution).toBe('provider');
+      expect(result.category).toBe('capacity');
+    });
+
+    it('is idempotent on an already-normalized ChatMessageError', () => {
+      const once = formatErrorForState({
+        errorType: AgentRuntimeErrorType.InvalidProviderAPIKey,
+        message: 'bad key',
+      });
+      const twice = formatErrorForState(once);
+
+      // Re-running the helper must not collapse to AgentRuntimeError or strip
+      // classification — both are real risks if the early-return branch is
+      // missing, because the success-path inner-step write can run through
+      // here a second time when the outer service touches state.error again.
+      expect(twice.type).toBe(AgentRuntimeErrorType.InvalidProviderAPIKey);
+      expect(twice.attribution).toBe('user');
+      expect(twice.category).toBe('auth');
+      expect(twice.message).toBe('bad key');
+    });
+
+    it('enriches a partial ChatMessageError that only carries type + message', () => {
+      const result = formatErrorForState({
+        message: 'balance exhausted',
+        type: AgentRuntimeErrorType.InsufficientQuota,
+      });
+
+      expect(result.attribution).toBe('user');
+      expect(result.category).toBe('quota');
+      expect(result.httpStatus).toBe(429);
+    });
+
+    it('leaves classification fields unset for codes outside the spec table', () => {
+      const result = formatErrorForState(new Error('infra blew up'));
+
+      expect(result.type).toBe(ChatErrorType.InternalServerError);
+      expect(result.attribution).toBeUndefined();
+      expect(result.category).toBeUndefined();
+      expect(result.severity).toBeUndefined();
+      expect(result.httpStatus).toBeUndefined();
+      expect(result.retryable).toBeUndefined();
+      expect(result.countAsFailure).toBeUndefined();
+      expect(result.numericId).toBeUndefined();
+    });
+  });
+});
@@ -0,0 +1,91 @@
+import { getErrorCodeSpec } from '@lobechat/model-runtime';
+import { AgentRuntimeErrorType, ChatErrorType, type ChatMessageError } from '@lobechat/types';
+
+/**
+ * Merge classification metadata from `ERROR_CODE_SPECS` onto a normalized
+ * `ChatMessageError`. Codes that aren't in the spec table (fallbacks like
+ * `InternalServerError`, or numeric ChatErrorType values) pass through
+ * unchanged — every classification field stays optional.
+ *
+ * Keeping enrichment in one place means downstream consumers (`agent_operations.error`
+ * JSONB, S3 trace snapshots, agent-gateway WS push, dashboards) all get the
+ * same shape without re-running pattern matching themselves.
+ */
+const enrichWithSpec = (formatted: ChatMessageError): ChatMessageError => {
+  // `getErrorCodeSpec` is keyed by `ILobeAgentRuntimeErrorType` strings; coerce
+  // because `ChatMessageError['type']` widens to include numeric `ChatErrorType`
+  // values, which simply miss the lookup and pass through unenriched.
+  const spec = getErrorCodeSpec(String(formatted.type));
+  if (!spec) return formatted;
+
+  return {
+    ...formatted,
+    attribution: spec.attribution,
+    category: spec.category,
+    countAsFailure: spec.countAsFailure,
+    httpStatus: spec.httpStatus,
+    numericId: spec.numericId,
+    retryable: spec.retryable,
+    severity: spec.severity,
+  };
+};
+
+/**
+ * Normalize an arbitrary thrown value into `ChatMessageError`, then attach
+ * classification metadata from `ERROR_CODE_SPECS` so the resulting object
+ * is self-describing for everything downstream of the runtime catch block.
+ *
+ * Handles four input shapes:
+ *
+ * 1. `ChatCompletionErrorPayload` — what `model-runtime` throws on LLM
+ *    failures: `{ errorType, error, provider?, message? }`.
+ * 2. Already-normalized `ChatMessageError` (`{ type, message?, body? }`)
+ *    — enriched without re-wrapping so the helper is safe to call twice (the inner
+ *    `runtime.step()` non-throwing error path and the outer `executeStep`
+ *    catch can both run through here without double-wrapping).
+ * 3. Standard `Error` instance — wrapped as `InternalServerError`.
+ * 4. Anything else — stringified as `AgentRuntimeError`.
+ */
+export const formatErrorForState = (error: unknown): ChatMessageError => {
+  if (error && typeof error === 'object' && 'errorType' in error) {
+    const payload = error as {
+      error?: unknown;
+      errorType: ChatMessageError['type'];
+      message?: string;
+    };
+    return enrichWithSpec({
+      body: payload.error || error,
+      message: payload.message || String(payload.errorType),
+      type: payload.errorType,
+    });
+  }
+
+  // Path 2: already-normalized ChatMessageError shape — has `type` but not
+  // `errorType`, and isn't a thrown Error instance. Common when the inner
+  // runtime.step() catch has already stuffed a partial ChatMessageError into
+  // `newState.error` and the outer service is just topping it up.
+  if (
+    error &&
+    typeof error === 'object' &&
+    !(error instanceof Error) &&
+    'type' in error &&
+    (typeof (error as { type: unknown }).type === 'string' ||
+      typeof (error as { type: unknown }).type === 'number')
+  ) {
+    return enrichWithSpec(error as ChatMessageError);
+  }
+
+  if (error instanceof Error) {
+    return enrichWithSpec({
+      body: { name: error.name },
+      message: error.message,
+      type: ChatErrorType.InternalServerError,
+    });
+  }
+
+  return enrichWithSpec({
+    body: error,
+    message: String(error),
+    type: AgentRuntimeErrorType.AgentRuntimeError,
+  });
+};
@@ -19,13 +19,7 @@ import {
  invokeAgentSpanName,
  tracer as agentRuntimeTracer,
 } from '@lobechat/observability-otel/modules/agent-runtime';
-import {
-  AgentRuntimeErrorType,
-  ChatErrorType,
-  type ChatMessageError,
-  type ExecSubAgentTaskParams,
-  type UIChatMessage,
-} from '@lobechat/types';
+import { type ExecSubAgentTaskParams, type UIChatMessage } from '@lobechat/types';
 import debug from 'debug';
 import urlJoin from 'url-join';

@@ -34,6 +28,7 @@ import { type LobeChatDatabase } from '@/database/type';
 import { appEnv } from '@/envs/app';
 import { type AgentRuntimeCoordinatorOptions } from '@/server/modules/AgentRuntime';
 import { AgentRuntimeCoordinator, createStreamEventManager } from '@/server/modules/AgentRuntime';
+import { formatErrorForState } from '@/server/modules/AgentRuntime/formatErrorForState';
 import {
  createRuntimeExecutors,
  type RuntimeExecutorContext,
@@ -73,43 +68,6 @@ if (process.env.VERCEL) {

 const log = debug('lobe-server:agent-runtime-service');

-/**
- * Formats an error into ChatMessageError structure
- * Handles various error formats from LLM execution and other sources
- */
-function formatErrorForState(error: unknown): ChatMessageError {
-  // Handle ChatCompletionErrorPayload format from LLM errors
-  // e.g., { errorType: 'InvalidProviderAPIKey', error: { ... }, provider: 'openai' }
-  if (error && typeof error === 'object' && 'errorType' in error) {
-    const payload = error as {
-      error?: unknown;
-      errorType: ChatMessageError['type'];
-      message?: string;
-    };
-    return {
-      body: payload.error || error,
-      message: payload.message || String(payload.errorType),
-      type: payload.errorType,
-    };
-  }
-
-  // Handle standard Error objects
-  if (error instanceof Error) {
-    return {
-      body: { name: error.name },
-      message: error.message,
-      type: ChatErrorType.InternalServerError,
-    };
-  }
-
-  // Fallback for unknown error types
-  return {
-    body: error,
-    message: String(error),
-    type: AgentRuntimeErrorType.AgentRuntimeError,
-  };
-}
-
 const toAgentSignalSnapshotEvents = (
  emission: Awaited<ReturnType<typeof emitAgentSignalSourceEvent>> | undefined,
 ) => {
@@ -775,6 +733,15 @@ export class AgentRuntimeService {
        const startAt = Date.now();
        const stepResult = await runtime.step(currentState, currentContext);

+        // Inner runtime.step() catches model-runtime exceptions and stuffs the
+        // raw error into newState.error without re-throwing — so the outer
+        // catch at the bottom of this method never sees them. Normalize +
+        // classify here so the raw error doesn't reach Redis state, the
+        // success-path trace finalize, or `persistCompletion`'s JSONB write.
+        if (stepResult.newState.error) {
+          stepResult.newState.error = formatErrorForState(stepResult.newState.error);
+        }
+
        // Check if the operation was interrupted while the step was executing
        // (e.g., user clicked abort during a long LLM call)
        const latestState = await this.coordinator.loadAgentState(operationId);
@@ -999,19 +966,23 @@ export class AgentRuntimeService {
          // Finalize tracing snapshot. The error catch below uses the same
          // recorder so propagated failures still write the canonical S3
          // snapshot instead of orphaning the partial snapshot.
+          const newStateError = stepResult.newState.error;
          await this.traceRecorder.finalize(operationId, {
            appendEventsToLastStep: completionSignalEvents,
            completionReason: reason,
-            error: stepResult.newState.error
+            error: newStateError
              ? {
+                  attribution: newStateError.attribution,
+                  category: newStateError.category,
+                  countAsFailure: newStateError.countAsFailure,
+                  httpStatus: newStateError.httpStatus,
                  message:
-                    this.completionLifecycle.extractErrorMessage(stepResult.newState.error) ??
-                    JSON.stringify(stepResult.newState.error),
-                  type: String(
-                    stepResult.newState.error.type ??
-                      stepResult.newState.error.errorType ??
-                      'unknown',
-                  ),
+                    this.completionLifecycle.extractErrorMessage(newStateError) ??
+                    JSON.stringify(newStateError),
+                  numericId: newStateError.numericId,
+                  retryable: newStateError.retryable,
+                  severity: newStateError.severity,
+                  type: String(newStateError.type ?? newStateError.errorType ?? 'unknown'),
                }
              : undefined,
            state: stepResult.newState,
@@ -1111,7 +1082,14 @@ export class AgentRuntimeService {
      await this.traceRecorder.finalize(operationId, {
        completionReason: 'error',
        error: {
+          attribution: formattedError.attribution,
+          category: formattedError.category,
+          countAsFailure: formattedError.countAsFailure,
+          httpStatus: formattedError.httpStatus,
          message: formattedError.message ?? String(formattedError.type),
+          numericId: formattedError.numericId,
+          retryable: formattedError.retryable,
+          severity: formattedError.severity,
          type: String(formattedError.type),
        },
        failedStep: { startedAt: stepStartAt, stepIndex },
@@ -1,4 +1,5 @@
 import type { ISnapshotStore, StepSnapshot } from '@lobechat/agent-tracing';
+import type { ChatMessageErrorAttribution, ChatMessageErrorSeverity } from '@lobechat/types';
 import debug from 'debug';

 import type { StepCompletionReason, StepPresentationData } from './types';
@@ -46,7 +47,23 @@ export interface FinalizeParams {
   */
  appendEventsToLastStep?: SignalEvent[];
  completionReason: StepCompletionReason;
-  error?: { message: string; type: string };
+  /**
+   * Top-level error on the persisted snapshot. The classification fields
+   * (`attribution`, `category`, `severity`, …) mirror `ChatMessageError` and
+   * are sourced from `ERROR_CODE_SPECS` at the runtime catch site; unknown
+   * codes simply omit them.
+   */
+  error?: {
+    attribution?: ChatMessageErrorAttribution;
+    category?: string;
+    countAsFailure?: boolean;
+    httpStatus?: number;
+    message: string;
+    numericId?: number;
+    retryable?: boolean;
+    severity?: ChatMessageErrorSeverity;
+    type: string;
+  };
  /**
   * Synthetic step record for the error path. The real failing step never
   * reached `appendStep` because the executor threw before the partial push,