diff --git a/apps/server/src/modules/AgentRuntime/formatErrorForState.test.ts b/apps/server/src/modules/AgentRuntime/formatErrorForState.test.ts index e6fa8373d2..0c8b435942 100644 --- a/apps/server/src/modules/AgentRuntime/formatErrorForState.test.ts +++ b/apps/server/src/modules/AgentRuntime/formatErrorForState.test.ts @@ -132,6 +132,14 @@ describe('formatErrorForState', () => { expect(result.countAsFailure).toBeUndefined(); expect(result.numericId).toBeUndefined(); }); + + it('classifies a raw Drizzle "Failed query" Error via its message instead of a bare 500', () => { + const result = formatErrorForState(new Error('Failed query: rollback\nparams: ')); + + expect(result.type).toBe(AgentRuntimeErrorType.DatabasePersistError); + expect(result.numericId).toBe(7004); + expect(result.attribution).toBe('harness'); + }); }); describe('ProviderBizError refinement', () => { diff --git a/packages/locales/src/default/modelRuntime.ts b/packages/locales/src/default/modelRuntime.ts index f4df70cc7a..40a8fffd08 100644 --- a/packages/locales/src/default/modelRuntime.ts +++ b/packages/locales/src/default/modelRuntime.ts @@ -69,6 +69,8 @@ export default { "Sorry, the token usage or request count has reached the rate limit for this key. Please try again later or increase the key's quota.", StateStorePersistError: 'A temporary issue with the conversation state store interrupted this operation. Please try again; if it persists, contact support.', + StateStoreReadError: + 'This operation was ended because the connection closed before it finished. This is usually harmless — reopen the conversation to continue.', StreamChunkError: 'Error parsing the message chunk of the streaming request. Please check if the current API interface complies with the standard specifications, or contact your API provider for assistance.', UpstreamGatewayError: diff --git a/packages/model-runtime/src/errors/match.test.ts b/packages/model-runtime/src/errors/match.test.ts index 9ac9a8b50e..91f37e1cba 100644 --- a/packages/model-runtime/src/errors/match.test.ts +++ b/packages/model-runtime/src/errors/match.test.ts @@ -94,6 +94,20 @@ describe('matchErrorPattern', () => { ).toBe(AgentRuntimeErrorType.StateStorePersistError); }); + it('classifies the Upstash readonly-upgrade write rejection as StateStorePersistError', () => { + expect( + matchErrorPattern({ + message: 'READONLY Writes are temporarily rejected due to server upgrade', + })?.code, + ).toBe(AgentRuntimeErrorType.StateStorePersistError); + }); + + it('classifies a caller-gone blocking-read abort as StateStoreReadError (benign, not a persist failure)', () => { + expect(matchErrorPattern({ message: 'ERR caller gone' })?.code).toBe( + AgentRuntimeErrorType.StateStoreReadError, + ); + }); + it('classifies harness JS runtime crashes as AgentRuntimeError', () => { for (const message of [ 'e.trim is not a function', diff --git a/packages/model-runtime/src/errors/patterns.ts b/packages/model-runtime/src/errors/patterns.ts index 0b92199bd1..b45c9d9655 100644 --- a/packages/model-runtime/src/errors/patterns.ts +++ b/packages/model-runtime/src/errors/patterns.ts @@ -514,9 +514,21 @@ export const ERROR_PATTERNS: ErrorPattern[] = [ match: sub('Connection error.'), }, + // ───────────────────────────────────────────────────────────────────────── + // StateStoreReadError — a blocking state-store READ (XREAD / BLPOP) aborted + // because the caller disconnected. Benign client abandonment; must precede + // StateStorePersistError so the write-side bucket doesn't claim it. + // ───────────────────────────────────────────────────────────────────────── + { + code: AgentRuntimeErrorType.StateStoreReadError, + match: sub('ERR caller gone'), + note: 'Upstash aborts the in-flight blocking read (XREAD/BLPOP) when the originating request is already gone.', + }, + // ───────────────────────────────────────────────────────────────────────── // StateStorePersistError — Redis / Upstash agent-state store (NOT the LLM - // provider). ioredis aborts, request-size cap, suspended DB. + // provider). ioredis aborts, request-size cap, suspended DB, readonly upgrade + // window. Write / connection-level drops. // ───────────────────────────────────────────────────────────────────────── { code: AgentRuntimeErrorType.StateStorePersistError, @@ -531,6 +543,11 @@ export const ERROR_PATTERNS: ErrorPattern[] = [ code: AgentRuntimeErrorType.StateStorePersistError, match: sub('database has been suspended'), }, + { + code: AgentRuntimeErrorType.StateStorePersistError, + match: sub('READONLY Writes are temporarily rejected'), + note: 'Upstash rejects writes against the read-only replica during a server upgrade.', + }, // ───────────────────────────────────────────────────────────────────────── // NoAvailableChannel — router / proxy has no upstream diff --git a/packages/model-runtime/src/errors/refine.test.ts b/packages/model-runtime/src/errors/refine.test.ts index ec02b6e03c..ec63e2b6ea 100644 --- a/packages/model-runtime/src/errors/refine.test.ts +++ b/packages/model-runtime/src/errors/refine.test.ts @@ -1,4 +1,4 @@ -import { AgentRuntimeErrorType } from '@lobechat/types'; +import { AgentRuntimeErrorType, ChatErrorType } from '@lobechat/types'; import { describe, expect, it } from 'vitest'; import { refineErrorCode } from './refine'; @@ -13,6 +13,57 @@ describe('refineErrorCode', () => { ).toBeUndefined(); }); + describe('un-typed throw wrappers', () => { + // A raw `Error` (e.g. a Drizzle "Failed query: …" throw) is wrapped by + // formatErrorForState as InternalServerError (HTTP 500). It must still reach + // the message patterns, otherwise it persists as a bare, un-classified 500. + it('reclassifies a 500-wrapped Drizzle throw into DatabasePersistError', () => { + expect( + refineErrorCode({ + errorType: String(ChatErrorType.InternalServerError), + message: 'Failed query: rollback\nparams: ', + }), + ).toBe(AgentRuntimeErrorType.DatabasePersistError); + }); + + it('reclassifies an AgentRuntimeError-wrapped throw via its message', () => { + expect( + refineErrorCode({ + errorType: AgentRuntimeErrorType.AgentRuntimeError, + message: 'Failed query: select "id" from "messages"', + }), + ).toBe(AgentRuntimeErrorType.DatabasePersistError); + }); + + it('leaves a 500 wrapper unrefined when nothing matches', () => { + expect( + refineErrorCode({ + errorType: String(ChatErrorType.InternalServerError), + message: 'Agent state not found for operation op_xxx', + }), + ).toBeUndefined(); + }); + + // The HTTP-status fallback is provider-only: a leading "429"/"500" in a + // harness/DB/Redis throw is not a real upstream status and must NOT recast + // the error with provider retry/failure semantics. + it('does not apply the HTTP-status fallback to un-typed wrappers', () => { + expect( + refineErrorCode({ + errorType: String(ChatErrorType.InternalServerError), + message: '429 some harness throw with no registered pattern', + }), + ).toBeUndefined(); + expect( + refineErrorCode({ + errorType: AgentRuntimeErrorType.AgentRuntimeError, + httpStatus: 500, + message: 'opaque internal failure', + }), + ).toBeUndefined(); + }); + }); + describe('message-pattern pass', () => { it('reclassifies a rate-limit message into RateLimitExceeded', () => { expect( diff --git a/packages/model-runtime/src/errors/refine.ts b/packages/model-runtime/src/errors/refine.ts index cd4a60f71f..847c7873ac 100644 --- a/packages/model-runtime/src/errors/refine.ts +++ b/packages/model-runtime/src/errors/refine.ts @@ -1,14 +1,35 @@ -import { AgentRuntimeErrorType, type ILobeAgentRuntimeErrorType } from '@lobechat/types'; +import { + AgentRuntimeErrorType, + ChatErrorType, + type ILobeAgentRuntimeErrorType, +} from '@lobechat/types'; import { matchErrorPattern } from './match'; /** - * Error codes that are generic enough to be worth re-deriving from the upstream - * message / HTTP status. Specific codes assigned by a provider adapter are left - * untouched — we only refine the `ProviderBizError` catch-all, which absorbs - * any non-OK upstream response that the adapter couldn't name. + * Codes whose message is worth running through `matchErrorPattern`. + * + * Besides the `ProviderBizError` upstream catch-all, this covers the two + * fallback wrappers `formatErrorForState` produces for un-typed throws: a raw + * `Error` is wrapped as `InternalServerError` (HTTP 500) and any other value as + * `AgentRuntimeError`. They must be pattern-refinable so persistence-layer + * throws (`Failed query: …`) and state-store drops reach the registry — without + * them those land as a bare, un-classified 500. */ -const REFINABLE_CODES = new Set([AgentRuntimeErrorType.ProviderBizError]); +const PATTERN_REFINABLE_CODES = new Set([ + AgentRuntimeErrorType.AgentRuntimeError, + AgentRuntimeErrorType.ProviderBizError, + String(ChatErrorType.InternalServerError), +]); + +/** + * Codes eligible for the coarse HTTP-status fallback — provider catch-alls + * only. A leading "429"/"500" in an upstream body is a real status signal, but + * the same digits in a harness/DB/Redis throw (e.g. `Error('500 …')`) are not: + * those must keep their original `InternalServerError` / `AgentRuntimeError` + * code rather than being recast with provider retry/failure semantics. + */ +const STATUS_REFINABLE_CODES = new Set([AgentRuntimeErrorType.ProviderBizError]); /** * Last-resort mapping from a bare HTTP status to a code, used only when the @@ -50,27 +71,31 @@ export interface RefineErrorInput { } /** - * Reclassify a generic provider catch-all (`ProviderBizError`) into a more - * specific code using the upstream message and HTTP status. Returns the refined - * code, or `undefined` when no better classification is found (caller keeps the - * original errorType). + * Reclassify a generic catch-all (`ProviderBizError`, or the + * `InternalServerError` / `AgentRuntimeError` fallback wrappers) into a more + * specific code using the message and HTTP status. Returns the refined code, or + * `undefined` when no better classification is found (caller keeps the original + * errorType). * * Priority: * 1. `matchErrorPattern` over the message — most specific, covers the rich - * cases plus the migrated `Upstream*` patterns. - * 2. HTTP-status fallback for messages that matched nothing. + * cases plus the migrated `Upstream*` patterns. Open to all wrappers. + * 2. HTTP-status fallback for messages that matched nothing — provider + * catch-alls only (see `STATUS_REFINABLE_CODES`). */ export const refineErrorCode = ( input: RefineErrorInput, ): ILobeAgentRuntimeErrorType | undefined => { const { errorType, httpStatus, message, provider } = input; - if (!errorType || !REFINABLE_CODES.has(errorType)) return undefined; + if (!errorType || !PATTERN_REFINABLE_CODES.has(errorType)) return undefined; const matched = matchErrorPattern({ errorType, message, provider }); if (matched && matched.code !== errorType) return matched.code; - const byStatus = codeFromHttpStatus(httpStatus ?? leadingStatusFromMessage(message)); - if (byStatus && byStatus !== errorType) return byStatus; + if (STATUS_REFINABLE_CODES.has(errorType)) { + const byStatus = codeFromHttpStatus(httpStatus ?? leadingStatusFromMessage(message)); + if (byStatus && byStatus !== errorType) return byStatus; + } return undefined; }; diff --git a/packages/model-runtime/src/errors/specs.ts b/packages/model-runtime/src/errors/specs.ts index faeb83190c..4d5ca8c619 100644 --- a/packages/model-runtime/src/errors/specs.ts +++ b/packages/model-runtime/src/errors/specs.ts @@ -418,6 +418,18 @@ export const ERROR_CODE_SPECS: SpecMap = { description: 'Context-engine pipeline processor crashed ("Processor [] execution failed").', }, + [AgentRuntimeErrorType.StateStoreReadError]: { + code: AgentRuntimeErrorType.StateStoreReadError, + numericId: 7007, + category: 'stream', + severity: 'warning', + attribution: 'system', + httpStatus: 500, + retryable: false, + countAsFailure: false, + description: + 'State-store (Redis / Upstash) blocking read (XREAD / BLPOP) aborted because the caller disconnected ("ERR caller gone") — benign client abandonment.', + }, // ─── 8xxx Provider (catch-all) ──────────────────────────────────────── [AgentRuntimeErrorType.AgentRuntimeError]: { diff --git a/packages/types/src/agentRuntime.ts b/packages/types/src/agentRuntime.ts index 450d284ec0..10b1198563 100644 --- a/packages/types/src/agentRuntime.ts +++ b/packages/types/src/agentRuntime.ts @@ -137,6 +137,15 @@ export const AgentRuntimeErrorType = { * DB, …). Harness-side infra — the agent state layer, not the LLM provider. */ StateStorePersistError: 'StateStorePersistError', + /** + * A blocking state-store read (XREAD / BLPOP, e.g. consuming the agent event + * stream or waiting on a tool result) was aborted because the originating + * caller disconnected — Upstash replies "ERR caller gone". Benign client + * abandonment tied to the request lifecycle, not a harness fault; kept + * distinct from the write-side StateStorePersistError so it is not counted + * as a failure. + */ + StateStoreReadError: 'StateStoreReadError', /** * A context-engine pipeline processor threw while building the prompt * context ("Processor [] execution failed"). Harness-side bug in the