🐛 fix(model-runtime): classify "Agent state not found" as StateStoreReadError (#15778)

When `coordinator.loadAgentState(operationId)` returns null, a raw
`Error("Agent state not found for operation …")` is thrown, which (after the
refine fix) would otherwise surface as a bare 500. It is a state-store READ
failure, so route it to StateStoreReadError alongside the caller-gone abort.

Because losing an operation's state is a genuine system fault (not benign
client abandonment), promote StateStoreReadError to countAsFailure: true /
severity: error. `ERR caller gone` shares the same error code, so it now counts
as a failure too — an accepted trade-off, since both are system-side read
failures worth tracking.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Arvin Xu
2026-06-13 21:11:33 +08:00
committed by GitHub
parent 24e34c7545
commit f51dd06a36
6 changed files with 29 additions and 15 deletions
+1 -1
View File
@@ -70,7 +70,7 @@ export default {
StateStorePersistError:
'A temporary issue with the conversation state store interrupted this operation. Please try again; if it persists, contact support.',
StateStoreReadError:
'This operation was ended because the connection closed before it finished. This is usually harmless — reopen the conversation to continue.',
'This operation could not be resumed because its session state was unavailable. Please reopen the conversation to continue; if it persists, contact support.',
StreamChunkError:
'Error parsing the message chunk of the streaming request. Please check if the current API interface complies with the standard specifications, or contact your API provider for assistance.',
UpstreamGatewayError:
@@ -102,12 +102,20 @@ describe('matchErrorPattern', () => {
).toBe(AgentRuntimeErrorType.StateStorePersistError);
});
it('classifies a caller-gone blocking-read abort as StateStoreReadError (benign, not a persist failure)', () => {
it('classifies a caller-gone blocking-read abort as StateStoreReadError', () => {
expect(matchErrorPattern({ message: 'ERR caller gone' })?.code).toBe(
AgentRuntimeErrorType.StateStoreReadError,
);
});
it('classifies a missing-agent-state read as StateStoreReadError', () => {
expect(
matchErrorPattern({
message: 'Agent state not found for operation op_1781276404066_agt_x_tpc_y_z',
})?.code,
).toBe(AgentRuntimeErrorType.StateStoreReadError);
});
it('classifies harness JS runtime crashes as AgentRuntimeError', () => {
for (const message of [
'e.trim is not a function',
@@ -515,15 +515,21 @@ export const ERROR_PATTERNS: ErrorPattern[] = [
},
// ─────────────────────────────────────────────────────────────────────────
// StateStoreReadError — a blocking state-store READ (XREAD / BLPOP) aborted
// because the caller disconnected. Benign client abandonment; must precede
// StateStorePersistError so the write-side bucket doesn't claim it.
// StateStoreReadError — a state-store READ failed: either a blocking read
// (XREAD / BLPOP) aborted because the caller disconnected, or the operation's
// agent state could not be loaded. Must precede StateStorePersistError so the
// write-side bucket doesn't claim it.
// ─────────────────────────────────────────────────────────────────────────
{
code: AgentRuntimeErrorType.StateStoreReadError,
match: sub('ERR caller gone'),
note: 'Upstash aborts the in-flight blocking read (XREAD/BLPOP) when the originating request is already gone.',
},
{
code: AgentRuntimeErrorType.StateStoreReadError,
match: sub('Agent state not found for operation'),
note: 'coordinator.loadAgentState() returned null — the operation state was evicted/cleaned up before this read.',
},
// ─────────────────────────────────────────────────────────────────────────
// StateStorePersistError — Redis / Upstash agent-state store (NOT the LLM
@@ -39,7 +39,7 @@ describe('refineErrorCode', () => {
expect(
refineErrorCode({
errorType: String(ChatErrorType.InternalServerError),
message: 'Agent state not found for operation op_xxx',
message: 'some opaque internal failure with no registered pattern',
}),
).toBeUndefined();
});
+3 -3
View File
@@ -422,13 +422,13 @@ export const ERROR_CODE_SPECS: SpecMap = {
code: AgentRuntimeErrorType.StateStoreReadError,
numericId: 7007,
category: 'stream',
severity: 'warning',
severity: 'error',
attribution: 'system',
httpStatus: 500,
retryable: false,
countAsFailure: false,
countAsFailure: true,
description:
'State-store (Redis / Upstash) blocking read (XREAD / BLPOP) aborted because the caller disconnected ("ERR caller gone") — benign client abandonment.',
'State-store (Redis / Upstash) read failed: a blocking read (XREAD / BLPOP) aborted because the caller disconnected ("ERR caller gone"), or the operation\'s agent state could not be loaded ("Agent state not found for operation …"). System-side — counts as a failure.',
},
// ─── 8xxx Provider (catch-all) ────────────────────────────────────────
+6 -6
View File
@@ -138,12 +138,12 @@ export const AgentRuntimeErrorType = {
*/
StateStorePersistError: 'StateStorePersistError',
/**
* A blocking state-store read (XREAD / BLPOP, e.g. consuming the agent event
* stream or waiting on a tool result) was aborted because the originating
* caller disconnected — Upstash replies "ERR caller gone". Benign client
* abandonment tied to the request lifecycle, not a harness fault; kept
* distinct from the write-side StateStorePersistError so it is not counted
* as a failure.
* A state-store (Redis / Upstash) READ failed: either a blocking read
* (XREAD / BLPOP, consuming the agent event stream or waiting on a tool
* result) was aborted because the caller disconnected ("ERR caller gone"), or
* the operation's agent state could not be loaded ("Agent state not found for
* operation …"). System-side read failure, kept distinct from the write-side
* StateStorePersistError; counts as a failure.
*/
StateStoreReadError: 'StateStoreReadError',
/**