💄 style: extend artifact code background

🐛 fix: keep artifact code panel scrolled
🐛 fix(agent): persist & deliver image attachments for device/sandbox hetero runs (#15685)
2026-06-14 03:30:19 +00:00 · 2026-06-12 11:29:19 +08:00 · 2026-06-12 11:12:34 +08:00 · 2026-06-12 00:02:51 +08:00 · 2026-06-11 23:52:25 +08:00 · 2026-06-11 23:34:38 +08:00
3559 changed files with 150271 additions and 26283 deletions
@@ -51,7 +51,7 @@ export interface GlobalServerConfig {

 ### 3. Assemble Server Config (if new domain)

-In `src/server/globalConfig/index.ts`:
+In `apps/server/src/globalConfig/index.ts`:

 ```typescript
 import { <domain>Env } from '@/envs/<domain>';
@@ -97,7 +97,7 @@ AI_IMAGE_DEFAULT_IMAGE_NUM: z.coerce.number().min(1).max(20).optional(),
 // packages/types/src/serverConfig.ts
 image?: PartialDeep<UserImageConfig>;

-// src/server/globalConfig/index.ts
+// apps/server/src/globalConfig/index.ts
 image: cleanObject({ defaultImageNum: imageEnv.AI_IMAGE_DEFAULT_IMAGE_NUM }),

 // src/store/user/slices/common/action.ts
@@ -50,14 +50,14 @@ execAgent({ hooks })

 ## Key Files

-| File                                                       | Role                                                   |
-| ---------------------------------------------------------- | ------------------------------------------------------ |
-| `packages/agent-runtime/src/types/hooks.ts`                | Type definitions (AgentHookType, all event interfaces) |
-| `src/server/services/agentRuntime/hooks/types.ts`          | Server-side types (AgentHook, re-exports)              |
-| `src/server/services/agentRuntime/hooks/HookDispatcher.ts` | Registration, dispatch, dispatchBeforeToolCall         |
-| `src/server/modules/AgentRuntime/RuntimeExecutors.ts`      | Tool/Compact/HumanIntervention hook dispatch           |
-| `src/server/services/agentRuntime/AgentRuntimeService.ts`  | Step hooks + HumanIntervention resume/reject           |
-| `src/server/services/aiAgent/index.ts`                     | CallAgent hook dispatch                                |
+| File                                                            | Role                                                   |
+| --------------------------------------------------------------- | ------------------------------------------------------ |
+| `packages/agent-runtime/src/types/hooks.ts`                     | Type definitions (AgentHookType, all event interfaces) |
+| `apps/server/src/services/agentRuntime/hooks/types.ts`          | Server-side types (AgentHook, re-exports)              |
+| `apps/server/src/services/agentRuntime/hooks/HookDispatcher.ts` | Registration, dispatch, dispatchBeforeToolCall         |
+| `apps/server/src/modules/AgentRuntime/RuntimeExecutors.ts`      | Tool/Compact/HumanIntervention hook dispatch           |
+| `apps/server/src/services/agentRuntime/AgentRuntimeService.ts`  | Step hooks + HumanIntervention resume/reject           |
+| `apps/server/src/services/aiAgent/index.ts`                     | CallAgent hook dispatch                                |

 ## Registration Flow

@@ -26,9 +26,9 @@ Agent Signal has one consistent shape:

 Read:

- `src/server/services/agentSignal/index.ts`
- `src/server/workflows/agentSignal/index.ts`
- `src/server/workflows/agentSignal/run.ts`
+- `apps/server/src/services/agentSignal/index.ts`
+- `apps/server/src/workflows/agentSignal/index.ts`
+- `apps/server/src/workflows/agentSignal/run.ts`

 ## Core Model

@@ -48,11 +48,11 @@ Keep the boundaries strict:
 ## Implementation Workflow

 1. Decide whether the use case is synchronous or quiet background work.
-2. Define or reuse a source type in `src/server/services/agentSignal/sourceTypes.ts`.
-3. Define or reuse signal and action types in `src/server/services/agentSignal/policies/types.ts`.
+2. Define or reuse a source type in `apps/server/src/services/agentSignal/sourceTypes.ts`.
+3. Define or reuse signal and action types in `apps/server/src/services/agentSignal/policies/types.ts`.
 4. Implement handlers with `defineSourceHandler`, `defineSignalHandler`, or `defineActionHandler`.
 5. Bundle handlers with `defineAgentSignalHandlers(...)`.
-6. Register the policy in `src/server/services/agentSignal/policies/index.ts` and pass it into the runtime factory if needed.
+6. Register the policy in `apps/server/src/services/agentSignal/policies/index.ts` and pass it into the runtime factory if needed.
 7. Add or update ingress code that emits or enqueues the source event.
 8. Add observability and tests before considering the flow complete.

@@ -63,19 +63,19 @@ Keep the boundaries strict:
  `packages/agent-signal/src/base/builders.ts`
  `packages/agent-signal/src/base/types.ts`
 - Server-owned runtime and middleware:
-  `src/server/services/agentSignal/runtime/AgentSignalRuntime.ts`
-  `src/server/services/agentSignal/runtime/AgentSignalScheduler.ts`
-  `src/server/services/agentSignal/runtime/middleware.ts`
-  `src/server/services/agentSignal/runtime/context.ts`
+  `apps/server/src/services/agentSignal/runtime/AgentSignalRuntime.ts`
+  `apps/server/src/services/agentSignal/runtime/AgentSignalScheduler.ts`
+  `apps/server/src/services/agentSignal/runtime/middleware.ts`
+  `apps/server/src/services/agentSignal/runtime/context.ts`
 - Existing policy example:
-  `src/server/services/agentSignal/policies/analyzeIntent/index.ts`
-  `src/server/services/agentSignal/policies/analyzeIntent/feedbackSatisfaction.ts`
-  `src/server/services/agentSignal/policies/analyzeIntent/feedbackDomain.ts`
-  `src/server/services/agentSignal/policies/analyzeIntent/feedbackAction.ts`
-  `src/server/services/agentSignal/policies/analyzeIntent/actions/userMemory.ts`
+  `apps/server/src/services/agentSignal/policies/analyzeIntent/index.ts`
+  `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackSatisfaction.ts`
+  `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackDomain.ts`
+  `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackAction.ts`
+  `apps/server/src/services/agentSignal/policies/analyzeIntent/actions/userMemory.ts`
 - Observability:
-  `src/server/services/agentSignal/observability/projector.ts`
-  `src/server/services/agentSignal/observability/traceEvents.ts`
+  `apps/server/src/services/agentSignal/observability/projector.ts`
+  `apps/server/src/services/agentSignal/observability/traceEvents.ts`
  `packages/observability-otel/src/modules/agent-signal/index.ts`

 ## Implementation Rules
@@ -86,7 +86,7 @@ Keep the boundaries strict:
 - Use stable ids and idempotency keys when the same source can arrive more than once.
 - Preserve scope discipline. The runtime uses `scopeKey` to serialize related background work.
 - Prefer the dedicated shared package types and builders from `@lobechat/agent-signal` for normalized nodes and result contracts.
- Add focused tests near the touched runtime, policy, or store module. Existing tests under `src/server/services/agentSignal/**/__tests__` are the reference pattern.
+- Add focused tests near the touched runtime, policy, or store module. Existing tests under `apps/server/src/services/agentSignal/**/__tests__` are the reference pattern.

 ## References

@@ -32,9 +32,9 @@ source node

 Read:

- `src/server/services/agentSignal/index.ts`
- `src/server/services/agentSignal/sources/index.ts`
- `src/server/services/agentSignal/runtime/AgentSignalScheduler.ts`
+- `apps/server/src/services/agentSignal/index.ts`
+- `apps/server/src/services/agentSignal/sources/index.ts`
+- `apps/server/src/services/agentSignal/runtime/AgentSignalScheduler.ts`

 ## Package Boundaries

@@ -56,7 +56,7 @@ Read:
 - `packages/agent-signal/src/types/events.ts`
 - `packages/agent-signal/src/types/builtin.ts`

-### `src/server/services/agentSignal`
+### `apps/server/src/services/agentSignal`

 Treat this as the server-owned implementation layer.

@@ -89,11 +89,11 @@ Examples:

 Define source payloads in:

- `src/server/services/agentSignal/sourceTypes.ts`
+- `apps/server/src/services/agentSignal/sourceTypes.ts`

 Build normalized sources in:

- `src/server/services/agentSignal/sources/buildSource.ts`
+- `apps/server/src/services/agentSignal/sources/buildSource.ts`
 - `packages/agent-signal/src/base/builders.ts`

 ### Signal
@@ -109,7 +109,7 @@ Examples from `analyzeIntent`:

 Define server-owned signal types in:

- `src/server/services/agentSignal/policies/types.ts`
+- `apps/server/src/services/agentSignal/policies/types.ts`

 ### Action

@@ -157,9 +157,9 @@ When a user asks for "the procedure", document the flow above and point to the e

 Read:

- `src/server/services/agentSignal/sources/index.ts`
- `src/server/services/agentSignal/runtime/context.ts`
- `src/server/services/agentSignal/constants.ts`
+- `apps/server/src/services/agentSignal/sources/index.ts`
+- `apps/server/src/services/agentSignal/runtime/context.ts`
+- `apps/server/src/services/agentSignal/constants.ts`

 Use `enqueueAgentSignalSourceEvent(...)` when the work should stay quiet and out-of-band. That path:

@@ -172,8 +172,8 @@ This is the preferred path when the UI request should finish immediately and the

 Read:

- `src/server/workflows/agentSignal/index.ts`
- `src/server/workflows/agentSignal/run.ts`
+- `apps/server/src/workflows/agentSignal/index.ts`
+- `apps/server/src/workflows/agentSignal/run.ts`

 ## Existing Example: `analyzeIntent`

@@ -192,8 +192,8 @@ agent.user.message

 Read:

- `src/server/services/agentSignal/policies/analyzeIntent/index.ts`
- `src/server/services/agentSignal/policies/analyzeIntent/feedbackSatisfaction.ts`
- `src/server/services/agentSignal/policies/analyzeIntent/feedbackDomain.ts`
- `src/server/services/agentSignal/policies/analyzeIntent/feedbackAction.ts`
- `src/server/services/agentSignal/policies/analyzeIntent/actions/userMemory.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/index.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackSatisfaction.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackDomain.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackAction.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/actions/userMemory.ts`
@@ -2,7 +2,7 @@

 ## Fluent Registration API

-Use the middleware helpers in `src/server/services/agentSignal/runtime/middleware.ts`.
+Use the middleware helpers in `apps/server/src/services/agentSignal/runtime/middleware.ts`.

 They provide:

@@ -32,7 +32,7 @@ The context gives you:

 Read:

- `src/server/services/agentSignal/runtime/context.ts`
+- `apps/server/src/services/agentSignal/runtime/context.ts`

 ## Return Contracts

@@ -48,7 +48,7 @@ Return one of these shapes:
 Read:

 - `packages/agent-signal/src/base/types.ts`
- `src/server/services/agentSignal/runtime/AgentSignalScheduler.ts`
+- `apps/server/src/services/agentSignal/runtime/AgentSignalScheduler.ts`

 ## Policy Composition Pattern

@@ -72,8 +72,8 @@ That bundle is later passed into the runtime via:

 Read:

- `src/server/services/agentSignal/policies/index.ts`
- `src/server/services/agentSignal/policies/analyzeIntent/index.ts`
+- `apps/server/src/services/agentSignal/policies/index.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/index.ts`

 ## Source Handler Pattern

@@ -81,7 +81,7 @@ Use a source handler when you are interpreting a producer event into semantic si

 Reference:

- `src/server/services/agentSignal/policies/analyzeIntent/feedbackSatisfaction.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackSatisfaction.ts`

 Pattern:

@@ -114,8 +114,8 @@ Use a signal handler when one semantic state should branch into more semantic st

 References:

- `src/server/services/agentSignal/policies/analyzeIntent/feedbackDomain.ts`
- `src/server/services/agentSignal/policies/analyzeIntent/feedbackAction.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackDomain.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/feedbackAction.ts`

 Pattern:

@@ -148,7 +148,7 @@ Use an action handler when the runtime should do actual work.

 Reference:

- `src/server/services/agentSignal/policies/analyzeIntent/actions/userMemory.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/actions/userMemory.ts`

 Pattern:

@@ -186,9 +186,9 @@ Keep these rules:
 Use this split:

 - external event payloads:
-  `src/server/services/agentSignal/sourceTypes.ts`
+  `apps/server/src/services/agentSignal/sourceTypes.ts`
 - policy-owned signal and action payloads:
-  `src/server/services/agentSignal/policies/types.ts`
+  `apps/server/src/services/agentSignal/policies/types.ts`
 - normalized shared node contracts:
  `packages/agent-signal/src/base/types.ts`

@@ -216,10 +216,10 @@ Prefer focused tests near the touched code.

 Useful references:

- `src/server/services/agentSignal/runtime/__tests__/AgentSignalRuntime.test.ts`
- `src/server/services/agentSignal/__tests__/index.integration.test.ts`
- `src/server/services/agentSignal/policies/analyzeIntent/__tests__/*`
- `src/server/services/agentSignal/policies/analyzeIntent/actions/__tests__/*`
+- `apps/server/src/services/agentSignal/runtime/__tests__/AgentSignalRuntime.test.ts`
+- `apps/server/src/services/agentSignal/__tests__/index.integration.test.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/__tests__/*`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/actions/__tests__/*`

 Test at the smallest level that proves the behavior:

@@ -24,9 +24,9 @@ After runtime execution, the service projects one compact observability model fr

 Read:

- `src/server/services/agentSignal/observability/projector.ts`
- `src/server/services/agentSignal/observability/traceEvents.ts`
- `src/server/services/agentSignal/observability/store.ts`
+- `apps/server/src/services/agentSignal/observability/projector.ts`
+- `apps/server/src/services/agentSignal/observability/traceEvents.ts`
+- `apps/server/src/services/agentSignal/observability/store.ts`

 Projection outputs:

@@ -58,7 +58,7 @@ Workflow-triggered runs do not naturally pass through the normal foreground runt

 Read:

- `src/server/workflows/agentSignal/run.ts`
+- `apps/server/src/workflows/agentSignal/run.ts`

 Use that path when:

@@ -77,8 +77,8 @@ Check:

 Read:

- `src/server/services/agentSignal/index.ts`
- `src/server/services/agentSignal/sources/index.ts`
+- `apps/server/src/services/agentSignal/index.ts`
+- `apps/server/src/services/agentSignal/sources/index.ts`

 ### The signal exists but no action runs

@@ -98,8 +98,8 @@ Check:

 Reference:

- `src/server/services/agentSignal/policies/actionIdempotency.ts`
- `src/server/services/agentSignal/policies/analyzeIntent/actions/userMemory.ts`
+- `apps/server/src/services/agentSignal/policies/actionIdempotency.ts`
+- `apps/server/src/services/agentSignal/policies/analyzeIntent/actions/userMemory.ts`

 ### Background runs are hard to discover

@@ -0,0 +1,212 @@
+---
+name: agent-testing
+description: >
+  Agentic end-to-end testing for LobeHub: backend verification via the CLI,
+  frontend verification via agent-browser (Electron), full-stack verification in
+  the browser, and bot-channel verification via osascript. Local-first today,
+  designed to extend to cloud automation. Triggers on 'cli test', 'test with cli',
+  'verify with cli', 'backend test with cli', 'local test', 'test in electron',
+  'test desktop', 'test bot', 'bot test', 'test in discord', 'test in telegram',
+  'test in slack', 'test in wechat', 'test in weixin', 'test in lark', 'test in feishu',
+  'test in qq', 'manual test', 'osascript', 'test report', or any local
+  end-to-end verification task.
+---
+
+# Agent Testing (Agentic End-to-End Verification)
+
+One skill for all agentic end-to-end testing — local-first today, designed to
+also run as full cloud automation. Every test session follows the same
+four-step contract:
+
+```
+Step 0: Env + Auth  →  Step 1: Pick surface  →  Step 2: Run  →  Step 3: Structured report
+```
+
+## Step 0 — Environment setup + auth check (mandatory)
+
+Step 0 is about getting the environment ready: **dependencies are healthy**
+and **auth is green**. A test run that dies halfway on a missing dependency or
+a login wall wastes the whole session — clear both gates BEFORE writing a
+single test step.
+
+### 0.1 Dependencies are installed — root AND standalone apps
+
+The root pnpm workspace does **NOT** cover every app: `pnpm-workspace.yaml`
+lists `packages/**`, `e2e`, `apps/server`, and only `apps/desktop/src/main` —
+**`apps/desktop` and `apps/cli` are standalone**, each keeping its own
+`node_modules` with its own links into `packages/`. A root install does not
+refresh them, so install in every app the test will touch:
+
+```bash
+pnpm install                        # root workspace
+(cd apps/desktop && pnpm install)   # Electron surface (subshell keeps cwd at repo root)
+(cd apps/cli && pnpm install)       # CLI surface (subshell keeps cwd at repo root)
+```
+
+Symptom of a stale standalone install: the build/launch fails to resolve a
+recently added workspace package — `Rolldown failed to resolve import
+"@lobechat/<pkg>"` (Electron) or `Cannot find module '@lobechat/<pkg>'` (CLI).
+
+### 0.2 Run scripts from the repo root
+
+All paths in this skill (`./.agents/skills/agent-testing/...`) are
+repo-root-relative, and background commands inherit the current working
+directory — a script launched while `cwd` is `apps/desktop` fails with
+`No such file or directory`. Verify `pwd` is the repo root before launching
+long-running scripts.
+
+### 0.3 Auth is green
+
+**Auth is the gate for all automated testing.**
+
+```bash
+./.agents/skills/agent-testing/scripts/setup-auth.sh status
+```
+
+| Surface  | Mechanism                                         | One-key path                   | Standard check                 |
+| -------- | ------------------------------------------------- | ------------------------------ | ------------------------------ |
+| CLI      | OIDC Device Code Flow (`apps/cli/.lobehub-dev`)   | `setup-auth.sh cli`            | `setup-auth.sh status`         |
+| Web      | better-auth cookie injection into `agent-browser` | `pbpaste \| setup-auth.sh web` | `setup-auth.sh web-verify`     |
+| Electron | App's own persistent login state                  | Log in once in the app         | `app-probe.sh auth`            |
+| Bot      | Native apps already logged in                     | —                              | per-platform screenshot        |
+
+Login-state checks are standardized — do NOT hand-roll `window.__LOBE_STORES`
+eval snippets; use `scripts/app-probe.sh auth` (returns `{ isSignedIn, userId }`,
+works for Electron CDP and web sessions via `AB_TARGET`).
+
+If `status` is not all green, fix auth first (the steps that need a human must be
+requested from the user explicitly). Full background and failure modes:
+[references/auth.md](./references/auth.md).
+
+## Step 1 — Pick the surface by change scope
+
+| Change scope                                            | Default surface                      | Why                                                               | Guide                              |
+| ------------------------------------------------------- | ------------------------------------ | ----------------------------------------------------------------- | ---------------------------------- |
+| **Backend** (TRPC router / service / model / migration) | **CLI**                              | Fastest loop, text-assertable output, zero UI flakiness           | [cli/index.md](./cli/index.md)     |
+| **Pure frontend** (components, store, styles, UX)       | **Electron** (agent-browser + CDP)   | Primary product shape; `__LOBE_STORES` state introspection        | [ui/electron.md](./ui/electron.md) |
+| **Full-stack** (new API + UI consuming it)              | **Web** (browser + local dev server) | One surface where network requests and UI are observable together | [ui/web.md](./ui/web.md)           |
+| **Bot channels** (Discord / WeChat / Lark / …)          | Native app via osascript / bridge    | Only way to exercise the real channel end-to-end                  | `bot/<platform>/index.md`          |
+
+Escalate, don't duplicate: verify a backend change with the CLI first; only add
+a UI pass when the change actually affects the UI.
+
+### Environment support (local macOS vs cloud Linux)
+
+The decisive constraint per surface is **how evidence (screenshots) is
+captured**: CDP-based capture (`agent-browser screenshot`) renders from the
+browser engine and needs no real display; OS-level capture (`screencapture`,
+osascript) is macOS-only.
+
+| Surface  | macOS (local) | Linux / cloud (headless)                                  | Screenshot mechanism                                   |
+| -------- | ------------- | --------------------------------------------------------- | ------------------------------------------------------ |
+| CLI      | ✅            | ✅                                                        | n/a — text output                                      |
+| Web      | ✅            | ✅ headless Chromium works natively                       | CDP — no display needed                                |
+| Electron | ✅            | ⚠️ runs, but needs a display server: wrap with `xvfb-run` | CDP works under Xvfb; `capture-app-window.sh` does NOT |
+| Bot      | ✅            | ❌ osascript + native apps are macOS-only                 | macOS `screencapture` only                             |
+
+When a test must stay cloud-portable, prefer CDP-based evidence over
+OS-level capture wherever both exist.
+
+### Bot platforms
+
+| Platform      | Guide                                            | Quick switcher        |
+| ------------- | ------------------------------------------------ | --------------------- |
+| Discord       | [bot/discord/index.md](./bot/discord/index.md)   | `Cmd+K`               |
+| Slack         | [bot/slack/index.md](./bot/slack/index.md)       | `Cmd+K`               |
+| Telegram      | [bot/telegram/index.md](./bot/telegram/index.md) | `Cmd+F`               |
+| WeChat / 微信 | [bot/wechat/index.md](./bot/wechat/index.md)     | `Cmd+F`               |
+| Lark / 飞书   | [bot/lark/index.md](./bot/lark/index.md)         | `Cmd+K`               |
+| QQ            | [bot/qq/index.md](./bot/qq/index.md)             | `Cmd+F`               |
+| iMessage      | [bot/imessage/index.md](./bot/imessage/index.md) | bridge (no osascript) |
+
+Each platform folder contains an `index.md` (activation, navigation,
+send-message, verification snippets); the osascript platforms also ship a
+`test-<platform>-bot.sh` script sharing the interface (iMessage: bridge scripts):
+
+```bash
+./.agents/skills/agent-testing/bot/<platform>/test-<platform>-bot.sh <channel_or_contact> <message> [wait_seconds] [screenshot_path]
+```
+
+New to osascript automation? Read
+[references/osascript.md](./references/osascript.md) first — it is a general
+macOS-automation asset (activate, type, paste, screenshot, accessibility reads,
+gotchas), not bot-specific.
+
+## Step 2 — Run
+
+Surface guides above carry the detailed workflows. Shared infrastructure:
+
+| Need                                 | Where                                                                |
+| ------------------------------------ | -------------------------------------------------------------------- |
+| Start / restart the local dev server | [references/dev-server.md](./references/dev-server.md)               |
+| `agent-browser` command reference    | [references/agent-browser.md](./references/agent-browser.md)         |
+| osascript patterns (general macOS)   | [references/osascript.md](./references/osascript.md)                 |
+| Agent gateway probing                | [references/agent-gateway.md](./references/agent-gateway.md)         |
+| Screen recording                     | [references/record-app-screen.md](./references/record-app-screen.md) |
+
+### Scripts
+
+All under `.agents/skills/agent-testing/scripts/`:
+
+| Script                    | Usage                                                                          |
+| ------------------------- | ------------------------------------------------------------------------------ |
+| `setup-auth.sh`           | One-stop auth setup & status check (`status` / `cli` / `web` / `web-verify`)   |
+| `app-probe.sh`            | LobeHub app probes: `auth` / `route` / `ops` / `goto <path>` / `errors`        |
+| `record-gif.sh`           | Frame-sequence → GIF for time-based behavior (streaming, timers, animations)   |
+| `report-init.sh`          | Scaffold a structured test report (Step 3)                                     |
+| `electron-dev.sh`         | Manage Electron dev env (start/stop/status/restart, CDP 9222)                  |
+| `capture-app-window.sh`   | Screenshot a specific app window (general; used by bot tests)                  |
+| `record-app-screen.sh`    | Record app screen (video + periodic screenshots)                               |
+| `record-electron-demo.sh` | Record Electron app demo with ffmpeg                                           |
+| `agent-gateway/`          | Gateway probe / dump / analyze tools                                           |
+
+`app-probe.sh` is the LobeHub-specific fast path into app state — auth check,
+current route, running operations, and `goto <path>` quick navigation
+(`/agent/<agentId>/<topicId>`, `/task/<taskId>`, `/settings`, …) so a test can
+jump straight to the state under test instead of clicking through the UI. See
+[ui/electron.md](./ui/electron.md#lobehub-probes--quick-navigation) for usage.
+
+## Step 3 — Structured report (mandatory deliverable)
+
+Every automated test session ends with a structured, evidence-backed report —
+not a chat-only summary. Scaffold it up front and fill it as you test:
+
+```bash
+DIR=$(./.agents/skills/agent-testing/scripts/report-init.sh my-feature "Verify my feature")
+# ... test, saving screenshots / CLI transcripts into $DIR/assets/ ...
+# fill $DIR/report.md (case table, embedded evidence, verdict) and $DIR/result.json
+```
+
+Reports live in `.records/reports/<timestamp>-<slug>/` (gitignored): `report.md`
+(human-readable, with embedded screenshots), `result.json` (machine-readable
+pass/fail + score), `assets/` (evidence). Format spec and evidence rules:
+[references/report.md](./references/report.md).
+
+Two hard rules worth front-loading:
+
+- **Report language = the user's conversation language.** Write the ENTIRE
+  `report.md` (headings included) in the language the user is conversing in —
+  no mixed English. `result.json` keys/status values stay English.
+- **Time-based behavior needs a GIF, not a screenshot.** If a case asserts
+  change over time (streaming output, a ticking timer, loading states,
+  animations), record it with `scripts/record-gif.sh` and embed the GIF —
+  a static screenshot cannot prove the behavior.
+
+## Directory map
+
+```
+agent-testing/
+├── SKILL.md            # this router
+├── cli/index.md        # backend verification via the LobeHub CLI
+├── ui/electron.md      # pure-frontend verification in the desktop app
+├── ui/web.md           # full-stack verification in the browser
+├── bot/<platform>/     # bot-channel verification (osascript / bridge)
+├── references/         # shared knowledge: auth, dev-server, agent-browser, osascript, agent-gateway, record-app-screen, report
+└── scripts/            # setup-auth, app-probe, report-init, record-gif, electron-dev, capture, recording, gateway
+```
+
+## Gotchas
+
+- agent-browser: see [references/agent-browser.md](./references/agent-browser.md#gotchas)
+- Electron: see [ui/electron.md](./ui/electron.md#electron-gotchas)
+- osascript: see [references/osascript.md](./references/osascript.md#gotchas)
@@ -2,7 +2,7 @@

 **App name:** `Discord` | **Process name:** `Discord`

-See [osascript-common.md](../osascript-common.md) for shared patterns.
+See [references/osascript.md](../../references/osascript.md) for shared patterns.

 ## Activate & Navigate

@@ -92,6 +92,6 @@ echo "Screenshot saved to /tmp/discord-test-result.png"
 ## Script

 ```bash
-./.agents/skills/local-testing/bot/discord/test-discord-bot.sh "bot-testing" "!ping"
-./.agents/skills/local-testing/bot/discord/test-discord-bot.sh "bot-testing" "/ask Tell me a joke" 30
+./.agents/skills/agent-testing/bot/discord/test-discord-bot.sh "bot-testing" "!ping"
+./.agents/skills/agent-testing/bot/discord/test-discord-bot.sh "bot-testing" "/ask Tell me a joke" 30
 ```
@@ -60,5 +60,5 @@ echo "[$APP] Waiting ${WAIT}s for bot response..."
 sleep "$WAIT"

 echo "[$APP] Capturing screenshot..."
-"$SCRIPT_DIR/../capture-app-window.sh" "$APP" "$SCREENSHOT"
+"$SCRIPT_DIR/../../scripts/capture-app-window.sh" "$APP" "$SCREENSHOT"
 echo "[$APP] Done! Screenshot saved to $SCREENSHOT"
@@ -21,7 +21,7 @@ So the test surface is three layers:
  curl -sS -m4 -o /dev/null -w '%{http_code}\n' \
    "http://127.0.0.1:1234/api/v1/server/info?password=<PW>" # expect 200
  ```
- **Electron dev running with CDP**: `./.agents/skills/local-testing/scripts/electron-dev.sh start`
+- **Electron dev running with CDP**: `./.agents/skills/agent-testing/scripts/electron-dev.sh start`
 - The **iMessage Desktop branch** checked out (the `imessageBridge` IPC group
  and `@lobechat/chat-adapter-imessage` must be compiled into the main bundle).
  Run `pnpm install --ignore-scripts` at the repo root **and** in `apps/desktop/`
@@ -31,7 +31,7 @@ So the test surface is three layers:
 ## Fast path: automated script

 ```bash
-./.agents/skills/local-testing/bot/imessage/test-imessage-bridge.sh '<bluebubbles_password>' [bb_url] [cdp_port]
+./.agents/skills/agent-testing/bot/imessage/test-imessage-bridge.sh '<bluebubbles_password>' [bb_url] [cdp_port]
 ```

 Asserts the whole flow and self-cleans (unique `applicationId` per run, removes
@@ -136,7 +136,7 @@ Verifies the leg the bridge uses to _reply_: `BlueBubblesApiClient.sendText`
 → `POST /api/v1/message/text`. Run the helper against your own number:

 ```bash
-./.agents/skills/local-testing/bot/imessage/send-imessage-test.sh '<bb_password>' '+<E164>' # e.g. +15551234567
+./.agents/skills/agent-testing/bot/imessage/send-imessage-test.sh '<bb_password>' '+<E164>' # e.g. +15551234567
 ```

 **Gotcha that bites everyone:** with `method=apple-script` and a _new_
@@ -2,7 +2,7 @@

 **App name:** `Lark` or `飞书` | **Process name:** `Lark` or `飞书`

-See [osascript-common.md](../osascript-common.md) for shared patterns.
+See [references/osascript.md](../../references/osascript.md) for shared patterns.

 ## Activate & Navigate

@@ -56,6 +56,6 @@ screencapture /tmp/lark-bot-response.png
 ## Script

 ```bash
-./.agents/skills/local-testing/bot/lark/test-lark-bot.sh "bot-testing" "@MyBot hello"
-./.agents/skills/local-testing/bot/lark/test-lark-bot.sh "bot-testing" "Help me with this" 30
+./.agents/skills/agent-testing/bot/lark/test-lark-bot.sh "bot-testing" "@MyBot hello"
+./.agents/skills/agent-testing/bot/lark/test-lark-bot.sh "bot-testing" "Help me with this" 30
 ```
@@ -80,5 +80,5 @@ echo "[$APP] Waiting ${WAIT}s for bot response..."
 sleep "$WAIT"

 echo "[$APP] Capturing screenshot..."
-"$SCRIPT_DIR/../capture-app-window.sh" "$APP" "$SCREENSHOT"
+"$SCRIPT_DIR/../../scripts/capture-app-window.sh" "$APP" "$SCREENSHOT"
 echo "[$APP] Done! Screenshot saved to $SCREENSHOT"
@@ -2,7 +2,7 @@

 **App name:** `QQ` | **Process name:** `QQ`

-See [osascript-common.md](../osascript-common.md) for shared patterns.
+See [references/osascript.md](../../references/osascript.md) for shared patterns.

 ## Activate & Navigate

@@ -57,6 +57,6 @@ screencapture /tmp/qq-bot-response.png
 ## Script

 ```bash
-./.agents/skills/local-testing/bot/qq/test-qq-bot.sh "bot-testing" "Hello bot" 15
-./.agents/skills/local-testing/bot/qq/test-qq-bot.sh "MyBot" "/help" 10
+./.agents/skills/agent-testing/bot/qq/test-qq-bot.sh "bot-testing" "Hello bot" 15
+./.agents/skills/agent-testing/bot/qq/test-qq-bot.sh "MyBot" "/help" 10
 ```
@@ -72,5 +72,5 @@ echo "[$APP] Waiting ${WAIT}s for bot response..."
 sleep "$WAIT"

 echo "[$APP] Capturing screenshot..."
-"$SCRIPT_DIR/../capture-app-window.sh" "$APP" "$SCREENSHOT"
+"$SCRIPT_DIR/../../scripts/capture-app-window.sh" "$APP" "$SCREENSHOT"
 echo "[$APP] Done! Screenshot saved to $SCREENSHOT"
@@ -2,7 +2,7 @@

 **App name:** `Slack` | **Process name:** `Slack`

-See [osascript-common.md](../osascript-common.md) for shared patterns.
+See [references/osascript.md](../../references/osascript.md) for shared patterns.

 ## Activate & Navigate

@@ -68,6 +68,6 @@ screencapture /tmp/slack-bot-response.png
 ## Script

 ```bash
-./.agents/skills/local-testing/bot/slack/test-slack-bot.sh "bot-testing" "@mybot hello"
-./.agents/skills/local-testing/bot/slack/test-slack-bot.sh "bot-testing" "/ask What is 2+2?" 20
+./.agents/skills/agent-testing/bot/slack/test-slack-bot.sh "bot-testing" "@mybot hello"
+./.agents/skills/agent-testing/bot/slack/test-slack-bot.sh "bot-testing" "/ask What is 2+2?" 20
 ```
@@ -60,5 +60,5 @@ echo "[$APP] Waiting ${WAIT}s for bot response..."
 sleep "$WAIT"

 echo "[$APP] Capturing screenshot..."
-"$SCRIPT_DIR/../capture-app-window.sh" "$APP" "$SCREENSHOT"
+"$SCRIPT_DIR/../../scripts/capture-app-window.sh" "$APP" "$SCREENSHOT"
 echo "[$APP] Done! Screenshot saved to $SCREENSHOT"
@@ -2,7 +2,7 @@

 **App name:** `Telegram` | **Process name:** `Telegram`

-See [osascript-common.md](../osascript-common.md) for shared patterns.
+See [references/osascript.md](../../references/osascript.md) for shared patterns.

 ## Activate & Navigate

@@ -75,6 +75,6 @@ curl -s "https://api.telegram.org/bot$TELEGRAM_BOT_TOKEN/getUpdates?limit=5" | j
 ## Script

 ```bash
-./.agents/skills/local-testing/bot/telegram/test-telegram-bot.sh "MyTestBot" "/start"
-./.agents/skills/local-testing/bot/telegram/test-telegram-bot.sh "GPTBot" "Hello" 60
+./.agents/skills/agent-testing/bot/telegram/test-telegram-bot.sh "MyTestBot" "/start"
+./.agents/skills/agent-testing/bot/telegram/test-telegram-bot.sh "GPTBot" "Hello" 60
 ```
@@ -75,5 +75,5 @@ echo "[$APP] Waiting ${WAIT}s for bot response..."
 sleep "$WAIT"

 echo "[$APP] Capturing screenshot..."
-"$SCRIPT_DIR/../capture-app-window.sh" "$APP" "$SCREENSHOT"
+"$SCRIPT_DIR/../../scripts/capture-app-window.sh" "$APP" "$SCREENSHOT"
 echo "[$APP] Done! Screenshot saved to $SCREENSHOT"
@@ -2,7 +2,7 @@

 **App name:** `微信` or `WeChat` | **Process name:** `WeChat`

-See [osascript-common.md](../osascript-common.md) for shared patterns.
+See [references/osascript.md](../../references/osascript.md) for shared patterns.

 ## Activate & Navigate

@@ -76,6 +76,6 @@ screencapture /tmp/wechat-bot-response.png
 ## Script

 ```bash
-./.agents/skills/local-testing/bot/wechat/test-wechat-bot.sh "文件传输助手" "test message" 5
-./.agents/skills/local-testing/bot/wechat/test-wechat-bot.sh "MyBot" "Tell me a joke" 30
+./.agents/skills/agent-testing/bot/wechat/test-wechat-bot.sh "文件传输助手" "test message" 5
+./.agents/skills/agent-testing/bot/wechat/test-wechat-bot.sh "MyBot" "Tell me a joke" 30
 ```
@@ -81,5 +81,5 @@ echo "[$APP] Waiting ${WAIT}s for bot response..."
 sleep "$WAIT"

 echo "[$APP] Capturing screenshot..."
-"$SCRIPT_DIR/../capture-app-window.sh" "$APP" "$SCREENSHOT"
+"$SCRIPT_DIR/../../scripts/capture-app-window.sh" "$APP" "$SCREENSHOT"
 echo "[$APP] Done! Screenshot saved to $SCREENSHOT"
@@ -0,0 +1,142 @@
+# CLI Backend Verification
+
+Default surface for verifying **backend changes** (TRPC routers, services,
+models, migrations) end-to-end: fastest loop, text-assertable output, zero UI
+flakiness.
+
+## When to use
+
+- Verifying TRPC router / service / model changes end-to-end
+- Testing new API fields or response structure changes
+- Validating CLI command output after backend modifications
+- Debugging data flow issues between server and CLI
+
+## Prerequisites
+
+| Requirement  | Details                                                                           |
+| ------------ | --------------------------------------------------------------------------------- |
+| Dev server   | `localhost:3010` — see [../references/dev-server.md](../references/dev-server.md) |
+| CLI source   | `apps/cli/` — runs from source, no rebuild; standalone `node_modules` — run `pnpm install` inside `apps/cli/` (root install does not cover it) |
+| CLI dev mode | `LOBEHUB_CLI_HOME=.lobehub-dev` for isolated credentials                          |
+| Auth         | Device Code Flow login — see [../references/auth.md](../references/auth.md)       |
+
+All CLI dev commands run from `apps/cli/`. Subsequent examples use `$CLI`:
+
+```bash
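+# `env` is required: a VAR=value prefix that comes from an expanded variable is
+# run as a command name, not parsed as an assignment (zsh: invoke as ${=CLI}).
+CLI="env LOBEHUB_CLI_HOME=.lobehub-dev bun src/index.ts"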
+```
+
+## Workflow
+
+### Step 1 — Server up?
+
+See [../references/dev-server.md](../references/dev-server.md) for the health
+check, start, and restart commands. Server-side code changes require a restart.
+
+### Step 2 — Auth ready?
+
+```bash
+./.agents/skills/agent-testing/scripts/setup-auth.sh status
+```
+
+If the CLI is not logged in, **the user must run the login themselves**
+(interactive browser authorization):
+
+```bash
+cd apps/cli && LOBEHUB_CLI_HOME=.lobehub-dev bun src/index.ts login --server http://localhost:3010
+```
+
+Credentials persist in `apps/cli/.lobehub-dev/`. Details:
+[../references/auth.md](../references/auth.md).
+
+### Step 3 — Test with CLI commands
+
+CLI runs from source, so CLI-side code changes take effect immediately without
+rebuilding:
+
+```bash
+cd apps/cli
+$CLI <command>
+```
+
+Capture output for the report as you go (e.g. `$CLI task list | tee "$DIR/assets/task-list.txt"`).
+
+### Step 4 — Clean up test data
+
+```bash
+$CLI task delete <id> -y
+$CLI agent delete <id> -y
+```
+
+### Step 5 — Report
+
+Finish with a structured report —
+[../references/report.md](../references/report.md). CLI evidence = exact
+command + trimmed output.
+
+## Common testing patterns
+
+### Task system
+
+```bash
+$CLI task list
+$CLI task create -n "Root Task" -i "Test instruction"
+$CLI task create -n "Child Task" -i "Sub instruction" --parent T-1
+$CLI task view T-1
+$CLI task tree T-1
+$CLI task edit T-1 --status running
+$CLI task comment T-1 -m "Test comment"
+$CLI task delete T-1 -y
+```
+
+### Agent system
+
+```bash
+$CLI agent list
+$CLI agent view <agent-id>
+$CLI agent run <agent-id> -m "Test prompt"
+```
+
+### Document & knowledge base
+
+```bash
+$CLI doc list
+$CLI doc create -t "Test Doc" -c "Content here"
+$CLI doc view <doc-id>
+$CLI kb list
+$CLI kb tree <kb-id>
+```
+
+### Model & provider
+
+```bash
+$CLI model list
+$CLI provider list
+$CLI provider test <provider-id>
+```
+
+## Dev-test cycle
+
+```
+1. Make code changes (service/model/router/type)
+         |
+2. Run unit tests (fast feedback)
+   bunx vitest run --silent='passed-only' '<test-file>'
+         |
+3. Restart dev server (if server-side changes — see dev-server.md)
+         |
+4. CLI verification (end-to-end)
+   $CLI <command>
+         |
+5. Clean up test data + write the report
+```
+
+## Troubleshooting
+
+| Issue                       | Solution                                        |
+| --------------------------- | ----------------------------------------------- |
+| `No authentication found`   | Run `login --server http://localhost:3010`      |
+| `UNAUTHORIZED` on API calls | Token expired; re-run login                     |
+| `ECONNREFUSED`              | Dev server not running — see dev-server.md      |
+| CLI shows old data/behavior | Server needs restart to pick up code changes    |
+| Login opens wrong server    | Must use `--server` flag (env var doesn't work) |
@@ -0,0 +1,257 @@
+# agent-browser CLI Reference
+
+Generic reference for the `agent-browser` CLI — automate Chromium-based apps (Electron, Chrome, web) via Chrome DevTools Protocol. LobeHub-specific patterns live in [../ui/electron.md](../ui/electron.md) and [../ui/web.md](../ui/web.md); authentication recipes live in [auth.md](./auth.md).
+
+Use `agent-browser` to automate Chromium-based apps via Chrome DevTools Protocol.
+
+Install via `npm i -g agent-browser`, `brew install agent-browser`, or `cargo install agent-browser`. Run `agent-browser install` to download Chrome. Run `agent-browser upgrade` to update.
+
+## Core Workflow
+
+Every browser automation follows this pattern:
+
+1. **Navigate**: `agent-browser open <url>`
+2. **Snapshot**: `agent-browser snapshot -i` (get element refs like `@e1`, `@e2`)
+3. **Interact**: Use refs to click, fill, select
+4. **Re-snapshot**: After navigation or DOM changes, get fresh refs
+
+```bash
+agent-browser open https://example.com/form
+agent-browser snapshot -i
+# Output: @e1 [input type="email"], @e2 [input type="password"], @e3 [button] "Submit"
+
+agent-browser fill @e1 "user@example.com"
+agent-browser fill @e2 "password123"
+agent-browser click @e3
+agent-browser wait --load networkidle
+agent-browser snapshot -i # Check result
+```
+
+## Command Chaining
+
+```bash
+# Chain open + wait + snapshot in one call
+agent-browser open https://example.com && agent-browser wait --load networkidle && agent-browser snapshot -i
+```
+
+Use `&&` when you don't need to read intermediate output. Run commands separately when you need to parse output first (e.g., snapshot to discover refs, then interact).
+
+## Essential Commands
+
+```bash
+# Navigation
+agent-browser open <url>              # Navigate (aliases: goto, navigate)
+agent-browser close                   # Close browser
+agent-browser close --all             # Close all active sessions
+
+# Snapshot
+agent-browser snapshot -i             # Interactive elements with refs (recommended)
+agent-browser snapshot -s "#selector" # Scope to CSS selector
+
+# Interaction (use @refs from snapshot)
+agent-browser click @e1               # Click element
+agent-browser click @e1 --new-tab     # Click and open in new tab
+agent-browser fill @e2 "text"         # Clear and type text
+agent-browser type @e2 "text"         # Type without clearing
+agent-browser select @e1 "option"     # Select dropdown option
+agent-browser check @e1               # Check checkbox
+agent-browser press Enter             # Press key
+agent-browser keyboard type "text"    # Type at current focus (no selector)
+agent-browser keyboard inserttext "text"  # Insert without key events
+agent-browser scroll down 500         # Scroll page
+agent-browser scroll down 500 --selector "div.content"  # Scroll within container
+
+# Get information
+agent-browser get text @e1            # Get element text
+agent-browser get url                 # Get current URL
+agent-browser get title               # Get page title
+agent-browser get cdp-url             # Get CDP WebSocket URL
+
+# Wait
+agent-browser wait @e1                # Wait for element
+agent-browser wait --load networkidle # Wait for network idle
+agent-browser wait --url "**/page"    # Wait for URL pattern
+agent-browser wait 2000               # Wait milliseconds
+agent-browser wait --text "Welcome"   # Wait for text to appear
+agent-browser wait --fn "!document.body.innerText.includes('Loading...')"  # Wait for text to disappear
+agent-browser wait "#spinner" --state hidden  # Wait for element to disappear
+
+# Downloads
+agent-browser download @e1 ./file.pdf          # Click element to trigger download
+agent-browser wait --download ./output.zip     # Wait for any download to complete
+
+# Network
+agent-browser network requests                 # Inspect tracked requests
+agent-browser network requests --type xhr,fetch  # Filter by resource type
+agent-browser network requests --method POST   # Filter by HTTP method
+agent-browser network route "**/api/*" --abort # Block matching requests
+agent-browser network har start                # Start HAR recording
+agent-browser network har stop ./capture.har   # Stop and save HAR file
+
+# Viewport & Device Emulation
+agent-browser set viewport 1920 1080          # Set viewport size (default: 1280x720)
+agent-browser set viewport 1920 1080 2        # 2x retina
+agent-browser set device "iPhone 14"          # Emulate device (viewport + user agent)
+
+# Capture
+agent-browser screenshot              # Screenshot to temp dir
+agent-browser screenshot --full       # Full page screenshot
+agent-browser screenshot --annotate   # Annotated screenshot with numbered element labels
+agent-browser pdf output.pdf          # Save as PDF
+
+# Clipboard
+agent-browser clipboard read          # Read text from clipboard
+agent-browser clipboard write "text"  # Write text to clipboard
+agent-browser clipboard copy          # Copy current selection
+agent-browser clipboard paste         # Paste from clipboard
+
+# Dialogs (alert, confirm, prompt, beforeunload)
+agent-browser dialog accept           # Accept dialog
+agent-browser dialog accept "input"   # Accept prompt dialog with text
+agent-browser dialog dismiss          # Dismiss/cancel dialog
+agent-browser dialog status           # Check if dialog is open
+
+# Diff (compare page states)
+agent-browser diff snapshot                        # Compare current vs last snapshot
+agent-browser diff screenshot --baseline before.png  # Visual pixel diff
+agent-browser diff url <url1> <url2>               # Compare two pages
+
+# Streaming
+agent-browser stream enable           # Start WebSocket streaming
+agent-browser stream status           # Inspect streaming state
+agent-browser stream disable          # Stop streaming
+```
+
+## Batch Execution
+
+```bash
+echo '[
+  ["open", "https://example.com"],
+  ["snapshot", "-i"],
+  ["click", "@e1"],
+  ["screenshot", "result.png"]
+]' | agent-browser batch --json
+```
+
+## Authentication
+
+```bash
+# Option 1: Auth vault (credentials stored encrypted)
+echo "$PASSWORD" | agent-browser auth save myapp --url https://app.example.com/login --username user --password-stdin
+agent-browser auth login myapp
+
+# Option 2: Session name (auto-save/restore cookies + localStorage)
+agent-browser --session-name myapp open https://app.example.com/login
+agent-browser close                                                       # State auto-saved
+agent-browser --session-name myapp open https://app.example.com/dashboard # Auto-restored
+
+# Option 3: Persistent profile
+agent-browser --profile ~/.myapp open https://app.example.com/login
+
+# Option 4: State file
+agent-browser state save auth.json
+agent-browser state load auth.json
+```
+
+### LobeHub dev server — inject better-auth cookie
+
+`agent-browser --headed` on macOS can create an off-screen Chromium window, blocking manual login. For a local LobeHub dev server (e.g. `localhost:3010`), copy the `better-auth.session_token` cookie out of a **Network request** in the user's own Chrome DevTools and load it via `state load`. See [auth.md](./auth.md) for the full recipe.
+
+## Semantic Locators (Alternative to Refs)
+
+```bash
+agent-browser find text "Sign In" click
+agent-browser find label "Email" fill "user@test.com"
+agent-browser find role button click --name "Submit"
+agent-browser find placeholder "Search" type "query"
+agent-browser find testid "submit-btn" click
+```
+
+## JavaScript Evaluation (eval)
+
+```bash
+# Simple expressions
+agent-browser eval 'document.title'
+
+# Complex JS: use --stdin with heredoc (RECOMMENDED)
+agent-browser eval --stdin << 'EVALEOF'
+JSON.stringify(
+  Array.from(document.querySelectorAll("img"))
+    .filter(i => !i.alt)
+    .map(i => ({ src: i.src.split("/").pop(), width: i.width }))
+)
+EVALEOF
+
+# Base64 encoding (avoids all shell escaping issues)
+agent-browser eval -b "$(echo -n 'document.title' | base64)"
+```
+
+## Ref Lifecycle
+
+Refs (`@e1`, `@e2`, etc.) are invalidated when the page changes. Always re-snapshot after clicking links/buttons that navigate, form submissions, or dynamic content loading.
+
+## Annotated Screenshots (Vision Mode)
+
+```bash
+agent-browser screenshot --annotate
+# Output includes the image path and a legend:
+#   [1] @e1 button "Submit"
+#   [2] @e2 link "Home"
+agent-browser click @e2 # Click using ref from annotated screenshot
+```
+
+## Parallel Sessions
+
+```bash
+agent-browser --session site1 open https://site-a.com
+agent-browser --session site2 open https://site-b.com
+agent-browser session list
+```
+
+## Connect to Existing Chrome
+
+```bash
+agent-browser --auto-connect snapshot # Auto-discover running Chrome
+agent-browser --cdp 9222 snapshot     # Explicit CDP port
+```
+
+## iOS Simulator (Mobile Safari)
+
+```bash
+agent-browser device list
+agent-browser -p ios --device "iPhone 16 Pro" open https://example.com
+agent-browser -p ios snapshot -i
+agent-browser -p ios tap @e1
+agent-browser -p ios swipe up
+agent-browser -p ios screenshot mobile.png
+agent-browser -p ios close
+```
+
+## Observability Dashboard
+
+```bash
+agent-browser dashboard install
+agent-browser dashboard start # Background server on port 4848
+agent-browser dashboard stop
+```
+
+## Cloud Providers
+
+Use `-p <provider>` to run against cloud browsers: `agentcore`, `browserbase`, `browserless`, `browseruse`, `kernel`.
+
+## Browser Engine Selection
+
+```bash
+agent-browser --engine lightpanda open example.com # 10x faster, 10x less memory
+```
+
+## Gotchas
+
+- **Daemon can get stuck** — if commands hang, `agent-browser close --all` or `pkill -f agent-browser` to reset
+- **HMR invalidates everything** — after code changes, refs break. Re-snapshot or restart
+- **`snapshot -i` doesn't find contenteditable** — use `snapshot -i -C` for rich text editors
+- **`fill` doesn't work on contenteditable** — use `type` for chat inputs
+- **Screenshots go to `~/.agent-browser/tmp/screenshots/`** — read them with the `Read` tool
+- **Dialogs block all commands** — if commands time out, check `agent-browser dialog status`
+- **Default timeout is 25s** — override with `AGENT_BROWSER_DEFAULT_TIMEOUT` (ms) or use explicit waits
+- **Shell quoting corrupts eval** — use `eval --stdin <<'EVALEOF'` for complex JS
@@ -19,13 +19,13 @@ works for any LobeHub streaming session.

 ```bash
 # 1. Start Electron with CDP
-./.agents/skills/local-testing/scripts/electron-dev.sh start
+./.agents/skills/agent-testing/scripts/electron-dev.sh start

 # 2. Navigate to a chat, switch runtime to Cloud Sandbox (gateway mode)

 # 3. Install the probe + helpers
 agent-browser --cdp 9222 eval --stdin \
-  < .agents/skills/local-testing/scripts/agent-gateway/probe.js
+  < .agents/skills/agent-testing/scripts/agent-gateway/probe.js

 # 4. Send a tool-call message — manually or via type+press
 agent-browser --cdp 9222 eval "window.__PROBE_EVENT('SENT')"
@@ -34,15 +34,15 @@ agent-browser --cdp 9222 eval "window.__PROBE_EVENT('SENT')"
 #    rightmost inactive tab as AWAY — edit ROUND_TRIPS / DWELL_MS in the
 #    file if you want different timing)
 agent-browser --cdp 9222 eval --stdin \
-  < .agents/skills/local-testing/scripts/agent-gateway/tab-switch.js
+  < .agents/skills/agent-testing/scripts/agent-gateway/tab-switch.js

 # 6. Wait for streaming to finish, then dump
 agent-browser --cdp 9222 eval --stdin \
-  < .agents/skills/local-testing/scripts/agent-gateway/probe-dump.js \
+  < .agents/skills/agent-testing/scripts/agent-gateway/probe-dump.js \
  > /tmp/probe.json

 # 7. Analyze
-node .agents/skills/local-testing/scripts/agent-gateway/analyze.mjs /tmp/probe.json
+node .agents/skills/agent-testing/scripts/agent-gateway/analyze.mjs /tmp/probe.json
 ```

 The analyzer prints three sections: EVENTS, TIMELINE, REGRESSIONS. If
@@ -0,0 +1,123 @@
+# Auth Setup for Local Agent Testing
+
+**Auth is the gate for all automated testing.** Prepare and verify it before
+writing any test step. The one-stop entry point is:
+
+```bash
+SCRIPT=".agents/skills/agent-testing/scripts/setup-auth.sh"
+
+$SCRIPT status        # check server + CLI + web auth readiness
+$SCRIPT cli           # interactive CLI device-code login (must be run by the user)
+pbpaste | $SCRIPT web # inject a copied Cookie header into the agent-browser session
+$SCRIPT web-verify    # live-check that the agent-browser session is authenticated
+```
+
+`SERVER_URL` defaults to `http://localhost:3010` (this repo's `dev:next` port).
+Override it when testing against another server (e.g. `SERVER_URL=http://localhost:3011`
+in the cloud repo).
+
+## Per-surface overview
+
+| Surface  | Mechanism                                | Persistence                                                       | Human interaction                               |
+| -------- | ---------------------------------------- | ----------------------------------------------------------------- | ----------------------------------------------- |
+| CLI      | OIDC Device Code Flow                    | `apps/cli/.lobehub-dev/settings.json`                             | Yes — browser authorization, every token expiry |
+| Web      | better-auth cookie injection             | `~/.lobehub-agent-testing/web-state.json` + agent-browser session | Copy the Cookie header once per token rotation  |
+| Electron | App's own login state                    | Electron user-data dir                                            | Log in once manually in the app                 |
+| Bot      | Native apps (Discord/WeChat/…) logged in | Each app's own session                                            | Once per app                                    |
+
+## CLI — Device Code Flow
+
+Credentials are isolated from the user's real CLI config via
+`LOBEHUB_CLI_HOME=.lobehub-dev` (kept inside `apps/cli/`, gitignored).
+
+Login requires interactive browser authorization, so **the user must run it
+themselves** (e.g. via the `!` prefix in Claude Code):
+
+```bash
+cd apps/cli && LOBEHUB_CLI_HOME=.lobehub-dev bun src/index.ts login --server http://localhost:3010
+```
+
+- The `--server` flag is required — an env var does NOT work and login will hit
+  the wrong server without it.
+- Check state without logging in: `setup-auth.sh status` (verifies
+  `settings.json` exists and `serverUrl` matches).
+- `UNAUTHORIZED` on API calls means the token expired — re-run login.
+
+## Web — better-auth cookie injection (agent-browser)
+
+`agent-browser --headed` on macOS often creates the Chromium window off-screen —
+the user can't see or interact with it, so manual login inside the agent-browser
+session fails. Instead, copy the **better-auth session cookie** out of the
+user's own logged-in Chrome and inject it as a Playwright-style state file.
+
+Do **not** use this on production URLs — only local dev. Treat the cookie as a
+secret: don't paste it into shared logs, PRs, or commit it anywhere.
+
+### One-key path
+
+1. Ask the user to copy the Cookie header **from a Network request, NOT
+   `document.cookie`** (`document.cookie` cannot see HttpOnly cookies, which is
+   exactly where better-auth puts its session):
+   - Open the logged-in tab (`http://localhost:<port>/…`) in Chrome.
+   - `Cmd+Option+I` → **Network** tab → refresh → click any same-origin request.
+   - Under **Request Headers**, right-click the `Cookie:` line → **Copy value**.
+2. Inject and verify in one shot:
+
+```bash
+pbpaste | ./.agents/skills/agent-testing/scripts/setup-auth.sh web
+```
+
+The script filters the header down to the better-auth cookies
+(`better-auth.session_token`, `better-auth.state`), builds the Playwright
+`storageState` JSON, loads it into the `agent-browser` session (default name
+`lobehub-dev`), opens `SERVER_URL`, and asserts the URL is not `/signin`.
+
+### Using the authenticated session
+
+```bash
+agent-browser --session lobehub-dev open "http://localhost:3010/"
+agent-browser --session lobehub-dev snapshot -i | head -20
+# Look for the user's avatar/name in the sidebar, or absence of the signin form.
+```
+
+### Notes
+
+- `storageState` doesn't enforce the HttpOnly flag on load — the script stores
+  cookies with `httpOnly: false`, which is fine for local dev and sidesteps a
+  CDP-context quirk where HttpOnly cookies sometimes fail to attach.
+- The state file is kept at `~/.lobehub-agent-testing/web-state.json` so
+  `setup-auth.sh status` can report web-auth readiness across sessions.
+
+### Common failure modes
+
+| Symptom                                       | Cause                                                                     | Fix                                               |
+| --------------------------------------------- | ------------------------------------------------------------------------- | ------------------------------------------------- |
+| Still redirects to `/signin` after injection  | User pasted from `document.cookie` → missed HttpOnly session              | Re-pull from Network request Headers, not console |
+| Script reports `no better-auth cookies found` | Separator wrong, or user pasted URL-decoded value                         | Keep the raw `Cookie:` header as-is               |
+| Login works briefly then expires              | `better-auth.session_token` rotated (user logged out / signed in again)   | Re-copy and re-inject                             |
+| Domain mismatch                               | Cookie domain must be `localhost` literally, no leading dot for local dev | —                                                 |
+
+## Electron
+
+The desktop app keeps its own persistent login state in its user-data
+directory — log in once manually inside the app and it survives restarts of
+`electron-dev.sh`. No injection needed. The standard check (do NOT hand-roll a
+store eval) once Electron is up with CDP:
+
+```bash
+./.agents/skills/agent-testing/scripts/app-probe.sh auth
+# → {"ok":true,"isSignedIn":true,"userId":"user_xxx"}
+```
+
+`setup-auth.sh status` runs this probe automatically when CDP 9222 is
+reachable.
+
+## Scope
+
+These recipes only cover **local dev** authentication. They do not:
+
+- Work for production — production cookies are `Secure; HttpOnly; Domain=.lobehub.com`
+  and must be delivered over HTTPS.
+- Replace real OAuth flows — tests that must exercise the login UI itself need a
+  real Chromium with `--remote-debugging-port` or a bot account.
+- Flow cookies back to the user's Chrome — injection is one-way.
@@ -0,0 +1,55 @@
+# Local Dev Server
+
+Single source of truth for starting / restarting the backend that all test
+surfaces (CLI, Electron, Web) hit.
+
+## Ports & modes
+
+| Command             | What it runs                                              | Port                              |
+| ------------------- | --------------------------------------------------------- | --------------------------------- |
+| `pnpm run dev:next` | Next.js backend (API + auth)                              | `3010`                            |
+| `bun run dev`       | Full-stack (Next.js + Vite SPA, via `devStartupSequence`) | `3010` (API) + SPA                |
+| `bun run dev:spa`   | Vite SPA only, proxies API to `3010`                      | `9876` (prints a Debug Proxy URL) |
+
+In the **cloud repo** (where this repo is the `lobehub/` submodule) the dev
+server conventionally runs on `3011` — set `SERVER_URL=http://localhost:3011`
+for the scripts in this skill when testing there.
+
+## Health check
+
+```bash
+curl -s -o /dev/null -w '%{http_code}' http://localhost:3010/
+```
+
+## Start / restart
+
+```bash
+# Start (from repo root)
+pnpm run dev:next
+
+# Restart — required to pick up server-side code changes
+lsof -ti:3010 | xargs kill
+pnpm run dev:next
+```
+
+## When a server restart is needed
+
+Next.js hot-reload may not pick up changes in workspace packages — restart when
+in doubt.
+
+| Change location                                 | Restart? |
+| ----------------------------------------------- | -------- |
+| `apps/server/src/` (routers, services, modules) | Yes      |
+| `src/server/` (agent-hono, workflows-hono)      | Yes      |
+| `packages/database/` (models)                   | Yes      |
+| `packages/types/`                               | Yes      |
+| `packages/prompts/`                             | Yes      |
+| `apps/cli/` (CLI runs from source)              | No       |
+
+## Troubleshooting
+
+| Issue                     | Solution                                                |
+| ------------------------- | ------------------------------------------------------- |
+| `ECONNREFUSED`            | Server not running — start it                           |
+| `EADDRINUSE` on the port  | Already running — `lsof -ti:<port> \| xargs kill` first |
+| Stale data / old behavior | Server needs a restart to pick up code changes          |
@@ -12,13 +12,13 @@ General-purpose screen recording tool for the Electron app. Captures CDP screens

 ```bash
 # Start recording (Electron must be running with CDP)
-.agents/skills/local-testing/scripts/record-app-screen.sh start [output_name]
+.agents/skills/agent-testing/scripts/record-app-screen.sh start [output_name]

 # Stop recording and assemble video
-.agents/skills/local-testing/scripts/record-app-screen.sh stop
+.agents/skills/agent-testing/scripts/record-app-screen.sh stop

 # Check if recording is active
-.agents/skills/local-testing/scripts/record-app-screen.sh status
+.agents/skills/agent-testing/scripts/record-app-screen.sh status
 ```

 ### Arguments
@@ -74,10 +74,10 @@ The `.records/` directory is at the project root and is gitignored.

 ```bash
 # Start Electron
-.agents/skills/local-testing/scripts/electron-dev.sh start
+.agents/skills/agent-testing/scripts/electron-dev.sh start

 # Start recording
-.agents/skills/local-testing/scripts/record-app-screen.sh start my-test
+.agents/skills/agent-testing/scripts/record-app-screen.sh start my-test

 # Run automation
 agent-browser --cdp 9222 click @e61
@@ -86,14 +86,14 @@ agent-browser --cdp 9222 press Enter
 sleep 10

 # Stop and get results
-.agents/skills/local-testing/scripts/record-app-screen.sh stop
+.agents/skills/agent-testing/scripts/record-app-screen.sh stop
 # → .records/my-test.mp4 + .records/my-test/*.png
 ```

 ### Gateway Streaming Demo

 ```bash
-.agents/skills/local-testing/scripts/electron-dev.sh start
+.agents/skills/agent-testing/scripts/electron-dev.sh start

 # Inject gateway URL
 agent-browser --cdp 9222 eval --stdin << 'EOF'
@@ -106,19 +106,19 @@ agent-browser --cdp 9222 eval --stdin << 'EOF'
 EOF

 # Record
-.agents/skills/local-testing/scripts/record-app-screen.sh start gateway-demo
+.agents/skills/agent-testing/scripts/record-app-screen.sh start gateway-demo

 # Navigate to agent, send message, wait for completion...
 # (automation commands here)

-.agents/skills/local-testing/scripts/record-app-screen.sh stop
+.agents/skills/agent-testing/scripts/record-app-screen.sh stop
 open .records/gateway-demo.mp4
 ```

 ### Check Active Recording

 ```bash
-.agents/skills/local-testing/scripts/record-app-screen.sh status
+.agents/skills/agent-testing/scripts/record-app-screen.sh status
 # [record] Active recording
 #   Frames:      42 captured (running: yes)
 #   Screenshots: 14 captured (running: yes)
@@ -0,0 +1,124 @@
+# Structured Test Reports
+
+Every automated test session ends with a structured, evidence-backed report.
+A chat-only summary is not an acceptable deliverable: the report is what the
+user (or a reviewer, or a later agent) audits without replaying the session.
+
+## Location & layout
+
+Reports live under `.records/reports/` (gitignored, like all `.records/`
+output):
+
+```
+.records/reports/<YYYYMMDD-HHMMSS>-<slug>/
+├── report.md      # human-readable report (embedded screenshots, case table, verdict)
+├── result.json    # machine-readable results (pass/fail counts, score)
+└── assets/        # evidence: screenshots, HAR files, CLI transcripts
+```
+
+## Workflow
+
+1. **Scaffold up front** — before running the first test step:
+
+   ```bash
+   DIR=$(./.agents/skills/agent-testing/scripts/report-init.sh "<slug>" "<title>")
+   ```
+
+   The script creates the directory, pre-fills branch / commit / date in both
+   files, and prints the directory path.
+
+2. **Collect evidence as you test** — every asserted behavior gets one evidence
+   item in `$DIR/assets/`:
+   - UI (static state): `agent-browser screenshot` or `capture-app-window.sh`,
+     then **verify the screenshot with the Read tool before citing it** —
+     never cite an image you haven't looked at.
+   - UI (time-based behavior): **screenshot vs GIF is a judgment you must
+     make per case.** If the assertion is about change over time — streaming
+     output, a ticking timer, loading/progress states, animations,
+     appear/disappear transitions — a static screenshot cannot prove it.
+     Record a frame sequence and synthesize a GIF:
+
+     ```bash
+     # start recording (background), trigger the behavior, wait for it to finish
+     ./.agents/skills/agent-testing/scripts/record-gif.sh "$DIR/assets/case2-streaming.gif" 12 2 &
+     GIF_PID=$!
+     # ... drive the scenario ...
+     wait $GIF_PID
+     ```
+
+     Embed it like an image: `![case 2](assets/case2-streaming.gif)`. Verify
+     at least the first/last frames visually (Read the GIF) before citing.
+   - CLI: exact command + trimmed output (`$CLI task list | tee "$DIR/assets/task-list.txt"`).
+   - Network: `agent-browser network requests` dumps or HAR files.
+
+3. **Fill `report.md` as you go** — don't reconstruct from memory at the end.
+
+4. **Set the verdict** in both `report.md` and `result.json`, then link the
+   report directory in your final answer to the user.
+
+## Report language (hard rule)
+
+**`report.md` MUST be written in the language the user is conversing in** —
+the whole file, headings included. If the conversation is in Chinese, the
+report is in Chinese; do not mix English prose into it. The scaffold's English
+headings are placeholders — translate them when filling. Exceptions that stay
+as-is: code/commands, identifiers, log excerpts, and `result.json` (its keys
+and status values are machine-read and stay English; the `title` and case
+`name` fields follow the user's language).
+
+## report.md sections
+
+| Section         | Content                                                                            |
+| --------------- | ---------------------------------------------------------------------------------- |
+| **Scope**       | What changed / what is being verified; branch + commit                             |
+| **Environment** | Server URL, surfaces used (cli / electron / web / bot), relevant versions          |
+| **Cases**       | Table: `# \| case \| surface \| steps \| expected \| actual \| status \| evidence` |
+| **Evidence**    | Embedded screenshots/GIFs (`![case 1](assets/case1.png)`), fenced CLI transcripts  |
+| **Verdict**     | Pass/fail/blocked counts, optional 0–100 score, open issues / follow-ups           |
+
+Status values: `pass` / `fail` / `blocked` (couldn't run — e.g. auth or env
+missing; a blocked case is not a pass).
+
+## result.json schema
+
+```json
+{
+  "branch": "feat/task-tree",
+  "cases": [
+    {
+      "id": "1",
+      "name": "task tree returns nested children",
+      "surface": "cli",
+      "status": "pass",
+      "evidence": ["assets/task-tree.txt"]
+    }
+  ],
+  "commit": "abc1234",
+  "createdAt": "2026-06-11T15:30:00+08:00",
+  "summary": {
+    "total": 1,
+    "passed": 1,
+    "failed": 0,
+    "blocked": 0,
+    "score": 100,
+    "verdict": "pass"
+  },
+  "surfaces": ["cli"],
+  "title": "Verify task tree API"
+}
+```
+
+`score` is optional — use it when the verdict has a subjective component (UI
+polish, copy quality); omit it for purely binary runs. `verdict` is the single
+word the user reads first: `pass`, `fail`, or `partial` (replace the scaffold's initial `pending`).
+
+## Rules
+
+- **No evidence, no claim** — every `pass`/`fail` in the case table must link
+  at least one asset.
+- **Screenshots must be visually verified** with the Read tool before being
+  cited.
+- **Report failures faithfully** — a failing case with clear evidence is a good
+  report; a vague green one is not.
+- If coverage was cut (cases skipped, surfaces not exercised), say so in the
+  Verdict section — silent truncation reads as "covered everything".
@@ -11,7 +11,7 @@
 //   6. ROLLBACKS — msgN / childN / role drops in the active-topic timeline
 //
 // Usage:
-//   bun run .agents/skills/local-testing/scripts/agent-gateway/analyze-events.ts <dump.json>
+//   bun run .agents/skills/agent-testing/scripts/agent-gateway/analyze-events.ts <dump.json>

 import { readFileSync } from 'node:fs';

@@ -5,16 +5,16 @@
 // streaming-replay test fixtures.
 //
 // Commands:
-//   bun run .agents/skills/local-testing/scripts/agent-gateway/run.ts install
+//   bun run .agents/skills/agent-testing/scripts/agent-gateway/run.ts install
 //       Bundle probe-events.ts and inject into the CDP-attached browser.
 //       Re-installing clears all buffers and re-patches WebSocket / fetch.
 //
-//   bun run .agents/skills/local-testing/scripts/agent-gateway/run.ts dump [name]
+//   bun run .agents/skills/agent-testing/scripts/agent-gateway/run.ts dump [name]
 //       Stop the timeline timer, fetch the capture as JSON, write it to
 //       `.agent-gateway/<name>-<YYYYMMDD-HHmmss>.json`. `name` defaults to
 //       `dump`. Prints the absolute path written.
 //
-//   bun run .agents/skills/local-testing/scripts/agent-gateway/run.ts analyze [path]
+//   bun run .agents/skills/agent-testing/scripts/agent-gateway/run.ts analyze [path]
 //       Run analyze-events.ts on the dump. `path` defaults to the most
 //       recently modified file in `.agent-gateway/`.
 //
@@ -28,7 +28,7 @@ import path from 'node:path';
 import { fileURLToPath } from 'node:url';

 const SCRIPT_DIR = path.dirname(fileURLToPath(import.meta.url));
-// .agents/skills/local-testing/scripts/agent-gateway/ → 5 levels up
+// .agents/skills/agent-testing/scripts/agent-gateway/ → 5 levels up
 const PROJECT_ROOT = path.resolve(SCRIPT_DIR, '../../../../..');
 const DUMP_DIR = path.join(PROJECT_ROOT, '.agent-gateway');

@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# app-probe.sh — standardized probes for a running LobeHub app (Electron via
+# CDP, or a web agent-browser session). Use these instead of hand-rolling
+# `window.__LOBE_STORES` eval snippets — especially the auth check.
+#
+# Usage:
+#   app-probe.sh auth              # { isSignedIn, userId } from the user store
+#   app-probe.sh route             # current SPA route
+#   app-probe.sh ops               # running chat operations (type / startTime) + running/total counts
+#   app-probe.sh goto <path>       # navigate the SPA to a route (full reload), e.g. goto /agent/agt_xxx
+#   app-probe.sh errors-install    # install a console.error interceptor
+#   app-probe.sh errors            # dump errors captured since errors-install
+#
+# Target selection (default: Electron over CDP 9222):
+#   AB_TARGET="--cdp 9222"             # Electron (default; CDP_PORT also honored)
+#   AB_TARGET="--session lobehub-dev"  # web agent-browser session
+#
+# Common routes (desktop SPA): /  /agent/<agentId>  /agent/<agentId>/<topicId>
+#   /task  /task/<taskId>  /page  /settings  /community
+
+set -euo pipefail
+
+AB_TARGET="${AB_TARGET:---cdp ${CDP_PORT:-9222}}"
+
+run_eval() {
+  # shellcheck disable=SC2086
+  agent-browser $AB_TARGET eval --stdin
+}
+
+case "${1:-}" in
+  auth)
+    run_eval << 'EVALEOF'
+(function () {
+  var stores = window.__LOBE_STORES;
+  if (!stores || !stores.user) return JSON.stringify({ ok: false, reason: 'no user store — app not loaded yet?' });
+  var u = stores.user();
+  return JSON.stringify({ ok: !!u.isSignedIn, isSignedIn: !!u.isSignedIn, userId: (u.user && u.user.id) || null });
+})()
+EVALEOF
+    ;;
+  route)
+    run_eval << 'EVALEOF'
+location.pathname + location.search + location.hash
+EVALEOF
+    ;;
+  ops)
+    run_eval << 'EVALEOF'
+(function () {
+  var stores = window.__LOBE_STORES;
+  if (!stores || !stores.chat) return JSON.stringify({ ok: false, reason: 'no chat store — open a conversation first' });
+  var ops = Object.values(stores.chat().operations || {});
+  var running = ops.filter(function (o) { return o.status === 'running'; });
+  return JSON.stringify({
+    ok: true,
+    running: running.map(function (o) { return { startTime: o.metadata && o.metadata.startTime, type: o.type }; }),
+    runningCount: running.length,
+    total: ops.length,
+  });
+})()
+EVALEOF
+    ;;
+  goto)
+    TARGET_PATH="${2:?Usage: app-probe.sh goto <path>}"
+    # shellcheck disable=SC2086
+    agent-browser $AB_TARGET eval "location.href = '$TARGET_PATH'" > /dev/null
+    sleep 2
+    bash "${BASH_SOURCE[0]}" route
+    ;;
+  errors-install)
+    run_eval << 'EVALEOF'
+(function () {
+  window.__CAPTURED_ERRORS = [];
+  var orig = console.error;
+  console.error = function () {
+    var msg = Array.from(arguments).map(function (a) {
+      if (a instanceof Error) return a.message;
+      return typeof a === 'object' ? JSON.stringify(a) : String(a);
+    }).join(' ');
+    window.__CAPTURED_ERRORS.push(msg);
+    orig.apply(console, arguments);
+  };
+  return 'installed';
+})()
+EVALEOF
+    ;;
+  errors)
+    run_eval << 'EVALEOF'
+JSON.stringify(window.__CAPTURED_ERRORS || 'interceptor not installed — run errors-install first')
+EVALEOF
+    ;;
+  *)
+    echo "Usage: $0 {auth|route|ops|goto <path>|errors-install|errors}" >&2
+    exit 2
+    ;;
+esac
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# record-gif.sh — capture a frame sequence via agent-browser (CDP) and
+# synthesize a GIF for embedding in a test report.
+#
+# Use this whenever the asserted behavior is about CHANGE OVER TIME —
+# streaming output, a ticking timer, loading states, animations. A static
+# screenshot cannot prove those; a GIF can. Cloud-portable: frames come from
+# CDP rendering, no OS-level screen capture.
+#
+# Usage:
+#   record-gif.sh <output.gif> <duration_seconds> [fps]
+#
+#   AB_TARGET="--cdp 9222"             # Electron (default; CDP_PORT honored)
+#   AB_TARGET="--session lobehub-dev"  # web agent-browser session
+#   GIF_WIDTH=960                      # output width (px), default 960
+#
+# Requires ffmpeg (`brew install ffmpeg`) and python3. Effective fps is capped
+# by screenshot latency (~0.3-0.5s per frame); 1-2 fps is the realistic range.
+#
+# Example — record a 12s run and embed it in the report:
+#   ./record-gif.sh "$DIR/assets/case2-tray-running.gif" 12 2 &
+#   GIF_PID=$!
+#   # ... trigger the streaming behavior ...
+#   wait $GIF_PID
+
+set -euo pipefail
+
+OUT="${1:?Usage: record-gif.sh <output.gif> <duration_seconds> [fps]}"
+DUR="${2:?Usage: record-gif.sh <output.gif> <duration_seconds> [fps]}"
+FPS="${3:-2}"
+AB_TARGET="${AB_TARGET:---cdp ${CDP_PORT:-9222}}"
+GIF_WIDTH="${GIF_WIDTH:-960}"
+
+command -v ffmpeg > /dev/null || {
+  echo "ffmpeg not found — install with: brew install ffmpeg" >&2
+  exit 1
+}
+
+TMP=$(mktemp -d)
+trap 'rm -rf "$TMP"' EXIT
+
+FRAMES=$((DUR * FPS))
+INTERVAL=$(python3 -c "print(1 / $FPS)")
+
+for i in $(seq -f '%04g' 1 "$FRAMES"); do
+  # shellcheck disable=SC2086
+  agent-browser $AB_TARGET screenshot "$TMP/frame-$i.png" > /dev/null 2>&1 || true
+  sleep "$INTERVAL"
+done
+
+CAPTURED=$(find "$TMP" -name 'frame-*.png' | wc -l | tr -d ' ')
+[ "$CAPTURED" -gt 0 ] || {
+  echo "no frames captured — is the app reachable via $AB_TARGET?" >&2
+  exit 1
+}
+
+ffmpeg -y -loglevel error -framerate "$FPS" -pattern_type glob -i "$TMP/frame-*.png" \
+  -vf "fps=$FPS,scale=$GIF_WIDTH:-1:flags=lanczos,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse" \
+  "$OUT"
+
+echo "$OUT ($CAPTURED frames @ ${FPS}fps)"
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# report-init.sh — scaffold a structured test report under .records/reports/.
+#
+# Format spec and evidence rules: ../references/report.md
+#
+# Usage:
+#   report-init.sh <slug> [title]
+#
+# Prints the report directory path (capture it: DIR=$(report-init.sh my-test)).
+
+set -euo pipefail
+
+SLUG="${1:?Usage: report-init.sh <slug> [title]}"
+TITLE="${2:-$SLUG}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)"
+TS="$(date +%Y%m%d-%H%M%S)"
+DIR="$REPO_ROOT/.records/reports/$TS-$SLUG"
+mkdir -p "$DIR/assets"
+
+BRANCH=$(git -C "$REPO_ROOT" branch --show-current 2> /dev/null || echo "unknown")
+COMMIT=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2> /dev/null || echo "unknown")
+DATE_HUMAN=$(date '+%Y-%m-%d %H:%M')
+DATE_ISO=$(date '+%Y-%m-%dT%H:%M:%S%z')
+
+cat > "$DIR/report.md" << EOF
+# Test Report: $TITLE
+
+## Scope
+
+<!-- What changed / what is being verified -->
+
+- Branch: \`$BRANCH\`
+- Commit: \`$COMMIT\`
+- Date: $DATE_HUMAN
+
+## Environment
+
+- Server: <!-- e.g. http://localhost:3010 -->
+- Surfaces: <!-- cli / electron / web / bot:<platform> -->
+
+## Cases
+
+| # | Case | Surface | Steps | Expected | Actual | Status | Evidence |
+| - | ---- | ------- | ----- | -------- | ------ | ------ | -------- |
+| 1 |      |         |       |          |        |        |          |
+
+## Evidence
+
+<!-- Embed screenshots: ![case 1](assets/case1.png) -->
+<!-- CLI transcripts in fenced blocks, with the exact command -->
+
+## Verdict
+
+- Passed: 0 / 0
+- Failed: 0
+- Blocked: 0
+- Score (optional): —
+- Open issues / follow-ups:
+EOF
+
+cat > "$DIR/result.json" << EOF
+{
+  "title": "$TITLE",
+  "createdAt": "$DATE_ISO",
+  "branch": "$BRANCH",
+  "commit": "$COMMIT",
+  "surfaces": [],
+  "cases": [],
+  "summary": { "total": 0, "passed": 0, "failed": 0, "blocked": 0, "verdict": "pending" }
+}
+EOF
+
+echo "$DIR"
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+# setup-auth.sh — one-stop auth setup & check for local agent testing.
+#
+# Auth is the gate for all automated testing: prepare it BEFORE writing any
+# test step. Background and failure modes: ../references/auth.md
+#
+# Usage:
+#   setup-auth.sh status        # check server + CLI + web auth readiness
+#   setup-auth.sh cli           # interactive CLI device-code login (run by a human)
+#   setup-auth.sh web           # stdin = Cookie header -> inject into agent-browser session
+#   setup-auth.sh web-verify    # live-check the agent-browser session is authenticated
+#
+# Env:
+#   SERVER_URL  (default http://localhost:3010)   dev server under test
+#   SESSION     (default lobehub-dev)             agent-browser session name
+#   AUTH_DIR    (default ~/.lobehub-agent-testing) where web state is persisted
+
+set -euo pipefail
+
+SERVER_URL="${SERVER_URL:-http://localhost:3010}"
+SESSION="${SESSION:-lobehub-dev}"
+AUTH_DIR="${AUTH_DIR:-$HOME/.lobehub-agent-testing}"
+STATE_FILE="$AUTH_DIR/web-state.json"
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)"
+CLI_HOME="$REPO_ROOT/apps/cli/.lobehub-dev"
+
+ok()   { printf '  \033[32m✔\033[0m %s\n' "$1"; }
+bad()  { printf '  \033[31m✘\033[0m %s\n' "$1"; }
+note() { printf '      %s\n' "$1"; }
+
+check_server() {
+  local code
+  code=$(curl -s -o /dev/null -w '%{http_code}' "$SERVER_URL/" 2> /dev/null || true)
+  if [[ "$code" =~ ^[23] ]]; then
+    ok "dev server reachable at $SERVER_URL"
+  else
+    bad "dev server NOT reachable at $SERVER_URL (http_code='$code')"
+    note "start it: pnpm run dev:next  (see references/dev-server.md)"
+    return 1
+  fi
+}
+
+check_cli() {
+  if [[ -f "$CLI_HOME/settings.json" ]] && grep -q "$SERVER_URL" "$CLI_HOME/settings.json"; then
+    ok "CLI logged in to $SERVER_URL (creds: apps/cli/.lobehub-dev)"
+  else
+    bad "CLI not logged in to $SERVER_URL"
+    note "ask the user to run:"
+    note "cd apps/cli && LOBEHUB_CLI_HOME=.lobehub-dev bun src/index.ts login --server $SERVER_URL"
+    return 1
+  fi
+}
+
+check_web() {
+  if [[ -f "$STATE_FILE" ]]; then
+    ok "web auth state saved ($STATE_FILE)"
+    note "live-verify: $0 web-verify"
+  else
+    bad "no web auth state for agent-browser"
+    note "copy the Cookie header from Chrome DevTools (Network tab), then:"
+    note "pbpaste | $0 web   (see references/auth.md)"
+    return 1
+  fi
+}
+
+check_electron() { # Electron sign-in check; skipped (returns 0) when CDP is unreachable
+  local probe result cdp_port="${CDP_PORT:-9222}"
+  if ! curl -s -o /dev/null --max-time 2 "http://localhost:$cdp_port/json/version" 2> /dev/null; then
+    note "electron: not running (CDP $cdp_port unreachable) — start with electron-dev.sh; check skipped"
+    return 0
+  fi
+  probe="$(dirname "${BASH_SOURCE[0]}")/app-probe.sh"
+  # Pin to the CDP port checked above; an exported AB_TARGET (e.g. a web session) would otherwise win.
+  result=$(AB_TARGET="--cdp $cdp_port" bash "$probe" auth 2> /dev/null || true)
+  # agent-browser eval returns the JSON string with escaped quotes — normalize.
+  result="${result//\\/}"
+  if [[ "$result" == *'"isSignedIn":true'* ]]; then
+    ok "electron app signed in ($result)"
+  else
+    bad "electron app NOT signed in ($result)"
+    note "log in once manually inside the app (state persists across restarts)"
+    return 1
+  fi
+}
+
+cmd_status() {
+  echo "agent-testing auth status (SERVER_URL=$SERVER_URL):"
+  local rc=0
+  check_server || rc=1
+  check_cli || rc=1
+  check_web || rc=1
+  check_electron || rc=1
+  if [[ $rc -eq 0 ]]; then
+    echo "all green — safe to start automated testing."
+  else
+    echo "auth NOT ready — fix the ✘ items before writing any test step."
+  fi
+  return $rc
+}
+
+cmd_cli() {
+  echo "Starting CLI device-code login against $SERVER_URL ..."
+  echo "(opens a browser authorization — must be run by a human in a terminal)"
+  cd "$REPO_ROOT/apps/cli"
+  LOBEHUB_CLI_HOME=.lobehub-dev bun src/index.ts login --server "$SERVER_URL"
+}
+
+# Build a Playwright storageState file from the raw Cookie header piped on stdin, keeping
+# only better-auth cookies (references/auth.md: why a Network-request header, why httpOnly=false).
+# `python3 -` reads its program from the heredoc on fd 0, so the piped header is passed on fd 3.
+cmd_web() {
+  mkdir -p "$AUTH_DIR"
+  python3 - "$STATE_FILE" 3<&0 << 'PY'
+import json, sys, time
+
+raw = open(3).read().strip()  # fd 3 = the caller's original stdin (fd 0 is this script)
+if raw.lower().startswith("cookie:"):
+    raw = raw.split(":", 1)[1].strip()
+
+WANTED = {"better-auth.session_token", "better-auth.state"}
+exp = int(time.time()) + 30 * 24 * 3600  # 30 days
+
+cookies = []
+for pair in raw.split("; "):
+    if "=" not in pair:
+        continue
+    name, _, value = pair.partition("=")
+    if name not in WANTED:
+        continue
+    cookies.append({
+        "name": name,
+        "value": value,
+        "domain": "localhost",
+        "path": "/",
+        "expires": exp,
+        "httpOnly": False,
+        "secure": False,
+        "sameSite": "Lax",
+    })
+
+if not cookies:
+    sys.stderr.write("no better-auth cookies found in input — paste the raw Cookie header from a Network request\n")
+    sys.exit(1)
+
+with open(sys.argv[1], "w") as f:
+    json.dump({"cookies": cookies, "origins": []}, f, indent=2)
+print(f"wrote {len(cookies)} cookie(s) to {sys.argv[1]}")
+PY
+  agent-browser --session "$SESSION" state load "$STATE_FILE"
+  cmd_web_verify
+}
+
+cmd_web_verify() {
+  agent-browser --session "$SESSION" open "$SERVER_URL/" > /dev/null
+  local url
+  url=$(agent-browser --session "$SESSION" get url)
+  if [[ "$url" == *"/signin"* || "$url" == *"/login"* ]]; then
+    bad "agent-browser session '$SESSION' NOT authenticated (landed on $url)"
+    note "re-copy the Cookie header and re-run: pbpaste | $0 web"
+    return 1
+  fi
+  ok "agent-browser session '$SESSION' authenticated (at $url)"
+}
+
+case "${1:-status}" in
+  status) cmd_status ;;
+  cli) cmd_cli ;;
+  web) cmd_web ;;
+  web-verify) cmd_web_verify ;;
+  *)
+    echo "Usage: $0 {status|cli|web|web-verify}" >&2
+    exit 2
+    ;;
+esac
@@ -0,0 +1,154 @@
+# Electron (LobeHub Desktop) UI Testing
+
+Default surface for verifying **pure frontend changes** (components, store logic, styles, interactions) in the primary product shape. Drives the Electron renderer over CDP with `agent-browser` — see [../references/agent-browser.md](../references/agent-browser.md) for the full command reference.
+
+**Auth**: the Electron app keeps its own persistent login state — log in once manually in the app; sessions survive restarts. Run `../scripts/setup-auth.sh status` before testing (see [../references/auth.md](../references/auth.md)).
+
+**Linux / headless (cloud)**: Electron itself runs on Linux, but it has no true headless mode — it needs a display server. In a headless environment wrap the launch with `xvfb-run` (virtual framebuffer). Everything CDP-based keeps working under Xvfb: the `agent-browser --cdp 9222` connection, snapshots, eval, and `agent-browser screenshot` (captured from the renderer via CDP, not the OS screen). What does NOT work on Linux: `capture-app-window.sh` (macOS `screencapture`), osascript, and the ffmpeg recording scripts in their current form.
+
+### Setup / Teardown
+
+Use the `electron-dev.sh` script to manage the Electron dev environment. It handles process lifecycle, waits for SPA readiness, and reliably kills all child processes (main + helpers + vite).
+
+```bash
+SCRIPT=".agents/skills/agent-testing/scripts/electron-dev.sh"
+
+# Start Electron dev with CDP (idempotent — skips if already running)
+$SCRIPT start
+
+# Check if Electron is running and CDP is reachable
+$SCRIPT status
+
+# Kill all Electron-related processes (main + helper + vite)
+$SCRIPT stop
+
+# Force fresh restart
+$SCRIPT restart
+```
+
+After `start` succeeds, connect with: `agent-browser --cdp 9222 snapshot -i`
+
+**Always run `$SCRIPT stop` when done testing** — `pkill -f "Electron"` alone won't catch all helper processes.
+
+#### Environment Variables
+
+| Variable          | Default                 | Description                              |
+| ----------------- | ----------------------- | ---------------------------------------- |
+| `CDP_PORT`        | `9222`                  | Chrome DevTools Protocol port            |
+| `ELECTRON_LOG`    | `/tmp/electron-dev.log` | Electron process log                     |
+| `ELECTRON_WAIT_S` | `60`                    | Max seconds to wait for Electron process |
+| `RENDERER_WAIT_S` | `60`                    | Max seconds to wait for SPA to load      |
+
+### LobeHub Probes & Quick Navigation
+
+`scripts/app-probe.sh` is the standard fast path into app state — **use it
+instead of hand-rolling `__LOBE_STORES` eval snippets** for these common needs:
+
+```bash
+PROBE=".agents/skills/agent-testing/scripts/app-probe.sh"
+
+$PROBE auth              # login check (Step 0.3) → { isSignedIn, userId }
+$PROBE route             # current SPA route
+$PROBE ops               # running chat operations (type / startTime)
+$PROBE goto /settings    # jump the SPA straight to a route (full reload)
+$PROBE errors-install    # install console.error interceptor
+$PROBE errors            # dump captured errors
+```
+
+`goto` lets a test enter the state under test directly instead of clicking
+through the UI. Common desktop routes:
+
+| Route                         | Where it lands                       |
+| ----------------------------- | ------------------------------------ |
+| `/`                           | Home (has a chat input)              |
+| `/agent/<agentId>`            | Agent conversation (latest topic)    |
+| `/agent/<agentId>/<topicId>`  | Specific topic in a conversation     |
+| `/task` · `/task/<taskId>`    | Task list / task detail              |
+| `/page`                       | Documents (文稿)                     |
+| `/settings`                   | Settings                             |
+| `/community`                  | Discover / community                 |
+
+Targets default to Electron (`--cdp 9222`); set `AB_TARGET="--session <name>"`
+for web sessions. For deeper or one-off state inspection, fall back to raw
+eval below.
+
+### LobeHub-Specific Patterns
+
+#### Access Zustand Store State
+
+```bash
+agent-browser --cdp 9222 eval --stdin << 'EVALEOF'
+(function() {
+  var chat = window.__LOBE_STORES.chat();
+  var ops = Object.values(chat.operations);
+  return JSON.stringify({
+    ops: ops.map(function(o) { return { type: o.type, status: o.status }; }),
+    activeAgent: chat.activeAgentId,
+    activeTopic: chat.activeTopicId,
+  });
+})()
+EVALEOF
+```
+
+#### Find and Use the Chat Input
+
+```bash
+# The chat input is contenteditable — must use -C flag
+agent-browser --cdp 9222 snapshot -i -C 2>&1 | grep "editable"
+
+agent-browser --cdp 9222 click @e48
+agent-browser --cdp 9222 type @e48 "Hello world"
+agent-browser --cdp 9222 press Enter
+```
+
+#### Wait for Agent to Complete
+
+```bash
+agent-browser --cdp 9222 eval --stdin << 'EVALEOF'
+(function() {
+  var chat = window.__LOBE_STORES.chat();
+  var ops = Object.values(chat.operations);
+  var running = ops.filter(function(o) { return o.status === 'running'; });
+  return running.length === 0 ? 'done' : 'running: ' + running.length;
+})()
+EVALEOF
+```
+
+#### Install Error Interceptor
+
+```bash
+agent-browser --cdp 9222 eval --stdin << 'EVALEOF'
+(function() {
+  window.__CAPTURED_ERRORS = [];
+  var orig = console.error;
+  console.error = function() {
+    var msg = Array.from(arguments).map(function(a) {
+      if (a instanceof Error) return a.message;
+      return typeof a === 'object' ? JSON.stringify(a) : String(a);
+    }).join(' ');
+    window.__CAPTURED_ERRORS.push(msg);
+    orig.apply(console, arguments);
+  };
+  return 'installed';
+})()
+EVALEOF
+
+# Later, check captured errors:
+agent-browser --cdp 9222 eval "JSON.stringify(window.__CAPTURED_ERRORS)"
+```
+
+## Electron Gotchas
+
+- **Always use `electron-dev.sh stop` to clean up** — `pkill -f "Electron"` only kills the main process; helper processes (GPU, renderer, network) survive. The script finds and kills all of them via PID matching against the project's electron binary path.
+- **`npx electron-vite dev` must run from `apps/desktop/`** — running from project root fails silently. The `electron-dev.sh` script handles this automatically.
+- **Dev build auto-opens DevTools, which hijacks the CDP target** — `agent-browser --cdp 9222` may attach to the DevTools page (`devtools://…`) instead of the app (`app://renderer/`). Symptom: `get url` returns a `devtools://` URL. Fix: close the DevTools target and reconnect:
+
+  ```bash
+  DT_ID=$(curl -s http://localhost:9222/json/list | python3 -c "import json,sys; ts=json.load(sys.stdin); print(next(t['id'] for t in ts if t['type']=='page' and t['url'].startswith('devtools://')))")
+  curl -s "http://localhost:9222/json/close/$DT_ID" > /dev/null
+  agent-browser close --all && agent-browser --cdp 9222 get url   # expect app://renderer/
+  ```
+
+- **Don't resize the Electron window after load** — resizing triggers full SPA reload
+- **Store is at `window.__LOBE_STORES`** not `window.__ZUSTAND_STORES__`
+- **Streaming / ticking UI needs GIF evidence** — see `scripts/record-gif.sh`; a static screenshot cannot prove time-based behavior.
@@ -0,0 +1,69 @@
+# Web (Full-Stack) Testing
+
+Default surface for **full-stack changes** — a new/changed API plus the UI that
+consumes it. The browser is the one surface where network requests and UI state
+are observable together, so you can assert both sides of the contract in a
+single run.
+
+For pure-frontend changes prefer [electron.md](./electron.md); for
+backend-only changes prefer [../cli/index.md](../cli/index.md).
+
+## Prerequisites
+
+- Local dev server running — [../references/dev-server.md](../references/dev-server.md)
+- Web auth injected into agent-browser — [../references/auth.md](../references/auth.md):
+
+```bash
+pbpaste | ./.agents/skills/agent-testing/scripts/setup-auth.sh web # after copying the Cookie header
+```
+
+## Option A — agent-browser with injected auth (recommended)
+
+```bash
+SESSION=lobehub-dev
+
+agent-browser --session $SESSION open "http://localhost:3010/"
+agent-browser --session $SESSION snapshot -i
+# interact via refs — full command reference: ../references/agent-browser.md
+```
+
+### Watch the API while driving the UI
+
+```bash
+# After triggering the UI action under test:
+agent-browser --session $SESSION network requests --type xhr,fetch
+agent-browser --session $SESSION network requests --method POST
+
+# Record a full HAR for the report
+agent-browser --session $SESSION network har start
+# ... drive the scenario ...
+agent-browser --session $SESSION network har stop ./capture.har
+```
+
+Assert both layers: the request/response shape (network) and the rendered
+result (snapshot/screenshot). Both belong in the report as evidence.
+
+## Option B — real Chrome with remote debugging
+
+For flows that need a real, visible browser (e.g. exercising the login UI
+itself):
+
+```bash
+/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome \
+  --remote-debugging-port=9222 \
+  --user-data-dir=/tmp/chrome-test-profile \
+  "<URL>" &
+sleep 5
+agent-browser --cdp 9222 snapshot -i
+
+# Or auto-discover running Chrome with remote debugging
+agent-browser --auto-connect snapshot -i
+```
+
+## Option C — Debug Proxy (local frontend, production backend)
+
+`bun run dev:spa` prints a **Debug Proxy** URL
+(`https://app.lobehub.com/_dangerous_local_dev_proxy?debug-host=…`) that loads
+your local Vite SPA inside the online environment — HMR against real server
+config. Useful for verifying frontend behavior against production data, **not**
+for testing backend changes (the backend is production, not your branch).
@@ -216,6 +216,6 @@ When using `--messages`, the output shows three sections (if context engine data

 ## Integration Points

- **Recording**: `src/server/services/agentRuntime/AgentRuntimeService.ts` — in the `executeStep()` method, after building `stepPresentationData`, writes partial snapshot in dev mode
- **Context engine capture**: `src/server/modules/AgentRuntime/RuntimeExecutors.ts` — in `call_llm` executor, after `serverMessagesEngine()` returns, calls `ctx.tracingContextEngine(input, output)`. `AgentRuntimeService.executeStep` buffers it per step and passes it to `traceRecorder.appendStep` as the typed `contextEngine` field (kept off the `events` array to stay out of Redis state).
+- **Recording**: `apps/server/src/services/agentRuntime/AgentRuntimeService.ts` — in the `executeStep()` method, after building `stepPresentationData`, writes partial snapshot in dev mode
+- **Context engine capture**: `apps/server/src/modules/AgentRuntime/RuntimeExecutors.ts` — in `call_llm` executor, after `serverMessagesEngine()` returns, calls `ctx.tracingContextEngine(input, output)`. `AgentRuntimeService.executeStep` buffers it per step and passes it to `traceRecorder.appendStep` as the typed `contextEngine` field (kept off the `events` array to stay out of Redis state).
 - **Store**: `FileSnapshotStore` reads/writes to `.agent-tracing/` relative to `process.cwd()`
@@ -271,7 +271,7 @@ Lists in the same file you may need to touch:

 - `defaultToolIds` — added to the agent's tool list by default
 - `alwaysOnToolIds` — forced on regardless of user selection (use sparingly)
- `runtimeManagedToolIds` — enable state controlled by runtime, not user UI; **must mirror the rules map** in `src/server/modules/Mecha/AgentToolsEngine/index.ts` and `src/helpers/toolEngineering/index.ts`
+- `runtimeManagedToolIds` — enable state controlled by runtime, not user UI; **must mirror the rules map** in `apps/server/src/modules/Mecha/AgentToolsEngine/index.ts` and `src/helpers/toolEngineering/index.ts`

 ---

@@ -1,171 +0,0 @@
---
-name: cli-backend-testing
-description: >
-  CLI + Backend integration testing workflow. Use when verifying backend API changes
-  (TRPC routers, services, models) via the LobeHub CLI against a local dev server.
-  Triggers on 'cli test', 'test with cli', 'verify with cli', 'local cli test',
-  'backend test with cli', or when needing to validate server-side changes end-to-end.
---
-
-# CLI + Backend Integration Testing
-
-Standard workflow for verifying backend changes using the LobeHub CLI (`lh`) against a local dev server.
-
-## When to Use
-
- Verifying TRPC router / service / model changes end-to-end
- Testing new API fields or response structure changes
- Validating CLI command output after backend modifications
- Debugging data flow issues between server and CLI
-
-## Prerequisites
-
-| Requirement  | Details                                                       |
-| ------------ | ------------------------------------------------------------- |
-| Dev server   | `localhost:3011` (Next.js)                                    |
-| CLI source   | `lobehub/apps/cli/`                                           |
-| CLI dev mode | Uses `LOBEHUB_CLI_HOME=.lobehub-dev` for isolated credentials |
-| Auth         | Device Code Flow login to local server                        |
-
-## Quick Reference
-
-All CLI dev commands run from `lobehub/apps/cli/`. Subsequent examples use `$CLI`:
-
-```bash
-CLI="LOBEHUB_CLI_HOME=.lobehub-dev bun src/index.ts"
-```
-
-## Workflow
-
-### Step 1: Ensure Dev Server is Running
-
-```bash
-curl -s -o /dev/null -w '%{http_code}' http://localhost:3011/ 2> /dev/null
-```
-
- **If reachable**: skip to Step 2.
- **If unreachable**: start from cloud repo root:
-
-```bash
-pnpm run dev:next
-```
-
-To **restart** (pick up server-side code changes):
-
-```bash
-lsof -ti:3011 | xargs kill
-pnpm run dev:next
-```
-
-**Important:** Server-side code changes in the submodule (`lobehub/src/server/`, `lobehub/packages/`) require a server restart. Next.js hot-reload may not pick up changes in submodule packages.
-
-### Step 2: Check CLI Authentication
-
-```bash
-cat lobehub/apps/cli/.lobehub-dev/settings.json 2> /dev/null
-```
-
- **If file exists and contains `"serverUrl": "http://localhost:3011"`**: skip to Step 3.
- **If missing or wrong server**: ask the user to run:
-
-```bash
-! cd lobehub/apps/cli && LOBEHUB_CLI_HOME=.lobehub-dev bun src/index.ts login --server http://localhost:3011
-```
-
-> Login requires interactive browser authorization (OIDC Device Code Flow), so the user must run it themselves via `!` prefix. Credentials persist in `lobehub/apps/cli/.lobehub-dev/`.
-
-### Step 3: Test with CLI Commands
-
-CLI runs from source, so CLI-side code changes take effect immediately without rebuilding.
-
-```bash
-cd lobehub/apps/cli
-$CLI <command>
-```
-
-### Step 4: Clean Up Test Data
-
-```bash
-$CLI task delete < id > -y
-$CLI agent delete < id > -y
-```
-
-## Common Testing Patterns
-
-### Task System
-
-```bash
-$CLI task list
-$CLI task create -n "Root Task" -i "Test instruction"
-$CLI task create -n "Child Task" -i "Sub instruction" --parent T-1
-$CLI task view T-1
-$CLI task tree T-1
-$CLI task edit T-1 --status running
-$CLI task comment T-1 -m "Test comment"
-$CLI task delete T-1 -y
-```
-
-### Agent System
-
-```bash
-$CLI agent list
-$CLI agent view <agent-id>
-$CLI agent run <agent-id> -m "Test prompt"
-```
-
-### Document & Knowledge Base
-
-```bash
-$CLI doc list
-$CLI doc create -t "Test Doc" -c "Content here"
-$CLI doc view <doc-id>
-$CLI kb list
-$CLI kb tree <kb-id>
-```
-
-### Model & Provider
-
-```bash
-$CLI model list
-$CLI provider list
-$CLI provider test <provider-id>
-```
-
-## Dev-Test Cycle
-
-```
-1. Make code changes (service/model/router/type)
-         |
-2. Run unit tests (fast feedback)
-   bunx vitest run --silent='passed-only' '<test-file>'
-         |
-3. Restart dev server (if server-side changes)
-   lsof -ti:3011 | xargs kill && pnpm run dev:next
-         |
-4. CLI verification (end-to-end)
-   $CLI <command>
-         |
-5. Clean up test data
-```
-
-### When Server Restart is Needed
-
-| Change Location                           | Restart? |
-| ----------------------------------------- | -------- |
-| `lobehub/src/server/` (routers, services) | Yes      |
-| `lobehub/packages/database/` (models)     | Yes      |
-| `lobehub/packages/types/`                 | Yes      |
-| `lobehub/packages/prompts/`               | Yes      |
-| `lobehub/apps/cli/` (CLI code)            | No       |
-| `src/` (cloud overrides)                  | Yes      |
-
-## Troubleshooting
-
-| Issue                       | Solution                                                              |
-| --------------------------- | --------------------------------------------------------------------- |
-| `No authentication found`   | Run `login --server http://localhost:3011`                            |
-| `UNAUTHORIZED` on API calls | Token expired; re-run login                                           |
-| `ECONNREFUSED`              | Dev server not running; start with `pnpm run dev:next`                |
-| CLI shows old data/behavior | Server needs restart to pick up code changes                          |
-| `EADDRINUSE` on port 3011   | Server already running; kill with `lsof -ti:3011 \| xargs kill`       |
-| Login opens wrong server    | Must use `--server http://localhost:3011` flag (env var doesn't work) |
@@ -111,7 +111,7 @@ Generate video from text prompt. This is an async operation.
 **Source**: `apps/cli/src/commands/generate/video.ts`

 ```bash
-lh gen video "A cat playing piano" -m <model> -p <provider> [options]
+lh gen video "A cat playing piano" -m <model> -p <provider> [options]
 ```

 | Option                      | Description              | Required |
@@ -259,13 +259,13 @@ Image and video generation use an async task pattern:
     UUID from the `async_tasks` table, not `gen_xxx`
   - Returns `{ status, error, generation }` (generation includes asset URLs on success)
   - Before querying, calls `checkTimeoutTasks` which marks tasks as `error` if they have been
-     `pending` or `processing` for more than ~5 minutes (`ASYNC_TASK_TIMEOUT = 298s`)
+     `pending` or `processing` for more than \~5 minutes (`ASYNC_TASK_TIMEOUT = 298s`)

 **Server routes**:

- `src/server/routers/lambda/image/index.ts` — image creation (uses `authedProcedure` + `serverDatabase`)
- `src/server/routers/lambda/video/index.ts` — video creation (uses `authedProcedure` + `serverDatabase`)
- `src/server/routers/lambda/generation.ts` — status checking
+- `apps/server/src/routers/lambda/image/index.ts` — image creation (uses `authedProcedure` + `serverDatabase`)
+- `apps/server/src/routers/lambda/video/index.ts` — video creation (uses `authedProcedure` + `serverDatabase`)
+- `apps/server/src/routers/lambda/generation.ts` — status checking
 - `packages/database/src/models/asyncTask.ts` — `AsyncTaskModel` including `checkTimeoutTasks`

 **Note**: Image/video routes do NOT use the `keyVaults` middleware — they read API keys from the database via `initModelRuntimeFromDB` or `createAsyncCaller`.
@@ -6,6 +6,66 @@ user-invocable: false

 # Database Migrations Guide

+## Development-stage schema changes
+
+Schema changes churn during feature development. When the schema changes before the migration has shipped, do not hand-edit the existing migration SQL to chase the new schema shape. Delete the draft migration artifacts added by this branch (SQL file, matching snapshot, and matching journal entry), then run the generator again and re-apply the normal migration review steps below.
+
+For example, if this branch's draft migration is `0110_add_verify_tables_and_ai_infra_id`:
+
+```bash
+# 1. Delete the draft SQL and its snapshot
+rm packages/database/migrations/0110_add_verify_tables_and_ai_infra_id.sql
+rm packages/database/migrations/meta/0110_snapshot.json
+
+# 2. Remove the matching 0110 entry from the journal's "entries" array
+#    packages/database/migrations/meta/_journal.json
+
+# 3. Regenerate from the current schema
+bun run db:generate
+```
+
+This keeps the generated SQL, snapshot, and journal aligned with the actual schema. Manual SQL edits are reserved for review-time hardening such as idempotent clauses, custom extension SQL, and meaningful filename/tag updates.
+
+Before release, if a feature branch accumulated multiple development-only migrations, consolidate them into one migration when possible. Production does not need to replay every intermediate draft shape, and fewer migrations reduce deploy-time risk.
+
+For example, if this branch added `0110`, `0111`, and `0112`, delete all three drafts and regenerate a single migration:
+
+```bash
+# 1. Delete every draft SQL and snapshot this branch added
+rm packages/database/migrations/011{0,1,2}_*.sql
+rm packages/database/migrations/meta/011{0,1,2}_snapshot.json
+
+# 2. Remove the 0110/0111/0112 entries from the journal's "entries" array
+#    packages/database/migrations/meta/_journal.json
+
+# 3. Regenerate one migration covering the full schema delta
+bun run db:generate
+```
+
+Do not make a migration compatible with earlier development-only versions of the same branch. While the migration has not shipped, there is no production history to preserve. Fix local/dev databases directly with whatever SQL is simplest (drop the draft table, rename a column, delete draft rows), then regenerate the branch migration from the current schema.
+
+For example, if an earlier draft on this branch created `signup_attempt_id` and you have since renamed it to `user_signup_log_id`, do not add a compatibility `ALTER ... RENAME` to the migration. Just fix the dev DB directly (see the `access-pg` skill for the `bun -e` + `pg` pattern), then regenerate:
+
+```bash
+# Fix the dev DB to match the new schema (simplest SQL wins)
+set -a && source .env && set +a && bun -e '
+import pg from "pg";
+const client = new pg.Client({ connectionString: process.env.DATABASE_URL });
+await client.connect();
+await client.query("ALTER TABLE user_signup_logs DROP COLUMN signup_attempt_id");
+await client.end();
+'
+
+# Regenerate so the migration reflects only the final shape
+bun run db:generate
+```
+
+After a migration has reached production or the target default branch, treat it as immutable: add a follow-up migration instead of rewriting it.
+
+## Rebase conflicts
+
+When a rebase conflicts in migration files, keep the upstream/default-branch migrations and remove all migrations introduced by the current feature branch. Complete the rebase, then regenerate this branch's migration from the rebased schema. This avoids merging two independent snapshots or hand-splicing journal entries.
+
 ## Step 1: Generate Migrations

 ```bash
@@ -57,7 +57,7 @@ process.env.DEBUG = 'lobe-*';
 ## Example

 ```typescript
-// src/server/routers/edge/market/index.ts
+// apps/server/src/routers/edge/market/index.ts
 import debug from 'debug';

 const log = debug('lobe-edge-router:market');
@@ -6,6 +6,14 @@ user-invocable: false

 # Drizzle ORM Schema Style Guide

+> **Adding a Model or Repository?** Ship a sibling test in the same PR — every new
+> file under `packages/database/src/models/**` or `src/repositories/**` needs a
+> matching `__tests__/<name>.test.ts`. See the **testing** skill
+> (`.agents/skills/testing/references/db-model-test.md`) for the `getTestDB()`
+> integration pattern, user-isolation tests, the BM25 `describe.skipIf(!isServerDB)`
+> guard, and schema gotchas. CI's coverage patch gate won't reliably catch a brand-new
+> untested file, so this is on you.
+
 ## Configuration

 - Config: `drizzle.config.ts`
@@ -25,16 +33,42 @@ Location: `packages/database/src/schemas/_helpers.ts`

 - **Tables**: Plural snake_case (`users`, `session_groups`)
 - **Columns**: snake_case (`user_id`, `created_at`)
+- **New tables**: Check nearby existing tables before naming a new one. Preserve
+  the established noun family and suffix. For example, if the user-scoped table
+  is `user_xxx_logs`, the workspace-scoped counterpart should be
+  `workspace_xxx_logs`, not `workspace_xxx_records` or another new synonym.
+
+```typescript
+// ✅ Good: follows the existing user/workspace table family.
+export const userSignupLogs = pgTable('user_signup_logs', { ... });
+export const workspaceSignupLogs = pgTable('workspace_signup_logs', { ... });
+
+// ❌ Bad: introduces a new suffix for the same concept.
+export const workspaceSignupRecords = pgTable('workspace_signup_records', { ... });
+```

 ## Column Definitions

 ### Primary Keys

+Do not use auto-incrementing primary keys (`serial`, `bigserial`, generated
+identity columns). They create sequence-state problems during cross-database
+migrations, restores, and data copy jobs. Prefer text IDs from application
+generators (`idGenerator`, `createNanoId`) or `uuid` for internal tables.
+
+Keep `$defaultFn(...)` when a table normally owns ID generation. Callers can
+still pass an explicit `id`; the default only runs when the insert omits it. Do
+not remove the default just because one flow needs to supply a request-scoped ID.
+
 ```typescript
+// ✅ Good: app-generated text ID; explicit inserts can still override it.
 id: text('id')
  .primaryKey()
  .$defaultFn(() => idGenerator('agents'))
  .notNull(),
+
+// ❌ Bad: sequence state is fragile across DB migrations and restores.
+id: serial('id').primaryKey(),
 ```

 ID prefixes make entity types distinguishable. For internal tables, use `uuid`.
@@ -53,6 +87,80 @@ userId: text('user_id')
 ...timestamps,  // Spread from _helpers.ts
 ```

+### Optional and Undefined Values
+
+Do not introduce artificial sentinel strings for missing values, such as
+`unknown`, unless the domain already has that explicit state and existing code
+uses it consistently. Prefer nullable columns, optional TypeScript fields, or a
+separate concrete status enum when the value is genuinely absent.
+
+```typescript
+// ✅ Good: absent until the final stage writes a real decision.
+export type UserSignupLogFinalDecision = 'allow' | 'block' | 'error';
+
+finalDecision: varchar('final_decision', { length: 32 }).$type<UserSignupLogFinalDecision>(),
+
+// ❌ Bad: invents a new state that callers now need to handle everywhere.
+export type UserSignupLogFinalDecision = 'allow' | 'block' | 'error' | 'unknown';
+
+finalDecision: varchar('final_decision', { length: 32 })
+  .$type<UserSignupLogFinalDecision>()
+  .notNull()
+  .default('unknown'),
+```
+
+### Field Descriptions
+
+For columns whose meaning is not obvious from the name alone, add JSDoc on the
+schema field. Include a concrete example when it clarifies the stored value or
+the lifecycle moment that writes it. This is especially important for external
+IDs, lifecycle statuses, denormalized snapshots, JSONB signals, and fields whose
+name could mean either a request ID or a persisted row ID.
+
+```typescript
+// ✅ Good: explain the table's business object first, then only document
+// non-obvious lifecycle or risk-control fields.
+/**
+ * User signup logs - one row per signup flow, collecting stage-level
+ * risk-control decisions before and after the auth provider creates a user.
+ */
+export const userSignupLogs = pgTable('user_signup_logs', {
+  /** Final signup outcome reason, for example user_created, llm_block, or guard_error */
+  finalReason: text('final_reason'),
+
+  /** Aggregated risk level derived from stage decisions, for example block -> high */
+  riskLevel: varchar('risk_level', { length: 16 }).$type<UserSignupLogRiskLevel>(),
+
+  /** Ordered stage-level decisions and metadata grouped by signup review stage */
+  stageResults: jsonb('stage_results').$type<UserSignupLogStageResults>(),
+});
+
+// ❌ Bad: comments restate obvious column names without adding domain meaning.
+/** User email */
+email: text('email'),
+```
+
+### JSONB Types
+
+Avoid `Record<string, unknown>` or similarly loose JSONB types for schema
+columns. Define a concrete interface that describes the expected JSON shape, even
+when most properties are optional. This keeps callers, migrations, and review
+queries aligned on the same data contract.
+
+```typescript
+interface UserSignupLogMetadata {
+  payloadPath?: string;
+  requestPath?: string;
+}
+
+metadata: jsonb('metadata').$type<UserSignupLogMetadata>(),
+```
+
+```typescript
+// ❌ Bad: hides the contract and makes downstream access untyped.
+metadata: jsonb('metadata').$type<Record<string, unknown>>(),
+```
+
 ### Indexes

 ```typescript
@@ -176,66 +284,52 @@ const rows = await this.db

 ### Raw SQL and Advanced Queries

-Prefer Drizzle builders whenever the query can be expressed clearly with `select`,
-`insert().select()`, `update().from()`, joins, CTEs, `groupBy`, and typed selected
-columns. This keeps table and column references tied to schema definitions, so
-schema changes are more likely to surface as TypeScript errors.
+Prefer Drizzle builders whenever the query reads clearly with `select`,
+`insert().select()`, `update().from()`, joins, CTEs, and `groupBy` — this keeps
+table/column references tied to schema, so changes surface as TypeScript errors.
+Within a builder, expression-level `sql<T>` is fine for features lacking a helper
+(JSON path, casts, aggregates, `CASE`, `NOW()`). Row locks are clauses, not
+expressions — use `.for('update')`, never raw `FOR UPDATE`.

-Expression-level `sql<T>` is fine inside a Drizzle builder for PostgreSQL features
-that do not have a dedicated helper, such as JSON path extraction, casts, aggregate
-expressions, `CASE`, `NOW()`, or advisory locks. Row locks are query clauses, not
-expressions; use the select builder's `.for('update')` instead of raw
-`FOR UPDATE` SQL fragments.
+Use `COALESCE` only when null-handling is part of required DB semantics (nullable
+JSONB append/merge, "keep first non-null"). Don't scatter
+`COALESCE(excluded.col, current.col)` across ordinary upsert scalars just to avoid
+an update object — build `set` from defined values only, and hide any remaining
+SQL behind named helpers (`appendJsonbArray`, `mergeJsonbObject`, `keepFirstValue`)
+so the method reads as business intent, not SQL plumbing.
+
+```typescript
+// ✅ Scalars included only when present; SQL hidden behind a named helper.
+const updateValues = compactUndefined({
+  email: record.email ?? undefined,
+  ip: record.ip ?? undefined,
+});
+await db.insert(userSignupLogs).values(values).onConflictDoUpdate({
+  set: { ...updateValues, stageResults: appendStageResult(stage, result), updatedAt: now },
+  target: userSignupLogs.id,
+});
+
+// ❌ Every scalar becomes SQL plumbing.
+set: {
+  email: sql`COALESCE(excluded.email, ${userSignupLogs.email})`,
+  ip: sql`COALESCE(excluded.ip, ${userSignupLogs.ip})`,
+}
+```

 When refactoring raw SQL:

- Preserve the original query shape for latency-sensitive paths. If raw SQL is one
-  database roundtrip, do not replace it with multiple depth-based queries just to
-  remove `execute`.
- Use `$with(...)` plus `insert().select()` / `update().from()` for multi-step
-  single-roundtrip writes when Drizzle can express the data flow.
- Avoid generic `execute<MyRow>(sql...)` as the main safety mechanism. It types the
-  returned rows, but it does not keep selected columns in sync with schema changes.
- If the only clean implementation is a PostgreSQL feature that Drizzle cannot
-  express well, keep the raw SQL and tighten it instead: use schema references in
-  interpolations, explicit user scope, a narrow row interface, and regression tests.
+- Preserve query shape on latency-sensitive paths. If raw SQL is one roundtrip,
+  don't split it into multiple depth-based queries just to drop `execute`.
+- Use `$with(...)` + `insert().select()` / `update().from()` for multi-step
+  single-roundtrip writes Drizzle can express.
+- Don't rely on `execute<MyRow>(sql...)` for safety — it types rows but doesn't keep
+  selected columns in sync with schema changes.
+- If the only clean implementation needs a PostgreSQL feature Drizzle can't express,
+  keep the raw SQL and tighten it: schema refs in interpolations, explicit user
+  scope, a narrow row interface, and regression tests.

-Recursive CTEs are a special case: current Drizzle usage in this repo does not have
-a clean `WITH RECURSIVE` builder pattern. Keep recursive CTE raw SQL when replacing
-it would add extra database roundtrips or materially worsen performance.
-
-Example: convert an aggregate query when Drizzle can preserve one roundtrip:
-
-```typescript
-// ✅ Good: builder owns table and column references; sql<T> stays expression-level.
-const rows = await trx
-  .select({
-    model: messages.model,
-    provider: messages.provider,
-    totalCost: sql<string | null>`sum((${messages.metadata}->'usage'->>'cost')::numeric)`.as(
-      'totalCost',
-    ),
-  })
-  .from(messages)
-  .where(
-    and(
-      eq(messages.topicId, topicId),
-      eq(messages.userId, userId),
-      eq(messages.role, 'assistant'),
-      sql`${messages.metadata} ? 'usage'`,
-    ),
-  )
-  .groupBy(messages.provider, messages.model);
-```
-
-Example: use the select lock builder for row locks:
-
-```typescript
-const [user] = await trx.select().from(users).where(eq(users.id, userId)).for('update');
-```
-
-Example: keep a recursive CTE raw when replacing it would add depth-based DB
-roundtrips:
+Recursive CTEs are the canonical "keep raw" case — there's no clean `WITH RECURSIVE`
+builder, and a rewrite would add depth-based roundtrips:

 ```typescript
 interface TaskTreeRow {
@@ -243,15 +337,13 @@ interface TaskTreeRow {
  parent_task_id: string | null;
 }

-// execute<T> is acceptable here only because Drizzle has no clean WITH RECURSIVE
-// builder; a builder rewrite would add depth-based roundtrips. Keep schema refs in
-// the interpolations and scope every leg to the user.
+// execute<T> acceptable: no clean WITH RECURSIVE builder. Keep schema refs in the
+// interpolations and scope every leg to the user.
 const { rows } = await db.execute<TaskTreeRow>(sql`
  WITH RECURSIVE task_tree AS (
    SELECT ${tasks.id}, ${tasks.parentTaskId}
    FROM ${tasks}
-    WHERE ${tasks.id} = ${rootTaskId}
-      AND ${tasks.createdByUserId} = ${userId}
+    WHERE ${tasks.id} = ${rootTaskId} AND ${tasks.createdByUserId} = ${userId}
    UNION ALL
    SELECT ${tasks.id}, ${tasks.parentTaskId}
    FROM ${tasks}
@@ -241,6 +241,6 @@ When the bug comes from a real trace, distill it into the closest existing test
 3. Add or update the narrowest failing test near the broken layer.
 4. Fix the smallest layer that can explain the symptom.
 5. Re-run focused tests.
-6. Only then do an Electron smoke test with the `local-testing` skill if UI confirmation is still needed.
+6. Only then do an Electron smoke test with the `agent-testing` skill if UI confirmation is still needed.

 Do not start with a broad Electron repro if a raw trace or adapter test can prove the fault zone faster.
@@ -1,561 +0,0 @@
---
-name: local-testing
-description: >
-  Local app and bot testing. Uses agent-browser CLI for Electron/web app UI testing,
-  and osascript (AppleScript) for controlling native macOS apps (WeChat, Discord, Telegram, Slack, Lark/飞书, QQ)
-  to test bots. Triggers on 'local test', 'test in electron', 'test desktop', 'test bot',
-  'bot test', 'test in discord', 'test in telegram', 'test in slack', 'test in weixin',
-  'test in wechat', 'test in lark', 'test in feishu', 'test in qq',
-  'manual test', 'osascript', or UI/bot verification tasks.
---
-
-# Local App & Bot Testing
-
-Two approaches for local testing on macOS:
-
-| Approach                    | Tool                | Best For                                             |
-| --------------------------- | ------------------- | ---------------------------------------------------- |
-| **agent-browser + CDP**     | `agent-browser` CLI | Electron apps, web apps (DOM access, JS eval)        |
-| **osascript (AppleScript)** | `osascript -e`      | Native macOS apps (WeChat, Discord, Telegram, Slack) |
-
---
-
-# Part 1: agent-browser (Electron / Web Apps)
-
-Use `agent-browser` to automate Chromium-based apps via Chrome DevTools Protocol.
-
-Install via `npm i -g agent-browser`, `brew install agent-browser`, or `cargo install agent-browser`. Run `agent-browser install` to download Chrome. Run `agent-browser upgrade` to update.
-
-## Core Workflow
-
-Every browser automation follows this pattern:
-
-1. **Navigate**: `agent-browser open <url>`
-2. **Snapshot**: `agent-browser snapshot -i` (get element refs like `@e1`, `@e2`)
-3. **Interact**: Use refs to click, fill, select
-4. **Re-snapshot**: After navigation or DOM changes, get fresh refs
-
-```bash
-agent-browser open https://example.com/form
-agent-browser snapshot -i
-# Output: @e1 [input type="email"], @e2 [input type="password"], @e3 [button] "Submit"
-
-agent-browser fill @e1 "user@example.com"
-agent-browser fill @e2 "password123"
-agent-browser click @e3
-agent-browser wait --load networkidle
-agent-browser snapshot -i # Check result
-```
-
-## Command Chaining
-
-```bash
-# Chain open + wait + snapshot in one call
-agent-browser open https://example.com && agent-browser wait --load networkidle && agent-browser snapshot -i
-```
-
-Use `&&` when you don't need to read intermediate output. Run commands separately when you need to parse output first (e.g., snapshot to discover refs, then interact).
-
-## Essential Commands
-
-```bash
-# Navigation
-agent-browser open <url>              # Navigate (aliases: goto, navigate)
-agent-browser close                   # Close browser
-agent-browser close --all             # Close all active sessions
-
-# Snapshot
-agent-browser snapshot -i             # Interactive elements with refs (recommended)
-agent-browser snapshot -s "#selector" # Scope to CSS selector
-
-# Interaction (use @refs from snapshot)
-agent-browser click @e1               # Click element
-agent-browser click @e1 --new-tab     # Click and open in new tab
-agent-browser fill @e2 "text"         # Clear and type text
-agent-browser type @e2 "text"         # Type without clearing
-agent-browser select @e1 "option"     # Select dropdown option
-agent-browser check @e1               # Check checkbox
-agent-browser press Enter             # Press key
-agent-browser keyboard type "text"    # Type at current focus (no selector)
-agent-browser keyboard inserttext "text"  # Insert without key events
-agent-browser scroll down 500         # Scroll page
-agent-browser scroll down 500 --selector "div.content"  # Scroll within container
-
-# Get information
-agent-browser get text @e1            # Get element text
-agent-browser get url                 # Get current URL
-agent-browser get title               # Get page title
-agent-browser get cdp-url             # Get CDP WebSocket URL
-
-# Wait
-agent-browser wait @e1                # Wait for element
-agent-browser wait --load networkidle # Wait for network idle
-agent-browser wait --url "**/page"    # Wait for URL pattern
-agent-browser wait 2000               # Wait milliseconds
-agent-browser wait --text "Welcome"   # Wait for text to appear
-agent-browser wait --fn "!document.body.innerText.includes('Loading...')"  # Wait for text to disappear
-agent-browser wait "#spinner" --state hidden  # Wait for element to disappear
-
-# Downloads
-agent-browser download @e1 ./file.pdf          # Click element to trigger download
-agent-browser wait --download ./output.zip     # Wait for any download to complete
-
-# Network
-agent-browser network requests                 # Inspect tracked requests
-agent-browser network requests --type xhr,fetch  # Filter by resource type
-agent-browser network requests --method POST   # Filter by HTTP method
-agent-browser network route "**/api/*" --abort # Block matching requests
-agent-browser network har start                # Start HAR recording
-agent-browser network har stop ./capture.har   # Stop and save HAR file
-
-# Viewport & Device Emulation
-agent-browser set viewport 1920 1080          # Set viewport size (default: 1280x720)
-agent-browser set viewport 1920 1080 2        # 2x retina
-agent-browser set device "iPhone 14"          # Emulate device (viewport + user agent)
-
-# Capture
-agent-browser screenshot              # Screenshot to temp dir
-agent-browser screenshot --full       # Full page screenshot
-agent-browser screenshot --annotate   # Annotated screenshot with numbered element labels
-agent-browser pdf output.pdf          # Save as PDF
-
-# Clipboard
-agent-browser clipboard read          # Read text from clipboard
-agent-browser clipboard write "text"  # Write text to clipboard
-agent-browser clipboard copy          # Copy current selection
-agent-browser clipboard paste         # Paste from clipboard
-
-# Dialogs (alert, confirm, prompt, beforeunload)
-agent-browser dialog accept           # Accept dialog
-agent-browser dialog accept "input"   # Accept prompt dialog with text
-agent-browser dialog dismiss          # Dismiss/cancel dialog
-agent-browser dialog status           # Check if dialog is open
-
-# Diff (compare page states)
-agent-browser diff snapshot                        # Compare current vs last snapshot
-agent-browser diff screenshot --baseline before.png  # Visual pixel diff
-agent-browser diff url <url1> <url2>               # Compare two pages
-
-# Streaming
-agent-browser stream enable           # Start WebSocket streaming
-agent-browser stream status           # Inspect streaming state
-agent-browser stream disable          # Stop streaming
-```
-
-## Batch Execution
-
-```bash
-echo '[
-  ["open", "https://example.com"],
-  ["snapshot", "-i"],
-  ["click", "@e1"],
-  ["screenshot", "result.png"]
-]' | agent-browser batch --json
-```
-
-## Authentication
-
-```bash
-# Option 1: Auth vault (credentials stored encrypted)
-echo "$PASSWORD" | agent-browser auth save myapp --url https://app.example.com/login --username user --password-stdin
-agent-browser auth login myapp
-
-# Option 2: Session name (auto-save/restore cookies + localStorage)
-agent-browser --session-name myapp open https://app.example.com/login
-agent-browser close                                                       # State auto-saved
-agent-browser --session-name myapp open https://app.example.com/dashboard # Auto-restored
-
-# Option 3: Persistent profile
-agent-browser --profile ~/.myapp open https://app.example.com/login
-
-# Option 4: State file
-agent-browser state save auth.json
-agent-browser state load auth.json
-```
-
-### LobeHub dev server — inject better-auth cookie
-
-`agent-browser --headed` on macOS can create an off-screen Chromium window, blocking manual login. For a local LobeHub dev server (e.g. `localhost:3011`), copy the `better-auth.session_token` cookie out of a **Network request** in the user's own Chrome DevTools and load it via `state load`. See [references/agent-browser-login.md](./references/agent-browser-login.md) for the full recipe.
-
-## Semantic Locators (Alternative to Refs)
-
-```bash
-agent-browser find text "Sign In" click
-agent-browser find label "Email" fill "user@test.com"
-agent-browser find role button click --name "Submit"
-agent-browser find placeholder "Search" type "query"
-agent-browser find testid "submit-btn" click
-```
-
-## JavaScript Evaluation (eval)
-
-```bash
-# Simple expressions
-agent-browser eval 'document.title'
-
-# Complex JS: use --stdin with heredoc (RECOMMENDED)
-agent-browser eval --stdin << 'EVALEOF'
-JSON.stringify(
-  Array.from(document.querySelectorAll("img"))
-    .filter(i => !i.alt)
-    .map(i => ({ src: i.src.split("/").pop(), width: i.width }))
-)
-EVALEOF
-
-# Base64 encoding (avoids all shell escaping issues)
-agent-browser eval -b "$(echo -n 'document.title' | base64)"
-```
-
-## Ref Lifecycle
-
-Refs (`@e1`, `@e2`, etc.) are invalidated when the page changes. Always re-snapshot after clicking links/buttons that navigate, form submissions, or dynamic content loading.
-
-## Annotated Screenshots (Vision Mode)
-
-```bash
-agent-browser screenshot --annotate
-# Output includes the image path and a legend:
-#   [1] @e1 button "Submit"
-#   [2] @e2 link "Home"
-agent-browser click @e2 # Click using ref from annotated screenshot
-```
-
-## Parallel Sessions
-
-```bash
-agent-browser --session site1 open https://site-a.com
-agent-browser --session site2 open https://site-b.com
-agent-browser session list
-```
-
-## Connect to Existing Chrome
-
-```bash
-agent-browser --auto-connect snapshot # Auto-discover running Chrome
-agent-browser --cdp 9222 snapshot     # Explicit CDP port
-```
-
-## iOS Simulator (Mobile Safari)
-
-```bash
-agent-browser device list
-agent-browser -p ios --device "iPhone 16 Pro" open https://example.com
-agent-browser -p ios snapshot -i
-agent-browser -p ios tap @e1
-agent-browser -p ios swipe up
-agent-browser -p ios screenshot mobile.png
-agent-browser -p ios close
-```
-
-## Observability Dashboard
-
-```bash
-agent-browser dashboard install
-agent-browser dashboard start # Background server on port 4848
-agent-browser dashboard stop
-```
-
-## Cloud Providers
-
-Use `-p <provider>` to run against cloud browsers: `agentcore`, `browserbase`, `browserless`, `browseruse`, `kernel`.
-
-## Browser Engine Selection
-
-```bash
-agent-browser --engine lightpanda open example.com # 10x faster, 10x less memory
-```
-
-## Electron (LobeHub Desktop)
-
-### Setup / Teardown
-
-Use the `electron-dev.sh` script to manage the Electron dev environment. It handles process lifecycle, waits for SPA readiness, and reliably kills all child processes (main + helpers + vite).
-
-```bash
-SCRIPT=".agents/skills/local-testing/scripts/electron-dev.sh"
-
-# Start Electron dev with CDP (idempotent — skips if already running)
-$SCRIPT start
-
-# Check if Electron is running and CDP is reachable
-$SCRIPT status
-
-# Kill all Electron-related processes (main + helper + vite)
-$SCRIPT stop
-
-# Force fresh restart
-$SCRIPT restart
-```
-
-After `start` succeeds, connect with: `agent-browser --cdp 9222 snapshot -i`
-
-**Always run `$SCRIPT stop` when done testing** — `pkill -f "Electron"` alone won't catch all helper processes.
-
-#### Environment Variables
-
-| Variable          | Default                 | Description                              |
-| ----------------- | ----------------------- | ---------------------------------------- |
-| `CDP_PORT`        | `9222`                  | Chrome DevTools Protocol port            |
-| `ELECTRON_LOG`    | `/tmp/electron-dev.log` | Electron process log                     |
-| `ELECTRON_WAIT_S` | `60`                    | Max seconds to wait for Electron process |
-| `RENDERER_WAIT_S` | `60`                    | Max seconds to wait for SPA to load      |
-
-### LobeHub-Specific Patterns
-
-#### Access Zustand Store State
-
-```bash
-agent-browser --cdp 9222 eval --stdin << 'EVALEOF'
-(function() {
-  var chat = window.__LOBE_STORES.chat();
-  var ops = Object.values(chat.operations);
-  return JSON.stringify({
-    ops: ops.map(function(o) { return { type: o.type, status: o.status }; }),
-    activeAgent: chat.activeAgentId,
-    activeTopic: chat.activeTopicId,
-  });
-})()
-EVALEOF
-```
-
-#### Find and Use the Chat Input
-
-```bash
-# The chat input is contenteditable — must use -C flag
-agent-browser --cdp 9222 snapshot -i -C 2>&1 | grep "editable"
-
-agent-browser --cdp 9222 click @e48
-agent-browser --cdp 9222 type @e48 "Hello world"
-agent-browser --cdp 9222 press Enter
-```
-
-#### Wait for Agent to Complete
-
-```bash
-agent-browser --cdp 9222 eval --stdin << 'EVALEOF'
-(function() {
-  var chat = window.__LOBE_STORES.chat();
-  var ops = Object.values(chat.operations);
-  var running = ops.filter(function(o) { return o.status === 'running'; });
-  return running.length === 0 ? 'done' : 'running: ' + running.length;
-})()
-EVALEOF
-```
-
-#### Install Error Interceptor
-
-```bash
-agent-browser --cdp 9222 eval --stdin << 'EVALEOF'
-(function() {
-  window.__CAPTURED_ERRORS = [];
-  var orig = console.error;
-  console.error = function() {
-    var msg = Array.from(arguments).map(function(a) {
-      if (a instanceof Error) return a.message;
-      return typeof a === 'object' ? JSON.stringify(a) : String(a);
-    }).join(' ');
-    window.__CAPTURED_ERRORS.push(msg);
-    orig.apply(console, arguments);
-  };
-  return 'installed';
-})()
-EVALEOF
-
-# Later, check captured errors:
-agent-browser --cdp 9222 eval "JSON.stringify(window.__CAPTURED_ERRORS)"
-```
-
-## Chrome / Web Apps
-
-```bash
-/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome \
-  --remote-debugging-port=9222 \
-  --user-data-dir=/tmp/chrome-test-profile \
-  "<URL>" &
-sleep 5
-agent-browser --cdp 9222 snapshot -i
-
-# Or auto-discover running Chrome with remote debugging
-agent-browser --auto-connect snapshot -i
-```
-
---
-
-# Part 2: osascript (Native macOS App Bot Testing)
-
-Use AppleScript via `osascript` to control native macOS desktop apps for bot testing. Works with any app that supports macOS Accessibility, no CDP or Chromium needed.
-
-The pattern is the same for every platform:
-
-1. **Activate** the app (`tell application "X" to activate`)
-2. **Navigate** to a channel/chat (Quick Switcher `Cmd+K` or Search `Cmd+F`)
-3. **Send** a message (clipboard paste `Cmd+V` + Enter)
-4. **Wait** for the bot response
-5. **Screenshot** for verification (`screencapture` + `Read` tool)
-
-## Per-Platform References
-
-Pick the file for your target platform — each contains activation, navigation, send-message, and verification snippets specific to that app:
-
-Each channel has its own folder under `bot/<channel>/` containing an `index.md`
-(activation, navigation, send-message, and verification snippets specific to
-that app) and its test script:
-
-| Platform      | Reference                                        | Quick switcher |
-| ------------- | ------------------------------------------------ | -------------- |
-| Discord       | [bot/discord/index.md](./bot/discord/index.md)   | `Cmd+K`        |
-| Slack         | [bot/slack/index.md](./bot/slack/index.md)       | `Cmd+K`        |
-| Telegram      | [bot/telegram/index.md](./bot/telegram/index.md) | `Cmd+F`        |
-| WeChat / 微信 | [bot/wechat/index.md](./bot/wechat/index.md)     | `Cmd+F`        |
-| Lark / 飞书   | [bot/lark/index.md](./bot/lark/index.md)         | `Cmd+K`        |
-| QQ            | [bot/qq/index.md](./bot/qq/index.md)             | `Cmd+F`        |
-
-For **shared osascript patterns** (activate, type, paste, screenshot, read accessibility, common workflow template, gotchas), see [bot/osascript-common.md](./bot/osascript-common.md). Read this first if you're new to osascript automation.
-
-## Bridge-based channels (no native app)
-
-Some channels have no native app to drive with osascript — they connect through
-a local bridge inside the Desktop app. These are tested with agent-browser
-(IPC + UI) plus the bridge's own HTTP/REST endpoints, not osascript:
-
-| Channel  | Reference                                        | What it drives                                           |
-| -------- | ------------------------------------------------ | -------------------------------------------------------- |
-| iMessage | [bot/imessage/index.md](./bot/imessage/index.md) | `imessageBridge.*` IPC + local bridge + BlueBubbles REST |
-
-For iMessage there is a one-shot regression script — see `test-imessage-bridge.sh` below.
-
---
-
-# Scripts
-
-**App / recording scripts** in `.agents/skills/local-testing/scripts/`:
-
-| Script                    | Usage                                               |
-| ------------------------- | --------------------------------------------------- |
-| `electron-dev.sh`         | Manage Electron dev env (start/stop/status/restart) |
-| `record-electron-demo.sh` | Record Electron app demo with ffmpeg                |
-| `record-app-screen.sh`    | Record app screen (video + screenshots, start/stop) |
-
-**Bot scripts** live under `.agents/skills/local-testing/bot/`, one folder per
-channel (alongside that channel's `index.md`). The shared
-`capture-app-window.sh` sits at the `bot/` root:
-
-| Script                             | Usage                                                               |
-| ---------------------------------- | ------------------------------------------------------------------- |
-| `capture-app-window.sh`            | Capture screenshot of a specific app window (used by bot tests)     |
-| `discord/test-discord-bot.sh`      | Send message to Discord bot via osascript                           |
-| `slack/test-slack-bot.sh`          | Send message to Slack bot via osascript                             |
-| `telegram/test-telegram-bot.sh`    | Send message to Telegram bot via osascript                          |
-| `wechat/test-wechat-bot.sh`        | Send message to WeChat bot via osascript                            |
-| `lark/test-lark-bot.sh`            | Send message to Lark / 飞书 bot via osascript                       |
-| `qq/test-qq-bot.sh`                | Send message to QQ bot via osascript                                |
-| `imessage/test-imessage-bridge.sh` | Regression-test the iMessage BlueBubbles bridge (IPC + HTTP)        |
-| `imessage/send-imessage-test.sh`   | Send one real iMessage (desktop → BB → iMessage) and verify it sent |
-
-### Window Screenshot Utility
-
-`capture-app-window.sh` captures a screenshot of a specific app window using `screencapture -l <windowID>`. It uses Swift + CGWindowList to find the window by process name, so screenshots work correctly even when the window is on an external monitor or behind other windows.
-
-```bash
-# Standalone usage
-./.agents/skills/local-testing/bot/capture-app-window.sh "Discord" /tmp/discord.png
-./.agents/skills/local-testing/bot/capture-app-window.sh "Slack" /tmp/slack.png
-./.agents/skills/local-testing/bot/capture-app-window.sh "WeChat" /tmp/wechat.png
-```
-
-All bot test scripts use this utility automatically for their screenshots.
-
-### Bot Test Scripts
-
-All bot test scripts share the same interface:
-
-```bash
-./scripts/test-<platform>-bot.sh <channel_or_contact> <message> [wait_seconds] [screenshot_path]
-```
-
-Examples:
-
-```bash
-# Discord — test a bot in #bot-testing channel
-./.agents/skills/local-testing/bot/discord/test-discord-bot.sh "bot-testing" "!ping"
-./.agents/skills/local-testing/bot/discord/test-discord-bot.sh "bot-testing" "/ask Tell me a joke" 30
-
-# Slack — test a bot in #bot-testing channel
-./.agents/skills/local-testing/bot/slack/test-slack-bot.sh "bot-testing" "@mybot hello"
-./.agents/skills/local-testing/bot/slack/test-slack-bot.sh "bot-testing" "/ask What is 2+2?" 20
-
-# Telegram — test a bot by username
-./.agents/skills/local-testing/bot/telegram/test-telegram-bot.sh "MyTestBot" "/start"
-./.agents/skills/local-testing/bot/telegram/test-telegram-bot.sh "GPTBot" "Hello" 60
-
-# WeChat — test a bot or send to a contact
-./.agents/skills/local-testing/bot/wechat/test-wechat-bot.sh "文件传输助手" "test message" 5
-./.agents/skills/local-testing/bot/wechat/test-wechat-bot.sh "MyBot" "Tell me a joke" 30
-
-# Lark/飞书 — test a bot in a group chat
-./.agents/skills/local-testing/bot/lark/test-lark-bot.sh "bot-testing" "@MyBot hello"
-./.agents/skills/local-testing/bot/lark/test-lark-bot.sh "bot-testing" "Help me with this" 30
-
-# QQ — test a bot in a group or direct chat
-./.agents/skills/local-testing/bot/qq/test-qq-bot.sh "bot-testing" "Hello bot" 15
-./.agents/skills/local-testing/bot/qq/test-qq-bot.sh "MyBot" "/help" 10
-```
-
-Each script: activates the app, navigates to the channel/contact, pastes the message via clipboard, sends, waits, and takes a screenshot. Use the `Read` tool on the screenshot for visual verification.
-
-### iMessage bridge regression script
-
-`test-imessage-bridge.sh` does **not** follow the osascript bot interface — it
-drives the Desktop bridge's IPC + HTTP layers and asserts the result, then
-self-cleans. Needs BlueBubbles running and Electron up with CDP.
-
-```bash
-./.agents/skills/local-testing/bot/imessage/test-imessage-bridge.sh '<bluebubbles_password>' [bb_url] [cdp_port]
-# defaults: bb_url=http://127.0.0.1:1234  cdp_port=9222 — exit 0 = all green
-```
-
-It guards the connect/configure flow (testConfig happy + reject paths, first-time
-`upsertConfig` save, bridge running + webhook registered, local-server secret
-enforcement). See [bot/imessage/index.md](./bot/imessage/index.md)
-for the full manual UI flow and known bugs.
-
---
-
-# Screen Recording
-
-Record automated demos using `record-app-screen.sh` (start/stop lifecycle, CDP screenshots + ffmpeg assembly). See [references/record-app-screen.md](references/record-app-screen.md) for full documentation.
-
-```bash
-./.agents/skills/local-testing/scripts/electron-dev.sh start
-./.agents/skills/local-testing/scripts/record-app-screen.sh start my-demo
-# ... run automation ...
-./.agents/skills/local-testing/scripts/record-app-screen.sh stop
-```
-
-Outputs to `.records/` directory (gitignored): `<name>.mp4` (video) + `<name>/` (screenshots every 3s).
-
---
-
-# Gotchas
-
-### agent-browser
-
- **Daemon can get stuck** — if commands hang, `agent-browser close --all` or `pkill -f agent-browser` to reset
- **HMR invalidates everything** — after code changes, refs break. Re-snapshot or restart
- **`snapshot -i` doesn't find contenteditable** — use `snapshot -i -C` for rich text editors
- **`fill` doesn't work on contenteditable** — use `type` for chat inputs
- **Screenshots go to `~/.agent-browser/tmp/screenshots/`** — read them with the `Read` tool
- **Dialogs block all commands** — if commands time out, check `agent-browser dialog status`
- **Default timeout is 25s** — override with `AGENT_BROWSER_DEFAULT_TIMEOUT` (ms) or use explicit waits
- **Shell quoting corrupts eval** — use `eval --stdin <<'EVALEOF'` for complex JS
-
-### Electron-specific
-
- **Always use `electron-dev.sh stop` to clean up** — `pkill -f "Electron"` only kills the main process; helper processes (GPU, renderer, network) survive. The script finds and kills all of them via PID matching against the project's electron binary path.
- **`npx electron-vite dev` must run from `apps/desktop/`** — running from project root fails silently. The `electron-dev.sh` script handles this automatically.
- **Don't resize the Electron window after load** — resizing triggers full SPA reload
- **Store is at `window.__LOBE_STORES`** not `window.__ZUSTAND_STORES__`
-
-### osascript
-
-See [bot/osascript-common.md](./bot/osascript-common.md#gotchas) for the full osascript gotchas list (accessibility permissions, `keystroke` non-ASCII issues, locale-specific app names, rate limiting, etc.).
@@ -1,110 +0,0 @@
-# Log `agent-browser` into a local LobeHub dev server
-
-`agent-browser --headed` on macOS often creates the Chromium window off-screen — the user can't see or interact with it, so manual login inside the agent-browser session fails. Instead of sharing the user's real Chrome profile, copy the **better-auth session cookie** out of a request in DevTools and inject it into the agent-browser session as a Playwright-style state file.
-
-## When to use
-
- You need `agent-browser` to reach an authenticated page on `http://localhost:<port>` (e.g. `localhost:3011`).
- The user already has a logged-in tab of the same dev server in their own Chrome.
- Spawning a headed Chromium to let the user log in manually is unreliable (window off-screen, no interaction).
-
-Do **not** use this on production URLs — only local dev. Treat the cookie as a secret: don't paste it into shared logs, PRs, or commit it anywhere.
-
-## Step 1 — Ask the user to copy the cookie from a Network request, NOT `document.cookie`
-
-`document.cookie` will not return HttpOnly cookies, which is exactly where better-auth puts its session. Instruct the user:
-
-1. Open the logged-in tab (`http://localhost:<port>/…`) in their own Chrome.
-2. `Cmd+Option+I` → **Network** tab.
-3. Refresh, click any same-origin request (e.g. the top-level document request).
-4. In the right pane under **Request Headers**, right-click the `Cookie:` line → **Copy value** (or copy the entire header).
-5. Paste the string into chat.
-
-You only need the better-auth pieces. Everything else (Clerk, `LOBE_LOCALE`, HMR hash, theme vars) is noise and can stay. The minimum viable set is:
-
-```
-better-auth.session_token=<value>; better-auth.state=<value>
-```
-
-## Step 2 — Build a Playwright-style state file
-
-`agent-browser state load` expects Playwright's `storageState` format: a JSON with a `cookies` array and an `origins` array.
-
-```bash
-cat > /tmp/mkstate.py << 'PY'
-import json, sys, time
-
-# Read the Cookie header from stdin (allows optional "Cookie: " prefix).
-raw = sys.stdin.read().strip()
-if raw.lower().startswith("cookie:"):
-    raw = raw.split(":", 1)[1].strip()
-
-# Keep only better-auth cookies. Extend this set if the app genuinely needs more.
-WANTED = {"better-auth.session_token", "better-auth.state"}
-
-cookies = []
-exp = int(time.time()) + 30 * 24 * 3600  # 30 days
-for pair in raw.split("; "):
-    if "=" not in pair:
-        continue
-    name, _, value = pair.partition("=")
-    if name not in WANTED:
-        continue
-    cookies.append({
-        "name": name,
-        "value": value,
-        "domain": "localhost",
-        "path": "/",
-        "expires": exp,
-        "httpOnly": False,
-        "secure": False,
-        "sameSite": "Lax",
-    })
-
-if not cookies:
-    sys.stderr.write("no better-auth cookies found in input\n")
-    sys.exit(1)
-
-print(json.dumps({"cookies": cookies, "origins": []}, indent=2))
-PY
-
-# Feed the copied Cookie header in via env var or heredoc.
-printf '%s' "$COOKIE_HEADER" | python3 /tmp/mkstate.py > /tmp/state.json
-```
-
-**Note on `httpOnly`**: the real cookie in the user's browser is HttpOnly, but `storageState` doesn't enforce the flag on load — it just attaches the value. Storing with `httpOnly: false` is fine for local dev and sidesteps a CDP-context quirk where HttpOnly cookies sometimes fail to attach.
-
-## Step 3 — Load state and navigate
-
-```bash
-SESSION="my-test" # any stable session name
-
-agent-browser --session "$SESSION" state load /tmp/state.json
-agent-browser --session "$SESSION" open "http://localhost:3011/"
-agent-browser --session "$SESSION" get url
-# Expect NOT /signin?callbackUrl=… — if you still see signin, cookie didn't apply.
-```
-
-## Step 4 — Verify
-
-```bash
-agent-browser --session "$SESSION" snapshot -i | head -20
-# Look for the user's avatar/name in the sidebar, or absence of the signin form.
-```
-
-## Common failure modes
-
-| Symptom                                         | Cause                                                                   | Fix                                                  |
-| ----------------------------------------------- | ----------------------------------------------------------------------- | ---------------------------------------------------- |
-| Still redirects to `/signin` after `state load` | User pasted from `document.cookie` → missed HttpOnly session            | Re-pull from Network request Headers, not console    |
-| `state load` reports 0 cookies                  | Separator wrong, or user pasted URL-decoded value                       | Keep the raw `Cookie:` header as-is; split on `"; "` |
-| Login works briefly then expires                | `better-auth.session_token` rotated (user logged out / signed in again) | Re-copy and re-load                                  |
-| Domain mismatch                                 | Use `domain: "localhost"` literally, no leading dot for local dev       | —                                                    |
-
-## Scope
-
-Only covers authenticating an **agent-browser** session into a **local** LobeHub dev server. It does not:
-
- Work for production — production cookies are `Secure; HttpOnly; Domain=.lobehub.com` and must be delivered over HTTPS.
- Replace real OAuth flows — tests that must exercise the login UI need a real Chromium with `--remote-debugging-port` or a bot account.
- Flow cookies back to the user's Chrome — injection is one-way (into agent-browser only).
@@ -0,0 +1,69 @@
+---
+name: model-bank-metadata
+description: 'Backfill and maintain model-bank metadata (knowledgeCutoff, family, generation). Use when adding models, fixing cutoff/family data, running a metadata sweep across aiModels providers, or researching official knowledge cutoffs.'
+user-invocable: false
+---
+
+# Model-Bank Metadata (knowledgeCutoff / family / generation)
+
+How to populate and maintain the three structured metadata fields on `packages/model-bank/src/aiModels/*.ts` model cards, at single-model scale (new model PR) or repo-wide scale (sweep across \~80 provider files / \~1900 entries).
+
+## Field semantics
+
+| Field             | Format                                                                              | Meaning                                                                                                                                                                                 |
+| ----------------- | ----------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `knowledgeCutoff` | `'YYYY-MM'` (or `'YYYY'` if only the year is published)                             | World-knowledge cutoff. When a vendor distinguishes a **"reliable knowledge cutoff"** from the broader training-data cutoff (Anthropic does), always use the **reliable** one.          |
+| `family`          | lowercase slug (`claude`, `gpt`, `o-series`, `qwen`, `deepseek`, `llama`, `glm`, …) | Model lineage, finer than `organization`. Lets the UI group models and match the same model across aggregator providers.                                                                |
+| `generation`      | family slug + version (`claude-4.6`, `gpt-5.2`, `qwen3.5`, `llama-3.1`)             | Generation within the family. Only set when confidently derivable from the model line's naming. Rolling aliases (`qwen-max`, `deepseek-chat`, `gemini-flash-latest`) get `family` only. |
+
+All three are optional. **The cardinal rule: only fill what an authoritative source states or naming rules derive — never guess.** An empty field is correct for vendors that publish nothing.
+
+No DB migration is ever needed for these: builtin models are merged from model-bank at read time (`repositories/aiInfra/index.ts` spreads the whole card), so new card fields flow to the client automatically.
+
+## Sourcing rules for knowledgeCutoff
+
+Accept only:
+
+- Vendor official docs (platform.openai.com / developers.openai.com, docs.x.ai, ai.google.dev, docs.anthropic.com / platform.claude.com)
+- Official Hugging Face org model cards (huggingface.co/meta-llama/..., etc.)
+- Official tech reports / system cards / launch blog posts
+
+Reject:
+
+- **Third-party aggregator sites** (aiknowledgecutoff.com and similar) — proven to copy one model's value across a whole family. A Cohere sweep once claimed `2024-06` for four distinct base models; none of the cited Cohere pages said that, and the only cutoff Cohere actually publishes is Feb 2023 for the 08-2024 Command R/R+ refresh.
+- **AWS Bedrock model cards as sole source** — proven to conflate launch date with knowledge cutoff (DeepSeek R1's card lists both as "Jan 2025"). If Bedrock is the only place a value appears, leave the field empty.
+- Inference from `releasedAt` — a release date is not a cutoff.
+
+Variant inheritance: dated snapshots (`-2024-08-06`), speed/price tiers of the same checkpoint, quantizations (`-fp8`, `-awq`), context-length variants (`-32k`), ollama `:NNb` tags, and cloud-prefixed ids (`anthropic.`/`us.`/`global.` Bedrock ids) share their base model's cutoff. **Distills do not inherit** from teacher or base — use the distill's own published value or leave empty. **Sizes within one generation can genuinely differ**: Llama 3 8B is Mar 2023 while 70B is Dec 2023 (per Meta's own card) — don't "fix" that to one family-wide value.
+
+Vendors that publish no cutoffs (leave empty, don't chase): Qwen, DeepSeek, GLM/Zhipu, ERNIE, Doubao, Hunyuan, SenseNova, Spark, MiniMax, StepFun, Yi (mostly), Moonshot.
+
+Known per-vendor footguns:
+
+- **Anthropic**: Opus 4.6 reliable cutoff is `2025-05`, Sonnet 4.6 is `2025-08` — easy to swap. Claude 3.7 is `2024-10` (system card: trained through Nov 2024, knowledge cutoff end of Oct 2024). Cite system cards / the models overview, not the Help Center article (a living page that drops retired models — citation rot).
+- **xAI**: docs.x.ai has one blanket sentence covering grok-3/grok-4; mini variants are not named there. Grok 4.20/4.3 have no official cutoff anywhere.
+- **OpenAI**: per-model docs pages (developers.openai.com/api/docs/models/<id>) state cutoffs explicitly, including snapshot differences (gpt-4-1106-preview `2023-04` vs gpt-4-0125-preview `2023-12`).
+
+## family/generation derivation
+
+Rule-based, no research needed: `scripts/derive-family.ts` holds the per-family regex rules. Traps already encoded there — keep them when extending:
+
+- Date suffixes are not versions: `claude-sonnet-4-20250514` is generation `claude-4`, not `claude-4.2`.
+- Size suffixes are not versions: `llama-3-8b` → `llama-3` (not `llama-3.8`); `gemma-7b-it` is **gemma-1** (not gemma-7).
+- Vendor spelling variants: `qwen2p5` = qwen2.5, `llama-v3p1` = llama-3.1, ollama `:NNb` tags, Bedrock `us.`/`global.`/`anthropic.` prefixes.
+- `claude-X.0` normalizes to `claude-X`.
+- Fable/Mythos-class ids (`claude-fable-5`) don't match the opus/sonnet/haiku regex. They belong to the Mythos class, so set `family: 'claude-mythos'` and `generation: 'mythos-5'` manually (the launch page calls Fable 5 "the generally available Mythos-class model").
+
+## Repo-wide sweep workflow
+
+1. **Extract ids**: `bun .agents/skills/model-bank-metadata/scripts/extract-model-ids.ts` → unique normalized chat-model ids (normalization = last path segment, lowercased). Non-chat types (image/video/embedding/tts) have no knowledge cutoff — skip them.
+2. **Research (multi-agent)**: chunk ids by family (≤50 per chunk) and fan out one research agent per chunk (Workflow tool), each returning `{id, cutoff, source}` with the sourcing rules above baked into the prompt, **plus** one adversarial verify agent per chunk that re-fetches cited sources and refutes unsupported claims. The verify pass is load-bearing: it caught the Cohere aggregator copy-paste and the AWS launch-date conflation.
+3. **Policy filter**: before applying, drop entries whose only source is a rejected category (check the returned `sources` map — e.g. drop everything sourced to aws.amazon.com).
+4. **Apply**: `bun .agents/skills/model-bank-metadata/scripts/apply-cutoffs.ts <map.json>` and `bun .agents/skills/model-bank-metadata/scripts/apply-family.ts <map.json>` (run from repo root — both scripts resolve `packages/model-bank/src/aiModels` relative to the working directory). Both are idempotent codemods keyed on normalized id — aggregator providers get the same values automatically; entries that already have the field are skipped. They rely on the uniform prettier formatting of the data files (entries start `  {` / end `  },`, fields at 4-space indent).
+5. **Verify**: `cd packages/model-bank && bunx vitest run src/aiModels/__tests__/index.test.ts && bunx tsc --noEmit`.
+
+## Maintenance rules
+
+- **New model PRs** should fill all three fields inline, citing the official source in the PR body (see the Anthropic entries in `anthropic.ts` for reference values).
+- **After resolving merge conflicts** in model-bank data files, sanity-check that metadata didn't vanish: `git grep -c knowledgeCutoff -- 'packages/model-bank/src/aiModels/*.ts'` before vs after. A three-way stack of model PRs once silently dropped all 10 Anthropic cutoffs during conflict resolution.
+- Dirty ids exist in aggregator data (a sambanova id once carried a trailing tab). The codemods match ids verbatim — if a map key won't apply, check for invisible characters before assuming the model is missing.
@@ -0,0 +1,73 @@
+/**
+ * One-off codemod: apply a canonical { normalizedModelId: 'YYYY-MM' } map onto
+ * packages/model-bank/src/aiModels/*.ts, inserting `knowledgeCutoff` after the
+ * `id:` line of every chat-model entry that matches and doesn't already have one.
+ *
+ * Relies on the uniform prettier formatting of these files:
+ *   - each model entry starts with `  {` and ends with `  },` (2-space indent)
+ *   - fields are at 4-space indent: `    id: '...'`, `    type: 'chat'`
+ *
+ * Usage (from the repo root — the target dir is a relative path): bun apply-cutoffs.ts /tmp/cutoff-map.json
+ */
+import { readdirSync, readFileSync, writeFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+const mapPath = process.argv[2]; // first CLI argument: path to the JSON cutoff map
+if (!mapPath) throw new Error('usage: bun apply-cutoffs.ts <map.json>');
+const map: Record<string, string> = JSON.parse(readFileSync(mapPath, 'utf8'));
+
+const dir = 'packages/model-bank/src/aiModels'; // relative: resolved against the working directory
+const normalize = (id: string) => id.split('/').pop()!.toLowerCase(); // last '/' segment, lowercased
+// Counters and matched map keys that feed the summary printed at the end.
+let touchedFiles = 0;
+let inserted = 0;
+const matchedIds = new Set<string>();
+
+for (const file of readdirSync(dir).filter((f) => f.endsWith('.ts'))) {
+  const path = join(dir, file);
+  const lines = readFileSync(path, 'utf8').split('\n');
+  const out: string[] = [];
+  let changed = false;
+
+  let i = 0;
+  while (i < lines.length) {
+    if (lines[i] !== '  {') {
+      out.push(lines[i]); // not an entry opener: copy the line through unchanged
+      i++;
+      continue;
+    }
+    // collect one model entry block: from `  {` through the next `  },` (or to EOF if none follows)
+    const start = i;
+    let end = i;
+    while (end < lines.length && lines[end] !== '  },') end++;
+    const block = lines.slice(start, end + 1);
+    // Probe the entry's 4-space-indented fields.
+    const idLineIdx = block.findIndex((l) => /^ {4}id: '/.test(l));
+    const isChat = block.some((l) => /^ {4}type: 'chat',?$/.test(l));
+    const hasCutoff = block.some((l) => /^ {4}knowledgeCutoff:/.test(l));
+    // Only chat entries that have an id line and no existing knowledgeCutoff are candidates.
+    if (idLineIdx >= 0 && isChat && !hasCutoff) {
+      const rawId = block[idLineIdx].match(/^ {4}id: '(.+)',$/)?.[1]; // single-quoted id + trailing comma
+      const norm = rawId ? normalize(rawId) : undefined;
+      const cutoff = norm ? map[norm] : undefined;
+      if (cutoff && /^\d{4}(?:-\d{2})?$/.test(cutoff)) { // accept only 'YYYY' or 'YYYY-MM'
+        block.splice(idLineIdx + 1, 0, `    knowledgeCutoff: '${cutoff}',`); // right after the id line
+        inserted++;
+        changed = true;
+        matchedIds.add(norm!);
+      }
+    }
+    out.push(...block);
+    i = end + 1;
+  }
+  // Write back only files that actually gained a field.
+  if (changed) {
+    writeFileSync(path, out.join('\n'));
+    touchedFiles++;
+  }
+}
+// Summary. A key counts as "used" only if it caused an insertion, so keys skipped above are listed as unused.
+console.log(`inserted ${inserted} knowledgeCutoff fields across ${touchedFiles} files`);
+console.log(`map ids used: ${matchedIds.size}/${Object.keys(map).length}`);
+const unused = Object.keys(map).filter((k) => !matchedIds.has(k));
+if (unused.length) console.log('unused map keys (first 20):', unused.slice(0, 20));
@@ -0,0 +1,49 @@
+import { readdirSync, readFileSync, writeFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+// Codemod (run from repo root): give each chat-model entry under `dir` that lacks `family` a
+// `family` (+ mapped `generation`) line before its `id:`. Usage: bun apply-family.ts [map.json]
+const map: Record<string, { family: string; generation?: string }> = JSON.parse(
+  readFileSync(process.argv[2] ?? '/tmp/family-map.json', 'utf8'), // CLI path, else the legacy default
+);
+const dir = 'packages/model-bank/src/aiModels';
+const normalize = (id: string) => id.split('/').pop()!.toLowerCase(); // last '/' segment, lowercased
+let inserted = 0;
+let touchedFiles = 0;
+for (const file of readdirSync(dir).filter((f) => f.endsWith('.ts'))) {
+  const path = join(dir, file);
+  const lines = readFileSync(path, 'utf8').split('\n');
+  const out: string[] = [];
+  let changed = false;
+  let i = 0;
+  while (i < lines.length) {
+    if (lines[i] !== '  {') {
+      out.push(lines[i++]); // not an entry opener: copy the line through unchanged
+      continue;
+    }
+    let end = i; // an entry runs from `  {` through the next `  },` (or to EOF if none follows)
+    while (end < lines.length && lines[end] !== '  },') end++;
+    const block = lines.slice(i, end + 1);
+    const idLineIdx = block.findIndex((l) => /^ {4}id: '/.test(l));
+    const isChat = block.some((l) => /^ {4}type: 'chat',?$/.test(l));
+    const hasFamily = block.some((l) => /^ {4}family:/.test(l));
+    if (idLineIdx >= 0 && isChat && !hasFamily) {
+      const rawId = block[idLineIdx].match(/^ {4}id: '(.+)',$/)?.[1];
+      const r = rawId ? map[normalize(rawId)] : undefined;
+      if (r) {
+        const add = [`    family: '${r.family}',`];
+        if (r.generation) add.push(`    generation: '${r.generation}',`);
+        block.splice(idLineIdx, 0, ...add);
+        inserted++;
+        changed = true;
+      }
+    }
+    out.push(...block);
+    i = end + 1;
+  }
+  if (changed) {
+    writeFileSync(path, out.join('\n'));
+    touchedFiles++;
+  }
+}
+console.log(`annotated ${inserted} model entries across ${touchedFiles} files`);
@@ -0,0 +1,237 @@
+/* eslint-disable regexp/no-unused-capturing-group */
+/**
+ * Rule-based derivation of { family, generation } from normalized model ids.
+ * Principle: only fill what is confidently derivable; otherwise omit.
+ *
+ * Usage: bun /tmp/derive-family.ts            # print distinct pairs for review
+ *        bun /tmp/derive-family.ts --emit     # write /tmp/family-map.json
+ */
+import { readFileSync, writeFileSync } from 'node:fs';
+
+const ids: string[] = JSON.parse(readFileSync('/tmp/model-ids.json', 'utf8'));
+
+type R = { family: string; generation?: string };
+
+const derive = (id: string): R | undefined => {
+  // strip cloud/bedrock prefixes for matching
+  const m = id.replace(/^(us\.|global\.|eu\.|apac\.)?(anthropic\.|meta\.|cohere\.|azure-)/, '');
+
+  // ---- anthropic ----
+  if (m.startsWith('claude')) {
+    // family = product-line tier (claude-opus/sonnet/haiku/instant); bare claude-2.x has no tier
+    const tier = m.match(/(opus|sonnet|haiku|instant)/)?.[1];
+    const family = tier ? `claude-${tier}` : 'claude';
+    let g = m.match(/^claude-(?:opus|sonnet|haiku)-(\d)[.-](\d)(?!\d)/); // claude-opus-4-8 / claude-haiku-4.5
+    if (g) return { family, generation: `claude-${g[1]}.${g[2]}` };
+    g = m.match(/^claude-(?:opus|sonnet|haiku)-(\d)(?!\d)/); // claude-opus-4
+    if (g) return { family, generation: `claude-${g[1]}` };
+    g = m.match(/^claude-(\d)[.-](\d)(?!\d)/); // claude-3-5-haiku / claude-3.7-sonnet / claude-2.1
+    if (g) return { family, generation: g[2] === '0' ? `claude-${g[1]}` : `claude-${g[1]}.${g[2]}` };
+    g = m.match(/^claude-(\d)(?!\d)/); // claude-3-haiku
+    if (g) return { family, generation: `claude-${g[1]}` };
+    if (m.startsWith('claude-instant')) return { family: 'claude-instant' };
+    if (/^claude-v?2/.test(m)) return { family: 'claude', generation: 'claude-2' };
+    return { family };
+  }
+
+  // ---- openai ----
+  if (/^(gpt-oss|gpt_oss)/.test(m) || m.startsWith('gpt-oss:'))
+    return { family: 'gpt-oss', generation: 'gpt-oss' };
+  if (/^(chatgpt-4o|gpt-4o)/.test(m)) return { family: 'gpt', generation: 'gpt-4o' };
+  if (/^gpt-(3\.5|35)/.test(m)) return { family: 'gpt', generation: 'gpt-3.5' };
+  if (m.startsWith('gpt-audio')) return { family: 'gpt', generation: 'gpt-audio' };
+  {
+    const g = m.match(/^gpt-(\d)\.(\d)/); // gpt-4.1 / gpt-5.2
+    if (g) return { family: 'gpt', generation: `gpt-${g[1]}.${g[2]}` };
+    const g2 = m.match(/^gpt-(\d)(?!\d)/); // gpt-4 / gpt-5
+    if (g2) return { family: 'gpt', generation: `gpt-${g2[1]}` };
+  }
+  {
+    const g = m.match(/^o([134])(-|$)/); // o1 / o3 / o4
+    if (g) return { family: 'o-series', generation: `o${g[1]}` };
+  }
+  if (/^(codex|computer-use-preview)/.test(m)) return { family: 'gpt' };
+
+  // ---- google ----
+  {
+    const g = m.match(/^gemini-(\d+(?:\.\d+)?)/);
+    if (g) return { family: 'gemini', generation: `gemini-${g[1]}` };
+    if (/^gemini-(pro|flash)/.test(m)) return { family: 'gemini' }; // rolling aliases
+    if (m.startsWith('gemma')) {
+      if (/^gemma-?\db/.test(m)) return { family: 'gemma', generation: 'gemma-1' };
+      const v = m.match(/^gemma-?(\d)(?!b)/);
+      return { family: 'gemma', generation: v ? `gemma-${v[1]}` : undefined };
+    }
+    if (/^(codegemma|learnlm|palm)/.test(m)) return { family: m.match(/^[a-z]+/)![0] };
+  }
+
+  // ---- qwen ----
+  if (m.startsWith('qwq')) return { family: 'qwen', generation: 'qwq' };
+  if (m.startsWith('qvq')) return { family: 'qwen', generation: 'qvq' };
+  if (m.startsWith('codeqwen')) return { family: 'qwen' };
+  if (m.startsWith('qwen')) {
+    const g =
+      m.match(/^qwen-?([123](?:\.\d+)?)(?![0-9b])/) || // qwen3.5-plus / qwen-3-14b / qwen2-7b / qwen1.5
+      m.match(/^qwen([23](?:\.\d+)?):/) || // qwen2.5:72b
+      m.match(/^qwen([23])p(\d)/); // qwen2p5 -> handled below
+    if (/^qwen(\d)p(\d)/.test(m)) {
+      const p = m.match(/^qwen(\d)p(\d)/)!;
+      return { family: 'qwen', generation: `qwen${p[1]}.${p[2]}` };
+    }
+    if (g) return { family: 'qwen', generation: `qwen${g[1]}` };
+    return { family: 'qwen' }; // qwen-max/plus/turbo/vl rolling aliases
+  }
+
+  // ---- deepseek ----
+  if (/^(deepseek|azure-deepseek|pro-deepseek)/.test(m) || m.startsWith('deepseek_')) {
+    const s = m.replace(/^pro-/, '').replaceAll('_', '-');
+    if (s.startsWith('deepseek-r1-distill'))
+      return { family: 'deepseek', generation: 'deepseek-r1-distill' };
+    if (s.startsWith('deepseek-r1')) return { family: 'deepseek', generation: 'deepseek-r1' };
+    const g = s.match(/^deepseek-(?:chat-)?v(\d(?:\.\d)?)/);
+    if (g) return { family: 'deepseek', generation: `deepseek-v${g[1]}` };
+    if (/^deepseek-(coder-v2|coder)/.test(s))
+      return { family: 'deepseek', generation: 'deepseek-coder' };
+    return { family: 'deepseek' }; // deepseek-chat / reasoner rolling aliases
+  }
+
+  // ---- meta llama ----
+  if (m.startsWith('codellama')) return { family: 'llama', generation: 'codellama' };
+  if (/^(meta-)?llama|^l3(\d)?-|^llava/.test(m)) {
+    if (m.startsWith('llava')) return { family: 'llava' };
+    const s = m.replace(/^meta-/, '');
+    const g =
+      s.match(/^llama-?([234])(?:[.-](\d))?(?![0-9b])/) || // llama-3.1 / llama3.3 / llama-4
+      s.match(/^llama-?v([234])p?(\d)?/) || // llama-v3p1
+      s.match(/^llama([234])[.:-](\d)?/);
+    if (g) {
+      const gen = g[2] ? `llama-${g[1]}.${g[2]}` : `llama-${g[1]}`;
+      return { family: 'llama', generation: gen };
+    }
+    if (m.startsWith('l3-')) return { family: 'llama', generation: 'llama-3' };
+    if (m.startsWith('l31-')) return { family: 'llama', generation: 'llama-3.1' };
+    return { family: 'llama' };
+  }
+
+  // ---- zhipu ----
+  if (/^(zai-)?glm/.test(m)) {
+    const s = m.replace(/^zai-/, '');
+    if (s.startsWith('glm-z1')) return { family: 'glm', generation: 'glm-z1' };
+    if (s.startsWith('glm-zero')) return { family: 'glm', generation: 'glm-zero' };
+    const g = s.match(/^glm-(\d(?:\.\d)?)/);
+    if (g) return { family: 'glm', generation: `glm-${g[1]}` };
+    return { family: 'glm' };
+  }
+  if (/^(charglm|codegeex|emohaa)/.test(m)) return { family: m.match(/^[a-z]+/)![0] };
+
+  // ---- mistral ----
+  if (
+    /^(open-)?(mistral|mixtral|ministral|codestral|devstral|magistral|pixtral|mathstral|labs-devstral|labs-leanstral|open-codestral)/.test(
+      m,
+    )
+  ) {
+    const fam = m.replace(/^(open-|labs-)/, '').match(/^[a-z]+/)![0];
+    return { family: fam };
+  }
+
+  // ---- xai ----
+  if (m.startsWith('grok')) {
+    const g = m.match(/^grok-(\d(?:\.\d+)?)/);
+    return { family: 'grok', generation: g ? `grok-${g[1]}` : undefined };
+  }
+
+  // ---- moonshot ----
+  if (m.startsWith('kimi')) {
+    const g = m.match(/^kimi-k(\d(?:\.\d)?)/);
+    return { family: 'kimi', generation: g ? `kimi-k${g[1]}` : undefined };
+  }
+  if (m.startsWith('moonshot-kimi-k2')) return { family: 'kimi', generation: 'kimi-k2' };
+  if (m.startsWith('moonshot-v1')) return { family: 'kimi', generation: 'moonshot-v1' };
+
+  // ---- minimax ----
+  if (m.startsWith('minimax')) {
+    if (m.startsWith('minimax-text')) return { family: 'minimax', generation: 'minimax-text-01' };
+    const g = m.match(/^minimax-m(\d(?:\.\d)?)/);
+    return { family: 'minimax', generation: g ? `minimax-m${g[1]}` : undefined };
+  }
+  if (m.startsWith('abab')) return { family: 'minimax', generation: 'abab' };
+
+  // ---- baidu ----
+  if (m.startsWith('ernie')) {
+    if (m.startsWith('ernie-x1')) return { family: 'ernie', generation: 'ernie-x1' };
+    const g = m.match(/^ernie-(\d\.\d)/);
+    return { family: 'ernie', generation: g ? `ernie-${g[1]}` : undefined };
+  }
+  if (m.startsWith('qianfan')) return { family: 'qianfan' };
+
+  // ---- bytedance ----
+  if (m.startsWith('doubao')) {
+    const g = m.match(/^doubao-seed-(\d[.-]\d|\d)/) || m.match(/^doubao-(\d\.\d)/);
+    return { family: 'doubao', generation: g ? `doubao-${g[1].replace('-', '.')}` : undefined };
+  }
+  if (/^(seed-oss|skylark)/.test(m)) return { family: m.startsWith('seed') ? 'doubao' : 'skylark' };
+
+  // ---- tencent ----
+  if (m.startsWith('hunyuan')) {
+    const g = m.match(/^hunyuan-(\d\.\d)/);
+    return { family: 'hunyuan', generation: g ? `hunyuan-${g[1]}` : undefined };
+  }
+  if (m.startsWith('hy3')) return { family: 'hunyuan', generation: 'hunyuan-3' };
+
+  // ---- others (family only / simple version) ----
+  if (m.startsWith('yi-')) return { family: 'yi' };
+  if (/^(command|c4ai-command)/.test(m)) return { family: 'command' };
+  if (/^(aya|c4ai-aya)/.test(m)) return { family: 'aya' };
+  if (/^phi-?(\d)?/.test(m) && m.startsWith('phi')) {
+    const g = m.match(/^phi-?(\d(?:\.\d)?)/);
+    return { family: 'phi', generation: g ? `phi-${g[1]}` : undefined };
+  }
+  if (m.startsWith('wizardlm')) return { family: 'wizardlm' };
+  if (m.startsWith('step-')) {
+    const g = m.match(/^step-(?:r1|(\d(?:\.\d)?))/);
+    return { family: 'step', generation: g?.[1] ? `step-${g[1]}` : undefined };
+  }
+  if (/^(internlm|intern-)/.test(m)) return { family: 'intern' };
+  if (m.startsWith('internvl')) return { family: 'internvl' };
+  if (m.startsWith('baichuan')) {
+    const g = m.match(/^baichuan-?(m?\d)/);
+    return { family: 'baichuan', generation: g ? `baichuan-${g[1]}` : undefined };
+  }
+  if (/^(sensechat|sensenova)/.test(m)) return { family: 'sensenova' };
+  if (/^(spark|generalv|4\.0ultra)/.test(m)) return { family: 'spark' };
+  if (/^(360gpt|360zhinao)/.test(m)) return { family: '360zhinao' };
+  if (/^(jamba|ai21-jamba)/.test(m)) return { family: 'jamba' };
+  if (m.startsWith('sonar')) return { family: 'sonar' };
+  if (/^(nova-lite|nova-micro|nova-pro)/.test(m)) return { family: 'nova' };
+  if (/^(ling|ring)-/.test(m)) return { family: m.match(/^[a-z]+/)![0] };
+  if (m.startsWith('longcat')) return { family: 'longcat' };
+  if (m.startsWith('mimo')) return { family: 'mimo' };
+  if (m.startsWith('taichu')) return { family: 'taichu' };
+  if (/^(hermes|nous-hermes)/.test(m)) return { family: 'hermes' };
+  if (m.startsWith('solar')) return { family: 'solar' };
+  if (m.startsWith('kat-coder')) return { family: 'kat-coder' };
+  if (m.startsWith('dbrx')) return { family: 'dbrx' };
+  if (m.startsWith('morph')) return { family: 'morph' };
+
+  return undefined;
+};
+
+const map: Record<string, R> = {};
+const pairs = new Map<string, number>();
+let derived = 0;
+for (const id of ids) {
+  const r = derive(id);
+  if (!r) continue;
+  derived++;
+  map[id] = r;
+  const key = `${r.family} :: ${r.generation ?? '—'}`;
+  pairs.set(key, (pairs.get(key) || 0) + 1);
+}
+
+console.log(`derived ${derived}/${ids.length}`);
+for (const [k, n] of [...pairs.entries()].sort()) console.log(String(n).padStart(4), k);
+
+if (process.argv.includes('--emit')) {
+  writeFileSync('/tmp/family-map.json', JSON.stringify(map, null, 1));
+  console.log('\nwritten /tmp/family-map.json');
+}
@@ -0,0 +1,23 @@
+/**
+ * Extract unique normalized chat-model ids from packages/model-bank/src/aiModels/*.ts.
+ * Normalization: last path segment, lowercased (matches the apply codemods).
+ *
+ * Usage (repo root): bun .agents/skills/model-bank-metadata/scripts/extract-model-ids.ts [out.json]
+ * Default output: /tmp/model-ids.json
+ */
+import { readdirSync, writeFileSync } from 'node:fs';
+import { join, resolve } from 'node:path';
+
+const dir = resolve('packages/model-bank/src/aiModels');
+const out = process.argv[2] || '/tmp/model-ids.json';
+
+const ids = new Set<string>();
+for (const f of readdirSync(dir).filter((f) => f.endsWith('.ts'))) {
+  const mod = await import(join(dir, f));
+  for (const m of mod.default || []) {
+    if (!m?.id || m.type !== 'chat') continue;
+    ids.add(m.id.split('/').pop()!.toLowerCase());
+  }
+}
+writeFileSync(out, JSON.stringify([...ids].sort(), null, 1));
+console.log(`${ids.size} unique normalized chat ids -> ${out}`);
@@ -56,7 +56,8 @@ git submodules.
 ├── apps/
 │   ├── cli/                  # LobeHub CLI
 │   ├── desktop/              # Electron desktop app
-│   └── device-gateway/       # Device gateway service
+│   ├── device-gateway/       # Device gateway service
+│   └── server/               # Next.js-backed server: featureFlags, globalConfig, modules, routers, services, utils, workflows (`@/server/*` alias)
 ├── docs/                     # changelog, development, self-hosting, usage
 ├── locales/                  # en-US, zh-CN, ...
 ├── packages/                 # ~80 @lobechat/* workspace packages — `ls` for the full set. Key ones:
@@ -85,32 +86,32 @@ git submodules.
    ├── business/             # Open-source stubs (client/server) — cloud repo provides real impls
    ├── features/             # Domain business components
    ├── store/                # ~30 zustand stores — `ls` for the full set
-    ├── server/               # featureFlags, globalConfig, modules, routers, services, workflows, agent-hono
+    ├── server/               # standalone-Hono server pieces only: agent-hono, workflows-hono (main backend lives in `apps/server`)
    └── ...                   # components, hooks, layout, libs, locales, services, types, utils
 ```

 ## Architecture Map

-| Layer            | Location                                            |
-| ---------------- | --------------------------------------------------- |
-| UI Components    | `src/components`, `src/features`                    |
-| SPA Pages        | `src/routes/`                                       |
-| React Router     | `src/spa/router/`                                   |
-| Global Providers | `src/layout`                                        |
-| Zustand Stores   | `src/store`                                         |
-| Client Services  | `src/services/`                                     |
-| REST API         | `src/app/(backend)/webapi`                          |
-| tRPC Routers     | `src/server/routers/{async\|lambda\|mobile\|tools}` |
-| Server Services  | `src/server/services` (can access DB)               |
-| Server Modules   | `src/server/modules` (no DB access)                 |
-| Feature Flags    | `src/server/featureFlags`                           |
-| Global Config    | `src/server/globalConfig`                           |
-| DB Schema        | `packages/database/src/schemas`                     |
-| DB Model         | `packages/database/src/models`                      |
-| DB Repository    | `packages/database/src/repositories`                |
-| Third-party      | `src/libs` (analytics, oidc, etc.)                  |
-| Builtin Tools    | `packages/builtin-tool-*`, `packages/builtin-tools` |
-| Open-source stub | `src/business/*`, `packages/business/*` (this repo) |
+| Layer            | Location                                                 |
+| ---------------- | -------------------------------------------------------- |
+| UI Components    | `src/components`, `src/features`                         |
+| SPA Pages        | `src/routes/`                                            |
+| React Router     | `src/spa/router/`                                        |
+| Global Providers | `src/layout`                                             |
+| Zustand Stores   | `src/store`                                              |
+| Client Services  | `src/services/`                                          |
+| REST API         | `src/app/(backend)/webapi`                               |
+| tRPC Routers     | `apps/server/src/routers/{async\|lambda\|mobile\|tools}` |
+| Server Services  | `apps/server/src/services` (can access DB)               |
+| Server Modules   | `apps/server/src/modules` (no DB access)                 |
+| Feature Flags    | `apps/server/src/featureFlags`                           |
+| Global Config    | `apps/server/src/globalConfig`                           |
+| DB Schema        | `packages/database/src/schemas`                          |
+| DB Model         | `packages/database/src/models`                           |
+| DB Repository    | `packages/database/src/repositories`                     |
+| Third-party      | `src/libs` (analytics, oidc, etc.)                       |
+| Builtin Tools    | `packages/builtin-tool-*`, `packages/builtin-tools`      |
+| Open-source stub | `src/business/*`, `packages/business/*` (this repo)      |

 ## Data Flow

@@ -22,6 +22,7 @@ user-invocable: false

 - Bug fixes must include tests covering the fixed scenario
 - New logic (services, store actions, utilities) should have test coverage
+- **New database Model/Repository** (`packages/database/src/models/**`, `src/repositories/**`) must ship a sibling `__tests__/<name>.test.ts` — incl. user-isolation tests; BM25 search guarded by `describe.skipIf(!isServerDB)` (see `/testing` → `db-model-test.md`)
 - Existing tests still cover the changed behavior?
 - Prefer `vi.spyOn` over `vi.mock` (see `/testing` skill)

@@ -50,7 +50,7 @@ Common false positives (do NOT merge):
 - `db-migrations` vs `drizzle` — distinct workflows (migration files vs schema authoring).
 - `microcopy` vs `i18n` — content vs mechanics.
 - `agent-runtime-hooks` vs `agent-tracing` vs `agent-signal` — different surfaces of the agent system.
- `testing` vs `local-testing` vs `cli-backend-testing` — different test types.
+- `testing` vs `agent-testing` — different test types.

 ### 4 — Description format consistency

@@ -14,15 +14,21 @@ user-invocable: false
 # Run specific test file
 bunx vitest run --silent='passed-only' '[file-path]'

-# Database package (client)
+# Database package (client-db, PGlite — default, skips BM25/pg_search)
 cd packages/database && bunx vitest run --silent='passed-only' '[file]'

-# Database package (server)
+# Database package (server-db, Postgres — BM25/pgvector parity, what CI measures coverage in)
 cd packages/database && TEST_SERVER_DB=1 bunx vitest run --silent='passed-only' '[file]'
 ```

 **Never run** `bun run test` - it runs all 3000+ tests (\~10 minutes).

+> **Database models/repositories:** every new file under `packages/database/src/models/**`
+> or `src/repositories/**` ships with a sibling `__tests__/<name>.test.ts` in the same PR.
+> Use the real DB via `getTestDB()` (integration style), guard BM25/full-text-search blocks
+> with `describe.skipIf(!isServerDB)`, and always test user-isolation. See
+> `references/db-model-test.md` for setup, schema gotchas, and the client-vs-server-db split.
+
 ## Test Categories

 | Category | Location                    | Config                          |
@@ -1,95 +1,74 @@
 # Database Model Testing Guide

-Test `packages/database` Model layer.
+Test the `packages/database` Model and Repository layers.

-## Dual Environment Verification (Required)
+> **Rule: every new Model or Repository ships with a sibling test in the same PR.**
+> A new file under `src/models/**` or `src/repositories/**` must have a matching
+> `__tests__/<name>.test.ts`. Coverage runs in server-db mode in CI and the patch
+> gate will not always catch a brand-new untested file (a small new file barely
+> moves the project total) — so this is a convention, not something CI guarantees.
+> Start from the template: `packages/database/src/models/__tests__/_test_template.ts`.
+
+## Two test environments: client-db vs server-db
+
+`getTestDB()` (`src/core/getTestDB.ts`) returns different engines based on the
+`TEST_SERVER_DB` env var:
+
+| Mode                    | Engine                              | When               | Notes                                                                                                                                                               |
+| ----------------------- | ----------------------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **client-db** (default) | PGlite (in-memory)                  | `bunx vitest run`  | Migration runner **skips any SQL containing `pg_search` / `bm25`** — the ParadeDB BM25 `@@@` operator does not exist here.                                          |
+| **server-db**           | node-postgres → `DATABASE_TEST_URL` | `TEST_SERVER_DB=1` | CI uses the `paradedb/paradedb` image (has `pg_search`). **Coverage is measured in this mode** (`test:coverage` → `vitest.config.server.mts`, uploaded to Codecov). |

 ```bash
-# 1. Client environment (fast)
-cd packages/database && TEST_SERVER_DB=0 bunx vitest run --silent='passed-only' '[file]'
+# 1. Client environment (fast, default — what most local runs use)
+cd packages/database && bunx vitest run --silent='passed-only' '[file]'

-# 2. Server environment (compatibility)
+# 2. Server environment (BM25 / pg_search / pgvector parity, needs DATABASE_TEST_URL)
 cd packages/database && TEST_SERVER_DB=1 bunx vitest run --silent='passed-only' '[file]'
 ```

-## User Permission Check - Security First 🔒
+Implication: client-db coverage **under-counts** any code that needs BM25 (e.g.
+`repositories/search/index.ts` reads near-0% locally but is fully covered in CI).
+Don't chase those lines locally — confirm via CI/Codecov.

-**Critical security requirement**: All user data operations must include permission checks.
+## BM25 / full-text search → `describe.skipIf(!isServerDB)`
+
+Any method using the BM25 `@@@` operator or `sanitizeBm25` (keyword search:
+`queryByKeyword`, `searchAgents`, userMemory lexical search, …) **throws under
+PGlite** (often swallowed by a `catch` that returns `[]`, so the test silently
+fails with empty results). Guard those blocks so they only run in server-db:

 ```typescript
-// ❌ DANGEROUS: Missing permission check
-update = async (id: string, data: Partial<MyModel>) => {
-  return this.db
-    .update(myTable)
-    .set(data)
-    .where(eq(myTable.id, id)) // Only checks ID
-    .returning();
-};
-
-// ✅ SECURE: Permission check included
-update = async (id: string, data: Partial<MyModel>) => {
-  return this.db
-    .update(myTable)
-    .set(data)
-    .where(
-      and(
-        eq(myTable.id, id),
-        eq(myTable.userId, this.userId), // ✅ Permission check
-      ),
-    )
-    .returning();
-};
-```
-
-## Test File Structure
-
-```typescript
-// @vitest-environment node
-describe('MyModel', () => {
-  describe('create', () => {
-    /* ... */
-  });
-  describe('queryAll', () => {
-    /* ... */
-  });
-  describe('update', () => {
-    it('should update own records');
-    it('should NOT update other users records'); // 🔒 Security
-  });
-  describe('delete', () => {
-    it('should delete own records');
-    it('should NOT delete other users records'); // 🔒 Security
-  });
-  describe('user isolation', () => {
-    it('should enforce user data isolation'); // 🔒 Core security
-  });
+// BM25 search requires the pg_search extension (ParadeDB), not available in PGlite
+const isServerDB = process.env.TEST_SERVER_DB === '1';
+describe.skipIf(!isServerDB)('queryByKeyword', () => {
+  /* ... */
 });
 ```

-## Security Test Example
+Convention already used in `session.test.ts`, `topic.query.test.ts`,
+`message.query.test.ts`, `home/index.test.ts`, `repositories/search/index.test.ts`.
+
+## Setup boilerplate
+
+Top-of-file pattern (see `_test_template.ts` for the full version). Use real DB
+integration via `getTestDB()` — **not a mocked `vi.fn()` db**; the integration
+style exercises real SQL and gives far deeper coverage.

 ```typescript
-it('should not update records of other users', async () => {
-  const [otherUserRecord] = await serverDB
-    .insert(myTable)
-    .values({ userId: 'other-user', data: 'original' })
-    .returning();
+import { eq } from 'drizzle-orm';
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';

-  const result = await myModel.update(otherUserRecord.id, { data: 'hacked' });
+import { getTestDB } from '../../core/getTestDB';
+import { users } from '../../schemas';
+import type { LobeChatDatabase } from '../../type';
+import { MyModel } from '../myModel';

-  expect(result).toBeUndefined();
-  const unchanged = await serverDB.query.myTable.findFirst({
-    where: eq(myTable.id, otherUserRecord.id),
-  });
-  expect(unchanged?.data).toBe('original');
-});
-```
+const serverDB: LobeChatDatabase = await getTestDB(); // top-level await is fine

-## Data Management
-
-```typescript
-const userId = 'test-user';
+const userId = 'my-model-test-user';
 const otherUserId = 'other-user';
+const myModel = new MyModel(serverDB, userId);

 beforeEach(async () => {
  await serverDB.delete(users);
@@ -97,40 +76,99 @@ beforeEach(async () => {
 });

 afterEach(async () => {
-  await serverDB.delete(users);
+  await serverDB.delete(users); // cascades to user-scoped rows
 });
 ```

-## Foreign Key Handling
+Some tests need the Node environment (pgvector, server-only deps) — add
+`// @vitest-environment node` as the first line when required.
+
+## User permission check — security first 🔒
+
+**Every user-data operation must be ownership-scoped.** Always add a test proving
+another user cannot read/update/delete the row.

 ```typescript
-// ❌ Wrong: Invalid foreign key
+// ✅ SECURE: ownership in the WHERE clause
+update = async (id: string, data: Partial<MyModel>) =>
+  this.db
+    .update(myTable)
+    .set(data)
+    .where(and(eq(myTable.id, id), eq(myTable.userId, this.userId)))
+    .returning();
+```
+
+```typescript
+it("should NOT update another user's record", async () => {
+  const otherModel = new MyModel(serverDB, otherUserId);
+  const [row] = await otherModel.create({ data: 'original' });
+
+  await myModel.update(row.id, { data: 'hacked' });
+
+  const unchanged = await serverDB.query.myTable.findFirst({
+    where: eq(myTable.id, row.id),
+  });
+  expect(unchanged?.data).toBe('original');
+});
+```
+
+## What to cover
+
+Aim to bring each model/repository as close to 100% coverage as practical (excluding BM25):
+
+- Every public method
+- Both branches of conditionals; empty-list / `if (!x) return []` early returns
+- Error fallbacks (e.g. decrypt/JSON-parse failure → `null`)
+- Filters, pagination, ordering branches
+- Ownership / user isolation, and workspace scoping if the model takes a `workspaceId`
+
+## Schema gotchas (real traps that fail inserts or types)
+
+- **`workspaces`** requires `{ id, name, slug, primaryOwnerId }` and has **no
+  `userId` column** — `insert(workspaces).values({ id, name, slug, primaryOwnerId })`.
+- **uuid columns**: a "not found" test must pass a _valid_ UUID
+  (`'00000000-0000-0000-0000-000000000000'`); a random string raises a `22P02`
+  DB error instead of returning `undefined`/`null`.
+- **Enum / `$type` columns** are type-checked: e.g. `files.source` is a
+  `FileSource` enum (`image_generation` | `page-editor` | `video_generation`),
+  not free text — passing `'upload'` is a type error.
+- Read the table's schema in `src/schemas/` for `notNull` columns **without
+  defaults**; you must supply those on insert.
+
+## Foreign key handling
+
+```typescript
+// ❌ Wrong: invalid foreign key
 const testData = { asyncTaskId: 'invalid-uuid', fileId: 'non-existent' };

-// ✅ Correct: Use null
+// ✅ Use null …
 const testData = { asyncTaskId: null, fileId: null };

-// ✅ Or: Create referenced record first
-beforeEach(async () => {
-  const [asyncTask] = await serverDB
-    .insert(asyncTasks)
-    .values({ id: 'valid-id', status: 'pending' })
-    .returning();
-  testData.asyncTaskId = asyncTask.id;
-});
+// ✅ … or create the referenced row first
+const [asyncTask] = await serverDB.insert(asyncTasks).values({ status: 'pending' }).returning();
+testData.asyncTaskId = asyncTask.id;
 ```

-## Predictable Sorting
+## Predictable sorting

 ```typescript
-// ✅ Use explicit timestamps
-const oldDate = new Date('2024-01-01T10:00:00Z');
-const newDate = new Date('2024-01-02T10:00:00Z');
+// ✅ Use explicit timestamps — never rely on insert order
 await serverDB.insert(table).values([
-  { ...data1, createdAt: oldDate },
-  { ...data2, createdAt: newDate },
+  { ...data1, createdAt: new Date('2024-01-01T10:00:00Z') },
+  { ...data2, createdAt: new Date('2024-01-02T10:00:00Z') },
 ]);
-
-// ❌ Don't rely on insert order
-await serverDB.insert(table).values([data1, data2]); // Unpredictable
 ```
+
+## Checking coverage of one file
+
+```bash
+# Per-file coverage; read the "Uncovered Line #s" column to find gaps
+cd packages/database
+bunx vitest run --coverage --silent='passed-only' '[test-file]' 2>&1 | grep '[sourceFile].ts'
+```
+
+## Before finishing
+
+1. Tests pass: `bunx vitest run --silent='passed-only' '[file]'`
+2. Types pass: `bun run type-check` (vitest uses esbuild and does **not**
+   type-check — a green test run can still have type errors).
@@ -1,6 +1,6 @@
 ---
 name: trpc-router
-description: 'TRPC router development guide. Use when creating or modifying src/server/routers, adding procedures, or implementing server-side API endpoints.'
+description: 'TRPC router development guide. Use when creating or modifying apps/server/src/routers, adding procedures, or implementing server-side API endpoints.'
 user-invocable: false
 ---

@@ -8,9 +8,9 @@ user-invocable: false

 ## File Location

- Routers: `src/server/routers/lambda/<domain>.ts`
- Helpers: `src/server/routers/lambda/_helpers/`
- Schemas: `src/server/routers/lambda/_schema/`
+- Routers: `apps/server/src/routers/lambda/<domain>.ts`
+- Helpers: `apps/server/src/routers/lambda/_helpers/`
+- Schemas: `apps/server/src/routers/lambda/_schema/`

 ## Router Structure

@@ -186,4 +186,4 @@ QSTASH_URL=https://custom-qstash.com
 - [Upstash Workflow Documentation](https://upstash.com/docs/workflow)
 - [QStash Documentation](https://upstash.com/docs/qstash)
 - [Example Workflows in Codebase](<../../src/app/(backend)/api/workflows/>)
- [Workflow Classes](../../src/server/workflows/)
+- [Workflow Classes](../../apps/server/src/workflows/)
@@ -177,7 +177,7 @@ This allows cloud to override specific modules while using lobehub defaults.
 Place workflow class in cloud:

 ```text
-lobehub-cloud/src/server/workflows/featureName/index.ts
+lobehub-cloud/apps/server/src/workflows/featureName/index.ts
 ```

 ### Shared Workflows
@@ -185,7 +185,7 @@ lobehub-cloud/src/server/workflows/featureName/index.ts
 Place workflow class in lobehub, re-export in cloud if needed:

 ```text
-lobehub/src/server/workflows/featureName/index.ts
+lobehub/apps/server/src/workflows/featureName/index.ts
 ```

 ---
@@ -294,8 +294,8 @@ export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/*/route';
 **Step 4**: Move workflow class to lobehub

 ```bash
-mv lobehub-cloud/src/server/workflows/feature \
-  lobehub/src/server/workflows/
+mv lobehub-cloud/apps/server/src/workflows/feature \
+  lobehub/apps/server/src/workflows/
 ```

 **Step 5**: Update cloud imports
@@ -305,7 +305,7 @@ mv lobehub-cloud/src/server/workflows/feature \
 import { Workflow } from '@/server/workflows/feature';

 // To
-import { Workflow } from 'lobehub/src/server/workflows/feature';
+import { Workflow } from 'lobehub/apps/server/src/workflows/feature';
 ```

 ---
@@ -326,7 +326,7 @@ lobehub-cloud/
 │   ├── process-users/route.ts
 │   ├── paginate-users/route.ts
 │   └── generate-user/route.ts
-└── src/server/workflows/welcomePlaceholder/
+└── apps/server/src/workflows/welcomePlaceholder/
    └── index.ts
 ```

@@ -4,7 +4,7 @@ Full code templates for the 3-layer architecture. Read this when actually writin

 ## Table of Contents

-1. [Workflow Class](#workflow-class) — `src/server/workflows/{workflowName}/index.ts`
+1. [Workflow Class](#workflow-class) — `apps/server/src/workflows/{workflowName}/index.ts`
 2. [Layer 1: Entry Point](#layer-1-entry-point-process-) — `process-*` route
 3. [Layer 2: Pagination](#layer-2-pagination-paginate-) — `paginate-*` route
 4. [Layer 3: Execution](#layer-3-execution-execute--generate-) — `execute-*` / `generate-*` route
@@ -13,7 +13,7 @@ Full code templates for the 3-layer architecture. Read this when actually writin

 ## Workflow Class

-**Location:** `src/server/workflows/{workflowName}/index.ts`
+**Location:** `apps/server/src/workflows/{workflowName}/index.ts`

 ```typescript
 import { Client } from '@upstash/workflow';
@@ -223,6 +223,29 @@ OPENAI_API_KEY=sk-xxxxxxxxx
 # The LobeChat agents market index url
 # AGENTS_INDEX_URL=https://chat-agents.lobehub.com

+# #######################################
+# ######### Cloud Sandbox Service #######
+# #######################################
+
+# Sandbox provider for built-in code execution, shell, file operations, and export.
+# Supported values: market, onlyboxes
+# SANDBOX_PROVIDER=market
+
+# Required when SANDBOX_PROVIDER=onlyboxes. Base URL of the Onlyboxes console API, without /api/v1.
+# ONLYBOXES_BASE_URL=https://onlyboxes.example.com
+
+# Required when SANDBOX_PROVIDER=onlyboxes. Must match Onlyboxes CONSOLE_JIT_SIGNING_KEY.
+# ONLYBOXES_JIT_SIGNING_KEY=onlyboxes-jit-signing-secret
+
+# Optional JIT token issuer. Defaults to APP_URL.
+# ONLYBOXES_JIT_ISSUER=https://lobehub.example.com
+
+# Optional JIT token TTL in seconds.
+# ONLYBOXES_JIT_TTL_SEC=1800
+
+# Optional terminal session lease in seconds for the Onlyboxes provider.
+# ONLYBOXES_LEASE_TTL_SEC=900
+
 # #######################################
 # ########### Plugin Service ############
 # #######################################
@@ -5,6 +5,18 @@ inputs:
  node-version:
    description: Node.js version
    required: true
+  cloud-repository:
+    description: Cloud repository to overlay for commercial desktop builds
+    required: false
+    default: lobehub/lobehub-cloud
+  cloud-ref:
+    description: Optional Cloud repository branch or tag to check out (passed to git clone --branch, so commit SHAs are not supported)
+    required: false
+    default: ''
+  cloud-token:
+    description: GitHub token with permission to read the Cloud repository
+    required: false
+    default: ''

 runs:
  using: composite
@@ -14,9 +26,77 @@ runs:
      with:
        node-version: ${{ inputs.node-version }}

+    - name: Overlay Cloud repository for desktop build
+      if: inputs.cloud-token != ''
+      shell: bash
+      env:
+        CLOUD_CHECKOUT: ${{ runner.temp }}/lobehub-cloud
+        CLOUD_REF: ${{ inputs.cloud-ref }}
+        CLOUD_REPOSITORY: ${{ inputs.cloud-repository }}
+        CLOUD_ROOT: ${{ github.workspace }}/..
+        CLOUD_TOKEN: ${{ inputs.cloud-token }}
+      run: |
+        set -euo pipefail
+
+        cloud_root="$(cd "$GITHUB_WORKSPACE/.." && pwd)"
+        cloud_checkout="$RUNNER_TEMP/lobehub-cloud"
+
+        rm -rf "$cloud_checkout"
+
+        clone_args=(--depth 1)
+        if [ -n "$CLOUD_REF" ]; then
+          clone_args+=(--branch "$CLOUD_REF")
+        fi
+
+        git clone "${clone_args[@]}" "https://x-access-token:${CLOUD_TOKEN}@github.com/${CLOUD_REPOSITORY}.git" "$cloud_checkout"
+
+        node <<'NODE'
+        const fs = require('node:fs');
+        const path = require('node:path');
+
+        const source = process.env.CLOUD_CHECKOUT;
+        const target = process.env.CLOUD_ROOT;
+        const skip = new Set(['.git', 'lobehub', 'node_modules']);
+
+        const copy = (from, to) => {
+          const stat = fs.lstatSync(from);
+          if (stat.isSymbolicLink()) {
+            const link = fs.readlinkSync(from);
+            fs.rmSync(to, { force: true, recursive: true });
+            fs.symlinkSync(link, to);
+            return;
+          }
+
+          if (stat.isDirectory()) {
+            fs.mkdirSync(to, { recursive: true });
+            for (const entry of fs.readdirSync(from)) {
+              if (skip.has(entry)) continue;
+              copy(path.join(from, entry), path.join(to, entry));
+            }
+            return;
+          }
+
+          fs.mkdirSync(path.dirname(to), { recursive: true });
+          fs.copyFileSync(from, to);
+        };
+
+        for (const entry of fs.readdirSync(source)) {
+          if (skip.has(entry)) continue;
+          copy(path.join(source, entry), path.join(target, entry));
+        }
+        NODE
+
+        echo "CLOUD_DESKTOP=1" >> "$GITHUB_ENV"
+        echo "✅ Cloud repository overlaid at $cloud_root"
+
    - name: Install dependencies
      shell: bash
-      run: pnpm install --node-linker=hoisted
+      run: |
+        set -euo pipefail
+        if [ "${CLOUD_DESKTOP:-}" = "1" ]; then
+          cd ..
+        fi
+        pnpm install --node-linker=hoisted

    # 移除国内 electron 镜像配置，GitHub Actions 使用官方源更快
    - name: Remove China electron mirror from .npmrc
@@ -31,4 +111,11 @@ runs:

    - name: Install deps on Desktop
      shell: bash
-      run: npm run install-isolated --prefix=./apps/desktop
+      run: |
+        set -euo pipefail
+        if [ "${CLOUD_DESKTOP:-}" = "1" ]; then
+          cd ..
+          npm run install-isolated --prefix=./lobehub/apps/desktop
+        else
+          npm run install-isolated --prefix=./apps/desktop
+        fi
@@ -30,7 +30,7 @@ jobs:
            This issue is closed, If you have any questions, you can comment and reply.
      - name: Checkout repository
        if: github.event_name == 'pull_request_target' && github.event.pull_request.merged == true
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Check if PR author is maintainer
        if: github.event.pull_request.merged == true
@@ -104,6 +104,7 @@ jobs:
      - name: Setup build environment
        uses: ./.github/actions/desktop-build-setup
        with:
+          cloud-token: ${{ secrets.LOBEHUB_CLOUD_TOKEN }}
          node-version: ${{ env.NODE_VERSION }}

      - name: Set package version
@@ -172,6 +173,7 @@ jobs:
      - name: Setup build environment
        uses: ./.github/actions/desktop-build-setup
        with:
+          cloud-token: ${{ secrets.LOBEHUB_CLOUD_TOKEN }}
          node-version: ${{ env.NODE_VERSION }}

      - name: Set package version
@@ -216,6 +218,7 @@ jobs:
      - name: Setup build environment
        uses: ./.github/actions/desktop-build-setup
        with:
+          cloud-token: ${{ secrets.LOBEHUB_CLOUD_TOKEN }}
          node-version: ${{ env.NODE_VERSION }}

      - name: Set package version
@@ -54,7 +54,7 @@ jobs:
      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
-          node-version: 24.11.1
+          node-version: 24.16.0
          package-manager-cache: false

      # 主要逻辑：确定构建版本号
@@ -92,6 +92,7 @@ jobs:
      - name: Setup build environment
        uses: ./.github/actions/desktop-build-setup
        with:
+          cloud-token: ${{ secrets.LOBEHUB_CLOUD_TOKEN }}
          node-version: 24.11.1

      # 设置 package.json 的版本号
@@ -87,6 +87,7 @@ jobs:
      - name: Setup build environment
        uses: ./.github/actions/desktop-build-setup
        with:
+          cloud-token: ${{ secrets.LOBEHUB_CLOUD_TOKEN }}
          node-version: ${{ env.NODE_VERSION }}

      - name: Set package version
@@ -223,6 +223,7 @@ jobs:
      - name: Setup build environment
        uses: ./.github/actions/desktop-build-setup
        with:
+          cloud-token: ${{ secrets.LOBEHUB_CLOUD_TOKEN }}
          node-version: ${{ env.NODE_VERSION }}

      - name: Set package version
@@ -409,7 +410,7 @@ jobs:
      - uses: actions/checkout@v6

      - name: Delete old canary GitHub releases
-        uses: actions/github-script@v7
+        uses: actions/github-script@v8
        with:
          script: |
            const { data: releases } = await github.rest.repos.listReleases({
@@ -180,6 +180,7 @@ jobs:
      - name: Setup build environment
        uses: ./.github/actions/desktop-build-setup
        with:
+          cloud-token: ${{ secrets.LOBEHUB_CLOUD_TOKEN }}
          node-version: ${{ env.NODE_VERSION }}

      - name: Set package version
@@ -28,7 +28,7 @@ jobs:
      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
-          node-version: 24.11.1
+          node-version: 24.16.0

      - name: Setup pnpm
        uses: pnpm/action-setup@v4
@@ -51,7 +51,7 @@ jobs:
      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
-          node-version: 24.11.1
+          node-version: 24.16.0
          registry-url: https://registry.npmjs.org

      - name: Setup pnpm
@@ -32,7 +32,7 @@ jobs:
    runs-on: ubuntu-latest
    name: Test Packages
    env:
-      PACKAGES: '@lobechat/file-loaders @lobechat/prompts @lobechat/model-runtime @lobechat/web-crawler @lobechat/electron-server-ipc @lobechat/utils @lobechat/python-interpreter @lobechat/context-engine @lobechat/agent-runtime @lobechat/conversation-flow @lobechat/ssrf-safe-fetch @lobechat/memory-user-memory @lobechat/types @lobechat/builtin-tool-lobe-agent model-bank @lobechat/agent-gateway-client @lobechat/agent-manager-runtime @lobechat/device-gateway-client @lobechat/device-identity @lobechat/eval-dataset-parser @lobechat/eval-rubric @lobechat/fetch-sse @lobechat/heterogeneous-agents'
+      PACKAGES: '@lobechat/file-loaders @lobechat/prompts @lobechat/model-runtime @lobechat/web-crawler @lobechat/electron-server-ipc @lobechat/utils @lobechat/context-engine @lobechat/agent-runtime @lobechat/conversation-flow @lobechat/ssrf-safe-fetch @lobechat/memory-user-memory @lobechat/types @lobechat/trpc @lobechat/app-config @lobechat/locales @lobechat/env @lobechat/builtin-tool-lobe-agent model-bank @lobechat/agent-gateway-client @lobechat/agent-manager-runtime @lobechat/device-gateway-client @lobechat/device-identity @lobechat/eval-dataset-parser @lobechat/eval-rubric @lobechat/fetch-sse @lobechat/heterogeneous-agents'

    steps:
      - name: Checkout
@@ -90,11 +90,23 @@ jobs:
          for package in $PACKAGES; do
            dir="${package#@lobechat/}"
            if [ -f "./packages/$dir/coverage/lcov.info" ]; then
-              echo "Uploading coverage for $dir..."
+              flag="packages/$dir"
+
+              case "$dir" in
+                builtin-tool-*)
+                  flag="builtin-tools"
+                  ;;
+                locales|env|device-gateway-client)
+                  echo "Skipping Codecov upload for $dir."
+                  continue
+                  ;;
+              esac
+
+              echo "Uploading coverage for $dir as $flag..."
              ./codecov upload-coverage \
                $COMMON_ARGS \
                --file ./packages/$dir/coverage/lcov.info \
-                --flag packages/$dir \
+                --flag "$flag" \
                --disable-search
            fi
          done
@@ -105,8 +117,8 @@ jobs:
    if: needs.check-duplicate-run.outputs.should_skip != 'true'
    strategy:
      matrix:
-        shard: [1, 2, 3]
-    name: Test App (shard ${{ matrix.shard }}/3)
+        shard: [1, 2]
+    name: Test App (shard ${{ matrix.shard }}/2)
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
@@ -126,7 +138,7 @@ jobs:
        run: pnpm install

      - name: Run tests
-        run: bunx vitest --coverage --silent='passed-only' --reporter=default --reporter=blob --shard=${{ matrix.shard }}/3
+        run: bunx vitest --coverage --silent='passed-only' --reporter=default --reporter=blob --shard=${{ matrix.shard }}/2 --exclude '**/apps/server/**'

      - name: Upload blob report
        if: ${{ !cancelled() }}
@@ -219,6 +231,40 @@ jobs:
          files: ./apps/desktop/coverage/lcov.info
          flags: desktop

+  test-server:
+    needs: check-duplicate-run
+    if: needs.check-duplicate-run.outputs.should_skip != 'true'
+    name: Test Server
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        env:
+          REF_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          REPOSITORY: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name || github.repository }}
+        run: |
+          git init .
+          git remote add origin "https://github.com/${REPOSITORY}.git"
+          git fetch --no-tags --depth=1 origin "${REF_SHA}"
+          git checkout --force FETCH_HEAD
+
+      - name: Setup environment
+        uses: ./.github/actions/setup-env
+
+      - name: Install deps
+        run: pnpm install
+
+      - name: Test Server Coverage
+        run: bunx vitest --coverage --silent='passed-only' --reporter=default --coverage.reportsDirectory=./apps/server/coverage --dir apps/server
+
+      - name: Upload Server coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./apps/server/coverage/lcov.info
+          flags: server
+
  test-databsae:
    needs: check-duplicate-run
    if: needs.check-duplicate-run.outputs.should_skip != 'true'
@@ -19,7 +19,7 @@ lobehub/
 ├── apps/
 │   ├── desktop/            # Electron desktop app
 │   ├── cli/                # LobeHub CLI
-│   └── device-gateway/     # Device gateway service
+│   └── server/             # Server service
 ├── packages/               # Shared packages (@lobechat/*)
 │   ├── database/           # Database schemas, models, repositories
 │   ├── agent-runtime/      # Agent runtime
@@ -125,8 +125,9 @@ bun run type-check
 ### i18n

 - Add keys to a namespace file under `src/locales/default/` (e.g. `agent.ts`, `auth.ts`)
- For dev preview: translate `locales/zh-CN/` and `locales/en-US/`
- `pnpm i18n` is slow; run it manually when locale keys need updating (e.g. before opening a PR).
+- Ship en-US and zh-CN by hand in the same PR: write the English source in `src/locales/default/*.ts` and mirror it to `locales/en-US/`; hand-translate `locales/zh-CN/`. Leave all other locales to CI.
+- Don't run `pnpm i18n` manually by default — a daily CI workflow (`auto-i18n.yml`) runs it and opens an automated translation PR for any missing keys.
+- Run `pnpm i18n` manually only when your branch needs the translated locales immediately, instead of waiting for the daily job (slow; requires `OPENAI_API_KEY`). Note it only fills keys missing from other locales — value-only edits never need it.

 ### Code Style

@@ -210,6 +210,14 @@ ENV NEXT_PUBLIC_S3_DOMAIN="" \
    S3_ENABLE_PATH_STYLE="" \
    S3_SET_ACL=""

+# Cloud Sandbox
+ENV SANDBOX_PROVIDER="" \
+    ONLYBOXES_BASE_URL="" \
+    ONLYBOXES_JIT_ISSUER="" \
+    ONLYBOXES_JIT_SIGNING_KEY="" \
+    ONLYBOXES_JIT_TTL_SEC="" \
+    ONLYBOXES_LEASE_TTL_SEC=""
+
 # Model Variables
 ENV \
    # AI21
@@ -1,6 +1,6 @@
 .\" Code generated by `npm run man:generate`; DO NOT EDIT.
 .\" Manual command details come from the Commander command tree.
-.TH LH 1 "" "@lobehub/cli 0.0.24" "User Commands"
+.TH LH 1 "" "@lobehub/cli 0.0.29" "User Commands"
 .SH NAME
 lh \- LobeHub CLI \- manage and connect to LobeHub services
 .SH SYNOPSIS
@@ -113,6 +113,9 @@ Manage plugins
 .B user
 Manage user account and settings
 .TP
+.B verify
+Manage the Agent Run delivery checker (criteria, rubrics, plans, results)
+.TP
 .B whoami
 Display current user information
 .TP
@@ -1,6 +1,6 @@
 {
  "name": "@lobehub/cli",
-  "version": "0.0.24",
+  "version": "0.0.29",
  "type": "module",
  "bin": {
    "lh": "./dist/index.js",
@@ -35,7 +35,7 @@
    "@lobechat/local-file-shell": "workspace:*",
    "@lobechat/tool-runtime": "workspace:*",
    "@trpc/client": "^11.8.1",
-    "@types/node": "^22.13.5",
+    "@types/node": "^24.13.2",
    "@types/ws": "^8.18.1",
    "commander": "^13.1.0",
    "dayjs": "^1.11.19",
@@ -4,6 +4,9 @@ packages:
  - '../../packages/device-identity'
  - '../../packages/heterogeneous-agents'
  - '../../packages/local-file-shell'
+  - '../../packages/tool-runtime'
+  - '../../packages/prompts'
+  - '../../packages/const'
  - '../../packages/types'
  - '../../packages/model-bank'
  - '../../packages/business/const'
@@ -347,22 +347,33 @@ export function registerAgentCommand(program: Command) {
        const { serverUrl, headers, token, tokenType } = await getAgentStreamAuthInfo();
        const agentGatewayUrl = options.sse ? undefined : resolveAgentGatewayUrl();

-        if (agentGatewayUrl) {
-          await streamAgentEventsViaWebSocket({
-            gatewayUrl: agentGatewayUrl,
-            json: options.json,
-            operationId,
-            serverUrl,
-            token,
-            tokenType,
-            verbose: options.verbose,
-          });
-        } else {
-          const streamUrl = `${serverUrl}/api/agent/stream?operationId=${encodeURIComponent(operationId)}`;
-          await streamAgentEvents(streamUrl, headers, {
-            json: options.json,
-            verbose: options.verbose,
-          });
+        try {
+          if (agentGatewayUrl) {
+            await streamAgentEventsViaWebSocket({
+              gatewayUrl: agentGatewayUrl,
+              json: options.json,
+              operationId,
+              serverUrl,
+              token,
+              tokenType,
+              verbose: options.verbose,
+            });
+          } else {
+            const streamUrl = `${serverUrl}/api/agent/stream?operationId=${encodeURIComponent(operationId)}`;
+            await streamAgentEvents(streamUrl, headers, {
+              json: options.json,
+              verbose: options.verbose,
+            });
+          }
+        } catch (error) {
+          // Any live-stream (gateway WS / SSE) error is treated as a dropped connection:
+          // the run is assumed to still be executing server-side. `--json` mode rethrows;
+          // otherwise fall back to polling the run status until it reaches a terminal state.
+          if (options.json) throw error;
+          log.warn(
+            `Live stream unavailable (${(error as Error).message}). Polling run status every 10s…`,
+          );
+          await pollAgentRunStatus(client, operationId);
        }
      },
    );
@@ -626,3 +637,56 @@ function colorStatus(status: string): string {
    }
  }
 }
+
+const TERMINAL_RUN_STATUSES = new Set([
+  'completed',
+  'done',
+  'success',
+  'failed',
+  'error',
+  'cancelled',
+  'canceled',
+  'aborted',
+]);
+
+/**
+ * Fallback when the live stream (gateway WebSocket / SSE) drops before the run
+ * finishes: query the run status every 10s (logging each change) until it is in
+ * `TERMINAL_RUN_STATUSES` or the query returns nothing (run no longer tracked).
+ * No overall timeout; a failed status query logs the error and exits with code 1.
+ */
+async function pollAgentRunStatus(
+  client: Awaited<ReturnType<typeof getTrpcClient>>,
+  operationId: string,
+): Promise<void> {
+  const POLL_MS = 10_000;
+  let lastStatus = '';
+  for (let i = 0; ; i++) {
+    if (i > 0) await new Promise((resolve) => setTimeout(resolve, POLL_MS));
+
+    let r: any;
+    try {
+      r = await client.aiAgent.getOperationStatus.query({ operationId } as any);
+    } catch (error) {
+      log.error(`Status poll failed: ${(error as Error).message}`);
+      process.exit(1);
+    }
+
+    if (!r) {
+      log.info('Run is no longer tracked — finished (or expired).');
+      return;
+    }
+
+    const status = r.status || r.state || 'unknown';
+    if (status !== lastStatus) {
+      lastStatus = status;
+      const steps = r.stepCount !== undefined ? ` · ${r.stepCount} step(s)` : '';
+      log.info(`Run status: ${colorStatus(status)}${steps}`);
+    }
+
+    if (TERMINAL_RUN_STATUSES.has(status)) {
+      if (r.error) log.error(`Run error: ${r.error}`);
+      return;
+    }
+  }
+}
@@ -3,6 +3,7 @@ import os from 'node:os';
 import path from 'node:path';

 import type {
+  AgentRunRequestMessage,
  DeviceSystemInfo,
  SystemInfoRequestMessage,
  ToolCallRequestMessage,
@@ -25,6 +26,7 @@ import {
  stopDaemon,
  writeStatus,
 } from '../daemon/manager';
+import { spawnHeteroAgentRun } from '../device/agentRun';
 import { registerDevice, resolveDeviceIdentity } from '../device/register';
 import { loadOrCreateConnectionId, loadSettings, normalizeUrl, saveSettings } from '../settings';
 import { executeToolCall } from '../tools';
@@ -286,6 +288,39 @@ async function runConnect(options: ConnectOptions, isDaemonChild: boolean) {
    });
  });

+  // Handle gateway-dispatched agent runs (heterogeneous agents, e.g. Claude
+  // Code). Mirrors the desktop app: spawn `lh hetero exec`, which owns the full
+  // execution + server-ingest pipeline. Ack with the spawn outcome — `accepted`
+  // once the child starts, `rejected` if it fails to spawn (e.g. bad cwd) — so
+  // a failed dispatch surfaces as an error instead of a stuck assistant message.
+  client.on('agent_run_request', async (request: AgentRunRequestMessage) => {
+    info(
+      `Received agent_run_request: operationId=${request.operationId} type=${request.agentType}`,
+    );
+    try {
+      const ack = await spawnHeteroAgentRun(
+        {
+          agentType: request.agentType,
+          cwd: request.cwd,
+          imageList: request.imageList,
+          jwt: request.jwt,
+          operationId: request.operationId,
+          prompt: request.prompt,
+          resumeSessionId: request.resumeSessionId,
+          serverUrl: auth.serverUrl,
+          systemContext: request.systemContext,
+          topicId: request.topicId,
+        },
+        { error, info },
+      );
+      client.sendAgentRunAck({ operationId: request.operationId, ...ack });
+    } catch (err) {
+      const reason = err instanceof Error ? err.message : String(err);
+      error(`agent_run_request failed: ${reason}`);
+      client.sendAgentRunAck({ operationId: request.operationId, reason, status: 'rejected' });
+    }
+  });
+
  client.on('connected', () => {
    updateStatus('connected');
  });
@@ -1,3 +1,6 @@
+import { mkdtemp, readdir, readFile } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
 import { PassThrough } from 'node:stream';

 import { Command } from 'commander';
@@ -645,4 +648,224 @@ describe('hetero exec command', () => {
      'finish',
    ]);
  });
+
+  it('resets the per-message text accumulator at message boundaries (no cross-message duplication)', async () => {
+    // The `replace` snapshot accumulator must not span
+    // message boundaries. Two assistant messages separated by a
+    // stream_end/stream_start boundary must each snapshot only their OWN
+    // text — otherwise the second message re-emits the first's text verbatim.
+    const textSnapshots: string[] = [];
+    mockHeteroIngestMutate.mockImplementation(async ({ events }: any) => {
+      for (const e of events) {
+        if (e.type === 'stream_chunk' && e.data?.chunkType === 'text') {
+          textSnapshots.push(e.data.content);
+        }
+      }
+      return { ack: true };
+    });
+
+    mockSpawnAgent.mockReturnValue(
+      createFakeHandle({
+        events: [
+          {
+            data: { chunkType: 'text', content: 'first message' },
+            operationId: 'op-server',
+            stepIndex: 0,
+            timestamp: 1,
+            type: 'stream_chunk',
+          },
+          { data: {}, operationId: 'op-server', stepIndex: 0, timestamp: 2, type: 'stream_end' },
+          {
+            data: { newStep: true, provider: 'claude-code' },
+            operationId: 'op-server',
+            stepIndex: 1,
+            timestamp: 3,
+            type: 'stream_start',
+          },
+          {
+            data: { chunkType: 'text', content: 'second message' },
+            operationId: 'op-server',
+            stepIndex: 1,
+            timestamp: 4,
+            type: 'stream_chunk',
+          },
+          {
+            data: { reason: 'success' },
+            operationId: 'op-server',
+            stepIndex: 1,
+            timestamp: 5,
+            type: 'agent_runtime_end',
+          },
+        ],
+        exitCode: 0,
+      }),
+    );
+
+    await runCmd([
+      'hetero',
+      'exec',
+      '--type',
+      'claude-code',
+      '--prompt',
+      'hi',
+      '--topic',
+      'topic-1',
+      '--operation-id',
+      'op-server',
+      '--render',
+      'none',
+    ]);
+
+    // Second snapshot carries ONLY the second message — not "first messagesecond message".
+    expect(textSnapshots).toEqual(['first message', 'second message']);
+  });
+
+  it('forwards subagent text raw (no snapshot coalescing, no cross-scope pollution of main text)', async () => {
+    // Subagent text is emitted as ONE full block per turn and the server's
+    // subagent path *appends* it (no snapshot semantics). It must therefore
+    // bypass the main-agent `replace`-snapshot coalescing: folding it into the
+    // shared accumulator would (a) splice main text into the subagent message
+    // and (b) make the server append a replace-snapshot → duplicated content.
+    const ingested: any[] = [];
+    mockHeteroIngestMutate.mockImplementation(async ({ events }: any) => {
+      for (const e of events) ingested.push(e);
+      return { ack: true };
+    });
+
+    const subagent = { parentToolCallId: 'task-1', subagentMessageId: 'msg-sub-1' };
+
+    mockSpawnAgent.mockReturnValue(
+      createFakeHandle({
+        events: [
+          // Main-agent streamed text delta (coalesced).
+          {
+            data: { chunkType: 'text', content: 'hello ' },
+            operationId: 'op-server',
+            stepIndex: 0,
+            timestamp: 1,
+            type: 'stream_chunk',
+          },
+          // Subagent full-block text — must pass through untouched.
+          {
+            data: { chunkType: 'text', content: 'I checked the files.', subagent },
+            operationId: 'op-server',
+            stepIndex: 0,
+            timestamp: 2,
+            type: 'stream_chunk',
+          },
+          {
+            data: {
+              chunkType: 'tools_calling',
+              toolsCalling: [
+                {
+                  apiName: 'Bash',
+                  arguments: '{"cmd":"ls"}',
+                  id: 'tc-1',
+                  identifier: 'bash',
+                  type: 'default',
+                },
+              ],
+            },
+            operationId: 'op-server',
+            stepIndex: 1,
+            timestamp: 3,
+            type: 'stream_chunk',
+          },
+          {
+            data: { reason: 'success' },
+            operationId: 'op-server',
+            stepIndex: 1,
+            timestamp: 4,
+            type: 'agent_runtime_end',
+          },
+        ],
+        exitCode: 0,
+      }),
+    );
+
+    await runCmd([
+      'hetero',
+      'exec',
+      '--type',
+      'claude-code',
+      '--prompt',
+      'hi',
+      '--topic',
+      'topic-1',
+      '--operation-id',
+      'op-server',
+      '--render',
+      'none',
+    ]);
+
+    const textEvents = ingested.filter(
+      (e) => e.type === 'stream_chunk' && e.data?.chunkType === 'text',
+    );
+
+    // Subagent text forwarded verbatim: keeps its subagent tag, original
+    // content, and is NOT converted into a replace snapshot.
+    const subagentText = textEvents.find((e) => e.data?.subagent);
+    expect(subagentText).toBeDefined();
+    expect(subagentText.data.content).toBe('I checked the files.');
+    expect(subagentText.data.snapshotMode).toBeUndefined();
+
+    // Main snapshot is untainted by the subagent block.
+    const mainText = textEvents.find((e) => !e.data?.subagent);
+    expect(mainText).toBeDefined();
+    expect(mainText.data.content).toBe('hello ');
+    expect(mainText.data.snapshotMode).toBe('replace');
+    expect(mainText.data.content).not.toContain('I checked');
+  });
+
+  it('--raw-dump writes a session folder with meta.json, wires onRawStdout, and tees stderr', async () => {
+    const root = await mkdtemp(path.join(tmpdir(), 'hetero-rawdump-'));
+
+    mockSpawnAgent.mockReturnValue(
+      createFakeHandle({
+        events: [
+          {
+            data: { chunkType: 'text', content: 'hi' },
+            operationId: 'op-raw',
+            stepIndex: 0,
+            timestamp: 1,
+            type: 'stream_chunk',
+          },
+        ],
+        exitCode: 0,
+        stderrChunks: ['warning: something happened\n'],
+      }),
+    );
+
+    await runCmd([
+      'hetero',
+      'exec',
+      '--type',
+      'claude-code',
+      '--prompt',
+      'hi',
+      '--operation-id',
+      'op-raw',
+      '--render',
+      'none',
+      '--raw-dump',
+      root,
+    ]);
+
+    // The raw stdout tee is handed to spawnAgent (the package captures the
+    // pre-adapter bytes — exercised in spawnAgent.test.ts).
+    expect(typeof mockSpawnAgent.mock.calls[0][0].onRawStdout).toBe('function');
+
+    // One session folder per exec, keyed by the operation id.
+    const sessions = await readdir(root);
+    expect(sessions).toHaveLength(1);
+    expect(sessions[0]).toContain('op-raw');
+    const sessionDir = path.join(root, sessions[0]!);
+
+    const meta = JSON.parse(await readFile(path.join(sessionDir, 'meta.json'), 'utf8'));
+    expect(meta).toMatchObject({ agentType: 'claude-code', operationId: 'op-raw' });
+
+    // stderr is teed to the attempt's log file.
+    const stderrDump = await readFile(path.join(sessionDir, 'attempt-1.stderr.log'), 'utf8');
+    expect(stderrDump).toContain('warning: something happened');
+  });
 });
@@ -1,6 +1,7 @@
 import { randomUUID } from 'node:crypto';
 import { once } from 'node:events';
-import { readFile } from 'node:fs/promises';
+import { createWriteStream } from 'node:fs';
+import { mkdir, readFile, writeFile } from 'node:fs/promises';
 import path from 'node:path';

 import type {
@@ -59,6 +60,12 @@ interface ExecOptions {
  inputJson?: string;
  operationId?: string;
  prompt?: string;
+  /**
+   * When set, persist the agent process's RAW stdout/stderr (pre-adapter
+   * stream-json) under `<rawDump>/<timestamp>-<operationId>/` for debugging.
+   * Independent of `--render` and the server ingest path.
+   */
+  rawDump?: string;
  /**
   * Output rendering mode.
   *   jsonl — emit each `AgentStreamEvent` as a JSONL line on stdout (default
@@ -217,10 +224,25 @@ class SerialServerIngester {
  push(event: AgentStreamEvent): void {
    if (this.fatalError) return;

+    // Text-snapshot coalescing is a MAIN-AGENT-ONLY transport optimization:
+    // it debounces the main agent's token-level text *deltas* into one
+    // `replace` snapshot to cut ingest calls. Subagent text is explicitly
+    // excluded (`!event.data?.subagent`) for two reasons:
+    //   1. Subagent text is emitted as ONE full block per turn (see
+    //      claudeCode adapter `handleSubagentAssistant` — "the full block IS
+    //      the only emission"), so there is nothing to coalesce.
+    //   2. `accumulatedText` is a single shared accumulator with no subagent
+    //      scope. Folding subagent blocks in would (a) splice main-agent text
+    //      into the subagent message via the shared buffer, and (b) emit a
+    //      `replace` snapshot that the server's subagent path *appends*
+    //      (`persistSubagentText` has no snapshot semantics) → duplicated /
+    //      cross-scope content. Forwarding the raw block straight through lets
+    //      the server append it exactly once, correctly.
    if (
      event.type === 'stream_chunk' &&
      event.data?.chunkType === 'text' &&
-      typeof event.data?.content === 'string'
+      typeof event.data?.content === 'string' &&
+      !event.data?.subagent
    ) {
      this.accumulatedText += event.data.content;
      this.pendingTextEvent = event;
@@ -233,6 +255,17 @@ class SerialServerIngester {
    }

    this.queuePendingTextSnapshot();
+    // `accumulatedText` is a PER-MESSAGE accumulator: it coalesces the text
+    // deltas of the current assistant message into one `replace` snapshot.
+    // A new message boundary (`stream_start` / `stream_end`, emitted by the
+    // adapter's `openMainMessage`) must reset it — otherwise it spans the
+    // whole run and every later message's snapshot re-emits all prior
+    // messages' text verbatim, which the server then persists into the new
+    // DB message: cross-message text duplication. Reset
+    // AFTER flushing the just-ended message's pending snapshot above.
+    if (event.type === 'stream_start' || event.type === 'stream_end') {
+      this.accumulatedText = '';
+    }
    this.enqueue(async () => {
      await this.sink.ingest([event]);
    });
@@ -280,6 +313,77 @@ class SerialServerIngester {
  }
 }

+/**
+ * Handle for one spawn attempt's pair of raw dump files, as returned by
+ * `RawStreamDump.openAttempt`.
+ */
+interface RawStreamDumpAttempt {
+  /** Flush + close both file streams. Resolves once the bytes are on disk. */
+  close: () => Promise<void>;
+  /** Append a raw stderr chunk to this attempt's stderr dump file. */
+  writeStderr: (chunk: Buffer) => void;
+  /** Append a raw stdout chunk to this attempt's stdout dump file. */
+  writeStdout: (chunk: Buffer) => void;
+}
+
+/**
+ * Persists the agent process's RAW stdout/stderr — the untouched stream-json,
+ * BEFORE the adapter — to disk for post-hoc debugging. The adapted/ingested
+ * view can't tell a CC-side empty `tool_result` apart from an adapter
+ * extraction bug; the raw dump can.
+ *
+ * Enabled via `lh hetero exec --raw-dump <dir>`. Each exec gets its own
+ * `<dir>/<timestamp>-<operationId>/` session folder; each spawn attempt (the
+ * resume retry is a second attempt) writes `<label>.stdout.jsonl` /
+ * `<label>.stderr.log`. Fully best-effort: any dump failure is logged and
+ * swallowed so it never affects the run or its exit code.
+ *
+ * Future: the server-side sandbox runner (`spawnHeteroSandbox`) and the
+ * desktop device path (`spawnLhHeteroExec`) can pass `--raw-dump` pointing at
+ * a collectable location to capture remote runs the same way.
+ */
+class RawStreamDump {
+  private constructor(private readonly dir: string) {}
+
+  /**
+   * Create the per-exec session folder and write its `meta.json`.
+   *
+   * @param root - Base directory given to `--raw-dump`; resolved to an absolute path.
+   * @param operationId - Operation id, recorded verbatim in `meta.json` and used
+   *   (sanitised) as part of the session folder name.
+   * @param meta - Extra fields merged into `meta.json`.
+   * @returns The dump instance, or `undefined` when the folder or metadata could
+   *   not be created (the failure is logged as a warning, never thrown).
+   */
+  static async create(
+    root: string,
+    operationId: string,
+    meta: Record<string, unknown>,
+  ): Promise<RawStreamDump | undefined> {
+    try {
+      // Read the clock once so the folder name and `startedAt` always agree.
+      const startedAt = new Date().toISOString();
+      const safeTs = startedAt.replaceAll(/[.:]/g, '-');
+      // The operation id can be caller-supplied (`--operation-id`). Collapse
+      // anything that is not a portable file-name character so it stays a single
+      // path segment: no separators that could escape `root`, no characters
+      // that are invalid in file names on some platforms.
+      const safeOperationId = operationId.replaceAll(/[^\w.-]+/g, '_');
+      const dir = path.join(path.resolve(root), `${safeTs}-${safeOperationId}`);
+      await mkdir(dir, { recursive: true });
+      await writeFile(
+        path.join(dir, 'meta.json'),
+        `${JSON.stringify({ ...meta, operationId, startedAt }, null, 2)}\n`,
+      );
+      log.info(`Raw stream dump enabled → ${dir}`);
+      return new RawStreamDump(dir);
+    } catch (err) {
+      log.warn(
+        `Failed to initialize raw stream dump: ${err instanceof Error ? err.message : String(err)}`,
+      );
+      return undefined;
+    }
+  }
+
+  /**
+   * Open the stdout/stderr dump files for one spawn attempt.
+   *
+   * @param label - File-name prefix for this attempt; the files are
+   *   `<label>.stdout.jsonl` and `<label>.stderr.log` inside the session folder.
+   * @returns Writers for both streams plus a `close` that ends them.
+   */
+  openAttempt(label: string): RawStreamDumpAttempt {
+    const openStream = (fileName: string) => {
+      const stream = createWriteStream(path.join(this.dir, fileName));
+      // A failed dump write must never crash the run. The listener stays
+      // attached for the stream's lifetime so later errors are swallowed too,
+      // but only the first one is logged to avoid one warning per chunk.
+      let reported = false;
+      stream.on('error', (err) => {
+        if (reported) return;
+        reported = true;
+        log.warn(`Raw stream dump write failed (${fileName}): ${err.message}`);
+      });
+      return stream;
+    };
+
+    const stdout = openStream(`${label}.stdout.jsonl`);
+    const stderr = openStream(`${label}.stderr.log`);
+
+    // Resolve regardless of whether `end` reports an error: close is best-effort.
+    const endStream = (stream: ReturnType<typeof createWriteStream>) =>
+      new Promise<void>((resolve) => stream.end(() => resolve()));
+
+    return {
+      close: async () => {
+        await Promise.all([endStream(stdout), endStream(stderr)]);
+      },
+      writeStderr: (chunk: Buffer) => {
+        stderr.write(chunk);
+      },
+      writeStdout: (chunk: Buffer) => {
+        stdout.write(chunk);
+      },
+    };
+  }
+}
+
 const exec = async (options: ExecOptions): Promise<void> => {
  if (!SUPPORTED_AGENT_TYPES.has(options.type)) {
    log.error(
@@ -314,6 +418,17 @@ const exec = async (options: ExecOptions): Promise<void> => {

  const operationId = options.operationId || randomUUID();

+  // Optional raw stream dump (pre-adapter stdout/stderr) for debugging.
+  let rawDump: RawStreamDump | undefined;
+  if (options.rawDump) {
+    rawDump = await RawStreamDump.create(options.rawDump, operationId, {
+      agentType: options.type,
+      cwd: options.cwd || process.cwd(),
+      resume: options.resume ?? null,
+      topicId: options.topic ?? null,
+    });
+  }
+
  // Determine JSONL output mode.
  // Explicit --render flag always wins. Otherwise: emit JSONL in standalone
  // mode; suppress in server-ingest mode (sink handles the data path).
@@ -357,6 +472,7 @@ const exec = async (options: ExecOptions): Promise<void> => {
  const runOneAgent = async (
    spawnOpts: Parameters<typeof spawnAgent>[0],
    interceptResumeErrors: boolean,
+    runLabel: string,
  ): Promise<{
    code: number | null;
    ingestError: boolean;
@@ -365,12 +481,17 @@ const exec = async (options: ExecOptions): Promise<void> => {
    signal: NodeJS.Signals | null;
    stderrContent: string;
  }> => {
+    // One raw-dump file pair per spawn attempt (the resume retry is a second
+    // attempt). The stdout tee runs inside `spawnAgent` before the adapter.
+    const dumpAttempt = rawDump?.openAttempt(runLabel);
+
    // `spawnAgent` is async and can reject DURING image normalization — fetch
    // failures, missing local --image paths, decode errors.
    let handle: Awaited<ReturnType<typeof spawnAgent>>;
    try {
-      handle = await spawnAgent(spawnOpts);
+      handle = await spawnAgent({ ...spawnOpts, onRawStdout: dumpAttempt?.writeStdout });
    } catch (err) {
+      await dumpAttempt?.close();
      log.error('Failed to start agent:', err instanceof Error ? err.message : String(err));
      process.exit(1);
    }
@@ -387,6 +508,7 @@ const exec = async (options: ExecOptions): Promise<void> => {
      if (stderrContent.length < STDERR_CAP) {
        stderrContent += chunk.toString();
      }
+      dumpAttempt?.writeStderr(chunk);
    });
    handle.stderr.pipe(process.stderr);

@@ -460,6 +582,7 @@ const exec = async (options: ExecOptions): Promise<void> => {
          // best-effort
        }
      }
+      await dumpAttempt?.close();
      process.exit(1);
    } finally {
      process.off('SIGINT', onSigint);
@@ -468,6 +591,7 @@ const exec = async (options: ExecOptions): Promise<void> => {

    const { code, signal } = await handle.exit;
    await stderrEnded;
+    await dumpAttempt?.close();

    // Fallback stderr detection: CC may exit non-zero without emitting a
    // result event (e.g. it writes to stderr and quits immediately).
@@ -503,6 +627,7 @@ const exec = async (options: ExecOptions): Promise<void> => {
      resumeSessionId: options.resume,
    },
    interceptResume,
+    'attempt-1',
  );

  // ─── Auto-retry without --resume when the session cannot be used ─────────
@@ -531,6 +656,7 @@ const exec = async (options: ExecOptions): Promise<void> => {
        // No resumeSessionId — start fresh
      },
      false, // no need to intercept resume errors on a fresh run
+      'attempt-2-noresume',
    );
  }

@@ -618,5 +744,9 @@ export function registerHeteroCommand(program: Command) {
      '--render <mode>',
      'Output mode: jsonl (emit events as JSONL on stdout) | none (suppress stdout). Defaults to jsonl in standalone, none in server-ingest mode.',
    )
+    .option(
+      '--raw-dump <dir>',
+      'Persist the agent process RAW stdout/stderr (pre-adapter stream-json) under <dir>/<timestamp>-<operationId>/ for debugging. Each spawn attempt writes its own .stdout.jsonl / .stderr.log. Best-effort; never affects the run.',
+    )
    .action(exec);
 }
@@ -64,15 +64,18 @@ describe('skill command', () => {

  describe('list', () => {
    it('should display skills in table format', async () => {
-      mockTrpcClient.agentSkills.list.query.mockResolvedValue([
-        {
-          description: 'A skill',
-          id: 's1',
-          identifier: 'test-skill',
-          name: 'Test Skill',
-          source: 'user',
-        },
-      ]);
+      mockTrpcClient.agentSkills.list.query.mockResolvedValue({
+        data: [
+          {
+            description: 'A skill',
+            id: 's1',
+            identifier: 'test-skill',
+            name: 'Test Skill',
+            source: 'user',
+          },
+        ],
+        total: 1,
+      });

      const program = createProgram();
      await program.parseAsync(['node', 'test', 'skill', 'list']);
@@ -83,7 +86,7 @@ describe('skill command', () => {

    it('should output JSON when --json flag is used', async () => {
      const items = [{ id: 's1', name: 'Test' }];
-      mockTrpcClient.agentSkills.list.query.mockResolvedValue(items);
+      mockTrpcClient.agentSkills.list.query.mockResolvedValue({ data: items, total: items.length });

      const program = createProgram();
      await program.parseAsync(['node', 'test', 'skill', 'list', '--json']);
@@ -92,7 +95,7 @@ describe('skill command', () => {
    });

    it('should filter by source', async () => {
-      mockTrpcClient.agentSkills.list.query.mockResolvedValue([]);
+      mockTrpcClient.agentSkills.list.query.mockResolvedValue({ data: [], total: 0 });

      const program = createProgram();
      await program.parseAsync(['node', 'test', 'skill', 'list', '--source', 'builtin']);
@@ -111,7 +114,7 @@ describe('skill command', () => {
    });

    it('should show message when no skills found', async () => {
-      mockTrpcClient.agentSkills.list.query.mockResolvedValue([]);
+      mockTrpcClient.agentSkills.list.query.mockResolvedValue({ data: [], total: 0 });

      const program = createProgram();
      await program.parseAsync(['node', 'test', 'skill', 'list']);
@@ -211,9 +214,10 @@ describe('skill command', () => {

  describe('search', () => {
    it('should search skills', async () => {
-      mockTrpcClient.agentSkills.search.query.mockResolvedValue([
-        { description: 'A skill', id: 's1', name: 'Found Skill' },
-      ]);
+      mockTrpcClient.agentSkills.search.query.mockResolvedValue({
+        data: [{ description: 'A skill', id: 's1', name: 'Found Skill' }],
+        total: 1,
+      });

      const program = createProgram();
      await program.parseAsync(['node', 'test', 'skill', 'search', 'test']);
@@ -223,7 +227,7 @@ describe('skill command', () => {
    });

    it('should show message when no results', async () => {
-      mockTrpcClient.agentSkills.search.query.mockResolvedValue([]);
+      mockTrpcClient.agentSkills.search.query.mockResolvedValue({ data: [], total: 0 });

      const program = createProgram();
      await program.parseAsync(['node', 'test', 'skill', 'search', 'nothing']);
@@ -47,7 +47,7 @@ export function registerSkillCommand(program: Command) {
      if (options.source) input.source = options.source as 'builtin' | 'market' | 'user';

      const result = await client.agentSkills.list.query(input);
-      const items = Array.isArray(result) ? result : [];
+      const items = result?.data ?? [];

      if (options.json !== undefined) {
        const fields = typeof options.json === 'string' ? options.json : undefined;
@@ -206,7 +206,7 @@ export function registerSkillCommand(program: Command) {
    .action(async (query: string, options: { json?: string | boolean }) => {
      const client = await getTrpcClient();
      const result = await client.agentSkills.search.query({ query });
-      const items = Array.isArray(result) ? result : [];
+      const items = result?.data ?? [];

      if (options.json !== undefined) {
        const fields = typeof options.json === 'string' ? options.json : undefined;
@@ -0,0 +1,90 @@
+import { Command } from 'commander';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+import { registerVerifyCommand } from './verify';
+
+// Stand-in tRPC client exposing only the `verify` procedures these tests call.
+// Built with `vi.hoisted` so it can be referenced from the `vi.mock` factories below.
+const { mockTrpcClient } = vi.hoisted(() => ({
+  mockTrpcClient: {
+    verify: {
+      createRubric: { mutate: vi.fn() },
+      getRubric: { query: vi.fn() },
+      updateRubric: { mutate: vi.fn() },
+    },
+  },
+}));
+
+// Mock for the `getTrpcClient` factory; each test resolves it to `mockTrpcClient`.
+const { getTrpcClient: mockGetTrpcClient } = vi.hoisted(() => ({
+  getTrpcClient: vi.fn(),
+}));
+
+// Replace the real API client and silence the logger for the whole file.
+vi.mock('../api/client', () => ({ getTrpcClient: mockGetTrpcClient }));
+vi.mock('../utils/logger', () => ({
+  log: { debug: vi.fn(), error: vi.fn(), info: vi.fn(), warn: vi.fn() },
+  setVerbose: vi.fn(),
+}));
+
+// Covers how the `verify rubric` sub-commands map the `--max-repair-rounds`
+// flag onto the rubric `config` payload, and how `rubric view` prints it.
+describe('verify rubric config commands', () => {
+  let consoleSpy: ReturnType<typeof vi.spyOn>;
+
+  beforeEach(() => {
+    // Capture console output and start every test from freshly reset mocks.
+    consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
+    mockGetTrpcClient.mockResolvedValue(mockTrpcClient);
+    mockTrpcClient.verify.createRubric.mutate.mockReset().mockResolvedValue({ id: 'rub-1' });
+    mockTrpcClient.verify.updateRubric.mutate.mockReset().mockResolvedValue(undefined);
+    mockTrpcClient.verify.getRubric.query.mockReset();
+  });
+
+  afterEach(() => consoleSpy.mockRestore());
+
+  // Build a fresh commander program per invocation and run `lh verify <args>`.
+  const run = async (args: string[]) => {
+    const program = new Command();
+    program.exitOverride();
+    registerVerifyCommand(program);
+    await program.parseAsync(['node', 'lh', 'verify', ...args]);
+  };
+
+  it('passes maxRepairRounds config when creating a rubric', async () => {
+    await run(['rubric', 'create', '-t', 'Standard', '--max-repair-rounds', '3']);
+
+    // The string flag value is converted to a number before being sent.
+    expect(mockTrpcClient.verify.createRubric.mutate).toHaveBeenCalledWith({
+      config: { maxRepairRounds: 3 },
+      description: undefined,
+      title: 'Standard',
+    });
+  });
+
+  it('omits config when no max-repair-rounds flag is given', async () => {
+    await run(['rubric', 'create', '-t', 'Standard']);
+
+    expect(mockTrpcClient.verify.createRubric.mutate).toHaveBeenCalledWith({
+      config: undefined,
+      description: undefined,
+      title: 'Standard',
+    });
+  });
+
+  it('updates only the config when max-repair-rounds is passed', async () => {
+    // `0` is a meaningful value and must not be treated as "flag absent".
+    await run(['rubric', 'update', 'rub-1', '--max-repair-rounds', '0']);
+
+    expect(mockTrpcClient.verify.updateRubric.mutate).toHaveBeenCalledWith({
+      id: 'rub-1',
+      value: { config: { maxRepairRounds: 0 } },
+    });
+  });
+
+  it('views a rubric and prints its repair-round config', async () => {
+    mockTrpcClient.verify.getRubric.query.mockResolvedValue({
+      config: { maxRepairRounds: 4 },
+      description: 'desc',
+      id: 'rub-1',
+      title: 'Standard',
+    });
+
+    await run(['rubric', 'view', 'rub-1']);
+
+    expect(mockTrpcClient.verify.getRubric.query).toHaveBeenCalledWith({ id: 'rub-1' });
+    // Join the first argument of every console.log call into one searchable string.
+    const printed = consoleSpy.mock.calls.map((c) => String(c[0])).join('\n');
+    expect(printed).toContain('Standard');
+    expect(printed).toContain('4');
+  });
+});
@@ -0,0 +1,455 @@
+import type { Command } from 'commander';
+import pc from 'picocolors';
+
+import { getTrpcClient } from '../api/client';
+import { confirm, outputJson, printTable, timeAgo, truncate } from '../utils/format';
+import { log } from '../utils/logger';
+
+// ── Helpers ────────────────────────────────────────────────
+
+// String-literal unions for the enum-like values accepted on the command line.
+type VerifierType = 'agent' | 'llm' | 'program';
+type OnFail = 'auto_repair' | 'manual';
+type Decision = 'accepted' | 'overridden' | 'rejected';
+
+// Allowed values per union: used to validate input via `assertEnum` and to
+// render the `a|b|c` choices shown in option/command descriptions.
+const VERIFIER_TYPES: VerifierType[] = ['program', 'agent', 'llm'];
+const ON_FAIL: OnFail[] = ['manual', 'auto_repair'];
+const DECISIONS: Decision[] = ['accepted', 'rejected', 'overridden'];
+
+/**
+ * Parse the `--config` flag into a verifier config object.
+ *
+ * Logs an error and exits with code 1 when the value is not valid JSON, or is
+ * valid JSON that is not a plain object (a number, string, boolean, `null` or
+ * array), since the declared result is a key/value record.
+ *
+ * @param raw - Raw flag value; absent or empty means "no config".
+ * @returns The parsed object, or `undefined` when no value was given.
+ */
+function parseConfig(raw?: string): Record<string, unknown> | undefined {
+  if (!raw) return undefined;
+
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    log.error('--config must be valid JSON');
+    process.exit(1);
+  }
+
+  // `JSON.parse` happily accepts `3`, `"x"`, `null` and `[1]`; none of those is
+  // a config record, so reject them here instead of forwarding them.
+  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
+    log.error('--config must be a JSON object');
+    process.exit(1);
+  }
+
+  return parsed as Record<string, unknown>;
+}
+
+/**
+ * Guard an enum-like flag or argument: when a value is present but is not one
+ * of `allowed`, log the accepted choices and exit with code 1.
+ *
+ * @param value - Value supplied by the user; `undefined` is always accepted.
+ * @param allowed - The permitted values.
+ * @param flag - Flag or argument name used in the error message.
+ */
+function assertEnum<T extends string>(value: T | undefined, allowed: T[], flag: string): void {
+  // Nothing supplied, or a recognised choice: nothing to do.
+  if (value === undefined) return;
+  if (allowed.includes(value)) return;
+
+  log.error(`${flag} must be one of: ${allowed.join(', ')}`);
+  process.exit(1);
+}
+
+// ── Command Registration ───────────────────────────────────
+
+/**
+ * Register the `verify` command tree on the CLI program.
+ *
+ * Sub-commands are grouped as `criterion` (list/create/delete), `rubric`
+ * (list/create/view/update/delete/criteria/set-criteria) and `plan`
+ * (generate/state/confirm/skip), plus the top-level `run`, `results` and
+ * `decision` commands. Each action obtains a client from `getTrpcClient()` and
+ * calls the matching `client.verify.*` procedure.
+ *
+ * Enum and integer flags are validated locally; invalid input logs an error
+ * and exits with code 1.
+ *
+ * @param program - Commander program to attach the `verify` command to.
+ */
+export function registerVerifyCommand(program: Command) {
+  const verify = program
+    .command('verify')
+    .description('Manage the Agent Run delivery checker (criteria, rubrics, plans, results)');
+
+  // ════════════ criteria ════════════
+  const criterion = verify.command('criterion').description('Reusable pass/fail standards');
+
+  criterion
+    .command('list')
+    .description('List criteria')
+    .option('--json [fields]', 'Output JSON, optionally specify fields (comma-separated)')
+    .action(async (options: { json?: boolean | string }) => {
+      const client = await getTrpcClient();
+      const items = await client.verify.listCriteria.query();
+
+      if (options.json !== undefined) {
+        outputJson(items, typeof options.json === 'string' ? options.json : undefined);
+        return;
+      }
+      if (items.length === 0) return void console.log('No criteria found.');
+      printTable(
+        items.map((c) => [
+          c.id,
+          truncate(c.title, 60),
+          c.verifierType,
+          c.required ? 'gate' : 'soft',
+          c.onFail,
+          c.updatedAt ? timeAgo(c.updatedAt) : '',
+        ]),
+        ['ID', 'TITLE', 'TYPE', 'BLOCK', 'ON-FAIL', 'UPDATED'],
+      );
+    });
+
+  criterion
+    .command('create')
+    .description('Create a criterion')
+    .requiredOption('-t, --title <title>', 'Criterion title')
+    .requiredOption('--type <type>', `Verifier type (${VERIFIER_TYPES.join('|')})`)
+    .option('--on-fail <strategy>', `Action on failure (${ON_FAIL.join('|')})`)
+    .option('--soft', 'Non-blocking (required=false); defaults to blocking')
+    .option('--config <json>', 'Verifier config as JSON')
+    .option('--doc <id>', 'Linked guidance document id')
+    .action(
+      async (options: {
+        config?: string;
+        doc?: string;
+        onFail?: OnFail;
+        soft?: boolean;
+        title: string;
+        type: VerifierType;
+      }) => {
+        assertEnum(options.type, VERIFIER_TYPES, '--type');
+        assertEnum(options.onFail, ON_FAIL, '--on-fail');
+        const client = await getTrpcClient();
+        const result = await client.verify.createCriterion.mutate({
+          documentId: options.doc,
+          onFail: options.onFail,
+          // Only `--soft` sends an explicit value; otherwise the field is omitted.
+          required: options.soft ? false : undefined,
+          title: options.title,
+          verifierConfig: parseConfig(options.config),
+          verifierType: options.type,
+        });
+        console.log(`${pc.green('✓')} Created criterion ${pc.bold((result as any).id)}`);
+      },
+    );
+
+  criterion
+    .command('delete <id>')
+    .description('Delete a criterion')
+    .option('--yes', 'Skip confirmation')
+    .action(async (id: string, options: { yes?: boolean }) => {
+      if (!options.yes && !(await confirm(`Delete criterion ${id}?`)))
+        return void console.log('Cancelled.');
+      const client = await getTrpcClient();
+      await client.verify.deleteCriterion.mutate({ id });
+      console.log(`${pc.green('✓')} Deleted criterion ${pc.bold(id)}`);
+    });
+
+  // ════════════ rubrics ════════════
+  const rubric = verify.command('rubric').description('Named groups of criteria');
+
+  rubric
+    .command('list')
+    .description('List rubrics')
+    .option('--json [fields]', 'Output JSON, optionally specify fields (comma-separated)')
+    .action(async (options: { json?: boolean | string }) => {
+      const client = await getTrpcClient();
+      const items = await client.verify.listRubrics.query();
+      if (options.json !== undefined) {
+        outputJson(items, typeof options.json === 'string' ? options.json : undefined);
+        return;
+      }
+      if (items.length === 0) return void console.log('No rubrics found.');
+      printTable(
+        items.map((r) => [
+          r.id,
+          truncate(r.title, 60),
+          truncate(r.description || '', 60),
+          r.updatedAt ? timeAgo(r.updatedAt) : '',
+        ]),
+        ['ID', 'TITLE', 'DESCRIPTION', 'UPDATED'],
+      );
+    });
+
+  rubric
+    .command('create')
+    .description('Create a rubric')
+    .requiredOption('-t, --title <title>', 'Rubric title')
+    .option('-d, --description <text>', 'Rubric description')
+    .option('--max-repair-rounds <n>', 'Cap on automatic repair rounds (0-5)')
+    .action(async (options: { description?: string; maxRepairRounds?: string; title: string }) => {
+      // Validate up front so a bad value fails with a clear message instead of
+      // reaching the server as `NaN` or an out-of-range number.
+      const config =
+        options.maxRepairRounds !== undefined
+          ? {
+              maxRepairRounds: parseIntegerOption(
+                options.maxRepairRounds,
+                '--max-repair-rounds',
+                0,
+                5,
+              ),
+            }
+          : undefined;
+      const client = await getTrpcClient();
+      const result = await client.verify.createRubric.mutate({
+        config,
+        description: options.description,
+        title: options.title,
+      });
+      console.log(`${pc.green('✓')} Created rubric ${pc.bold((result as any).id)}`);
+    });
+
+  rubric
+    .command('view <id>')
+    .description('Show a rubric and its run-policy config')
+    .option('--json [fields]', 'Output JSON')
+    .action(async (id: string, options: { json?: boolean | string }) => {
+      const client = await getTrpcClient();
+      const item = await client.verify.getRubric.query({ id });
+      if (!item) {
+        // A missing rubric is a failure: exit non-zero like the other error
+        // paths in this file so scripts can detect it.
+        log.error('Rubric not found.');
+        process.exit(1);
+      }
+      if (options.json !== undefined) {
+        outputJson(item, typeof options.json === 'string' ? options.json : undefined);
+        return;
+      }
+      console.log(`${pc.bold('ID')}            ${item.id}`);
+      console.log(`${pc.bold('Title')}         ${item.title}`);
+      if (item.description) console.log(`${pc.bold('Description')}   ${item.description}`);
+      const maxRepairRounds = (item.config as { maxRepairRounds?: number } | null)?.maxRepairRounds;
+      console.log(`${pc.bold('Repair rounds')} ${maxRepairRounds ?? pc.dim('default')}`);
+    });
+
+  rubric
+    .command('update <id>')
+    .description('Update a rubric (title / description / run-policy config)')
+    .option('-t, --title <title>', 'New title')
+    .option('-d, --description <text>', 'New description')
+    .option('--max-repair-rounds <n>', 'Cap on automatic repair rounds (0-5)')
+    .action(
+      async (
+        id: string,
+        options: { description?: string; maxRepairRounds?: string; title?: string },
+      ) => {
+        // Only the fields whose flags were passed are included in the patch.
+        const value: {
+          config?: { maxRepairRounds?: number };
+          description?: string;
+          title?: string;
+        } = {};
+        if (options.title !== undefined) value.title = options.title;
+        if (options.description !== undefined) value.description = options.description;
+        if (options.maxRepairRounds !== undefined)
+          value.config = {
+            maxRepairRounds: parseIntegerOption(
+              options.maxRepairRounds,
+              '--max-repair-rounds',
+              0,
+              5,
+            ),
+          };
+        const client = await getTrpcClient();
+        await client.verify.updateRubric.mutate({ id, value });
+        console.log(`${pc.green('✓')} Updated rubric ${pc.bold(id)}`);
+      },
+    );
+
+  rubric
+    .command('delete <id>')
+    .description('Delete a rubric')
+    .option('--yes', 'Skip confirmation')
+    .action(async (id: string, options: { yes?: boolean }) => {
+      if (!options.yes && !(await confirm(`Delete rubric ${id}?`)))
+        return void console.log('Cancelled.');
+      const client = await getTrpcClient();
+      await client.verify.deleteRubric.mutate({ id });
+      console.log(`${pc.green('✓')} Deleted rubric ${pc.bold(id)}`);
+    });
+
+  rubric
+    .command('criteria <rubricId>')
+    .description('List criteria in a rubric')
+    .option('--json [fields]', 'Output JSON')
+    .action(async (rubricId: string, options: { json?: boolean | string }) => {
+      const client = await getTrpcClient();
+      const items = await client.verify.getRubricCriteria.query({ rubricId });
+      if (options.json !== undefined) {
+        outputJson(items, typeof options.json === 'string' ? options.json : undefined);
+        return;
+      }
+      if (items.length === 0) return void console.log('No criteria in this rubric.');
+      printTable(
+        items.map((c: any) => [
+          c.id,
+          truncate(c.title, 60),
+          c.verifierType,
+          c.required ? 'gate' : 'soft',
+        ]),
+        ['ID', 'TITLE', 'TYPE', 'BLOCK'],
+      );
+    });
+
+  rubric
+    .command('set-criteria <rubricId> <criterionIds...>')
+    .description('Set the criteria a rubric aggregates (order preserved)')
+    .action(async (rubricId: string, criterionIds: string[]) => {
+      const client = await getTrpcClient();
+      await client.verify.setRubricCriteria.mutate({
+        // The argument position becomes the sort order.
+        criteria: criterionIds.map((criterionId, i) => ({ criterionId, sortOrder: i })),
+        rubricId,
+      });
+      console.log(
+        `${pc.green('✓')} Rubric ${pc.bold(rubricId)} now has ${criterionIds.length} criterion(s)`,
+      );
+    });
+
+  // ════════════ per-run plan ════════════
+  const plan = verify.command('plan').description('Per-run check plan lifecycle');
+
+  plan
+    .command('generate <operationId>')
+    .description('Generate a draft check plan for a run')
+    .requiredOption('--goal <goal>', "The run's task/instruction the plan must satisfy")
+    .option('--rubric <id>', 'Mounted rubric id')
+    .option('--criteria <ids>', 'Ad-hoc criterion ids (comma-separated)')
+    .option('--ai', 'Let the LLM propose additional criteria')
+    .option('--max-ai <n>', 'Max AI-proposed criteria')
+    .option('--model <model>', 'Model (required with --ai)')
+    .option('--provider <provider>', 'Provider (required with --ai)')
+    .option('--context <text>', 'Extra context for the AI prompt')
+    .option('--json [fields]', 'Output JSON')
+    .action(
+      async (
+        operationId: string,
+        options: {
+          ai?: boolean;
+          context?: string;
+          criteria?: string;
+          goal: string;
+          json?: boolean | string;
+          maxAi?: string;
+          model?: string;
+          provider?: string;
+          rubric?: string;
+        },
+      ) => {
+        if (options.ai && (!options.model || !options.provider)) {
+          log.error('--ai requires --model and --provider');
+          process.exit(1);
+        }
+        // Reject non-numeric / negative counts here rather than sending `NaN`.
+        const maxAiCriteria =
+          options.maxAi !== undefined
+            ? parseIntegerOption(options.maxAi, '--max-ai', 0)
+            : undefined;
+        const client = await getTrpcClient();
+        const items = await client.verify.generateDraftPlan.mutate({
+          context: options.context,
+          enableAiGeneration: options.ai,
+          goal: options.goal,
+          maxAiCriteria,
+          modelConfig:
+            options.model && options.provider
+              ? { model: options.model, provider: options.provider }
+              : undefined,
+          operationId,
+          // Split the comma-separated list, dropping blanks such as a trailing comma.
+          verifyCriteriaIds: options.criteria
+            ?.split(',')
+            .map((s) => s.trim())
+            .filter(Boolean),
+          verifyRubricId: options.rubric ?? null,
+        });
+        if (options.json !== undefined) {
+          outputJson(items, typeof options.json === 'string' ? options.json : undefined);
+          return;
+        }
+        console.log(`${pc.green('✓')} Draft plan: ${pc.bold(String(items.length))} item(s)`);
+        printTable(
+          items.map((i: any) => [
+            String(i.index),
+            truncate(i.title, 60),
+            i.verifierType,
+            i.required ? 'gate' : 'soft',
+          ]),
+          ['#', 'TITLE', 'TYPE', 'BLOCK'],
+        );
+      },
+    );
+
+  plan
+    .command('state <operationId>')
+    .description('Show the verify state (status + frozen plan) of a run')
+    .option('--json [fields]', 'Output JSON')
+    .action(async (operationId: string, options: { json?: boolean | string }) => {
+      const client = await getTrpcClient();
+      const state = await client.verify.getVerifyState.query({ operationId });
+      if (options.json !== undefined) {
+        outputJson(state, typeof options.json === 'string' ? options.json : undefined);
+        return;
+      }
+      if (!state) return void console.log('No verify state for this run.');
+      console.log(`${pc.bold('status')}: ${state.verifyStatus ?? pc.dim('(none)')}`);
+      console.log(
+        `${pc.bold('confirmed')}: ${state.verifyPlanConfirmedAt ? timeAgo(state.verifyPlanConfirmedAt) : pc.dim('no')}`,
+      );
+      const items = (state.verifyPlan ?? []) as any[];
+      console.log(`${pc.bold('plan')}: ${items.length} item(s)`);
+      if (items.length > 0)
+        printTable(
+          items.map((i) => [
+            String(i.index),
+            truncate(i.title, 60),
+            i.verifierType,
+            i.required ? 'gate' : 'soft',
+          ]),
+          ['#', 'TITLE', 'TYPE', 'BLOCK'],
+        );
+    });
+
+  plan
+    .command('confirm <operationId>')
+    .description('Freeze (confirm) the draft plan')
+    .action(async (operationId: string) => {
+      const client = await getTrpcClient();
+      await client.verify.confirmPlan.mutate({ operationId });
+      console.log(`${pc.green('✓')} Confirmed plan for run ${pc.bold(operationId)}`);
+    });
+
+  plan
+    .command('skip <operationId>')
+    .description('Skip verification for a run')
+    .action(async (operationId: string) => {
+      const client = await getTrpcClient();
+      await client.verify.skipPlan.mutate({ operationId });
+      console.log(`${pc.green('✓')} Skipped verification for run ${pc.bold(operationId)}`);
+    });
+
+  // ════════════ run / results ════════════
+  verify
+    .command('run <operationId>')
+    .description('Execute the confirmed plan against a deliverable (LLM judge)')
+    .requiredOption('--goal <goal>', "The run's task")
+    .requiredOption('--deliverable <text>', 'The output to judge')
+    .requiredOption('--model <model>', 'Judge model')
+    .requiredOption('--provider <provider>', 'Judge provider')
+    .option('--no-batch', 'Judge each item separately instead of one batched call')
+    .option('--json [fields]', 'Output JSON')
+    .action(
+      async (
+        operationId: string,
+        options: {
+          batch?: boolean;
+          deliverable: string;
+          goal: string;
+          json?: boolean | string;
+          model: string;
+          provider: string;
+        },
+      ) => {
+        const client = await getTrpcClient();
+        const results = await client.verify.executeVerify.mutate({
+          batchLlm: options.batch,
+          deliverable: options.deliverable,
+          goal: options.goal,
+          modelConfig: { model: options.model, provider: options.provider },
+          operationId,
+        });
+        if (options.json !== undefined) {
+          outputJson(results, typeof options.json === 'string' ? options.json : undefined);
+          return;
+        }
+        printResults(results);
+      },
+    );
+
+  verify
+    .command('results <operationId>')
+    .description('List check results for a run')
+    .option('--json [fields]', 'Output JSON')
+    .action(async (operationId: string, options: { json?: boolean | string }) => {
+      const client = await getTrpcClient();
+      const results = await client.verify.listResults.query({ operationId });
+      if (options.json !== undefined) {
+        outputJson(results, typeof options.json === 'string' ? options.json : undefined);
+        return;
+      }
+      if (results.length === 0) return void console.log('No results yet.');
+      printResults(results);
+    });
+
+  // ════════════ feedback ════════════
+  verify
+    .command('decision <resultId> <decision>')
+    .description(`Record human feedback on a result (${DECISIONS.join('|')})`)
+    .action(async (resultId: string, decision: Decision) => {
+      assertEnum(decision, DECISIONS, 'decision');
+      const client = await getTrpcClient();
+      await client.verify.submitDecision.mutate({ decision, resultId });
+      console.log(`${pc.green('✓')} Recorded ${pc.bold(decision)} on result ${pc.bold(resultId)}`);
+    });
+}
+
+/**
+ * Parse an integer-valued CLI flag and enforce an inclusive range.
+ *
+ * Logs an error and exits with code 1 when the value is blank, not an integer,
+ * or outside `[min, max]`.
+ *
+ * @param raw - Raw flag value as received from commander.
+ * @param flag - Flag name used in the error message.
+ * @param min - Smallest accepted value.
+ * @param max - Largest accepted value; omit for no upper bound.
+ * @returns The parsed integer.
+ */
+function parseIntegerOption(raw: string, flag: string, min: number, max?: number): number {
+  // `Number('')` is 0, so treat a blank value as invalid explicitly.
+  const value = raw.trim() === '' ? Number.NaN : Number(raw);
+  const inRange = Number.isInteger(value) && value >= min && (max === undefined || value <= max);
+  if (!inRange) {
+    const range = max === undefined ? `>= ${min}` : `between ${min} and ${max}`;
+    log.error(`${flag} must be an integer ${range}`);
+    process.exit(1);
+  }
+  return value;
+}
+
+/**
+ * Render check results as a table: one row per result showing the check
+ * label, coloured status, verdict, confidence, gate/soft flag and suggestion.
+ *
+ * @param results - Result records as returned by the verify API.
+ */
+function printResults(results: any[]): void {
+  const header = ['CHECK', 'STATUS', 'VERDICT', 'CONF', 'BLOCK', 'SUGGESTION'];
+
+  const rows = results.map((result) => {
+    // Fall back to the item id when the result carries no title.
+    const check = truncate(result.checkItemTitle || result.checkItemId, 50);
+    const status = statusColor(result.status);
+    const verdict = result.verdict ?? '';
+    // `!= null` keeps a confidence of 0 visible while hiding null/undefined.
+    const confidence = result.confidence != null ? String(result.confidence) : '';
+    const block = result.required ? 'gate' : 'soft';
+    const suggestion = truncate(result.suggestion || '', 40);
+    return [check, status, verdict, confidence, block, suggestion];
+  });
+
+  printTable(rows, header);
+}
+
+/**
+ * Colour a result status for terminal output: green for `passed`, red for
+ * `failed`, yellow for `running`, dimmed for anything else.
+ *
+ * @param status - Status string to colour.
+ * @returns The same text wrapped by the matching picocolors formatter.
+ */
+function statusColor(status: string): string {
+  switch (status) {
+    case 'passed': {
+      return pc.green(status);
+    }
+    case 'failed': {
+      return pc.red(status);
+    }
+    case 'running': {
+      return pc.yellow(status);
+    }
+    default: {
+      return pc.dim(status);
+    }
+  }
+}
@@ -0,0 +1,145 @@
+import { EventEmitter } from 'node:events';
+
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+import { spawnHeteroAgentRun } from './agentRun';
+
+const { spawnMock } = vi.hoisted(() => ({ spawnMock: vi.fn() })); // created through vi.hoisted so the vi.mock factory below can reference it
+
+vi.mock('node:child_process', () => ({ spawn: spawnMock })); // the mocked module exposes only `spawn`, backed by spawnMock
+
+const makeFakeChild = () => { // Builds the object spawnMock returns: an EventEmitter plus a mocked stdin.
+  const child = new EventEmitter() as EventEmitter & {
+    stdin: { end: ReturnType<typeof vi.fn>; write: ReturnType<typeof vi.fn> };
+  };
+  child.stdin = { end: vi.fn(), write: vi.fn() }; // spies so tests can assert on the payload written and on stdin being closed
+  return child;
+};
+
+const baseParams = { // Baseline arguments; each test spreads this object and overrides the fields it cares about.
+  agentType: 'claudeCode',
+  jwt: 'jwt',
+  operationId: 'op',
+  prompt: 'hi',
+  serverUrl: 'https://app.lobehub.com',
+  topicId: 'tpc',
+};
+
+describe('spawnHeteroAgentRun', () => { // Every test drives a fake child returned by the mocked spawn().
+  afterEach(() => {
+    spawnMock.mockReset(); // clear recorded calls and the configured return value between tests
+  });
+
+  it('spawns `lh hetero exec` in server-ingest mode via the current CLI entry', async () => {
+    const child = makeFakeChild();
+    spawnMock.mockReturnValue(child);
+
+    const ackPromise = spawnHeteroAgentRun({
+      ...baseParams,
+      cwd: '/work/dir',
+      jwt: 'jwt-token',
+      operationId: 'op-1',
+      topicId: 'tpc-1',
+    });
+
+    expect(spawnMock).toHaveBeenCalledTimes(1);
+    const [bin, args, opts] = spawnMock.mock.calls[0]; // arguments of that single spawn() call
+
+    expect(bin).toBe(process.execPath);
+    expect(args).toEqual([
+      ...process.execArgv,
+      process.argv[1],
+      'hetero',
+      'exec',
+      '--type',
+      'claudeCode',
+      '--operation-id',
+      'op-1',
+      '--topic',
+      'tpc-1',
+      '--render',
+      'none',
+      '--input-json',
+      '-',
+      '--cwd',
+      '/work/dir',
+    ]);
+    expect(opts).toMatchObject({
+      cwd: '/work/dir',
+      env: expect.objectContaining({
+        LOBEHUB_JWT: 'jwt-token',
+        LOBEHUB_SERVER: 'https://app.lobehub.com',
+      }),
+    });
+
+    // stdin is only written after the child actually spawns.
+    expect(child.stdin.write).not.toHaveBeenCalled();
+    child.emit('spawn'); // simulate the process having started
+
+    await expect(ackPromise).resolves.toEqual({ status: 'accepted' });
+    expect(child.stdin.write).toHaveBeenCalledWith(JSON.stringify('hi')); // a plain prompt is sent as a JSON string
+    expect(child.stdin.end).toHaveBeenCalledTimes(1);
+  });
+
+  it('rejects (no stuck run) when the child errors before spawning, e.g. bad cwd', async () => {
+    const child = makeFakeChild();
+    spawnMock.mockReturnValue(child);
+
+    const ackPromise = spawnHeteroAgentRun({ ...baseParams, cwd: '/missing' });
+    child.emit('error', new Error('spawn ENOENT')); // 'error' with no preceding 'spawn' event
+
+    await expect(ackPromise).resolves.toEqual({ reason: 'spawn ENOENT', status: 'rejected' }); // the promise resolves; the rejection is carried in `status`
+    expect(child.stdin.write).not.toHaveBeenCalled();
+  });
+
+  it('appends --resume when resuming a session', () => {
+    const child = makeFakeChild();
+    spawnMock.mockReturnValue(child);
+
+    void spawnHeteroAgentRun({ ...baseParams, resumeSessionId: 'sess-9' }); // ack not awaited: spawn() is called synchronously, so argv is already recorded
+
+    const [, args] = spawnMock.mock.calls[0];
+    expect(args).toContain('--resume');
+    expect(args).toContain('sess-9');
+  });
+
+  it('sends a content-block array to stdin when systemContext is provided', async () => {
+    const child = makeFakeChild();
+    spawnMock.mockReturnValue(child);
+
+    const ackPromise = spawnHeteroAgentRun({
+      ...baseParams,
+      prompt: 'do it',
+      systemContext: 'workspace rules',
+    });
+    child.emit('spawn');
+    await ackPromise;
+
+    expect(child.stdin.write).toHaveBeenCalledWith(
+      JSON.stringify([
+        { text: 'workspace rules', type: 'text' }, // context block comes first
+        { text: 'do it', type: 'text' },
+      ]),
+    );
+  });
+
+  it('appends image blocks to stdin when imageList is provided', async () => {
+    const child = makeFakeChild();
+    spawnMock.mockReturnValue(child);
+
+    const ackPromise = spawnHeteroAgentRun({
+      ...baseParams,
+      imageList: [{ id: 'file-1', url: 'https://signed/a.png' }],
+      prompt: 'look at this',
+    });
+    child.emit('spawn');
+    await ackPromise;
+
+    expect(child.stdin.write).toHaveBeenCalledWith(
+      JSON.stringify([
+        { text: 'look at this', type: 'text' },
+        { source: { id: 'file-1', type: 'url', url: 'https://signed/a.png' }, type: 'image' }, // image block follows the prompt block
+      ]),
+    );
+  });
+});
@@ -0,0 +1,134 @@
+import { spawn } from 'node:child_process';
+
+import {
+  buildHeteroExecStdinPayload,
+  type HeteroExecImageRef,
+} from '@lobechat/heterogeneous-agents/protocol';
+
+export interface SpawnHeteroAgentRunParams { // Input to spawnHeteroAgentRun.
+  agentType: string; // forwarded as `--type`
+  cwd?: string; // child working directory and `--cwd`; defaults to process.cwd()
+  /** Image attachments (signed URLs) appended as image content blocks. */
+  imageList?: HeteroExecImageRef[];
+  jwt: string; // exported to the child as LOBEHUB_JWT
+  operationId: string; // forwarded as `--operation-id`
+  prompt: string; // part of the JSON payload written to the child's stdin
+  resumeSessionId?: string; // when set, adds `--resume <id>`
+  serverUrl: string; // exported to the child as LOBEHUB_SERVER
+  systemContext?: string; // part of the JSON payload written to the child's stdin
+  topicId: string; // forwarded as `--topic`
+}
+
+export interface AgentRunAckResult { // Value the spawnHeteroAgentRun promise resolves to.
+  reason?: string; // the spawn error message when status is 'rejected'
+  status: 'accepted' | 'rejected';
+}
+
+interface SpawnHeteroAgentRunLogger { // Optional log sinks; either method may be omitted.
+  error?: (msg: string) => void; // receives spawn failures and stdin write failures
+  info?: (msg: string) => void; // receives the child exit notice
+}
+
+/**
+ * Spawn `lh hetero exec` for a gateway-dispatched agent run. Mirrors the
+ * desktop app's `spawnLhHeteroExec`: the spawned CLI owns the full pipeline
+ * (spawn -> adapt -> BatchIngester -> server ingest), so the connect daemon
+ * needs no local stream handling — it only kicks off the process.
+ *
+ * Re-invokes the current CLI entry (`process.execPath` + `process.argv[1]`)
+ * instead of relying on `lh` being on `PATH`, so it also works inside the
+ * detached `lh connect --daemon` child where `PATH` may be minimal.
+ *
+ * Resolves only once the child's outcome is known: `accepted` on the `spawn`
+ * event, `rejected` on an early `error`. `spawn()` reports failures (missing or
+ * inaccessible `cwd`, etc.) asynchronously via `error`, so acking eagerly would
+ * report a false success and leave the run with no process to emit
+ * `heteroFinish` — surfacing as a stuck assistant message. A rejected ack
+ * instead flows back as a dispatch failure the user can see.
+ *
+ * @param params - Run description: CLI flags, child environment and the stdin
+ * payload are all derived from it (see `SpawnHeteroAgentRunParams`).
+ * @param logger - Optional sink for spawn/stdin failures (`error`) and the
+ * child's exit notice (`info`).
+ * @returns A promise settled by the first of the child's `spawn` / `error`
+ * events. A failure reported through `error` resolves the promise with
+ * `{ status: 'rejected', reason }` rather than rejecting it.
+ */
+export function spawnHeteroAgentRun(
+  params: SpawnHeteroAgentRunParams,
+  logger?: SpawnHeteroAgentRunLogger,
+): Promise<AgentRunAckResult> {
+  const {
+    agentType,
+    cwd,
+    imageList,
+    jwt,
+    operationId,
+    prompt,
+    resumeSessionId,
+    serverUrl,
+    systemContext,
+    topicId,
+  } = params;
+  const workDir = cwd ?? process.cwd(); // fall back to the current process's working directory
+
+  // Server-ingest mode (--topic + --operation-id): events are batch-POSTed to
+  // the server, not rendered. `--input-json -` reads the prompt from stdin.
+  const cliArgs = [
+    process.argv[1],
+    'hetero',
+    'exec',
+    '--type',
+    agentType,
+    '--operation-id',
+    operationId,
+    '--topic',
+    topicId,
+    '--render',
+    'none',
+    '--input-json',
+    '-',
+    '--cwd',
+    workDir,
+    ...(resumeSessionId ? ['--resume', resumeSessionId] : []), // flag pair is added only when a session id is given
+  ];
+
+  // systemContext / image attachments turn the payload into a content-block
+  // array: context block first, then the user's prompt, then images — mirrors
+  // the desktop path. `lh hetero exec` coerces both shapes via
+  // coerceJsonPrompt.
+  const stdinPayload = buildHeteroExecStdinPayload({ imageList, prompt, systemContext });
+
+  return new Promise<AgentRunAckResult>((resolve) => {
+    let settled = false;
+    const settle = (result: AgentRunAckResult) => { // first outcome wins; later calls are ignored
+      if (settled) return;
+      settled = true;
+      resolve(result);
+    };
+
+    const child = spawn(process.execPath, [...process.execArgv, ...cliArgs], { // parent's runtime flags precede the script path
+      cwd: workDir,
+      env: {
+        ...process.env,
+        LOBEHUB_JWT: jwt,
+        LOBEHUB_SERVER: serverUrl,
+      },
+      stdio: ['pipe', 'inherit', 'inherit'], // stdin is piped for the payload; stdout/stderr are inherited from this process
+    });
+
+    child.once('spawn', () => {
+      // Only safe to write stdin once the process actually started.
+      try { // NOTE(review): this catches synchronous throws only — confirm whether an 'error' listener on child.stdin is also needed
+        child.stdin?.write(stdinPayload);
+        child.stdin?.end();
+      } catch (err) {
+        logger?.error?.(
+          `hetero exec stdin write failed (op=${operationId}): ${(err as Error).message}`,
+        );
+      }
+      settle({ status: 'accepted' }); // NOTE(review): still acked as accepted when the write above threw — confirm this is intended
+    });
+
+    child.once('error', (err) => {
+      logger?.error?.(`hetero exec spawn failed (op=${operationId}): ${err.message}`);
+      settle({ reason: err.message, status: 'rejected' }); // no effect if 'spawn' already settled the ack
+    });
+
+    child.on('exit', (code, signal) => { // logging only; the ack is never settled from here
+      logger?.info?.(`hetero exec exited (op=${operationId}) code=${code} signal=${signal}`);
+    });
+  });
+}
@@ -34,6 +34,7 @@ import { registerTaskCommand } from './commands/task';
 import { registerThreadCommand } from './commands/thread';
 import { registerTopicCommand } from './commands/topic';
 import { registerUserCommand } from './commands/user';
+import { registerVerifyCommand } from './commands/verify';

 const require = createRequire(import.meta.url);
 const { version } = require('../package.json');
@@ -75,6 +76,7 @@ export function createProgram() {
  registerProviderCommand(program);
  registerPluginCommand(program);
  registerUserCommand(program);
+  registerVerifyCommand(program);
  registerConfigCommand(program);
  registerEvalCommand(program);
  registerMigrateCommand(program);
@@ -296,7 +296,11 @@ export async function streamAgentEventsViaWebSocket(
        console.log(JSON.stringify(jsonEvents, null, 2));
      }
      isSettled = true;
-      reject(new Error(`Agent gateway WebSocket closed before completion: ${String(event)}`));
+      // Surface the close code + reason — `String(event)` is just "[object CloseEvent]".
+      const reason = event.reason ? `: ${event.reason}` : '';
+      reject(
+        new Error(`Agent gateway WebSocket closed before completion (code ${event.code}${reason})`),
+      );
    };
  });
 }
--- a/Show More
+++ b/Show More