💄 style: add preserve thinking feature for Qwen3.7 Max model (#13494)

Co-authored-by: Copilot <copilot@github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: YuTengjing <ytj2713151713@gmail.com>
2026-06-13 19:20:04 +00:00 · 2026-06-09 17:21:39 +08:00
parent 434532ce36
commit 64d3bdb978
44 changed files with 1001 additions and 1492 deletions
@@ -149,6 +149,8 @@
  "extendParams.enableReasoning.title": "Enable Deep Thinking",
  "extendParams.imageAspectRatio.title": "Image Aspect Ratio",
  "extendParams.imageResolution.title": "Image Resolution",
+  "extendParams.preserveThinking.desc": "When enabled, the assistant's reasoning from earlier turns is sent back to compatible models as context. This may increase token usage.",
+  "extendParams.preserveThinking.title": "Preserve Historical Thinking",
  "extendParams.reasoningBudgetToken.title": "Thinking Consumption Token",
  "extendParams.reasoningEffort.title": "Reasoning Intensity",
  "extendParams.textVerbosity.title": "Output Text Detail Level",
@@ -234,6 +234,7 @@
  "providerModels.item.modelConfig.extendParams.options.imageResolution.hint": "For Gemini 3 image generation models; controls resolution of generated images.",
  "providerModels.item.modelConfig.extendParams.options.imageResolution2.hint": "For Gemini 3.1 Flash Image models; controls resolution of generated images (supports 512px).",
  "providerModels.item.modelConfig.extendParams.options.opus47Effort.hint": "For Claude Opus 4.7 and later; controls effort level (low/medium/high/xhigh/max).",
+  "providerModels.item.modelConfig.extendParams.options.preserveThinking.hint": "For Qwen3.7 Max, Qwen3.7 Plus, Qwen3.6 Plus, GLM-5 and GLM-4.7; sends historical assistant reasoning back to model context (preserve_thinking).",
  "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken.hint": "For Claude, Qwen3 and similar; controls token budget for reasoning.",
  "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken32k.hint": "For GLM-5 and GLM-4.7; controls token budget for reasoning (max 32k).",
  "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken80k.hint": "For Qwen3 series; controls token budget for reasoning (max 80k).",
@@ -149,6 +149,8 @@
  "extendParams.enableReasoning.title": "开启深度思考",
  "extendParams.imageAspectRatio.title": "图片宽高比",
  "extendParams.imageResolution.title": "图片分辨率",
+  "extendParams.preserveThinking.desc": "开启后会将历史助手思考过程作为上下文回传给模型，可能增加 Token 消耗。",
+  "extendParams.preserveThinking.title": "传递历史思考过程",
  "extendParams.reasoningBudgetToken.title": "思考 Token 预算",
  "extendParams.reasoningEffort.title": "推理强度",
  "extendParams.textVerbosity.title": "输出详细程度",
@@ -234,6 +234,7 @@
  "providerModels.item.modelConfig.extendParams.options.imageResolution.hint": "适用于 Gemini 3 图像生成模型；控制生成图像的分辨率。",
  "providerModels.item.modelConfig.extendParams.options.imageResolution2.hint": "适用于 Gemini 3.1 Flash Image 模型；控制生成图像的分辨率（支持 512px）。",
  "providerModels.item.modelConfig.extendParams.options.opus47Effort.hint": "适用于 Claude Opus 4.7 及更高版本；控制努力级别（低/中/高/超高/最大）。",
+  "providerModels.item.modelConfig.extendParams.options.preserveThinking.hint": "适用于 Qwen3.7 Max、Qwen3.7 Plus、Qwen3.6 Plus、GLM-5 与 GLM-4.7；将历史助手思考过程回传为模型上下文（preserve_thinking）。",
  "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken.hint": "适用于 Claude、Qwen3 等模型；控制用于推理的 Token 预算。",
  "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken32k.hint": "适用于GLM-5和GLM-4.7；控制推理的令牌预算（最大32k）。",
  "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken80k.hint": "适用于Qwen3系列；控制推理的令牌预算（最大80k）。",
@@ -685,14 +685,14 @@ describe('UsageCounter', () => {

      const result1 = UsageCounter.accumulateLLM({
        cost: state.cost,
-        model: 'gpt-4o-audio-preview',
+        model: 'gpt-audio',
        modelUsage: usage1,
        provider: 'openai',
        usage: state.usage,
      });
      const result2 = UsageCounter.accumulateLLM({
        cost: result1.cost,
-        model: 'gpt-4o-audio-preview',
+        model: 'gpt-audio',
        modelUsage: usage2,
        provider: 'openai',
        usage: result1.usage,
@@ -887,73 +887,6 @@ const aihubmixChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      search: true,
-      vision: true,
-    },
-    contextWindowTokens: 2_000_000,
-    description:
-      'We’re excited to release Grok 4 Fast, our latest progress in cost-effective reasoning models.',
-    displayName: 'Grok 4 Fast (Non-Reasoning)',
-    id: 'grok-4-fast-non-reasoning',
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 0.5, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-09',
-    settings: {
-      searchImpl: 'params',
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      search: true,
-      vision: true,
-    },
-    contextWindowTokens: 2_000_000,
-    description:
-      'We’re excited to release Grok 4 Fast, our latest progress in cost-effective reasoning models.',
-    displayName: 'Grok 4 Fast',
-    id: 'grok-4-fast-reasoning',
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 0.5, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-09',
-    settings: {
-      searchImpl: 'params',
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      vision: true,
-    },
-    contextWindowTokens: 256_000,
-    description:
-      'Latest Grok flagship with unmatched performance in language, math, and reasoning — a true all-rounder. Currently points to grok-4-0709; due to limited resources it is temporarily 10% higher than official pricing and is expected to return to official price later.',
-    displayName: 'Grok 4 0709',
-    id: 'grok-4',
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 3.3, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 16.5, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-07-09',
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -1384,24 +1317,6 @@ const aihubmixChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'DeepSeek V3.1 Fast is the high-TPS fast variant of DeepSeek V3.1. Hybrid thinking mode: via chat templates, one model supports both thinking and non-thinking. Smarter tool use: post-training boosts tool and agent task performance.',
-    displayName: 'DeepSeek V3.1 (Fast)',
-    id: 'DeepSeek-V3.1-Fast',
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 1.096, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 3.288, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -4,33 +4,43 @@ const cerebrasModels: AIChatModelCard[] = [
  {
    abilities: {
      functionCall: true,
+      reasoning: true,
    },
    contextWindowTokens: 131_072,
-    displayName: 'Qwen 3 235B Instruct',
-    id: 'qwen-3-235b-a22b-instruct-2507',
+    displayName: 'GPT OSS 120B',
+    enabled: true,
+    id: 'gpt-oss-120b',
+    maxOutput: 40_960,
    pricing: {
      units: [
-        { name: 'textInput', rate: 0.6, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 1.2, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput', rate: 0.35, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textOutput', rate: 0.75, strategy: 'fixed', unit: 'millionTokens' },
      ],
    },
+    settings: {
+      extendParams: ['reasoningEffort'],
+    },
    type: 'chat',
  },
  {
    abilities: {
      functionCall: true,
+      reasoning: true,
    },
-    contextWindowTokens: 32_768,
+    contextWindowTokens: 131_072,
    description:
-      'Llama 3.1 8B: a small, low-latency Llama variant for lightweight online inference and chat.',
-    displayName: 'Llama 3.1 8B',
-    id: 'llama3.1-8b',
+      "GLM-4.7 is Zhipu's new generation flagship model with 355B total parameters and 32B active parameters, fully upgraded in general dialogue, reasoning, and agent capabilities. GLM-4.7 enhances Interleaved Thinking and introduces Preserved Thinking and Turn-level Thinking.",
+    displayName: 'GLM-4.7',
+    id: 'zai-glm-4.7',
+    maxOutput: 40_960,
    pricing: {
+      currency: 'USD',
      units: [
-        { name: 'textInput', rate: 0.1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 0.1, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput', rate: 2.25, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textOutput', rate: 2.75, strategy: 'fixed', unit: 'millionTokens' },
      ],
    },
+    releasedAt: '2025-12-22',
    type: 'chat',
  },
 ];
@@ -317,23 +317,6 @@ const githubCopilotChatModels: AIChatModelCard[] = [
    type: 'chat',
  },

-  // Grok Models
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      structuredOutput: true,
-    },
-    contextWindowTokens: 173_000,
-    description:
-      'We’re excited to launch grok-code-fast-1, a fast and cost-effective reasoning model that excels at agentic coding.',
-    displayName: 'Grok Code Fast 1',
-    enabled: true,
-    id: 'grok-code-fast-1',
-    releasedAt: '2025-08-27',
-    type: 'chat',
-  },
-
  // Raptor Models
  {
    abilities: {
@@ -59,114 +59,6 @@ const moonshotChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      structuredOutput: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'K2 long-thinking model with 256k context, supporting multi-step tool use and reasoning for complex problems.',
-    displayName: 'Kimi K2 Thinking',
-    id: 'kimi-k2-thinking',
-    maxOutput: 65_536,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-11-06',
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      structuredOutput: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'High-speed K2 long-thinking variant with 256k context, strong deep reasoning, and 60–100 tokens/sec output.',
-    displayName: 'Kimi K2 Thinking Turbo',
-    id: 'kimi-k2-thinking-turbo',
-    maxOutput: 65_536,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textInput', rate: 8, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 58, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-11-06',
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      structuredOutput: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'kimi-k2-0905-preview offers a 256k context window, stronger agentic coding, better front-end code quality, and improved context understanding.',
-    displayName: 'Kimi K2 0905 Preview',
-    id: 'kimi-k2-0905-preview',
-    maxOutput: 65_536,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-05',
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'kimi-k2 is an MoE foundation model with strong coding and agent capabilities (1T total params, 32B active), outperforming other mainstream open models across reasoning, programming, math, and agent benchmarks.',
-    displayName: 'Kimi K2 0711 Preview',
-    id: 'kimi-k2-0711-preview',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-07-11',
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'kimi-k2 is an MoE foundation model with strong coding and agent capabilities (1T total params, 32B active), outperforming other mainstream open models across reasoning, programming, math, and agent benchmarks.',
-    displayName: 'Kimi K2 Turbo Preview',
-    id: 'kimi-k2-turbo-preview',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textInput', rate: 8, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 58, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-05',
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -12,99 +12,6 @@ const nvidiaChatModels: AIChatModelCard[] = [
    maxOutput: 131_072,
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 204_800,
-    displayName: 'MiniMax-M2.5',
-    id: 'minimaxai/minimax-m2.5',
-    maxOutput: 131_072,
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'DeepSeek V3.2 is a next-gen reasoning model with stronger complex reasoning and chain-of-thought capabilities.',
-    displayName: 'DeepSeek V3.2',
-    enabled: true,
-    id: 'deepseek-ai/deepseek-v3.2',
-    maxOutput: 65_536,
-    settings: {
-      extendParams: ['enableReasoning'],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 200_000,
-    description:
-      'GLM-4.7 is Zhipu latest flagship model, enhanced for Agentic Coding scenarios with improved coding capabilities.',
-    displayName: 'GLM-4.7',
-    id: 'z-ai/glm4.7',
-    maxOutput: 131_072,
-    settings: {
-      extendParams: ['enableReasoning'],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 200_000,
-    description:
-      "GLM-5 is Zhipu AI's new flagship foundation model for agent engineering, achieving open-source SOTA performance in coding and agent capabilities. It matches Claude Opus 4.5 in performance.",
-    displayName: 'GLM-5',
-    id: 'z-ai/glm5',
-    maxOutput: 131_072,
-    settings: {
-      extendParams: ['enableReasoning'],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'Kimi K2.5 is the most intelligent Kimi model to date, featuring native multimodal architecture.',
-    displayName: 'Kimi K2.5',
-    enabled: true,
-    id: 'moonshotai/kimi-k2.5',
-    maxOutput: 65_536,
-    settings: {
-      extendParams: ['enableReasoning'],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'DeepSeek V3.1 is a next-gen reasoning model with stronger complex reasoning and chain-of-thought for deep analysis tasks.',
-    displayName: 'DeepSeek V3.1 Terminus',
-    id: 'deepseek-ai/deepseek-v3.1-terminus',
-    maxOutput: 16_384,
-    settings: {
-      extendParams: ['enableReasoning'],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -115,22 +22,6 @@ const nvidiaChatModels: AIChatModelCard[] = [
    id: 'meta/llama-3.3-70b-instruct',
    type: 'chat',
  },
-  {
-    contextWindowTokens: 128_000,
-    description:
-      'A cutting-edge small language model with strong understanding, reasoning, and text generation.',
-    displayName: 'Llama 3.2 1B Instruct',
-    id: 'meta/llama-3.2-1b-instruct',
-    type: 'chat',
-  },
-  {
-    contextWindowTokens: 128_000,
-    description:
-      'A cutting-edge small language model with strong understanding, reasoning, and text generation.',
-    displayName: 'Llama 3.2 3B Instruct',
-    id: 'meta/llama-3.2-3b-instruct',
-    type: 'chat',
-  },
  {
    abilities: {
      vision: true,
@@ -197,33 +88,6 @@ const nvidiaChatModels: AIChatModelCard[] = [
    id: 'google/gemma-2-2b-it',
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-    },
-    contextWindowTokens: 32_768,
-    description:
-      'A bilingual LLM for Chinese and English across language, coding, math, and reasoning.',
-    displayName: 'Qwen2.5 7B Instruct',
-    id: 'qwen/qwen2.5-7b-instruct',
-    type: 'chat',
-  },
-  {
-    contextWindowTokens: 32_768,
-    description:
-      'A strong mid-sized code model with 32K context, excelling at multilingual programming.',
-    displayName: 'Qwen2.5 Coder 7B Instruct',
-    id: 'qwen/qwen2.5-coder-7b-instruct',
-    type: 'chat',
-  },
-  {
-    contextWindowTokens: 32_768,
-    description:
-      'An advanced LLM for code generation, reasoning, and repair across mainstream programming languages.',
-    displayName: 'Qwen2.5 Coder 32B Instruct',
-    id: 'qwen/qwen2.5-coder-32b-instruct',
-    type: 'chat',
-  },
 ];

 export const allModels = [...nvidiaChatModels];
@@ -1174,54 +1174,6 @@ export const openaiChatModels: AIChatModelCard[] = [
    releasedAt: '2025-08-28',
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      //search: true,
-    },
-    contextWindowTokens: 128_000,
-    description: 'GPT-4o Audio Preview model with audio input and output.',
-    displayName: 'GPT-4o Audio Preview',
-    id: 'gpt-4o-audio-preview',
-    maxOutput: 16_384,
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 2.5, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 10, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2024-12-17',
-    /*
-    settings: {
-      searchImpl: 'params',
-    },
-    */
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      //search: true,
-    },
-    contextWindowTokens: 128_000,
-    description: 'GPT-4o mini Audio model with audio input and output.',
-    displayName: 'GPT-4o mini Audio',
-    id: 'gpt-4o-mini-audio-preview',
-    maxOutput: 16_384,
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 0.15, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 0.6, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2024-12-17',
-    /*
-    settings: {
-      searchImpl: 'params',
-    },
-    */
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -1585,76 +1537,6 @@ export const openaiImageModels: AIImageModelCard[] = [
    releasedAt: '2025-10-06',
    type: 'image',
  },
-  {
-    description:
-      'The latest DALL·E model, released in November 2023, supports more realistic, accurate image generation with stronger detail.',
-    displayName: 'DALL·E 3',
-    id: 'dall-e-3',
-    parameters: {
-      prompt: { default: '' },
-      quality: {
-        default: 'standard',
-        enum: ['standard', 'hd'],
-      },
-      size: {
-        default: '1024x1024',
-        enum: ['1024x1024', '1792x1024', '1024x1792'],
-      },
-    },
-    pricing: {
-      units: [
-        {
-          lookup: {
-            prices: {
-              hd_1024x1024: 0.08,
-              hd_1024x1792: 0.12,
-              hd_1792x1024: 0.12,
-              standard_1024x1024: 0.04,
-              standard_1024x1792: 0.08,
-              standard_1792x1024: 0.08,
-            },
-            pricingParams: ['quality', 'size'],
-          },
-          name: 'imageGeneration',
-          strategy: 'lookup',
-          unit: 'image',
-        },
-      ],
-    },
-    type: 'image',
-  },
-  {
-    description:
-      'Second-generation DALL·E model with more realistic, accurate image generation and 4× the resolution of the first generation.',
-    displayName: 'DALL·E 2',
-    id: 'dall-e-2',
-    parameters: {
-      imageUrl: { default: null },
-      prompt: { default: '' },
-      size: {
-        default: '1024x1024',
-        enum: ['256x256', '512x512', '1024x1024'],
-      },
-    },
-    pricing: {
-      units: [
-        {
-          lookup: {
-            prices: {
-              '1024x1024': 0.02,
-              '256x256': 0.016,
-              '512x512': 0.018,
-            },
-            pricingParams: ['size'],
-          },
-          name: 'imageGeneration',
-          strategy: 'lookup',
-          unit: 'image',
-        },
-      ],
-    },
-    type: 'image',
-  },
 ];

 // GPT-4o and GPT-4o-mini realtime models
@@ -182,56 +182,6 @@ const qiniuChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      search: true,
-      vision: true,
-    },
-    contextWindowTokens: 2_000_000,
-    description:
-      'We’re excited to release Grok 4 Fast, our latest progress in cost-effective reasoning models.',
-    displayName: 'Grok 4 Fast',
-    enabled: true,
-    id: 'x-ai/grok-4-fast',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 7.2, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 12.6, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-09',
-    settings: {
-      searchImpl: 'params',
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 256_000,
-    description:
-      'We’re excited to launch grok-code-fast-1, a fast and cost-effective reasoning model that excels at agentic coding.',
-    displayName: 'Grok Code Fast 1',
-    id: 'x-ai/grok-code-fast-1',
-    pricing: {
-      units: [
-        { name: 'textInput_cacheRead', rate: 0.02, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 1.5, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-08-27',
-    // settings: {
-    // reasoning_effort is not supported by grok-code. Specifying reasoning_effort parameter will get an error response.
-    // extendParams: ['reasoningEffort'],
-    // },
-    type: 'chat',
-  },
 ];

 export const allModels = [...qiniuChatModels];
@@ -1559,7 +1559,7 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 1.2, upTo: 0.256 },
+            { rate: 1.2, upTo: 256_000 },
            { rate: 4.8, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1568,7 +1568,7 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 7.2, upTo: 0.256 },
+            { rate: 7.2, upTo: 256_000 },
            { rate: 28.8, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1577,7 +1577,7 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textInput_cacheRead',
          strategy: 'tiered',
          tiers: [
-            { rate: 1.2 * 0.2, upTo: 0.256 },
+            { rate: 1.2 * 0.2, upTo: 256_000 },
            { rate: 4.8 * 0.2, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1616,8 +1616,8 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.2, upTo: 0.128 },
-            { rate: 0.8, upTo: 0.256 },
+            { rate: 0.2, upTo: 128_000 },
+            { rate: 0.8, upTo: 256_000 },
            { rate: 1.2, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1626,8 +1626,8 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 2, upTo: 0.128 },
-            { rate: 8, upTo: 0.256 },
+            { rate: 2, upTo: 128_000 },
+            { rate: 8, upTo: 256_000 },
            { rate: 12, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1636,8 +1636,8 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textInput_cacheRead',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.2 * 0.2, upTo: 0.128 },
-            { rate: 0.8 * 0.2, upTo: 0.256 },
+            { rate: 0.2 * 0.2, upTo: 128_000 },
+            { rate: 0.8 * 0.2, upTo: 256_000 },
            { rate: 1.2 * 0.2, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1673,8 +1673,8 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.15, upTo: 0.128 },
-            { rate: 0.6, upTo: 0.256 },
+            { rate: 0.15, upTo: 128_000 },
+            { rate: 0.6, upTo: 256_000 },
            { rate: 1.2, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1683,8 +1683,8 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 1.5, upTo: 0.128 },
-            { rate: 6, upTo: 0.256 },
+            { rate: 1.5, upTo: 128_000 },
+            { rate: 6, upTo: 256_000 },
            { rate: 12, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1693,8 +1693,8 @@ const qwenChatModels: AIChatModelCard[] = [
          name: 'textInput_cacheRead',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.15 * 0.2, upTo: 0.128 },
-            { rate: 0.6 * 0.2, upTo: 0.256 },
+            { rate: 0.15 * 0.2, upTo: 128_000 },
+            { rate: 0.6 * 0.2, upTo: 256_000 },
            { rate: 1.2 * 0.2, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -1739,6 +1739,40 @@ const qwenChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
+  {
+    abilities: {
+      functionCall: true,
+      reasoning: true,
+      search: true,
+      video: true,
+      vision: true,
+    },
+    config: {
+      deploymentName: 'qwen3.7-plus', // Supports context caching
+    },
+    contextWindowTokens: 1_000_000,
+    description:
+      'Qwen3.7 Plus is a multimodal interactive hybrid agent model, building upon the Qwen3.7 series text capabilities to unify vision and language. It excels at GUI operation, visual coding, and complex agentic workflows.',
+    displayName: 'Qwen3.7 Plus',
+    enabled: true,
+    id: 'qwen3.7-plus',
+    maxOutput: 65_536,
+    organization: 'Qwen',
+    pricing: {
+      currency: 'CNY',
+      units: [
+        { name: 'textInput_cacheRead', rate: 2 * 0.2, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput', rate: 2, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textOutput', rate: 8, strategy: 'fixed', unit: 'millionTokens' },
+      ],
+    },
+    releasedAt: '2026-06-01',
+    settings: {
+      extendParams: ['enableReasoning', 'reasoningBudgetToken', 'preserveThinking'],
+      searchImpl: 'params',
+    },
+    type: 'chat',
+  },
  {
    abilities: {
      functionCall: true,
@@ -1752,44 +1786,19 @@ const qwenChatModels: AIChatModelCard[] = [
    },
    contextWindowTokens: 1_000_000,
    description:
-      'Qwen 3.6-Plus introduces major upgrades in coding capabilities, with a focus on Agentic Coding and front-end development, significantly enhancing the Vibe Coding experience. Its reasoning ability across general scenarios has been further improved. In terms of multimodality, capabilities such as universal recognition, OCR, and object localization have been substantially enhanced. It also fixes known issues from the Qwen 3.5-Plus release. Usage remains the same as Qwen 3.5-Plus.',
+      'Qwen3.6 Plus supports text, image, and video input. It delivers a balanced performance across quality, speed, and cost. Its multimodal capabilities are significantly improved compared to the Qwen3 VL series.',
    displayName: 'Qwen3.6 Plus',
-    enabled: true,
    id: 'qwen3.6-plus',
    maxOutput: 65_536,
    organization: 'Qwen',
    pricing: {
      currency: 'CNY',
      units: [
-        {
-          lookup: {
-            prices: {
-              '[0, 0.256]': 2 * 0.1,
-              '[0.256, infinity]': 8 * 0.1,
-            },
-            pricingParams: ['textInputRange'],
-          },
-          name: 'textInput_cacheRead',
-          strategy: 'lookup',
-          unit: 'millionTokens',
-        },
-        {
-          lookup: {
-            prices: {
-              '[0, 0.256]': 2 * 1.25,
-              '[0.256, infinity]': 8 * 1.25,
-            },
-            pricingParams: ['textInputRange'],
-          },
-          name: 'textInput_cacheWrite',
-          strategy: 'lookup',
-          unit: 'millionTokens',
-        },
        {
          lookup: {
            prices: {
              '[0, 0.256]': 2,
-              '[0.256, infinity]': 8,
+              '[0.256, 1]': 8,
            },
            pricingParams: ['textInputRange'],
          },
@@ -1801,7 +1810,7 @@ const qwenChatModels: AIChatModelCard[] = [
          lookup: {
            prices: {
              '[0, 0.256]': 12,
-              '[0.256, infinity]': 48,
+              '[0.256, 1]': 48,
            },
            pricingParams: ['textInputRange'],
          },
@@ -1809,11 +1818,23 @@ const qwenChatModels: AIChatModelCard[] = [
          strategy: 'lookup',
          unit: 'millionTokens',
        },
+        {
+          lookup: {
+            prices: {
+              '[0, 0.256]': 2 * 0.2,
+              '[0.256, 1]': 8 * 0.2,
+            },
+            pricingParams: ['textInputRange'],
+          },
+          name: 'textInput_cacheRead',
+          strategy: 'lookup',
+          unit: 'millionTokens',
+        },
      ],
    },
    releasedAt: '2026-04-02',
    settings: {
-      extendParams: ['enableReasoning', 'reasoningBudgetToken'],
+      extendParams: ['enableReasoning', 'reasoningBudgetToken', 'preserveThinking'],
      searchImpl: 'params',
    },
    type: 'chat',
@@ -2058,13 +2079,44 @@ const qwenChatModels: AIChatModelCard[] = [
      search: true,
    },
    config: {
-      deploymentName: 'qwen3.6-max-preview', // Supports context caching
+      deploymentName: 'qwen3.7-max', // Supports context caching
+    },
+    contextWindowTokens: 1_000_000,
+    description:
+      'Qwen3.7 Max is the flagship omnipotent model of the AI agent era, offering comprehensive capabilities across text, image, and video understanding. It provides superior reasoning, function calling, and agent task execution performance.',
+    displayName: 'Qwen3.7 Max',
+    enabled: true,
+    id: 'qwen3.7-max',
+    maxOutput: 65_536,
+    organization: 'Qwen',
+    pricing: {
+      currency: 'CNY',
+      units: [
+        { name: 'textInput_cacheRead', rate: 12 * 0.2, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput', rate: 12, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textOutput', rate: 36, strategy: 'fixed', unit: 'millionTokens' },
+      ],
+    },
+    releasedAt: '2026-05-20',
+    settings: {
+      extendParams: ['enableReasoning', 'reasoningBudgetToken', 'preserveThinking'],
+      searchImpl: 'params',
+    },
+    type: 'chat',
+  },
+  {
+    abilities: {
+      functionCall: true,
+      reasoning: true,
+      search: true,
+    },
+    config: {
+      deploymentName: 'qwen3.6-max-preview',
    },
    contextWindowTokens: 262_144,
    description:
      'The largest closed-source model in the Qwen3.6 series. It delivers stronger world knowledge, instruction following, and agentic coding performance for complex tasks. It is text-only, supports thinking mode by default, explicit caching, and function calling.',
    displayName: 'Qwen3.6 Max Preview',
-    enabled: true,
    id: 'qwen3.6-max-preview',
    maxOutput: 65_536,
    organization: 'Qwen',
@@ -2111,7 +2163,7 @@ const qwenChatModels: AIChatModelCard[] = [
    },
    releasedAt: '2026-04-18',
    settings: {
-      extendParams: ['enableReasoning', 'reasoningBudgetToken'],
+      extendParams: ['enableReasoning', 'reasoningBudgetToken', 'preserveThinking'],
      searchImpl: 'params',
    },
    type: 'chat',
@@ -438,26 +438,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 192_000,
-    description:
-      'MiniMax-M2.5 is the latest large language model from MiniMax, featuring a Mixture-of-Experts (MoE) architecture with 229 billion total parameters. It achieves industry-leading performance in programming, agent tool calling, search tasks, and office scenarios, with a SWE-Bench Verified score of 80.2% and 37% faster inference speed compared to M2.1.',
-    displayName: 'MiniMax-M2.5',
-    id: 'MiniMaxAI/MiniMax-M2.5',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 2.1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 8.4, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2026-02-13',
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -745,32 +725,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      video: true,
-      vision: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      "Kimi K2.6 is Moonshot AI's open-source native multimodal agent model. Built on MoE architecture with 1T total parameters and 32B activated, supporting 256K tokens context. It supports 4,000+ tool calls with sustained autonomous execution over 12 hours, multi-agent collaboration with up to 300 parallel sub-agents, and both Thinking and Instant inference modes.",
-    displayName: 'Kimi-K2.6 (Pro)',
-    id: 'Pro/moonshotai/Kimi-K2.6',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput_cacheRead', rate: 1.1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textInput', rate: 6.5, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 27, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2026-04-21',
-    settings: {
-      extendParams: ['enableReasoning'],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      vision: true,
@@ -849,52 +803,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      "Kimi K2 Thinking is the latest and most powerful open-source thinking model. It greatly extends multi-step reasoning depth and sustains stable tool use across 200–300 consecutive calls, setting new records on Humanity's Last Exam (HLE), BrowseComp, and other benchmarks. 'It excels in coding, math, logic, and agent scenarios. Built on an MoE architecture with ~1T total parameters, it supports a 256K context window and tool calling.",
-    displayName: 'Kimi K2 Thinking',
-    id: 'moonshotai/Kimi-K2-Thinking',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-11-07',
-    settings: {
-      extendParams: ['reasoningBudgetToken'],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'Kimi K2 Thinking Turbo is the Turbo variant optimized for reasoning speed and throughput while retaining K2 Thinking’s multi-step reasoning and tool use. It is an MoE model with ~1T total parameters, native 256K context, and stable large-scale tool calling for production scenarios with stricter latency and concurrency needs.',
-    displayName: 'Kimi K2 Thinking (Pro)',
-    id: 'Pro/moonshotai/Kimi-K2-Thinking',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 8, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 32, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-11-07',
-    settings: {
-      extendParams: ['reasoningBudgetToken'],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -1041,29 +949,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    releasedAt: '2025-09-01',
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 198_000,
-    description:
-      'Compared to GLM-4.5, GLM-4.6 expands context from 128K to 200K for more complex agent tasks. It scores higher on code benchmarks and shows stronger real-world performance in apps like Claude Code, Cline, Roo Code, and Kilo Code, including better frontend page generation. Reasoning is improved and tool use is supported during reasoning, strengthening overall capability. It integrates better into agent frameworks, improves tool/search agents, and has more human-preferred writing style and roleplay naturalness.',
-    displayName: 'GLM-4.6',
-    id: 'zai-org/GLM-4.6',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 3.5, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 14, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-30',
-    settings: {
-      extendParams: ['enableReasoning', 'reasoningBudgetToken'],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -1200,29 +1085,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'Ring-flash-2.0 is a high-performance thinking model optimized from Ling-flash-2.0-base. It uses an MoE architecture with 100B total parameters and only 6.1B active per inference. Its icepop algorithm stabilizes RL training for MoE models, enabling continued gains in complex reasoning. It achieves major breakthroughs on tough benchmarks (math contests, code generation, logical reasoning), surpassing top dense models under 40B and rivaling larger open MoE and closed reasoning models. It also performs well in creative writing, and its efficient architecture delivers fast inference at lower deployment cost for high concurrency.',
-    displayName: 'Ring Flash 2.0',
-    id: 'inclusionAI/Ring-flash-2.0',
-    settings: {
-      extendParams: ['reasoningBudgetToken'],
-    },
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-19',
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -1349,45 +1211,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    releasedAt: '2025-07-28',
    type: 'chat',
  },
-
-  {
-    abilities: {
-      functionCall: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'Kimi K2-Instruct-0905 is the newest and most powerful Kimi K2. It is a top-tier MoE model with 1T total and 32B active parameters. Key features include stronger agentic coding intelligence with significant gains on benchmarks and real-world agent tasks, plus improved frontend coding aesthetics and usability.',
-    displayName: 'Kimi K2 0905',
-    id: 'moonshotai/Kimi-K2-Instruct-0905',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-05',
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'Kimi K2-Instruct-0905 is the newest and most powerful Kimi K2. It is a top-tier MoE model with 1T total and 32B active parameters. Key features include stronger agentic coding intelligence with significant gains on benchmarks and real-world agent tasks, plus improved frontend coding aesthetics and usability.',
-    displayName: 'Kimi K2 0905 (Pro)',
-    id: 'Pro/moonshotai/Kimi-K2-Instruct-0905',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-09-05',
-    type: 'chat',
-  },
  {
    abilities: {
      reasoning: true,
@@ -1410,51 +1233,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-
-  {
-    abilities: {
-      functionCall: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'Qwen3-235B-A22B-Instruct-2507 is a flagship Qwen3 MoE model with 235B total and 22B active parameters. It is an updated non-thinking version focused on improving instruction following, logical reasoning, text understanding, math, science, coding, and tool use. It also expands multilingual long-tail knowledge and better aligns with user preferences for subjective open-ended tasks.',
-    displayName: 'Qwen3 235B A22B Instruct 2507',
-    id: 'Qwen/Qwen3-235B-A22B-Instruct-2507',
-    organization: 'Qwen',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 2.5, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 10, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-07-21',
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 262_144,
-    description:
-      'Qwen3-30B-A3B-Thinking-2507 is the latest thinking model in the Qwen3 series. It is an MoE model with 30.5B total and 3.3B active parameters, focused on complex tasks. It shows significant gains in logic, math, science, coding, and academic benchmarks, and improves instruction following, tool use, text generation, and preference alignment. It natively supports 256K context and can extend to 1M tokens. This version is designed for thinking mode with detailed step-by-step reasoning and strong agent capabilities.',
-    displayName: 'Qwen3 30B A3B Thinking 2507',
-    id: 'Qwen/Qwen3-30B-A3B-Thinking-2507',
-    organization: 'Qwen',
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 0.7, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 2.8, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-07-30',
-    settings: {
-      extendParams: ['reasoningBudgetToken'],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -1547,53 +1325,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      reasoning: true,
-      vision: true,
-    },
-    contextWindowTokens: 65_536,
-    description:
-      'GLM-4.1V-9B-Thinking is an open-source VLM from Zhipu AI and Tsinghua KEG Lab, designed for complex multimodal cognition. Built on GLM-4-9B-0414, it adds chain-of-thought reasoning and RL to significantly improve cross-modal reasoning and stability.',
-    displayName: 'GLM-4.1V 9B Thinking (Free)',
-    id: 'THUDM/GLM-4.1V-9B-Thinking',
-    settings: {
-      extendParams: ['reasoningBudgetToken'],
-    },
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 0, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 0, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-07-02',
-    type: 'chat',
-  },
-
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'GLM-Z1-32B-0414 is a deep-thinking reasoning model built from GLM-4-32B-0414 with cold-start data and expanded RL, further trained on math, code, and logic. It significantly improves math ability and complex task solving over the base model.',
-    displayName: 'GLM-Z1 32B 0414',
-    id: 'THUDM/GLM-Z1-32B-0414',
-    settings: {
-      extendParams: ['reasoningBudgetToken'],
-    },
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-04-14',
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -1864,30 +1595,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      vision: true,
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'GLM-4.6V achieves SOTA visual understanding accuracy at the same parameter scale, and is the first to natively integrate Function Call capability into vision models in the model architecture, connecting the chain from visual perception to executable action (Action), providing a unified technical foundation for multimodal Agents in real business scenarios. Visual context window expanded to 128K, supporting long video stream processing and high-resolution multi-image analysis.',
-    displayName: 'GLM-4.6V',
-    id: 'zai-org/GLM-4.6V',
-    settings: {
-      extendParams: ['enableReasoning', 'reasoningBudgetToken'],
-    },
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 3.5, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 14, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    releasedAt: '2025-12-08',
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -41,7 +41,6 @@ const stepfunChatModels: AIChatModelCard[] = [
    description:
      'Built on Step 3.5 Flash and optimized for high-frequency agent scenarios, it further improves token efficiency and inference speed while retaining flagship-level reasoning and tool-calling capabilities. It also supports switching to a low-reasoning mode to reduce resource consumption. Additionally, targeted optimizations have been made to enhance compatibility with coding tasks and agent frameworks.',
    displayName: 'Step 3.5 Flash 2603',
-    enabled: true,
    id: 'step-3.5-flash-2603',
    pricing: {
      currency: 'CNY',
@@ -95,7 +94,7 @@ const stepfunChatModels: AIChatModelCard[] = [
          name: 'textInput_cacheRead',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.3, upTo: 0.004 },
+            { rate: 0.3, upTo: 4_000 },
            { rate: 0.8, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -104,7 +103,7 @@ const stepfunChatModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 1.5, upTo: 0.004 },
+            { rate: 1.5, upTo: 4_000 },
            { rate: 4, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -113,7 +112,7 @@ const stepfunChatModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 4, upTo: 0.004 },
+            { rate: 4, upTo: 4_000 },
            { rate: 8, upTo: 'infinity' }, // Still differs from documentation
          ],
          unit: 'millionTokens',
@@ -345,7 +345,7 @@ const streamlakeModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 7, upTo: 0.256 },
+            { rate: 7, upTo: 256_000 },
            { rate: 14, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -354,7 +354,7 @@ const streamlakeModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 21, upTo: 0.256 },
+            { rate: 21, upTo: 256_000 },
            { rate: 42, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -74,6 +74,70 @@ const seedance15ProParams: VideoModelParamsSchema = {
 };

 const doubaoChatModels: AIChatModelCard[] = [
+  {
+    abilities: {
+      functionCall: true,
+      reasoning: true,
+      search: true,
+    },
+    config: {
+      deploymentName: 'deepseek-v4-pro-260425',
+    },
+    contextWindowTokens: 1_048_576,
+    description:
+      'DeepSeek-V4-Pro is DeepSeek’s flagship MoE model on Volcano Ark, supporting both non-thinking and thinking modes for advanced reasoning, code generation, and complex agent workflows.',
+    displayName: 'DeepSeek V4 Pro',
+    enabled: true,
+    id: 'deepseek-v4-pro',
+    maxOutput: 393_216,
+    pricing: {
+      currency: 'CNY',
+      units: [
+        { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput', rate: 12, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textOutput', rate: 24, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput_cacheWrite', rate: 0.017, strategy: 'fixed', unit: 'millionTokens' },
+      ],
+    },
+    releasedAt: '2026-04-24',
+    settings: {
+      extendParams: ['enableReasoning'],
+      searchImpl: 'params',
+    },
+    type: 'chat',
+  },
+  {
+    abilities: {
+      functionCall: true,
+      reasoning: true,
+      search: true,
+    },
+    config: {
+      deploymentName: 'deepseek-v4-flash-260425',
+    },
+    contextWindowTokens: 1_048_576,
+    description:
+      'DeepSeek-V4-Flash is DeepSeek’s efficient 1M-context model on Volcano Ark, balancing speed and cost while keeping strong reasoning and agent capabilities.',
+    displayName: 'DeepSeek V4 Flash',
+    enabled: true,
+    id: 'deepseek-v4-flash',
+    maxOutput: 393_216,
+    pricing: {
+      currency: 'CNY',
+      units: [
+        { name: 'textInput_cacheRead', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textOutput', rate: 2, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput_cacheWrite', rate: 0.017, strategy: 'fixed', unit: 'millionTokens' },
+      ],
+    },
+    releasedAt: '2026-04-24',
+    settings: {
+      extendParams: ['enableReasoning'],
+      searchImpl: 'params',
+    },
+    type: 'chat',
+  },
  {
    abilities: {
      functionCall: true,
@@ -562,33 +626,6 @@ const doubaoChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    config: {
-      deploymentName: 'deepseek-v3-1-terminus',
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'DeepSeek-V3.1 is a new hybrid reasoning model from DeepSeek, supporting both thinking and non-thinking modes and offering higher thinking efficiency than DeepSeek-R1-0528. Post-training optimizations greatly improve agent tool use and agent-task performance. It supports a 128k context window and up to 64k output tokens.',
-    displayName: 'DeepSeek V3.1',
-    id: 'deepseek-v3.1',
-    maxOutput: 32_768,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput_cacheRead', rate: 0.8, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 12, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    settings: {
-      extendParams: ['enableReasoning'],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -700,62 +737,6 @@ const doubaoChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      video: true,
-      vision: true,
-      search: true,
-    },
-    config: {
-      deploymentName: 'doubao-seed-1-6-lite-251015',
-    },
-    contextWindowTokens: 256_000,
-    description:
-      'Doubao-Seed-1.6-lite is a new multimodal deep-reasoning model with adjustable reasoning effort (Minimal, Low, Medium, High), delivering better value and a strong choice for common tasks, with a context window up to 256k.',
-    displayName: 'Doubao Seed 1.6 Lite',
-    id: 'doubao-seed-1.6-lite',
-    maxOutput: 32_000,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        {
-          lookup: {
-            prices: {
-              '[0, 0.032]': 0.3,
-              '[0.032, 0.128]': 0.6,
-              '[0.128, 0.256]': 1.2,
-            },
-            pricingParams: ['textInputRange'],
-          },
-          name: 'textInput',
-          strategy: 'lookup',
-          unit: 'millionTokens',
-        },
-        {
-          lookup: {
-            prices: {
-              '[0, 0.032]_[0, 0.0002]': 0.6,
-              '[0, 0.032]_[0.0002, infinity]': 2.4,
-              '[0.032, 0.128]_[0, infinity]': 4,
-              '[0.128, 0.256]_[0, infinity]': 12,
-            },
-            pricingParams: ['textInputRange', 'textOutputRange'],
-          },
-          name: 'textOutput',
-          strategy: 'lookup',
-          unit: 'millionTokens',
-        },
-        { name: 'textInput_cacheRead', rate: 0.06, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    settings: {
-      extendParams: ['gpt5ReasoningEffort'],
-      searchImpl: 'params',
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -811,51 +792,6 @@ const doubaoChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-    },
-    config: {
-      deploymentName: 'deepseek-r1-250528',
-    },
-    contextWindowTokens: 131_072,
-    description:
-      'The latest 0528 release of DeepSeek-R1 applies large-scale reinforcement learning in post-training, greatly boosting reasoning with very little labeled data. It matches the OpenAI o1 production model on math, code, and natural language reasoning tasks.',
-    displayName: 'DeepSeek R1',
-    id: 'deepseek-r1',
-    maxOutput: 16_384,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-    },
-    config: {
-      deploymentName: 'deepseek-v3-250324',
-    },
-    contextWindowTokens: 128_000,
-    description:
-      'DeepSeek-V3 is a MoE model developed by DeepSeek. It surpasses other open models like Qwen2.5-72B and Llama-3.1-405B on many benchmarks, and is competitive with leading closed models such as GPT-4o and Claude 3.5 Sonnet.',
-    displayName: 'DeepSeek V3',
-    id: 'deepseek-v3',
-    maxOutput: 16_384,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 2, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 8, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -1055,28 +991,6 @@ const volcengineImageModels: AIImageModelCard[] = [
    releasedAt: '2025-09-09',
    type: 'image',
  },
-  {
-    description:
-      'Seedream 3.0 is an image generation model from ByteDance Seed, supporting text and image inputs with highly controllable, high-quality image generation. It generates images from text prompts.',
-    displayName: 'Seedream 3.0 Text-to-Image',
-    id: 'doubao-seedream-3-0-t2i-250415',
-    parameters: {
-      cfg: { default: 2.5, max: 10, min: 1, step: 0.1 },
-      height: { default: 1024, max: 3549, min: 296, step: 1 },
-      prompt: {
-        default: '',
-      },
-      seed: { default: null },
-      watermark: { default: false },
-      width: { default: 1024, max: 3549, min: 296, step: 1 },
-    },
-    pricing: {
-      currency: 'CNY',
-      units: [{ name: 'imageGeneration', rate: 0.259, strategy: 'fixed', unit: 'image' }],
-    },
-    releasedAt: '2025-04-15',
-    type: 'image',
-  },
 ];

 const volcengineVideoModels: AIVideoModelCard[] = [
@@ -1224,78 +1138,6 @@ const volcengineVideoModels: AIVideoModelCard[] = [
    releasedAt: '2025-05-28',
    type: 'video',
  },
-  {
-    description:
-      'Stable generation quality with high cost-effectiveness, capable of generating videos from a first frame, first-and-last frames, or reference images.',
-    displayName: 'Seedance 1.0 Lite I2V',
-    id: 'doubao-seedance-1-0-lite-i2v-250428',
-    organization: 'ByteDance',
-    parameters: {
-      aspectRatio: {
-        default: '16:9',
-        enum: ['21:9', '16:9', '4:3', '1.1', '3:4', '9:16'],
-      },
-      cameraFixed: { default: false },
-      endImageUrl: {
-        aspectRatio: { max: 2.5, min: 0.4 },
-        default: null,
-        height: { max: 6000, min: 300 },
-        maxFileSize: 30 * 1024 * 1024,
-        requiresImageUrl: true,
-        width: { max: 6000, min: 300 },
-      },
-      imageUrls: {
-        aspectRatio: { max: 2.5, min: 0.4 },
-        default: [],
-        height: { max: 6000, min: 300 },
-        maxFileSize: 30 * 1024 * 1024,
-        maxCount: 4,
-        width: { max: 6000, min: 300 },
-      },
-      duration: { default: 5, max: 12, min: 2 },
-      prompt: { default: '' },
-      resolution: {
-        default: '720p',
-        enum: ['480p', '720p', '1080p'],
-      },
-      seed: { default: null },
-      watermark: { default: false },
-    },
-    pricing: {
-      currency: 'CNY',
-      units: [{ name: 'videoGeneration', rate: 10, strategy: 'fixed', unit: 'millionTokens' }],
-    },
-    releasedAt: '2025-04-28',
-    type: 'video',
-  },
-  {
-    description:
-      'Stable generation quality with high cost-effectiveness, capable of generating videos based on text instructions.',
-    displayName: 'Seedance 1.0 Lite T2V',
-    id: 'doubao-seedance-1-0-lite-t2v-250428',
-    organization: 'ByteDance',
-    parameters: {
-      aspectRatio: {
-        default: '16:9',
-        enum: ['21:9', '16:9', '4:3', '1.1', '3:4', '9:16'],
-      },
-      cameraFixed: { default: false },
-      duration: { default: 5, max: 12, min: 2 },
-      prompt: { default: '' },
-      resolution: {
-        default: '720p',
-        enum: ['480p', '720p', '1080p'],
-      },
-      seed: { default: null },
-      watermark: { default: false },
-    },
-    pricing: {
-      currency: 'CNY',
-      units: [{ name: 'videoGeneration', rate: 10, strategy: 'fixed', unit: 'millionTokens' }],
-    },
-    releasedAt: '2025-04-28',
-    type: 'video',
-  },
 ];

 export const allModels = [...doubaoChatModels, ...volcengineImageModels, ...volcengineVideoModels];
@@ -1001,26 +1001,6 @@ const wenxinChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      video: true,
-      vision: true,
-    },
-    contextWindowTokens: 32_768,
-    description:
-      'Qwen2.5 VL 32B Instruct is an open-source multimodal model suitable for private deployment and multi-scenario use.',
-    displayName: 'Qwen2.5 VL 32B Instruct',
-    id: 'qwen2.5-vl-32b-instruct',
-    maxOutput: 8192,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 8, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 24, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      vision: true,
@@ -1556,61 +1536,6 @@ const wenxinChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    contextWindowTokens: 32_768,
-    description: 'Qwen3 4B is suitable for small-to-mid apps and local inference.',
-    displayName: 'Qwen3 4B',
-    id: 'qwen3-4b',
-    maxOutput: 8192,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 1.2, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    settings: {
-      extendParams: ['enableReasoning', 'reasoningBudgetToken'],
-    },
-    type: 'chat',
-  },
-  {
-    contextWindowTokens: 32_768,
-    description: 'Qwen3 1.7B is an ultra-light model for edge and device deployment.',
-    displayName: 'Qwen3 1.7B',
-    id: 'qwen3-1.7b',
-    maxOutput: 8192,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 1.2, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    settings: {
-      extendParams: ['enableReasoning', 'reasoningBudgetToken'],
-    },
-    type: 'chat',
-  },
-  {
-    contextWindowTokens: 32_768,
-    description:
-      'Qwen3 0.6B is an entry-level model for simple reasoning and very constrained environments.',
-    displayName: 'Qwen3 0.6B',
-    id: 'qwen3-0.6b',
-    maxOutput: 8192,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 1.2, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    settings: {
-      extendParams: ['enableReasoning', 'reasoningBudgetToken'],
-    },
-    type: 'chat',
-  },
 ];

 const wenxinImageModels: AIImageModelCard[] = [
@@ -20,7 +20,7 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textInput_cacheRead',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.2, upTo: 200_000 },
+            { rate: 0.2, upTo: 0.2 },
            { rate: 0.4, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -29,7 +29,7 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 1.25, upTo: 200_000 },
+            { rate: 1.25, upTo: 0.2 },
            { rate: 2.5, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -38,7 +38,7 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 2.5, upTo: 200_000 },
+            { rate: 2.5, upTo: 0.2 },
            { rate: 5, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -59,7 +59,7 @@ const xaiChatModels: AIChatModelCard[] = [
      structuredOutput: true,
      vision: true,
    },
-    contextWindowTokens: 2_000_000,
+    contextWindowTokens: 1_000_000,
    description: 'A non-reasoning variant for simple use cases',
    displayName: 'Grok 4.20 (Non-Reasoning)',
    enabled: true,
@@ -70,7 +70,7 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textInput_cacheRead',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.2, upTo: 200_000 },
+            { rate: 0.2, upTo: 0.2 },
            { rate: 0.4, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -79,8 +79,8 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 2, upTo: 200_000 },
-            { rate: 4, upTo: 'infinity' },
+            { rate: 1.25, upTo: 0.2 },
+            { rate: 2.5, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
        },
@@ -88,8 +88,8 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 6, upTo: 200_000 },
-            { rate: 12, upTo: 'infinity' },
+            { rate: 2.5, upTo: 0.2 },
+            { rate: 5, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
        },
@@ -109,7 +109,7 @@ const xaiChatModels: AIChatModelCard[] = [
      structuredOutput: true,
      vision: true,
    },
-    contextWindowTokens: 2_000_000,
+    contextWindowTokens: 1_000_000,
    description: 'Intelligent, blazing-fast model that reasons before responding',
    displayName: 'Grok 4.20',
    enabled: true,
@@ -120,7 +120,7 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textInput_cacheRead',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.2, upTo: 200_000 },
+            { rate: 0.2, upTo: 0.2 },
            { rate: 0.4, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -129,8 +129,8 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 2, upTo: 200_000 },
-            { rate: 4, upTo: 'infinity' },
+            { rate: 1.25, upTo: 0.2 },
+            { rate: 2.5, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
        },
@@ -138,8 +138,8 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 6, upTo: 200_000 },
-            { rate: 12, upTo: 'infinity' },
+            { rate: 2.5, upTo: 0.2 },
+            { rate: 5, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
        },
@@ -170,7 +170,7 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textInput_cacheRead',
          strategy: 'tiered',
          tiers: [
-            { rate: 0.2, upTo: 200_000 },
+            { rate: 0.2, upTo: 0.2 },
            { rate: 0.4, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
@@ -179,8 +179,8 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textInput',
          strategy: 'tiered',
          tiers: [
-            { rate: 2, upTo: 200_000 },
-            { rate: 4, upTo: 'infinity' },
+            { rate: 1.25, upTo: 0.2 },
+            { rate: 2.5, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
        },
@@ -188,8 +188,8 @@ const xaiChatModels: AIChatModelCard[] = [
          name: 'textOutput',
          strategy: 'tiered',
          tiers: [
-            { rate: 6, upTo: 200_000 },
-            { rate: 12, upTo: 'infinity' },
+            { rate: 2.5, upTo: 0.2 },
+            { rate: 5, upTo: 'infinity' },
          ],
          unit: 'millionTokens',
        },
@@ -18,33 +18,9 @@ const xiaomimimoChatModels: AIChatModelCard[] = [
    pricing: {
      currency: 'CNY',
      units: [
-        {
-          name: 'textInput_cacheRead',
-          strategy: 'tiered',
-          tiers: [
-            { rate: 1.4, upTo: 0.256 },
-            { rate: 2.8, upTo: 'infinity' },
-          ],
-          unit: 'millionTokens',
-        },
-        {
-          name: 'textInput',
-          strategy: 'tiered',
-          tiers: [
-            { rate: 7, upTo: 0.256 },
-            { rate: 14, upTo: 'infinity' },
-          ],
-          unit: 'millionTokens',
-        },
-        {
-          name: 'textOutput',
-          strategy: 'tiered',
-          tiers: [
-            { rate: 21, upTo: 0.256 },
-            { rate: 42, upTo: 'infinity' },
-          ],
-          unit: 'millionTokens',
-        },
+        { name: 'textInput_cacheRead', rate: 0.025, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput', rate: 3, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textOutput', rate: 6, strategy: 'fixed', unit: 'millionTokens' },
      ],
    },
    releasedAt: '2026-04-22',
@@ -73,33 +49,9 @@ const xiaomimimoChatModels: AIChatModelCard[] = [
    pricing: {
      currency: 'CNY',
      units: [
-        {
-          name: 'textInput_cacheRead',
-          strategy: 'tiered',
-          tiers: [
-            { rate: 0.56, upTo: 0.256 },
-            { rate: 1.12, upTo: 'infinity' },
-          ],
-          unit: 'millionTokens',
-        },
-        {
-          name: 'textInput',
-          strategy: 'tiered',
-          tiers: [
-            { rate: 2.8, upTo: 0.256 },
-            { rate: 5.6, upTo: 'infinity' },
-          ],
-          unit: 'millionTokens',
-        },
-        {
-          name: 'textOutput',
-          strategy: 'tiered',
-          tiers: [
-            { rate: 14, upTo: 0.256 },
-            { rate: 28, upTo: 'infinity' },
-          ],
-          unit: 'millionTokens',
-        },
+        { name: 'textInput_cacheRead', rate: 0.02, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textInput', rate: 1, strategy: 'fixed', unit: 'millionTokens' },
+        { name: 'textOutput', rate: 2, strategy: 'fixed', unit: 'millionTokens' },
      ],
    },
    releasedAt: '2026-04-22',
@@ -120,7 +72,6 @@ const xiaomimimoChatModels: AIChatModelCard[] = [
    description:
      'MiMo-V2-Flash is now officially open source! This is a MoE (Mixture-of-Experts) model purpose-built for extreme inference efficiency, with 309B total parameters (15B activated). Through innovations in a hybrid attention architecture and multi-layer MTP inference acceleration, it ranks among the global Top 2 open-source models across multiple agent benchmarking suites. Its coding capabilities surpass all open-source models and rival leading closed-source models such as Claude 4.5 Sonnet, while incurring only 2.5% of the inference cost and delivering 2× faster generation speed—pushing large-model inference efficiency to the limit.',
    displayName: 'MiMo-V2 Flash',
-    enabled: true,
    id: 'mimo-v2-flash',
    maxOutput: 65_536,
    pricing: {
@@ -267,80 +267,6 @@ const zenmuxChatModels: AIChatModelCard[] = [
      reasoning: true,
      vision: true,
    },
-    contextWindowTokens: 2_000_000,
-    description:
-      'Grok 4 Fast is xAI’s high-throughput, low-cost model (supports a 2M context window), ideal for high-concurrency and long-context use cases.',
-    displayName: 'Grok 4 Fast',
-    id: 'x-ai/grok-4-fast',
-    maxOutput: 30_000,
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 0.5, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      functionCall: true,
-      vision: true,
-    },
-    contextWindowTokens: 2_000_000,
-    description:
-      'Grok 4 Fast (Non-Reasoning) is xAI’s high-throughput, low-cost multimodal model (supports a 2M context window) for scenarios sensitive to latency and cost that do not require in-model reasoning. It sits alongside the reasoning version of Grok 4 Fast, and reasoning can be enabled via the API reasoning parameter when needed. Prompts and completions may be used by xAI or OpenRouter to improve future models.',
-    displayName: 'Grok 4 Fast (Non-Reasoning)',
-    id: 'x-ai/grok-4-fast-non-reasoning',
-    maxOutput: 30_000,
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 0.5, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      reasoning: true,
-    },
-    contextWindowTokens: 256_000,
-    description:
-      "Grok 4 is xAI's flagship reasoning model with strong reasoning and multimodal capability.",
-    displayName: 'Grok 4',
-    id: 'x-ai/grok-4',
-    maxOutput: 256_000,
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 3, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 15, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      reasoning: true,
-    },
-    contextWindowTokens: 256_000,
-    description:
-      "Grok Code Fast 1 is xAI's fast code model with readable, engineering-friendly output.",
-    displayName: 'Grok Code Fast 1',
-    id: 'x-ai/grok-code-fast-1',
-    maxOutput: 10_000,
-    pricing: {
-      units: [
-        { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' },
-        { name: 'textOutput', rate: 1.5, strategy: 'fixed', unit: 'millionTokens' },
-      ],
-    },
-    type: 'chat',
-  },
-  {
-    abilities: {
-      reasoning: true,
-      vision: true,
-    },
    contextWindowTokens: 128_000,
    description:
      'ERNIE 5.0 Thinking Preview is Baidu’s next-generation native multimodal ERNIE model, strong in multimodal understanding, instruction following, creation, factual Q&A, and tool calling.',
@@ -69,6 +69,68 @@ const zhipuChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
+  {
+    abilities: {
+      functionCall: true,
+      reasoning: true,
+      search: true,
+      video: true,
+      vision: true,
+    },
+    contextWindowTokens: 200_000,
+    description:
+      'GLM-5V-Turbo is Zhipu’s multimodal Coding foundation model for visual programming tasks. It natively handles images, video, text, and files, and is optimized for long-horizon planning, complex coding, and agent execution in multimodal workflows.',
+    displayName: 'GLM-5V-Turbo',
+    enabled: true,
+    id: 'glm-5v-turbo',
+    maxOutput: 131_072,
+    pricing: {
+      currency: 'CNY',
+      units: [
+        {
+          lookup: {
+            prices: {
+              '[0, 0.032]': 1.2,
+              '[0.032, infinity]': 1.8,
+            },
+            pricingParams: ['textInput'],
+          },
+          name: 'textInput_cacheRead',
+          strategy: 'lookup',
+          unit: 'millionTokens',
+        },
+        {
+          lookup: {
+            prices: {
+              '[0, 0.032]': 5,
+              '[0.032, infinity]': 7,
+            },
+            pricingParams: ['textInput'],
+          },
+          name: 'textInput',
+          strategy: 'lookup',
+          unit: 'millionTokens',
+        },
+        {
+          lookup: {
+            prices: {
+              '[0, 0.032]': 22,
+              '[0.032, infinity]': 26,
+            },
+            pricingParams: ['textInput'],
+          },
+          name: 'textOutput',
+          strategy: 'lookup',
+          unit: 'millionTokens',
+        },
+      ],
+    },
+    settings: {
+      extendParams: ['enableReasoning'],
+      searchImpl: 'params',
+    },
+    type: 'chat',
+  },
  {
    abilities: {
      functionCall: true,
@@ -185,7 +247,7 @@ const zhipuChatModels: AIChatModelCard[] = [
    },
    releasedAt: '2026-02-12',
    settings: {
-      extendParams: ['enableReasoning'],
+      extendParams: ['enableReasoning', 'preserveThinking'],
      searchImpl: 'params',
    },
    type: 'chat',
@@ -248,7 +310,7 @@ const zhipuChatModels: AIChatModelCard[] = [
    },
    releasedAt: '2025-12-22',
    settings: {
-      extendParams: ['enableReasoning'],
+      extendParams: ['enableReasoning', 'preserveThinking'],
      searchImpl: 'params',
    },
    type: 'chat',
@@ -308,69 +370,6 @@ const zhipuChatModels: AIChatModelCard[] = [
    },
    type: 'chat',
  },
-  {
-    abilities: {
-      functionCall: true,
-      reasoning: true,
-      search: true,
-      video: true,
-      vision: true,
-    },
-    contextWindowTokens: 200_000,
-    description:
-      'GLM-5V-Turbo is Zhipu’s first multimodal coding foundation model, designed for visual programming tasks. It can natively process multimodal inputs such as images, videos, and text, while excelling in long-horizon planning, complex programming, and action execution. Deeply integrated with agent workflows, it can collaborate seamlessly with agents like Claude Code and OpenClaw to complete a full closed loop of “understanding the environment → planning actions → executing tasks.”',
-    displayName: 'GLM-5V-Turbo',
-    enabled: true,
-    id: 'glm-5v-turbo',
-    maxOutput: 131_072,
-    pricing: {
-      currency: 'CNY',
-      units: [
-        {
-          lookup: {
-            prices: {
-              '[0, 0.032]': 1.2,
-              '[0.032, infinity]': 1.8,
-            },
-            pricingParams: ['textInput'],
-          },
-          name: 'textInput_cacheRead',
-          strategy: 'lookup',
-          unit: 'millionTokens',
-        },
-        {
-          lookup: {
-            prices: {
-              '[0, 0.032]': 5,
-              '[0.032, infinity]': 7,
-            },
-            pricingParams: ['textInput'],
-          },
-          name: 'textInput',
-          strategy: 'lookup',
-          unit: 'millionTokens',
-        },
-        {
-          lookup: {
-            prices: {
-              '[0, 0.032]': 22,
-              '[0.032, infinity]': 26,
-            },
-            pricingParams: ['textInput'],
-          },
-          name: 'textOutput',
-          strategy: 'lookup',
-          unit: 'millionTokens',
-        },
-      ],
-    },
-    releasedAt: '2026-04-02',
-    settings: {
-      extendParams: ['enableReasoning'],
-      searchImpl: 'params',
-    },
-    type: 'chat',
-  },
  {
    abilities: {
      functionCall: true,
@@ -257,6 +257,7 @@ export type ExtendParamsType =
  | 'reasoningBudgetToken32k'
  | 'reasoningBudgetToken80k'
  | 'enableReasoning'
+  | 'preserveThinking'
  | 'enableAdaptiveThinking'
  | 'disableContextCaching'
  | 'effort'
@@ -307,6 +308,7 @@ export const ExtendParamsTypeSchema = z.enum([
  'reasoningBudgetToken32k',
  'reasoningBudgetToken80k',
  'enableReasoning',
+  'preserveThinking',
  'enableAdaptiveThinking',
  'disableContextCaching',
  'effort',
@@ -575,21 +575,32 @@ export const createOpenAICompatibleRuntime = <T extends Record<string, any> = an
          // Apply sampling sanitization to processedPayload for the custom client path.
          // We use processedPayload (ChatStreamPayload type) here because
          // createChatCompletionStream expects ChatStreamPayload, not the OpenAI SDK format.
+          // Strip LobeHub-internal fields that should never reach downstream APIs.
+          const {
+            apiMode: _apiMode,
+            preserveThinking: _preserveThinking,
+            ...cleanProcessedPayload
+          } = processedPayload as any;
          response = customClient.createChatCompletionStream(
            this.client,
            {
-              ...processedPayload,
-              ...resolveModelSamplingParameters(processedPayload.model, processedPayload, {
-                normalizeTemperature: false,
-                preferTemperature: true,
-              }),
+              ...cleanProcessedPayload,
+              ...resolveModelSamplingParameters(
+                cleanProcessedPayload.model,
+                cleanProcessedPayload,
+                {
+                  normalizeTemperature: false,
+                  preferTemperature: true,
+                },
+              ),
            },
            this,
          ) as any;
        } else {
-          // Remove internal apiMode parameter before sending to API
-
-          const { apiMode: _, ...cleanedPayload } = postPayload as any;
+          // Remove LobeHub-internal fields before sending to downstream API.
+          // `preserveThinking` is only consumed by Qwen/Zhipu handlePayload (which runs above)
+          // and must not leak to other providers' APIs as an unknown parameter.
+          const { apiMode: _, preserveThinking: _pt, ...cleanedPayload } = postPayload as any;
          const finalPayload = {
            ...cleanedPayload,
            messages,
@@ -1214,6 +1225,7 @@ export const createOpenAICompatibleRuntime = <T extends Record<string, any> = an
      delete res.apiMode;
      delete res.frequency_penalty;
      delete res.presence_penalty;
+      delete res.preserveThinking;

      const input = await convertOpenAIResponseInputs(messages as any, {
        forceImageBase64: chatCompletion?.forceImageBase64,
@@ -72,7 +72,7 @@ const maskSensitiveUrl = (url: string) => {
 const BaseAzureOpenAI = createOpenAICompatibleRuntime({
  chatCompletion: {
    handlePayload: (payload) => {
-      const { deploymentName, enabledSearch, model, ...rest } = payload;
+      const { deploymentName, enabledSearch, model, preserveThinking: _preserveThinking, ...rest } = payload;
      const requestModel = deploymentName ?? model;

      if (responsesAPIModels.has(model) || enabledSearch) {
@@ -134,7 +134,15 @@ const BaseAzureOpenAI = createOpenAICompatibleRuntime({
  provider: ModelProvider.Azure,
  responses: {
    handlePayload: (payload) => {
-      const { deploymentName, enabledSearch, model, tools, verbosity, ...rest } = payload;
+      const {
+        deploymentName,
+        enabledSearch,
+        model,
+        preserveThinking: _preserveThinking,
+        tools,
+        verbosity,
+        ...rest
+      } = payload;
      const requestModel = deploymentName ?? model;
      const updatedMessages = transformAzureSystemMessages(payload.messages, model);
      const azureTools = appendAzureSearchTool(tools, enabledSearch);
@@ -40,7 +40,15 @@ export class LobeAzureAI implements LobeRuntimeAI {
  async chat(payload: ChatStreamPayload, options?: ChatMethodOptions) {
    // Remove LobeHub-internal parameters (apiMode, preserveThinking) so they are not sent to the Azure AI API

-    const { messages, model, temperature, top_p, apiMode: _, ...params } = payload;
+    const {
+      messages,
+      model,
+      temperature,
+      top_p,
+      apiMode: _,
+      preserveThinking: _pt,
+      ...params
+    } = payload;
    // o1 series models on Azure OpenAI does not support streaming currently
    const enableStreaming = model.includes('o1') ? false : (params.stream ?? true);

@@ -271,6 +271,7 @@ export class LobeGithubCopilotAI implements LobeRuntimeAI {
          reasoning,
          max_tokens,
          verbosity,
+          preserveThinking: _pt,
          ...responseRest
        } = rest as any;

@@ -350,7 +351,7 @@ export class LobeGithubCopilotAI implements LobeRuntimeAI {
        );
      }

-      const { apiMode: _, ...cleanedRest } = rest as any;
+      const { apiMode: _, preserveThinking: _pt, ...cleanedRest } = rest as any;
      const messages = await convertOpenAIMessages(cleanedRest.messages as any, {
        forceImageBase64: true,
      });
@@ -4,7 +4,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';

 import type { LobeOpenAICompatibleRuntime } from '../../core/BaseAI';
 import { testProvider } from '../../providerTestUtils';
-import { LobeQwenAI } from './index';
+import { LobeQwenAI, params } from './index';

 const provider = ModelProvider.Qwen;
 const defaultBaseURL = 'https://dashscope.aliyuncs.com/compatible-mode/v1';
@@ -98,4 +98,117 @@ describe('LobeQwenAI - custom features', () => {
      expect(calledPayload.thinking_budget).toBe(4096);
    });
  });
+
+  describe('preserve thinking mapping', () => {
+    it('should map preserveThinking to preserve_thinking for qwen3.6-plus', () => {
+      const payload = {
+        messages: [
+          { content: 'hello', role: 'user' },
+          {
+            content: 'answer',
+            reasoning: { content: 'reasoning content' },
+            role: 'assistant',
+          },
+        ],
+        model: 'qwen3.6-plus',
+        preserveThinking: true,
+      } as any;
+
+      const result = params.chatCompletion!.handlePayload!(payload);
+
+      expect(result.preserve_thinking).toBe(true);
+      expect(result.messages).toEqual([
+        { content: 'hello', role: 'user' },
+        {
+          content: 'answer',
+          reasoning_content: 'reasoning content',
+          role: 'assistant',
+        },
+      ]);
+    });
+
+    it('should set preserve_thinking=false when explicitly disabled on supported model', () => {
+      const payload = {
+        messages: [{ content: 'hello', role: 'user' }],
+        model: 'qwen3.6-plus',
+        preserveThinking: false,
+      } as any;
+
+      const result = params.chatCompletion!.handlePayload!(payload);
+
+      expect(result.preserve_thinking).toBe(false);
+    });
+
+    it('should map preserveThinking for deployment-name aliases when caller provides the param', () => {
+      const payload = {
+        messages: [
+          {
+            content: 'answer',
+            reasoning: { content: 'reasoning content' },
+            role: 'assistant',
+          },
+        ],
+        model: 'my-qwen3.6-plus-deployment',
+        preserveThinking: true,
+      } as any;
+
+      const result = params.chatCompletion!.handlePayload!(payload);
+
+      expect(result.preserve_thinking).toBe(true);
+      expect(result.messages).toEqual([
+        {
+          content: 'answer',
+          reasoning_content: 'reasoning content',
+          role: 'assistant',
+        },
+      ]);
+    });
+
+    it('should not set preserve_thinking when preserveThinking is absent but still keep reasoning_content', () => {
+      const payload = {
+        messages: [
+          {
+            content: 'answer',
+            reasoning: { content: 'reasoning content' },
+            role: 'assistant',
+          },
+        ],
+        model: 'qwen3.5-plus',
+      } as any;
+
+      const result = params.chatCompletion!.handlePayload!(payload);
+
+      expect(result.preserve_thinking).toBeUndefined();
+      expect(result.messages).toEqual([
+        {
+          content: 'answer',
+          reasoning_content: 'reasoning content',
+          role: 'assistant',
+        },
+      ]);
+    });
+
+    it('should keep caller-provided reasoning_content', () => {
+      const payload = {
+        messages: [
+          {
+            content: 'answer',
+            reasoning_content: 'existing reasoning content',
+            role: 'assistant',
+          },
+        ],
+        model: 'qwen3.5-plus',
+      } as any;
+
+      const result = params.chatCompletion!.handlePayload!(payload);
+
+      expect(result.messages).toEqual([
+        {
+          content: 'answer',
+          reasoning_content: 'existing reasoning content',
+          role: 'assistant',
+        },
+      ]);
+    });
+  });
 });
@@ -1,5 +1,6 @@
 import { ModelProvider } from 'model-bank';

+import type { OpenAICompatibleFactoryOptions } from '../../core/openaiCompatibleFactory';
 import { createOpenAICompatibleRuntime } from '../../core/openaiCompatibleFactory';
 import { resolveParameters } from '../../core/parameterResolver';
 import { QwenAIStream } from '../../core/streams';
@@ -23,7 +24,7 @@ export const QwenLegacyModels = new Set([
  'qwen-1.8b-longcontext-chat',
 ]);

-export const LobeQwenAI = createOpenAICompatibleRuntime({
+export const params = {
  baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  chatCompletion: {
    handlePayload: (payload) => {
@@ -35,6 +36,7 @@ export const LobeQwenAI = createOpenAICompatibleRuntime({
        thinking,
        top_p,
        enabledSearch,
+        preserveThinking,
        ...rest
      } = payload;
      const isDeepSeekV4Model = model.startsWith('deepseek-v4');
@@ -54,31 +56,53 @@ export const LobeQwenAI = createOpenAICompatibleRuntime({
        },
      );

+      const messages = (rest.messages || []).map((message: any) => {
+        const { reasoning, ...messageRest } = message;
+
+        const reasoningContent =
+          typeof messageRest.reasoning_content === 'string'
+            ? messageRest.reasoning_content
+            : typeof reasoning?.content === 'string'
+              ? reasoning.content
+              : undefined;
+
+        if (reasoningContent !== undefined) {
+          return {
+            ...messageRest,
+            reasoning_content: reasoningContent,
+          };
+        }
+
+        return messageRest;
+      });
+
      return {
        ...rest,
        ...(isDeepSeekV4Model
          ? {
-              ...(thinking?.type === 'enabled' || thinkingExplicitlyDisabled
-                ? { enable_thinking: !thinkingExplicitlyDisabled }
-                : {}),
-              ...(!thinkingExplicitlyDisabled && reasoning_effort && { reasoning_effort }),
-            }
+            ...(thinking?.type === 'enabled' || thinkingExplicitlyDisabled
+              ? { enable_thinking: !thinkingExplicitlyDisabled }
+              : {}),
+            ...(!thinkingExplicitlyDisabled && reasoning_effort && { reasoning_effort }),
+          }
          : model.includes('-thinking')
            ? {
-                enable_thinking: true,
+              enable_thinking: true,
+              thinking_budget:
+                thinking?.budget_tokens === 0 ? 0 : thinking?.budget_tokens || undefined,
+            }
+            : thinking
+              ? {
+                ...(thinking.type !== undefined && {
+                  enable_thinking: thinking.type === 'enabled',
+                }),
                thinking_budget:
                  thinking?.budget_tokens === 0 ? 0 : thinking?.budget_tokens || undefined,
              }
-            : thinking
-              ? {
-                  ...(thinking.type !== undefined && {
-                    enable_thinking: thinking.type === 'enabled',
-                  }),
-                  thinking_budget:
-                    thinking?.budget_tokens === 0 ? 0 : thinking?.budget_tokens || undefined,
-                }
              : {}),
+        ...(typeof preserveThinking === 'boolean' && { preserve_thinking: preserveThinking }),
        frequency_penalty: undefined,
+        messages,
        model,
        presence_penalty: resolvedParams.presence_penalty,
        stream: true,
@@ -118,4 +142,6 @@ export const LobeQwenAI = createOpenAICompatibleRuntime({
    return processMultiProviderModelList(modelList, 'qwen');
  },
  provider: ModelProvider.Qwen,
-});
+} satisfies OpenAICompatibleFactoryOptions;
+
+export const LobeQwenAI = createOpenAICompatibleRuntime(params);
@@ -407,6 +407,96 @@ describe('LobeZhipuAI - custom features', () => {
      });
    });

+    describe('preserve thinking mapping', () => {
+      it('should map preserveThinking=true to clear_thinking=false and convert reasoning content', () => {
+        const payload = {
+          messages: [
+            { content: 'hello', role: 'user' },
+            {
+              content: 'answer',
+              reasoning: { content: 'reasoning content' },
+              role: 'assistant',
+            },
+          ],
+          model: 'glm-5',
+          preserveThinking: true,
+          thinking: { budget_tokens: 1024, type: 'enabled' },
+        } as any;
+
+        const result = params.chatCompletion.handlePayload(payload);
+
+        expect(result.thinking).toEqual({ clear_thinking: false, type: 'enabled' });
+        expect(result.messages).toEqual([
+          { content: 'hello', role: 'user' },
+          {
+            content: 'answer',
+            reasoning_content: 'reasoning content',
+            role: 'assistant',
+          },
+        ]);
+      });
+
+      it('should still convert reasoning to reasoning_content when preserveThinking is absent', () => {
+        const payload = {
+          messages: [
+            {
+              content: 'answer',
+              reasoning: { content: 'reasoning content' },
+              role: 'assistant',
+            },
+          ],
+          model: 'glm-5',
+        } as any;
+
+        const result = params.chatCompletion.handlePayload(payload);
+
+        expect(result.thinking).toBeUndefined();
+        expect(result.messages).toEqual([
+          {
+            content: 'answer',
+            reasoning_content: 'reasoning content',
+            role: 'assistant',
+          },
+        ]);
+      });
+
+      it('should map preserveThinking=false to clear_thinking=true', () => {
+        const payload = {
+          messages: [{ content: 'hello', role: 'user' }],
+          model: 'glm-4.7',
+          preserveThinking: false,
+        } as any;
+
+        const result = params.chatCompletion.handlePayload(payload);
+
+        expect(result.thinking).toEqual({ clear_thinking: true });
+      });
+
+      it('should keep caller-provided reasoning_content', () => {
+        const payload = {
+          messages: [
+            {
+              content: 'answer',
+              reasoning_content: 'existing reasoning content',
+              role: 'assistant',
+            },
+          ],
+          model: 'glm-5',
+          preserveThinking: true,
+        } as any;
+
+        const result = params.chatCompletion.handlePayload(payload);
+
+        expect(result.messages).toEqual([
+          {
+            content: 'answer',
+            reasoning_content: 'existing reasoning content',
+            role: 'assistant',
+          },
+        ]);
+      });
+    });
+
    describe('Preserve other payload properties', () => {
      it('should preserve all other properties', async () => {
        await instance.chat({
@@ -26,6 +26,7 @@ export const params = {
        enabledSearch,
        max_tokens,
        model,
+        preserveThinking,
        stream,
        temperature,
        thinking,
@@ -34,6 +35,35 @@ export const params = {
        ...rest
      } = payload;

+      const messages = (rest.messages || []).map((message: any) => {
+        const { reasoning, ...messageRest } = message;
+
+        const reasoningContent =
+          typeof messageRest.reasoning_content === 'string'
+            ? messageRest.reasoning_content
+            : typeof reasoning?.content === 'string'
+              ? reasoning.content
+              : undefined;
+
+        if (reasoningContent !== undefined) {
+          return {
+            ...messageRest,
+            reasoning_content: reasoningContent,
+          };
+        }
+
+        return messageRest;
+      });
+
+      const shouldSetClearThinking = typeof preserveThinking === 'boolean';
+      const thinkingPayload = thinking ? { type: thinking.type } : undefined;
+      const resolvedThinking = shouldSetClearThinking
+        ? {
+            ...thinkingPayload,
+            clear_thinking: !preserveThinking,
+          }
+        : thinkingPayload;
+
      const zhipuTools = enabledSearch
        ? [
            ...(tools || []),
@@ -78,9 +108,10 @@ export const params = {
      return {
        ...rest,
        ...resolvedParams,
+        messages,
        model,
        stream,
-        thinking: thinking ? { type: thinking.type } : undefined,
+        thinking: resolvedThinking,
        tool_stream: stream && /^glm-(?:4\.(?:6|7)|5(?:\.1)?)$/.test(model) ? true : undefined,
        tools: zhipuTools,
      } as any;
@@ -51,6 +51,7 @@ export interface OpenAIChatMessage {
    content?: string;
    duration?: number;
  };
+  reasoning_content?: string;
  role: LLMRoleType;
  tool_call_id?: string;
  tool_calls?: MessageToolCall[];
@@ -123,6 +124,7 @@ export interface ChatStreamPayload {
   * @default 0
   */
  presence_penalty?: number;
+  preserveThinking?: boolean;
  provider?: string;
  reasoning?: {
    effort?: string;
@@ -104,7 +104,7 @@ export const MODEL_LIST_CONFIGS = {
      'qwen3',
    ],
    reasoningKeywords: ['qvq', 'qwq', 'qwen3', '!-instruct-', '!-coder-'],
-    visionKeywords: ['qvq', '-vl', '-omni'],
+    visionKeywords: ['qvq', '-vl', '-omni', 'qwen3.'],
  },
  replicate: {
    imageOutputKeywords: [
@@ -113,6 +113,12 @@ export interface LobeAgentChatConfig extends AgentMemoryChatConfig, AgentSelfIte
   * Effort level for Claude Opus 4.7 and later (adds xhigh tier between high and max)
   */
  opus47Effort?: 'low' | 'medium' | 'high' | 'xhigh' | 'max';
+
+  /**
+   * Whether to send the assistant's historical thinking content back to the model
+   * (provider support required, e.g. Qwen preserve_thinking, GLM clear_thinking=false)
+   */
+  preserveThinking?: boolean;
  reasoningBudgetToken?: number;
  /**
   * Reasoning budget token for models with 32k max (GLM-5/GLM-4.7)
@@ -238,6 +244,7 @@ export const AgentChatConfigSchema = z
    imageResolution2: z.enum(['512', '1K', '2K', '4K']).optional(),
    opus47Effort: z.enum(['low', 'medium', 'high', 'xhigh', 'max']).optional(),
    runtimeEnv: RuntimeEnvConfigSchema.optional(),
+    preserveThinking: z.boolean().optional(),
    reasoningBudgetToken: z.number().optional(),
    reasoningBudgetToken32k: z.number().optional(),
    reasoningBudgetToken80k: z.number().optional(),
@@ -55,6 +55,11 @@ export interface OpenAIChatMessage {
   */
  function_call?: OpenAIFunctionCall;
  name?: string;
+  reasoning?: {
+    content?: string;
+    duration?: number;
+  };
+  reasoning_content?: string;
  /**
   * Role
   * @description Role of the message sender
@@ -102,6 +107,7 @@ export interface ChatStreamPayload {
   * @default 0
   */
  presence_penalty?: number;
+  preserveThinking?: boolean;
  /**
   * @default openai
   */
@@ -146,6 +146,18 @@ const ControlsForm = memo<ControlsFormProps>(
        minWidth: undefined,
        name: 'enableReasoning',
      },
+      {
+        children: <Switch size={'small'} />,
+        desc: isNarrow ? (
+          <span style={descNarrow}>{t('extendParams.preserveThinking.desc')}</span>
+        ) : (
+          t('extendParams.preserveThinking.desc')
+        ),
+        label: t('extendParams.preserveThinking.title'),
+        layout: isNarrow ? 'vertical' : 'horizontal',
+        minWidth: undefined,
+        name: 'preserveThinking',
+      },
      {
        children: <Switch size={'small'} />,
        desc: isNarrow ? (
@@ -98,6 +98,9 @@ export default {
  'extendParams.enableReasoning.desc':
    'Let the model reason before answering. Use it for complex tasks.',
  'extendParams.enableReasoning.title': 'Enable Deep Thinking',
+  'extendParams.preserveThinking.desc':
+    'When enabled, assistant historical reasoning will be sent back as context for compatible models. This may increase token usage.',
+  'extendParams.preserveThinking.title': 'Preserve Historical Thinking',
  'extendParams.imageAspectRatio.title': 'Image Aspect Ratio',
  'extendParams.imageResolution.title': 'Image Resolution',
  'extendParams.reasoningBudgetToken.title': 'Thinking Consumption Token',
@@ -283,6 +283,8 @@ export default {
    'For Gemini 3.1 Flash Image models; controls resolution of generated images (supports 512px).',
  'providerModels.item.modelConfig.extendParams.options.opus47Effort.hint':
    'For Claude Opus 4.7 and later; controls effort level (low/medium/high/xhigh/max).',
+  'providerModels.item.modelConfig.extendParams.options.preserveThinking.hint':
+    'For Qwen3.6 Plus, GLM-5 and GLM-4.7; sends historical assistant reasoning back to model context (preserve_thinking / clear_thinking=false).',
  'providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken.hint':
    'For Claude, Qwen3 and similar; controls token budget for reasoning.',
  'providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken32k.hint':
@@ -51,6 +51,10 @@ const EXTEND_PARAMS_OPTIONS: ExtendParamsOption[] = [
    hintKey: 'providerModels.item.modelConfig.extendParams.options.enableAdaptiveThinking.hint',
    key: 'enableAdaptiveThinking',
  },
+  {
+    hintKey: 'providerModels.item.modelConfig.extendParams.options.preserveThinking.hint',
+    key: 'preserveThinking',
+  },
  {
    hintKey: 'providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken.hint',
    key: 'reasoningBudgetToken',
@@ -178,6 +182,7 @@ const TITLE_KEY_ALIASES: Partial<Record<ExtendParamsType, ExtendParamsType>> = {
  grok4_3ReasoningEffort: 'reasoningEffort',
  hy3ReasoningEffort: 'reasoningEffort',
  imageAspectRatio2: 'imageAspectRatio',
+  imageResolution2: 'imageResolution',
  opus47Effort: 'effort',
  reasoningBudgetToken32k: 'reasoningBudgetToken',
  reasoningBudgetToken80k: 'reasoningBudgetToken',
@@ -241,6 +246,11 @@ const PREVIEW_META: Partial<Record<ExtendParamsType, PreviewMeta>> = {
  imageResolution: { labelSuffix: '', previewWidth: 250, tag: 'resolution' },
  imageResolution2: { labelSuffix: ' (512px+)', previewWidth: 280, tag: 'resolution' },
  opus47Effort: { labelSuffix: ' (Opus 4.7+)', previewWidth: 280, tag: 'output_config.effort' },
+  preserveThinking: {
+    labelSuffix: ' (Qwen3.6+ / GLM-4.7+)',
+    previewWidth: 460,
+    tag: 'preserve_thinking',
+  },
  reasoningBudgetToken: { previewWidth: 350, tag: 'thinking.budget_tokens' },
  reasoningBudgetToken32k: {
    labelSuffix: ' (32k)',
@@ -383,6 +393,7 @@ const ExtendParamsSelect = memo<ExtendParamsSelectProps>(({ value, onChange }) =
      effort: <EffortSlider value="high" />,
      enableAdaptiveThinking: <Switch checked disabled />,
      enableReasoning: <Switch checked disabled />,
+      preserveThinking: <Switch checked disabled />,
      gpt5ReasoningEffort: <GPT5ReasoningEffortSlider value="medium" />,
      gpt5_1ReasoningEffort: <GPT51ReasoningEffortSlider value="none" />,
      gpt5_2ProReasoningEffort: <GPT52ProReasoningEffortSlider value="medium" />,
@@ -660,6 +660,8 @@ export const createRuntimeExecutors = (

    try {
      type ContentPart = { text: string; type: 'text' } | { image: string; type: 'image' };
+      let shouldPersistAssistantReasoning = false;
+      let preserveThinkingForPayload: boolean | undefined;

      // Process messages through serverMessagesEngine to inject system role, knowledge, etc.
      // Rebuild params from agentConfig at execution time (capabilities built dynamically)
@@ -669,6 +671,41 @@ export const createRuntimeExecutors = (
        const { loadModels } = await import('@/business/client/model-bank/loadModels');
        const builtinModels = await loadModels();

+        const preserveThinkingConfigured =
+          typeof agentConfig.chatConfig?.preserveThinking === 'boolean'
+            ? agentConfig.chatConfig.preserveThinking
+            : undefined;
+        const preserveThinkingRequested = preserveThinkingConfigured === true;
+
+        const modelCard = builtinModels.find(
+          (item) =>
+            item.providerId === provider &&
+            (item.id === model || item.config?.deploymentName === model),
+        );
+        const modelExtendParams =
+          modelCard &&
+          'settings' in modelCard &&
+          modelCard.settings &&
+          typeof modelCard.settings === 'object' &&
+          'extendParams' in modelCard.settings
+            ? (modelCard.settings as { extendParams?: string[] }).extendParams
+            : undefined;
+
+        const modelSupportsPreserveThinkingFromCard =
+          Array.isArray(modelExtendParams) && modelExtendParams.includes('preserveThinking');
+        const providerSupportsPreserveThinkingFallback =
+          provider === 'qwen' || provider === 'zhipu';
+        const modelSupportsPreserveThinking =
+          modelSupportsPreserveThinkingFromCard ||
+          (!modelCard && providerSupportsPreserveThinkingFallback);
+
+        shouldPersistAssistantReasoning =
+          preserveThinkingRequested && modelSupportsPreserveThinking;
+        preserveThinkingForPayload =
+          modelSupportsPreserveThinking && typeof preserveThinkingConfigured === 'boolean'
+            ? preserveThinkingConfigured
+            : undefined;
+
        // Extract <refer_topic> tags from messages and fetch summaries.
        // Skip if messages already contain injected topic_reference_context
        // (e.g., from client-side contextEngineering preprocessing) to avoid double injection.
@@ -1102,7 +1139,15 @@ export const createRuntimeExecutors = (

      // Construct ChatStreamPayload
      const stream = ctx.stream ?? true;
-      const chatPayload = { messages: processedMessages, model, stream, tools };
+      const chatPayload = {
+        messages: processedMessages,
+        model,
+        stream,
+        tools,
+        ...(typeof preserveThinkingForPayload === 'boolean' && {
+          preserveThinking: preserveThinkingForPayload,
+        }),
+      };

      // Buffer: accumulate text and reasoning, send every 50ms
      const BUFFER_INTERVAL = 50;
@@ -1594,6 +1639,10 @@ export const createRuntimeExecutors = (
                };
              }

+              const persistedReasoning = shouldPersistAssistantReasoning
+                ? finalReasoning
+                : undefined;
+
              try {
                // Build metadata object
                const metadata: Record<string, any> = {};
@@ -1626,7 +1675,7 @@ export const createRuntimeExecutors = (
                  content: finalContent,
                  imageList: imageList.length > 0 ? imageList : undefined,
                  metadata: Object.keys(metadata).length > 0 ? metadata : undefined,
-                  reasoning: finalReasoning,
+                  reasoning: persistedReasoning,
                  search: grounding,
                  tools: persistedTools,
                });
@@ -1659,7 +1708,7 @@ export const createRuntimeExecutors = (
              newState.messages.push({
                content,
                id: assistantMessageItem.id,
-                reasoning: finalReasoning,
+                reasoning: persistedReasoning,
                role: 'assistant',
                tool_calls: stateToolCalls,
              });
@@ -16,6 +16,12 @@ const mockBuiltinModels = vi.hoisted(() => [
    id: 'gpt-4',
    providerId: 'openai',
  },
+  {
+    abilities: { functionCall: true, video: true, vision: true },
+    id: 'qwen3.6-plus',
+    providerId: 'qwen',
+    settings: { extendParams: ['preserveThinking'] },
+  },
  {
    abilities: { functionCall: false, video: false, vision: false },
    id: 'no-tools-model',
@@ -25,6 +31,7 @@ const mockBuiltinModels = vi.hoisted(() => [
    abilities: { functionCall: true, video: true, vision: true },
    id: 'gemini-3.1-flash-lite-preview',
    providerId: 'google',
+    settings: { extendParams: ['preserveThinking'] },
  },
 ]);

@@ -100,12 +107,21 @@ describe('RuntimeExecutors', () => {

  beforeEach(() => {
    vi.clearAllMocks();
+    vi.mocked(initModelRuntimeFromDB).mockReset();
+    mockCreateCompressionGroup.mockReset();
+    mockFinalizeCompression.mockReset();
    mockCreateCompressionGroup.mockResolvedValue({
      messageGroupId: 'group-123',
      messagesToSummarize: [],
      success: true,
    });
    mockFinalizeCompression.mockResolvedValue({ success: true });
+    vi.mocked(initModelRuntimeFromDB).mockResolvedValue({
+      chat: vi.fn().mockImplementation(async (_payload: any, options: any) => {
+        await options?.callback?.onText?.('done');
+        return new Response('done');
+      }),
+    } as any);

    mockMessageModel = {
      create: vi.fn().mockResolvedValue({ id: 'msg-123' }),
@@ -392,52 +408,233 @@ describe('RuntimeExecutors', () => {
      );
    });

-    it('should preserve reasoning in newState when assistant returns tool calls', async () => {
-      const toolCallPayload = [
-        {
-          function: { arguments: '{}', name: 'search' },
-          id: 'call_1',
-          type: 'function',
-        },
-      ];
+    describe('reasoning persistence gate', () => {
+      it('should persist assistant reasoning with tool calls when preserveThinking is enabled on a supported model', async () => {
+        const toolCallPayload = [
+          {
+            function: { arguments: '{}', name: 'search' },
+            id: 'call_1',
+            type: 'function',
+          },
+        ];

-      const mockChat = vi.fn().mockImplementation(async (_payload, options) => {
-        await options?.callback?.onThinking?.('Need to inspect the search results first.');
-        await options?.callback?.onToolsCalling?.({ toolsCalling: toolCallPayload });
-        await options?.callback?.onCompletion?.({
-          usage: {
-            totalInputTokens: 1,
-            totalOutputTokens: 2,
-            totalTokens: 3,
+        const mockChat = vi.fn().mockImplementation(async (_payload, options) => {
+          await options?.callback?.onThinking?.('Need to inspect the search results first.');
+          await options?.callback?.onToolsCalling?.({ toolsCalling: toolCallPayload });
+          await options?.callback?.onCompletion?.({
+            usage: {
+              totalInputTokens: 1,
+              totalOutputTokens: 2,
+              totalTokens: 3,
+            },
+          });
+          return new Response('done');
+        });
+        vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any);
+
+        const ctxWithConfig: RuntimeExecutorContext = {
+          ...ctx,
+          agentConfig: {
+            chatConfig: { preserveThinking: true },
+            plugins: [],
+            systemRole: 'test',
+          },
+        };
+
+        const executors = createRuntimeExecutors(ctxWithConfig);
+        const state = createMockState({
+          modelRuntimeConfig: {
+            model: 'qwen3.6-plus',
+            provider: 'qwen',
          },
        });
-        return new Response('done');
+
+        const instruction = {
+          payload: {
+            messages: [{ content: 'Hello', role: 'user' }],
+            model: 'qwen3.6-plus',
+            provider: 'qwen',
+            tools: [],
+          },
+          type: 'call_llm' as const,
+        };
+
+        const result = await executors.call_llm!(instruction, state);
+
+        expect(result.newState.messages.at(-1)).toEqual(
+          expect.objectContaining({
+            reasoning: { content: 'Need to inspect the search results first.' },
+            role: 'assistant',
+            tool_calls: [expect.objectContaining({ id: 'call_1' })],
+          }),
+        );
+        expect(mockChat).toHaveBeenCalledWith(
+          expect.objectContaining({ preserveThinking: true }),
+          expect.anything(),
+        );
      });
-      vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any);

-      const executors = createRuntimeExecutors(ctx);
-      const state = createMockState();
+      it('should not persist assistant reasoning when preserveThinking is not enabled', async () => {
+        const mockChat = vi.fn().mockImplementation(async (_payload, options) => {
+          await options?.callback?.onThinking?.('hidden reasoning');
+          await options?.callback?.onText?.('answer');
+          return new Response('done');
+        });
+        vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any);

-      const instruction = {
-        payload: {
-          messages: [{ content: 'Hello', role: 'user' }],
-          model: 'gpt-4',
-          provider: 'openai',
-          tools: [],
-        },
-        type: 'call_llm' as const,
-      };
+        const executors = createRuntimeExecutors(ctx);
+        const state = createMockState();

-      const result = await executors.call_llm!(instruction, state);
+        const instruction = {
+          payload: {
+            messages: [{ content: 'Hello', role: 'user' }],
+            model: 'gpt-4',
+            provider: 'openai',
+          },
+          type: 'call_llm' as const,
+        };

-      expect(result.newState.messages.at(-1)).toEqual(
-        expect.objectContaining({
-          id: 'msg-123',
-          reasoning: { content: 'Need to inspect the search results first.' },
-          role: 'assistant',
-          tool_calls: [expect.objectContaining({ id: 'call_1' })],
-        }),
-      );
+        const result = await executors.call_llm!(instruction, state);
+        const assistant = result.newState.messages.at(-1) as any;
+
+        expect(assistant.reasoning).toBeUndefined();
+      });
+
+      it('should persist assistant reasoning when preserveThinking is enabled on a supported model', async () => {
+        const mockChat = vi.fn().mockImplementation(async (_payload, options) => {
+          await options?.callback?.onThinking?.('preserved reasoning');
+          await options?.callback?.onText?.('answer');
+          return new Response('done');
+        });
+        vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any);
+
+        const ctxWithConfig: RuntimeExecutorContext = {
+          ...ctx,
+          agentConfig: {
+            chatConfig: { preserveThinking: true },
+            plugins: [],
+            systemRole: 'test',
+          },
+        };
+
+        const executors = createRuntimeExecutors(ctxWithConfig);
+        const state = createMockState({
+          modelRuntimeConfig: {
+            model: 'qwen3.6-plus',
+            provider: 'qwen',
+          },
+        });
+
+        const instruction = {
+          payload: {
+            messages: [{ content: 'Hello', role: 'user' }],
+            model: 'qwen3.6-plus',
+            provider: 'qwen',
+          },
+          type: 'call_llm' as const,
+        };
+
+        const result = await executors.call_llm!(instruction, state);
+        const assistant = result.newState.messages.at(-1) as any;
+
+        expect(assistant.reasoning).toEqual({
+          content: 'preserved reasoning',
+        });
+        expect(mockChat).toHaveBeenCalledWith(
+          expect.objectContaining({ preserveThinking: true }),
+          expect.anything(),
+        );
+      });
+
+      it('should persist reasoning for unknown custom deployments on supported providers', async () => {
+        const mockChat = vi.fn().mockImplementation(async (_payload, options) => {
+          await options?.callback?.onThinking?.('custom deployment reasoning');
+          await options?.callback?.onText?.('answer');
+          return new Response('done');
+        });
+        vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any);
+
+        const ctxWithConfig: RuntimeExecutorContext = {
+          ...ctx,
+          agentConfig: {
+            chatConfig: { preserveThinking: true },
+            plugins: [],
+            systemRole: 'test',
+          },
+        };
+
+        const executors = createRuntimeExecutors(ctxWithConfig);
+        const state = createMockState({
+          modelRuntimeConfig: {
+            model: 'my-qwen-custom-deployment',
+            provider: 'qwen',
+          },
+        });
+
+        const instruction = {
+          payload: {
+            messages: [{ content: 'Hello', role: 'user' }],
+            model: 'my-qwen-custom-deployment',
+            provider: 'qwen',
+          },
+          type: 'call_llm' as const,
+        };
+
+        const result = await executors.call_llm!(instruction, state);
+        const assistant = result.newState.messages.at(-1) as any;
+
+        expect(assistant.reasoning).toEqual({
+          content: 'custom deployment reasoning',
+        });
+        expect(mockChat).toHaveBeenCalledWith(
+          expect.objectContaining({ preserveThinking: true }),
+          expect.anything(),
+        );
+      });
+
+      it('should not persist reasoning when model does not declare preserveThinking capability', async () => {
+        const mockChat = vi.fn().mockImplementation(async (_payload, options) => {
+          await options?.callback?.onThinking?.('reasoning that should not be saved');
+          await options?.callback?.onText?.('answer');
+          return new Response('done');
+        });
+        vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any);
+
+        const ctxWithConfig: RuntimeExecutorContext = {
+          ...ctx,
+          agentConfig: {
+            chatConfig: { preserveThinking: true },
+            plugins: [],
+            systemRole: 'test',
+          },
+        };
+
+        const executors = createRuntimeExecutors(ctxWithConfig);
+        const state = createMockState({
+          modelRuntimeConfig: {
+            model: 'gpt-4',
+            provider: 'openai',
+          },
+        });
+
+        const instruction = {
+          payload: {
+            messages: [{ content: 'Hello', role: 'user' }],
+            model: 'gpt-4',
+            provider: 'openai',
+          },
+          type: 'call_llm' as const,
+        };
+
+        const result = await executors.call_llm!(instruction, state);
+        const assistant = result.newState.messages.at(-1) as any;
+
+        expect(assistant.reasoning).toBeUndefined();
+        expect(mockChat).toHaveBeenCalledWith(
+          expect.not.objectContaining({ preserveThinking: expect.any(Boolean) }),
+          expect.anything(),
+        );
+      });
    });

    it('retries empty completions on the branded provider then throws ModelEmptyError', async () => {
@@ -574,7 +771,14 @@ describe('RuntimeExecutors', () => {
        });
        vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any);

-        const executors = createRuntimeExecutors(ctx);
+        // Reasoning only lands in the finalized message when preserveThinking is
+        // enabled on a supported model; otherwise it is intentionally dropped.
+        // Enable it here so this test still checks that reasoning_part is captured.
+        const ctxWithThinking: RuntimeExecutorContext = {
+          ...ctx,
+          agentConfig: { chatConfig: { preserveThinking: true }, plugins: [], systemRole: 'test' },
+        };
+        const executors = createRuntimeExecutors(ctxWithThinking);
        const result = await executors.call_llm!(geminiInstruction(), createMockState());

        expect(result.newState.messages.at(-1)).toEqual(
@@ -4032,7 +4236,7 @@ describe('RuntimeExecutors', () => {

        await vi.runOnlyPendingTimersAsync();

-        const result = await resultPromise;
+        await resultPromise;

        expect(mockChat).toHaveBeenCalledTimes(2);
        expect(mockMessageModel.create).toHaveBeenCalledTimes(1);
@@ -243,6 +243,51 @@ describe('resolveModelExtendParams', () => {
    });
  });

+  describe('preserve thinking', () => {
+    beforeEach(() => {
+      vi.spyOn(aiModelSelectors.aiModelSelectors, 'isModelHasExtendParams').mockReturnValue(
+        () => true,
+      );
+      vi.spyOn(aiModelSelectors.aiModelSelectors, 'modelExtendParams').mockReturnValue(() => [
+        'preserveThinking',
+      ]);
+    });
+
+    it('should set preserveThinking when supported and enabled', () => {
+      const result = resolveModelExtendParams({
+        chatConfig: {
+          preserveThinking: true,
+        } as any,
+        model: 'qwen3.6-plus',
+        provider: 'qwen',
+      });
+
+      expect(result.preserveThinking).toBe(true);
+    });
+
+    it('should set preserveThinking to false when explicitly disabled', () => {
+      const result = resolveModelExtendParams({
+        chatConfig: {
+          preserveThinking: false,
+        } as any,
+        model: 'qwen3.6-plus',
+        provider: 'qwen',
+      });
+
+      expect(result.preserveThinking).toBe(false);
+    });
+
+    it('should not set preserveThinking when not configured', () => {
+      const result = resolveModelExtendParams({
+        chatConfig: {} as any,
+        model: 'qwen3.6-plus',
+        provider: 'qwen',
+      });
+
+      expect(result.preserveThinking).toBeUndefined();
+    });
+  });
+
  describe('reasoning effort variants', () => {
    describe('reasoningEffort param', () => {
      beforeEach(() => {
@@ -21,6 +21,7 @@ export interface ModelExtendParams {
  enabledContextCaching?: boolean;
  imageAspectRatio?: string;
  imageResolution?: string;
+  preserveThinking?: boolean;
  reasoning_effort?: string;
  thinking?: {
    budget_tokens?: number;
@@ -188,6 +189,14 @@ export const resolveModelExtendParams = (ctx: ModelParamsContext): ModelExtendPa
    extendParams.enabledContextCaching = false;
  }

+  // Forward preserveThinking (explicit true or false) only for models that declare support
+  if (
+    modelExtendParams.includes('preserveThinking') &&
+    typeof chatConfig.preserveThinking === 'boolean'
+  ) {
+    extendParams.preserveThinking = chatConfig.preserveThinking;
+  }
+
  // Reasoning effort variants
  if (modelExtendParams.includes('reasoningEffort') && chatConfig.reasoningEffort) {
    extendParams.reasoning_effort = chatConfig.reasoningEffort;
@@ -571,41 +571,6 @@ describe('transformToChatModelCards', () => {
    expect(result).toMatchSnapshot();
  });

-  it('should use default deploymentName from known model when not specified in string (VolcEngine case)', async () => {
-    const knownModel = LOBE_DEFAULT_MODEL_LIST.find(
-      (m) => m.id === 'deepseek-r1' && m.providerId === 'volcengine',
-    );
-    const defaultChatModels: AiFullModelCard[] = [];
-    const result = await transformToAiModelList({
-      modelString: '+deepseek-r1',
-      defaultModels: defaultChatModels,
-      providerId: 'volcengine',
-      withDeploymentName: true,
-    });
-    expect(result).toContainEqual({
-      ...knownModel,
-      enabled: true,
-    });
-  });
-
-  it('should use deploymentName from modelString when specified (VolcEngine case)', async () => {
-    const defaultChatModels: AiFullModelCard[] = [];
-    const knownModel = LOBE_DEFAULT_MODEL_LIST.find(
-      (m) => m.id === 'deepseek-r1' && m.providerId === 'volcengine',
-    );
-    const result = await transformToAiModelList({
-      modelString: `+deepseek-r1->my-custom-deploy`,
-      defaultModels: defaultChatModels,
-      providerId: 'volcengine',
-      withDeploymentName: true,
-    });
-    expect(result).toContainEqual({
-      ...knownModel,
-      enabled: true,
-      config: { deploymentName: 'my-custom-deploy' },
-    });
-  });
-
  it('should set both id and deploymentName to the full string when no -> is used and withDeploymentName is true', async () => {
    const defaultChatModels: AiFullModelCard[] = [];
    const result = await transformToAiModelList({