diff --git a/locales/en-US/chat.json b/locales/en-US/chat.json index 46b311bb3d..22f57ad203 100644 --- a/locales/en-US/chat.json +++ b/locales/en-US/chat.json @@ -149,6 +149,8 @@ "extendParams.enableReasoning.title": "Enable Deep Thinking", "extendParams.imageAspectRatio.title": "Image Aspect Ratio", "extendParams.imageResolution.title": "Image Resolution", + "extendParams.preserveThinking.desc": "When enabled, assistant historical reasoning will be sent back as context for compatible models. This may increase token usage.", + "extendParams.preserveThinking.title": "Preserve Historical Thinking", "extendParams.reasoningBudgetToken.title": "Thinking Consumption Token", "extendParams.reasoningEffort.title": "Reasoning Intensity", "extendParams.textVerbosity.title": "Output Text Detail Level", diff --git a/locales/en-US/modelProvider.json b/locales/en-US/modelProvider.json index ac2d08ba53..d5e9a0175f 100644 --- a/locales/en-US/modelProvider.json +++ b/locales/en-US/modelProvider.json @@ -234,6 +234,7 @@ "providerModels.item.modelConfig.extendParams.options.imageResolution.hint": "For Gemini 3 image generation models; controls resolution of generated images.", "providerModels.item.modelConfig.extendParams.options.imageResolution2.hint": "For Gemini 3.1 Flash Image models; controls resolution of generated images (supports 512px).", "providerModels.item.modelConfig.extendParams.options.opus47Effort.hint": "For Claude Opus 4.7 and later; controls effort level (low/medium/high/xhigh/max).", + "providerModels.item.modelConfig.extendParams.options.preserveThinking.hint": "For Qwen3.6 Plus, GLM-5 and GLM-4.7; sends historical assistant reasoning back to model context (preserve_thinking).", "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken.hint": "For Claude, Qwen3 and similar; controls token budget for reasoning.", "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken32k.hint": "For GLM-5 and GLM-4.7; controls token budget for reasoning (max 32k).", "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken80k.hint": "For Qwen3 series; controls token budget for reasoning (max 80k).", diff --git a/locales/zh-CN/chat.json b/locales/zh-CN/chat.json index 77bf63942f..6869ea7fd0 100644 --- a/locales/zh-CN/chat.json +++ b/locales/zh-CN/chat.json @@ -149,6 +149,8 @@ "extendParams.enableReasoning.title": "开启深度思考", "extendParams.imageAspectRatio.title": "图片宽高比", "extendParams.imageResolution.title": "图片分辨率", + "extendParams.preserveThinking.desc": "开启后会将历史助手思考过程作为上下文回传给模型,可能增加 Token 消耗。", + "extendParams.preserveThinking.title": "传递历史思考过程", "extendParams.reasoningBudgetToken.title": "思考 Token 预算", "extendParams.reasoningEffort.title": "推理强度", "extendParams.textVerbosity.title": "输出详细程度", diff --git a/locales/zh-CN/modelProvider.json b/locales/zh-CN/modelProvider.json index 6a42d3ed21..4168d5a726 100644 --- a/locales/zh-CN/modelProvider.json +++ b/locales/zh-CN/modelProvider.json @@ -234,6 +234,7 @@ "providerModels.item.modelConfig.extendParams.options.imageResolution.hint": "适用于 Gemini 3 图像生成模型;控制生成图像的分辨率。", "providerModels.item.modelConfig.extendParams.options.imageResolution2.hint": "适用于 Gemini 3.1 Flash Image 模型;控制生成图像的分辨率(支持 512px)。", "providerModels.item.modelConfig.extendParams.options.opus47Effort.hint": "适用于 Claude Opus 4.7 及更高版本;控制努力级别(低/中/高/超高/最大)。", + "providerModels.item.modelConfig.extendParams.options.preserveThinking.hint": "适用于 Qwen3.6 Plus、GLM-5 与 GLM-4.7;将历史助手思考过程回传为模型上下文(preserve_thinking)。", "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken.hint": "适用于 Claude、Qwen3 等模型;控制用于推理的 Token 预算。", "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken32k.hint": "适用于GLM-5和GLM-4.7;控制推理的令牌预算(最大32k)。", "providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken80k.hint": "适用于Qwen3系列;控制推理的令牌预算(最大80k)。", diff --git a/packages/agent-runtime/src/core/__tests__/UsageCounter.test.ts b/packages/agent-runtime/src/core/__tests__/UsageCounter.test.ts index 1629c782f8..1da3b5a70a 100644 --- a/packages/agent-runtime/src/core/__tests__/UsageCounter.test.ts +++ b/packages/agent-runtime/src/core/__tests__/UsageCounter.test.ts @@ -685,14 +685,14 @@ describe('UsageCounter', () => { const result1 = UsageCounter.accumulateLLM({ cost: state.cost, - model: 'gpt-4o-audio-preview', + model: 'gpt-audio', modelUsage: usage1, provider: 'openai', usage: state.usage, }); const result2 = UsageCounter.accumulateLLM({ cost: result1.cost, - model: 'gpt-4o-audio-preview', + model: 'gpt-audio', modelUsage: usage2, provider: 'openai', usage: result1.usage, diff --git a/packages/model-bank/src/aiModels/aihubmix.ts b/packages/model-bank/src/aiModels/aihubmix.ts index 0722e729c5..a19ce0fbf4 100644 --- a/packages/model-bank/src/aiModels/aihubmix.ts +++ b/packages/model-bank/src/aiModels/aihubmix.ts @@ -887,73 +887,6 @@ const aihubmixChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - search: true, - vision: true, - }, - contextWindowTokens: 2_000_000, - description: - 'We’re excited to release Grok 4 Fast, our latest progress in cost-effective reasoning models.', - displayName: 'Grok 4 Fast (Non-Reasoning)', - id: 'grok-4-fast-non-reasoning', - pricing: { - units: [ - { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 0.5, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-09', - settings: { - searchImpl: 'params', - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - search: true, - vision: true, - }, - contextWindowTokens: 2_000_000, - description: - 'We’re excited to release Grok 4 Fast, our latest progress in cost-effective reasoning models.', - displayName: 'Grok 4 Fast', - id: 'grok-4-fast-reasoning', - pricing: { - units: [ - { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 0.5, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-09', - settings: { - searchImpl: 'params', - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - vision: true, - }, - contextWindowTokens: 256_000, - description: - 'Latest Grok flagship with unmatched performance in language, math, and reasoning — a true all-rounder. Currently points to grok-4-0709; due to limited resources it is temporarily 10% higher than official pricing and is expected to return to official price later.', - displayName: 'Grok 4 0709', - id: 'grok-4', - pricing: { - units: [ - { name: 'textInput', rate: 3.3, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 16.5, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-07-09', - type: 'chat', - }, { abilities: { functionCall: true, @@ -1384,24 +1317,6 @@ const aihubmixChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 131_072, - description: - 'DeepSeek V3.1 Fast is the high-TPS fast variant of DeepSeek V3.1. Hybrid thinking mode: via chat templates, one model supports both thinking and non-thinking. Smarter tool use: post-training boosts tool and agent task performance.', - displayName: 'DeepSeek V3.1 (Fast)', - id: 'DeepSeek-V3.1-Fast', - pricing: { - units: [ - { name: 'textInput', rate: 1.096, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 3.288, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - type: 'chat', - }, { abilities: { functionCall: true, diff --git a/packages/model-bank/src/aiModels/cerebras.ts b/packages/model-bank/src/aiModels/cerebras.ts index ef89376c0a..789dccf718 100644 --- a/packages/model-bank/src/aiModels/cerebras.ts +++ b/packages/model-bank/src/aiModels/cerebras.ts @@ -4,33 +4,43 @@ const cerebrasModels: AIChatModelCard[] = [ { abilities: { functionCall: true, + reasoning: true, }, contextWindowTokens: 131_072, - displayName: 'Qwen 3 235B Instruct', - id: 'qwen-3-235b-a22b-instruct-2507', + displayName: 'GPT OSS 120B', + enabled: true, + id: 'gpt-oss-120b', + maxOutput: 40_960, pricing: { units: [ - { name: 'textInput', rate: 0.6, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 1.2, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 0.35, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 0.75, strategy: 'fixed', unit: 'millionTokens' }, ], }, + settings: { + extendParams: ['reasoningEffort'], + }, type: 'chat', }, { abilities: { functionCall: true, + reasoning: true, }, - contextWindowTokens: 32_768, + contextWindowTokens: 131_072, description: - 'Llama 3.1 8B: a small, low-latency Llama variant for lightweight online inference and chat.', - displayName: 'Llama 3.1 8B', - id: 'llama3.1-8b', + "GLM-4.7 is Zhipu's new generation flagship model with 355B total parameters and 32B active parameters, fully upgraded in general dialogue, reasoning, and agent capabilities. GLM-4.7 enhances Interleaved Thinking and introduces Preserved Thinking and Turn-level Thinking.", + displayName: 'GLM-4.7', + id: 'zai-glm-4.7', + maxOutput: 40_960, pricing: { + currency: 'USD', units: [ - { name: 'textInput', rate: 0.1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 0.1, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 2.25, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 2.75, strategy: 'fixed', unit: 'millionTokens' }, ], }, + releasedAt: '2025-12-22', type: 'chat', }, ]; diff --git a/packages/model-bank/src/aiModels/githubCopilot.ts b/packages/model-bank/src/aiModels/githubCopilot.ts index 2f6fbda8ee..fd75b24daf 100644 --- a/packages/model-bank/src/aiModels/githubCopilot.ts +++ b/packages/model-bank/src/aiModels/githubCopilot.ts @@ -317,23 +317,6 @@ const githubCopilotChatModels: AIChatModelCard[] = [ type: 'chat', }, - // Grok Models - { - abilities: { - functionCall: true, - reasoning: true, - structuredOutput: true, - }, - contextWindowTokens: 173_000, - description: - 'We’re excited to launch grok-code-fast-1, a fast and cost-effective reasoning model that excels at agentic coding.', - displayName: 'Grok Code Fast 1', - enabled: true, - id: 'grok-code-fast-1', - releasedAt: '2025-08-27', - type: 'chat', - }, - // Raptor Models { abilities: { diff --git a/packages/model-bank/src/aiModels/moonshot.ts b/packages/model-bank/src/aiModels/moonshot.ts index e578c27493..87b4dede2b 100644 --- a/packages/model-bank/src/aiModels/moonshot.ts +++ b/packages/model-bank/src/aiModels/moonshot.ts @@ -59,114 +59,6 @@ const moonshotChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - structuredOutput: true, - }, - contextWindowTokens: 262_144, - description: - 'K2 long-thinking model with 256k context, supporting multi-step tool use and reasoning for complex problems.', - displayName: 'Kimi K2 Thinking', - id: 'kimi-k2-thinking', - maxOutput: 65_536, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-11-06', - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - structuredOutput: true, - }, - contextWindowTokens: 262_144, - description: - 'High-speed K2 long-thinking variant with 256k context, strong deep reasoning, and 60–100 tokens/sec output.', - displayName: 'Kimi K2 Thinking Turbo', - id: 'kimi-k2-thinking-turbo', - maxOutput: 65_536, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textInput', rate: 8, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 58, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-11-06', - type: 'chat', - }, - { - abilities: { - functionCall: true, - structuredOutput: true, - }, - contextWindowTokens: 262_144, - description: - 'kimi-k2-0905-preview offers a 256k context window, stronger agentic coding, better front-end code quality, and improved context understanding.', - displayName: 'Kimi K2 0905 Preview', - id: 'kimi-k2-0905-preview', - maxOutput: 65_536, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-05', - type: 'chat', - }, - { - abilities: { - functionCall: true, - }, - contextWindowTokens: 131_072, - description: - 'kimi-k2 is an MoE foundation model with strong coding and agent capabilities (1T total params, 32B active), outperforming other mainstream open models across reasoning, programming, math, and agent benchmarks.', - displayName: 'Kimi K2 0711 Preview', - id: 'kimi-k2-0711-preview', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-07-11', - type: 'chat', - }, - { - abilities: { - functionCall: true, - }, - contextWindowTokens: 262_144, - description: - 'kimi-k2 is an MoE foundation model with strong coding and agent capabilities (1T total params, 32B active), outperforming other mainstream open models across reasoning, programming, math, and agent benchmarks.', - displayName: 'Kimi K2 Turbo Preview', - id: 'kimi-k2-turbo-preview', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textInput', rate: 8, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 58, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-05', - type: 'chat', - }, { abilities: { functionCall: true, diff --git a/packages/model-bank/src/aiModels/nvidia.ts b/packages/model-bank/src/aiModels/nvidia.ts index 5f3110be9c..1bc689d8e3 100644 --- a/packages/model-bank/src/aiModels/nvidia.ts +++ b/packages/model-bank/src/aiModels/nvidia.ts @@ -12,99 +12,6 @@ const nvidiaChatModels: AIChatModelCard[] = [ maxOutput: 131_072, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 204_800, - displayName: 'MiniMax-M2.5', - id: 'minimaxai/minimax-m2.5', - maxOutput: 131_072, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 131_072, - description: - 'DeepSeek V3.2 is a next-gen reasoning model with stronger complex reasoning and chain-of-thought capabilities.', - displayName: 'DeepSeek V3.2', - enabled: true, - id: 'deepseek-ai/deepseek-v3.2', - maxOutput: 65_536, - settings: { - extendParams: ['enableReasoning'], - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 200_000, - description: - 'GLM-4.7 is Zhipu latest flagship model, enhanced for Agentic Coding scenarios with improved coding capabilities.', - displayName: 'GLM-4.7', - id: 'z-ai/glm4.7', - maxOutput: 131_072, - settings: { - extendParams: ['enableReasoning'], - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 200_000, - description: - "GLM-5 is Zhipu AI's new flagship foundation model for agent engineering, achieving open-source SOTA performance in coding and agent capabilities. It matches Claude Opus 4.5 in performance.", - displayName: 'GLM-5', - id: 'z-ai/glm5', - maxOutput: 131_072, - settings: { - extendParams: ['enableReasoning'], - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 262_144, - description: - 'Kimi K2.5 is the most intelligent Kimi model to date, featuring native multimodal architecture.', - displayName: 'Kimi K2.5', - enabled: true, - id: 'moonshotai/kimi-k2.5', - maxOutput: 65_536, - settings: { - extendParams: ['enableReasoning'], - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 131_072, - description: - 'DeepSeek V3.1 is a next-gen reasoning model with stronger complex reasoning and chain-of-thought for deep analysis tasks.', - displayName: 'DeepSeek V3.1 Terminus', - id: 'deepseek-ai/deepseek-v3.1-terminus', - maxOutput: 16_384, - settings: { - extendParams: ['enableReasoning'], - }, - type: 'chat', - }, { abilities: { functionCall: true, @@ -115,22 +22,6 @@ const nvidiaChatModels: AIChatModelCard[] = [ id: 'meta/llama-3.3-70b-instruct', type: 'chat', }, - { - contextWindowTokens: 128_000, - description: - 'A cutting-edge small language model with strong understanding, reasoning, and text generation.', - displayName: 'Llama 3.2 1B Instruct', - id: 'meta/llama-3.2-1b-instruct', - type: 'chat', - }, - { - contextWindowTokens: 128_000, - description: - 'A cutting-edge small language model with strong understanding, reasoning, and text generation.', - displayName: 'Llama 3.2 3B Instruct', - id: 'meta/llama-3.2-3b-instruct', - type: 'chat', - }, { abilities: { vision: true, @@ -197,33 +88,6 @@ const nvidiaChatModels: AIChatModelCard[] = [ id: 'google/gemma-2-2b-it', type: 'chat', }, - { - abilities: { - functionCall: true, - }, - contextWindowTokens: 32_768, - description: - 'A bilingual LLM for Chinese and English across language, coding, math, and reasoning.', - displayName: 'Qwen2.5 7B Instruct', - id: 'qwen/qwen2.5-7b-instruct', - type: 'chat', - }, - { - contextWindowTokens: 32_768, - description: - 'A strong mid-sized code model with 32K context, excelling at multilingual programming.', - displayName: 'Qwen2.5 Coder 7B Instruct', - id: 'qwen/qwen2.5-coder-7b-instruct', - type: 'chat', - }, - { - contextWindowTokens: 32_768, - description: - 'An advanced LLM for code generation, reasoning, and repair across mainstream programming languages.', - displayName: 'Qwen2.5 Coder 32B Instruct', - id: 'qwen/qwen2.5-coder-32b-instruct', - type: 'chat', - }, ]; export const allModels = [...nvidiaChatModels]; diff --git a/packages/model-bank/src/aiModels/openai.ts b/packages/model-bank/src/aiModels/openai.ts index 8313a6ad1f..3b15d0505e 100644 --- a/packages/model-bank/src/aiModels/openai.ts +++ b/packages/model-bank/src/aiModels/openai.ts @@ -1174,54 +1174,6 @@ export const openaiChatModels: AIChatModelCard[] = [ releasedAt: '2025-08-28', type: 'chat', }, - { - abilities: { - functionCall: true, - //search: true, - }, - contextWindowTokens: 128_000, - description: 'GPT-4o Audio Preview model with audio input and output.', - displayName: 'GPT-4o Audio Preview', - id: 'gpt-4o-audio-preview', - maxOutput: 16_384, - pricing: { - units: [ - { name: 'textInput', rate: 2.5, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 10, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2024-12-17', - /* - settings: { - searchImpl: 'params', - }, - */ - type: 'chat', - }, - { - abilities: { - functionCall: true, - //search: true, - }, - contextWindowTokens: 128_000, - description: 'GPT-4o mini Audio model with audio input and output.', - displayName: 'GPT-4o mini Audio', - id: 'gpt-4o-mini-audio-preview', - maxOutput: 16_384, - pricing: { - units: [ - { name: 'textInput', rate: 0.15, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 0.6, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2024-12-17', - /* - settings: { - searchImpl: 'params', - }, - */ - type: 'chat', - }, { abilities: { functionCall: true, @@ -1585,76 +1537,6 @@ export const openaiImageModels: AIImageModelCard[] = [ releasedAt: '2025-10-06', type: 'image', }, - { - description: - 'The latest DALL·E model, released in November 2023, supports more realistic, accurate image generation with stronger detail.', - displayName: 'DALL·E 3', - id: 'dall-e-3', - parameters: { - prompt: { default: '' }, - quality: { - default: 'standard', - enum: ['standard', 'hd'], - }, - size: { - default: '1024x1024', - enum: ['1024x1024', '1792x1024', '1024x1792'], - }, - }, - pricing: { - units: [ - { - lookup: { - prices: { - hd_1024x1024: 0.08, - hd_1024x1792: 0.12, - hd_1792x1024: 0.12, - standard_1024x1024: 0.04, - standard_1024x1792: 0.08, - standard_1792x1024: 0.08, - }, - pricingParams: ['quality', 'size'], - }, - name: 'imageGeneration', - strategy: 'lookup', - unit: 'image', - }, - ], - }, - type: 'image', - }, - { - description: - 'Second-generation DALL·E model with more realistic, accurate image generation and 4× the resolution of the first generation.', - displayName: 'DALL·E 2', - id: 'dall-e-2', - parameters: { - imageUrl: { default: null }, - prompt: { default: '' }, - size: { - default: '1024x1024', - enum: ['256x256', '512x512', '1024x1024'], - }, - }, - pricing: { - units: [ - { - lookup: { - prices: { - '1024x1024': 0.02, - '256x256': 0.016, - '512x512': 0.018, - }, - pricingParams: ['size'], - }, - name: 'imageGeneration', - strategy: 'lookup', - unit: 'image', - }, - ], - }, - type: 'image', - }, ]; // GPT-4o and GPT-4o-mini realtime models diff --git a/packages/model-bank/src/aiModels/qiniu.ts b/packages/model-bank/src/aiModels/qiniu.ts index 0c198c3587..541d9e33b6 100644 --- a/packages/model-bank/src/aiModels/qiniu.ts +++ b/packages/model-bank/src/aiModels/qiniu.ts @@ -182,56 +182,6 @@ const qiniuChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - search: true, - vision: true, - }, - contextWindowTokens: 2_000_000, - description: - 'We’re excited to release Grok 4 Fast, our latest progress in cost-effective reasoning models.', - displayName: 'Grok 4 Fast', - enabled: true, - id: 'x-ai/grok-4-fast', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 7.2, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 12.6, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-09', - settings: { - searchImpl: 'params', - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 256_000, - description: - 'We’re excited to launch grok-code-fast-1, a fast and cost-effective reasoning model that excels at agentic coding.', - displayName: 'Grok Code Fast 1', - id: 'x-ai/grok-code-fast-1', - pricing: { - units: [ - { name: 'textInput_cacheRead', rate: 0.02, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 1.5, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-08-27', - // settings: { - // reasoning_effort is not supported by grok-code. Specifying reasoning_effort parameter will get an error response. - // extendParams: ['reasoningEffort'], - // }, - type: 'chat', - }, ]; export const allModels = [...qiniuChatModels]; diff --git a/packages/model-bank/src/aiModels/qwen.ts b/packages/model-bank/src/aiModels/qwen.ts index 352b6831a2..dbb2c5ca13 100644 --- a/packages/model-bank/src/aiModels/qwen.ts +++ b/packages/model-bank/src/aiModels/qwen.ts @@ -1559,7 +1559,7 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 1.2, upTo: 0.256 }, + { rate: 1.2, upTo: 256_000 }, { rate: 4.8, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1568,7 +1568,7 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 7.2, upTo: 0.256 }, + { rate: 7.2, upTo: 256_000 }, { rate: 28.8, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1577,7 +1577,7 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textInput_cacheRead', strategy: 'tiered', tiers: [ - { rate: 1.2 * 0.2, upTo: 0.256 }, + { rate: 1.2 * 0.2, upTo: 256_000 }, { rate: 4.8 * 0.2, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1616,8 +1616,8 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 0.2, upTo: 0.128 }, - { rate: 0.8, upTo: 0.256 }, + { rate: 0.2, upTo: 128_000 }, + { rate: 0.8, upTo: 256_000 }, { rate: 1.2, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1626,8 +1626,8 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 2, upTo: 0.128 }, - { rate: 8, upTo: 0.256 }, + { rate: 2, upTo: 128_000 }, + { rate: 8, upTo: 256_000 }, { rate: 12, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1636,8 +1636,8 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textInput_cacheRead', strategy: 'tiered', tiers: [ - { rate: 0.2 * 0.2, upTo: 0.128 }, - { rate: 0.8 * 0.2, upTo: 0.256 }, + { rate: 0.2 * 0.2, upTo: 128_000 }, + { rate: 0.8 * 0.2, upTo: 256_000 }, { rate: 1.2 * 0.2, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1673,8 +1673,8 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 0.15, upTo: 0.128 }, - { rate: 0.6, upTo: 0.256 }, + { rate: 0.15, upTo: 128_000 }, + { rate: 0.6, upTo: 256_000 }, { rate: 1.2, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1683,8 +1683,8 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 1.5, upTo: 0.128 }, - { rate: 6, upTo: 0.256 }, + { rate: 1.5, upTo: 128_000 }, + { rate: 6, upTo: 256_000 }, { rate: 12, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1693,8 +1693,8 @@ const qwenChatModels: AIChatModelCard[] = [ name: 'textInput_cacheRead', strategy: 'tiered', tiers: [ - { rate: 0.15 * 0.2, upTo: 0.128 }, - { rate: 0.6 * 0.2, upTo: 0.256 }, + { rate: 0.15 * 0.2, upTo: 128_000 }, + { rate: 0.6 * 0.2, upTo: 256_000 }, { rate: 1.2 * 0.2, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -1739,6 +1739,40 @@ const qwenChatModels: AIChatModelCard[] = [ }, type: 'chat', }, + { + abilities: { + functionCall: true, + reasoning: true, + search: true, + video: true, + vision: true, + }, + config: { + deploymentName: 'qwen3.7-plus', // Supports context caching + }, + contextWindowTokens: 1_000_000, + description: + 'Qwen3.7 Plus is a multimodal interactive hybrid agent model, building upon the Qwen3.7 series text capabilities to unify vision and language. It excels at GUI operation, visual coding, and complex agentic workflows.', + displayName: 'Qwen3.7 Plus', + enabled: true, + id: 'qwen3.7-plus', + maxOutput: 65_536, + organization: 'Qwen', + pricing: { + currency: 'CNY', + units: [ + { name: 'textInput_cacheRead', rate: 2 * 0.2, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 2, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 8, strategy: 'fixed', unit: 'millionTokens' }, + ], + }, + releasedAt: '2026-06-01', + settings: { + extendParams: ['enableReasoning', 'reasoningBudgetToken', 'preserveThinking'], + searchImpl: 'params', + }, + type: 'chat', + }, { abilities: { functionCall: true, @@ -1752,44 +1786,19 @@ const qwenChatModels: AIChatModelCard[] = [ }, contextWindowTokens: 1_000_000, description: - 'Qwen 3.6-Plus introduces major upgrades in coding capabilities, with a focus on Agentic Coding and front-end development, significantly enhancing the Vibe Coding experience. Its reasoning ability across general scenarios has been further improved. In terms of multimodality, capabilities such as universal recognition, OCR, and object localization have been substantially enhanced. It also fixes known issues from the Qwen 3.5-Plus release. Usage remains the same as Qwen 3.5-Plus.', + 'Qwen3.6 Plus supports text, image, and video input. It delivers a balanced performance across quality, speed, and cost. Its multimodal capabilities are significantly improved compared to the Qwen3 VL series.', displayName: 'Qwen3.6 Plus', - enabled: true, id: 'qwen3.6-plus', maxOutput: 65_536, organization: 'Qwen', pricing: { currency: 'CNY', units: [ - { - lookup: { - prices: { - '[0, 0.256]': 2 * 0.1, - '[0.256, infinity]': 8 * 0.1, - }, - pricingParams: ['textInputRange'], - }, - name: 'textInput_cacheRead', - strategy: 'lookup', - unit: 'millionTokens', - }, - { - lookup: { - prices: { - '[0, 0.256]': 2 * 1.25, - '[0.256, infinity]': 8 * 1.25, - }, - pricingParams: ['textInputRange'], - }, - name: 'textInput_cacheWrite', - strategy: 'lookup', - unit: 'millionTokens', - }, { lookup: { prices: { '[0, 0.256]': 2, - '[0.256, infinity]': 8, + '[0.256, 1]': 8, }, pricingParams: ['textInputRange'], }, @@ -1801,7 +1810,7 @@ const qwenChatModels: AIChatModelCard[] = [ lookup: { prices: { '[0, 0.256]': 12, - '[0.256, infinity]': 48, + '[0.256, 1]': 48, }, pricingParams: ['textInputRange'], }, @@ -1809,11 +1818,23 @@ const qwenChatModels: AIChatModelCard[] = [ strategy: 'lookup', unit: 'millionTokens', }, + { + lookup: { + prices: { + '[0, 0.256]': 2 * 0.2, + '[0.256, 1]': 8 * 0.2, + }, + pricingParams: ['textInputRange'], + }, + name: 'textInput_cacheRead', + strategy: 'lookup', + unit: 'millionTokens', + }, ], }, releasedAt: '2026-04-02', settings: { - extendParams: ['enableReasoning', 'reasoningBudgetToken'], + extendParams: ['enableReasoning', 'reasoningBudgetToken', 'preserveThinking'], searchImpl: 'params', }, type: 'chat', @@ -2058,13 +2079,44 @@ const qwenChatModels: AIChatModelCard[] = [ search: true, }, config: { - deploymentName: 'qwen3.6-max-preview', // Supports context caching + deploymentName: 'qwen3.7-max', // Supports context caching + }, + contextWindowTokens: 1_000_000, + description: + 'Qwen3.7 Max is the flagship omnipotent model of the AI agent era, offering comprehensive capabilities across text, image, and video understanding. It provides superior reasoning, function calling, and agent task execution performance.', + displayName: 'Qwen3.7 Max', + enabled: true, + id: 'qwen3.7-max', + maxOutput: 65_536, + organization: 'Qwen', + pricing: { + currency: 'CNY', + units: [ + { name: 'textInput_cacheRead', rate: 12 * 0.2, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 12, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 36, strategy: 'fixed', unit: 'millionTokens' }, + ], + }, + releasedAt: '2026-05-20', + settings: { + extendParams: ['enableReasoning', 'reasoningBudgetToken', 'preserveThinking'], + searchImpl: 'params', + }, + type: 'chat', + }, + { + abilities: { + functionCall: true, + reasoning: true, + search: true, + }, + config: { + deploymentName: 'qwen3.6-max-preview', }, contextWindowTokens: 262_144, description: 'The largest closed-source model in the Qwen3.6 series. It delivers stronger world knowledge, instruction following, and agentic coding performance for complex tasks. It is text-only, supports thinking mode by default, explicit caching, and function calling.', displayName: 'Qwen3.6 Max Preview', - enabled: true, id: 'qwen3.6-max-preview', maxOutput: 65_536, organization: 'Qwen', @@ -2111,7 +2163,7 @@ const qwenChatModels: AIChatModelCard[] = [ }, releasedAt: '2026-04-18', settings: { - extendParams: ['enableReasoning', 'reasoningBudgetToken'], + extendParams: ['enableReasoning', 'reasoningBudgetToken', 'preserveThinking'], searchImpl: 'params', }, type: 'chat', diff --git a/packages/model-bank/src/aiModels/siliconcloud.ts b/packages/model-bank/src/aiModels/siliconcloud.ts index dfdebe42ff..c653b456ac 100644 --- a/packages/model-bank/src/aiModels/siliconcloud.ts +++ b/packages/model-bank/src/aiModels/siliconcloud.ts @@ -438,26 +438,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 192_000, - description: - 'MiniMax-M2.5 is the latest large language model from MiniMax, featuring a Mixture-of-Experts (MoE) architecture with 229 billion total parameters. It achieves industry-leading performance in programming, agent tool calling, search tasks, and office scenarios, with a SWE-Bench Verified score of 80.2% and 37% faster inference speed compared to M2.1.', - displayName: 'MiniMax-M2.5', - id: 'MiniMaxAI/MiniMax-M2.5', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 2.1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 8.4, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2026-02-13', - type: 'chat', - }, { abilities: { functionCall: true, @@ -745,32 +725,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - video: true, - vision: true, - }, - contextWindowTokens: 262_144, - description: - "Kimi K2.6 is Moonshot AI's open-source native multimodal agent model. Built on MoE architecture with 1T total parameters and 32B activated, supporting 256K tokens context. It supports 4,000+ tool calls with sustained autonomous execution over 12 hours, multi-agent collaboration with up to 300 parallel sub-agents, and both Thinking and Instant inference modes.", - displayName: 'Kimi-K2.6 (Pro)', - id: 'Pro/moonshotai/Kimi-K2.6', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput_cacheRead', rate: 1.1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textInput', rate: 6.5, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 27, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2026-04-21', - settings: { - extendParams: ['enableReasoning'], - }, - type: 'chat', - }, { abilities: { vision: true, @@ -849,52 +803,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 262_144, - description: - "Kimi K2 Thinking is the latest and most powerful open-source thinking model. It greatly extends multi-step reasoning depth and sustains stable tool use across 200–300 consecutive calls, setting new records on Humanity's Last Exam (HLE), BrowseComp, and other benchmarks. 'It excels in coding, math, logic, and agent scenarios. Built on an MoE architecture with ~1T total parameters, it supports a 256K context window and tool calling.", - displayName: 'Kimi K2 Thinking', - id: 'moonshotai/Kimi-K2-Thinking', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-11-07', - settings: { - extendParams: ['reasoningBudgetToken'], - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 262_144, - description: - 'Kimi K2 Thinking Turbo is the Turbo variant optimized for reasoning speed and throughput while retaining K2 Thinking’s multi-step reasoning and tool use. It is an MoE model with ~1T total parameters, native 256K context, and stable large-scale tool calling for production scenarios with stricter latency and concurrency needs.', - displayName: 'Kimi K2 Thinking (Pro)', - id: 'Pro/moonshotai/Kimi-K2-Thinking', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 8, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 32, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-11-07', - settings: { - extendParams: ['reasoningBudgetToken'], - }, - type: 'chat', - }, { abilities: { functionCall: true, @@ -1041,29 +949,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ releasedAt: '2025-09-01', type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 198_000, - description: - 'Compared to GLM-4.5, GLM-4.6 expands context from 128K to 200K for more complex agent tasks. It scores higher on code benchmarks and shows stronger real-world performance in apps like Claude Code, Cline, Roo Code, and Kilo Code, including better frontend page generation. Reasoning is improved and tool use is supported during reasoning, strengthening overall capability. It integrates better into agent frameworks, improves tool/search agents, and has more human-preferred writing style and roleplay naturalness.', - displayName: 'GLM-4.6', - id: 'zai-org/GLM-4.6', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 3.5, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 14, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-30', - settings: { - extendParams: ['enableReasoning', 'reasoningBudgetToken'], - }, - type: 'chat', - }, { abilities: { functionCall: true, @@ -1200,29 +1085,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 131_072, - description: - 'Ring-flash-2.0 is a high-performance thinking model optimized from Ling-flash-2.0-base. It uses an MoE architecture with 100B total parameters and only 6.1B active per inference. Its icepop algorithm stabilizes RL training for MoE models, enabling continued gains in complex reasoning. It achieves major breakthroughs on tough benchmarks (math contests, code generation, logical reasoning), surpassing top dense models under 40B and rivaling larger open MoE and closed reasoning models. It also performs well in creative writing, and its efficient architecture delivers fast inference at lower deployment cost for high concurrency.', - displayName: 'Ring Flash 2.0', - id: 'inclusionAI/Ring-flash-2.0', - settings: { - extendParams: ['reasoningBudgetToken'], - }, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-19', - type: 'chat', - }, { abilities: { functionCall: true, @@ -1349,45 +1211,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ releasedAt: '2025-07-28', type: 'chat', }, - - { - abilities: { - functionCall: true, - }, - contextWindowTokens: 262_144, - description: - 'Kimi K2-Instruct-0905 is the newest and most powerful Kimi K2. It is a top-tier MoE model with 1T total and 32B active parameters. Key features include stronger agentic coding intelligence with significant gains on benchmarks and real-world agent tasks, plus improved frontend coding aesthetics and usability.', - displayName: 'Kimi K2 0905', - id: 'moonshotai/Kimi-K2-Instruct-0905', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-05', - type: 'chat', - }, - { - abilities: { - functionCall: true, - }, - contextWindowTokens: 262_144, - description: - 'Kimi K2-Instruct-0905 is the newest and most powerful Kimi K2. It is a top-tier MoE model with 1T total and 32B active parameters. Key features include stronger agentic coding intelligence with significant gains on benchmarks and real-world agent tasks, plus improved frontend coding aesthetics and usability.', - displayName: 'Kimi K2 0905 (Pro)', - id: 'Pro/moonshotai/Kimi-K2-Instruct-0905', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-09-05', - type: 'chat', - }, { abilities: { reasoning: true, @@ -1410,51 +1233,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - - { - abilities: { - functionCall: true, - }, - contextWindowTokens: 262_144, - description: - 'Qwen3-235B-A22B-Instruct-2507 is a flagship Qwen3 MoE model with 235B total and 22B active parameters. It is an updated non-thinking version focused on improving instruction following, logical reasoning, text understanding, math, science, coding, and tool use. It also expands multilingual long-tail knowledge and better aligns with user preferences for subjective open-ended tasks.', - displayName: 'Qwen3 235B A22B Instruct 2507', - id: 'Qwen/Qwen3-235B-A22B-Instruct-2507', - organization: 'Qwen', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 2.5, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 10, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-07-21', - type: 'chat', - }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 262_144, - description: - 'Qwen3-30B-A3B-Thinking-2507 is the latest thinking model in the Qwen3 series. It is an MoE model with 30.5B total and 3.3B active parameters, focused on complex tasks. It shows significant gains in logic, math, science, coding, and academic benchmarks, and improves instruction following, tool use, text generation, and preference alignment. It natively supports 256K context and can extend to 1M tokens. This version is designed for thinking mode with detailed step-by-step reasoning and strong agent capabilities.', - displayName: 'Qwen3 30B A3B Thinking 2507', - id: 'Qwen/Qwen3-30B-A3B-Thinking-2507', - organization: 'Qwen', - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 0.7, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 2.8, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-07-30', - settings: { - extendParams: ['reasoningBudgetToken'], - }, - type: 'chat', - }, { abilities: { functionCall: true, @@ -1547,53 +1325,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - reasoning: true, - vision: true, - }, - contextWindowTokens: 65_536, - description: - 'GLM-4.1V-9B-Thinking is an open-source VLM from Zhipu AI and Tsinghua KEG Lab, designed for complex multimodal cognition. Built on GLM-4-9B-0414, it adds chain-of-thought reasoning and RL to significantly improve cross-modal reasoning and stability.', - displayName: 'GLM-4.1V 9B Thinking (Free)', - id: 'THUDM/GLM-4.1V-9B-Thinking', - settings: { - extendParams: ['reasoningBudgetToken'], - }, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 0, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 0, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-07-02', - type: 'chat', - }, - - { - abilities: { - functionCall: true, - reasoning: true, - }, - contextWindowTokens: 131_072, - description: - 'GLM-Z1-32B-0414 is a deep-thinking reasoning model built from GLM-4-32B-0414 with cold-start data and expanded RL, further trained on math, code, and logic. It significantly improves math ability and complex task solving over the base model.', - displayName: 'GLM-Z1 32B 0414', - id: 'THUDM/GLM-Z1-32B-0414', - settings: { - extendParams: ['reasoningBudgetToken'], - }, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-04-14', - type: 'chat', - }, { abilities: { functionCall: true, @@ -1864,30 +1595,6 @@ const siliconcloudChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - vision: true, - }, - contextWindowTokens: 131_072, - description: - 'GLM-4.6V achieves SOTA visual understanding accuracy at the same parameter scale, and is the first to natively integrate Function Call capability into vision models in the model architecture, connecting the chain from visual perception to executable action (Action), providing a unified technical foundation for multimodal Agents in real business scenarios. Visual context window expanded to 128K, supporting long video stream processing and high-resolution multi-image analysis.', - displayName: 'GLM-4.6V', - id: 'zai-org/GLM-4.6V', - settings: { - extendParams: ['enableReasoning', 'reasoningBudgetToken'], - }, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 3.5, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 14, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - releasedAt: '2025-12-08', - type: 'chat', - }, { abilities: { functionCall: true, diff --git a/packages/model-bank/src/aiModels/stepfun.ts b/packages/model-bank/src/aiModels/stepfun.ts index fde7206f0f..0574e789ef 100644 --- a/packages/model-bank/src/aiModels/stepfun.ts +++ b/packages/model-bank/src/aiModels/stepfun.ts @@ -41,7 +41,6 @@ const stepfunChatModels: AIChatModelCard[] = [ description: 'Built on Step 3.5 Flash and optimized for high-frequency agent scenarios, it further improves token efficiency and inference speed while retaining flagship-level reasoning and tool-calling capabilities. It also supports switching to a low-reasoning mode to reduce resource consumption. Additionally, targeted optimizations have been made to enhance compatibility with coding tasks and agent frameworks.', displayName: 'Step 3.5 Flash 2603', - enabled: true, id: 'step-3.5-flash-2603', pricing: { currency: 'CNY', @@ -95,7 +94,7 @@ const stepfunChatModels: AIChatModelCard[] = [ name: 'textInput_cacheRead', strategy: 'tiered', tiers: [ - { rate: 0.3, upTo: 0.004 }, + { rate: 0.3, upTo: 4_000 }, { rate: 0.8, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -104,7 +103,7 @@ const stepfunChatModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 1.5, upTo: 0.004 }, + { rate: 1.5, upTo: 4_000 }, { rate: 4, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -113,7 +112,7 @@ const stepfunChatModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 4, upTo: 0.004 }, + { rate: 4, upTo: 4_000 }, { rate: 8, upTo: 'infinity' }, // Still differs from documentation ], unit: 'millionTokens', diff --git a/packages/model-bank/src/aiModels/streamlake.ts b/packages/model-bank/src/aiModels/streamlake.ts index 19ae2828bd..f7925c2d3b 100644 --- a/packages/model-bank/src/aiModels/streamlake.ts +++ b/packages/model-bank/src/aiModels/streamlake.ts @@ -345,7 +345,7 @@ const streamlakeModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 7, upTo: 0.256 }, + { rate: 7, upTo: 256_000 }, { rate: 14, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -354,7 +354,7 @@ const streamlakeModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 21, upTo: 0.256 }, + { rate: 21, upTo: 256_000 }, { rate: 42, upTo: 'infinity' }, ], unit: 'millionTokens', diff --git a/packages/model-bank/src/aiModels/volcengine.ts b/packages/model-bank/src/aiModels/volcengine.ts index e162df3d26..6afcf83b32 100644 --- a/packages/model-bank/src/aiModels/volcengine.ts +++ b/packages/model-bank/src/aiModels/volcengine.ts @@ -74,6 +74,70 @@ const seedance15ProParams: VideoModelParamsSchema = { }; const doubaoChatModels: AIChatModelCard[] = [ + { + abilities: { + functionCall: true, + reasoning: true, + search: true, + }, + config: { + deploymentName: 'deepseek-v4-pro-260425', + }, + contextWindowTokens: 1_048_576, + description: + 'DeepSeek-V4-Pro is DeepSeek’s flagship MoE model on Volcano Ark, supporting both non-thinking and thinking modes for advanced reasoning, code generation, and complex agent workflows.', + displayName: 'DeepSeek V4 Pro', + enabled: true, + id: 'deepseek-v4-pro', + maxOutput: 393_216, + pricing: { + currency: 'CNY', + units: [ + { name: 'textInput_cacheRead', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 12, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 24, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput_cacheWrite', rate: 0.017, strategy: 'fixed', unit: 'millionTokens' }, + ], + }, + releasedAt: '2026-04-24', + settings: { + extendParams: ['enableReasoning'], + searchImpl: 'params', + }, + type: 'chat', + }, + { + abilities: { + functionCall: true, + reasoning: true, + search: true, + }, + config: { + deploymentName: 'deepseek-v4-flash-260425', + }, + contextWindowTokens: 1_048_576, + description: + 'DeepSeek-V4-Flash is DeepSeek’s efficient 1M-context model on Volcano Ark, balancing speed and cost while keeping strong reasoning and agent capabilities.', + displayName: 'DeepSeek V4 Flash', + enabled: true, + id: 'deepseek-v4-flash', + maxOutput: 393_216, + pricing: { + currency: 'CNY', + units: [ + { name: 'textInput_cacheRead', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 2, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput_cacheWrite', rate: 0.017, strategy: 'fixed', unit: 'millionTokens' }, + ], + }, + releasedAt: '2026-04-24', + settings: { + extendParams: ['enableReasoning'], + searchImpl: 'params', + }, + type: 'chat', + }, { abilities: { functionCall: true, @@ -562,33 +626,6 @@ const doubaoChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - config: { - deploymentName: 'deepseek-v3-1-terminus', - }, - contextWindowTokens: 131_072, - description: - 'DeepSeek-V3.1 is a new hybrid reasoning model from DeepSeek, supporting both thinking and non-thinking modes and offering higher thinking efficiency than DeepSeek-R1-0528. Post-training optimizations greatly improve agent tool use and agent-task performance. It supports a 128k context window and up to 64k output tokens.', - displayName: 'DeepSeek V3.1', - id: 'deepseek-v3.1', - maxOutput: 32_768, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput_cacheRead', rate: 0.8, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 12, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - settings: { - extendParams: ['enableReasoning'], - }, - type: 'chat', - }, { abilities: { functionCall: true, @@ -700,62 +737,6 @@ const doubaoChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - video: true, - vision: true, - search: true, - }, - config: { - deploymentName: 'doubao-seed-1-6-lite-251015', - }, - contextWindowTokens: 256_000, - description: - 'Doubao-Seed-1.6-lite is a new multimodal deep-reasoning model with adjustable reasoning effort (Minimal, Low, Medium, High), delivering better value and a strong choice for common tasks, with a context window up to 256k.', - displayName: 'Doubao Seed 1.6 Lite', - id: 'doubao-seed-1.6-lite', - maxOutput: 32_000, - pricing: { - currency: 'CNY', - units: [ - { - lookup: { - prices: { - '[0, 0.032]': 0.3, - '[0.032, 0.128]': 0.6, - '[0.128, 0.256]': 1.2, - }, - pricingParams: ['textInputRange'], - }, - name: 'textInput', - strategy: 'lookup', - unit: 'millionTokens', - }, - { - lookup: { - prices: { - '[0, 0.032]_[0, 0.0002]': 0.6, - '[0, 0.032]_[0.0002, infinity]': 2.4, - '[0.032, 0.128]_[0, infinity]': 4, - '[0.128, 0.256]_[0, infinity]': 12, - }, - pricingParams: ['textInputRange', 'textOutputRange'], - }, - name: 'textOutput', - strategy: 'lookup', - unit: 'millionTokens', - }, - { name: 'textInput_cacheRead', rate: 0.06, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - settings: { - extendParams: ['gpt5ReasoningEffort'], - searchImpl: 'params', - }, - type: 'chat', - }, { abilities: { functionCall: true, @@ -811,51 +792,6 @@ const doubaoChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - }, - config: { - deploymentName: 'deepseek-r1-250528', - }, - contextWindowTokens: 131_072, - description: - 'The latest 0528 release of DeepSeek-R1 applies large-scale reinforcement learning in post-training, greatly boosting reasoning with very little labeled data. It matches the OpenAI o1 production model on math, code, and natural language reasoning tasks.', - displayName: 'DeepSeek R1', - id: 'deepseek-r1', - maxOutput: 16_384, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 4, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 16, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - }, - config: { - deploymentName: 'deepseek-v3-250324', - }, - contextWindowTokens: 128_000, - description: - 'DeepSeek-V3 is a MoE model developed by DeepSeek. It surpasses other open models like Qwen2.5-72B and Llama-3.1-405B on many benchmarks, and is competitive with leading closed models such as GPT-4o and Claude 3.5 Sonnet.', - displayName: 'DeepSeek V3', - id: 'deepseek-v3', - maxOutput: 16_384, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 2, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 8, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - type: 'chat', - }, { abilities: { functionCall: true, @@ -1055,28 +991,6 @@ const volcengineImageModels: AIImageModelCard[] = [ releasedAt: '2025-09-09', type: 'image', }, - { - description: - 'Seedream 3.0 is an image generation model from ByteDance Seed, supporting text and image inputs with highly controllable, high-quality image generation. It generates images from text prompts.', - displayName: 'Seedream 3.0 Text-to-Image', - id: 'doubao-seedream-3-0-t2i-250415', - parameters: { - cfg: { default: 2.5, max: 10, min: 1, step: 0.1 }, - height: { default: 1024, max: 3549, min: 296, step: 1 }, - prompt: { - default: '', - }, - seed: { default: null }, - watermark: { default: false }, - width: { default: 1024, max: 3549, min: 296, step: 1 }, - }, - pricing: { - currency: 'CNY', - units: [{ name: 'imageGeneration', rate: 0.259, strategy: 'fixed', unit: 'image' }], - }, - releasedAt: '2025-04-15', - type: 'image', - }, ]; const volcengineVideoModels: AIVideoModelCard[] = [ @@ -1224,78 +1138,6 @@ const volcengineVideoModels: AIVideoModelCard[] = [ releasedAt: '2025-05-28', type: 'video', }, - { - description: - 'Stable generation quality with high cost-effectiveness, capable of generating videos from a first frame, first-and-last frames, or reference images.', - displayName: 'Seedance 1.0 Lite I2V', - id: 'doubao-seedance-1-0-lite-i2v-250428', - organization: 'ByteDance', - parameters: { - aspectRatio: { - default: '16:9', - enum: ['21:9', '16:9', '4:3', '1.1', '3:4', '9:16'], - }, - cameraFixed: { default: false }, - endImageUrl: { - aspectRatio: { max: 2.5, min: 0.4 }, - default: null, - height: { max: 6000, min: 300 }, - maxFileSize: 30 * 1024 * 1024, - requiresImageUrl: true, - width: { max: 6000, min: 300 }, - }, - imageUrls: { - aspectRatio: { max: 2.5, min: 0.4 }, - default: [], - height: { max: 6000, min: 300 }, - maxFileSize: 30 * 1024 * 1024, - maxCount: 4, - width: { max: 6000, min: 300 }, - }, - duration: { default: 5, max: 12, min: 2 }, - prompt: { default: '' }, - resolution: { - default: '720p', - enum: ['480p', '720p', '1080p'], - }, - seed: { default: null }, - watermark: { default: false }, - }, - pricing: { - currency: 'CNY', - units: [{ name: 'videoGeneration', rate: 10, strategy: 'fixed', unit: 'millionTokens' }], - }, - releasedAt: '2025-04-28', - type: 'video', - }, - { - description: - 'Stable generation quality with high cost-effectiveness, capable of generating videos based on text instructions.', - displayName: 'Seedance 1.0 Lite T2V', - id: 'doubao-seedance-1-0-lite-t2v-250428', - organization: 'ByteDance', - parameters: { - aspectRatio: { - default: '16:9', - enum: ['21:9', '16:9', '4:3', '1.1', '3:4', '9:16'], - }, - cameraFixed: { default: false }, - duration: { default: 5, max: 12, min: 2 }, - prompt: { default: '' }, - resolution: { - default: '720p', - enum: ['480p', '720p', '1080p'], - }, - seed: { default: null }, - watermark: { default: false }, - }, - pricing: { - currency: 'CNY', - units: [{ name: 'videoGeneration', rate: 10, strategy: 'fixed', unit: 'millionTokens' }], - }, - releasedAt: '2025-04-28', - type: 'video', - }, ]; export const allModels = [...doubaoChatModels, ...volcengineImageModels, ...volcengineVideoModels]; diff --git a/packages/model-bank/src/aiModels/wenxin.ts b/packages/model-bank/src/aiModels/wenxin.ts index 0b18cd736d..88fe7f6098 100644 --- a/packages/model-bank/src/aiModels/wenxin.ts +++ b/packages/model-bank/src/aiModels/wenxin.ts @@ -1001,26 +1001,6 @@ const wenxinChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - video: true, - vision: true, - }, - contextWindowTokens: 32_768, - description: - 'Qwen2.5 VL 32B Instruct is an open-source multimodal model suitable for private deployment and multi-scenario use.', - displayName: 'Qwen2.5 VL 32B Instruct', - id: 'qwen2.5-vl-32b-instruct', - maxOutput: 8192, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 8, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 24, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - type: 'chat', - }, { abilities: { vision: true, @@ -1556,61 +1536,6 @@ const wenxinChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - contextWindowTokens: 32_768, - description: 'Qwen3 4B is suitable for small-to-mid apps and local inference.', - displayName: 'Qwen3 4B', - id: 'qwen3-4b', - maxOutput: 8192, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 1.2, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - settings: { - extendParams: ['enableReasoning', 'reasoningBudgetToken'], - }, - type: 'chat', - }, - { - contextWindowTokens: 32_768, - description: 'Qwen3 1.7B is an ultra-light model for edge and device deployment.', - displayName: 'Qwen3 1.7B', - id: 'qwen3-1.7b', - maxOutput: 8192, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 1.2, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - settings: { - extendParams: ['enableReasoning', 'reasoningBudgetToken'], - }, - type: 'chat', - }, - { - contextWindowTokens: 32_768, - description: - 'Qwen3 0.6B is an entry-level model for simple reasoning and very constrained environments.', - displayName: 'Qwen3 0.6B', - id: 'qwen3-0.6b', - maxOutput: 8192, - pricing: { - currency: 'CNY', - units: [ - { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 1.2, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - settings: { - extendParams: ['enableReasoning', 'reasoningBudgetToken'], - }, - type: 'chat', - }, ]; const wenxinImageModels: AIImageModelCard[] = [ diff --git a/packages/model-bank/src/aiModels/xai.ts b/packages/model-bank/src/aiModels/xai.ts index cf4aa430b9..8ad66958f2 100644 --- a/packages/model-bank/src/aiModels/xai.ts +++ b/packages/model-bank/src/aiModels/xai.ts @@ -20,7 +20,7 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textInput_cacheRead', strategy: 'tiered', tiers: [ - { rate: 0.2, upTo: 200_000 }, + { rate: 0.2, upTo: 0.2 }, { rate: 0.4, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -29,7 +29,7 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 1.25, upTo: 200_000 }, + { rate: 1.25, upTo: 0.2 }, { rate: 2.5, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -38,7 +38,7 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 2.5, upTo: 200_000 }, + { rate: 2.5, upTo: 0.2 }, { rate: 5, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -59,7 +59,7 @@ const xaiChatModels: AIChatModelCard[] = [ structuredOutput: true, vision: true, }, - contextWindowTokens: 2_000_000, + contextWindowTokens: 1_000_000, description: 'A non-reasoning variant for simple use cases', displayName: 'Grok 4.20 (Non-Reasoning)', enabled: true, @@ -70,7 +70,7 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textInput_cacheRead', strategy: 'tiered', tiers: [ - { rate: 0.2, upTo: 200_000 }, + { rate: 0.2, upTo: 0.2 }, { rate: 0.4, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -79,8 +79,8 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 2, upTo: 200_000 }, - { rate: 4, upTo: 'infinity' }, + { rate: 1.25, upTo: 0.2 }, + { rate: 2.5, upTo: 'infinity' }, ], unit: 'millionTokens', }, @@ -88,8 +88,8 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 6, upTo: 200_000 }, - { rate: 12, upTo: 'infinity' }, + { rate: 2.5, upTo: 0.2 }, + { rate: 5, upTo: 'infinity' }, ], unit: 'millionTokens', }, @@ -109,7 +109,7 @@ const xaiChatModels: AIChatModelCard[] = [ structuredOutput: true, vision: true, }, - contextWindowTokens: 2_000_000, + contextWindowTokens: 1_000_000, description: 'Intelligent, blazing-fast model that reasons before responding', displayName: 'Grok 4.20', enabled: true, @@ -120,7 +120,7 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textInput_cacheRead', strategy: 'tiered', tiers: [ - { rate: 0.2, upTo: 200_000 }, + { rate: 0.2, upTo: 0.2 }, { rate: 0.4, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -129,8 +129,8 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 2, upTo: 200_000 }, - { rate: 4, upTo: 'infinity' }, + { rate: 1.25, upTo: 0.2 }, + { rate: 2.5, upTo: 'infinity' }, ], unit: 'millionTokens', }, @@ -138,8 +138,8 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 6, upTo: 200_000 }, - { rate: 12, upTo: 'infinity' }, + { rate: 2.5, upTo: 0.2 }, + { rate: 5, upTo: 'infinity' }, ], unit: 'millionTokens', }, @@ -170,7 +170,7 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textInput_cacheRead', strategy: 'tiered', tiers: [ - { rate: 0.2, upTo: 200_000 }, + { rate: 0.2, upTo: 0.2 }, { rate: 0.4, upTo: 'infinity' }, ], unit: 'millionTokens', @@ -179,8 +179,8 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textInput', strategy: 'tiered', tiers: [ - { rate: 2, upTo: 200_000 }, - { rate: 4, upTo: 'infinity' }, + { rate: 1.25, upTo: 0.2 }, + { rate: 2.5, upTo: 'infinity' }, ], unit: 'millionTokens', }, @@ -188,8 +188,8 @@ const xaiChatModels: AIChatModelCard[] = [ name: 'textOutput', strategy: 'tiered', tiers: [ - { rate: 6, upTo: 200_000 }, - { rate: 12, upTo: 'infinity' }, + { rate: 2.5, upTo: 0.2 }, + { rate: 5, upTo: 'infinity' }, ], unit: 'millionTokens', }, diff --git a/packages/model-bank/src/aiModels/xiaomimimo.ts b/packages/model-bank/src/aiModels/xiaomimimo.ts index 07c2615f09..6c5e1f0ddd 100644 --- a/packages/model-bank/src/aiModels/xiaomimimo.ts +++ b/packages/model-bank/src/aiModels/xiaomimimo.ts @@ -18,33 +18,9 @@ const xiaomimimoChatModels: AIChatModelCard[] = [ pricing: { currency: 'CNY', units: [ - { - name: 'textInput_cacheRead', - strategy: 'tiered', - tiers: [ - { rate: 1.4, upTo: 0.256 }, - { rate: 2.8, upTo: 'infinity' }, - ], - unit: 'millionTokens', - }, - { - name: 'textInput', - strategy: 'tiered', - tiers: [ - { rate: 7, upTo: 0.256 }, - { rate: 14, upTo: 'infinity' }, - ], - unit: 'millionTokens', - }, - { - name: 'textOutput', - strategy: 'tiered', - tiers: [ - { rate: 21, upTo: 0.256 }, - { rate: 42, upTo: 'infinity' }, - ], - unit: 'millionTokens', - }, + { name: 'textInput_cacheRead', rate: 0.025, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 3, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 6, strategy: 'fixed', unit: 'millionTokens' }, ], }, releasedAt: '2026-04-22', @@ -73,33 +49,9 @@ const xiaomimimoChatModels: AIChatModelCard[] = [ pricing: { currency: 'CNY', units: [ - { - name: 'textInput_cacheRead', - strategy: 'tiered', - tiers: [ - { rate: 0.56, upTo: 0.256 }, - { rate: 1.12, upTo: 'infinity' }, - ], - unit: 'millionTokens', - }, - { - name: 'textInput', - strategy: 'tiered', - tiers: [ - { rate: 2.8, upTo: 0.256 }, - { rate: 5.6, upTo: 'infinity' }, - ], - unit: 'millionTokens', - }, - { - name: 'textOutput', - strategy: 'tiered', - tiers: [ - { rate: 14, upTo: 0.256 }, - { rate: 28, upTo: 'infinity' }, - ], - unit: 'millionTokens', - }, + { name: 'textInput_cacheRead', rate: 0.02, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 1, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 2, strategy: 'fixed', unit: 'millionTokens' }, ], }, releasedAt: '2026-04-22', @@ -120,7 +72,6 @@ const xiaomimimoChatModels: AIChatModelCard[] = [ description: 'MiMo-V2-Flash is now officially open source! This is a MoE (Mixture-of-Experts) model purpose-built for extreme inference efficiency, with 309B total parameters (15B activated). Through innovations in a hybrid attention architecture and multi-layer MTP inference acceleration, it ranks among the global Top 2 open-source models across multiple agent benchmarking suites. Its coding capabilities surpass all open-source models and rival leading closed-source models such as Claude 4.5 Sonnet, while incurring only 2.5% of the inference cost and delivering 2× faster generation speed—pushing large-model inference efficiency to the limit.', displayName: 'MiMo-V2 Flash', - enabled: true, id: 'mimo-v2-flash', maxOutput: 65_536, pricing: { diff --git a/packages/model-bank/src/aiModels/zenmux.ts b/packages/model-bank/src/aiModels/zenmux.ts index 5f20852348..de0d28b05f 100644 --- a/packages/model-bank/src/aiModels/zenmux.ts +++ b/packages/model-bank/src/aiModels/zenmux.ts @@ -267,80 +267,6 @@ const zenmuxChatModels: AIChatModelCard[] = [ reasoning: true, vision: true, }, - contextWindowTokens: 2_000_000, - description: - 'Grok 4 Fast is xAI’s high-throughput, low-cost model (supports a 2M context window), ideal for high-concurrency and long-context use cases.', - displayName: 'Grok 4 Fast', - id: 'x-ai/grok-4-fast', - maxOutput: 30_000, - pricing: { - units: [ - { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 0.5, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - type: 'chat', - }, - { - abilities: { - functionCall: true, - vision: true, - }, - contextWindowTokens: 2_000_000, - description: - 'Grok 4 Fast (Non-Reasoning) is xAI’s high-throughput, low-cost multimodal model (supports a 2M context window) for scenarios sensitive to latency and cost that do not require in-model reasoning. It sits alongside the reasoning version of Grok 4 Fast, and reasoning can be enabled via the API reasoning parameter when needed. Prompts and completions may be used by xAI or OpenRouter to improve future models.', - displayName: 'Grok 4 Fast (Non-Reasoning)', - id: 'x-ai/grok-4-fast-non-reasoning', - maxOutput: 30_000, - pricing: { - units: [ - { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 0.5, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - type: 'chat', - }, - { - abilities: { - reasoning: true, - }, - contextWindowTokens: 256_000, - description: - "Grok 4 is xAI's flagship reasoning model with strong reasoning and multimodal capability.", - displayName: 'Grok 4', - id: 'x-ai/grok-4', - maxOutput: 256_000, - pricing: { - units: [ - { name: 'textInput', rate: 3, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 15, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - type: 'chat', - }, - { - abilities: { - reasoning: true, - }, - contextWindowTokens: 256_000, - description: - "Grok Code Fast 1 is xAI's fast code model with readable, engineering-friendly output.", - displayName: 'Grok Code Fast 1', - id: 'x-ai/grok-code-fast-1', - maxOutput: 10_000, - pricing: { - units: [ - { name: 'textInput', rate: 0.2, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'textOutput', rate: 1.5, strategy: 'fixed', unit: 'millionTokens' }, - ], - }, - type: 'chat', - }, - { - abilities: { - reasoning: true, - vision: true, - }, contextWindowTokens: 128_000, description: 'ERNIE 5.0 Thinking Preview is Baidu’s next-generation native multimodal ERNIE model, strong in multimodal understanding, instruction following, creation, factual Q&A, and tool calling.', diff --git a/packages/model-bank/src/aiModels/zhipu.ts b/packages/model-bank/src/aiModels/zhipu.ts index 27daefb561..2259dfc98a 100644 --- a/packages/model-bank/src/aiModels/zhipu.ts +++ b/packages/model-bank/src/aiModels/zhipu.ts @@ -69,6 +69,68 @@ const zhipuChatModels: AIChatModelCard[] = [ }, type: 'chat', }, + { + abilities: { + functionCall: true, + reasoning: true, + search: true, + video: true, + vision: true, + }, + contextWindowTokens: 200_000, + description: + 'GLM-5V-Turbo is Zhipu’s multimodal Coding foundation model for visual programming tasks. It natively handles images, video, text, and files, and is optimized for long-horizon planning, complex coding, and agent execution in multimodal workflows.', + displayName: 'GLM-5V-Turbo', + enabled: true, + id: 'glm-5v-turbo', + maxOutput: 131_072, + pricing: { + currency: 'CNY', + units: [ + { + lookup: { + prices: { + '[0, 0.032]': 1.2, + '[0.032, infinity]': 1.8, + }, + pricingParams: ['textInput'], + }, + name: 'textInput_cacheRead', + strategy: 'lookup', + unit: 'millionTokens', + }, + { + lookup: { + prices: { + '[0, 0.032]': 5, + '[0.032, infinity]': 7, + }, + pricingParams: ['textInput'], + }, + name: 'textInput', + strategy: 'lookup', + unit: 'millionTokens', + }, + { + lookup: { + prices: { + '[0, 0.032]': 22, + '[0.032, infinity]': 26, + }, + pricingParams: ['textInput'], + }, + name: 'textOutput', + strategy: 'lookup', + unit: 'millionTokens', + }, + ], + }, + settings: { + extendParams: ['enableReasoning'], + searchImpl: 'params', + }, + type: 'chat', + }, { abilities: { functionCall: true, @@ -185,7 +247,7 @@ const zhipuChatModels: AIChatModelCard[] = [ }, releasedAt: '2026-02-12', settings: { - extendParams: ['enableReasoning'], + extendParams: ['enableReasoning', 'preserveThinking'], searchImpl: 'params', }, type: 'chat', @@ -248,7 +310,7 @@ const zhipuChatModels: AIChatModelCard[] = [ }, releasedAt: '2025-12-22', settings: { - extendParams: ['enableReasoning'], + extendParams: ['enableReasoning', 'preserveThinking'], searchImpl: 'params', }, type: 'chat', @@ -308,69 +370,6 @@ const zhipuChatModels: AIChatModelCard[] = [ }, type: 'chat', }, - { - abilities: { - functionCall: true, - reasoning: true, - search: true, - video: true, - vision: true, - }, - contextWindowTokens: 200_000, - description: - 'GLM-5V-Turbo is Zhipu’s first multimodal coding foundation model, designed for visual programming tasks. It can natively process multimodal inputs such as images, videos, and text, while excelling in long-horizon planning, complex programming, and action execution. Deeply integrated with agent workflows, it can collaborate seamlessly with agents like Claude Code and OpenClaw to complete a full closed loop of “understanding the environment → planning actions → executing tasks.”', - displayName: 'GLM-5V-Turbo', - enabled: true, - id: 'glm-5v-turbo', - maxOutput: 131_072, - pricing: { - currency: 'CNY', - units: [ - { - lookup: { - prices: { - '[0, 0.032]': 1.2, - '[0.032, infinity]': 1.8, - }, - pricingParams: ['textInput'], - }, - name: 'textInput_cacheRead', - strategy: 'lookup', - unit: 'millionTokens', - }, - { - lookup: { - prices: { - '[0, 0.032]': 5, - '[0.032, infinity]': 7, - }, - pricingParams: ['textInput'], - }, - name: 'textInput', - strategy: 'lookup', - unit: 'millionTokens', - }, - { - lookup: { - prices: { - '[0, 0.032]': 22, - '[0.032, infinity]': 26, - }, - pricingParams: ['textInput'], - }, - name: 'textOutput', - strategy: 'lookup', - unit: 'millionTokens', - }, - ], - }, - releasedAt: '2026-04-02', - settings: { - extendParams: ['enableReasoning'], - searchImpl: 'params', - }, - type: 'chat', - }, { abilities: { functionCall: true, diff --git a/packages/model-bank/src/types/aiModel.ts b/packages/model-bank/src/types/aiModel.ts index ca24e83468..c083e6801d 100644 --- a/packages/model-bank/src/types/aiModel.ts +++ b/packages/model-bank/src/types/aiModel.ts @@ -257,6 +257,7 @@ export type ExtendParamsType = | 'reasoningBudgetToken32k' | 'reasoningBudgetToken80k' | 'enableReasoning' + | 'preserveThinking' | 'enableAdaptiveThinking' | 'disableContextCaching' | 'effort' @@ -307,6 +308,7 @@ export const ExtendParamsTypeSchema = z.enum([ 'reasoningBudgetToken32k', 'reasoningBudgetToken80k', 'enableReasoning', + 'preserveThinking', 'enableAdaptiveThinking', 'disableContextCaching', 'effort', diff --git a/packages/model-runtime/src/core/openaiCompatibleFactory/index.ts b/packages/model-runtime/src/core/openaiCompatibleFactory/index.ts index 32c080e51f..eea4a0cbab 100644 --- a/packages/model-runtime/src/core/openaiCompatibleFactory/index.ts +++ b/packages/model-runtime/src/core/openaiCompatibleFactory/index.ts @@ -575,21 +575,32 @@ export const createOpenAICompatibleRuntime = = an // Apply sampling sanitization to processedPayload for the custom client path. // We use processedPayload (ChatStreamPayload type) here because // createChatCompletionStream expects ChatStreamPayload, not the OpenAI SDK format. + // Strip LobeHub-internal fields that should never reach downstream APIs. + const { + apiMode: _apiMode, + preserveThinking: _preserveThinking, + ...cleanProcessedPayload + } = processedPayload as any; response = customClient.createChatCompletionStream( this.client, { - ...processedPayload, - ...resolveModelSamplingParameters(processedPayload.model, processedPayload, { - normalizeTemperature: false, - preferTemperature: true, - }), + ...cleanProcessedPayload, + ...resolveModelSamplingParameters( + cleanProcessedPayload.model, + cleanProcessedPayload, + { + normalizeTemperature: false, + preferTemperature: true, + }, + ), }, this, ) as any; } else { - // Remove internal apiMode parameter before sending to API - - const { apiMode: _, ...cleanedPayload } = postPayload as any; + // Remove LobeHub-internal fields before sending to downstream API. + // `preserveThinking` is only consumed by Qwen/Zhipu handlePayload (which runs above) + // and must not leak to other providers' APIs as an unknown parameter. + const { apiMode: _, preserveThinking: _pt, ...cleanedPayload } = postPayload as any; const finalPayload = { ...cleanedPayload, messages, @@ -1214,6 +1225,7 @@ export const createOpenAICompatibleRuntime = = an delete res.apiMode; delete res.frequency_penalty; delete res.presence_penalty; + delete res.preserveThinking; const input = await convertOpenAIResponseInputs(messages as any, { forceImageBase64: chatCompletion?.forceImageBase64, diff --git a/packages/model-runtime/src/providers/azureOpenai/index.ts b/packages/model-runtime/src/providers/azureOpenai/index.ts index fabb4a556f..28d0559891 100644 --- a/packages/model-runtime/src/providers/azureOpenai/index.ts +++ b/packages/model-runtime/src/providers/azureOpenai/index.ts @@ -72,7 +72,7 @@ const maskSensitiveUrl = (url: string) => { const BaseAzureOpenAI = createOpenAICompatibleRuntime({ chatCompletion: { handlePayload: (payload) => { - const { deploymentName, enabledSearch, model, ...rest } = payload; + const { deploymentName, enabledSearch, model, preserveThinking: _preserveThinking, ...rest } = payload; const requestModel = deploymentName ?? model; if (responsesAPIModels.has(model) || enabledSearch) { @@ -134,7 +134,15 @@ const BaseAzureOpenAI = createOpenAICompatibleRuntime({ provider: ModelProvider.Azure, responses: { handlePayload: (payload) => { - const { deploymentName, enabledSearch, model, tools, verbosity, ...rest } = payload; + const { + deploymentName, + enabledSearch, + model, + preserveThinking: _preserveThinking, + tools, + verbosity, + ...rest + } = payload; const requestModel = deploymentName ?? model; const updatedMessages = transformAzureSystemMessages(payload.messages, model); const azureTools = appendAzureSearchTool(tools, enabledSearch); diff --git a/packages/model-runtime/src/providers/azureai/index.ts b/packages/model-runtime/src/providers/azureai/index.ts index f33cb01cca..06be5d6b2f 100644 --- a/packages/model-runtime/src/providers/azureai/index.ts +++ b/packages/model-runtime/src/providers/azureai/index.ts @@ -40,7 +40,15 @@ export class LobeAzureAI implements LobeRuntimeAI { async chat(payload: ChatStreamPayload, options?: ChatMethodOptions) { // Remove internal apiMode parameter to prevent sending to Azure AI API - const { messages, model, temperature, top_p, apiMode: _, ...params } = payload; + const { + messages, + model, + temperature, + top_p, + apiMode: _, + preserveThinking: _pt, + ...params + } = payload; // o1 series models on Azure OpenAI does not support streaming currently const enableStreaming = model.includes('o1') ? false : (params.stream ?? true); diff --git a/packages/model-runtime/src/providers/githubCopilot/index.ts b/packages/model-runtime/src/providers/githubCopilot/index.ts index 1ac15067d5..65b9b0c29f 100644 --- a/packages/model-runtime/src/providers/githubCopilot/index.ts +++ b/packages/model-runtime/src/providers/githubCopilot/index.ts @@ -271,6 +271,7 @@ export class LobeGithubCopilotAI implements LobeRuntimeAI { reasoning, max_tokens, verbosity, + preserveThinking: _pt, ...responseRest } = rest as any; @@ -350,7 +351,7 @@ export class LobeGithubCopilotAI implements LobeRuntimeAI { ); } - const { apiMode: _, ...cleanedRest } = rest as any; + const { apiMode: _, preserveThinking: _pt, ...cleanedRest } = rest as any; const messages = await convertOpenAIMessages(cleanedRest.messages as any, { forceImageBase64: true, }); diff --git a/packages/model-runtime/src/providers/qwen/index.test.ts b/packages/model-runtime/src/providers/qwen/index.test.ts index 7edbfe713c..bf684bae9a 100644 --- a/packages/model-runtime/src/providers/qwen/index.test.ts +++ b/packages/model-runtime/src/providers/qwen/index.test.ts @@ -4,7 +4,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; import type { LobeOpenAICompatibleRuntime } from '../../core/BaseAI'; import { testProvider } from '../../providerTestUtils'; -import { LobeQwenAI } from './index'; +import { LobeQwenAI, params } from './index'; const provider = ModelProvider.Qwen; const defaultBaseURL = 'https://dashscope.aliyuncs.com/compatible-mode/v1'; @@ -98,4 +98,117 @@ describe('LobeQwenAI - custom features', () => { expect(calledPayload.thinking_budget).toBe(4096); }); }); + + describe('preserve thinking mapping', () => { + it('should map preserveThinking to preserve_thinking for qwen3.6-plus', () => { + const payload = { + messages: [ + { content: 'hello', role: 'user' }, + { + content: 'answer', + reasoning: { content: 'reasoning content' }, + role: 'assistant', + }, + ], + model: 'qwen3.6-plus', + preserveThinking: true, + } as any; + + const result = params.chatCompletion!.handlePayload!(payload); + + expect(result.preserve_thinking).toBe(true); + expect(result.messages).toEqual([ + { content: 'hello', role: 'user' }, + { + content: 'answer', + reasoning_content: 'reasoning content', + role: 'assistant', + }, + ]); + }); + + it('should set preserve_thinking=false when explicitly disabled on supported model', () => { + const payload = { + messages: [{ content: 'hello', role: 'user' }], + model: 'qwen3.6-plus', + preserveThinking: false, + } as any; + + const result = params.chatCompletion!.handlePayload!(payload); + + expect(result.preserve_thinking).toBe(false); + }); + + it('should map preserveThinking for deployment-name aliases when caller provides the param', () => { + const payload = { + messages: [ + { + content: 'answer', + reasoning: { content: 'reasoning content' }, + role: 'assistant', + }, + ], + model: 'my-qwen3.6-plus-deployment', + preserveThinking: true, + } as any; + + const result = params.chatCompletion!.handlePayload!(payload); + + expect(result.preserve_thinking).toBe(true); + expect(result.messages).toEqual([ + { + content: 'answer', + reasoning_content: 'reasoning content', + role: 'assistant', + }, + ]); + }); + + it('should not set preserve_thinking when preserveThinking is absent but still keep reasoning_content', () => { + const payload = { + messages: [ + { + content: 'answer', + reasoning: { content: 'reasoning content' }, + role: 'assistant', + }, + ], + model: 'qwen3.5-plus', + } as any; + + const result = params.chatCompletion!.handlePayload!(payload); + + expect(result.preserve_thinking).toBeUndefined(); + expect(result.messages).toEqual([ + { + content: 'answer', + reasoning_content: 'reasoning content', + role: 'assistant', + }, + ]); + }); + + it('should keep caller-provided reasoning_content', () => { + const payload = { + messages: [ + { + content: 'answer', + reasoning_content: 'existing reasoning content', + role: 'assistant', + }, + ], + model: 'qwen3.5-plus', + } as any; + + const result = params.chatCompletion!.handlePayload!(payload); + + expect(result.messages).toEqual([ + { + content: 'answer', + reasoning_content: 'existing reasoning content', + role: 'assistant', + }, + ]); + }); + }); }); diff --git a/packages/model-runtime/src/providers/qwen/index.ts b/packages/model-runtime/src/providers/qwen/index.ts index ab01096acf..ea2aa03f25 100644 --- a/packages/model-runtime/src/providers/qwen/index.ts +++ b/packages/model-runtime/src/providers/qwen/index.ts @@ -1,5 +1,6 @@ import { ModelProvider } from 'model-bank'; +import type { OpenAICompatibleFactoryOptions } from '../../core/openaiCompatibleFactory'; import { createOpenAICompatibleRuntime } from '../../core/openaiCompatibleFactory'; import { resolveParameters } from '../../core/parameterResolver'; import { QwenAIStream } from '../../core/streams'; @@ -23,7 +24,7 @@ export const QwenLegacyModels = new Set([ 'qwen-1.8b-longcontext-chat', ]); -export const LobeQwenAI = createOpenAICompatibleRuntime({ +export const params = { baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1', chatCompletion: { handlePayload: (payload) => { @@ -35,6 +36,7 @@ export const LobeQwenAI = createOpenAICompatibleRuntime({ thinking, top_p, enabledSearch, + preserveThinking, ...rest } = payload; const isDeepSeekV4Model = model.startsWith('deepseek-v4'); @@ -54,31 +56,53 @@ export const LobeQwenAI = createOpenAICompatibleRuntime({ }, ); + const messages = (rest.messages || []).map((message: any) => { + const { reasoning, ...messageRest } = message; + + const reasoningContent = + typeof messageRest.reasoning_content === 'string' + ? messageRest.reasoning_content + : typeof reasoning?.content === 'string' + ? reasoning.content + : undefined; + + if (reasoningContent !== undefined) { + return { + ...messageRest, + reasoning_content: reasoningContent, + }; + } + + return messageRest; + }); + return { ...rest, ...(isDeepSeekV4Model ? { - ...(thinking?.type === 'enabled' || thinkingExplicitlyDisabled - ? { enable_thinking: !thinkingExplicitlyDisabled } - : {}), - ...(!thinkingExplicitlyDisabled && reasoning_effort && { reasoning_effort }), - } + ...(thinking?.type === 'enabled' || thinkingExplicitlyDisabled + ? { enable_thinking: !thinkingExplicitlyDisabled } + : {}), + ...(!thinkingExplicitlyDisabled && reasoning_effort && { reasoning_effort }), + } : model.includes('-thinking') ? { - enable_thinking: true, + enable_thinking: true, + thinking_budget: + thinking?.budget_tokens === 0 ? 0 : thinking?.budget_tokens || undefined, + } + : thinking + ? { + ...(thinking.type !== undefined && { + enable_thinking: thinking.type === 'enabled', + }), thinking_budget: thinking?.budget_tokens === 0 ? 0 : thinking?.budget_tokens || undefined, } - : thinking - ? { - ...(thinking.type !== undefined && { - enable_thinking: thinking.type === 'enabled', - }), - thinking_budget: - thinking?.budget_tokens === 0 ? 0 : thinking?.budget_tokens || undefined, - } : {}), + ...(typeof preserveThinking === 'boolean' && { preserve_thinking: preserveThinking }), frequency_penalty: undefined, + messages, model, presence_penalty: resolvedParams.presence_penalty, stream: true, @@ -118,4 +142,6 @@ export const LobeQwenAI = createOpenAICompatibleRuntime({ return processMultiProviderModelList(modelList, 'qwen'); }, provider: ModelProvider.Qwen, -}); +} satisfies OpenAICompatibleFactoryOptions; + +export const LobeQwenAI = createOpenAICompatibleRuntime(params); diff --git a/packages/model-runtime/src/providers/zhipu/index.test.ts b/packages/model-runtime/src/providers/zhipu/index.test.ts index 439190c7f7..ecc873fc34 100644 --- a/packages/model-runtime/src/providers/zhipu/index.test.ts +++ b/packages/model-runtime/src/providers/zhipu/index.test.ts @@ -407,6 +407,96 @@ describe('LobeZhipuAI - custom features', () => { }); }); + describe('preserve thinking mapping', () => { + it('should map preserveThinking=true to clear_thinking=false and convert reasoning content', () => { + const payload = { + messages: [ + { content: 'hello', role: 'user' }, + { + content: 'answer', + reasoning: { content: 'reasoning content' }, + role: 'assistant', + }, + ], + model: 'glm-5', + preserveThinking: true, + thinking: { budget_tokens: 1024, type: 'enabled' }, + } as any; + + const result = params.chatCompletion.handlePayload(payload); + + expect(result.thinking).toEqual({ clear_thinking: false, type: 'enabled' }); + expect(result.messages).toEqual([ + { content: 'hello', role: 'user' }, + { + content: 'answer', + reasoning_content: 'reasoning content', + role: 'assistant', + }, + ]); + }); + + it('should still convert reasoning to reasoning_content when preserveThinking is absent', () => { + const payload = { + messages: [ + { + content: 'answer', + reasoning: { content: 'reasoning content' }, + role: 'assistant', + }, + ], + model: 'glm-5', + } as any; + + const result = params.chatCompletion.handlePayload(payload); + + expect(result.thinking).toBeUndefined(); + expect(result.messages).toEqual([ + { + content: 'answer', + reasoning_content: 'reasoning content', + role: 'assistant', + }, + ]); + }); + + it('should map preserveThinking=false to clear_thinking=true', () => { + const payload = { + messages: [{ content: 'hello', role: 'user' }], + model: 'glm-4.7', + preserveThinking: false, + } as any; + + const result = params.chatCompletion.handlePayload(payload); + + expect(result.thinking).toEqual({ clear_thinking: true }); + }); + + it('should keep caller-provided reasoning_content', () => { + const payload = { + messages: [ + { + content: 'answer', + reasoning_content: 'existing reasoning content', + role: 'assistant', + }, + ], + model: 'glm-5', + preserveThinking: true, + } as any; + + const result = params.chatCompletion.handlePayload(payload); + + expect(result.messages).toEqual([ + { + content: 'answer', + reasoning_content: 'existing reasoning content', + role: 'assistant', + }, + ]); + }); + }); + describe('Preserve other payload properties', () => { it('should preserve all other properties', async () => { await instance.chat({ diff --git a/packages/model-runtime/src/providers/zhipu/index.ts b/packages/model-runtime/src/providers/zhipu/index.ts index 7b7b768f44..d4f635186e 100644 --- a/packages/model-runtime/src/providers/zhipu/index.ts +++ b/packages/model-runtime/src/providers/zhipu/index.ts @@ -26,6 +26,7 @@ export const params = { enabledSearch, max_tokens, model, + preserveThinking, stream, temperature, thinking, @@ -34,6 +35,35 @@ export const params = { ...rest } = payload; + const messages = (rest.messages || []).map((message: any) => { + const { reasoning, ...messageRest } = message; + + const reasoningContent = + typeof messageRest.reasoning_content === 'string' + ? messageRest.reasoning_content + : typeof reasoning?.content === 'string' + ? reasoning.content + : undefined; + + if (reasoningContent !== undefined) { + return { + ...messageRest, + reasoning_content: reasoningContent, + }; + } + + return messageRest; + }); + + const shouldSetClearThinking = typeof preserveThinking === 'boolean'; + const thinkingPayload = thinking ? { type: thinking.type } : undefined; + const resolvedThinking = shouldSetClearThinking + ? { + ...thinkingPayload, + clear_thinking: !preserveThinking, + } + : thinkingPayload; + const zhipuTools = enabledSearch ? [ ...(tools || []), @@ -78,9 +108,10 @@ export const params = { return { ...rest, ...resolvedParams, + messages, model, stream, - thinking: thinking ? { type: thinking.type } : undefined, + thinking: resolvedThinking, tool_stream: stream && /^glm-(?:4\.(?:6|7)|5(?:\.1)?)$/.test(model) ? true : undefined, tools: zhipuTools, } as any; diff --git a/packages/model-runtime/src/types/chat.ts b/packages/model-runtime/src/types/chat.ts index 7a48cb0d4d..e9c6304d9f 100644 --- a/packages/model-runtime/src/types/chat.ts +++ b/packages/model-runtime/src/types/chat.ts @@ -51,6 +51,7 @@ export interface OpenAIChatMessage { content?: string; duration?: number; }; + reasoning_content?: string; role: LLMRoleType; tool_call_id?: string; tool_calls?: MessageToolCall[]; @@ -123,6 +124,7 @@ export interface ChatStreamPayload { * @default 0 */ presence_penalty?: number; + preserveThinking?: boolean; provider?: string; reasoning?: { effort?: string; diff --git a/packages/model-runtime/src/utils/modelParse.ts b/packages/model-runtime/src/utils/modelParse.ts index 8bd0903d59..03c6240e5d 100644 --- a/packages/model-runtime/src/utils/modelParse.ts +++ b/packages/model-runtime/src/utils/modelParse.ts @@ -104,7 +104,7 @@ export const MODEL_LIST_CONFIGS = { 'qwen3', ], reasoningKeywords: ['qvq', 'qwq', 'qwen3', '!-instruct-', '!-coder-'], - visionKeywords: ['qvq', '-vl', '-omni'], + visionKeywords: ['qvq', '-vl', '-omni', 'qwen3.'], }, replicate: { imageOutputKeywords: [ diff --git a/packages/types/src/agent/chatConfig.ts b/packages/types/src/agent/chatConfig.ts index 4399877a96..ec59507629 100644 --- a/packages/types/src/agent/chatConfig.ts +++ b/packages/types/src/agent/chatConfig.ts @@ -113,6 +113,12 @@ export interface LobeAgentChatConfig extends AgentMemoryChatConfig, AgentSelfIte * Effort level for Claude Opus 4.7 and later (adds xhigh tier between high and max) */ opus47Effort?: 'low' | 'medium' | 'high' | 'xhigh' | 'max'; + + /** + * Whether to preserve and pass historical thinking content to the model + * (provider support required, e.g. Qwen preserve_thinking) + */ + preserveThinking?: boolean; reasoningBudgetToken?: number; /** * Reasoning budget token for models with 32k max (GLM-5/GLM-4.7) @@ -238,6 +244,7 @@ export const AgentChatConfigSchema = z imageResolution2: z.enum(['512', '1K', '2K', '4K']).optional(), opus47Effort: z.enum(['low', 'medium', 'high', 'xhigh', 'max']).optional(), runtimeEnv: RuntimeEnvConfigSchema.optional(), + preserveThinking: z.boolean().optional(), reasoningBudgetToken: z.number().optional(), reasoningBudgetToken32k: z.number().optional(), reasoningBudgetToken80k: z.number().optional(), diff --git a/packages/types/src/openai/chat.ts b/packages/types/src/openai/chat.ts index c05c44626b..f736aae3a3 100644 --- a/packages/types/src/openai/chat.ts +++ b/packages/types/src/openai/chat.ts @@ -55,6 +55,11 @@ export interface OpenAIChatMessage { */ function_call?: OpenAIFunctionCall; name?: string; + reasoning?: { + content?: string; + duration?: number; + }; + reasoning_content?: string; /** * Role * @description Role of the message sender @@ -102,6 +107,7 @@ export interface ChatStreamPayload { * @default 0 */ presence_penalty?: number; + preserveThinking?: boolean; /** * @default openai */ diff --git a/src/features/ModelSwitchPanel/components/ControlsForm/ControlsForm.tsx b/src/features/ModelSwitchPanel/components/ControlsForm/ControlsForm.tsx index 76eeecefbc..e6949a5773 100644 --- a/src/features/ModelSwitchPanel/components/ControlsForm/ControlsForm.tsx +++ b/src/features/ModelSwitchPanel/components/ControlsForm/ControlsForm.tsx @@ -146,6 +146,18 @@ const ControlsForm = memo( minWidth: undefined, name: 'enableReasoning', }, + { + children: , + desc: isNarrow ? ( + {t('extendParams.preserveThinking.desc')} + ) : ( + t('extendParams.preserveThinking.desc') + ), + label: t('extendParams.preserveThinking.title'), + layout: isNarrow ? 'vertical' : 'horizontal', + minWidth: undefined, + name: 'preserveThinking', + }, { children: , desc: isNarrow ? ( diff --git a/src/locales/default/chat.ts b/src/locales/default/chat.ts index cd3b3cdc20..d48ebc016f 100644 --- a/src/locales/default/chat.ts +++ b/src/locales/default/chat.ts @@ -98,6 +98,9 @@ export default { 'extendParams.enableReasoning.desc': 'Let the model reason before answering. Use it for complex tasks.', 'extendParams.enableReasoning.title': 'Enable Deep Thinking', + 'extendParams.preserveThinking.desc': + 'When enabled, assistant historical reasoning will be sent back as context for models. This may increase token usage.', + 'extendParams.preserveThinking.title': 'Preserve Historical Thinking', 'extendParams.imageAspectRatio.title': 'Image Aspect Ratio', 'extendParams.imageResolution.title': 'Image Resolution', 'extendParams.reasoningBudgetToken.title': 'Thinking Consumption Token', diff --git a/src/locales/default/modelProvider.ts b/src/locales/default/modelProvider.ts index df4743f09b..9568954dbe 100644 --- a/src/locales/default/modelProvider.ts +++ b/src/locales/default/modelProvider.ts @@ -283,6 +283,8 @@ export default { 'For Gemini 3.1 Flash Image models; controls resolution of generated images (supports 512px).', 'providerModels.item.modelConfig.extendParams.options.opus47Effort.hint': 'For Claude Opus 4.7 and later; controls effort level (low/medium/high/xhigh/max).', + 'providerModels.item.modelConfig.extendParams.options.preserveThinking.hint': + 'For Qwen3.6 Plus, GLM-5 and GLM-4.7; sends historical assistant reasoning back to model context (preserve_thinking / clear_thinking=false).', 'providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken.hint': 'For Claude, Qwen3 and similar; controls token budget for reasoning.', 'providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken32k.hint': diff --git a/src/routes/(main)/settings/provider/features/ModelList/CreateNewModelModal/ExtendParamsSelect.tsx b/src/routes/(main)/settings/provider/features/ModelList/CreateNewModelModal/ExtendParamsSelect.tsx index 1079341ae0..7c9218c459 100644 --- a/src/routes/(main)/settings/provider/features/ModelList/CreateNewModelModal/ExtendParamsSelect.tsx +++ b/src/routes/(main)/settings/provider/features/ModelList/CreateNewModelModal/ExtendParamsSelect.tsx @@ -51,6 +51,10 @@ const EXTEND_PARAMS_OPTIONS: ExtendParamsOption[] = [ hintKey: 'providerModels.item.modelConfig.extendParams.options.enableAdaptiveThinking.hint', key: 'enableAdaptiveThinking', }, + { + hintKey: 'providerModels.item.modelConfig.extendParams.options.preserveThinking.hint', + key: 'preserveThinking', + }, { hintKey: 'providerModels.item.modelConfig.extendParams.options.reasoningBudgetToken.hint', key: 'reasoningBudgetToken', @@ -178,6 +182,7 @@ const TITLE_KEY_ALIASES: Partial> = { grok4_3ReasoningEffort: 'reasoningEffort', hy3ReasoningEffort: 'reasoningEffort', imageAspectRatio2: 'imageAspectRatio', + imageResolution2: 'imageResolution', opus47Effort: 'effort', reasoningBudgetToken32k: 'reasoningBudgetToken', reasoningBudgetToken80k: 'reasoningBudgetToken', @@ -241,6 +246,11 @@ const PREVIEW_META: Partial> = { imageResolution: { labelSuffix: '', previewWidth: 250, tag: 'resolution' }, imageResolution2: { labelSuffix: ' (512px+)', previewWidth: 280, tag: 'resolution' }, opus47Effort: { labelSuffix: ' (Opus 4.7+)', previewWidth: 280, tag: 'output_config.effort' }, + preserveThinking: { + labelSuffix: ' (Qwen3.6+ / GLM-4.7+)', + previewWidth: 460, + tag: 'preserve_thinking', + }, reasoningBudgetToken: { previewWidth: 350, tag: 'thinking.budget_tokens' }, reasoningBudgetToken32k: { labelSuffix: ' (32k)', @@ -383,6 +393,7 @@ const ExtendParamsSelect = memo(({ value, onChange }) = effort: , enableAdaptiveThinking: , enableReasoning: , + preserveThinking: , gpt5ReasoningEffort: , gpt5_1ReasoningEffort: , gpt5_2ProReasoningEffort: , diff --git a/src/server/modules/AgentRuntime/RuntimeExecutors.ts b/src/server/modules/AgentRuntime/RuntimeExecutors.ts index 03aa02b107..905a2db2d1 100644 --- a/src/server/modules/AgentRuntime/RuntimeExecutors.ts +++ b/src/server/modules/AgentRuntime/RuntimeExecutors.ts @@ -660,6 +660,8 @@ export const createRuntimeExecutors = ( try { type ContentPart = { text: string; type: 'text' } | { image: string; type: 'image' }; + let shouldPersistAssistantReasoning = false; + let preserveThinkingForPayload: boolean | undefined; // Process messages through serverMessagesEngine to inject system role, knowledge, etc. // Rebuild params from agentConfig at execution time (capabilities built dynamically) @@ -669,6 +671,41 @@ export const createRuntimeExecutors = ( const { loadModels } = await import('@/business/client/model-bank/loadModels'); const builtinModels = await loadModels(); + const preserveThinkingConfigured = + typeof agentConfig.chatConfig?.preserveThinking === 'boolean' + ? agentConfig.chatConfig.preserveThinking + : undefined; + const preserveThinkingRequested = preserveThinkingConfigured === true; + + const modelCard = builtinModels.find( + (item) => + item.providerId === provider && + (item.id === model || item.config?.deploymentName === model), + ); + const modelExtendParams = + modelCard && + 'settings' in modelCard && + modelCard.settings && + typeof modelCard.settings === 'object' && + 'extendParams' in modelCard.settings + ? (modelCard.settings as { extendParams?: string[] }).extendParams + : undefined; + + const modelSupportsPreserveThinkingFromCard = + Array.isArray(modelExtendParams) && modelExtendParams.includes('preserveThinking'); + const providerSupportsPreserveThinkingFallback = + provider === 'qwen' || provider === 'zhipu'; + const modelSupportsPreserveThinking = + modelSupportsPreserveThinkingFromCard || + (!modelCard && providerSupportsPreserveThinkingFallback); + + shouldPersistAssistantReasoning = + preserveThinkingRequested && modelSupportsPreserveThinking; + preserveThinkingForPayload = + modelSupportsPreserveThinking && typeof preserveThinkingConfigured === 'boolean' + ? preserveThinkingConfigured + : undefined; + // Extract tags from messages and fetch summaries. // Skip if messages already contain injected topic_reference_context // (e.g., from client-side contextEngineering preprocessing) to avoid double injection. @@ -1102,7 +1139,15 @@ export const createRuntimeExecutors = ( // Construct ChatStreamPayload const stream = ctx.stream ?? true; - const chatPayload = { messages: processedMessages, model, stream, tools }; + const chatPayload = { + messages: processedMessages, + model, + stream, + tools, + ...(typeof preserveThinkingForPayload === 'boolean' && { + preserveThinking: preserveThinkingForPayload, + }), + }; // Buffer: accumulate text and reasoning, send every 50ms const BUFFER_INTERVAL = 50; @@ -1594,6 +1639,10 @@ export const createRuntimeExecutors = ( }; } + const persistedReasoning = shouldPersistAssistantReasoning + ? finalReasoning + : undefined; + try { // Build metadata object const metadata: Record = {}; @@ -1626,7 +1675,7 @@ export const createRuntimeExecutors = ( content: finalContent, imageList: imageList.length > 0 ? imageList : undefined, metadata: Object.keys(metadata).length > 0 ? metadata : undefined, - reasoning: finalReasoning, + reasoning: persistedReasoning, search: grounding, tools: persistedTools, }); @@ -1659,7 +1708,7 @@ export const createRuntimeExecutors = ( newState.messages.push({ content, id: assistantMessageItem.id, - reasoning: finalReasoning, + reasoning: persistedReasoning, role: 'assistant', tool_calls: stateToolCalls, }); diff --git a/src/server/modules/AgentRuntime/__tests__/RuntimeExecutors.test.ts b/src/server/modules/AgentRuntime/__tests__/RuntimeExecutors.test.ts index ef51d7da5d..cf51ec6c25 100644 --- a/src/server/modules/AgentRuntime/__tests__/RuntimeExecutors.test.ts +++ b/src/server/modules/AgentRuntime/__tests__/RuntimeExecutors.test.ts @@ -16,6 +16,12 @@ const mockBuiltinModels = vi.hoisted(() => [ id: 'gpt-4', providerId: 'openai', }, + { + abilities: { functionCall: true, video: true, vision: true }, + id: 'qwen3.6-plus', + providerId: 'qwen', + settings: { extendParams: ['preserveThinking'] }, + }, { abilities: { functionCall: false, video: false, vision: false }, id: 'no-tools-model', @@ -25,6 +31,7 @@ const mockBuiltinModels = vi.hoisted(() => [ abilities: { functionCall: true, video: true, vision: true }, id: 'gemini-3.1-flash-lite-preview', providerId: 'google', + settings: { extendParams: ['preserveThinking'] }, }, ]); @@ -100,12 +107,21 @@ describe('RuntimeExecutors', () => { beforeEach(() => { vi.clearAllMocks(); + vi.mocked(initModelRuntimeFromDB).mockReset(); + mockCreateCompressionGroup.mockReset(); + mockFinalizeCompression.mockReset(); mockCreateCompressionGroup.mockResolvedValue({ messageGroupId: 'group-123', messagesToSummarize: [], success: true, }); mockFinalizeCompression.mockResolvedValue({ success: true }); + vi.mocked(initModelRuntimeFromDB).mockResolvedValue({ + chat: vi.fn().mockImplementation(async (_payload: any, options: any) => { + await options?.callback?.onText?.('done'); + return new Response('done'); + }), + } as any); mockMessageModel = { create: vi.fn().mockResolvedValue({ id: 'msg-123' }), @@ -392,52 +408,233 @@ describe('RuntimeExecutors', () => { ); }); - it('should preserve reasoning in newState when assistant returns tool calls', async () => { - const toolCallPayload = [ - { - function: { arguments: '{}', name: 'search' }, - id: 'call_1', - type: 'function', - }, - ]; + describe('reasoning persistence gate', () => { + it('should persist assistant reasoning with tool calls when preserveThinking is enabled on a supported model', async () => { + const toolCallPayload = [ + { + function: { arguments: '{}', name: 'search' }, + id: 'call_1', + type: 'function', + }, + ]; - const mockChat = vi.fn().mockImplementation(async (_payload, options) => { - await options?.callback?.onThinking?.('Need to inspect the search results first.'); - await options?.callback?.onToolsCalling?.({ toolsCalling: toolCallPayload }); - await options?.callback?.onCompletion?.({ - usage: { - totalInputTokens: 1, - totalOutputTokens: 2, - totalTokens: 3, + const mockChat = vi.fn().mockImplementation(async (_payload, options) => { + await options?.callback?.onThinking?.('Need to inspect the search results first.'); + await options?.callback?.onToolsCalling?.({ toolsCalling: toolCallPayload }); + await options?.callback?.onCompletion?.({ + usage: { + totalInputTokens: 1, + totalOutputTokens: 2, + totalTokens: 3, + }, + }); + return new Response('done'); + }); + vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any); + + const ctxWithConfig: RuntimeExecutorContext = { + ...ctx, + agentConfig: { + chatConfig: { preserveThinking: true }, + plugins: [], + systemRole: 'test', + }, + }; + + const executors = createRuntimeExecutors(ctxWithConfig); + const state = createMockState({ + modelRuntimeConfig: { + model: 'qwen3.6-plus', + provider: 'qwen', }, }); - return new Response('done'); + + const instruction = { + payload: { + messages: [{ content: 'Hello', role: 'user' }], + model: 'qwen3.6-plus', + provider: 'qwen', + tools: [], + }, + type: 'call_llm' as const, + }; + + const result = await executors.call_llm!(instruction, state); + + expect(result.newState.messages.at(-1)).toEqual( + expect.objectContaining({ + reasoning: { content: 'Need to inspect the search results first.' }, + role: 'assistant', + tool_calls: [expect.objectContaining({ id: 'call_1' })], + }), + ); + expect(mockChat).toHaveBeenCalledWith( + expect.objectContaining({ preserveThinking: true }), + expect.anything(), + ); }); - vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any); - const executors = createRuntimeExecutors(ctx); - const state = createMockState(); + it('should not persist assistant reasoning when preserveThinking is not enabled', async () => { + const mockChat = vi.fn().mockImplementation(async (_payload, options) => { + await options?.callback?.onThinking?.('hidden reasoning'); + await options?.callback?.onText?.('answer'); + return new Response('done'); + }); + vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any); - const instruction = { - payload: { - messages: [{ content: 'Hello', role: 'user' }], - model: 'gpt-4', - provider: 'openai', - tools: [], - }, - type: 'call_llm' as const, - }; + const executors = createRuntimeExecutors(ctx); + const state = createMockState(); - const result = await executors.call_llm!(instruction, state); + const instruction = { + payload: { + messages: [{ content: 'Hello', role: 'user' }], + model: 'gpt-4', + provider: 'openai', + }, + type: 'call_llm' as const, + }; - expect(result.newState.messages.at(-1)).toEqual( - expect.objectContaining({ - id: 'msg-123', - reasoning: { content: 'Need to inspect the search results first.' }, - role: 'assistant', - tool_calls: [expect.objectContaining({ id: 'call_1' })], - }), - ); + const result = await executors.call_llm!(instruction, state); + const assistant = result.newState.messages.at(-1) as any; + + expect(assistant.reasoning).toBeUndefined(); + }); + + it('should persist assistant reasoning when preserveThinking is enabled on a supported model', async () => { + const mockChat = vi.fn().mockImplementation(async (_payload, options) => { + await options?.callback?.onThinking?.('preserved reasoning'); + await options?.callback?.onText?.('answer'); + return new Response('done'); + }); + vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any); + + const ctxWithConfig: RuntimeExecutorContext = { + ...ctx, + agentConfig: { + chatConfig: { preserveThinking: true }, + plugins: [], + systemRole: 'test', + }, + }; + + const executors = createRuntimeExecutors(ctxWithConfig); + const state = createMockState({ + modelRuntimeConfig: { + model: 'qwen3.6-plus', + provider: 'qwen', + }, + }); + + const instruction = { + payload: { + messages: [{ content: 'Hello', role: 'user' }], + model: 'qwen3.6-plus', + provider: 'qwen', + }, + type: 'call_llm' as const, + }; + + const result = await executors.call_llm!(instruction, state); + const assistant = result.newState.messages.at(-1) as any; + + expect(assistant.reasoning).toEqual({ + content: 'preserved reasoning', + }); + expect(mockChat).toHaveBeenCalledWith( + expect.objectContaining({ preserveThinking: true }), + expect.anything(), + ); + }); + + it('should persist reasoning for unknown custom deployments on supported providers', async () => { + const mockChat = vi.fn().mockImplementation(async (_payload, options) => { + await options?.callback?.onThinking?.('custom deployment reasoning'); + await options?.callback?.onText?.('answer'); + return new Response('done'); + }); + vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any); + + const ctxWithConfig: RuntimeExecutorContext = { + ...ctx, + agentConfig: { + chatConfig: { preserveThinking: true }, + plugins: [], + systemRole: 'test', + }, + }; + + const executors = createRuntimeExecutors(ctxWithConfig); + const state = createMockState({ + modelRuntimeConfig: { + model: 'my-qwen-custom-deployment', + provider: 'qwen', + }, + }); + + const instruction = { + payload: { + messages: [{ content: 'Hello', role: 'user' }], + model: 'my-qwen-custom-deployment', + provider: 'qwen', + }, + type: 'call_llm' as const, + }; + + const result = await executors.call_llm!(instruction, state); + const assistant = result.newState.messages.at(-1) as any; + + expect(assistant.reasoning).toEqual({ + content: 'custom deployment reasoning', + }); + expect(mockChat).toHaveBeenCalledWith( + expect.objectContaining({ preserveThinking: true }), + expect.anything(), + ); + }); + + it('should not persist reasoning when model does not declare preserveThinking capability', async () => { + const mockChat = vi.fn().mockImplementation(async (_payload, options) => { + await options?.callback?.onThinking?.('reasoning that should not be saved'); + await options?.callback?.onText?.('answer'); + return new Response('done'); + }); + vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any); + + const ctxWithConfig: RuntimeExecutorContext = { + ...ctx, + agentConfig: { + chatConfig: { preserveThinking: true }, + plugins: [], + systemRole: 'test', + }, + }; + + const executors = createRuntimeExecutors(ctxWithConfig); + const state = createMockState({ + modelRuntimeConfig: { + model: 'gpt-4', + provider: 'openai', + }, + }); + + const instruction = { + payload: { + messages: [{ content: 'Hello', role: 'user' }], + model: 'gpt-4', + provider: 'openai', + }, + type: 'call_llm' as const, + }; + + const result = await executors.call_llm!(instruction, state); + const assistant = result.newState.messages.at(-1) as any; + + expect(assistant.reasoning).toBeUndefined(); + expect(mockChat).toHaveBeenCalledWith( + expect.not.objectContaining({ preserveThinking: expect.any(Boolean) }), + expect.anything(), + ); + }); }); it('retries empty completions on the branded provider then throws ModelEmptyError', async () => { @@ -574,7 +771,14 @@ describe('RuntimeExecutors', () => { }); vi.mocked(initModelRuntimeFromDB).mockResolvedValueOnce({ chat: mockChat } as any); - const executors = createRuntimeExecutors(ctx); + // Reasoning only lands in the finalized message when preserveThinking is + // enabled on a supported model; otherwise it is intentionally dropped. + // Enable it here so this still guards reasoning_part capture (not drop). + const ctxWithThinking: RuntimeExecutorContext = { + ...ctx, + agentConfig: { chatConfig: { preserveThinking: true }, plugins: [], systemRole: 'test' }, + }; + const executors = createRuntimeExecutors(ctxWithThinking); const result = await executors.call_llm!(geminiInstruction(), createMockState()); expect(result.newState.messages.at(-1)).toEqual( @@ -4032,7 +4236,7 @@ describe('RuntimeExecutors', () => { await vi.runOnlyPendingTimersAsync(); - const result = await resultPromise; + await resultPromise; expect(mockChat).toHaveBeenCalledTimes(2); expect(mockMessageModel.create).toHaveBeenCalledTimes(1); diff --git a/src/services/chat/mecha/modelParamsResolver.test.ts b/src/services/chat/mecha/modelParamsResolver.test.ts index 8808de8482..96f79644aa 100644 --- a/src/services/chat/mecha/modelParamsResolver.test.ts +++ b/src/services/chat/mecha/modelParamsResolver.test.ts @@ -243,6 +243,51 @@ describe('resolveModelExtendParams', () => { }); }); + describe('preserve thinking', () => { + beforeEach(() => { + vi.spyOn(aiModelSelectors.aiModelSelectors, 'isModelHasExtendParams').mockReturnValue( + () => true, + ); + vi.spyOn(aiModelSelectors.aiModelSelectors, 'modelExtendParams').mockReturnValue(() => [ + 'preserveThinking', + ]); + }); + + it('should set preserveThinking when supported and enabled', () => { + const result = resolveModelExtendParams({ + chatConfig: { + preserveThinking: true, + } as any, + model: 'qwen3.6-plus', + provider: 'qwen', + }); + + expect(result.preserveThinking).toBe(true); + }); + + it('should set preserveThinking to false when explicitly disabled', () => { + const result = resolveModelExtendParams({ + chatConfig: { + preserveThinking: false, + } as any, + model: 'qwen3.6-plus', + provider: 'qwen', + }); + + expect(result.preserveThinking).toBe(false); + }); + + it('should not set preserveThinking when not configured', () => { + const result = resolveModelExtendParams({ + chatConfig: {} as any, + model: 'qwen3.6-plus', + provider: 'qwen', + }); + + expect(result.preserveThinking).toBeUndefined(); + }); + }); + describe('reasoning effort variants', () => { describe('reasoningEffort param', () => { beforeEach(() => { diff --git a/src/services/chat/mecha/modelParamsResolver.ts b/src/services/chat/mecha/modelParamsResolver.ts index 878e42d518..9207178876 100644 --- a/src/services/chat/mecha/modelParamsResolver.ts +++ b/src/services/chat/mecha/modelParamsResolver.ts @@ -21,6 +21,7 @@ export interface ModelExtendParams { enabledContextCaching?: boolean; imageAspectRatio?: string; imageResolution?: string; + preserveThinking?: boolean; reasoning_effort?: string; thinking?: { budget_tokens?: number; @@ -188,6 +189,14 @@ export const resolveModelExtendParams = (ctx: ModelParamsContext): ModelExtendPa extendParams.enabledContextCaching = false; } + // Preserve historical thinking content (provider support required) + if ( + modelExtendParams.includes('preserveThinking') && + typeof chatConfig.preserveThinking === 'boolean' + ) { + extendParams.preserveThinking = chatConfig.preserveThinking; + } + // Reasoning effort variants if (modelExtendParams.includes('reasoningEffort') && chatConfig.reasoningEffort) { extendParams.reasoning_effort = chatConfig.reasoningEffort; diff --git a/src/utils/server/parseModels.test.ts b/src/utils/server/parseModels.test.ts index 8cb95e5691..9c398677f5 100644 --- a/src/utils/server/parseModels.test.ts +++ b/src/utils/server/parseModels.test.ts @@ -571,41 +571,6 @@ describe('transformToChatModelCards', () => { expect(result).toMatchSnapshot(); }); - it('should use default deploymentName from known model when not specified in string (VolcEngine case)', async () => { - const knownModel = LOBE_DEFAULT_MODEL_LIST.find( - (m) => m.id === 'deepseek-r1' && m.providerId === 'volcengine', - ); - const defaultChatModels: AiFullModelCard[] = []; - const result = await transformToAiModelList({ - modelString: '+deepseek-r1', - defaultModels: defaultChatModels, - providerId: 'volcengine', - withDeploymentName: true, - }); - expect(result).toContainEqual({ - ...knownModel, - enabled: true, - }); - }); - - it('should use deploymentName from modelString when specified (VolcEngine case)', async () => { - const defaultChatModels: AiFullModelCard[] = []; - const knownModel = LOBE_DEFAULT_MODEL_LIST.find( - (m) => m.id === 'deepseek-r1' && m.providerId === 'volcengine', - ); - const result = await transformToAiModelList({ - modelString: `+deepseek-r1->my-custom-deploy`, - defaultModels: defaultChatModels, - providerId: 'volcengine', - withDeploymentName: true, - }); - expect(result).toContainEqual({ - ...knownModel, - enabled: true, - config: { deploymentName: 'my-custom-deploy' }, - }); - }); - it('should set both id and deploymentName to the full string when no -> is used and withDeploymentName is true', async () => { const defaultChatModels: AiFullModelCard[] = []; const result = await transformToAiModelList({