diff --git a/packages/types/src/providers/zai.ts b/packages/types/src/providers/zai.ts index 41a6a808ca0..0fc175deb96 100644 --- a/packages/types/src/providers/zai.ts +++ b/packages/types/src/providers/zai.ts @@ -86,6 +86,9 @@ export const internationalZAiModels = { contextWindow: 131_072, supportsImages: true, supportsPromptCache: true, + supportsReasoningEffort: ["disable", "medium"], + reasoningEffort: "medium", + preserveReasoning: true, inputPrice: 0.3, outputPrice: 0.9, cacheWritesPrice: 0, @@ -98,6 +101,9 @@ export const internationalZAiModels = { contextWindow: 200_000, supportsImages: false, supportsPromptCache: true, + supportsReasoningEffort: ["disable", "medium"], + reasoningEffort: "medium", + preserveReasoning: true, inputPrice: 0.6, outputPrice: 2.2, cacheWritesPrice: 0, @@ -259,6 +265,9 @@ export const mainlandZAiModels = { contextWindow: 204_800, supportsImages: false, supportsPromptCache: true, + supportsReasoningEffort: ["disable", "medium"], + reasoningEffort: "medium", + preserveReasoning: true, inputPrice: 0.29, outputPrice: 1.14, cacheWritesPrice: 0, @@ -310,6 +319,9 @@ export const mainlandZAiModels = { contextWindow: 131_072, supportsImages: true, supportsPromptCache: true, + supportsReasoningEffort: ["disable", "medium"], + reasoningEffort: "medium", + preserveReasoning: true, inputPrice: 0.15, outputPrice: 0.45, cacheWritesPrice: 0, diff --git a/src/api/providers/base-openai-compatible-provider.ts b/src/api/providers/base-openai-compatible-provider.ts index fc3d769ae2a..7db09ac9f36 100644 --- a/src/api/providers/base-openai-compatible-provider.ts +++ b/src/api/providers/base-openai-compatible-provider.ts @@ -7,6 +7,7 @@ import { type ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/ap import { TagMatcher } from "../../utils/tag-matcher" import { ApiStream, ApiStreamUsageChunk } from "../transform/stream" import { convertToOpenAiMessages } from "../transform/openai-format" +import { convertToZAiFormat } from "../transform/zai-format" import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index" import { DEFAULT_HEADERS } from "./constants" @@ -14,6 +15,7 @@ import { BaseProvider } from "./base-provider" import { handleOpenAIError } from "./utils/openai-error-handler" import { calculateApiCostOpenAI } from "../../shared/cost" import { getApiRequestTimeout } from "./utils/timeout-config" +import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection" type BaseOpenAiCompatibleProviderOptions = ApiHandlerOptions & { providerName: string @@ -36,6 +38,7 @@ export abstract class BaseOpenAiCompatibleProvider protected readonly options: ApiHandlerOptions protected client: OpenAI + protected glmConfig: GlmModelConfig | null = null constructor({ providerName, @@ -65,6 +68,13 @@ export abstract class BaseOpenAiCompatibleProvider defaultHeaders: DEFAULT_HEADERS, timeout: getApiRequestTimeout(), }) + + // Detect GLM model on construction if model ID is available + const modelId = this.options.apiModelId || "" + if (modelId) { + this.glmConfig = detectGlmModel(modelId) + logGlmDetection(this.providerName, modelId, this.glmConfig) + } } protected createStream( @@ -75,6 +85,12 @@ export abstract class BaseOpenAiCompatibleProvider ) { const { id: model, info } = this.getModel() + // Re-detect GLM model if not already done or if model ID changed + if (!this.glmConfig || this.glmConfig.originalModelId !== model) { + this.glmConfig = detectGlmModel(model) + logGlmDetection(this.providerName, model, 
this.glmConfig) + } + // Centralized cap: clamp to 20% of the context window (unless provider-specific exceptions apply) const max_tokens = getModelMaxOutputTokens({ @@ -86,16 +102,32 @@ export abstract class BaseOpenAiCompatibleProvider const temperature = this.options.modelTemperature ?? info.defaultTemperature ?? this.defaultTemperature + // Convert messages based on whether this is a GLM model + // GLM models benefit from mergeToolResultText to prevent reasoning_content loss + const convertedMessages = this.glmConfig.isGlmModel + ? convertToZAiFormat(messages, { mergeToolResultText: this.glmConfig.mergeToolResultText }) + : convertToOpenAiMessages(messages) + + // Determine parallel_tool_calls setting + // Disable for GLM models as they may not support it properly + let parallelToolCalls: boolean + if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) { + parallelToolCalls = false + console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`) + } else { + parallelToolCalls = metadata?.parallelToolCalls ?? true + } + const params: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = { model, max_tokens, temperature, - messages: [{ role: "system", content: systemPrompt }, ...convertToOpenAiMessages(messages)], + messages: [{ role: "system", content: systemPrompt }, ...convertedMessages], stream: true, stream_options: { include_usage: true }, tools: this.convertToolsForOpenAI(metadata?.tools), tool_choice: metadata?.tool_choice, - parallel_tool_calls: metadata?.parallelToolCalls ?? true, + parallel_tool_calls: parallelToolCalls, } // Add thinking parameter if reasoning is enabled and model supports it @@ -103,6 +135,13 @@ export abstract class BaseOpenAiCompatibleProvider ;(params as any).thinking = { type: "enabled" } } + // For GLM-4.7 models with thinking support, add thinking parameter + if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) { + const useReasoning = this.options.enableReasoningEffort !== false // Default to enabled for GLM-4.7 + ;(params as any).thinking = useReasoning ? { type: "enabled" } : { type: "disabled" } + console.log(`[${this.providerName}] GLM-4.7 thinking mode: ${useReasoning ? "enabled" : "disabled"}`) + } + try { return this.client.chat.completions.create(params, requestOptions) } catch (error) { diff --git a/src/api/providers/lm-studio.ts b/src/api/providers/lm-studio.ts index a771394c535..480919af1aa 100644 --- a/src/api/providers/lm-studio.ts +++ b/src/api/providers/lm-studio.ts @@ -10,6 +10,7 @@ import { NativeToolCallParser } from "../../core/assistant-message/NativeToolCal import { TagMatcher } from "../../utils/tag-matcher" import { convertToOpenAiMessages } from "../transform/openai-format" +import { convertToZAiFormat } from "../transform/zai-format" import { ApiStream } from "../transform/stream" import { BaseProvider } from "./base-provider" @@ -17,11 +18,13 @@ import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from ". 
import { getModelsFromCache } from "./fetchers/modelCache" import { getApiRequestTimeout } from "./utils/timeout-config" import { handleOpenAIError } from "./utils/openai-error-handler" +import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection" export class LmStudioHandler extends BaseProvider implements SingleCompletionHandler { protected options: ApiHandlerOptions private client: OpenAI private readonly providerName = "LM Studio" + private glmConfig: GlmModelConfig | null = null constructor(options: ApiHandlerOptions) { super() @@ -35,6 +38,13 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan apiKey: apiKey, timeout: getApiRequestTimeout(), }) + + // Detect GLM model on construction if model ID is available + const modelId = this.options.lmStudioModelId || "" + if (modelId) { + this.glmConfig = detectGlmModel(modelId) + logGlmDetection(this.providerName, modelId, this.glmConfig) + } } override async *createMessage( @@ -42,9 +52,23 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan messages: Anthropic.Messages.MessageParam[], metadata?: ApiHandlerCreateMessageMetadata, ): ApiStream { + const model = this.getModel() + + // Re-detect GLM model if not already done or if model ID changed + if (!this.glmConfig || this.glmConfig.originalModelId !== model.id) { + this.glmConfig = detectGlmModel(model.id) + logGlmDetection(this.providerName, model.id, this.glmConfig) + } + + // Convert messages based on whether this is a GLM model + // GLM models benefit from mergeToolResultText to prevent reasoning_content loss + const convertedMessages = this.glmConfig.isGlmModel + ? convertToZAiFormat(messages, { mergeToolResultText: this.glmConfig.mergeToolResultText }) + : convertToOpenAiMessages(messages) + const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [ { role: "system", content: systemPrompt }, - ...convertToOpenAiMessages(messages), + ...convertedMessages, ] // ------------------------- @@ -83,20 +107,37 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan let assistantText = "" try { + // Determine parallel_tool_calls setting + // Disable for GLM models as they may not support it properly + let parallelToolCalls: boolean + if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) { + parallelToolCalls = false + console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`) + } else { + parallelToolCalls = metadata?.parallelToolCalls ?? true + } + const params: OpenAI.Chat.ChatCompletionCreateParamsStreaming & { draft_model?: string } = { - model: this.getModel().id, + model: model.id, messages: openAiMessages, temperature: this.options.modelTemperature ?? LMSTUDIO_DEFAULT_TEMPERATURE, stream: true, tools: this.convertToolsForOpenAI(metadata?.tools), tool_choice: metadata?.tool_choice, - parallel_tool_calls: metadata?.parallelToolCalls ?? true, + parallel_tool_calls: parallelToolCalls, } if (this.options.lmStudioSpeculativeDecodingEnabled && this.options.lmStudioDraftModelId) { params.draft_model = this.options.lmStudioDraftModelId } + // For GLM-4.7 models with thinking support, add thinking parameter + if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) { + const useReasoning = this.options.enableReasoningEffort !== false // Default to enabled for GLM-4.7 + ;(params as any).thinking = useReasoning ? 
{ type: "enabled" } : { type: "disabled" } + console.log(`[${this.providerName}] GLM-4.7 thinking mode: ${useReasoning ? "enabled" : "disabled"}`) + } + let results try { results = await this.client.chat.completions.create(params) @@ -124,6 +165,19 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan } } + // Handle reasoning_content for GLM models (similar to Z.ai) + if (delta) { + for (const key of ["reasoning_content", "reasoning"] as const) { + if (key in delta) { + const reasoning_content = ((delta as any)[key] as string | undefined) || "" + if (reasoning_content?.trim()) { + yield { type: "reasoning", text: reasoning_content } + } + break + } + } + } + // Handle tool calls in stream - emit partial chunks for NativeToolCallParser if (delta?.tool_calls) { for (const toolCall of delta.tool_calls) { diff --git a/src/api/providers/openai.ts b/src/api/providers/openai.ts index 87589b93960..37ac133ec76 100644 --- a/src/api/providers/openai.ts +++ b/src/api/providers/openai.ts @@ -15,6 +15,7 @@ import type { ApiHandlerOptions } from "../../shared/api" import { TagMatcher } from "../../utils/tag-matcher" import { convertToOpenAiMessages } from "../transform/openai-format" +import { convertToZAiFormat } from "../transform/zai-format" import { convertToR1Format } from "../transform/r1-format" import { ApiStream, ApiStreamUsageChunk } from "../transform/stream" import { getModelParams } from "../transform/model-params" @@ -24,6 +25,7 @@ import { BaseProvider } from "./base-provider" import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index" import { getApiRequestTimeout } from "./utils/timeout-config" import { handleOpenAIError } from "./utils/openai-error-handler" +import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection" // TODO: Rename this to OpenAICompatibleHandler. 
Also, I think the // `OpenAINativeHandler` can subclass from this, since it's obviously @@ -31,7 +33,8 @@ import { handleOpenAIError } from "./utils/openai-error-handler" export class OpenAiHandler extends BaseProvider implements SingleCompletionHandler { protected options: ApiHandlerOptions protected client: OpenAI - private readonly providerName = "OpenAI" + private readonly providerName = "OpenAI Compatible" + private glmConfig: GlmModelConfig | null = null constructor(options: ApiHandlerOptions) { super() @@ -77,6 +80,13 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl timeout, }) } + + // Detect GLM model on construction if model ID is available + const modelId = this.options.openAiModelId || "" + if (modelId) { + this.glmConfig = detectGlmModel(modelId) + logGlmDetection(this.providerName, modelId, this.glmConfig) + } } override async *createMessage( @@ -91,6 +101,12 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl const isAzureAiInference = this._isAzureAiInference(modelUrl) const deepseekReasoner = modelId.includes("deepseek-reasoner") || enabledR1Format + // Re-detect GLM model if not already done or if model ID changed + if (!this.glmConfig || this.glmConfig.originalModelId !== modelId) { + this.glmConfig = detectGlmModel(modelId) + logGlmDetection(this.providerName, modelId, this.glmConfig) + } + if (modelId.includes("o1") || modelId.includes("o3") || modelId.includes("o4")) { yield* this.handleO3FamilyMessage(modelId, systemPrompt, messages, metadata) return @@ -106,6 +122,12 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl if (deepseekReasoner) { convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages]) + } else if (this.glmConfig.isGlmModel) { + // GLM models benefit from mergeToolResultText to prevent reasoning_content loss + const glmConvertedMessages = convertToZAiFormat(messages, { + mergeToolResultText: this.glmConfig.mergeToolResultText, + }) + convertedMessages = [systemMessage, ...glmConvertedMessages] } else { if (modelInfo.supportsPromptCache) { systemMessage = { @@ -152,6 +174,16 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl const isGrokXAI = this._isGrokXAI(this.options.openAiBaseUrl) + // Determine parallel_tool_calls setting + // Disable for GLM models as they may not support it properly + let parallelToolCalls: boolean + if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) { + parallelToolCalls = false + console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`) + } else { + parallelToolCalls = metadata?.parallelToolCalls ?? true + } + const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = { model: modelId, temperature: this.options.modelTemperature ?? (deepseekReasoner ? DEEP_SEEK_DEFAULT_TEMPERATURE : 0), @@ -161,12 +193,19 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl ...(reasoning && reasoning), tools: this.convertToolsForOpenAI(metadata?.tools), tool_choice: metadata?.tool_choice, - parallel_tool_calls: metadata?.parallelToolCalls ?? 
true, + parallel_tool_calls: parallelToolCalls, } // Add max_tokens if needed this.addMaxTokensIfNeeded(requestOptions, modelInfo) + // For GLM-4.7 models with thinking support, add thinking parameter + if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) { + const useReasoning = this.options.enableReasoningEffort !== false // Default to enabled for GLM-4.7 + ;(requestOptions as any).thinking = useReasoning ? { type: "enabled" } : { type: "disabled" } + console.log(`[${this.providerName}] GLM-4.7 thinking mode: ${useReasoning ? "enabled" : "disabled"}`) + } + let stream try { stream = await this.client.chat.completions.create( @@ -221,20 +260,46 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl yield this.processUsageMetrics(lastUsage, modelInfo) } } else { + // Determine message conversion based on model type + let nonStreamingMessages + if (deepseekReasoner) { + nonStreamingMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages]) + } else if (this.glmConfig.isGlmModel) { + // GLM models benefit from mergeToolResultText to prevent reasoning_content loss + const glmConvertedMessages = convertToZAiFormat(messages, { + mergeToolResultText: this.glmConfig.mergeToolResultText, + }) + nonStreamingMessages = [systemMessage, ...glmConvertedMessages] + } else { + nonStreamingMessages = [systemMessage, ...convertToOpenAiMessages(messages)] + } + + // Determine parallel_tool_calls setting for non-streaming + let nonStreamingParallelToolCalls: boolean + if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) { + nonStreamingParallelToolCalls = false + } else { + nonStreamingParallelToolCalls = metadata?.parallelToolCalls ?? true + } + const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = { model: modelId, - messages: deepseekReasoner - ? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages]) - : [systemMessage, ...convertToOpenAiMessages(messages)], + messages: nonStreamingMessages, // Tools are always present (minimum ALWAYS_AVAILABLE_TOOLS) tools: this.convertToolsForOpenAI(metadata?.tools), tool_choice: metadata?.tool_choice, - parallel_tool_calls: metadata?.parallelToolCalls ?? true, + parallel_tool_calls: nonStreamingParallelToolCalls, } // Add max_tokens if needed this.addMaxTokensIfNeeded(requestOptions, modelInfo) + // For GLM-4.7 models with thinking support, add thinking parameter + if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) { + const useReasoning = this.options.enableReasoningEffort !== false + ;(requestOptions as any).thinking = useReasoning ? 
{ type: "enabled" } : { type: "disabled" } + } + let response try { response = await this.client.chat.completions.create( diff --git a/src/api/providers/utils/__tests__/glm-model-detection.spec.ts b/src/api/providers/utils/__tests__/glm-model-detection.spec.ts new file mode 100644 index 00000000000..dfcbe3a8abc --- /dev/null +++ b/src/api/providers/utils/__tests__/glm-model-detection.spec.ts @@ -0,0 +1,265 @@ +import { detectGlmModel, logGlmDetection, isGlmModel } from "../glm-model-detection" + +describe("GLM Model Detection", () => { + describe("detectGlmModel", () => { + describe("non-GLM models", () => { + it("should return isGlmModel=false for non-GLM models", () => { + expect(detectGlmModel("gpt-4").isGlmModel).toBe(false) + expect(detectGlmModel("claude-3-opus").isGlmModel).toBe(false) + expect(detectGlmModel("llama-3.1").isGlmModel).toBe(false) + expect(detectGlmModel("qwen-2.5").isGlmModel).toBe(false) + }) + + it("should NOT enable GLM optimizations for non-GLM models", () => { + const config = detectGlmModel("gpt-4") + expect(config.mergeToolResultText).toBe(false) + expect(config.disableParallelToolCalls).toBe(false) + }) + }) + + describe("GLM model detection", () => { + it("should detect standard GLM model IDs", () => { + expect(detectGlmModel("glm-4.5").isGlmModel).toBe(true) + expect(detectGlmModel("glm-4.6").isGlmModel).toBe(true) + expect(detectGlmModel("glm-4.7").isGlmModel).toBe(true) + expect(detectGlmModel("GLM-4.5").isGlmModel).toBe(true) + }) + + it("should detect GLM models with various prefixes", () => { + expect(detectGlmModel("mlx-community/GLM-4.5-4bit").isGlmModel).toBe(true) + expect(detectGlmModel("local/glm-4.7-flash").isGlmModel).toBe(true) + }) + + it("should detect GGUF GLM models", () => { + const result = detectGlmModel("GLM-4.5-UD-Q8_K_XL-00001-of-00008.gguf") + expect(result.isGlmModel).toBe(true) + expect(result.version).toBe("4.5") + }) + + it("should detect ChatGLM models", () => { + expect(detectGlmModel("chatglm-6b").isGlmModel).toBe(true) + expect(detectGlmModel("chatglm3-6b").isGlmModel).toBe(true) + }) + }) + + describe("version detection", () => { + it("should detect GLM-4.5 version", () => { + expect(detectGlmModel("glm-4.5").version).toBe("4.5") + expect(detectGlmModel("glm-4-5-flash").version).toBe("4.5") + expect(detectGlmModel("accounts/fireworks/models/glm-4p5").version).toBe("4.5") + }) + + it("should detect GLM-4.6 version", () => { + expect(detectGlmModel("glm-4.6").version).toBe("4.6") + expect(detectGlmModel("GLM-4.6V").version).toBe("4.6") + expect(detectGlmModel("glm-4-6-flash").version).toBe("4.6") + }) + + it("should detect GLM-4.7 version", () => { + expect(detectGlmModel("glm-4.7").version).toBe("4.7") + expect(detectGlmModel("GLM-4.7-Flash").version).toBe("4.7") + expect(detectGlmModel("glm-4-7-flashx").version).toBe("4.7") + }) + }) + + describe("variant detection", () => { + describe("base variant", () => { + it("should detect base variant", () => { + expect(detectGlmModel("glm-4.5").variant).toBe("base") + expect(detectGlmModel("glm-4.6").variant).toBe("base") + expect(detectGlmModel("glm-4.7").variant).toBe("base") + }) + }) + + describe("air variants", () => { + it("should detect air variant", () => { + expect(detectGlmModel("glm-4.5-air").variant).toBe("air") + expect(detectGlmModel("GLM-4.5-Air").variant).toBe("air") + }) + + it("should detect airx variant", () => { + expect(detectGlmModel("glm-4.5-airx").variant).toBe("airx") + expect(detectGlmModel("GLM-4.5-AirX").variant).toBe("airx") + }) + }) + + describe("flash 
variants", () => { + it("should detect flash variant", () => { + expect(detectGlmModel("glm-4.5-flash").variant).toBe("flash") + expect(detectGlmModel("glm-4.7-flash").variant).toBe("flash") + }) + + it("should detect flashx variant", () => { + expect(detectGlmModel("glm-4.7-flashx").variant).toBe("flashx") + expect(detectGlmModel("GLM-4.7-FlashX").variant).toBe("flashx") + }) + }) + + describe("x variant", () => { + it("should detect x variant", () => { + expect(detectGlmModel("glm-4.5-x").variant).toBe("x") + expect(detectGlmModel("GLM-4.5-X").variant).toBe("x") + }) + }) + + describe("vision variants", () => { + it("should detect v (vision) variant for 4.5", () => { + const result = detectGlmModel("glm-4.5v") + expect(result.variant).toBe("v") + expect(result.supportsVision).toBe(true) + }) + + it("should detect v (vision) variant for 4.6", () => { + const result = detectGlmModel("glm-4.6v") + expect(result.variant).toBe("v") + expect(result.supportsVision).toBe(true) + }) + + it("should detect v-flash variant", () => { + const result = detectGlmModel("glm-4.6v-flash") + expect(result.variant).toBe("v-flash") + expect(result.supportsVision).toBe(true) + }) + + it("should detect v-flashx variant", () => { + const result = detectGlmModel("glm-4.6v-flashx") + expect(result.variant).toBe("v-flashx") + expect(result.supportsVision).toBe(true) + }) + }) + }) + + describe("thinking support detection", () => { + it("should detect thinking support for GLM-4.7 variants", () => { + expect(detectGlmModel("glm-4.7").supportsThinking).toBe(true) + expect(detectGlmModel("glm-4.7-flash").supportsThinking).toBe(true) + expect(detectGlmModel("GLM-4.7-FlashX").supportsThinking).toBe(true) + }) + + it("should detect thinking support for GLM-4.6 base model", () => { + expect(detectGlmModel("glm-4.6").supportsThinking).toBe(true) + }) + + it("should detect thinking support for GLM-4.6V vision variants", () => { + expect(detectGlmModel("glm-4.6v").supportsThinking).toBe(true) + expect(detectGlmModel("GLM-4.6V").supportsThinking).toBe(true) + expect(detectGlmModel("glm-4.6v-flash").supportsThinking).toBe(true) + expect(detectGlmModel("glm-4.6v-flashx").supportsThinking).toBe(true) + }) + + it("should NOT detect thinking support for GLM-4.5 variants", () => { + expect(detectGlmModel("glm-4.5").supportsThinking).toBe(false) + expect(detectGlmModel("glm-4.5-air").supportsThinking).toBe(false) + expect(detectGlmModel("glm-4.5-flash").supportsThinking).toBe(false) + expect(detectGlmModel("glm-4.5v").supportsThinking).toBe(false) + }) + }) + + describe("configuration flags", () => { + it("should enable mergeToolResultText for all GLM models", () => { + expect(detectGlmModel("glm-4.5").mergeToolResultText).toBe(true) + expect(detectGlmModel("glm-4.6").mergeToolResultText).toBe(true) + expect(detectGlmModel("glm-4.7").mergeToolResultText).toBe(true) + }) + + it("should disable parallel tool calls for all GLM models", () => { + expect(detectGlmModel("glm-4.5").disableParallelToolCalls).toBe(true) + expect(detectGlmModel("glm-4.6").disableParallelToolCalls).toBe(true) + expect(detectGlmModel("glm-4.7").disableParallelToolCalls).toBe(true) + }) + }) + + describe("display name generation", () => { + it("should generate correct display names for base variants", () => { + expect(detectGlmModel("glm-4.5").displayName).toBe("GLM-4.5") + expect(detectGlmModel("glm-4.6").displayName).toBe("GLM-4.6") + expect(detectGlmModel("glm-4.7").displayName).toBe("GLM-4.7") + }) + + it("should generate correct display names for variants", 
() => { + expect(detectGlmModel("glm-4.5-air").displayName).toBe("GLM-4.5 AIR") + expect(detectGlmModel("glm-4.5-flash").displayName).toBe("GLM-4.5 FLASH") + expect(detectGlmModel("glm-4.7-flashx").displayName).toBe("GLM-4.7 FLASHX") + expect(detectGlmModel("glm-4.6v").displayName).toBe("GLM-4.6 V") + expect(detectGlmModel("glm-4.6v-flash").displayName).toBe("GLM-4.6 V FLASH") + }) + + it("should handle unknown version", () => { + // ChatGLM doesn't have a specific version number + const result = detectGlmModel("chatglm-6b") + expect(result.displayName).toBe("GLM-4.x") + }) + }) + + describe("real-world model ID formats", () => { + it("should correctly detect MLX community models", () => { + const result = detectGlmModel("mlx-community/GLM-4.5-4bit") + expect(result.isGlmModel).toBe(true) + expect(result.version).toBe("4.5") + expect(result.variant).toBe("base") + }) + + it("should correctly detect GGUF models", () => { + const result = detectGlmModel("GLM-4.5-UD-Q8_K_XL-00001-of-00008.gguf") + expect(result.isGlmModel).toBe(true) + expect(result.version).toBe("4.5") + }) + + it("should correctly detect Fireworks models", () => { + const result = detectGlmModel("accounts/fireworks/models/glm-4p5") + expect(result.isGlmModel).toBe(true) + expect(result.version).toBe("4.5") + }) + + it("should correctly detect Fireworks air models", () => { + const result = detectGlmModel("accounts/fireworks/models/glm-4p5-air") + expect(result.isGlmModel).toBe(true) + expect(result.version).toBe("4.5") + expect(result.variant).toBe("air") + }) + }) + }) + + describe("logGlmDetection", () => { + let consoleLogSpy: any + + beforeEach(() => { + consoleLogSpy = vi.spyOn(console, "log").mockImplementation(() => {}) + }) + + afterEach(() => { + consoleLogSpy.mockRestore() + }) + + it("should log detection results for GLM models", () => { + const config = detectGlmModel("glm-4.5") + logGlmDetection("LM Studio", "glm-4.5", config) + + expect(consoleLogSpy).toHaveBeenCalledWith('[LM Studio] Using model ID: "glm-4.5"') + expect(consoleLogSpy).toHaveBeenCalledWith('[GLM Detection] ✓ GLM model detected: "glm-4.5"') + expect(consoleLogSpy).toHaveBeenCalledWith("[GLM Detection] - Version: 4.5") + expect(consoleLogSpy).toHaveBeenCalledWith("[GLM Detection] - Variant: base") + }) + + it("should log when model is NOT a GLM model", () => { + const config = detectGlmModel("gpt-4") + logGlmDetection("OpenAI-compatible", "gpt-4", config) + + expect(consoleLogSpy).toHaveBeenCalledWith('[OpenAI-compatible] Using model ID: "gpt-4"') + expect(consoleLogSpy).toHaveBeenCalledWith('[GLM Detection] ✗ Not a GLM model: "gpt-4"') + }) + }) + + describe("isGlmModel", () => { + it("should return true for GLM models", () => { + expect(isGlmModel("glm-4.5")).toBe(true) + expect(isGlmModel("GLM-4.7-Flash")).toBe(true) + expect(isGlmModel("chatglm-6b")).toBe(true) + }) + + it("should return false for non-GLM models", () => { + expect(isGlmModel("gpt-4")).toBe(false) + expect(isGlmModel("claude-3-opus")).toBe(false) + }) + }) +}) diff --git a/src/api/providers/utils/glm-model-detection.ts b/src/api/providers/utils/glm-model-detection.ts new file mode 100644 index 00000000000..ac7c3a538f1 --- /dev/null +++ b/src/api/providers/utils/glm-model-detection.ts @@ -0,0 +1,203 @@ +/** + * GLM Model Detection Utility + * + * Detects GLM models from Z.ai (Zhipu AI) and returns appropriate configuration + * for optimal interaction. This utility supports various model ID formats from + * different providers like LM Studio and OpenAI-compatible endpoints. 
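+ *
+ * Example (mirrors the unit tests): detectGlmModel("mlx-community/GLM-4.5-4bit")
+ * returns { isGlmModel: true, version: "4.5", variant: "base", supportsThinking: false }.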
+ * + * GLM Model Family: + * - GLM-4.5: Base model with 355B parameters + * - GLM-4.5-Air: Lightweight version balancing performance and cost + * - GLM-4.5-X: High-performance variant with ultra-fast responses + * - GLM-4.5-AirX: Lightweight ultra-fast variant + * - GLM-4.5-Flash: Free high-speed model + * - GLM-4.5V: Multimodal visual model + * - GLM-4.6: Extended 200k context window + * - GLM-4.6V: Multimodal vision model + * - GLM-4.6V-Flash: Free high-speed vision model + * - GLM-4.7: Built-in thinking capabilities + * - GLM-4.7-Flash: Free high-speed variant of GLM-4.7 + * - GLM-4.7-FlashX: Ultra-fast variant + */ + +/** + * GLM model version enumeration + */ +export type GlmVersion = "4.5" | "4.6" | "4.7" | "unknown" + +/** + * GLM model variant - specific model within a version + */ +export type GlmVariant = + | "base" + | "air" + | "x" + | "airx" + | "flash" + | "flashx" + | "v" // vision + | "v-flash" + | "v-flashx" + +/** + * Configuration options for GLM models + */ +export interface GlmModelConfig { + /** Whether this is a GLM model */ + isGlmModel: boolean + /** The detected GLM version (4.5, 4.6, 4.7) */ + version: GlmVersion + /** The detected variant (base, air, flash, v, etc.) */ + variant: GlmVariant | "unknown" + /** Whether this model supports vision/images */ + supportsVision: boolean + /** Whether this model has built-in thinking/reasoning support */ + supportsThinking: boolean + /** Whether to merge tool result text into tool messages */ + mergeToolResultText: boolean + /** Whether to disable parallel tool calls */ + disableParallelToolCalls: boolean + /** The original model ID */ + originalModelId: string + /** A normalized/canonical model name for display */ + displayName: string +} + +/** + * Detects if a model ID represents a GLM model and returns its configuration. + * + * Supports various model ID formats: + * - Standard: "glm-4.5", "glm-4.7-flash" + * - With prefix: "mlx-community/GLM-4.5-4bit" + * - GGUF files: "GLM-4.5-UD-Q8_K_XL-00001-of-00008.gguf" + * - ChatGLM: "chatglm-6b", "chatglm3-6b" + * + * @param modelId The model identifier string + * @returns GLM model configuration + */ +export function detectGlmModel(modelId: string): GlmModelConfig { + const lowerModelId = modelId.toLowerCase() + + // Check if this is a GLM model using case-insensitive matching + // Match patterns: "glm-", "glm4", "chatglm", or "glm" followed by a version number + const isGlm = /glm[-_]?4|chatglm|\/glm[-_]|^glm[-_]/i.test(modelId) + + if (!isGlm) { + return { + isGlmModel: false, + version: "unknown", + variant: "unknown", + supportsVision: false, + supportsThinking: false, + mergeToolResultText: false, + disableParallelToolCalls: false, + originalModelId: modelId, + displayName: modelId, + } + } + + // Detect version (4.5, 4.6, 4.7) + let version: GlmVersion = "unknown" + if (/4\.7|4-7|47/i.test(lowerModelId)) { + version = "4.7" + } else if (/4\.6|4-6|46/i.test(lowerModelId)) { + version = "4.6" + } else if (/4\.5|4-5|45|4p5/i.test(lowerModelId)) { + version = "4.5" + } + + // Detect variant + let variant: GlmVariant = "base" + let supportsVision = false + + // Check for vision variants first (they may also have flash/etc.) 
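+ // e.g. "glm-4.6v-flashx" must resolve to the v-flashx variant rather than plain flashx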
+ if (/4\.5v|4-5v|45v|4p5v|glm-4\.5v/i.test(lowerModelId)) { + variant = "v" + supportsVision = true + } else if (/4\.6v[-_]?flashx|4-6v[-_]?flashx/i.test(lowerModelId)) { + variant = "v-flashx" + supportsVision = true + } else if (/4\.6v[-_]?flash|4-6v[-_]?flash/i.test(lowerModelId)) { + variant = "v-flash" + supportsVision = true + } else if (/4\.6v|4-6v|46v/i.test(lowerModelId)) { + variant = "v" + supportsVision = true + } + // Non-vision variants + else if (/flashx/i.test(lowerModelId)) { + variant = "flashx" + } else if (/flash/i.test(lowerModelId)) { + variant = "flash" + } else if (/airx/i.test(lowerModelId)) { + variant = "airx" + } else if (/air/i.test(lowerModelId)) { + variant = "air" + } else if (/[-_]x\b/i.test(lowerModelId)) { + // Match "-x" or "_x" at word boundary (to avoid matching "flashx", "airx") + variant = "x" + } + + // GLM-4.6, GLM-4.6V, and GLM-4.7 have built-in thinking support + // For GLM-4.6, only the base model and vision variants support thinking + const supportsThinking = + version === "4.7" || + (version === "4.6" && + (variant === "base" || variant === "v" || variant === "v-flash" || variant === "v-flashx")) + + // Generate display name + let displayName = `GLM-${version !== "unknown" ? version : "4.x"}` + if (variant !== "base") { + const variantName = variant.toUpperCase().replace("-", " ") + displayName += ` ${variantName}` + } + + return { + isGlmModel: true, + version, + variant, + supportsVision, + supportsThinking, + // All GLM models benefit from mergeToolResultText to prevent reasoning_content loss + mergeToolResultText: true, + // Disable parallel tool calls for GLM models as they may not support it properly + disableParallelToolCalls: true, + originalModelId: modelId, + displayName, + } +} + +/** + * Logs GLM model detection results to the console for debugging. + * + * @param providerName The name of the provider (e.g., "LM Studio", "OpenAI-compatible") + * @param modelId The model ID being used + * @param config The detected GLM configuration + */ +export function logGlmDetection(providerName: string, modelId: string, config: GlmModelConfig): void { + console.log(`[${providerName}] Using model ID: "${modelId}"`) + + if (config.isGlmModel) { + console.log(`[GLM Detection] ✓ GLM model detected: "${modelId}"`) + console.log(`[GLM Detection] - Version: ${config.version}`) + console.log(`[GLM Detection] - Variant: ${config.variant}`) + console.log(`[GLM Detection] - Display name: ${config.displayName}`) + console.log(`[GLM Detection] - Supports vision: ${config.supportsVision}`) + console.log(`[GLM Detection] - Supports thinking: ${config.supportsThinking}`) + console.log(`[GLM Detection] - mergeToolResultText: ${config.mergeToolResultText}`) + console.log(`[GLM Detection] - disableParallelToolCalls: ${config.disableParallelToolCalls}`) + } else { + console.log(`[GLM Detection] ✗ Not a GLM model: "${modelId}"`) + } +} + +/** + * Simple check if a model ID is a GLM model without full configuration. + * Use this for quick checks where you only need a boolean. 
+ * + * @param modelId The model identifier string + * @returns true if the model is a GLM model + */ +export function isGlmModel(modelId: string): boolean { + return /glm[-_]?4|chatglm|\/glm[-_]|^glm[-_]/i.test(modelId) +} diff --git a/src/api/providers/zai.ts b/src/api/providers/zai.ts index a2e3740c56f..7e9a362c68a 100644 --- a/src/api/providers/zai.ts +++ b/src/api/providers/zai.ts @@ -40,9 +40,9 @@ export class ZAiHandler extends BaseOpenAiCompatibleProvider { } /** - * Override createStream to handle GLM-4.7's thinking mode. - * GLM-4.7 has thinking enabled by default in the API, so we need to - * explicitly send { type: "disabled" } when the user turns off reasoning. + * Override createStream to handle thinking mode for GLM models. + * GLM-4.6, GLM-4.6V, and GLM-4.7 have thinking enabled by default in the API, + * so we need to explicitly send { type: "disabled" } when the user turns off reasoning. */ protected override createStream( systemPrompt: string, @@ -52,11 +52,11 @@ export class ZAiHandler extends BaseOpenAiCompatibleProvider { ) { const { id: modelId, info } = this.getModel() - // Check if this is a GLM-4.7 model with thinking support - const isThinkingModel = modelId === "glm-4.7" && Array.isArray(info.supportsReasoningEffort) + // Check if this is a GLM model with thinking support (GLM-4.6, GLM-4.6V, GLM-4.7) + const isThinkingModel = Array.isArray(info.supportsReasoningEffort) if (isThinkingModel) { - // For GLM-4.7, thinking is ON by default in the API. + // For GLM thinking models, thinking is ON by default in the API. // We need to explicitly disable it when reasoning is off. const useReasoning = shouldUseReasoningEffort({ model: info, settings: this.options }) @@ -69,7 +69,7 @@ export class ZAiHandler extends BaseOpenAiCompatibleProvider { } /** - * Creates a stream with explicit thinking control for GLM-4.7 + * Creates a stream with explicit thinking control for GLM thinking models (4.6, 4.6V, 4.7) */ private createStreamWithThinking( systemPrompt: string, @@ -99,7 +99,7 @@ export class ZAiHandler extends BaseOpenAiCompatibleProvider { messages: [{ role: "system", content: systemPrompt }, ...convertedMessages], stream: true, stream_options: { include_usage: true }, - // For GLM-4.7: thinking is ON by default, so we explicitly disable when needed + // For GLM thinking models: thinking is ON by default, so we explicitly disable when needed thinking: useReasoning ? { type: "enabled" } : { type: "disabled" }, tools: this.convertToolsForOpenAI(metadata?.tools), tool_choice: metadata?.tool_choice, diff --git a/src/core/assistant-message/NativeToolCallParser.ts b/src/core/assistant-message/NativeToolCallParser.ts index 72c34f94a07..02d24c6b3b1 100644 --- a/src/core/assistant-message/NativeToolCallParser.ts +++ b/src/core/assistant-message/NativeToolCallParser.ts @@ -166,16 +166,35 @@ export class NativeToolCallParser { /** * Process stream finish reason. * Emits end events when finish_reason is 'tool_calls'. + * + * IMPORTANT: Only emits tool_call_end for tool calls that have actually started + * (i.e., where tool_call_start was emitted). This prevents finalizeStreamingToolCall + * from receiving IDs that were never registered via startStreamingToolCall, which + * would cause tool results to be silently dropped and trigger infinite retry loops. 
*/ public static processFinishReason(finishReason: string | null | undefined): ToolCallStreamEvent[] { const events: ToolCallStreamEvent[] = [] if (finishReason === "tool_calls" && this.rawChunkTracker.size > 0) { for (const [, tracked] of this.rawChunkTracker.entries()) { - events.push({ - type: "tool_call_end", - id: tracked.id, - }) + // Only emit tool_call_end for tool calls that have actually started. + // Tool calls without hasStarted=true never had a tool_call_start emitted + // (likely due to missing tool name), so they were never registered in + // streamingToolCalls. Emitting tool_call_end for these would cause + // finalizeStreamingToolCall to fail, resulting in no tool_result being + // sent to the model and triggering infinite retry loops. + if (tracked.hasStarted) { + events.push({ + type: "tool_call_end", + id: tracked.id, + }) + } else { + // Log a warning for tool calls that were tracked but never started. + // This helps diagnose issues with models that send malformed tool calls. + console.warn( + `[NativeToolCallParser] Skipping tool_call_end for unstarted tool call: ${tracked.id} (no name received)`, + ) + } } } diff --git a/src/core/assistant-message/__tests__/NativeToolCallParser.spec.ts b/src/core/assistant-message/__tests__/NativeToolCallParser.spec.ts index db0dc00de41..1b4b1a7e179 100644 --- a/src/core/assistant-message/__tests__/NativeToolCallParser.spec.ts +++ b/src/core/assistant-message/__tests__/NativeToolCallParser.spec.ts @@ -343,4 +343,107 @@ describe("NativeToolCallParser", () => { }) }) }) + + describe("processFinishReason", () => { + describe("tool call tracking synchronization", () => { + it("should emit tool_call_end only for tool calls that have started", () => { + // Simulate a tool call with both ID and name (will start) + NativeToolCallParser.processRawChunk({ + index: 0, + id: "call_started_123", + name: "read_file", + }) + + const events = NativeToolCallParser.processFinishReason("tool_calls") + + expect(events).toHaveLength(1) + expect(events[0]).toEqual({ + type: "tool_call_end", + id: "call_started_123", + }) + }) + + it("should NOT emit tool_call_end for tool calls without a name (never started)", () => { + // Simulate a tool call with ID but NO name - this happens when models + // send malformed tool calls or split ID/name across chunks incorrectly + NativeToolCallParser.processRawChunk({ + index: 0, + id: "call_no_name_456", + // No name provided - tool_call_start will not be emitted + }) + + // Capture console.warn to verify warning is logged + const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + + const events = NativeToolCallParser.processFinishReason("tool_calls") + + // Should NOT emit tool_call_end since tool was never started + expect(events).toHaveLength(0) + + // Should log a warning about the unstarted tool call + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining("Skipping tool_call_end for unstarted tool call"), + ) + + warnSpy.mockRestore() + }) + + it("should handle mixed started and unstarted tool calls correctly", () => { + // Tool call with ID and name (will start) + NativeToolCallParser.processRawChunk({ + index: 0, + id: "call_with_name", + name: "read_file", + }) + + // Tool call with only ID (will not start) + NativeToolCallParser.processRawChunk({ + index: 1, + id: "call_without_name", + // No name + }) + + // Another tool call with ID and name (will start) + NativeToolCallParser.processRawChunk({ + index: 2, + id: "call_also_with_name", + name: "write_to_file", + }) + + const warnSpy 
= vi.spyOn(console, "warn").mockImplementation(() => {}) + + const events = NativeToolCallParser.processFinishReason("tool_calls") + + // Should only emit tool_call_end for the two started tool calls + expect(events).toHaveLength(2) + expect(events.map((e) => e.id)).toContain("call_with_name") + expect(events.map((e) => e.id)).toContain("call_also_with_name") + expect(events.map((e) => e.id)).not.toContain("call_without_name") + + // Should log warning for the unstarted tool call + expect(warnSpy).toHaveBeenCalledTimes(1) + expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining("call_without_name")) + + warnSpy.mockRestore() + }) + + it("should return empty array when finish_reason is not tool_calls", () => { + NativeToolCallParser.processRawChunk({ + index: 0, + id: "call_123", + name: "read_file", + }) + + const events = NativeToolCallParser.processFinishReason("stop") + + expect(events).toHaveLength(0) + }) + + it("should return empty array when no tool calls are tracked", () => { + const events = NativeToolCallParser.processFinishReason("tool_calls") + + expect(events).toHaveLength(0) + }) + }) + }) })
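
Reviewer note: the sketch below shows how the pieces in this diff are meant to compose, using only the APIs introduced above (detectGlmModel, logGlmDetection, GlmModelConfig, convertToZAiFormat) plus the request gating the handlers apply. The function name, import paths (written as seen from a provider module), and the request literal are illustrative, not copied from any one handler; the non-GLM parallel_tool_calls default is simplified to true.

import OpenAI from "openai"
import { Anthropic } from "@anthropic-ai/sdk"

import { detectGlmModel, logGlmDetection } from "./utils/glm-model-detection"
import { convertToZAiFormat } from "../transform/zai-format"
import { convertToOpenAiMessages } from "../transform/openai-format"

function buildGlmAwareParams(
	modelId: string,
	systemPrompt: string,
	messages: Anthropic.Messages.MessageParam[],
): OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming {
	// Detect once; the handlers above cache this on the instance and re-detect
	// whenever the model ID changes.
	const glmConfig = detectGlmModel(modelId)
	logGlmDetection("LM Studio", modelId, glmConfig)
	// e.g. "GLM-4.7-FlashX" => { isGlmModel: true, version: "4.7", variant: "flashx",
	//                            supportsThinking: true, mergeToolResultText: true,
	//                            disableParallelToolCalls: true, ... }

	// GLM models go through convertToZAiFormat so tool-result text is merged and
	// reasoning_content is not lost; everything else keeps the OpenAI conversion.
	const converted = glmConfig.isGlmModel
		? convertToZAiFormat(messages, { mergeToolResultText: glmConfig.mergeToolResultText })
		: convertToOpenAiMessages(messages)

	const params: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
		model: modelId,
		messages: [{ role: "system", content: systemPrompt }, ...converted],
		stream: true,
		stream_options: { include_usage: true },
		// GLM models get parallel tool calls forced off, matching the handlers above.
		parallel_tool_calls: glmConfig.isGlmModel && glmConfig.disableParallelToolCalls ? false : true,
	}

	// GLM-4.6 / 4.6V / 4.7 default to thinking ON server-side, so "disabled" must be
	// sent explicitly when the user turns reasoning off (see the zai.ts comment above).
	if (glmConfig.isGlmModel && glmConfig.supportsThinking) {
		;(params as any).thinking = { type: "enabled" } // or { type: "disabled" }
	}

	return params
}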
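
Reviewer note: a smaller sketch of the finish-reason guard added to NativeToolCallParser, using only the static methods exercised by the new spec; the chunk IDs and names are made up for illustration.

import { NativeToolCallParser } from "../../core/assistant-message/NativeToolCallParser"

// Well-formed tool call: id and name arrive together, so tool_call_start is
// emitted and the call is registered for finalization.
NativeToolCallParser.processRawChunk({ index: 0, id: "call_ok", name: "read_file" })

// Malformed tool call: id but no name. tool_call_start is never emitted, so the
// call is tracked but never registered.
NativeToolCallParser.processRawChunk({ index: 1, id: "call_missing_name" })

// With the guard, finish_reason === "tool_calls" now ends only the started call;
// the unstarted one is skipped with a console.warn instead of being finalized
// into a tool call that would never produce a tool_result.
const events = NativeToolCallParser.processFinishReason("tool_calls")
// events => [{ type: "tool_call_end", id: "call_ok" }]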