43 changes: 41 additions & 2 deletions src/api/providers/base-openai-compatible-provider.ts
@@ -7,13 +7,15 @@ import { type ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/ap
import { TagMatcher } from "../../utils/tag-matcher"
import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
import { convertToOpenAiMessages } from "../transform/openai-format"
import { convertToZAiFormat } from "../transform/zai-format"

import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index"
import { DEFAULT_HEADERS } from "./constants"
import { BaseProvider } from "./base-provider"
import { handleOpenAIError } from "./utils/openai-error-handler"
import { calculateApiCostOpenAI } from "../../shared/cost"
import { getApiRequestTimeout } from "./utils/timeout-config"
import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection"

type BaseOpenAiCompatibleProviderOptions<ModelName extends string> = ApiHandlerOptions & {
providerName: string
@@ -36,6 +38,7 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
protected readonly options: ApiHandlerOptions

protected client: OpenAI
protected glmConfig: GlmModelConfig | null = null

constructor({
providerName,
@@ -65,6 +68,13 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
defaultHeaders: DEFAULT_HEADERS,
timeout: getApiRequestTimeout(),
})

// Detect GLM model on construction if model ID is available
const modelId = this.options.apiModelId || ""
if (modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}
}

protected createStream(
@@ -75,6 +85,12 @@
) {
const { id: model, info } = this.getModel()

// Re-detect GLM model if not already done or if model ID changed
if (!this.glmConfig || this.glmConfig.originalModelId !== model) {
this.glmConfig = detectGlmModel(model)
logGlmDetection(this.providerName, model, this.glmConfig)
}

// Centralized cap: clamp to 20% of the context window (unless provider-specific exceptions apply)
const max_tokens =
getModelMaxOutputTokens({
@@ -86,23 +102,46 @@

const temperature = this.options.modelTemperature ?? info.defaultTemperature ?? this.defaultTemperature

// Convert messages based on whether this is a GLM model
// GLM models benefit from mergeToolResultText to prevent reasoning_content loss
const convertedMessages = this.glmConfig.isGlmModel
? convertToZAiFormat(messages, { mergeToolResultText: this.glmConfig.mergeToolResultText })
: convertToOpenAiMessages(messages)

// Determine parallel_tool_calls setting
// Disable for GLM models as they may not support it properly
let parallelToolCalls: boolean
if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) {
parallelToolCalls = false
console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`)
} else {
parallelToolCalls = metadata?.parallelToolCalls ?? true
}

const params: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
model,
max_tokens,
temperature,
messages: [{ role: "system", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
messages: [{ role: "system", content: systemPrompt }, ...convertedMessages],
stream: true,
stream_options: { include_usage: true },
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: parallelToolCalls,
}

// Add thinking parameter if reasoning is enabled and model supports it
if (this.options.enableReasoningEffort && info.supportsReasoningBinary) {
;(params as any).thinking = { type: "enabled" }
}

// For GLM-4.7 models with thinking support, add thinking parameter
if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) {
const useReasoning = this.options.enableReasoningEffort !== false // Default to enabled for GLM-4.7
;(params as any).thinking = useReasoning ? { type: "enabled" } : { type: "disabled" }
console.log(`[${this.providerName}] GLM-4.7 thinking mode: ${useReasoning ? "enabled" : "disabled"}`)
}

try {
return this.client.chat.completions.create(params, requestOptions)
} catch (error) {
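Note: the GlmModelConfig type and the detectGlmModel / logGlmDetection helpers imported from ./utils/glm-model-detection are not included in this diff. The following is a rough sketch of the shape implied by the call sites above; the field names come from usage in this PR, while the actual detection rules (including which IDs count as GLM-4.7) are an assumption.

// Hypothetical sketch only; the real implementation lives in
// src/api/providers/utils/glm-model-detection.ts and is not part of this diff.
export interface GlmModelConfig {
	isGlmModel: boolean
	originalModelId: string // used above to re-detect when the model ID changes
	mergeToolResultText: boolean // forwarded to convertToZAiFormat
	disableParallelToolCalls: boolean // forces parallel_tool_calls: false
	supportsThinking: boolean // GLM-4.7: adds the `thinking` request parameter
}

export function detectGlmModel(modelId: string): GlmModelConfig {
	// Assumption: detection is a case-insensitive substring match on the model ID.
	const isGlmModel = /glm/i.test(modelId)
	const supportsThinking = isGlmModel && modelId.includes("4.7") // assumption
	return {
		isGlmModel,
		originalModelId: modelId,
		mergeToolResultText: isGlmModel,
		disableParallelToolCalls: isGlmModel,
		supportsThinking,
	}
}

export function logGlmDetection(providerName: string, modelId: string, config: GlmModelConfig): void {
	if (config.isGlmModel) {
		console.log(`[${providerName}] Detected GLM model "${modelId}"`, config)
	}
}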
60 changes: 57 additions & 3 deletions src/api/providers/lm-studio.ts
@@ -10,18 +10,21 @@ import { NativeToolCallParser } from "../../core/assistant-message/NativeToolCal
import { TagMatcher } from "../../utils/tag-matcher"

import { convertToOpenAiMessages } from "../transform/openai-format"
import { convertToZAiFormat } from "../transform/zai-format"
import { ApiStream } from "../transform/stream"

import { BaseProvider } from "./base-provider"
import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index"
import { getModelsFromCache } from "./fetchers/modelCache"
import { getApiRequestTimeout } from "./utils/timeout-config"
import { handleOpenAIError } from "./utils/openai-error-handler"
import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection"

export class LmStudioHandler extends BaseProvider implements SingleCompletionHandler {
protected options: ApiHandlerOptions
private client: OpenAI
private readonly providerName = "LM Studio"
private glmConfig: GlmModelConfig | null = null

constructor(options: ApiHandlerOptions) {
super()
@@ -35,16 +38,37 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
apiKey: apiKey,
timeout: getApiRequestTimeout(),
})

// Detect GLM model on construction if model ID is available
const modelId = this.options.lmStudioModelId || ""
if (modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}
}

override async *createMessage(
systemPrompt: string,
messages: Anthropic.Messages.MessageParam[],
metadata?: ApiHandlerCreateMessageMetadata,
): ApiStream {
const model = this.getModel()

// Re-detect GLM model if not already done or if model ID changed
if (!this.glmConfig || this.glmConfig.originalModelId !== model.id) {
this.glmConfig = detectGlmModel(model.id)
logGlmDetection(this.providerName, model.id, this.glmConfig)
}

// Convert messages based on whether this is a GLM model
// GLM models benefit from mergeToolResultText to prevent reasoning_content loss
const convertedMessages = this.glmConfig.isGlmModel
? convertToZAiFormat(messages, { mergeToolResultText: this.glmConfig.mergeToolResultText })
: convertToOpenAiMessages(messages)

const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
{ role: "system", content: systemPrompt },
...convertToOpenAiMessages(messages),
...convertedMessages,
]

// -------------------------
@@ -83,20 +107,37 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
let assistantText = ""

try {
// Determine parallel_tool_calls setting
// Disable for GLM models as they may not support it properly
let parallelToolCalls: boolean
if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) {
parallelToolCalls = false
console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`)
} else {
parallelToolCalls = metadata?.parallelToolCalls ?? true
}

const params: OpenAI.Chat.ChatCompletionCreateParamsStreaming & { draft_model?: string } = {
model: this.getModel().id,
model: model.id,
messages: openAiMessages,
temperature: this.options.modelTemperature ?? LMSTUDIO_DEFAULT_TEMPERATURE,
stream: true,
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: parallelToolCalls,
}

if (this.options.lmStudioSpeculativeDecodingEnabled && this.options.lmStudioDraftModelId) {
params.draft_model = this.options.lmStudioDraftModelId
}

// For GLM-4.7 models with thinking support, add thinking parameter
if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) {
const useReasoning = this.options.enableReasoningEffort !== false // Default to enabled for GLM-4.7
;(params as any).thinking = useReasoning ? { type: "enabled" } : { type: "disabled" }
console.log(`[${this.providerName}] GLM-4.7 thinking mode: ${useReasoning ? "enabled" : "disabled"}`)
}

let results
try {
results = await this.client.chat.completions.create(params)
@@ -124,6 +165,19 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
}
}

// Handle reasoning_content for GLM models (similar to Z.ai)
if (delta) {
for (const key of ["reasoning_content", "reasoning"] as const) {
if (key in delta) {
const reasoning_content = ((delta as any)[key] as string | undefined) || ""
if (reasoning_content?.trim()) {
yield { type: "reasoning", text: reasoning_content }
}
break
}
}
}

// Handle tool calls in stream - emit partial chunks for NativeToolCallParser
if (delta?.tool_calls) {
for (const toolCall of delta.tool_calls) {
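Note: the reasoning_content handling added to the stream loop above can be read in isolation as a small generator. The sketch below is an illustration, not part of the PR, and assumes the delta shape used by GLM-style endpoints.

// Standalone sketch of the delta handling added above (illustration only).
type ReasoningDelta = { reasoning_content?: string; reasoning?: string }

function* extractReasoning(delta: ReasoningDelta): Generator<{ type: "reasoning"; text: string }> {
	// Check `reasoning_content` first and fall back to `reasoning`; the `break`
	// ensures only the first key present is consumed, mirroring the loop above.
	for (const key of ["reasoning_content", "reasoning"] as const) {
		if (key in delta) {
			const text = delta[key] ?? ""
			if (text.trim()) {
				yield { type: "reasoning", text }
			}
			break
		}
	}
}

// Example: [...extractReasoning({ reasoning_content: "checking the tool result" })]
// yields one chunk: { type: "reasoning", text: "checking the tool result" }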
77 changes: 71 additions & 6 deletions src/api/providers/openai.ts
@@ -15,6 +15,7 @@ import type { ApiHandlerOptions } from "../../shared/api"
import { TagMatcher } from "../../utils/tag-matcher"

import { convertToOpenAiMessages } from "../transform/openai-format"
import { convertToZAiFormat } from "../transform/zai-format"
import { convertToR1Format } from "../transform/r1-format"
import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
import { getModelParams } from "../transform/model-params"
@@ -24,14 +25,16 @@ import { BaseProvider } from "./base-provider"
import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index"
import { getApiRequestTimeout } from "./utils/timeout-config"
import { handleOpenAIError } from "./utils/openai-error-handler"
import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection"

// TODO: Rename this to OpenAICompatibleHandler. Also, I think the
// `OpenAINativeHandler` can subclass from this, since it's obviously
// compatible with the OpenAI API. We can also rename it to `OpenAIHandler`.
export class OpenAiHandler extends BaseProvider implements SingleCompletionHandler {
protected options: ApiHandlerOptions
protected client: OpenAI
private readonly providerName = "OpenAI"
private readonly providerName = "OpenAI Compatible"
private glmConfig: GlmModelConfig | null = null

constructor(options: ApiHandlerOptions) {
super()
@@ -77,6 +80,13 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
timeout,
})
}

// Detect GLM model on construction if model ID is available
const modelId = this.options.openAiModelId || ""
if (modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}
}

override async *createMessage(
@@ -91,6 +101,12 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
const isAzureAiInference = this._isAzureAiInference(modelUrl)
const deepseekReasoner = modelId.includes("deepseek-reasoner") || enabledR1Format

// Re-detect GLM model if not already done or if model ID changed
if (!this.glmConfig || this.glmConfig.originalModelId !== modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}

if (modelId.includes("o1") || modelId.includes("o3") || modelId.includes("o4")) {
yield* this.handleO3FamilyMessage(modelId, systemPrompt, messages, metadata)
return
@@ -106,6 +122,12 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl

if (deepseekReasoner) {
convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
} else if (this.glmConfig.isGlmModel) {
// GLM models benefit from mergeToolResultText to prevent reasoning_content loss
const glmConvertedMessages = convertToZAiFormat(messages, {
mergeToolResultText: this.glmConfig.mergeToolResultText,
})
convertedMessages = [systemMessage, ...glmConvertedMessages]
} else {
if (modelInfo.supportsPromptCache) {
systemMessage = {
@@ -152,6 +174,16 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl

const isGrokXAI = this._isGrokXAI(this.options.openAiBaseUrl)

// Determine parallel_tool_calls setting
// Disable for GLM models as they may not support it properly
let parallelToolCalls: boolean
if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) {
parallelToolCalls = false
console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`)
} else {
parallelToolCalls = metadata?.parallelToolCalls ?? true
}

const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
model: modelId,
temperature: this.options.modelTemperature ?? (deepseekReasoner ? DEEP_SEEK_DEFAULT_TEMPERATURE : 0),
@@ -161,12 +193,19 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
...(reasoning && reasoning),
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: parallelToolCalls,
}

// Add max_tokens if needed
this.addMaxTokensIfNeeded(requestOptions, modelInfo)

// For GLM-4.7 models with thinking support, add thinking parameter
if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) {
const useReasoning = this.options.enableReasoningEffort !== false // Default to enabled for GLM-4.7
;(requestOptions as any).thinking = useReasoning ? { type: "enabled" } : { type: "disabled" }
console.log(`[${this.providerName}] GLM-4.7 thinking mode: ${useReasoning ? "enabled" : "disabled"}`)
}

let stream
try {
stream = await this.client.chat.completions.create(
@@ -221,20 +260,46 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
yield this.processUsageMetrics(lastUsage, modelInfo)
}
} else {
// Determine message conversion based on model type
let nonStreamingMessages
if (deepseekReasoner) {
nonStreamingMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
} else if (this.glmConfig.isGlmModel) {
// GLM models benefit from mergeToolResultText to prevent reasoning_content loss
const glmConvertedMessages = convertToZAiFormat(messages, {
mergeToolResultText: this.glmConfig.mergeToolResultText,
})
nonStreamingMessages = [systemMessage, ...glmConvertedMessages]
} else {
nonStreamingMessages = [systemMessage, ...convertToOpenAiMessages(messages)]
}

// Determine parallel_tool_calls setting for non-streaming
let nonStreamingParallelToolCalls: boolean
if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) {
nonStreamingParallelToolCalls = false
} else {
nonStreamingParallelToolCalls = metadata?.parallelToolCalls ?? true
}

const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
model: modelId,
messages: deepseekReasoner
? convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
: [systemMessage, ...convertToOpenAiMessages(messages)],
messages: nonStreamingMessages,
// Tools are always present (minimum ALWAYS_AVAILABLE_TOOLS)
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: nonStreamingParallelToolCalls,
}

// Add max_tokens if needed
this.addMaxTokensIfNeeded(requestOptions, modelInfo)

// For GLM-4.7 models with thinking support, add thinking parameter
if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) {
const useReasoning = this.options.enableReasoningEffort !== false
;(requestOptions as any).thinking = useReasoning ? { type: "enabled" } : { type: "disabled" }
}

let response
try {
response = await this.client.chat.completions.create(
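Note: taken together, for a GLM-4.7 model the streaming request built by these handlers differs from the regular OpenAI-compatible path in three ways: messages go through convertToZAiFormat, parallel_tool_calls is forced to false, and a thinking field is attached. The sketch below shows the rough shape of that payload; the model ID, temperature, and tools are placeholders, not values from this PR.

// Illustrative shape only, assuming enableReasoningEffort is not explicitly disabled.
const exampleGlmRequest = {
	model: "glm-4.7", // hypothetical model ID
	temperature: 0, // placeholder
	messages: [
		{ role: "system", content: "..." }, // systemMessage
		// ...convertToZAiFormat(messages, { mergeToolResultText: true })
	],
	stream: true,
	stream_options: { include_usage: true },
	tools: [], // convertToolsForOpenAI(metadata?.tools)
	tool_choice: undefined,
	parallel_tool_calls: false, // glmConfig.disableParallelToolCalls
	thinking: { type: "enabled" }, // glmConfig.supportsThinking
	// plus max_tokens / reasoning fields depending on the handler
} as any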