57 changes: 55 additions & 2 deletions src/api/providers/base-openai-compatible-provider.ts
@@ -7,13 +7,15 @@ import { type ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/ap
import { TagMatcher } from "../../utils/tag-matcher"
import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
import { convertToOpenAiMessages } from "../transform/openai-format"
import { convertToZAiFormat } from "../transform/zai-format"

import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index"
import { DEFAULT_HEADERS } from "./constants"
import { BaseProvider } from "./base-provider"
import { handleOpenAIError } from "./utils/openai-error-handler"
import { calculateApiCostOpenAI } from "../../shared/cost"
import { getApiRequestTimeout } from "./utils/timeout-config"
import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection"

type BaseOpenAiCompatibleProviderOptions<ModelName extends string> = ApiHandlerOptions & {
providerName: string
@@ -36,6 +38,7 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
protected readonly options: ApiHandlerOptions

protected client: OpenAI
protected glmConfig: GlmModelConfig | null = null

constructor({
providerName,
@@ -65,6 +68,13 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
defaultHeaders: DEFAULT_HEADERS,
timeout: getApiRequestTimeout(),
})

// Detect GLM model on construction if model ID is available
const modelId = this.options.apiModelId || ""
if (modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}
}

protected createStream(
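
Reviewer note: both files import detectGlmModel, logGlmDetection, and GlmModelConfig from ./utils/glm-model-detection, which is not part of this diff. A minimal sketch of the shape those call sites imply is below; the field names come straight from the usage in this PR, but the matching rule and log format are assumptions, not the actual implementation.

// Sketch only: inferred from how the config is consumed in this diff.
export interface GlmModelConfig {
	isGlmModel: boolean
	originalModelId: string
	displayName: string
	supportsThinking: boolean
	mergeToolResultText: boolean
	disableParallelToolCalls: boolean
}

export function detectGlmModel(modelId: string): GlmModelConfig {
	// Assumption: GLM models are recognized by a "glm" substring in the ID, e.g. "glm-4.7".
	const normalized = modelId.toLowerCase()
	const isGlmModel = normalized.includes("glm")
	return {
		isGlmModel,
		originalModelId: modelId,
		displayName: modelId,
		supportsThinking: isGlmModel && normalized.includes("4.7"),
		mergeToolResultText: isGlmModel,
		disableParallelToolCalls: isGlmModel,
	}
}

export function logGlmDetection(providerName: string, modelId: string, config: GlmModelConfig | null): void {
	if (config?.isGlmModel) {
		console.log(`[${providerName}] Detected GLM model ${modelId} (thinking: ${config.supportsThinking})`)
	}
}
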
@@ -75,6 +85,12 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
) {
const { id: model, info } = this.getModel()

// Re-detect GLM model if not already done or if model ID changed
if (!this.glmConfig || this.glmConfig.originalModelId !== model) {
this.glmConfig = detectGlmModel(model)
logGlmDetection(this.providerName, model, this.glmConfig)
}

// Centralized cap: clamp to 20% of the context window (unless provider-specific exceptions apply)
const max_tokens =
getModelMaxOutputTokens({
@@ -86,23 +102,48 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>

const temperature = this.options.modelTemperature ?? info.defaultTemperature ?? this.defaultTemperature

// Convert messages based on whether this is a GLM model
// GLM models benefit from mergeToolResultText to prevent reasoning_content loss
const convertedMessages = this.glmConfig.isGlmModel
? convertToZAiFormat(messages, { mergeToolResultText: this.glmConfig.mergeToolResultText })
: convertToOpenAiMessages(messages)

// Determine parallel_tool_calls setting
// Disable for GLM models as they may not support it properly
let parallelToolCalls: boolean
if (this.glmConfig.isGlmModel && this.glmConfig.disableParallelToolCalls) {
parallelToolCalls = false
console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`)
} else {
parallelToolCalls = metadata?.parallelToolCalls ?? true
}

const params: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
model,
max_tokens,
temperature,
messages: [{ role: "system", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
messages: [{ role: "system", content: systemPrompt }, ...convertedMessages],
stream: true,
stream_options: { include_usage: true },
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: parallelToolCalls,
}

// Add thinking parameter if reasoning is enabled and model supports it
if (this.options.enableReasoningEffort && info.supportsReasoningBinary) {
;(params as any).thinking = { type: "enabled" }
}

// For GLM-4.7 models with thinking support, add thinking parameter
if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) {
const useReasoning = this.options.enableReasoningEffort !== false // Default to enabled for GLM-4.7
;(params as any).thinking = useReasoning ? { type: "enabled" } : { type: "disabled" }
console.log(
`[${this.providerName}] GLM thinking mode: ${useReasoning ? "enabled" : "disabled"} for ${this.glmConfig.displayName}`,
)
}

try {
return this.client.chat.completions.create(params, requestOptions)
} catch (error) {
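
One subtlety in the hunk above: a model can match both the supportsReasoningBinary block and the GLM thinking block, in which case the GLM block runs second and wins, and it defaults thinking to enabled unless enableReasoningEffort is explicitly false. A small self-contained restatement of that combined resolution, offered as a reading aid rather than code from the PR:

// Reading aid: how the two thinking blocks above resolve, in order.
type ThinkingParam = { type: "enabled" } | { type: "disabled" } | undefined

function resolveThinking(
	enableReasoningEffort: boolean | undefined,
	supportsReasoningBinary: boolean,
	glm: { isGlmModel: boolean; supportsThinking: boolean },
): ThinkingParam {
	let thinking: ThinkingParam
	// Generic reasoning toggle for models flagged supportsReasoningBinary.
	if (enableReasoningEffort && supportsReasoningBinary) {
		thinking = { type: "enabled" }
	}
	// GLM thinking models default to enabled unless reasoning is explicitly disabled.
	if (glm.isGlmModel && glm.supportsThinking) {
		thinking = enableReasoningEffort !== false ? { type: "enabled" } : { type: "disabled" }
	}
	return thinking
}
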
@@ -222,6 +263,12 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
async completePrompt(prompt: string): Promise<string> {
const { id: modelId, info: modelInfo } = this.getModel()

// Re-detect GLM model if not already done or if model ID changed
if (!this.glmConfig || this.glmConfig.originalModelId !== modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}

const params: OpenAI.Chat.Completions.ChatCompletionCreateParams = {
model: modelId,
messages: [{ role: "user", content: prompt }],
@@ -232,6 +279,12 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
;(params as any).thinking = { type: "enabled" }
}

// For GLM-4.7 models with thinking support, add thinking parameter
if (this.glmConfig.isGlmModel && this.glmConfig.supportsThinking) {
const useReasoning = this.options.enableReasoningEffort !== false
;(params as any).thinking = useReasoning ? { type: "enabled" } : { type: "disabled" }
}

try {
const response = await this.client.chat.completions.create(params)

62 changes: 58 additions & 4 deletions src/api/providers/lm-studio.ts
@@ -10,18 +10,21 @@ import { NativeToolCallParser } from "../../core/assistant-message/NativeToolCal
import { TagMatcher } from "../../utils/tag-matcher"

import { convertToOpenAiMessages } from "../transform/openai-format"
import { convertToZAiFormat } from "../transform/zai-format"
import { ApiStream } from "../transform/stream"

import { BaseProvider } from "./base-provider"
import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from "../index"
import { getModelsFromCache } from "./fetchers/modelCache"
import { getApiRequestTimeout } from "./utils/timeout-config"
import { handleOpenAIError } from "./utils/openai-error-handler"
import { detectGlmModel, logGlmDetection, type GlmModelConfig } from "./utils/glm-model-detection"

export class LmStudioHandler extends BaseProvider implements SingleCompletionHandler {
protected options: ApiHandlerOptions
private client: OpenAI
private readonly providerName = "LM Studio"
private glmConfig: GlmModelConfig | null = null

constructor(options: ApiHandlerOptions) {
super()
@@ -35,16 +38,37 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
apiKey: apiKey,
timeout: getApiRequestTimeout(),
})

// Detect GLM model on construction if model ID is available
const modelId = this.options.lmStudioModelId || ""
if (modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}
}

override async *createMessage(
systemPrompt: string,
messages: Anthropic.Messages.MessageParam[],
metadata?: ApiHandlerCreateMessageMetadata,
): ApiStream {
const modelId = this.getModel().id

// Re-detect GLM model if not already done or if model ID changed
if (!this.glmConfig || this.glmConfig.originalModelId !== modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}

// Convert messages based on whether this is a GLM model
// GLM models benefit from mergeToolResultText to prevent reasoning_content loss
const convertedMessages = this.glmConfig.isGlmModel
? convertToZAiFormat(messages, { mergeToolResultText: this.glmConfig.mergeToolResultText })
: convertToOpenAiMessages(messages)

const openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
{ role: "system", content: systemPrompt },
...convertToOpenAiMessages(messages),
...convertedMessages,
]

// -------------------------
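
The convertToZAiFormat helper and its mergeToolResultText option live in ../transform/zai-format and are not shown in this diff. The comment above says the option prevents reasoning_content loss by merging tool-result text; a rough, hypothetical sketch of what that merging could mean for a single Anthropic tool_result block (not the PR's actual implementation):

import { Anthropic } from "@anthropic-ai/sdk"

// Hypothetical illustration of the mergeToolResultText idea: collapse a tool
// result's text blocks into one string instead of an array of content parts.
function mergeToolResultText(block: Anthropic.Messages.ToolResultBlockParam): string {
	if (typeof block.content === "string") {
		return block.content
	}
	return (block.content ?? [])
		.filter((part): part is Anthropic.Messages.TextBlockParam => part.type === "text")
		.map((part) => part.text)
		.join("\n")
}
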
@@ -83,14 +107,24 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
let assistantText = ""

try {
// Determine parallel_tool_calls setting
// Disable for GLM models as they may not support it properly
let parallelToolCalls: boolean
if (this.glmConfig?.isGlmModel && this.glmConfig.disableParallelToolCalls) {
parallelToolCalls = false
console.log(`[${this.providerName}] parallel_tool_calls disabled for GLM model`)
} else {
parallelToolCalls = metadata?.parallelToolCalls ?? true
}

const params: OpenAI.Chat.ChatCompletionCreateParamsStreaming & { draft_model?: string } = {
model: this.getModel().id,
model: modelId,
messages: openAiMessages,
temperature: this.options.modelTemperature ?? LMSTUDIO_DEFAULT_TEMPERATURE,
stream: true,
tools: this.convertToolsForOpenAI(metadata?.tools),
tool_choice: metadata?.tool_choice,
parallel_tool_calls: metadata?.parallelToolCalls ?? true,
parallel_tool_calls: parallelToolCalls,
}

if (this.options.lmStudioSpeculativeDecodingEnabled && this.options.lmStudioDraftModelId) {
@@ -124,6 +158,14 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
}
}

// Handle reasoning_content for GLM models with thinking support
if (delta && this.glmConfig?.supportsThinking) {
const deltaAny = delta as any
if (deltaAny.reasoning_content) {
yield { type: "reasoning", text: deltaAny.reasoning_content }
}
}

// Handle tool calls in stream - emit partial chunks for NativeToolCallParser
if (delta?.tool_calls) {
for (const toolCall of delta.tool_calls) {
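
For context on the reasoning_content check a few lines above: that field is not part of the OpenAI SDK's delta typings, which is why the delta is cast to any before reading it. A GLM-style chunk and the value this handler yields for it would look roughly like this (payload text is illustrative):

// Illustrative only: a GLM-style streaming delta carrying reasoning_content.
const exampleDelta: any = {
	role: "assistant",
	content: null,
	reasoning_content: "Comparing both file versions before proposing an edit...",
}

// The check above surfaces it to the rest of the pipeline as a reasoning chunk:
const yielded = { type: "reasoning", text: exampleDelta.reasoning_content }
console.log(yielded)
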
@@ -186,10 +228,22 @@ export class LmStudioHandler extends BaseProvider implements SingleCompletionHan
}

async completePrompt(prompt: string): Promise<string> {
const modelId = this.getModel().id

// Re-detect GLM model if not already done or if model ID changed
if (!this.glmConfig || this.glmConfig.originalModelId !== modelId) {
this.glmConfig = detectGlmModel(modelId)
logGlmDetection(this.providerName, modelId, this.glmConfig)
}

try {
// Determine parallel_tool_calls setting for GLM models
const parallelToolCalls =
this.glmConfig?.isGlmModel && this.glmConfig.disableParallelToolCalls ? false : true

// Create params object with optional draft model
const params: any = {
model: this.getModel().id,
model: modelId,
messages: [{ role: "user", content: prompt }],
temperature: this.options.modelTemperature ?? LMSTUDIO_DEFAULT_TEMPERATURE,
stream: false,