diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index b8ca01240be..11b30f871a1 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -1057,8 +1057,8 @@ importers:
         specifier: ^5.0.5
         version: 5.0.5
       '@types/vscode':
-        specifier: ^1.84.0
-        version: 1.100.0
+        specifier: ^1.106.0
+        version: 1.108.1
       '@vscode/test-electron':
         specifier: ^2.5.2
         version: 2.5.2
@@ -4429,6 +4429,9 @@ packages:
   '@types/vscode@1.103.0':
     resolution: {integrity: sha512-o4hanZAQdNfsKecexq9L3eHICd0AAvdbLk6hA60UzGXbGH/q8b/9xv2RgR7vV3ZcHuyKVq7b37IGd/+gM4Tu+Q==}
 
+  '@types/vscode@1.108.1':
+    resolution: {integrity: sha512-DerV0BbSzt87TbrqmZ7lRDIYaMiqvP8tmJTzW2p49ZBVtGUnGAu2RGQd1Wv4XMzEVUpaHbsemVM5nfuQJj7H6w==}
+
   '@types/ws@8.18.1':
     resolution: {integrity: sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==}
 
@@ -14438,6 +14441,8 @@ snapshots:
 
   '@types/vscode@1.103.0': {}
 
+  '@types/vscode@1.108.1': {}
+
   '@types/ws@8.18.1':
     dependencies:
       '@types/node': 24.2.1
diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts
index 305305d2289..a16bb1dd27f 100644
--- a/src/api/providers/__tests__/vscode-lm.spec.ts
+++ b/src/api/providers/__tests__/vscode-lm.spec.ts
@@ -1,4 +1,5 @@
 import type { Mock } from "vitest"
+import { checkModelSupportsImages, IMAGE_CAPABLE_MODEL_PREFIXES } from "../vscode-lm"
 
 // Mocks must come first, before imports
 vi.mock("vscode", () => {
@@ -537,3 +538,92 @@ describe("VsCodeLmHandler", () => {
 		})
 	})
 })
+
+describe("checkModelSupportsImages", () => {
+	describe("OpenAI GPT models", () => {
+		it("should return true for all gpt-* models (GitHub Copilot)", () => {
+			// All GPT models in GitHub Copilot support images
+			expect(checkModelSupportsImages("gpt", "gpt-4o")).toBe(true)
+			expect(checkModelSupportsImages("gpt", "gpt-4.1")).toBe(true)
+			expect(checkModelSupportsImages("gpt", "gpt-5")).toBe(true)
+			expect(checkModelSupportsImages("gpt", "gpt-5.1")).toBe(true)
+			expect(checkModelSupportsImages("gpt", "gpt-5.2")).toBe(true)
+			expect(checkModelSupportsImages("gpt-mini", "gpt-5-mini")).toBe(true)
+			expect(checkModelSupportsImages("gpt-codex", "gpt-5.1-codex")).toBe(true)
+			expect(checkModelSupportsImages("gpt-codex", "gpt-5.2-codex")).toBe(true)
+			expect(checkModelSupportsImages("gpt-codex", "gpt-5.1-codex-max")).toBe(true)
+			expect(checkModelSupportsImages("gpt-codex", "gpt-5.1-codex-mini")).toBe(true)
+		})
+
+		it("should return true for o1 and o3 reasoning models", () => {
+			expect(checkModelSupportsImages("o1", "o1-preview")).toBe(true)
+			expect(checkModelSupportsImages("o1", "o1-mini")).toBe(true)
+			expect(checkModelSupportsImages("o3", "o3")).toBe(true)
+		})
+	})
+
+	describe("Anthropic Claude models", () => {
+		it("should return true for all claude-* models (GitHub Copilot)", () => {
+			// All Claude models in GitHub Copilot support images
+			expect(checkModelSupportsImages("claude-haiku", "claude-haiku-4.5")).toBe(true)
+			expect(checkModelSupportsImages("claude-opus", "claude-opus-4.5")).toBe(true)
+			expect(checkModelSupportsImages("claude-sonnet", "claude-sonnet-4")).toBe(true)
+			expect(checkModelSupportsImages("claude-sonnet", "claude-sonnet-4.5")).toBe(true)
+		})
+	})
+
+	describe("Google Gemini models", () => {
+		it("should return true for all gemini-* models (GitHub Copilot)", () => {
+			// All Gemini models in GitHub Copilot support images
+			expect(checkModelSupportsImages("gemini-pro", "gemini-2.5-pro")).toBe(true)
+			expect(checkModelSupportsImages("gemini-flash", "gemini-3-flash-preview")).toBe(true)
expect(checkModelSupportsImages("gemini-pro", "gemini-3-pro-preview")).toBe(true) + }) + }) + + describe("non-vision models", () => { + it("should return false for grok models (text-only in GitHub Copilot)", () => { + // Grok is the only model family in GitHub Copilot that doesn't support images + expect(checkModelSupportsImages("grok", "grok-code-fast-1")).toBe(false) + }) + + it("should return false for models with non-matching prefixes", () => { + // Models that don't start with gpt, claude, gemini, o1, or o3 + expect(checkModelSupportsImages("mistral", "mistral-large")).toBe(false) + expect(checkModelSupportsImages("llama", "llama-3-70b")).toBe(false) + expect(checkModelSupportsImages("unknown", "some-random-model")).toBe(false) + }) + }) + + describe("case insensitivity", () => { + it("should match regardless of case", () => { + expect(checkModelSupportsImages("GPT", "GPT-4O")).toBe(true) + expect(checkModelSupportsImages("CLAUDE", "CLAUDE-SONNET-4")).toBe(true) + expect(checkModelSupportsImages("GEMINI", "GEMINI-2.5-PRO")).toBe(true) + }) + }) + + describe("prefix matching", () => { + it("should only match IDs that start with known prefixes", () => { + // ID must START with the prefix, not just contain it + expect(checkModelSupportsImages("custom", "gpt-4o")).toBe(true) // ID starts with gpt + expect(checkModelSupportsImages("custom", "my-gpt-model")).toBe(false) // gpt not at start + expect(checkModelSupportsImages("custom", "not-claude-model")).toBe(false) // claude not at start + }) + }) +}) + +describe("IMAGE_CAPABLE_MODEL_PREFIXES", () => { + it("should export the model prefixes array", () => { + expect(Array.isArray(IMAGE_CAPABLE_MODEL_PREFIXES)).toBe(true) + expect(IMAGE_CAPABLE_MODEL_PREFIXES.length).toBeGreaterThan(0) + }) + + it("should include key model prefixes", () => { + expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("gpt") + expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("claude") + expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("gemini") + expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("o1") + expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("o3") + }) +}) diff --git a/src/api/providers/vscode-lm.ts b/src/api/providers/vscode-lm.ts index 8fb564a9d59..953a4fe84c2 100644 --- a/src/api/providers/vscode-lm.ts +++ b/src/api/providers/vscode-lm.ts @@ -529,6 +529,10 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan const modelId = this.client.id || modelParts.join(SELECTOR_SEPARATOR) + // Check if the model supports images based on known model families + // VS Code Language Model API 1.106+ supports image inputs via LanguageModelDataPart + const supportsImages = checkModelSupportsImages(this.client.family, this.client.id) + // Build model info with conservative defaults for missing values const modelInfo: ModelInfo = { maxTokens: -1, // Unlimited tokens by default @@ -536,7 +540,7 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan typeof this.client.maxInputTokens === "number" ? Math.max(0, this.client.maxInputTokens) : openAiModelInfoSaneDefaults.contextWindow, - supportsImages: false, // VSCode Language Model API currently doesn't support image inputs + supportsImages, supportsPromptCache: true, inputPrice: 0, outputPrice: 0, @@ -586,8 +590,43 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan } } -// Static blacklist of VS Code Language Model IDs that should be excluded from the model list e.g. 
+
+// Static blacklist of VS Code Language Model IDs that should be excluded from the model list
+// e.g. because they don't support native tool calling or will never work
+const VSCODE_LM_STATIC_BLACKLIST: string[] = [
+	"claude-3.7-sonnet",
+	"claude-3.7-sonnet-thought",
+	"claude-opus-41", // Does not support native tool calling
+]
 
 export async function getVsCodeLmModels() {
 	try {
diff --git a/src/api/transform/__tests__/vscode-lm-format.spec.ts b/src/api/transform/__tests__/vscode-lm-format.spec.ts
index e60860b5491..edbfaa68b30 100644
--- a/src/api/transform/__tests__/vscode-lm-format.spec.ts
+++ b/src/api/transform/__tests__/vscode-lm-format.spec.ts
@@ -26,7 +26,13 @@ interface MockLanguageModelToolCallPart {
 interface MockLanguageModelToolResultPart {
 	type: "tool_result"
 	callId: string
-	content: MockLanguageModelTextPart[]
+	content: (MockLanguageModelTextPart | MockLanguageModelDataPart)[]
+}
+
+interface MockLanguageModelDataPart {
+	type: "data"
+	data: Uint8Array
+	mimeType: string
 }
 
 // Mock vscode namespace
@@ -54,10 +60,32 @@ vitest.mock("vscode", () => {
 	class MockLanguageModelToolResultPart {
 		type = "tool_result"
 		constructor(
 			public callId: string,
-			public content: MockLanguageModelTextPart[],
+			public content: (MockLanguageModelTextPart | MockLanguageModelDataPart)[],
 		) {}
 	}
 
+	class MockLanguageModelDataPart {
+		type = "data"
+		constructor(
+			public data: Uint8Array,
+			public mimeType: string,
+		) {}
+
+		static image(data: Uint8Array, mime: string) {
+			return new MockLanguageModelDataPart(data, mime)
+		}
+
+		static json(value: any, mime?: string) {
+			const bytes = new TextEncoder().encode(JSON.stringify(value))
+			return new MockLanguageModelDataPart(bytes, mime || "application/json")
+		}
+
+		static text(value: string, mime?: string) {
+			const bytes = new TextEncoder().encode(value)
+			return new MockLanguageModelDataPart(bytes, mime || "text/plain")
+		}
+	}
+
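+	// Note: a minimal stand-in for vscode.LanguageModelDataPart. It mirrors only the
+	// static image()/json()/text() factories and the data/mimeType fields used by the
+	// conversion code, not the full VS Code 1.106+ API surface.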
 	return {
 		LanguageModelChatMessage: {
 			Assistant: vitest.fn((content) => ({
@@ -75,6 +103,7 @@ vitest.mock("vscode", () => {
 		LanguageModelTextPart: MockLanguageModelTextPart,
 		LanguageModelToolCallPart: MockLanguageModelToolCallPart,
 		LanguageModelToolResultPart: MockLanguageModelToolResultPart,
+		LanguageModelDataPart: MockLanguageModelDataPart,
 	}
 })
@@ -150,7 +179,7 @@ describe("convertToVsCodeLmMessages", () => {
 		expect(toolCall.type).toBe("tool_call")
 	})
 
-	it("should handle image blocks with appropriate placeholders", () => {
+	it("should convert image blocks to LanguageModelDataPart", () => {
 		const messages: Anthropic.Messages.MessageParam[] = [
 			{
 				role: "user",
@@ -161,7 +190,7 @@
 						source: {
 							type: "base64",
 							media_type: "image/png",
-							data: "base64data",
+							data: "dGVzdA==", // "test" in base64
 						},
 					},
 				],
@@ -171,8 +200,123 @@
 		const result = convertToVsCodeLmMessages(messages)
 
 		expect(result).toHaveLength(1)
+		expect(result[0].content).toHaveLength(2)
+
+		// First part should be text
+		const textPart = result[0].content[0] as MockLanguageModelTextPart
+		expect(textPart.type).toBe("text")
+		expect(textPart.value).toBe("Look at this:")
+
+		// Second part should be a LanguageModelDataPart for the image
+		const imagePart = result[0].content[1] as unknown as MockLanguageModelDataPart
+		expect(imagePart.type).toBe("data")
+		expect(imagePart.mimeType).toBe("image/png")
+		expect(imagePart.data).toBeInstanceOf(Uint8Array)
+	})
+
+	it("should handle images in tool results", () => {
+		const messages: Anthropic.Messages.MessageParam[] = [
+			{
+				role: "user",
+				content: [
+					{
+						type: "tool_result",
+						tool_use_id: "tool-1",
+						content: [
+							{ type: "text", text: "Screenshot result:" },
+							{
+								type: "image",
+								source: {
+									type: "base64",
+									media_type: "image/jpeg",
+									data: "dGVzdA==",
+								},
+							},
+						],
+					},
+				],
+			},
+		]
+
+		const result = convertToVsCodeLmMessages(messages)
+
+		expect(result).toHaveLength(1)
+		expect(result[0].content).toHaveLength(1)
+
+		const toolResult = result[0].content[0] as MockLanguageModelToolResultPart
+		expect(toolResult.type).toBe("tool_result")
+		expect(toolResult.content).toHaveLength(2)
+
+		// First item in the tool result should be text
+		const textPart = toolResult.content[0] as MockLanguageModelTextPart
+		expect(textPart.type).toBe("text")
+
+		// Second item should be image data
+		const imagePart = toolResult.content[1] as MockLanguageModelDataPart
+		expect(imagePart.type).toBe("data")
+		expect(imagePart.mimeType).toBe("image/jpeg")
+	})
+
+	it("should return text placeholder for URL-based images", () => {
+		const messages: Anthropic.Messages.MessageParam[] = [
+			{
+				role: "user",
+				content: [
+					{ type: "text", text: "Check this image:" },
+					{
+						type: "image",
+						source: {
+							type: "url",
+							url: "https://example.com/image.png",
+						} as any,
+					},
+				],
+			},
+		]
+
+		const result = convertToVsCodeLmMessages(messages)
+
+		expect(result).toHaveLength(1)
+		expect(result[0].content).toHaveLength(2)
+
+		// First part should be text
+		const textPart = result[0].content[0] as MockLanguageModelTextPart
+		expect(textPart.type).toBe("text")
+		expect(textPart.value).toBe("Check this image:")
+
+		// Second part should be a text placeholder (not an empty DataPart)
 		const imagePlaceholder = result[0].content[1] as MockLanguageModelTextPart
-		expect(imagePlaceholder.value).toContain("[Image (base64): image/png not supported by VSCode LM API]")
+		expect(imagePlaceholder.type).toBe("text")
+		expect(imagePlaceholder.value).toContain("URL not supported")
+		expect(imagePlaceholder.value).toContain("https://example.com/image.png")
+	})
+
+	it("should return text placeholder for unknown image source types", () => {
+		const messages: Anthropic.Messages.MessageParam[] = [
+			{
+				role: "user",
+				content: [
+					{
+						type: "image",
+						source: {
+							type: "unknown",
+							media_type: "image/png",
+							data: "", // Required by type but ignored for unknown source types
+						} as any,
+					},
+				],
+			},
+		]
+
+		const result = convertToVsCodeLmMessages(messages)
+
+		expect(result).toHaveLength(1)
+		expect(result[0].content).toHaveLength(1)
+
+		// Should return a text placeholder for unknown source types
+		const placeholder = result[0].content[0] as MockLanguageModelTextPart
+		expect(placeholder.type).toBe("text")
+		expect(placeholder.value).toContain("unsupported source type")
 	})
 })
diff --git a/src/api/transform/vscode-lm-format.ts b/src/api/transform/vscode-lm-format.ts
index 388197c2c2c..4066b1fdc4a 100644
--- a/src/api/transform/vscode-lm-format.ts
+++ b/src/api/transform/vscode-lm-format.ts
@@ -28,6 +28,46 @@ function asObjectSafe(value: any): object {
 	}
 }
 
+/**
+ * Converts an Anthropic image block to a VS Code LanguageModelDataPart or TextPart.
+ * Uses the LanguageModelDataPart.image() API available in VS Code 1.106+.
+ * @param imageBlock The Anthropic image block param
+ * @returns A LanguageModelDataPart for the image, or a TextPart placeholder if the image cannot be converted
+ */
+function convertImageToDataPart(
+	imageBlock: Anthropic.ImageBlockParam,
+): vscode.LanguageModelDataPart | vscode.LanguageModelTextPart {
+	const source = imageBlock.source
+
+	if (source.type === "base64") {
+		// media_type only exists on base64 sources, so read it after narrowing
+		const mediaType = source.media_type || "image/png"
+		// Convert the base64 string to a Uint8Array
+		const binaryString = atob(source.data)
+		const bytes = new Uint8Array(binaryString.length)
+		for (let i = 0; i < binaryString.length; i++) {
+			bytes[i] = binaryString.charCodeAt(i)
+		}
+		return vscode.LanguageModelDataPart.image(bytes, mediaType)
+	} else if (source.type === "url") {
+		// URL-based images cannot be converted directly - return a text placeholder
+		// explaining the limitation. URL images should be fetched and converted to base64 upstream.
+		console.warn(
+			"Roo Code: URL-based images are not supported by the VS Code LM API. " +
+				"Images must be provided as base64 data.",
+		)
+		return new vscode.LanguageModelTextPart(
+			`[Image from URL not supported: ${(source as any).url || "unknown URL"}. ` +
+				`VS Code LM API requires base64-encoded image data.]`,
+		)
+	}
+
+	// Fallback for unknown source types - return a text placeholder
+	console.warn(`Roo Code: Unknown image source type: ${(source as any).type}`)
+	return new vscode.LanguageModelTextPart(
+		`[Image with unsupported source type "${(source as any).type}" cannot be displayed]`,
+	)
+}
+
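+// Note: in a Node.js extension host, `new Uint8Array(Buffer.from(source.data, "base64"))`
+// would be an equivalent (and typically faster) way to decode; the atob() loop is used here
+// so the helper does not depend on Node-only globals.
+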
 export function convertToVsCodeLmMessages(
 	anthropicMessages: Anthropic.Messages.MessageParam[],
 ): vscode.LanguageModelChatMessage[] {
@@ -66,15 +106,13 @@ export function convertToVsCodeLmMessages(
 			const contentParts = [
 				// Convert tool messages to ToolResultParts
 				...toolMessages.map((toolMessage) => {
-					// Process tool result content into TextParts
-					const toolContentParts: vscode.LanguageModelTextPart[] =
+					// Process tool result content into TextParts or DataParts
+					const toolContentParts: (vscode.LanguageModelTextPart | vscode.LanguageModelDataPart)[] =
 						typeof toolMessage.content === "string"
 							? [new vscode.LanguageModelTextPart(toolMessage.content)]
 							: (toolMessage.content?.map((part) => {
 									if (part.type === "image") {
-										return new vscode.LanguageModelTextPart(
-											`[Image (${part.source?.type || "Unknown source-type"}): ${part.source?.media_type || "unknown media-type"} not supported by VSCode LM API]`,
-										)
+										return convertImageToDataPart(part)
 									}
 									return new vscode.LanguageModelTextPart(part.text)
 								}) ?? [new vscode.LanguageModelTextPart("")])
@@ -82,12 +120,10 @@
 					return new vscode.LanguageModelToolResultPart(toolMessage.tool_use_id, toolContentParts)
 				}),
 
-				// Convert non-tool messages to TextParts after tool messages
+				// Convert non-tool messages to TextParts or DataParts after tool messages
 				...nonToolMessages.map((part) => {
 					if (part.type === "image") {
-						return new vscode.LanguageModelTextPart(
-							`[Image (${part.source?.type || "Unknown source-type"}): ${part.source?.media_type || "unknown media-type"} not supported by VSCode LM API]`,
-						)
+						return convertImageToDataPart(part)
 					}
 					return new vscode.LanguageModelTextPart(part.text)
 				}),
diff --git a/src/package.json b/src/package.json
index bf4a009a946..e2255b83611 100644
--- a/src/package.json
+++ b/src/package.json
@@ -554,7 +554,7 @@
 		"@types/string-similarity": "^4.0.2",
 		"@types/tmp": "^0.2.6",
 		"@types/turndown": "^5.0.5",
-		"@types/vscode": "^1.84.0",
+		"@types/vscode": "^1.106.0",
 		"@vscode/test-electron": "^2.5.2",
 		"@vscode/vsce": "3.3.2",
 		"ai": "^6.0.0",
diff --git a/webview-ui/src/components/ui/hooks/useSelectedModel.ts b/webview-ui/src/components/ui/hooks/useSelectedModel.ts
index 8eac6fa7403..17914b55eeb 100644
--- a/webview-ui/src/components/ui/hooks/useSelectedModel.ts
+++ b/webview-ui/src/components/ui/hooks/useSelectedModel.ts
@@ -312,7 +312,8 @@ function getSelectedModel({
 				: vscodeLlmDefaultModelId
 			const modelFamily = apiConfiguration?.vsCodeLmModelSelector?.family ?? vscodeLlmDefaultModelId
 			const info = vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels]
-			return { id, info: { ...openAiModelInfoSaneDefaults, ...info, supportsImages: false } } // VSCode LM API currently doesn't support images.
+			// VS Code LM API 1.106+ supports images via LanguageModelDataPart; use the model's supportsImages capability
+			return { id, info: { ...openAiModelInfoSaneDefaults, ...info } }
 		}
 		case "cerebras": {
 			const id = apiConfiguration.apiModelId ?? defaultModelId