From 34aef21ceae2136605db86720a0f3c79c42ac772 Mon Sep 17 00:00:00 2001 From: Matt Rubens Date: Thu, 29 Jan 2026 01:41:00 -0500 Subject: [PATCH] Revert "feat(vscode-lm): add image support for VS Code LM API provider (#11065)" This reverts commit 49aac7ea00a4e67813bc1ab3a03aa68aa50186f5. --- pnpm-lock.yaml | 9 +- src/api/providers/__tests__/vscode-lm.spec.ts | 90 ---------- src/api/providers/vscode-lm.ts | 45 +---- .../__tests__/vscode-lm-format.spec.ts | 154 +----------------- src/api/transform/vscode-lm-format.ts | 54 +----- src/package.json | 2 +- .../components/ui/hooks/useSelectedModel.ts | 3 +- 7 files changed, 21 insertions(+), 336 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f2ddf62834a..32d5ddfba8c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1060,8 +1060,8 @@ importers: specifier: ^5.0.5 version: 5.0.5 '@types/vscode': - specifier: ^1.106.0 - version: 1.108.1 + specifier: ^1.84.0 + version: 1.100.0 '@vscode/test-electron': specifier: ^2.5.2 version: 2.5.2 @@ -4454,9 +4454,6 @@ packages: '@types/vscode@1.103.0': resolution: {integrity: sha512-o4hanZAQdNfsKecexq9L3eHICd0AAvdbLk6hA60UzGXbGH/q8b/9xv2RgR7vV3ZcHuyKVq7b37IGd/+gM4Tu+Q==} - '@types/vscode@1.108.1': - resolution: {integrity: sha512-DerV0BbSzt87TbrqmZ7lRDIYaMiqvP8tmJTzW2p49ZBVtGUnGAu2RGQd1Wv4XMzEVUpaHbsemVM5nfuQJj7H6w==} - '@types/ws@8.18.1': resolution: {integrity: sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==} @@ -14491,8 +14488,6 @@ snapshots: '@types/vscode@1.103.0': {} - '@types/vscode@1.108.1': {} - '@types/ws@8.18.1': dependencies: '@types/node': 24.2.1 diff --git a/src/api/providers/__tests__/vscode-lm.spec.ts b/src/api/providers/__tests__/vscode-lm.spec.ts index a16bb1dd27f..305305d2289 100644 --- a/src/api/providers/__tests__/vscode-lm.spec.ts +++ b/src/api/providers/__tests__/vscode-lm.spec.ts @@ -1,5 +1,4 @@ import type { Mock } from "vitest" -import { checkModelSupportsImages, IMAGE_CAPABLE_MODEL_PREFIXES } from "../vscode-lm" // Mocks must come first, before imports vi.mock("vscode", () => { @@ -538,92 +537,3 @@ describe("VsCodeLmHandler", () => { }) }) }) - -describe("checkModelSupportsImages", () => { - describe("OpenAI GPT models", () => { - it("should return true for all gpt-* models (GitHub Copilot)", () => { - // All GPT models in GitHub Copilot support images - expect(checkModelSupportsImages("gpt", "gpt-4o")).toBe(true) - expect(checkModelSupportsImages("gpt", "gpt-4.1")).toBe(true) - expect(checkModelSupportsImages("gpt", "gpt-5")).toBe(true) - expect(checkModelSupportsImages("gpt", "gpt-5.1")).toBe(true) - expect(checkModelSupportsImages("gpt", "gpt-5.2")).toBe(true) - expect(checkModelSupportsImages("gpt-mini", "gpt-5-mini")).toBe(true) - expect(checkModelSupportsImages("gpt-codex", "gpt-5.1-codex")).toBe(true) - expect(checkModelSupportsImages("gpt-codex", "gpt-5.2-codex")).toBe(true) - expect(checkModelSupportsImages("gpt-codex", "gpt-5.1-codex-max")).toBe(true) - expect(checkModelSupportsImages("gpt-codex", "gpt-5.1-codex-mini")).toBe(true) - }) - - it("should return true for o1 and o3 reasoning models", () => { - expect(checkModelSupportsImages("o1", "o1-preview")).toBe(true) - expect(checkModelSupportsImages("o1", "o1-mini")).toBe(true) - expect(checkModelSupportsImages("o3", "o3")).toBe(true) - }) - }) - - describe("Anthropic Claude models", () => { - it("should return true for all claude-* models (GitHub Copilot)", () => { - // All Claude models in GitHub Copilot support images - 
expect(checkModelSupportsImages("claude-haiku", "claude-haiku-4.5")).toBe(true) - expect(checkModelSupportsImages("claude-opus", "claude-opus-4.5")).toBe(true) - expect(checkModelSupportsImages("claude-sonnet", "claude-sonnet-4")).toBe(true) - expect(checkModelSupportsImages("claude-sonnet", "claude-sonnet-4.5")).toBe(true) - }) - }) - - describe("Google Gemini models", () => { - it("should return true for all gemini-* models (GitHub Copilot)", () => { - // All Gemini models in GitHub Copilot support images - expect(checkModelSupportsImages("gemini-pro", "gemini-2.5-pro")).toBe(true) - expect(checkModelSupportsImages("gemini-flash", "gemini-3-flash-preview")).toBe(true) - expect(checkModelSupportsImages("gemini-pro", "gemini-3-pro-preview")).toBe(true) - }) - }) - - describe("non-vision models", () => { - it("should return false for grok models (text-only in GitHub Copilot)", () => { - // Grok is the only model family in GitHub Copilot that doesn't support images - expect(checkModelSupportsImages("grok", "grok-code-fast-1")).toBe(false) - }) - - it("should return false for models with non-matching prefixes", () => { - // Models that don't start with gpt, claude, gemini, o1, or o3 - expect(checkModelSupportsImages("mistral", "mistral-large")).toBe(false) - expect(checkModelSupportsImages("llama", "llama-3-70b")).toBe(false) - expect(checkModelSupportsImages("unknown", "some-random-model")).toBe(false) - }) - }) - - describe("case insensitivity", () => { - it("should match regardless of case", () => { - expect(checkModelSupportsImages("GPT", "GPT-4O")).toBe(true) - expect(checkModelSupportsImages("CLAUDE", "CLAUDE-SONNET-4")).toBe(true) - expect(checkModelSupportsImages("GEMINI", "GEMINI-2.5-PRO")).toBe(true) - }) - }) - - describe("prefix matching", () => { - it("should only match IDs that start with known prefixes", () => { - // ID must START with the prefix, not just contain it - expect(checkModelSupportsImages("custom", "gpt-4o")).toBe(true) // ID starts with gpt - expect(checkModelSupportsImages("custom", "my-gpt-model")).toBe(false) // gpt not at start - expect(checkModelSupportsImages("custom", "not-claude-model")).toBe(false) // claude not at start - }) - }) -}) - -describe("IMAGE_CAPABLE_MODEL_PREFIXES", () => { - it("should export the model prefixes array", () => { - expect(Array.isArray(IMAGE_CAPABLE_MODEL_PREFIXES)).toBe(true) - expect(IMAGE_CAPABLE_MODEL_PREFIXES.length).toBeGreaterThan(0) - }) - - it("should include key model prefixes", () => { - expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("gpt") - expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("claude") - expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("gemini") - expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("o1") - expect(IMAGE_CAPABLE_MODEL_PREFIXES).toContain("o3") - }) -}) diff --git a/src/api/providers/vscode-lm.ts b/src/api/providers/vscode-lm.ts index 953a4fe84c2..8fb564a9d59 100644 --- a/src/api/providers/vscode-lm.ts +++ b/src/api/providers/vscode-lm.ts @@ -529,10 +529,6 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan const modelId = this.client.id || modelParts.join(SELECTOR_SEPARATOR) - // Check if the model supports images based on known model families - // VS Code Language Model API 1.106+ supports image inputs via LanguageModelDataPart - const supportsImages = checkModelSupportsImages(this.client.family, this.client.id) - // Build model info with conservative defaults for missing values const modelInfo: ModelInfo = { maxTokens: -1, // Unlimited tokens by default @@ -540,7 
+536,7 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan typeof this.client.maxInputTokens === "number" ? Math.max(0, this.client.maxInputTokens) : openAiModelInfoSaneDefaults.contextWindow, - supportsImages, + supportsImages: false, // VSCode Language Model API currently doesn't support image inputs supportsPromptCache: true, inputPrice: 0, outputPrice: 0, @@ -590,43 +586,8 @@ export class VsCodeLmHandler extends BaseProvider implements SingleCompletionHan } } -/** - * Model ID prefixes that support image inputs via VS Code Language Model API. - * These models support the LanguageModelDataPart.image() API introduced in VS Code 1.106+. - * - * All GitHub Copilot models with these prefixes support images. - * Only grok-* models don't support images (text only). - * - * Source: https://models.dev/api.json (github-copilot provider models) - */ -export const IMAGE_CAPABLE_MODEL_PREFIXES = [ - "gpt", // All GPT models (gpt-4o, gpt-4.1, gpt-5, gpt-5.1, gpt-5.2, gpt-5-mini, gpt-5.1-codex, etc.) - "claude", // All Claude models (claude-haiku-4.5, claude-opus-4.5, claude-sonnet-4, claude-sonnet-4.5) - "gemini", // All Gemini models (gemini-2.5-pro, gemini-3-flash-preview, gemini-3-pro-preview) - "o1", // OpenAI o1 reasoning models - "o3", // OpenAI o3 reasoning models -] - -/** - * Checks if a model supports image inputs based on its model ID. - * Uses prefix matching against known image-capable model families. - * - * @param _family The model family (unused, kept for API compatibility) - * @param id The model ID - * @returns true if the model supports image inputs - */ -export function checkModelSupportsImages(_family: string, id: string): boolean { - const idLower = id.toLowerCase() - return IMAGE_CAPABLE_MODEL_PREFIXES.some((prefix) => idLower.startsWith(prefix)) -} - -// Static blacklist of VS Code Language Model IDs that should be excluded from the model list -// e.g. because they don't support native tool calling or will never work -const VSCODE_LM_STATIC_BLACKLIST: string[] = [ - "claude-3.7-sonnet", - "claude-3.7-sonnet-thought", - "claude-opus-41", // Does not support native tool calling -] +// Static blacklist of VS Code Language Model IDs that should be excluded from the model list e.g. 
because they will never work +const VSCODE_LM_STATIC_BLACKLIST: string[] = ["claude-3.7-sonnet", "claude-3.7-sonnet-thought"] export async function getVsCodeLmModels() { try { diff --git a/src/api/transform/__tests__/vscode-lm-format.spec.ts b/src/api/transform/__tests__/vscode-lm-format.spec.ts index edbfaa68b30..e60860b5491 100644 --- a/src/api/transform/__tests__/vscode-lm-format.spec.ts +++ b/src/api/transform/__tests__/vscode-lm-format.spec.ts @@ -26,13 +26,7 @@ interface MockLanguageModelToolCallPart { interface MockLanguageModelToolResultPart { type: "tool_result" callId: string - content: (MockLanguageModelTextPart | MockLanguageModelDataPart)[] -} - -interface MockLanguageModelDataPart { - type: "data" - data: Uint8Array - mimeType: string + content: MockLanguageModelTextPart[] } // Mock vscode namespace @@ -60,32 +54,10 @@ vitest.mock("vscode", () => { type = "tool_result" constructor( public callId: string, - public content: (MockLanguageModelTextPart | MockLanguageModelDataPart)[], + public content: MockLanguageModelTextPart[], ) {} } - class MockLanguageModelDataPart { - type = "data" - constructor( - public data: Uint8Array, - public mimeType: string, - ) {} - - static image(data: Uint8Array, mime: string) { - return new MockLanguageModelDataPart(data, mime) - } - - static json(value: any, mime?: string) { - const bytes = new TextEncoder().encode(JSON.stringify(value)) - return new MockLanguageModelDataPart(bytes, mime || "application/json") - } - - static text(value: string, mime?: string) { - const bytes = new TextEncoder().encode(value) - return new MockLanguageModelDataPart(bytes, mime || "text/plain") - } - } - return { LanguageModelChatMessage: { Assistant: vitest.fn((content) => ({ @@ -103,7 +75,6 @@ vitest.mock("vscode", () => { LanguageModelTextPart: MockLanguageModelTextPart, LanguageModelToolCallPart: MockLanguageModelToolCallPart, LanguageModelToolResultPart: MockLanguageModelToolResultPart, - LanguageModelDataPart: MockLanguageModelDataPart, } }) @@ -179,7 +150,7 @@ describe("convertToVsCodeLmMessages", () => { expect(toolCall.type).toBe("tool_call") }) - it("should convert image blocks to LanguageModelDataPart", () => { + it("should handle image blocks with appropriate placeholders", () => { const messages: Anthropic.Messages.MessageParam[] = [ { role: "user", @@ -190,7 +161,7 @@ describe("convertToVsCodeLmMessages", () => { source: { type: "base64", media_type: "image/png", - data: "dGVzdA==", // "test" in base64 + data: "base64data", }, }, ], @@ -200,123 +171,8 @@ describe("convertToVsCodeLmMessages", () => { const result = convertToVsCodeLmMessages(messages) expect(result).toHaveLength(1) - expect(result[0].content).toHaveLength(2) - - // First part should be text - const textPart = result[0].content[0] as MockLanguageModelTextPart - expect(textPart.type).toBe("text") - expect(textPart.value).toBe("Look at this:") - - // Second part should be a LanguageModelDataPart for the image - const imagePart = result[0].content[1] as unknown as MockLanguageModelDataPart - expect(imagePart.type).toBe("data") - expect(imagePart.mimeType).toBe("image/png") - expect(imagePart.data).toBeInstanceOf(Uint8Array) - }) - - it("should handle images in tool results", () => { - const messages: Anthropic.Messages.MessageParam[] = [ - { - role: "user", - content: [ - { - type: "tool_result", - tool_use_id: "tool-1", - content: [ - { type: "text", text: "Screenshot result:" }, - { - type: "image", - source: { - type: "base64", - media_type: "image/jpeg", - data: "dGVzdA==", - }, - }, - 
], - }, - ], - }, - ] - - const result = convertToVsCodeLmMessages(messages) - - expect(result).toHaveLength(1) - expect(result[0].content).toHaveLength(1) - - const toolResult = result[0].content[0] as MockLanguageModelToolResultPart - expect(toolResult.type).toBe("tool_result") - expect(toolResult.content).toHaveLength(2) - - // First item in tool result should be text - const textPart = toolResult.content[0] as MockLanguageModelTextPart - expect(textPart.type).toBe("text") - - // Second item should be image data - const imagePart = toolResult.content[1] as MockLanguageModelDataPart - expect(imagePart.type).toBe("data") - expect(imagePart.mimeType).toBe("image/jpeg") - }) - - it("should return text placeholder for URL-based images", () => { - const messages: Anthropic.Messages.MessageParam[] = [ - { - role: "user", - content: [ - { type: "text", text: "Check this image:" }, - { - type: "image", - source: { - type: "url", - url: "https://example.com/image.png", - } as any, - }, - ], - }, - ] - - const result = convertToVsCodeLmMessages(messages) - - expect(result).toHaveLength(1) - expect(result[0].content).toHaveLength(2) - - // First part should be text - const textPart = result[0].content[0] as MockLanguageModelTextPart - expect(textPart.type).toBe("text") - expect(textPart.value).toBe("Check this image:") - - // Second part should be a text placeholder (not an empty DataPart) const imagePlaceholder = result[0].content[1] as MockLanguageModelTextPart - expect(imagePlaceholder.type).toBe("text") - expect(imagePlaceholder.value).toContain("URL not supported") - expect(imagePlaceholder.value).toContain("https://example.com/image.png") - }) - - it("should return text placeholder for unknown image source types", () => { - const messages: Anthropic.Messages.MessageParam[] = [ - { - role: "user", - content: [ - { - type: "image", - source: { - type: "unknown", - media_type: "image/png", - data: "", // Required by type but ignored for unknown source types - } as any, - }, - ], - }, - ] - - const result = convertToVsCodeLmMessages(messages) - - expect(result).toHaveLength(1) - expect(result[0].content).toHaveLength(1) - - // Should return a text placeholder for unknown source types - const placeholder = result[0].content[0] as MockLanguageModelTextPart - expect(placeholder.type).toBe("text") - expect(placeholder.value).toContain("unsupported source type") + expect(imagePlaceholder.value).toContain("[Image (base64): image/png not supported by VSCode LM API]") }) }) diff --git a/src/api/transform/vscode-lm-format.ts b/src/api/transform/vscode-lm-format.ts index 4066b1fdc4a..388197c2c2c 100644 --- a/src/api/transform/vscode-lm-format.ts +++ b/src/api/transform/vscode-lm-format.ts @@ -28,46 +28,6 @@ function asObjectSafe(value: any): object { } } -/** - * Converts an Anthropic image block to a VS Code LanguageModelDataPart or TextPart. - * Uses the new LanguageModelDataPart.image() API available in VS Code 1.106+. 
- * @param imageBlock The Anthropic image block param - * @returns A LanguageModelDataPart for the image, or TextPart if the image cannot be converted - */ -function convertImageToDataPart( - imageBlock: Anthropic.ImageBlockParam, -): vscode.LanguageModelDataPart | vscode.LanguageModelTextPart { - const source = imageBlock.source - const mediaType = source.media_type || "image/png" - - if (source.type === "base64") { - // Convert base64 string to Uint8Array - const binaryString = atob(source.data) - const bytes = new Uint8Array(binaryString.length) - for (let i = 0; i < binaryString.length; i++) { - bytes[i] = binaryString.charCodeAt(i) - } - return vscode.LanguageModelDataPart.image(bytes, mediaType) - } else if (source.type === "url") { - // URL-based images cannot be directly converted - return a text placeholder - // explaining the limitation. URL images should be fetched and converted to base64 upstream. - console.warn( - "Roo Code : URL-based images are not supported by the VS Code LM API. " + - "Images must be provided as base64 data.", - ) - return new vscode.LanguageModelTextPart( - `[Image from URL not supported: ${(source as any).url || "unknown URL"}. ` + - `VS Code LM API requires base64-encoded image data.]`, - ) - } - - // Fallback for unknown source types - return a text placeholder - console.warn(`Roo Code : Unknown image source type: ${(source as any).type}`) - return new vscode.LanguageModelTextPart( - `[Image with unsupported source type "${(source as any).type}" cannot be displayed]`, - ) -} - export function convertToVsCodeLmMessages( anthropicMessages: Anthropic.Messages.MessageParam[], ): vscode.LanguageModelChatMessage[] { @@ -106,13 +66,15 @@ export function convertToVsCodeLmMessages( const contentParts = [ // Convert tool messages to ToolResultParts ...toolMessages.map((toolMessage) => { - // Process tool result content into TextParts or DataParts - const toolContentParts: (vscode.LanguageModelTextPart | vscode.LanguageModelDataPart)[] = + // Process tool result content into TextParts + const toolContentParts: vscode.LanguageModelTextPart[] = typeof toolMessage.content === "string" ? [new vscode.LanguageModelTextPart(toolMessage.content)] : (toolMessage.content?.map((part) => { if (part.type === "image") { - return convertImageToDataPart(part) + return new vscode.LanguageModelTextPart( + `[Image (${part.source?.type || "Unknown source-type"}): ${part.source?.media_type || "unknown media-type"} not supported by VSCode LM API]`, + ) } return new vscode.LanguageModelTextPart(part.text) }) ?? 
[new vscode.LanguageModelTextPart("")]) @@ -120,10 +82,12 @@ export function convertToVsCodeLmMessages( return new vscode.LanguageModelToolResultPart(toolMessage.tool_use_id, toolContentParts) }), - // Convert non-tool messages to TextParts or DataParts after tool messages + // Convert non-tool messages to TextParts after tool messages ...nonToolMessages.map((part) => { if (part.type === "image") { - return convertImageToDataPart(part) + return new vscode.LanguageModelTextPart( + `[Image (${part.source?.type || "Unknown source-type"}): ${part.source?.media_type || "unknown media-type"} not supported by VSCode LM API]`, + ) } return new vscode.LanguageModelTextPart(part.text) }), diff --git a/src/package.json b/src/package.json index c067eb68fff..736ffdea131 100644 --- a/src/package.json +++ b/src/package.json @@ -555,7 +555,7 @@ "@types/string-similarity": "^4.0.2", "@types/tmp": "^0.2.6", "@types/turndown": "^5.0.5", - "@types/vscode": "^1.106.0", + "@types/vscode": "^1.84.0", "@vscode/test-electron": "^2.5.2", "@vscode/vsce": "3.3.2", "ai": "^6.0.0", diff --git a/webview-ui/src/components/ui/hooks/useSelectedModel.ts b/webview-ui/src/components/ui/hooks/useSelectedModel.ts index 17914b55eeb..8eac6fa7403 100644 --- a/webview-ui/src/components/ui/hooks/useSelectedModel.ts +++ b/webview-ui/src/components/ui/hooks/useSelectedModel.ts @@ -312,8 +312,7 @@ function getSelectedModel({ : vscodeLlmDefaultModelId const modelFamily = apiConfiguration?.vsCodeLmModelSelector?.family ?? vscodeLlmDefaultModelId const info = vscodeLlmModels[modelFamily as keyof typeof vscodeLlmModels] - // VS Code LM API 1.106+ supports images via LanguageModelDataPart - use model's supportsImages capability - return { id, info: { ...openAiModelInfoSaneDefaults, ...info } } + return { id, info: { ...openAiModelInfoSaneDefaults, ...info, supportsImages: false } } // VSCode LM API currently doesn't support images. } case "cerebras": { const id = apiConfiguration.apiModelId ?? defaultModelId
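Note on the restored behavior: with this revert applied, Anthropic image blocks are no longer forwarded as vscode.LanguageModelDataPart. convertToVsCodeLmMessages() maps them back to text placeholders, and supportsImages is pinned to false for the provider. A minimal TypeScript sketch of that fallback, using a simplified stand-in for the Anthropic block type (not the full SDK typings):

    // Simplified stand-in for Anthropic.ImageBlockParam
    interface ImageBlock {
        type: "image"
        source?: { type?: string; media_type?: string }
    }

    // Mirrors the placeholder template restored in
    // src/api/transform/vscode-lm-format.ts
    function imagePlaceholder(part: ImageBlock): string {
        return `[Image (${part.source?.type || "Unknown source-type"}): ${part.source?.media_type || "unknown media-type"} not supported by VSCode LM API]`
    }

    // Example: a base64 PNG block yields
    // "[Image (base64): image/png not supported by VSCode LM API]",
    // which is the string the updated vscode-lm-format.spec.ts asserts.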