From e9cad30ac48bf506490aacaab7a8cd7e41df4882 Mon Sep 17 00:00:00 2001 From: dvaz-external Date: Fri, 13 Feb 2026 11:46:49 +0000 Subject: [PATCH 1/6] fix embedding dimensions for various models sizes Signed-off-by: dvaz-external --- database.ts | 34 ++++++++++++++++++++++------------ doc2vec.ts | 25 ++++++++++++++----------- package.json | 2 +- tests/database.test.ts | 33 ++++++++++++++++++--------------- tests/mcp-server.test.ts | 6 ++++-- tests/utils.test.ts | 28 ++++++++++++++++++++++++++++ utils.ts | 32 ++++++++++++++++++++++++++++++++ 7 files changed, 119 insertions(+), 41 deletions(-) diff --git a/database.ts b/database.ts index a4a4841..1346e48 100644 --- a/database.ts +++ b/database.ts @@ -19,7 +19,7 @@ import { export class DatabaseManager { private static columnCache: WeakMap = new WeakMap(); - static async initDatabase(config: SourceConfig, parentLogger: Logger): Promise { + static async initDatabase(config: SourceConfig, parentLogger: Logger, embeddingDimension: number = 3072): Promise { const logger = parentLogger.child('database'); const dbConfig = config.database_config; @@ -32,10 +32,10 @@ export class DatabaseManager { const db = new BetterSqlite3(dbPath, { allowExtension: true } as any); sqliteVec.load(db); - logger.debug(`Creating vec_items table if it doesn't exist`); + logger.debug(`Creating vec_items table if it doesn't exist (dimension: ${embeddingDimension})`); db.exec(` CREATE VIRTUAL TABLE IF NOT EXISTS vec_items USING vec0( - embedding FLOAT[3072], + embedding FLOAT[${embeddingDimension}], product_name TEXT, version TEXT, branch TEXT, @@ -61,7 +61,7 @@ export class DatabaseManager { logger.info(`Connecting to Qdrant at ${qdrantUrl}:${qdrantPort}, collection: ${collectionName}`); const qdrantClient = new QdrantClient({ url: qdrantUrl, apiKey: process.env.QDRANT_API_KEY, port: qdrantPort }); - await this.createCollectionQdrant(qdrantClient, collectionName, logger); + await this.createCollectionQdrant(qdrantClient, collectionName, logger, 
embeddingDimension); logger.info(`Qdrant connection established successfully`); return { client: qdrantClient, collectionName, type: 'qdrant' }; } else { @@ -71,7 +71,7 @@ export class DatabaseManager { } } - static async createCollectionQdrant(qdrantClient: QdrantClient, collectionName: string, logger: Logger) { + static async createCollectionQdrant(qdrantClient: QdrantClient, collectionName: string, logger: Logger, embeddingDimension: number = 3072) { try { logger.debug(`Checking if collection ${collectionName} exists`); const collections = await qdrantClient.getCollections(); @@ -84,10 +84,10 @@ export class DatabaseManager { return; } - logger.info(`Creating new collection ${collectionName}`); + logger.info(`Creating new collection ${collectionName} with dimension ${embeddingDimension}`); await qdrantClient.createCollection(collectionName, { vectors: { - size: 3072, + size: embeddingDimension, distance: "Cosine", }, }); @@ -177,7 +177,8 @@ export class DatabaseManager { dbConnection: DatabaseConnection, key: string, value: string, - logger: Logger + logger: Logger, + embeddingDimension: number = 3072 ): Promise { try { if (dbConnection.type === 'sqlite') { @@ -189,8 +190,7 @@ export class DatabaseManager { logger.debug(`Updated metadata value for ${key}`); } else if (dbConnection.type === 'qdrant') { const metadataUUID = Utils.generateMetadataUUID(key); - const dummyEmbeddingSize = 3072; - const dummyEmbedding = new Array(dummyEmbeddingSize).fill(0); + const dummyEmbedding = new Array(embeddingDimension).fill(0); const metadataPoint = { id: metadataUUID, vector: dummyEmbedding, @@ -278,9 +278,19 @@ export class DatabaseManager { logger.debug(`Using UUID: ${metadataUUID} for metadata`); + // Get the embedding dimension from the collection info + let embeddingDimension = 3072; // Default + try { + const collectionInfo = await dbConnection.client.getCollection(dbConnection.collectionName); + if (collectionInfo.config?.params?.vectors && 'size' in 
collectionInfo.config.params.vectors) { + embeddingDimension = collectionInfo.config.params.vectors.size; + } + } catch (error) { + logger.warn('Could not get collection info, using default dimension'); + } + // Generate a dummy embedding (all zeros) - const dummyEmbeddingSize = 3072; // Same size as your content embeddings - const dummyEmbedding = new Array(dummyEmbeddingSize).fill(0); + const dummyEmbedding = new Array(embeddingDimension).fill(0); // Create a point with special metadata payload const metadataPoint = { diff --git a/doc2vec.ts b/doc2vec.ts index ec6c641..963f968 100644 --- a/doc2vec.ts +++ b/doc2vec.ts @@ -37,6 +37,7 @@ export class Doc2Vec { private config: Config; private openai: OpenAI | AzureOpenAI; private embeddingModel: string; + private embeddingDimension: number; private contentProcessor: ContentProcessor; private logger: Logger; private configDir: string; @@ -77,7 +78,8 @@ export class Doc2Vec { apiVersion: azureApiVersion, }); this.embeddingModel = azureDeploymentName; - this.logger.info(`Using Azure OpenAI with deployment: ${azureDeploymentName}`); + this.embeddingDimension = Utils.getEmbeddingDimension(azureDeploymentName); + this.logger.info(`Using Azure OpenAI with deployment: ${azureDeploymentName} (${this.embeddingDimension} dimensions)`); } else { const openaiApiKey = embeddingConfig.openai?.api_key || process.env.OPENAI_API_KEY; const openaiModel = embeddingConfig.openai?.model || process.env.OPENAI_MODEL || 'text-embedding-3-large'; @@ -89,7 +91,8 @@ export class Doc2Vec { this.openai = new OpenAI({ apiKey: openaiApiKey }); this.embeddingModel = openaiModel; - this.logger.info(`Using OpenAI with model: ${openaiModel}`); + this.embeddingDimension = Utils.getEmbeddingDimension(openaiModel); + this.logger.info(`Using OpenAI with model: ${openaiModel} (${this.embeddingDimension} dimensions)`); } this.contentProcessor = new ContentProcessor(this.logger); @@ -397,7 +400,7 @@ export class Doc2Vec { const logger = 
parentLogger.child('process'); logger.info(`Starting processing for GitHub repo: ${config.repo}`); - const dbConnection = await DatabaseManager.initDatabase(config, logger); + const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension); // Initialize metadata storage await DatabaseManager.initDatabaseMetadata(dbConnection, logger); @@ -414,8 +417,8 @@ export class Doc2Vec { const logger = parentLogger.child('process'); logger.info(`Starting processing for website: ${config.url}`); - const dbConnection = await DatabaseManager.initDatabase(config, logger); - await DatabaseManager.initDatabaseMetadata(dbConnection, logger); + const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension); + await DatabaseManager.initDatabaseMetadata(dbConnection, logger); const validChunkIds: Set = new Set(); const visitedUrls: Set = new Set(); const urlPrefix = Utils.getUrlPrefix(config.url); @@ -539,7 +542,7 @@ export class Doc2Vec { const logger = parentLogger.child('process'); logger.info(`Starting processing for local directory: ${config.path}`); - const dbConnection = await DatabaseManager.initDatabase(config, logger); + const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension); const validChunkIds: Set = new Set(); const processedFiles: Set = new Set(); @@ -611,7 +614,7 @@ export class Doc2Vec { const logger = parentLogger.child('process'); logger.info(`Starting processing for code source (${config.source})`); - const dbConnection = await DatabaseManager.initDatabase(config, logger); + const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension); const validChunkIds: Set = new Set(); const processedFiles: Set = new Set(); @@ -765,10 +768,10 @@ export class Doc2Vec { } } - await DatabaseManager.setMetadataValue(dbConnection, fileListKey, JSON.stringify(currentList), logger); + await DatabaseManager.setMetadataValue(dbConnection, 
fileListKey, JSON.stringify(currentList), logger, this.embeddingDimension); if (lastMtimeKey) { const nextMtime = maxObservedMtime > 0 ? maxObservedMtime : Date.now(); - await DatabaseManager.setMetadataValue(dbConnection, lastMtimeKey, `${nextMtime}`, logger); + await DatabaseManager.setMetadataValue(dbConnection, lastMtimeKey, `${nextMtime}`, logger, this.embeddingDimension); } } } else { @@ -785,7 +788,7 @@ export class Doc2Vec { const headSha = await this.getRepoHeadSha(basePath, logger); if (headSha) { const shaKey = this.buildCodeShaMetadataKey(config.repo as string, repoBranch); - await DatabaseManager.setMetadataValue(dbConnection, shaKey, headSha, logger); + await DatabaseManager.setMetadataValue(dbConnection, shaKey, headSha, logger, this.embeddingDimension); } } @@ -974,7 +977,7 @@ export class Doc2Vec { const logger = parentLogger.child('process'); logger.info(`Starting processing for Zendesk: ${config.zendesk_subdomain}.zendesk.com`); - const dbConnection = await DatabaseManager.initDatabase(config, logger); + const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension); // Initialize metadata storage await DatabaseManager.initDatabaseMetadata(dbConnection, logger); diff --git a/package.json b/package.json index b42711c..480d45b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "doc2vec", - "version": "2.4.0", + "version": "2.5.0", "type": "commonjs", "description": "", "main": "dist/doc2vec.js", diff --git a/tests/database.test.ts b/tests/database.test.ts index 8618c46..a1b651f 100644 --- a/tests/database.test.ts +++ b/tests/database.test.ts @@ -9,14 +9,17 @@ import * as path from 'path'; const testLogger = new Logger('test', { level: LogLevel.NONE }); +// Default embedding dimension for tests +const TEST_EMBEDDING_DIMENSION = 3072; + // Helper to create an in-memory SQLite database matching the app schema -function createTestDb(): BetterSqlite3.Database { +function 
createTestDb(embeddingDimension: number = TEST_EMBEDDING_DIMENSION): BetterSqlite3.Database { const db = new BetterSqlite3(':memory:', { allowExtension: true } as any); sqliteVec.load(db); db.exec(` CREATE VIRTUAL TABLE IF NOT EXISTS vec_items USING vec0( - embedding FLOAT[3072], + embedding FLOAT[${embeddingDimension}], product_name TEXT, version TEXT, branch TEXT, @@ -60,8 +63,8 @@ function createTestChunk(overrides: Partial { @@ -115,14 +118,14 @@ describe('DatabaseManager', () => { }); it('should set and get metadata values', async () => { - await DatabaseManager.setMetadataValue(conn, 'mykey', 'myvalue', testLogger); + await DatabaseManager.setMetadataValue(conn, 'mykey', 'myvalue', testLogger, TEST_EMBEDDING_DIMENSION); const value = await DatabaseManager.getMetadataValue(conn, 'mykey', undefined, testLogger); expect(value).toBe('myvalue'); }); it('should upsert metadata values', async () => { - await DatabaseManager.setMetadataValue(conn, 'key1', 'value1', testLogger); - await DatabaseManager.setMetadataValue(conn, 'key1', 'value2', testLogger); + await DatabaseManager.setMetadataValue(conn, 'key1', 'value1', testLogger, TEST_EMBEDDING_DIMENSION); + await DatabaseManager.setMetadataValue(conn, 'key1', 'value2', testLogger, TEST_EMBEDDING_DIMENSION); const value = await DatabaseManager.getMetadataValue(conn, 'key1', undefined, testLogger); expect(value).toBe('value2'); }); @@ -693,12 +696,12 @@ describe('DatabaseManager', () => { createCollection: vi.fn().mockResolvedValue({}), }; - await DatabaseManager.createCollectionQdrant(mockClient as any, 'test_col', testLogger); + await DatabaseManager.createCollectionQdrant(mockClient as any, 'test_col', testLogger, TEST_EMBEDDING_DIMENSION); expect(mockClient.createCollection).toHaveBeenCalledOnce(); expect(mockClient.createCollection).toHaveBeenCalledWith('test_col', expect.objectContaining({ vectors: expect.objectContaining({ - size: 3072, + size: TEST_EMBEDDING_DIMENSION, distance: 'Cosine', }), })); @@ -805,7 
+808,7 @@ describe('DatabaseManager', () => { type: 'qdrant', }; - await DatabaseManager.setMetadataValue(qdrantDb, 'test_key', 'test_value', testLogger); + await DatabaseManager.setMetadataValue(qdrantDb, 'test_key', 'test_value', testLogger, TEST_EMBEDDING_DIMENSION); expect(mockClient.upsert).toHaveBeenCalledOnce(); const call = mockClient.upsert.mock.calls[0]; @@ -904,7 +907,7 @@ describe('DatabaseManager', () => { sqliteVec.load(db); db.exec(` CREATE VIRTUAL TABLE IF NOT EXISTS vec_items USING vec0( - embedding FLOAT[3072], + embedding FLOAT[${TEST_EMBEDDING_DIMENSION}], product_name TEXT, version TEXT, branch TEXT, @@ -1002,7 +1005,7 @@ describe('DatabaseManager', () => { } as SourceConfig; await expect( - DatabaseManager.initDatabase(config, testLogger) + DatabaseManager.initDatabase(config, testLogger, TEST_EMBEDDING_DIMENSION) ).rejects.toThrow('Unsupported database type: mongodb'); }); }); @@ -1091,7 +1094,7 @@ describe('DatabaseManager', () => { expect(point.payload.is_metadata).toBe(true); expect(point.payload.metadata_key).toBe('last_run_owner_repo'); expect(point.payload.metadata_value).toMatch(/^\d{4}-\d{2}-\d{2}T/); - expect(point.vector).toHaveLength(3072); + expect(point.vector).toHaveLength(TEST_EMBEDDING_DIMENSION); }); it('should handle upsert error gracefully', async () => { @@ -1312,7 +1315,7 @@ describe('DatabaseManager', () => { const conn: SqliteDB = { db: mockDb, type: 'sqlite' }; // Should not throw - error is caught internally - await DatabaseManager.setMetadataValue(conn, 'key', 'value', testLogger); + await DatabaseManager.setMetadataValue(conn, 'key', 'value', testLogger, TEST_EMBEDDING_DIMENSION); }); it('should handle Qdrant upsert error gracefully', async () => { @@ -1326,7 +1329,7 @@ describe('DatabaseManager', () => { }; // Should not throw - error is caught internally - await DatabaseManager.setMetadataValue(qdrantDb, 'key', 'value', testLogger); + await DatabaseManager.setMetadataValue(qdrantDb, 'key', 'value', testLogger, 
TEST_EMBEDDING_DIMENSION); }); }); diff --git a/tests/mcp-server.test.ts b/tests/mcp-server.test.ts index 0a7f5c5..f5df9d3 100644 --- a/tests/mcp-server.test.ts +++ b/tests/mcp-server.test.ts @@ -18,6 +18,8 @@ import { DatabaseManager } from '../database'; import { Logger, LogLevel } from '../logger'; import type { WebsiteSourceConfig } from '../types'; +const TEST_EMBEDDING_DIMENSION = 3072; + describe('MCP server helpers', () => { it('normalizes extensions to lowercase and dot-prefixed', () => { expect(normalizeExtensions(['ts', '.JS', 'Md'])).toEqual(['.ts', '.js', '.md']); @@ -310,7 +312,7 @@ describe('MCP server end-to-end', () => { sqliteVec.load(db); db.exec(` CREATE VIRTUAL TABLE IF NOT EXISTS vec_items USING vec0( - embedding FLOAT[3072], + embedding FLOAT[${TEST_EMBEDDING_DIMENSION}], product_name TEXT, version TEXT, branch TEXT, @@ -341,7 +343,7 @@ describe('MCP server end-to-end', () => { }; const chunks = await processor.chunkMarkdown(markdown, sourceConfig, baseUrl); - const embedding = new Array(3072).fill(0.1); + const embedding = new Array(TEST_EMBEDDING_DIMENSION).fill(0.1); for (const chunk of chunks) { chunk.metadata.branch = ''; chunk.metadata.repo = ''; diff --git a/tests/utils.test.ts b/tests/utils.test.ts index 83cfed3..f896d53 100644 --- a/tests/utils.test.ts +++ b/tests/utils.test.ts @@ -341,6 +341,34 @@ describe('Utils', () => { }); }); + // ─── getEmbeddingDimension ────────────────────────────────────── + describe('getEmbeddingDimension', () => { + it('should return 1536 for text-embedding-3-small', () => { + expect(Utils.getEmbeddingDimension('text-embedding-3-small')).toBe(1536); + }); + + it('should return 3072 for text-embedding-3-large', () => { + expect(Utils.getEmbeddingDimension('text-embedding-3-large')).toBe(3072); + }); + + it('should return 1536 for text-embedding-ada-002', () => { + expect(Utils.getEmbeddingDimension('text-embedding-ada-002')).toBe(1536); + }); + + it('should return 3072 for gemini models', () => { + 
expect(Utils.getEmbeddingDimension('gemini-embedding-001')).toBe(3072); + }); + + it('should be case-insensitive', () => { + expect(Utils.getEmbeddingDimension('TEXT-EMBEDDING-3-SMALL')).toBe(1536); + expect(Utils.getEmbeddingDimension('Text-Embedding-3-Large')).toBe(3072); + }); + + it('should return 1536 for unknown models', () => { + expect(Utils.getEmbeddingDimension('unknown-model')).toBe(1536); + }); + }); + // ─── shouldProcessUrl - invalid URL ───────────────────────────── describe('shouldProcessUrl - invalid URL', () => { it('should throw on invalid URL', () => { diff --git a/utils.ts b/utils.ts index ae60df8..57660b9 100644 --- a/utils.ts +++ b/utils.ts @@ -85,4 +85,36 @@ export class Utils { static tokenize(text: string): string[] { return text.split(/(\s+)/).filter(token => token.length > 0); } + + /** + * Get the embedding dimension for a given model name + * @param modelName The embedding model name (e.g., 'text-embedding-3-small', 'text-embedding-3-large') + * @returns The dimension size for the model + */ + static getEmbeddingDimension(modelName: string): number { + const modelLower = modelName.toLowerCase(); + + // OpenAI text-embedding-3-small produces 1536 dimensions + if (modelLower.includes('text-embedding-3-small')) { + return 1536; + } + + // OpenAI text-embedding-3-large and text-embedding-ada-002 produce 3072 and 1536 respectively + if (modelLower.includes('text-embedding-3-large')) { + return 3072; + } + + if (modelLower.includes('text-embedding-ada-002')) { + return 1536; + } + + // Gemini embedding models default to 3072 dimensions + if (modelLower.includes('gemini')) { + return 3072; + } + + // Default to 1536 for unknown models (most common) + console.warn(`Unknown embedding model: ${modelName}, defaulting to 1536 dimensions`); + return 1536; + } } \ No newline at end of file From 14332dd3eb25d8848411eb3d1d548926d1a8140f Mon Sep 17 00:00:00 2001 From: dvaz-external Date: Fri, 13 Feb 2026 12:03:38 +0000 Subject: [PATCH 2/6] update 
package version Signed-off-by: dvaz-external --- package-lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package-lock.json b/package-lock.json index 38d6ac8..75c7514 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "doc2vec", - "version": "2.2.0", + "version": "2.3.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "doc2vec", - "version": "2.2.0", + "version": "2.3.0", "license": "ISC", "dependencies": { "@chonkiejs/core": "^0.0.7", From 9ff2714327236041a4f44ea7f202df40cb21c23d Mon Sep 17 00:00:00 2001 From: dvaz-external Date: Fri, 13 Feb 2026 15:56:02 +0000 Subject: [PATCH 3/6] bump package version Signed-off-by: dvaz-external --- package-lock.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package-lock.json b/package-lock.json index 75c7514..745263e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "doc2vec", - "version": "2.3.0", + "version": "2.4.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "doc2vec", - "version": "2.3.0", + "version": "2.4.0", "license": "ISC", "dependencies": { "@chonkiejs/core": "^0.0.7", From b2ba5c040b51be13d494750f17c67639a23fa2af Mon Sep 17 00:00:00 2001 From: dvaz-external Date: Fri, 13 Feb 2026 16:58:46 +0000 Subject: [PATCH 4/6] fix test Signed-off-by: dvaz-external --- tests/doc2vec.test.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/doc2vec.test.ts b/tests/doc2vec.test.ts index 9c1afb1..8d928ff 100644 --- a/tests/doc2vec.test.ts +++ b/tests/doc2vec.test.ts @@ -134,6 +134,7 @@ vi.mock('../utils', () => ({ isValidUuid: vi.fn().mockReturnValue(false), hashToUuid: vi.fn().mockReturnValue('00000000-0000-0000-0000-000000000000'), getUrlPrefix: vi.fn().mockReturnValue('https://example.com'), + getEmbeddingDimension: vi.fn().mockReturnValue(3072), }, })); @@ -198,6 +199,9 @@ describe('Doc2Vec class', () => { // Provide a dummy API key so the 
constructor validation doesn't call process.exit process.env.OPENAI_API_KEY = 'test-key-for-tests'; + // Force OpenAI provider for tests (override any system default) + process.env.EMBEDDING_PROVIDER = 'openai'; + process.env.OPENAI_MODEL = 'text-embedding-3-large'; // Ensure test config directory exists if (!fs.existsSync(testConfigDir)) { @@ -217,6 +221,8 @@ describe('Doc2Vec class', () => { delete process.env.TEST_DOC2VEC_URL; delete process.env.TEST_DOC2VEC_API_KEY; delete process.env.OPENAI_API_KEY; + delete process.env.EMBEDDING_PROVIDER; + delete process.env.OPENAI_MODEL; }); // ───────────────────────────────────────────────────────────────────────── From 53ddbed75cd307a24a7e081de7018e91f093b202 Mon Sep 17 00:00:00 2001 From: dvaz-external Date: Mon, 16 Feb 2026 08:19:58 +0000 Subject: [PATCH 5/6] change embeddings dimension size reference method Signed-off-by: dvaz-external --- README.md | 8 ++++++++ config.yaml | 6 ++++++ content-processor.ts | 2 +- database.ts | 24 +++++++++--------------- doc2vec.ts | 29 ++++++++++++++++++++++++----- tests/database.test.ts | 8 ++++---- tests/doc2vec.test.ts | 1 - tests/utils.test.ts | 28 ---------------------------- types.ts | 1 + utils.ts | 38 +++++--------------------------------- 10 files changed, 58 insertions(+), 87 deletions(-) diff --git a/README.md b/README.md index 8b85b11..5b650bf 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,9 @@ Configuration is managed through two files: OPENAI_API_KEY="sk-..." OPENAI_MODEL="text-embedding-3-large" # Optional, defaults to text-embedding-3-large + # Optional: Embedding dimension size (defaults to 3072) + EMBEDDING_DIMENSION="3072" + # Required: Your Azure OpenAI credentials (if using Azure provider) AZURE_OPENAI_KEY="your-azure-key" AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com" @@ -223,6 +226,10 @@ Configuration is managed through two files: * `qdrant_port`: (Optional) Port for the Qdrant REST API. 
Defaults to `443` if `qdrant_url` starts with `https`, otherwise `6333`. * `collection_name`: (Optional) Name of the Qdrant collection to use. Defaults to `_` (lowercased, spaces replaced with underscores). + Optional embedding configuration: + * `embedding.provider`: Provider for embeddings (`openai` or `azure`). + * `embedding.dimension`: Embedding vector size. Defaults to `3072` when not set. + **Example (`config.yaml`):** ```yaml # Optional: Configure embedding provider @@ -230,6 +237,7 @@ Configuration is managed through two files: # Defaults to OpenAI if not specified embedding: provider: 'openai' # or 'azure' + dimension: 3072 # Optional, defaults to 3072 openai: api_key: '${OPENAI_API_KEY}' # Optional, uses env var by default model: 'text-embedding-3-large' # Optional, defaults to text-embedding-3-large diff --git a/config.yaml b/config.yaml index fa44773..57dc866 100644 --- a/config.yaml +++ b/config.yaml @@ -1,4 +1,10 @@ # Doc2Vec Configuration +embedding: + provider: 'openai' + dimension: 3072 + openai: + model: 'text-embedding-3-large' + sources: # GitHub Sources - type: github diff --git a/content-processor.ts b/content-processor.ts index 7da872b..ec1b758 100644 --- a/content-processor.ts +++ b/content-processor.ts @@ -619,7 +619,7 @@ export class ContentProcessor { let newLinksFound = 0; for (const href of result.links) { - const fullUrl = Utils.buildUrl(href, pageUrlForLinks); + const fullUrl = Utils.buildUrl(href, pageUrlForLinks, logger); if (fullUrl.startsWith(sourceConfig.url)) { addReferrer(fullUrl, pageUrlForLinks); if (!visitedUrls.has(Utils.normalizeUrl(fullUrl))) { diff --git a/database.ts b/database.ts index 1346e48..76895fe 100644 --- a/database.ts +++ b/database.ts @@ -19,7 +19,7 @@ import { export class DatabaseManager { private static columnCache: WeakMap = new WeakMap(); - static async initDatabase(config: SourceConfig, parentLogger: Logger, embeddingDimension: number = 3072): Promise { + static async initDatabase(config: SourceConfig, 
parentLogger: Logger, embeddingDimension: number): Promise { const logger = parentLogger.child('database'); const dbConfig = config.database_config; @@ -71,7 +71,7 @@ export class DatabaseManager { } } - static async createCollectionQdrant(qdrantClient: QdrantClient, collectionName: string, logger: Logger, embeddingDimension: number = 3072) { + static async createCollectionQdrant(qdrantClient: QdrantClient, collectionName: string, logger: Logger, embeddingDimension: number) { try { logger.debug(`Checking if collection ${collectionName} exists`); const collections = await qdrantClient.getCollections(); @@ -178,7 +178,7 @@ export class DatabaseManager { key: string, value: string, logger: Logger, - embeddingDimension: number = 3072 + embeddingDimension: number ): Promise { try { if (dbConnection.type === 'sqlite') { @@ -259,7 +259,12 @@ export class DatabaseManager { return defaultDate; } - static async updateLastRunDate(dbConnection: DatabaseConnection, repo: string, logger: Logger): Promise { + static async updateLastRunDate( + dbConnection: DatabaseConnection, + repo: string, + logger: Logger, + embeddingDimension: number + ): Promise { const now = new Date().toISOString(); try { @@ -278,17 +283,6 @@ export class DatabaseManager { logger.debug(`Using UUID: ${metadataUUID} for metadata`); - // Get the embedding dimension from the collection info - let embeddingDimension = 3072; // Default - try { - const collectionInfo = await dbConnection.client.getCollection(dbConnection.collectionName); - if (collectionInfo.config?.params?.vectors && 'size' in collectionInfo.config.params.vectors) { - embeddingDimension = collectionInfo.config.params.vectors.size; - } - } catch (error) { - logger.warn('Could not get collection info, using default dimension'); - } - // Generate a dummy embedding (all zeros) const dummyEmbedding = new Array(embeddingDimension).fill(0); diff --git a/doc2vec.ts b/doc2vec.ts index 963f968..5fb3bd0 100644 --- a/doc2vec.ts +++ b/doc2vec.ts @@ -25,7 
+25,8 @@ import { ZendeskSourceConfig, DatabaseConnection, DocumentChunk, - BrokenLink + BrokenLink, + EmbeddingConfig } from './types'; const GITHUB_TOKEN = process.env.GITHUB_PERSONAL_ACCESS_TOKEN; @@ -59,6 +60,7 @@ export class Doc2Vec { // Check environment variable if not specified in config const embeddingProvider = this.config.embedding?.provider || (process.env.EMBEDDING_PROVIDER as 'openai' | 'azure') || 'openai'; const embeddingConfig = this.config.embedding || { provider: embeddingProvider }; + this.embeddingDimension = this.resolveEmbeddingDimension(embeddingConfig); if (embeddingProvider === 'azure') { const azureApiKey = embeddingConfig.azure?.api_key || process.env.AZURE_OPENAI_KEY; @@ -78,7 +80,6 @@ export class Doc2Vec { apiVersion: azureApiVersion, }); this.embeddingModel = azureDeploymentName; - this.embeddingDimension = Utils.getEmbeddingDimension(azureDeploymentName); this.logger.info(`Using Azure OpenAI with deployment: ${azureDeploymentName} (${this.embeddingDimension} dimensions)`); } else { const openaiApiKey = embeddingConfig.openai?.api_key || process.env.OPENAI_API_KEY; @@ -91,7 +92,6 @@ export class Doc2Vec { this.openai = new OpenAI({ apiKey: openaiApiKey }); this.embeddingModel = openaiModel; - this.embeddingDimension = Utils.getEmbeddingDimension(openaiModel); this.logger.info(`Using OpenAI with model: ${openaiModel} (${this.embeddingDimension} dimensions)`); } @@ -141,6 +141,25 @@ export class Doc2Vec { } } + private resolveEmbeddingDimension(embeddingConfig: EmbeddingConfig | undefined): number { + const defaultDimension = 3072; + const rawConfigValue = embeddingConfig?.dimension; + const rawEnvValue = process.env.EMBEDDING_DIMENSION; + + const candidate = rawConfigValue ?? (rawEnvValue ? Number(rawEnvValue) : undefined); + if (candidate === undefined) { + return defaultDimension; + } + + const parsedValue = typeof candidate === 'string' ? 
Number(candidate) : candidate; + if (!Number.isFinite(parsedValue) || parsedValue <= 0 || !Number.isInteger(parsedValue)) { + this.logger.warn(`Invalid embedding dimension provided (${candidate}), falling back to ${defaultDimension}`); + return defaultDimension; + } + + return parsedValue; + } + public async run(): Promise { this.logger.section('PROCESSING SOURCES'); @@ -391,7 +410,7 @@ export class Doc2Vec { } // Update the last run date in the database after processing all issues - await DatabaseManager.updateLastRunDate(dbConnection, repo, logger); + await DatabaseManager.updateLastRunDate(dbConnection, repo, logger, this.embeddingDimension); logger.info(`Successfully processed ${issues.length} issues`); } @@ -1183,7 +1202,7 @@ export class Doc2Vec { } // Update the last run date in the database - await DatabaseManager.updateLastRunDate(dbConnection, `zendesk_tickets_${config.zendesk_subdomain}`, logger); + await DatabaseManager.updateLastRunDate(dbConnection, `zendesk_tickets_${config.zendesk_subdomain}`, logger, this.embeddingDimension); logger.info(`Successfully processed ${totalTickets} tickets`); } diff --git a/tests/database.test.ts b/tests/database.test.ts index a1b651f..32eabe4 100644 --- a/tests/database.test.ts +++ b/tests/database.test.ts @@ -152,7 +152,7 @@ describe('DatabaseManager', () => { }); it('should update and retrieve last run date', async () => { - await DatabaseManager.updateLastRunDate(conn, 'owner/repo', testLogger); + await DatabaseManager.updateLastRunDate(conn, 'owner/repo', testLogger, TEST_EMBEDDING_DIMENSION); const date = await DatabaseManager.getLastRunDate(conn, 'owner/repo', '2025-01-01T00:00:00Z', testLogger); // Should be an ISO date string, not the default expect(date).not.toBe('2025-01-01T00:00:00Z'); @@ -160,7 +160,7 @@ describe('DatabaseManager', () => { }); it('should normalize repo names in metadata keys', async () => { - await DatabaseManager.updateLastRunDate(conn, 'owner/repo', testLogger); + await 
DatabaseManager.updateLastRunDate(conn, 'owner/repo', testLogger, TEST_EMBEDDING_DIMENSION); // Check directly in db that the key uses underscore const result = db.prepare('SELECT key FROM vec_metadata WHERE key LIKE ?').get('last_run_%') as { key: string }; expect(result.key).toBe('last_run_owner_repo'); @@ -1085,7 +1085,7 @@ describe('DatabaseManager', () => { type: 'qdrant', }; - await DatabaseManager.updateLastRunDate(qdrantDb, 'owner/repo', testLogger); + await DatabaseManager.updateLastRunDate(qdrantDb, 'owner/repo', testLogger, TEST_EMBEDDING_DIMENSION); expect(mockClient.upsert).toHaveBeenCalledOnce(); const call = mockClient.upsert.mock.calls[0]; @@ -1108,7 +1108,7 @@ describe('DatabaseManager', () => { }; // Should not throw - await DatabaseManager.updateLastRunDate(qdrantDb, 'owner/repo', testLogger); + await DatabaseManager.updateLastRunDate(qdrantDb, 'owner/repo', testLogger, TEST_EMBEDDING_DIMENSION); }); }); diff --git a/tests/doc2vec.test.ts b/tests/doc2vec.test.ts index 8d928ff..c4d7e50 100644 --- a/tests/doc2vec.test.ts +++ b/tests/doc2vec.test.ts @@ -134,7 +134,6 @@ vi.mock('../utils', () => ({ isValidUuid: vi.fn().mockReturnValue(false), hashToUuid: vi.fn().mockReturnValue('00000000-0000-0000-0000-000000000000'), getUrlPrefix: vi.fn().mockReturnValue('https://example.com'), - getEmbeddingDimension: vi.fn().mockReturnValue(3072), }, })); diff --git a/tests/utils.test.ts b/tests/utils.test.ts index f896d53..83cfed3 100644 --- a/tests/utils.test.ts +++ b/tests/utils.test.ts @@ -341,34 +341,6 @@ describe('Utils', () => { }); }); - // ─── getEmbeddingDimension ────────────────────────────────────── - describe('getEmbeddingDimension', () => { - it('should return 1536 for text-embedding-3-small', () => { - expect(Utils.getEmbeddingDimension('text-embedding-3-small')).toBe(1536); - }); - - it('should return 3072 for text-embedding-3-large', () => { - expect(Utils.getEmbeddingDimension('text-embedding-3-large')).toBe(3072); - }); - - it('should return 
1536 for text-embedding-ada-002', () => { - expect(Utils.getEmbeddingDimension('text-embedding-ada-002')).toBe(1536); - }); - - it('should return 3072 for gemini models', () => { - expect(Utils.getEmbeddingDimension('gemini-embedding-001')).toBe(3072); - }); - - it('should be case-insensitive', () => { - expect(Utils.getEmbeddingDimension('TEXT-EMBEDDING-3-SMALL')).toBe(1536); - expect(Utils.getEmbeddingDimension('Text-Embedding-3-Large')).toBe(3072); - }); - - it('should return 1536 for unknown models', () => { - expect(Utils.getEmbeddingDimension('unknown-model')).toBe(1536); - }); - }); - // ─── shouldProcessUrl - invalid URL ───────────────────────────── describe('shouldProcessUrl - invalid URL', () => { it('should throw on invalid URL', () => { diff --git a/types.ts b/types.ts index 97f9723..701715a 100644 --- a/types.ts +++ b/types.ts @@ -81,6 +81,7 @@ export interface QdrantDatabaseParams { export interface EmbeddingConfig { provider: 'openai' | 'azure'; + dimension?: number; openai?: { api_key?: string; // Can also use OPENAI_API_KEY env var model?: string; // Default: text-embedding-3-large diff --git a/utils.ts b/utils.ts index 57660b9..8296535 100644 --- a/utils.ts +++ b/utils.ts @@ -1,5 +1,6 @@ import crypto from 'crypto'; import * as path from 'path'; +import { Logger } from './logger'; export class Utils { static generateHash(content: string): string { @@ -33,11 +34,13 @@ export class Utils { } } - static buildUrl(href: string, currentUrl: string): string { + static buildUrl(href: string, currentUrl: string, logger?: Logger): string { try { return new URL(href, currentUrl).toString(); } catch (error) { - console.warn(`Invalid URL found: ${href}`); + if (logger) { + logger.warn(`Invalid URL found: ${href}`); + } return ''; } } @@ -86,35 +89,4 @@ export class Utils { return text.split(/(\s+)/).filter(token => token.length > 0); } - /** - * Get the embedding dimension for a given model name - * @param modelName The embedding model name (e.g., 
'text-embedding-3-small', 'text-embedding-3-large') - * @returns The dimension size for the model - */ - static getEmbeddingDimension(modelName: string): number { - const modelLower = modelName.toLowerCase(); - - // OpenAI text-embedding-3-small produces 1536 dimensions - if (modelLower.includes('text-embedding-3-small')) { - return 1536; - } - - // OpenAI text-embedding-3-large and text-embedding-ada-002 produce 3072 and 1536 respectively - if (modelLower.includes('text-embedding-3-large')) { - return 3072; - } - - if (modelLower.includes('text-embedding-ada-002')) { - return 1536; - } - - // Gemini embedding models default to 3072 dimensions - if (modelLower.includes('gemini')) { - return 3072; - } - - // Default to 1536 for unknown models (most common) - console.warn(`Unknown embedding model: ${modelName}, defaulting to 1536 dimensions`); - return 1536; - } } \ No newline at end of file From 27200c82773a9c69c36e99465d7dd6782fb15350 Mon Sep 17 00:00:00 2001 From: dvaz-external Date: Mon, 16 Feb 2026 16:18:42 +0000 Subject: [PATCH 6/6] fix new dimension parameter issue Signed-off-by: dvaz-external --- doc2vec.ts | 4 ++-- package-lock.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc2vec.ts b/doc2vec.ts index 5fb3bd0..d0a21de 100644 --- a/doc2vec.ts +++ b/doc2vec.ts @@ -468,7 +468,7 @@ export class Doc2Vec { return DatabaseManager.getMetadataValue(dbConnection, `etag:${url}`, undefined, logger); }, set: async (url: string, etag: string): Promise => { - await DatabaseManager.setMetadataValue(dbConnection, `etag:${url}`, etag, logger); + await DatabaseManager.setMetadataValue(dbConnection, `etag:${url}`, etag, logger, this.embeddingDimension); }, }; @@ -477,7 +477,7 @@ export class Doc2Vec { return DatabaseManager.getMetadataValue(dbConnection, `lastmod:${url}`, undefined, logger); }, set: async (url: string, lastmod: string): Promise => { - await DatabaseManager.setMetadataValue(dbConnection, `lastmod:${url}`, lastmod, logger); + await 
DatabaseManager.setMetadataValue(dbConnection, `lastmod:${url}`, lastmod, logger, this.embeddingDimension); }, }; diff --git a/package-lock.json b/package-lock.json index 745263e..21d1444 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "doc2vec", - "version": "2.4.0", + "version": "2.5.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "doc2vec", - "version": "2.4.0", + "version": "2.5.0", "license": "ISC", "dependencies": { "@chonkiejs/core": "^0.0.7",