Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ Configuration is managed through two files:
OPENAI_API_KEY="sk-..."
OPENAI_MODEL="text-embedding-3-large" # Optional, defaults to text-embedding-3-large

# Optional: Embedding vector dimension, a positive integer (defaults to 3072; `embedding.dimension` in config.yaml takes precedence)
EMBEDDING_DIMENSION="3072"

# Required: Your Azure OpenAI credentials (if using Azure provider)
AZURE_OPENAI_KEY="your-azure-key"
AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com"
Expand Down Expand Up @@ -223,13 +226,18 @@ Configuration is managed through two files:
* `qdrant_port`: (Optional) Port for the Qdrant REST API. Defaults to `443` if `qdrant_url` starts with `https`, otherwise `6333`.
* `collection_name`: (Optional) Name of the Qdrant collection to use. Defaults to `<product_name>_<version>` (lowercased, spaces replaced with underscores).

Optional embedding configuration:
* `embedding.provider`: Provider for embeddings (`openai` or `azure`).
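* `embedding.dimension`: Embedding vector size (a positive integer). Takes precedence over the `EMBEDDING_DIMENSION` environment variable. Defaults to `3072` when neither is set or when the provided value is invalid.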

**Example (`config.yaml`):**
```yaml
# Optional: Configure embedding provider
# Can also be set via EMBEDDING_PROVIDER environment variable
# Defaults to OpenAI if not specified
embedding:
provider: 'openai' # or 'azure'
dimension: 3072 # Optional, defaults to 3072
openai:
api_key: '${OPENAI_API_KEY}' # Optional, uses env var by default
model: 'text-embedding-3-large' # Optional, defaults to text-embedding-3-large
Expand Down
6 changes: 6 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# Doc2Vec Configuration
embedding:
provider: 'openai'
dimension: 3072
openai:
model: 'text-embedding-3-large'

sources:
# GitHub Sources
- type: github
Expand Down
2 changes: 1 addition & 1 deletion content-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ export class ContentProcessor {
let newLinksFound = 0;

for (const href of result.links) {
const fullUrl = Utils.buildUrl(href, pageUrlForLinks);
const fullUrl = Utils.buildUrl(href, pageUrlForLinks, logger);
if (fullUrl.startsWith(sourceConfig.url)) {
addReferrer(fullUrl, pageUrlForLinks);
if (!visitedUrls.has(Utils.normalizeUrl(fullUrl))) {
Expand Down
30 changes: 17 additions & 13 deletions database.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import {
export class DatabaseManager {
private static columnCache: WeakMap<Database, { hasBranch: boolean; hasRepo: boolean }> = new WeakMap();

static async initDatabase(config: SourceConfig, parentLogger: Logger): Promise<DatabaseConnection> {
static async initDatabase(config: SourceConfig, parentLogger: Logger, embeddingDimension: number): Promise<DatabaseConnection> {
const logger = parentLogger.child('database');
const dbConfig = config.database_config;

Expand All @@ -32,10 +32,10 @@ export class DatabaseManager {
const db = new BetterSqlite3(dbPath, { allowExtension: true } as any);
sqliteVec.load(db);

logger.debug(`Creating vec_items table if it doesn't exist`);
logger.debug(`Creating vec_items table if it doesn't exist (dimension: ${embeddingDimension})`);
db.exec(`
CREATE VIRTUAL TABLE IF NOT EXISTS vec_items USING vec0(
embedding FLOAT[3072],
embedding FLOAT[${embeddingDimension}],
product_name TEXT,
version TEXT,
branch TEXT,
Expand All @@ -61,7 +61,7 @@ export class DatabaseManager {
logger.info(`Connecting to Qdrant at ${qdrantUrl}:${qdrantPort}, collection: ${collectionName}`);
const qdrantClient = new QdrantClient({ url: qdrantUrl, apiKey: process.env.QDRANT_API_KEY, port: qdrantPort });

await this.createCollectionQdrant(qdrantClient, collectionName, logger);
await this.createCollectionQdrant(qdrantClient, collectionName, logger, embeddingDimension);
logger.info(`Qdrant connection established successfully`);
return { client: qdrantClient, collectionName, type: 'qdrant' };
} else {
Expand All @@ -71,7 +71,7 @@ export class DatabaseManager {
}
}

static async createCollectionQdrant(qdrantClient: QdrantClient, collectionName: string, logger: Logger) {
static async createCollectionQdrant(qdrantClient: QdrantClient, collectionName: string, logger: Logger, embeddingDimension: number) {
try {
logger.debug(`Checking if collection ${collectionName} exists`);
const collections = await qdrantClient.getCollections();
Expand All @@ -84,10 +84,10 @@ export class DatabaseManager {
return;
}

logger.info(`Creating new collection ${collectionName}`);
logger.info(`Creating new collection ${collectionName} with dimension ${embeddingDimension}`);
await qdrantClient.createCollection(collectionName, {
vectors: {
size: 3072,
size: embeddingDimension,
distance: "Cosine",
},
});
Expand Down Expand Up @@ -177,7 +177,8 @@ export class DatabaseManager {
dbConnection: DatabaseConnection,
key: string,
value: string,
logger: Logger
logger: Logger,
embeddingDimension: number
): Promise<void> {
try {
if (dbConnection.type === 'sqlite') {
Expand All @@ -189,8 +190,7 @@ export class DatabaseManager {
logger.debug(`Updated metadata value for ${key}`);
} else if (dbConnection.type === 'qdrant') {
const metadataUUID = Utils.generateMetadataUUID(key);
const dummyEmbeddingSize = 3072;
const dummyEmbedding = new Array(dummyEmbeddingSize).fill(0);
const dummyEmbedding = new Array(embeddingDimension).fill(0);
const metadataPoint = {
id: metadataUUID,
vector: dummyEmbedding,
Expand Down Expand Up @@ -259,7 +259,12 @@ export class DatabaseManager {
return defaultDate;
}

static async updateLastRunDate(dbConnection: DatabaseConnection, repo: string, logger: Logger): Promise<void> {
static async updateLastRunDate(
dbConnection: DatabaseConnection,
repo: string,
logger: Logger,
embeddingDimension: number
): Promise<void> {
const now = new Date().toISOString();

try {
Expand All @@ -279,8 +284,7 @@ export class DatabaseManager {
logger.debug(`Using UUID: ${metadataUUID} for metadata`);

// Generate a dummy embedding (all zeros)
const dummyEmbeddingSize = 3072; // Same size as your content embeddings
const dummyEmbedding = new Array(dummyEmbeddingSize).fill(0);
const dummyEmbedding = new Array(embeddingDimension).fill(0);

// Create a point with special metadata payload
const metadataPoint = {
Expand Down
54 changes: 38 additions & 16 deletions doc2vec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ import {
ZendeskSourceConfig,
DatabaseConnection,
DocumentChunk,
BrokenLink
BrokenLink,
EmbeddingConfig
} from './types';

const GITHUB_TOKEN = process.env.GITHUB_PERSONAL_ACCESS_TOKEN;
Expand All @@ -37,6 +38,7 @@ export class Doc2Vec {
private config: Config;
private openai: OpenAI | AzureOpenAI;
private embeddingModel: string;
private embeddingDimension: number;
private contentProcessor: ContentProcessor;
private logger: Logger;
private configDir: string;
Expand All @@ -58,6 +60,7 @@ export class Doc2Vec {
// Check environment variable if not specified in config
const embeddingProvider = this.config.embedding?.provider || (process.env.EMBEDDING_PROVIDER as 'openai' | 'azure') || 'openai';
const embeddingConfig = this.config.embedding || { provider: embeddingProvider };
this.embeddingDimension = this.resolveEmbeddingDimension(embeddingConfig);

if (embeddingProvider === 'azure') {
const azureApiKey = embeddingConfig.azure?.api_key || process.env.AZURE_OPENAI_KEY;
Expand All @@ -77,7 +80,7 @@ export class Doc2Vec {
apiVersion: azureApiVersion,
});
this.embeddingModel = azureDeploymentName;
this.logger.info(`Using Azure OpenAI with deployment: ${azureDeploymentName}`);
this.logger.info(`Using Azure OpenAI with deployment: ${azureDeploymentName} (${this.embeddingDimension} dimensions)`);
} else {
const openaiApiKey = embeddingConfig.openai?.api_key || process.env.OPENAI_API_KEY;
const openaiModel = embeddingConfig.openai?.model || process.env.OPENAI_MODEL || 'text-embedding-3-large';
Expand All @@ -89,7 +92,7 @@ export class Doc2Vec {

this.openai = new OpenAI({ apiKey: openaiApiKey });
this.embeddingModel = openaiModel;
this.logger.info(`Using OpenAI with model: ${openaiModel}`);
this.logger.info(`Using OpenAI with model: ${openaiModel} (${this.embeddingDimension} dimensions)`);
}

this.contentProcessor = new ContentProcessor(this.logger);
Expand Down Expand Up @@ -138,6 +141,25 @@ export class Doc2Vec {
}
}

/**
 * Resolves the embedding vector size to use for this run.
 *
 * Lookup order:
 *   1. `embeddingConfig.dimension` (from the parsed config), if present;
 *   2. the `EMBEDDING_DIMENSION` environment variable, if non-empty;
 *   3. the built-in default of 3072.
 *
 * The selected value must be a positive integer. Anything else (non-numeric
 * text, zero, negatives, fractions, non-number types) logs a warning and
 * yields the default. An invalid config value does NOT fall through to the
 * environment variable.
 *
 * @param embeddingConfig Embedding section of the config, or undefined.
 * @returns A positive integer dimension.
 */
private resolveEmbeddingDimension(embeddingConfig: EmbeddingConfig | undefined): number {
    const defaultDimension = 3072;

    // Keep the value exactly as supplied so the warning below can show what the
    // user actually wrote (converting the env string up front would log "NaN").
    // `||` on the env var makes an empty string count as "unset".
    const rawValue: unknown =
        embeddingConfig?.dimension ?? (process.env.EMBEDDING_DIMENSION || undefined);
    if (rawValue === undefined) {
        return defaultDimension;
    }

    // Strings come from the environment (or a quoted config value); convert once.
    const parsedValue = typeof rawValue === 'string' ? Number(rawValue) : rawValue;

    // Number.isInteger already rejects NaN and +/-Infinity, so no separate
    // finiteness check is needed.
    if (typeof parsedValue !== 'number' || !Number.isInteger(parsedValue) || parsedValue <= 0) {
        this.logger.warn(`Invalid embedding dimension provided (${rawValue}), falling back to ${defaultDimension}`);
        return defaultDimension;
    }

    return parsedValue;
}

public async run(): Promise<void> {
this.logger.section('PROCESSING SOURCES');

Expand Down Expand Up @@ -388,7 +410,7 @@ export class Doc2Vec {
}

// Update the last run date in the database after processing all issues
await DatabaseManager.updateLastRunDate(dbConnection, repo, logger);
await DatabaseManager.updateLastRunDate(dbConnection, repo, logger, this.embeddingDimension);

logger.info(`Successfully processed ${issues.length} issues`);
}
Expand All @@ -397,7 +419,7 @@ export class Doc2Vec {
const logger = parentLogger.child('process');
logger.info(`Starting processing for GitHub repo: ${config.repo}`);

const dbConnection = await DatabaseManager.initDatabase(config, logger);
const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension);

// Initialize metadata storage
await DatabaseManager.initDatabaseMetadata(dbConnection, logger);
Expand All @@ -414,8 +436,8 @@ export class Doc2Vec {
const logger = parentLogger.child('process');
logger.info(`Starting processing for website: ${config.url}`);

const dbConnection = await DatabaseManager.initDatabase(config, logger);
await DatabaseManager.initDatabaseMetadata(dbConnection, logger);
const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension);
await DatabaseManager.initDatabaseMetadata(dbConnection, logger);
const validChunkIds: Set<string> = new Set();
const visitedUrls: Set<string> = new Set();
const urlPrefix = Utils.getUrlPrefix(config.url);
Expand Down Expand Up @@ -446,7 +468,7 @@ export class Doc2Vec {
return DatabaseManager.getMetadataValue(dbConnection, `etag:${url}`, undefined, logger);
},
set: async (url: string, etag: string): Promise<void> => {
await DatabaseManager.setMetadataValue(dbConnection, `etag:${url}`, etag, logger);
await DatabaseManager.setMetadataValue(dbConnection, `etag:${url}`, etag, logger, this.embeddingDimension);
},
};

Expand All @@ -455,7 +477,7 @@ export class Doc2Vec {
return DatabaseManager.getMetadataValue(dbConnection, `lastmod:${url}`, undefined, logger);
},
set: async (url: string, lastmod: string): Promise<void> => {
await DatabaseManager.setMetadataValue(dbConnection, `lastmod:${url}`, lastmod, logger);
await DatabaseManager.setMetadataValue(dbConnection, `lastmod:${url}`, lastmod, logger, this.embeddingDimension);
},
};

Expand Down Expand Up @@ -539,7 +561,7 @@ export class Doc2Vec {
const logger = parentLogger.child('process');
logger.info(`Starting processing for local directory: ${config.path}`);

const dbConnection = await DatabaseManager.initDatabase(config, logger);
const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension);
const validChunkIds: Set<string> = new Set();
const processedFiles: Set<string> = new Set();

Expand Down Expand Up @@ -611,7 +633,7 @@ export class Doc2Vec {
const logger = parentLogger.child('process');
logger.info(`Starting processing for code source (${config.source})`);

const dbConnection = await DatabaseManager.initDatabase(config, logger);
const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension);
const validChunkIds: Set<string> = new Set();
const processedFiles: Set<string> = new Set();

Expand Down Expand Up @@ -765,10 +787,10 @@ export class Doc2Vec {
}
}

await DatabaseManager.setMetadataValue(dbConnection, fileListKey, JSON.stringify(currentList), logger);
await DatabaseManager.setMetadataValue(dbConnection, fileListKey, JSON.stringify(currentList), logger, this.embeddingDimension);
if (lastMtimeKey) {
const nextMtime = maxObservedMtime > 0 ? maxObservedMtime : Date.now();
await DatabaseManager.setMetadataValue(dbConnection, lastMtimeKey, `${nextMtime}`, logger);
await DatabaseManager.setMetadataValue(dbConnection, lastMtimeKey, `${nextMtime}`, logger, this.embeddingDimension);
}
}
} else {
Expand All @@ -785,7 +807,7 @@ export class Doc2Vec {
const headSha = await this.getRepoHeadSha(basePath, logger);
if (headSha) {
const shaKey = this.buildCodeShaMetadataKey(config.repo as string, repoBranch);
await DatabaseManager.setMetadataValue(dbConnection, shaKey, headSha, logger);
await DatabaseManager.setMetadataValue(dbConnection, shaKey, headSha, logger, this.embeddingDimension);
}
}

Expand Down Expand Up @@ -974,7 +996,7 @@ export class Doc2Vec {
const logger = parentLogger.child('process');
logger.info(`Starting processing for Zendesk: ${config.zendesk_subdomain}.zendesk.com`);

const dbConnection = await DatabaseManager.initDatabase(config, logger);
const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension);

// Initialize metadata storage
await DatabaseManager.initDatabaseMetadata(dbConnection, logger);
Expand Down Expand Up @@ -1180,7 +1202,7 @@ export class Doc2Vec {
}

// Update the last run date in the database
await DatabaseManager.updateLastRunDate(dbConnection, `zendesk_tickets_${config.zendesk_subdomain}`, logger);
await DatabaseManager.updateLastRunDate(dbConnection, `zendesk_tickets_${config.zendesk_subdomain}`, logger, this.embeddingDimension);

logger.info(`Successfully processed ${totalTickets} tickets`);
}
Expand Down
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "doc2vec",
"version": "2.4.0",
"version": "2.5.0",
"type": "commonjs",
"description": "",
"main": "dist/doc2vec.js",
Expand Down
Loading