From 98322ae3526a8126bb018ff2d19e871ac6d8d056 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 29 Apr 2026 10:34:39 -0700 Subject: [PATCH 1/9] feat: add Article 2 select-algorithm samples for all 5 languages Implement vector index algorithm comparison samples (IVF, HNSW, DiskANN) for Python, TypeScript, Go, Java, and C#/.NET. Each sample demonstrates: - IVF index creation (numLists=10) for <10K documents - HNSW index creation (m=16, efConstruction=64) for 10K-50K documents - DiskANN index creation (maxDegree=20, lBuild=10) for 50K+ documents - Vector search using $search aggregation with cosmosSearch - Passwordless auth via DefaultAzureCredential/OIDC Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/README.md | 89 +++ ai/select-algorithm-dotnet/src/DiskannDemo.cs | 88 +++ ai/select-algorithm-dotnet/src/HnswDemo.cs | 88 +++ ai/select-algorithm-dotnet/src/IvfDemo.cs | 87 +++ ai/select-algorithm-dotnet/src/Program.cs | 45 ++ .../src/SelectAlgorithm.csproj | 15 + ai/select-algorithm-dotnet/src/Utils.cs | 162 ++++ ai/select-algorithm-go/README.md | 124 +++ ai/select-algorithm-go/go.mod | 11 + ai/select-algorithm-go/src/diskann.go | 112 +++ ai/select-algorithm-go/src/hnsw.go | 112 +++ ai/select-algorithm-go/src/ivf.go | 110 +++ ai/select-algorithm-go/src/main.go | 68 ++ ai/select-algorithm-go/src/utils.go | 395 ++++++++++ ai/select-algorithm-java/README.md | 90 +++ ai/select-algorithm-java/pom.xml | 65 ++ .../selectalgorithm/DiskannDemo.java | 77 ++ .../documentdb/selectalgorithm/HnswDemo.java | 77 ++ .../documentdb/selectalgorithm/IvfDemo.java | 76 ++ .../documentdb/selectalgorithm/Main.java | 34 + .../documentdb/selectalgorithm/Utils.java | 188 +++++ ai/select-algorithm-python/README.md | 69 ++ ai/select-algorithm-python/requirements.txt | 11 + ai/select-algorithm-python/src/diskann.py | 90 +++ ai/select-algorithm-python/src/hnsw.py | 90 +++ ai/select-algorithm-python/src/ivf.py | 88 +++ 
ai/select-algorithm-python/src/utils.py | 172 ++++ ai/select-algorithm-typescript/README.md | 74 ++ .../package-lock.json | 735 ++++++++++++++++++ ai/select-algorithm-typescript/package.json | 21 + ai/select-algorithm-typescript/src/diskann.ts | 101 +++ ai/select-algorithm-typescript/src/hnsw.ts | 101 +++ ai/select-algorithm-typescript/src/ivf.ts | 101 +++ ai/select-algorithm-typescript/src/utils.ts | 135 ++++ ai/select-algorithm-typescript/tsconfig.json | 18 + 35 files changed, 3919 insertions(+) create mode 100644 ai/select-algorithm-dotnet/README.md create mode 100644 ai/select-algorithm-dotnet/src/DiskannDemo.cs create mode 100644 ai/select-algorithm-dotnet/src/HnswDemo.cs create mode 100644 ai/select-algorithm-dotnet/src/IvfDemo.cs create mode 100644 ai/select-algorithm-dotnet/src/Program.cs create mode 100644 ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj create mode 100644 ai/select-algorithm-dotnet/src/Utils.cs create mode 100644 ai/select-algorithm-go/README.md create mode 100644 ai/select-algorithm-go/go.mod create mode 100644 ai/select-algorithm-go/src/diskann.go create mode 100644 ai/select-algorithm-go/src/hnsw.go create mode 100644 ai/select-algorithm-go/src/ivf.go create mode 100644 ai/select-algorithm-go/src/main.go create mode 100644 ai/select-algorithm-go/src/utils.go create mode 100644 ai/select-algorithm-java/README.md create mode 100644 ai/select-algorithm-java/pom.xml create mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskannDemo.java create mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HnswDemo.java create mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IvfDemo.java create mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java create mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java create mode 100644 
ai/select-algorithm-python/README.md create mode 100644 ai/select-algorithm-python/requirements.txt create mode 100644 ai/select-algorithm-python/src/diskann.py create mode 100644 ai/select-algorithm-python/src/hnsw.py create mode 100644 ai/select-algorithm-python/src/ivf.py create mode 100644 ai/select-algorithm-python/src/utils.py create mode 100644 ai/select-algorithm-typescript/README.md create mode 100644 ai/select-algorithm-typescript/package-lock.json create mode 100644 ai/select-algorithm-typescript/package.json create mode 100644 ai/select-algorithm-typescript/src/diskann.ts create mode 100644 ai/select-algorithm-typescript/src/hnsw.ts create mode 100644 ai/select-algorithm-typescript/src/ivf.ts create mode 100644 ai/select-algorithm-typescript/src/utils.ts create mode 100644 ai/select-algorithm-typescript/tsconfig.json diff --git a/ai/select-algorithm-dotnet/README.md b/ai/select-algorithm-dotnet/README.md new file mode 100644 index 0000000..78b12e7 --- /dev/null +++ b/ai/select-algorithm-dotnet/README.md @@ -0,0 +1,89 @@ +# Select Algorithm - .NET (C#) + +Demonstrates three vector index algorithms available in Azure DocumentDB (vCore): + +| Algorithm | Best For | Cluster Tier | Key Parameters | +|-----------|----------|--------------|----------------| +| **IVF** | < 10,000 documents | M10+ | `numLists` | +| **HNSW** | 10,000–50,000 documents | M30+ | `m`, `efConstruction` | +| **DiskANN** | 50,000+ documents | M30+ | `maxDegree`, `lBuild` | + +## Prerequisites + +- [.NET 8 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) +- Azure DocumentDB (vCore) cluster +- Azure OpenAI resource with an embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. Copy the environment file and fill in your values: + + ```bash + cp .env.example .env + ``` + +2. 
Edit `.env` with your configuration: + + ```env + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://<your-resource-name>.openai.azure.com + MONGO_CLUSTER_NAME=<your-cluster-name> + AZURE_DOCUMENTDB_DATABASENAME=Hotels + ALGORITHM=all + SIMILARITY=COS + ``` + +3. Restore packages: + + ```bash + cd src + dotnet restore + ``` + +## Usage + +Run all algorithms: + +```bash +cd src +dotnet run +``` + +Run a specific algorithm: + +```bash +# Set in .env: ALGORITHM=ivf | hnsw | diskann | all +dotnet run +``` + +## Project Structure + +``` +select-algorithm-dotnet/ +├── .env.example # Environment variable template +├── README.md # This file +└── src/ + ├── SelectAlgorithm.csproj # Project file + ├── Program.cs # Entry point - dispatches by ALGORITHM env + ├── Utils.cs # Shared helpers (connection, embedding, search) + ├── IvfDemo.cs # IVF index creation and search + ├── HnswDemo.cs # HNSW index creation and search + └── DiskannDemo.cs # DiskANN index creation and search +``` + +## How It Works + +1. **Connect** to DocumentDB using Microsoft Entra ID (OIDC) passwordless authentication +2. **Load** hotel documents with pre-computed embeddings from `Hotels_Vector.json` +3. **Create** a vector index using the selected algorithm +4. **Search** using a natural language query converted to an embedding via Azure OpenAI +5. **Display** ranked results with similarity scores + +## Authentication + +This sample uses `DefaultAzureCredential` for both: +- **DocumentDB**: OIDC-based MongoDB authentication +- **Azure OpenAI**: Token-based authentication with `https://cognitiveservices.azure.com/.default` scope + +Ensure you are logged in with `az login` and have appropriate RBAC roles assigned. diff --git a/ai/select-algorithm-dotnet/src/DiskannDemo.cs b/ai/select-algorithm-dotnet/src/DiskannDemo.cs new file mode 100644 index 0000000..a52b1bb --- /dev/null +++ b/ai/select-algorithm-dotnet/src/DiskannDemo.cs @@ -0,0 +1,88 @@ +/// DiskANN vector index for Azure DocumentDB. 
+/// Best for: Datasets with 50,000+ documents. +/// Cluster tier: M30 or higher. +/// Key parameters: maxDegree (graph edges), lBuild (construction quality). + +namespace SelectAlgorithm; + +using MongoDB.Driver; +using MongoDB.Bson; + +public static class DiskannDemo +{ + public static void CreateDiskannIndex(IMongoCollection collection, string vectorField, int dimensions, string similarity, int maxDegree = 20, int lBuild = 10) + { + Console.WriteLine($"Creating DiskANN vector index on field '{vectorField}'..."); + + Utils.DropVectorIndexes(collection, vectorField); + + var command = new BsonDocument + { + { "createIndexes", collection.CollectionNamespace.CollectionName }, + { "indexes", new BsonArray + { + new BsonDocument + { + { "name", $"diskann_index_{vectorField}" }, + { "key", new BsonDocument(vectorField, "cosmosSearch") }, + { "cosmosSearchOptions", new BsonDocument + { + { "kind", "vector-diskann" }, + { "dimensions", dimensions }, + { "similarity", similarity }, + { "maxDegree", maxDegree }, + { "lBuild", lBuild } + } + } + } + } + } + }; + + collection.Database.RunCommand(command); + Console.WriteLine("DiskANN vector index created successfully"); + } + + public static void Run() + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine(" DiskANN Vector Index - Select Algorithm Demo"); + Console.WriteLine(" Best for: 50,000+ documents"); + Console.WriteLine(new string('=', 60)); + + var databaseName = Environment.GetEnvironmentVariable("AZURE_DOCUMENTDB_DATABASENAME") ?? "Hotels"; + var dataFile = Environment.GetEnvironmentVariable("DATA_FILE_WITH_VECTORS") ?? "../../data/Hotels_Vector.json"; + var vectorField = Environment.GetEnvironmentVariable("EMBEDDED_FIELD") ?? "DescriptionVector"; + var model = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_MODEL") ?? "text-embedding-3-small"; + var dimensions = int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? 
"1536"); + var batchSize = int.Parse(Environment.GetEnvironmentVariable("LOAD_SIZE_BATCH") ?? "100"); + var similarity = Environment.GetEnvironmentVariable("SIMILARITY") ?? "COS"; + + var mongoClient = Utils.GetMongoClientPasswordless(); + var embeddingClient = Utils.GetEmbeddingClient(); + + try + { + var database = mongoClient.GetDatabase(databaseName); + var collection = database.GetCollection("hotels_diskann"); + + var data = Utils.ReadJsonFile(dataFile); + var documents = data.Where(d => d.Contains(vectorField)).ToList(); + Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); + + Utils.InsertData(collection, documents, batchSize); + + CreateDiskannIndex(collection, vectorField, dimensions, similarity); + Console.WriteLine("Waiting for index to build..."); + Thread.Sleep(5000); + + var query = "quintessential lodging near running trails, eateries, retail"; + var results = Utils.PerformVectorSearch(collection, embeddingClient, query, vectorField, model); + Utils.PrintSearchResults(results, "DiskANN"); + } + finally + { + mongoClient.Cluster.Dispose(); + } + } +} diff --git a/ai/select-algorithm-dotnet/src/HnswDemo.cs b/ai/select-algorithm-dotnet/src/HnswDemo.cs new file mode 100644 index 0000000..acbeb81 --- /dev/null +++ b/ai/select-algorithm-dotnet/src/HnswDemo.cs @@ -0,0 +1,88 @@ +/// HNSW (Hierarchical Navigable Small World) vector index for Azure DocumentDB. +/// Best for: Datasets between 10,000 and 50,000 documents. +/// Cluster tier: M30 or higher. +/// Key parameters: m (graph connectivity), efConstruction (build quality). 
+ +namespace SelectAlgorithm; + +using MongoDB.Driver; +using MongoDB.Bson; + +public static class HnswDemo +{ + public static void CreateHnswIndex(IMongoCollection collection, string vectorField, int dimensions, string similarity, int m = 16, int efConstruction = 64) + { + Console.WriteLine($"Creating HNSW vector index on field '{vectorField}'..."); + + Utils.DropVectorIndexes(collection, vectorField); + + var command = new BsonDocument + { + { "createIndexes", collection.CollectionNamespace.CollectionName }, + { "indexes", new BsonArray + { + new BsonDocument + { + { "name", $"hnsw_index_{vectorField}" }, + { "key", new BsonDocument(vectorField, "cosmosSearch") }, + { "cosmosSearchOptions", new BsonDocument + { + { "kind", "vector-hnsw" }, + { "dimensions", dimensions }, + { "similarity", similarity }, + { "m", m }, + { "efConstruction", efConstruction } + } + } + } + } + } + }; + + collection.Database.RunCommand(command); + Console.WriteLine("HNSW vector index created successfully"); + } + + public static void Run() + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine(" HNSW Vector Index - Select Algorithm Demo"); + Console.WriteLine(" Best for: 10,000 - 50,000 documents"); + Console.WriteLine(new string('=', 60)); + + var databaseName = Environment.GetEnvironmentVariable("AZURE_DOCUMENTDB_DATABASENAME") ?? "Hotels"; + var dataFile = Environment.GetEnvironmentVariable("DATA_FILE_WITH_VECTORS") ?? "../../data/Hotels_Vector.json"; + var vectorField = Environment.GetEnvironmentVariable("EMBEDDED_FIELD") ?? "DescriptionVector"; + var model = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_MODEL") ?? "text-embedding-3-small"; + var dimensions = int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? "1536"); + var batchSize = int.Parse(Environment.GetEnvironmentVariable("LOAD_SIZE_BATCH") ?? "100"); + var similarity = Environment.GetEnvironmentVariable("SIMILARITY") ?? 
"COS"; + + var mongoClient = Utils.GetMongoClientPasswordless(); + var embeddingClient = Utils.GetEmbeddingClient(); + + try + { + var database = mongoClient.GetDatabase(databaseName); + var collection = database.GetCollection("hotels_hnsw"); + + var data = Utils.ReadJsonFile(dataFile); + var documents = data.Where(d => d.Contains(vectorField)).ToList(); + Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); + + Utils.InsertData(collection, documents, batchSize); + + CreateHnswIndex(collection, vectorField, dimensions, similarity); + Console.WriteLine("Waiting for index to build..."); + Thread.Sleep(5000); + + var query = "quintessential lodging near running trails, eateries, retail"; + var results = Utils.PerformVectorSearch(collection, embeddingClient, query, vectorField, model); + Utils.PrintSearchResults(results, "HNSW"); + } + finally + { + mongoClient.Cluster.Dispose(); + } + } +} diff --git a/ai/select-algorithm-dotnet/src/IvfDemo.cs b/ai/select-algorithm-dotnet/src/IvfDemo.cs new file mode 100644 index 0000000..01a1b74 --- /dev/null +++ b/ai/select-algorithm-dotnet/src/IvfDemo.cs @@ -0,0 +1,87 @@ +/// IVF (Inverted File) vector index for Azure DocumentDB. +/// Best for: Datasets with fewer than 10,000 documents. +/// Cluster tier: M10 or higher. +/// Key parameters: numLists (cluster count). 
+ +namespace SelectAlgorithm; + +using MongoDB.Driver; +using MongoDB.Bson; + +public static class IvfDemo +{ + public static void CreateIvfIndex(IMongoCollection collection, string vectorField, int dimensions, string similarity, int numLists = 10) + { + Console.WriteLine($"Creating IVF vector index on field '{vectorField}'..."); + + Utils.DropVectorIndexes(collection, vectorField); + + var command = new BsonDocument + { + { "createIndexes", collection.CollectionNamespace.CollectionName }, + { "indexes", new BsonArray + { + new BsonDocument + { + { "name", $"ivf_index_{vectorField}" }, + { "key", new BsonDocument(vectorField, "cosmosSearch") }, + { "cosmosSearchOptions", new BsonDocument + { + { "kind", "vector-ivf" }, + { "dimensions", dimensions }, + { "similarity", similarity }, + { "numLists", numLists } + } + } + } + } + } + }; + + collection.Database.RunCommand(command); + Console.WriteLine("IVF vector index created successfully"); + } + + public static void Run() + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine(" IVF Vector Index - Select Algorithm Demo"); + Console.WriteLine(" Best for: < 10,000 documents"); + Console.WriteLine(new string('=', 60)); + + var databaseName = Environment.GetEnvironmentVariable("AZURE_DOCUMENTDB_DATABASENAME") ?? "Hotels"; + var dataFile = Environment.GetEnvironmentVariable("DATA_FILE_WITH_VECTORS") ?? "../../data/Hotels_Vector.json"; + var vectorField = Environment.GetEnvironmentVariable("EMBEDDED_FIELD") ?? "DescriptionVector"; + var model = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_MODEL") ?? "text-embedding-3-small"; + var dimensions = int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? "1536"); + var batchSize = int.Parse(Environment.GetEnvironmentVariable("LOAD_SIZE_BATCH") ?? "100"); + var similarity = Environment.GetEnvironmentVariable("SIMILARITY") ?? 
"COS"; + + var mongoClient = Utils.GetMongoClientPasswordless(); + var embeddingClient = Utils.GetEmbeddingClient(); + + try + { + var database = mongoClient.GetDatabase(databaseName); + var collection = database.GetCollection("hotels_ivf"); + + var data = Utils.ReadJsonFile(dataFile); + var documents = data.Where(d => d.Contains(vectorField)).ToList(); + Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); + + Utils.InsertData(collection, documents, batchSize); + + CreateIvfIndex(collection, vectorField, dimensions, similarity); + Console.WriteLine("Waiting for index to build..."); + Thread.Sleep(3000); + + var query = "quintessential lodging near running trails, eateries, retail"; + var results = Utils.PerformVectorSearch(collection, embeddingClient, query, vectorField, model); + Utils.PrintSearchResults(results, "IVF"); + } + finally + { + mongoClient.Cluster.Dispose(); + } + } +} diff --git a/ai/select-algorithm-dotnet/src/Program.cs b/ai/select-algorithm-dotnet/src/Program.cs new file mode 100644 index 0000000..96fe4d3 --- /dev/null +++ b/ai/select-algorithm-dotnet/src/Program.cs @@ -0,0 +1,45 @@ +using DotNetEnv; + +namespace SelectAlgorithm; + +class Program +{ + static void Main(string[] args) + { + // Load .env file from parent directory + Env.Load("../.env"); + + var algorithm = (Environment.GetEnvironmentVariable("ALGORITHM") ?? 
"all").ToLowerInvariant(); + + Console.WriteLine(); + Console.WriteLine("Select Algorithm Demo - Azure DocumentDB Vector Search (.NET)"); + Console.WriteLine(new string('-', 60)); + Console.WriteLine($"Algorithm: {algorithm}"); + Console.WriteLine(); + + switch (algorithm) + { + case "ivf": + IvfDemo.Run(); + break; + case "hnsw": + HnswDemo.Run(); + break; + case "diskann": + DiskannDemo.Run(); + break; + case "all": + IvfDemo.Run(); + HnswDemo.Run(); + DiskannDemo.Run(); + break; + default: + Console.WriteLine($"Unknown algorithm: {algorithm}"); + Console.WriteLine("Valid options: ivf, hnsw, diskann, all"); + Environment.Exit(1); + break; + } + + Console.WriteLine("Done!"); + } +} diff --git a/ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj b/ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj new file mode 100644 index 0000000..033f6c4 --- /dev/null +++ b/ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj @@ -0,0 +1,15 @@ + + + Exe + net8.0 + enable + enable + SelectAlgorithm + + + + + + + + diff --git a/ai/select-algorithm-dotnet/src/Utils.cs b/ai/select-algorithm-dotnet/src/Utils.cs new file mode 100644 index 0000000..0d6381d --- /dev/null +++ b/ai/select-algorithm-dotnet/src/Utils.cs @@ -0,0 +1,162 @@ +using MongoDB.Driver; +using MongoDB.Bson; +using MongoDB.Bson.Serialization; +using Azure.Identity; +using Azure.AI.OpenAI; +using OpenAI.Embeddings; + +namespace SelectAlgorithm; + +public static class Utils +{ + public static IMongoClient GetMongoClientPasswordless() + { + var clusterName = Environment.GetEnvironmentVariable("MONGO_CLUSTER_NAME") + ?? 
throw new InvalidOperationException("MONGO_CLUSTER_NAME environment variable is required"); + + var credential = new DefaultAzureCredential(); + + var connectionString = $"mongodb+srv://{clusterName}.global.mongocluster.cosmos.azure.com/"; + var settings = MongoClientSettings.FromConnectionString(connectionString); + settings.ConnectTimeout = TimeSpan.FromSeconds(120); + settings.UseTls = true; + settings.RetryWrites = true; + settings.Credential = MongoCredential.CreateOidcCredential("azure", null) + .WithMechanismProperty("ENVIRONMENT", "azure"); + + return new MongoClient(settings); + } + + public static EmbeddingClient GetEmbeddingClient() + { + var endpoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_ENDPOINT") + ?? throw new InvalidOperationException("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required"); + var model = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_MODEL") + ?? "text-embedding-3-small"; + + var credential = new DefaultAzureCredential(); + var azureClient = new AzureOpenAIClient(new Uri(endpoint), credential); + return azureClient.GetEmbeddingClient(model); + } + + public static List ReadJsonFile(string path) + { + if (!File.Exists(path)) + throw new FileNotFoundException($"Data file not found: {path}"); + + var json = File.ReadAllText(path); + return BsonSerializer.Deserialize>(json); + } + + public static void InsertData(IMongoCollection collection, List data, int batchSize) + { + var totalDocuments = data.Count; + var existingCount = collection.CountDocuments(new BsonDocument()); + + if (existingCount >= totalDocuments) + { + Console.WriteLine($"Collection already has {existingCount} documents, skipping insert"); + return; + } + + if (existingCount > 0) + { + collection.DeleteMany(new BsonDocument()); + } + + var insertedCount = 0; + for (var i = 0; i < totalDocuments; i += batchSize) + { + var batch = data.Skip(i).Take(batchSize).ToList(); + try + { + collection.InsertMany(batch, new 
InsertManyOptions { IsOrdered = false }); + insertedCount += batch.Count; + } + catch (MongoBulkWriteException ex) + { + // Unordered insert: only documents without a write error were actually stored + insertedCount += batch.Count - ex.WriteErrors.Count; + } + Thread.Sleep(100); + } + + Console.WriteLine($"Inserted {insertedCount}/{totalDocuments} documents"); + } + + public static void DropVectorIndexes(IMongoCollection collection, string vectorField) + { + try + { + using var cursor = collection.Indexes.List(); + var indexes = cursor.ToList(); + foreach (var index in indexes) + { + if (index.Contains("key")) + { + var key = index["key"].AsBsonDocument; + if (key.Contains(vectorField) && key[vectorField].IsString && key[vectorField].AsString == "cosmosSearch") + { + var indexName = index["name"].AsString; + collection.Indexes.DropOne(indexName); + Console.WriteLine($"Dropped existing vector index: {indexName}"); + } + } + } + } + catch (Exception ex) + { + Console.WriteLine($"Warning: Error dropping indexes: {ex.Message}"); + } + } + + public static List PerformVectorSearch( + IMongoCollection collection, + EmbeddingClient client, + string query, + string vectorField, + string model, + int topK = 5) + { + var embeddingResult = client.GenerateEmbedding(query); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + })), + new BsonDocument("$project", new BsonDocument + { + { "document", "$$ROOT" }, + { "score", new BsonDocument("$meta", "searchScore") } + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + public static void PrintSearchResults(List results, string algorithm) + { + Console.WriteLine(); + Console.WriteLine(new string('=', 60)); + Console.WriteLine($" {algorithm} Search Results ({results.Count} found)"); + Console.WriteLine(new string('=', 60)); + + for (var i = 0; i < 
results.Count; i++) + { + var result = results[i]; + var doc = result.Contains("document") ? result["document"].AsBsonDocument : result; + var name = doc.Contains("HotelName") ? doc["HotelName"].AsString + : doc.Contains("name") ? doc["name"].AsString + : "Unknown"; + var score = result.Contains("score") ? result["score"].ToDouble() : 0.0; + Console.WriteLine($" {i + 1}. {name} (score: {score:F4})"); + } + + Console.WriteLine(); + } +} diff --git a/ai/select-algorithm-go/README.md b/ai/select-algorithm-go/README.md new file mode 100644 index 0000000..cec698a --- /dev/null +++ b/ai/select-algorithm-go/README.md @@ -0,0 +1,124 @@ +# Select Algorithm - Go + +This sample demonstrates how to use different vector search algorithms (IVF, HNSW, DiskANN) with Azure DocumentDB (vCore) in Go. It loads hotel data with pre-computed embeddings, creates vector indexes, and performs similarity searches using each algorithm. + +## Prerequisites + +- [Go 1.24+](https://golang.org/dl/) +- [Azure DocumentDB (vCore) cluster](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/) +- [Azure OpenAI resource](https://learn.microsoft.com/azure/ai-services/openai/) with an embedding model deployed +- [Azure CLI](https://learn.microsoft.com/cli/azure/) (for passwordless authentication) +- Pre-generated embeddings file (`Hotels_Vector.json`) — see the `vector-search-go` sample + +## Setup + +1. **Clone the repository** and navigate to this directory: + + ```bash + cd ai/select-algorithm-go + ``` + +2. **Configure environment variables** by copying the example file: + + ```bash + cp .env.example .env + ``` + + Edit `.env` with your Azure resource values. + +3. **Install dependencies**: + + ```bash + cd src + go mod tidy + ``` + +4. **Sign in to Azure** (for passwordless authentication): + + ```bash + az login + ``` + +## Usage + +Run from the `src` directory: + +```bash +cd src +``` + +### Run all algorithms + +```bash +ALGORITHM=all go run . 
+``` + +### Run a specific algorithm + +```bash +# IVF (Inverted File) — clustering-based, works on all tiers +ALGORITHM=ivf go run . + +# HNSW (Hierarchical Navigable Small World) — graph-based, higher recall +ALGORITHM=hnsw go run . + +# DiskANN — disk-optimized, best for large datasets +ALGORITHM=diskann go run . +``` + +### On Windows (PowerShell) + +```powershell +$env:ALGORITHM="ivf"; go run . +``` + +## Algorithm comparison + +| Algorithm | Kind | Key Parameters | Best For | +|-----------|-----------------|-----------------------------|-----------------------------| +| IVF | `vector-ivf` | `numLists=10` | Small datasets, all tiers | +| HNSW | `vector-hnsw` | `m=16`, `efConstruction=64` | High recall, medium datasets| +| DiskANN | `vector-diskann`| `maxDegree=20`, `lBuild=10` | Large datasets, disk-based | + +## Project structure + +``` +select-algorithm-go/ +├── .env.example # Environment variable template +├── go.mod # Go module dependencies +├── README.md # This file +└── src/ + ├── main.go # Entry point — dispatches by ALGORITHM env var + ├── utils.go # Shared config, auth, data, and search helpers + ├── ivf.go # IVF index creation and search workflow + ├── hnsw.go # HNSW index creation and search workflow + └── diskann.go # DiskANN index creation and search workflow +``` + +## Authentication + +This sample uses **passwordless (OIDC) authentication** with `DefaultAzureCredential`. Ensure your Azure identity has: + +- **DocumentDB**: Appropriate RBAC role on the cluster +- **Azure OpenAI**: `Cognitive Services OpenAI User` role on the OpenAI resource + +The MongoDB OIDC auth uses the `https://ossrdbms-aad.database.windows.net/.default` scope, and the OpenAI client uses `https://cognitiveservices.azure.com/.default`. + +## Important notes + +- **One vector index per field**: DocumentDB supports only one vector index per field. The scripts automatically drop existing vector indexes before creating new ones. 
+- **Cluster tier requirements**: Some algorithms may not be available on all cluster tiers. The sample provides helpful error messages if a tier limitation is encountered. +- **Collection separation**: Each algorithm uses its own collection (`hotels_ivf`, `hotels_hnsw`, `hotels_diskann`) so they can coexist. +- **bson.D ordering**: All MongoDB commands use `bson.D` (ordered) instead of `bson.M` (unordered) to avoid "multi-key map" errors. + +## Troubleshooting + +- **Authentication errors**: Run `az login` and verify your identity has RBAC access to both DocumentDB and Azure OpenAI. +- **"not enabled for this cluster tier"**: Upgrade your DocumentDB cluster tier or try a different algorithm. +- **No embedding data**: Ensure your `Hotels_Vector.json` file contains documents with the embedding field specified in `EMBEDDED_FIELD`. + +## Further resources + +- [DocumentDB vector search documentation](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) +- [Azure OpenAI embeddings](https://learn.microsoft.com/azure/ai-services/openai/how-to/embeddings) +- [Go MongoDB driver](https://pkg.go.dev/go.mongodb.org/mongo-driver) diff --git a/ai/select-algorithm-go/go.mod b/ai/select-algorithm-go/go.mod new file mode 100644 index 0000000..c25f589 --- /dev/null +++ b/ai/select-algorithm-go/go.mod @@ -0,0 +1,11 @@ +module documentdb-select-algorithm + +go 1.24.0 + +require ( + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 + github.com/joho/godotenv v1.5.1 + github.com/openai/openai-go/v3 v3.12.0 + go.mongodb.org/mongo-driver v1.17.6 +) diff --git a/ai/select-algorithm-go/src/diskann.go b/ai/select-algorithm-go/src/diskann.go new file mode 100644 index 0000000..ca157fa --- /dev/null +++ b/ai/select-algorithm-go/src/diskann.go @@ -0,0 +1,112 @@ +package main + +import ( + "context" + "fmt" + "log" + "strings" + "time" + + "github.com/openai/openai-go/v3" + "go.mongodb.org/mongo-driver/bson" + 
"go.mongodb.org/mongo-driver/mongo" +) + +// CreateDiskANNVectorIndex creates a DiskANN vector index on the specified field +func CreateDiskANNVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, dimensions int, similarity string) error { + fmt.Printf("Creating DiskANN vector index on field '%s'...\n", vectorField) + + err := DropVectorIndexes(ctx, collection, vectorField) + if err != nil { + fmt.Printf("Warning: Could not drop existing indexes: %v\n", err) + } + + // Must use bson.D for commands to preserve order and avoid "multi-key map" errors + indexCommand := bson.D{ + {"createIndexes", collection.Name()}, + {"indexes", []bson.D{ + { + {"name", fmt.Sprintf("diskann_index_%s", vectorField)}, + {"key", bson.D{ + {vectorField, "cosmosSearch"}, + }}, + {"cosmosSearchOptions", bson.D{ + {"kind", "vector-diskann"}, + {"dimensions", dimensions}, + {"similarity", similarity}, + // Maximum degree: number of edges per node in the graph + {"maxDegree", 20}, + // Candidates evaluated during index construction + {"lBuild", 10}, + }}, + }, + }}, + } + + var result bson.M + err = collection.Database().RunCommand(ctx, indexCommand).Decode(&result) + if err != nil { + if strings.Contains(err.Error(), "not enabled for this cluster tier") { + fmt.Println("\nDiskANN indexes require a higher cluster tier.") + fmt.Println("Try upgrading your DocumentDB cluster or use a different algorithm.") + } + return fmt.Errorf("error creating DiskANN vector index: %w", err) + } + + fmt.Println("DiskANN vector index created successfully") + return nil +} + +// RunDiskANN executes the full DiskANN vector search workflow +func RunDiskANN(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { + fmt.Println("\n" + strings.Repeat("=", 60)) + fmt.Println("DiskANN Vector Search") + fmt.Println(strings.Repeat("=", 60)) + + collection := dbClient.Database(config.DatabaseName).Collection("hotels_diskann") + + // Load data + 
fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + return fmt.Errorf("failed to load data: %w", err) + } + + documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) + if len(documentsWithEmbeddings) == 0 { + return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField) + } + fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) + + // Insert data + stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + return err + } + if stats.Inserted == 0 { + return fmt.Errorf("no documents were inserted successfully") + } + fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) + + // Create DiskANN vector index + fmt.Println("\nCreating DiskANN vector index...") + err = CreateDiskANNVectorIndex(ctx, collection, config.VectorField, config.Dimensions, config.Similarity) + if err != nil { + return fmt.Errorf("failed to create DiskANN vector index: %w", err) + } + + fmt.Println("Waiting for index to be ready...") + time.Sleep(2 * time.Second) + + // Perform vector search + query := "quintessential lodging near running trails, eateries, retail" + results, err := PerformVectorSearch(ctx, collection, aiClient, query, config.VectorField, config.ModelName, 5) + if err != nil { + return fmt.Errorf("failed to perform DiskANN vector search: %w", err) + } + + PrintSearchResults(results, "diskann") + + log.Println("DiskANN demonstration completed successfully!") + return nil +} diff --git a/ai/select-algorithm-go/src/hnsw.go b/ai/select-algorithm-go/src/hnsw.go new file mode 100644 index 0000000..def5aff --- /dev/null +++ b/ai/select-algorithm-go/src/hnsw.go @@ -0,0 +1,112 @@ +package main + +import ( + "context" + "fmt" + "log" + "strings" + "time" + + "github.com/openai/openai-go/v3" + "go.mongodb.org/mongo-driver/bson" + 
"go.mongodb.org/mongo-driver/mongo"
+)
+
+// CreateHNSWVectorIndex (re)creates the HNSW (Hierarchical Navigable Small
+// World) vector index on vectorField.
+//
+// Any existing cosmosSearch index on the field is dropped first; a failure to
+// drop is only logged. The index is then created with a raw createIndexes
+// command using kind "vector-hnsw", m=16 and efConstruction=64.
+//
+// The returned error wraps the underlying driver error, so callers can inspect
+// it with errors.Is / errors.As.
+func CreateHNSWVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, dimensions int, similarity string) error {
+	fmt.Printf("Creating HNSW vector index on field '%s'...\n", vectorField)
+
+	// Best effort: a stale index that cannot be dropped should not abort the demo.
+	err := DropVectorIndexes(ctx, collection, vectorField)
+	if err != nil {
+		fmt.Printf("Warning: Could not drop existing indexes: %v\n", err)
+	}
+
+	// Must use bson.D for commands to preserve order and avoid "multi-key map" errors
+	indexCommand := bson.D{
+		{"createIndexes", collection.Name()},
+		{"indexes", []bson.D{
+			{
+				{"name", fmt.Sprintf("hnsw_index_%s", vectorField)},
+				{"key", bson.D{
+					{vectorField, "cosmosSearch"},
+				}},
+				{"cosmosSearchOptions", bson.D{
+					{"kind", "vector-hnsw"},
+					{"dimensions", dimensions},
+					{"similarity", similarity},
+					// Maximum connections per node in the graph
+					{"m", 16},
+					// Candidate list size during construction
+					{"efConstruction", 64},
+				}},
+			},
+		}},
+	}
+
+	var result bson.M
+	err = collection.Database().RunCommand(ctx, indexCommand).Decode(&result)
+	if err != nil {
+		// The server reports an unsupported tier only through the error text.
+		if strings.Contains(err.Error(), "not enabled for this cluster tier") {
+			fmt.Println("\nHNSW indexes require a higher cluster tier.")
+			fmt.Println("Try upgrading your DocumentDB cluster or use a different algorithm.")
+		}
+		return fmt.Errorf("error creating HNSW vector index: %w", err)
+	}
+
+	fmt.Println("HNSW vector index created successfully")
+	return nil
+}
+
+// RunHNSW executes the full HNSW vector search workflow.
+//
+// It loads the documents from config.DataFile, keeps those that contain
+// config.VectorField, reloads the "hotels_hnsw" collection with them, builds
+// the HNSW index, waits two seconds and then runs a top-5 vector search for a
+// fixed query, printing the results. Errors from each stage are wrapped so the
+// original cause stays reachable via errors.Is / errors.As.
+func RunHNSW(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error {
+	fmt.Println("\n" + strings.Repeat("=", 60))
+	fmt.Println("HNSW (Hierarchical Navigable Small World) Vector Search")
+	fmt.Println(strings.Repeat("=", 60))
+
+	collection := dbClient.Database(config.DatabaseName).Collection("hotels_hnsw")
+
+	// Load data
+	fmt.Printf("\nLoading data from %s...\n", config.DataFile)
+	data, err := ReadFileReturnJSON(config.DataFile)
+	if err != nil {
+		return fmt.Errorf("failed to load data: %w", err)
+	}
+
+	documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField)
+	if len(documentsWithEmbeddings) == 0 {
+		return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField)
+	}
+	fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings))
+
+	// Insert data
+	stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize)
+	if err != nil {
+		return err
+	}
+	if stats.Inserted == 0 {
+		return fmt.Errorf("no documents were inserted successfully")
+	}
+	fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed)
+
+	// Create HNSW vector index
+	fmt.Println("\nCreating HNSW vector index...")
+	err = CreateHNSWVectorIndex(ctx, collection, config.VectorField, config.Dimensions, config.Similarity)
+	if err != nil {
+		return fmt.Errorf("failed to create HNSW vector index: %w", err)
+	}
+
+	// Fixed grace period before querying the freshly created index.
+	fmt.Println("Waiting for index to be ready...")
+	time.Sleep(2 * time.Second)
+
+	// Perform vector search
+	query := "quintessential lodging near running trails, eateries, retail"
+	results, err := PerformVectorSearch(ctx, collection, aiClient, query, config.VectorField, config.ModelName, 5)
+	if err != nil {
+		return fmt.Errorf("failed to perform HNSW vector search: %w", err)
+	}
+
+	PrintSearchResults(results, "hnsw")
+
+	log.Println("HNSW demonstration completed successfully!")
+	return nil
+}
diff --git a/ai/select-algorithm-go/src/ivf.go b/ai/select-algorithm-go/src/ivf.go
new file mode 100644
index 0000000..3da7cba
--- /dev/null
+++ b/ai/select-algorithm-go/src/ivf.go
@@ -0,0 +1,110 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"strings"
+	"time"
+
+	"github.com/openai/openai-go/v3"
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/mongo"
+)
+
+// CreateIVFVectorIndex (re)creates the IVF (Inverted File) vector index on
+// vectorField.
+//
+// Any existing cosmosSearch index on the field is dropped first; a failure to
+// drop is only logged. The index is then created with a raw createIndexes
+// command using kind "vector-ivf" and numLists=10.
+//
+// The returned error wraps the underlying driver error, so callers can inspect
+// it with errors.Is / errors.As.
+func CreateIVFVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, dimensions int, similarity string) error {
+	fmt.Printf("Creating IVF vector index on field '%s'...\n", vectorField)
+
+	// Best effort: a stale index that cannot be dropped should not abort the demo.
+	err := DropVectorIndexes(ctx, collection, vectorField)
+	if err != nil {
+		fmt.Printf("Warning: Could not drop existing indexes: %v\n", err)
+	}
+
+	// Must use bson.D for commands to preserve order and avoid "multi-key map" errors
+	indexCommand := bson.D{
+		{"createIndexes", collection.Name()},
+		{"indexes", []bson.D{
+			{
+				{"name", fmt.Sprintf("ivf_index_%s", vectorField)},
+				{"key", bson.D{
+					{vectorField, "cosmosSearch"},
+				}},
+				{"cosmosSearchOptions", bson.D{
+					{"kind", "vector-ivf"},
+					{"dimensions", dimensions},
+					{"similarity", similarity},
+					// Number of clusters to partition vectors into
+					{"numLists", 10},
+				}},
+			},
+		}},
+	}
+
+	var result bson.M
+	err = collection.Database().RunCommand(ctx, indexCommand).Decode(&result)
+	if err != nil {
+		// The server reports an unsupported tier only through the error text.
+		if strings.Contains(err.Error(), "not enabled for this cluster tier") {
+			fmt.Println("\nIVF indexes require a higher cluster tier.")
+			fmt.Println("Try upgrading your DocumentDB cluster or use a different algorithm.")
+		}
+		return fmt.Errorf("error creating IVF vector index: %w", err)
+	}
+
+	fmt.Println("IVF vector index created successfully")
+	return nil
+}
+
+// RunIVF executes the full IVF vector search workflow.
+//
+// It loads the documents from config.DataFile, keeps those that contain
+// config.VectorField, reloads the "hotels_ivf" collection with them, builds
+// the IVF index, waits three seconds and then runs a top-5 vector search for a
+// fixed query, printing the results. Errors from each stage are wrapped so the
+// original cause stays reachable via errors.Is / errors.As.
+func RunIVF(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error {
+	fmt.Println("\n" + strings.Repeat("=", 60))
+	fmt.Println("IVF (Inverted File) Vector Search")
+	fmt.Println(strings.Repeat("=", 60))
+
+	collection := dbClient.Database(config.DatabaseName).Collection("hotels_ivf")
+
+	// Load data
+	fmt.Printf("\nLoading data from %s...\n", config.DataFile)
+	data, err := ReadFileReturnJSON(config.DataFile)
+	if err != nil {
+		return fmt.Errorf("failed to load data: %w", err)
+	}
+
+	documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField)
+	if len(documentsWithEmbeddings) == 0 {
+		return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField)
+	}
+	fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings))
+
+	// Insert data
+	stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize)
+	if err != nil {
+		return err
+	}
+	if stats.Inserted == 0 {
+		return fmt.Errorf("no documents were inserted successfully")
+	}
+	fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed)
+
+	// Create IVF vector index
+	fmt.Println("\nCreating IVF vector index...")
+	err = CreateIVFVectorIndex(ctx, collection, config.VectorField, config.Dimensions, config.Similarity)
+	if err != nil {
+		return fmt.Errorf("failed to create IVF vector index: %w", err)
+	}
+
+	// Fixed grace period before querying the freshly created index.
+	fmt.Println("Waiting for index clustering to complete...")
+	time.Sleep(3 * time.Second)
+
+	// Perform vector search
+	query := "quintessential lodging near running trails, eateries, retail"
+	results, err := PerformVectorSearch(ctx, collection, aiClient, query, config.VectorField, config.ModelName, 5)
+	if err != nil {
+		return fmt.Errorf("failed to perform IVF vector search: %w", err)
+	}
+
+	PrintSearchResults(results, "ivf")
+
+	log.Println("IVF demonstration completed successfully!")
+	return nil
+}
diff --git a/ai/select-algorithm-go/src/main.go b/ai/select-algorithm-go/src/main.go
new file mode 100644
index 0000000..0f10b77
--- /dev/null
+++ b/ai/select-algorithm-go/src/main.go
@@ -0,0 +1,68 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log"
+)
+
+// main runs the sample and exits non-zero on a fatal error.
+//
+// All work happens in run so that its deferred cleanup executes before the
+// process exits: log.Fatal calls os.Exit, which skips deferred functions of
+// the function it is called from.
+func main() {
+	if err := run(context.Background()); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// run loads the configuration, connects to DocumentDB and Azure OpenAI, and
+// dispatches to the algorithm selected by config.Algorithm.
+//
+// For a single algorithm ("ivf", "hnsw", "diskann") a failure is returned.
+// For "all", each failure is logged and the remaining algorithms still run.
+func run(ctx context.Context) error {
+	fmt.Println("DocumentDB Select Algorithm - Go Sample")
+	fmt.Println("========================================")
+
+	// Load configuration from environment variables
+	config := LoadConfig()
+
+	fmt.Printf("Algorithm: %s\n", config.Algorithm)
+	fmt.Printf("Database: %s\n", config.DatabaseName)
+	fmt.Printf("Similarity: %s\n", config.Similarity)
+	fmt.Printf("Dimensions: %d\n", config.Dimensions)
+
+	// Initialize MongoDB and Azure OpenAI clients
+	fmt.Println("\nInitializing MongoDB and Azure OpenAI clients...")
+	mongoClient, aiClient, err := GetClientsPasswordless(ctx, config)
+	if err != nil {
+		return fmt.Errorf("Failed to initialize clients: %w", err)
+	}
+	defer func() {
+		if err := mongoClient.Disconnect(ctx); err != nil {
+			log.Printf("Warning: error disconnecting from MongoDB: %v", err)
+		}
+	}()
+
+	// Dispatch based on selected algorithm
+	switch config.Algorithm {
+	case "ivf":
+		if err := RunIVF(ctx, config, mongoClient, aiClient); err != nil {
+			return fmt.Errorf("IVF failed: %w", err)
+		}
+
+	case "hnsw":
+		if err := RunHNSW(ctx, config, mongoClient, aiClient); err != nil {
+			return fmt.Errorf("HNSW failed: %w", err)
+		}
+
+	case "diskann":
+		if err := RunDiskANN(ctx, config, mongoClient, aiClient); err != nil {
+			return fmt.Errorf("DiskANN failed: %w", err)
+		}
+
+	case "all":
+		fmt.Println("\nRunning all algorithms...")
+
+		// Best effort: one algorithm failing must not prevent the others from running.
+		if err := RunIVF(ctx, config, mongoClient, aiClient); err != nil {
+			log.Printf("IVF failed: %v", err)
+		}
+
+		if err := RunHNSW(ctx, config, mongoClient, aiClient); err != nil {
+			log.Printf("HNSW failed: %v", err)
+		}
+
+		if err := RunDiskANN(ctx, config, mongoClient, aiClient); err != nil {
+			log.Printf("DiskANN failed: %v", err)
+		}
+
+	default:
+		return fmt.Errorf("Unknown algorithm: '%s'. Use 'all', 'ivf', 'hnsw', or 'diskann'", config.Algorithm)
+	}
+
+	fmt.Println("\nDone!")
+	return nil
+}
diff --git a/ai/select-algorithm-go/src/utils.go b/ai/select-algorithm-go/src/utils.go
new file mode 100644
index 0000000..6e6a8d4
--- /dev/null
+++ b/ai/select-algorithm-go/src/utils.go
@@ -0,0 +1,395 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
+	"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
+	"github.com/joho/godotenv"
+	"github.com/openai/openai-go/v3"
+	"github.com/openai/openai-go/v3/azure"
+	"github.com/openai/openai-go/v3/option"
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/mongo"
+	"go.mongodb.org/mongo-driver/mongo/options"
+)
+
+// Config holds the application configuration
+type Config struct {
+	ClusterName  string
+	DatabaseName string
+	DataFile     string
+	VectorField  string
+	ModelName    string
+	Dimensions   int
+	BatchSize    int
+	Similarity   string
+	Algorithm    string
+}
+
+// SearchResult represents a search result document
+type SearchResult struct {
+	Document interface{} `bson:"document"`
+	Score    float64     `bson:"score"`
+}
+
+// InsertStats holds statistics about data insertion
+type InsertStats struct {
+	Total    int `json:"total"`
+	Inserted int `json:"inserted"`
+	Failed   int `json:"failed"`
+}
+
+// LoadConfig loads configuration from environment variables, applying the
+// documented defaults for anything unset.
+//
+// EMBEDDING_DIMENSIONS and LOAD_SIZE_BATCH must be positive integers; an
+// unparsable or non-positive value is reported and replaced by its default
+// (1536 and 100) instead of silently becoming 0.
+func LoadConfig() *Config {
+	// Load environment variables from .env file
+	// For production use, prefer Azure Key Vault or similar secret management
+	// services instead of .env files. For development/demo purposes only.
+	err := godotenv.Load()
+	if err != nil {
+		log.Printf("Warning: Error loading .env file: %v", err)
+	}
+
+	return &Config{
+		ClusterName:  getEnvOrDefault("MONGO_CLUSTER_NAME", ""),
+		DatabaseName: getEnvOrDefault("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"),
+		DataFile:     getEnvOrDefault("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"),
+		VectorField:  getEnvOrDefault("EMBEDDED_FIELD", "contentVector"),
+		ModelName:    getEnvOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"),
+		Dimensions:   getEnvPositiveIntOrDefault("EMBEDDING_DIMENSIONS", 1536),
+		BatchSize:    getEnvPositiveIntOrDefault("LOAD_SIZE_BATCH", 100),
+		Similarity:   getEnvOrDefault("SIMILARITY", "COS"),
+		Algorithm:    strings.ToLower(getEnvOrDefault("ALGORITHM", "all")),
+	}
+}
+
+// getEnvOrDefault returns environment variable value or default if not set
+func getEnvOrDefault(key, defaultValue string) string {
+	if value := os.Getenv(key); value != "" {
+		return value
+	}
+	return defaultValue
+}
+
+// getEnvPositiveIntOrDefault returns the environment variable parsed as a
+// positive integer. It returns defaultValue when the variable is unset or
+// empty, and logs a warning before falling back when the value is not a
+// positive integer.
+func getEnvPositiveIntOrDefault(key string, defaultValue int) int {
+	raw := os.Getenv(key)
+	if raw == "" {
+		return defaultValue
+	}
+
+	value, err := strconv.Atoi(strings.TrimSpace(raw))
+	if err != nil || value <= 0 {
+		log.Printf("Warning: %s=%q is not a positive integer; using default %d", key, raw, defaultValue)
+		return defaultValue
+	}
+	return value
+}
+
+// GetClientsPasswordless creates MongoDB and Azure OpenAI clients with
+// passwordless authentication (DefaultAzureCredential).
+//
+// Both required settings (MONGO_CLUSTER_NAME, AZURE_OPENAI_EMBEDDING_ENDPOINT)
+// are validated before any connection is opened, so a configuration error
+// never leaves a connected MongoDB client behind. On success the caller owns
+// the returned *mongo.Client and must Disconnect it.
+func GetClientsPasswordless(ctx context.Context, config *Config) (*mongo.Client, openai.Client, error) {
+	if config.ClusterName == "" {
+		return nil, openai.Client{}, fmt.Errorf("MONGO_CLUSTER_NAME environment variable is required")
+	}
+
+	// Get Azure OpenAI endpoint; drop trailing slashes so the URL built below
+	// does not contain "//".
+	azureOpenAIEndpoint := strings.TrimRight(os.Getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"), "/")
+	if azureOpenAIEndpoint == "" {
+		return nil, openai.Client{}, fmt.Errorf("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required")
+	}
+
+	// Create Azure credential
+	credential, err := azidentity.NewDefaultAzureCredential(nil)
+	if err != nil {
+		return nil, openai.Client{}, fmt.Errorf("failed to create Azure credential: %w", err)
+	}
+
+	// Connect to DocumentDB with OIDC authentication
+	mongoURI := fmt.Sprintf("mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", config.ClusterName)
+
+	fmt.Println("Attempting OIDC authentication...")
+	mongoClient, err := connectWithOIDC(ctx, mongoURI, credential)
+	if err != nil {
+		return nil, openai.Client{}, fmt.Errorf("OIDC authentication failed: %w", err)
+	}
+	fmt.Println("OIDC authentication successful!")
+
+	// Create Azure OpenAI client with credential-based authentication
+	openAIClient := openai.NewClient(
+		option.WithBaseURL(fmt.Sprintf("%s/openai/v1", azureOpenAIEndpoint)),
+		azure.WithTokenCredential(credential))
+
+	return mongoClient, openAIClient, nil
+}
+
+// connectWithOIDC connects to mongoURI using the MONGODB-OIDC mechanism, with
+// access tokens obtained from credential, and verifies the connection.
+//
+// mongo.Connect does no I/O in the calling goroutine, so a Ping is issued to
+// force the handshake; connection and authentication failures are therefore
+// reported here rather than on the first real operation. The client is
+// disconnected again if the ping fails.
+func connectWithOIDC(ctx context.Context, mongoURI string, credential *azidentity.DefaultAzureCredential) (*mongo.Client, error) {
+	oidcCallback := func(ctx context.Context, args *options.OIDCArgs) (*options.OIDCCredential, error) {
+		scope := "https://ossrdbms-aad.database.windows.net/.default"
+		fmt.Printf("Getting token with scope: %s\n", scope)
+		token, err := credential.GetToken(ctx, policy.TokenRequestOptions{
+			Scopes: []string{scope},
+		})
+		if err != nil {
+			return nil, fmt.Errorf("failed to get token with scope %s: %w", scope, err)
+		}
+
+		fmt.Printf("Successfully obtained token\n")
+
+		return &options.OIDCCredential{
+			AccessToken: token.Token,
+		}, nil
+	}
+
+	clientOptions := options.Client().
+		ApplyURI(mongoURI).
+		SetConnectTimeout(30 * time.Second).
+		SetServerSelectionTimeout(30 * time.Second).
+		SetRetryWrites(true).
+		SetAuth(options.Credential{
+			AuthMechanism: "MONGODB-OIDC",
+			AuthMechanismProperties: map[string]string{
+				"TOKEN_RESOURCE": "https://ossrdbms-aad.database.windows.net",
+			},
+			OIDCMachineCallback: oidcCallback,
+		})
+
+	mongoClient, err := mongo.Connect(ctx, clientOptions)
+	if err != nil {
+		return nil, err
+	}
+
+	// Force a round trip so that an unreachable cluster or a rejected token is
+	// detected now, before the caller reports success.
+	if err := mongoClient.Ping(ctx, nil); err != nil {
+		_ = mongoClient.Disconnect(ctx)
+		return nil, fmt.Errorf("failed to verify connection: %w", err)
+	}
+
+	return mongoClient, nil
+}
+
+// ReadFileReturnJSON reads a JSON file and returns the data as a slice of maps.
+// The file must contain a top-level JSON array of objects.
+func ReadFileReturnJSON(filePath string) ([]map[string]interface{}, error) {
+	file, err := os.ReadFile(filePath)
+	if err != nil {
+		return nil, fmt.Errorf("error reading file '%s': %w", filePath, err)
+	}
+
+	var data []map[string]interface{}
+	err = json.Unmarshal(file, &data)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing JSON in file '%s': %w", filePath, err)
+	}
+
+	return data, nil
+}
+
+// InsertData inserts data into a MongoDB collection in unordered batches of
+// batchSize documents, pausing 100ms between batches.
+//
+// Per-batch write failures are counted and printed but do not stop the load;
+// the totals are returned in InsertStats. An error is returned only when
+// batchSize is not positive (which would otherwise loop forever) or when ctx
+// is cancelled between batches; in the latter case the stats gathered so far
+// are returned alongside the error.
+func InsertData(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) {
+	if batchSize <= 0 {
+		return nil, fmt.Errorf("batchSize must be positive, got %d", batchSize)
+	}
+
+	totalDocuments := len(data)
+	insertedCount := 0
+	failedCount := 0
+
+	fmt.Printf("Starting batch insertion of %d documents...\n", totalDocuments)
+
+	for i := 0; i < totalDocuments; i += batchSize {
+		// Stop early instead of failing every remaining batch one by one.
+		if err := ctx.Err(); err != nil {
+			return &InsertStats{
+				Total:    totalDocuments,
+				Inserted: insertedCount,
+				Failed:   failedCount,
+			}, err
+		}
+
+		end := i + batchSize
+		if end > totalDocuments {
+			end = totalDocuments
+		}
+
+		batch := data[i:end]
+		batchNum := (i / batchSize) + 1
+
+		// InsertMany takes []interface{}, so the typed slice has to be copied.
+		documents := make([]interface{}, len(batch))
+		for j, doc := range batch {
+			documents[j] = doc
+		}
+
+		result, err := collection.InsertMany(ctx, documents, options.InsertMany().SetOrdered(false))
+		if err != nil {
+			if bulkErr, ok := err.(mongo.BulkWriteException); ok {
+				// Unordered insert: documents without a write error were stored.
+				errorCount := len(bulkErr.WriteErrors)
+				insertedCount += len(batch) - errorCount
+				failedCount += errorCount
+				fmt.Printf("Batch %d had errors: %d inserted, %d failed\n", batchNum, len(batch)-errorCount, errorCount)
+				for _, writeErr := range bulkErr.WriteErrors {
+					fmt.Printf("  Error: %s\n", writeErr.Message)
+				}
+			} else {
+				failedCount += len(batch)
+				fmt.Printf("Batch %d failed completely: %v\n", batchNum, err)
+			}
+		} else {
+			insertedCount += len(result.InsertedIDs)
+			fmt.Printf("Batch %d completed: %d documents inserted\n", batchNum, len(result.InsertedIDs))
+		}
+
+		time.Sleep(100 * time.Millisecond)
+	}
+
+	return &InsertStats{
+		Total:    totalDocuments,
+		Inserted: insertedCount,
+		Failed:   failedCount,
+	}, nil
+}
+
+// DropVectorIndexes drops existing vector indexes on the specified field.
+//
+// An index counts as a vector index when its key maps vectorField to
+// "cosmosSearch". Failing to drop an individual index is logged and skipped;
+// an error is returned only when the indexes cannot be listed or the listing
+// cursor fails part-way.
+func DropVectorIndexes(ctx context.Context, collection *mongo.Collection, vectorField string) error {
+	cursor, err := collection.Indexes().List(ctx)
+	if err != nil {
+		return fmt.Errorf("could not list indexes: %w", err)
+	}
+	defer cursor.Close(ctx)
+
+	var vectorIndexes []string
+	for cursor.Next(ctx) {
+		var index bson.M
+		if err := cursor.Decode(&index); err != nil {
+			fmt.Printf("Warning: Could not decode index definition: %v\n", err)
+			continue
+		}
+
+		if key, ok := index["key"].(bson.M); ok {
+			if indexType, exists := key[vectorField]; exists && indexType == "cosmosSearch" {
+				if name, ok := index["name"].(string); ok {
+					vectorIndexes = append(vectorIndexes, name)
+				}
+			}
+		}
+	}
+	// Next returns false on both exhaustion and failure; tell them apart.
+	if err := cursor.Err(); err != nil {
+		return fmt.Errorf("could not list indexes: %w", err)
+	}
+
+	for _, indexName := range vectorIndexes {
+		fmt.Printf("Dropping existing vector index: %s\n", indexName)
+		_, err := collection.Indexes().DropOne(ctx, indexName)
+		if err != nil {
+			fmt.Printf("Warning: Could not drop index %s: %v\n", indexName, err)
+		}
+	}
+
+	if len(vectorIndexes) > 0 {
+		fmt.Printf("Dropped %d existing vector index(es)\n", len(vectorIndexes))
+	} else {
+		fmt.Println("No existing vector indexes found to drop")
+	}
+
+	return nil
+}
+
+// PerformVectorSearch embeds query with the given model and runs a
+// $search/cosmosSearch aggregation against vectorField, returning up to topK
+// results. Each result carries the whole matched document and its search score.
+func PerformVectorSearch(ctx context.Context, collection *mongo.Collection, client openai.Client, query, vectorField, model string, topK int) ([]SearchResult, error) {
+	fmt.Printf("Performing vector search for: '%s'\n", query)
+
+	queryEmbedding, err := GenerateEmbedding(ctx, client, query, model)
+	if err != nil {
+		return nil, fmt.Errorf("error generating embedding: %w", err)
+	}
+
+	pipeline := []bson.M{
+		{
+			"$search": bson.M{
+				"cosmosSearch": bson.M{
+					"vector": queryEmbedding,
+					"path":   vectorField,
+					"k":      topK,
+				},
+			},
+		},
+		{
+			"$project": bson.M{
+				"document": "$$ROOT",
+				"score":    bson.M{"$meta": "searchScore"},
+			},
+		},
+	}
+
+	cursor, err := collection.Aggregate(ctx, pipeline)
+	if err != nil {
+		return nil, fmt.Errorf("error performing vector search: %w", err)
+	}
+	defer cursor.Close(ctx)
+
+	var results []SearchResult
+	for cursor.Next(ctx) {
+		var result SearchResult
+		if err := cursor.Decode(&result); err != nil {
+			fmt.Printf("Warning: Could not decode result: %v\n", err)
+			continue
+		}
+		results = append(results, result)
+	}
+
+	if err := cursor.Err(); err != nil {
+		return nil, fmt.Errorf("cursor error: %w", err)
+	}
+
+	return results, nil
+}
+
+// GenerateEmbedding generates an embedding for the given text using Azure
+// OpenAI and returns the first embedding of the response as a []float64.
+func GenerateEmbedding(ctx context.Context, client openai.Client, text, modelName string) ([]float64, error) {
+	resp, err := client.Embeddings.New(ctx, openai.EmbeddingNewParams{
+		Input: openai.EmbeddingNewParamsInputUnion{
+			OfString: openai.String(text),
+		},
+		Model: modelName,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate embedding: %w", err)
+	}
+
+	if len(resp.Data) == 0 {
+		return nil, fmt.Errorf("no embedding data received")
+	}
+
+	embedding := make([]float64, len(resp.Data[0].Embedding))
+	for i, v := range resp.Data[0].Embedding {
+		embedding[i] = float64(v)
+	}
+
+	return embedding, nil
+}
+
+// PrintSearchResults prints one line per result with its rank, the value of
+// the document's "HotelName" field and its score.
+//
+// The document is inspected with a type switch rather than an unchecked
+// assertion, so a result whose document is not a bson.D or bson.M prints an
+// empty hotel name instead of panicking.
+func PrintSearchResults(results []SearchResult, algorithm string) {
+	if len(results) == 0 {
+		fmt.Println("No search results found.")
+		return
+	}
+
+	fmt.Printf("\n%s Search Results (top %d):\n", strings.ToUpper(algorithm), len(results))
+	fmt.Println(strings.Repeat("=", 80))
+
+	for i, result := range results {
+		var hotelName string
+		switch doc := result.Document.(type) {
+		case bson.D:
+			for _, elem := range doc {
+				if elem.Key == "HotelName" {
+					hotelName = fmt.Sprintf("%v", elem.Value)
+					break
+				}
+			}
+		case bson.M:
+			if value, ok := doc["HotelName"]; ok {
+				hotelName = fmt.Sprintf("%v", value)
+			}
+		}
+
+		fmt.Printf("%d. HotelName: %s, Score: %.4f\n", i+1, hotelName, result.Score)
+	}
+}
+
+// FilterDocumentsWithEmbeddings returns only documents that contain the vector
+// field. Only the presence of the key is checked, not the value stored under it.
+func FilterDocumentsWithEmbeddings(data []map[string]interface{}, vectorField string) []map[string]interface{} {
+	var filtered []map[string]interface{}
+	for _, doc := range data {
+		if _, exists := doc[vectorField]; exists {
+			filtered = append(filtered, doc)
+		}
+	}
+	return filtered
+}
+
+// PrepareCollection clears existing data and inserts new documents.
+//
+// Every document in the collection is deleted before data is inserted in
+// batches of batchSize; the insertion statistics are returned.
+func PrepareCollection(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) {
+	fmt.Printf("Preparing collection '%s'...\n", collection.Name())
+
+	deleteResult, err := collection.DeleteMany(ctx, bson.M{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to clear existing data: %w", err)
+	}
+	if deleteResult.DeletedCount > 0 {
+		fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount)
+	}
+
+	stats, err := InsertData(ctx, collection, data, batchSize)
+	if err != nil {
+		return nil, fmt.Errorf("failed to insert data: %w", err)
+	}
+
+	return stats, nil
+}
diff --git a/ai/select-algorithm-java/README.md b/ai/select-algorithm-java/README.md
new file mode 100644
index 0000000..72ba7cc
--- /dev/null
+++ b/ai/select-algorithm-java/README.md
@@ -0,0 +1,90 @@
+# Select Algorithm - Java
+
+This sample demonstrates how to create and use different vector search index algorithms (IVF, HNSW, DiskANN) with Azure DocumentDB using the MongoDB Java driver.
+
+## Prerequisites
+
+- Java 17 or later
+- Maven 3.8+
+- Azure DocumentDB cluster with vector search enabled
+- Azure OpenAI resource with an embedding model deployed
+- Azure CLI logged in (`az login`) for passwordless authentication
+
+## Setup
+
+1.
Copy the environment file and fill in your values: + + ```bash + cp .env.example .env + ``` + +2. Update `.env` with your Azure resource details: + - `MONGO_CLUSTER_NAME` — your DocumentDB cluster name + - `AZURE_OPENAI_EMBEDDING_ENDPOINT` — your Azure OpenAI endpoint + - `AZURE_OPENAI_EMBEDDING_MODEL` — deployment name (e.g., `text-embedding-3-small`) + - `DATA_FILE_WITH_VECTORS` — path to the pre-computed vectors JSON file + +## Build + +```bash +mvn clean compile +``` + +## Run + +Run all algorithms: + +```bash +mvn exec:java +``` + +Run a specific algorithm: + +```bash +# Set ALGORITHM to: ivf, hnsw, diskann, or all +ALGORITHM=ivf mvn exec:java +``` + +On Windows (PowerShell): + +```powershell +$env:ALGORITHM="hnsw"; mvn exec:java +``` + +## Algorithms + +| Algorithm | Description | Best For | +|-----------|-------------|----------| +| **IVF** | Inverted File index — partitions vectors into clusters | Smaller datasets (fewer than ~10K documents) | +| **HNSW** | Hierarchical Navigable Small World graph | Medium datasets (~10K–50K documents) that need low-latency, high-recall searches | +| **DiskANN** | Disk-based Approximate Nearest Neighbor | Large datasets (50K+ documents), including those that exceed memory | + +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `ALGORITHM` | `all` | Which algorithm to run: `ivf`, `hnsw`, `diskann`, `all` | +| `SIMILARITY` | `COS` | Similarity metric: `COS`, `L2`, `IP` | +| `EMBEDDING_DIMENSIONS` | `1536` | Vector dimensions | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Target database name | +| `EMBEDDED_FIELD` | `contentVector` | Field name containing embeddings | + +## Authentication + +This sample uses **passwordless authentication** via `DefaultAzureCredential`: + +- **DocumentDB**: OIDC mechanism with Azure identity +- **Azure OpenAI**: Entra ID token-based auth + +Ensure your identity has the appropriate RBAC roles assigned on both resources. 
+ +## Project Structure + +``` +src/main/java/com/azure/documentdb/selectalgorithm/ +├── Main.java — Entry point, dispatches to algorithm demos +├── Utils.java — Shared helpers (connection, embedding, data loading) +├── IvfDemo.java — IVF index creation and vector search +├── HnswDemo.java — HNSW index creation and vector search +└── DiskannDemo.java — DiskANN index creation and vector search +``` diff --git a/ai/select-algorithm-java/pom.xml b/ai/select-algorithm-java/pom.xml new file mode 100644 index 0000000..a91ea98 --- /dev/null +++ b/ai/select-algorithm-java/pom.xml @@ -0,0 +1,65 @@ + + + 4.0.0 + + com.azure.documentdb + select-algorithm-java + 1.0.0 + jar + + DocumentDB Select Algorithm - Java + Demonstrates IVF, HNSW, and DiskANN vector search indexes with Azure DocumentDB + + + 17 + 17 + UTF-8 + + + + + org.mongodb + mongodb-driver-sync + 5.4.0 + + + com.azure + azure-identity + 1.16.0 + + + com.azure + azure-ai-openai + 1.0.0-beta.16 + + + io.github.cdimascio + dotenv-java + 3.1.0 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 17 + 17 + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.Main + + + + + diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskannDemo.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskannDemo.java new file mode 100644 index 0000000..0b12686 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskannDemo.java @@ -0,0 +1,77 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.List; + +public class DiskannDemo { + + private static final String COLLECTION_NAME = "hotels_diskann"; + private static final String QUERY = "quintessential 
lodging near running trails, eateries, retail"; + + public static void createDiskannIndex(MongoCollection collection, String vectorField, int dimensions, String similarity) { + System.out.println(" Creating DiskANN vector index..."); + + Document indexDefinition = new Document() + .append("name", "diskann_index_" + vectorField) + .append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", new Document() + .append("kind", "vector-diskann") + .append("dimensions", dimensions) + .append("similarity", similarity) + .append("maxDegree", 20) + .append("lBuild", 10)); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + collection.getDatabase().runCommand(command); + System.out.println(" DiskANN index created successfully."); + } + + public static void run() { + System.out.println("\n========================================"); + System.out.println(" DiskANN Index Demo"); + System.out.println("========================================\n"); + + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "contentVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String similarity = Utils.getEnv("SIMILARITY", "COS"); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + // Load and insert data + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + // Drop existing collection to start fresh + 
collection.drop(); + System.out.println(" Collection reset."); + + Utils.insertData(collection, data, 100); + + // Create DiskANN index + createDiskannIndex(collection, vectorField, dimensions, similarity); + + // Perform vector search + OpenAIClient aiClient = Utils.getOpenAIClient(); + System.out.println("\n Performing vector search with DiskANN index..."); + List results = Utils.performVectorSearch( + collection, aiClient, QUERY, vectorField, model, 5); + + Utils.printResults(results); + } + + System.out.println(" DiskANN Demo complete.\n"); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HnswDemo.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HnswDemo.java new file mode 100644 index 0000000..09d436a --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HnswDemo.java @@ -0,0 +1,77 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.List; + +public class HnswDemo { + + private static final String COLLECTION_NAME = "hotels_hnsw"; + private static final String QUERY = "quintessential lodging near running trails, eateries, retail"; + + public static void createHnswIndex(MongoCollection collection, String vectorField, int dimensions, String similarity) { + System.out.println(" Creating HNSW vector index..."); + + Document indexDefinition = new Document() + .append("name", "hnsw_index_" + vectorField) + .append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", new Document() + .append("kind", "vector-hnsw") + .append("dimensions", dimensions) + .append("similarity", similarity) + .append("m", 16) + .append("efConstruction", 64)); + + Document command = new Document("createIndexes", 
collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + collection.getDatabase().runCommand(command); + System.out.println(" HNSW index created successfully."); + } + + public static void run() { + System.out.println("\n========================================"); + System.out.println(" HNSW (Hierarchical Navigable Small World) Index Demo"); + System.out.println("========================================\n"); + + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "contentVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String similarity = Utils.getEnv("SIMILARITY", "COS"); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + // Load and insert data + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + // Drop existing collection to start fresh + collection.drop(); + System.out.println(" Collection reset."); + + Utils.insertData(collection, data, 100); + + // Create HNSW index + createHnswIndex(collection, vectorField, dimensions, similarity); + + // Perform vector search + OpenAIClient aiClient = Utils.getOpenAIClient(); + System.out.println("\n Performing vector search with HNSW index..."); + List results = Utils.performVectorSearch( + collection, aiClient, QUERY, vectorField, model, 5); + + Utils.printResults(results); + } + + System.out.println(" HNSW Demo complete.\n"); + } +} diff --git 
a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IvfDemo.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IvfDemo.java
new file mode 100644
index 0000000..5baad0b
--- /dev/null
+++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IvfDemo.java
@@ -0,0 +1,76 @@
+package com.azure.documentdb.selectalgorithm;
+
+import com.azure.ai.openai.OpenAIClient;
+import com.mongodb.client.MongoClient;
+import com.mongodb.client.MongoCollection;
+import com.mongodb.client.MongoDatabase;
+import org.bson.Document;
+
+import java.util.List;
+
+/**
+ * IVF demo: reloads the hotel documents into a fresh {@code hotels_ivf} collection,
+ * builds a {@code vector-ivf} index on the vector field and runs one sample vector search.
+ */
+public class IvfDemo {
+
+    private static final String COLLECTION_NAME = "hotels_ivf";
+    private static final String QUERY = "quintessential lodging near running trails, eateries, retail";
+
+    /**
+     * Creates the IVF vector index by sending a {@code createIndexes} command.
+     *
+     * <p>The index is named {@code ivf_index_<vectorField>} and is built with {@code numLists = 10}.
+     *
+     * @param database    database that owns {@code collection}; the command is run on it because
+     *                    the sync driver's {@code MongoCollection} exposes only its namespace,
+     *                    not a handle to its database
+     * @param collection  collection to index
+     * @param vectorField name of the document field that holds the embedding
+     * @param dimensions  number of dimensions of the stored embeddings
+     * @param similarity  similarity metric passed through to {@code cosmosSearchOptions}
+     */
+    public static void createIvfIndex(MongoDatabase database, MongoCollection<Document> collection,
+                                      String vectorField, int dimensions, String similarity) {
+        System.out.println(" Creating IVF vector index...");
+
+        Document indexDefinition = new Document()
+                .append("name", "ivf_index_" + vectorField)
+                .append("key", new Document(vectorField, "cosmosSearch"))
+                .append("cosmosSearchOptions", new Document()
+                        .append("kind", "vector-ivf")
+                        .append("dimensions", dimensions)
+                        .append("similarity", similarity)
+                        .append("numLists", 10));
+
+        Document command = new Document("createIndexes", collection.getNamespace().getCollectionName())
+                .append("indexes", List.of(indexDefinition));
+
+        database.runCommand(command);
+        System.out.println(" IVF index created successfully.");
+    }
+
+    /**
+     * Runs the full demo: read settings, reset the collection, insert the data,
+     * create the index and print the results of one vector search.
+     */
+    public static void run() {
+        System.out.println("\n========================================");
+        System.out.println(" IVF (Inverted File) Index Demo");
+        System.out.println("========================================\n");
+
+        String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels");
+        String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json");
+        String vectorField = Utils.getEnv("EMBEDDED_FIELD", "contentVector");
+        int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536"));
+        String similarity = Utils.getEnv("SIMILARITY", "COS");
+        String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small");
+
+        // try-with-resources closes the client even when a step below throws
+        try (MongoClient mongoClient = Utils.getMongoClient()) {
+            MongoDatabase database = mongoClient.getDatabase(databaseName);
+            MongoCollection<Document> collection = database.getCollection(COLLECTION_NAME);
+
+            // Load and insert data
+            System.out.println(" Loading data from: " + dataFile);
+            List<Document> data = Utils.readJsonFile(dataFile);
+            System.out.printf(" Loaded %d documents%n", data.size());
+
+            // Drop existing collection to start fresh
+            collection.drop();
+            System.out.println(" Collection reset.");
+
+            Utils.insertData(collection, data, 100);
+
+            // Create IVF index
+            createIvfIndex(database, collection, vectorField, dimensions, similarity);
+
+            // Perform vector search
+            OpenAIClient aiClient = Utils.getOpenAIClient();
+            System.out.println("\n Performing vector search with IVF index...");
+            List<Document> results = Utils.performVectorSearch(
+                    collection, aiClient, QUERY, vectorField, model, 5);
+
+            Utils.printResults(results);
+        }
+
+        System.out.println(" IVF Demo complete.\n");
+    }
+}
diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java
new file mode 100644
index 0000000..18fe5b9
--- /dev/null
+++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java
@@ -0,0 +1,34 @@
+package com.azure.documentdb.selectalgorithm;
+
+public class Main {
+
+    public static void main(String[] args) {
+        String algorithm = Utils.getEnv("ALGORITHM", "all").toLowerCase().trim();
+
+        System.out.println("==============================================");
+        System.out.println(" Azure DocumentDB - Vector Search Algorithms");
+        
System.out.println("=============================================="); + System.out.println(" Algorithm: " + algorithm); + System.out.println(); + + switch (algorithm) { + case "ivf" -> IvfDemo.run(); + case "hnsw" -> HnswDemo.run(); + case "diskann" -> DiskannDemo.run(); + case "all" -> { + IvfDemo.run(); + HnswDemo.run(); + DiskannDemo.run(); + } + default -> { + System.err.println("Unknown algorithm: " + algorithm); + System.err.println("Valid options: ivf, hnsw, diskann, all"); + System.exit(1); + } + } + + System.out.println("=============================================="); + System.out.println(" All demos complete."); + System.out.println("=============================================="); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java new file mode 100644 index 0000000..f72c9ad --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java @@ -0,0 +1,188 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.azure.ai.openai.OpenAIClientBuilder; +import com.azure.ai.openai.models.EmbeddingItem; +import com.azure.ai.openai.models.EmbeddingsOptions; +import com.azure.identity.DefaultAzureCredential; +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.mongodb.ConnectionString; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.model.InsertManyOptions; +import io.github.cdimascio.dotenv.Dotenv; +import org.bson.Document; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class Utils { + + private static final Dotenv dotenv = 
Dotenv.configure().ignoreIfMissing().load();
+
+    /**
+     * Looks up a configuration value.
+     *
+     * <p>The value from the loaded dotenv instance is used first; if it is missing or blank the
+     * process environment is consulted; if that is also missing or blank {@code defaultValue}
+     * is returned.
+     *
+     * @param key          variable name
+     * @param defaultValue value to return when nothing usable is set (may be {@code null})
+     * @return the configured value or {@code defaultValue}
+     */
+    public static String getEnv(String key, String defaultValue) {
+        String value = dotenv.get(key);
+        if (value == null || value.isBlank()) {
+            value = System.getenv(key);
+        }
+        return (value != null && !value.isBlank()) ? value : defaultValue;
+    }
+
+    /** Same as {@link #getEnv(String, String)} with a {@code null} default. */
+    public static String getEnv(String key) {
+        return getEnv(key, null);
+    }
+
+    /**
+     * Builds a MongoDB client for the cluster named by {@code MONGO_CLUSTER_NAME},
+     * authenticating with the MONGODB-OIDC mechanism.
+     *
+     * @return a new client; the caller is responsible for closing it
+     * @throws IllegalStateException if {@code MONGO_CLUSTER_NAME} is not set
+     */
+    public static MongoClient getMongoClient() {
+        String clusterName = getEnv("MONGO_CLUSTER_NAME");
+        if (clusterName == null) {
+            throw new IllegalStateException("MONGO_CLUSTER_NAME environment variable is required");
+        }
+
+        String connectionUri = String.format(
+                "mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", clusterName);
+
+        // createOidcCredential already selects the MONGODB-OIDC mechanism, so no separate
+        // withMechanism(...) call is needed.
+        // NOTE(review): the Python sample in this change obtains tokens through
+        // DefaultAzureCredential for the scope https://ossrdbms-aad.database.windows.net/.default,
+        // while this client relies on the driver's built-in "azure" environment with a different
+        // token resource - confirm this authenticates when run from a developer machine.
+        MongoCredential mongoCredential = MongoCredential.createOidcCredential(null)
+                .withMechanismProperty("ENVIRONMENT", "azure")
+                .withMechanismProperty("TOKEN_RESOURCE", "https://cosmos.azure.com");
+
+        MongoClientSettings settings = MongoClientSettings.builder()
+                .applyConnectionString(new ConnectionString(connectionUri))
+                .credential(mongoCredential)
+                .build();
+
+        return MongoClients.create(settings);
+    }
+
+    /**
+     * Builds an Azure OpenAI client for {@code AZURE_OPENAI_EMBEDDING_ENDPOINT}
+     * using {@link DefaultAzureCredential}.
+     *
+     * @throws IllegalStateException if the endpoint variable is not set
+     */
+    public static OpenAIClient getOpenAIClient() {
+        String endpoint = getEnv("AZURE_OPENAI_EMBEDDING_ENDPOINT");
+        if (endpoint == null) {
+            throw new IllegalStateException("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required");
+        }
+
+        DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build();
+
+        return new OpenAIClientBuilder()
+                .endpoint(endpoint)
+                .credential(credential)
+                .buildClient();
+    }
+
+    /**
+     * Reads a file containing a JSON array of objects and returns them as BSON documents.
+     *
+     * @param path path of the JSON file
+     * @return the parsed documents, in file order
+     * @throws RuntimeException if the file cannot be read
+     */
+    public static List<Document> readJsonFile(String path) {
+        try {
+            String content = Files.readString(Path.of(path));
+            // Document.parse expects a JSON object, so the top-level array is wrapped
+            // in {"data": ...} and unwrapped again after parsing.
+            List<Document> docs = Document.parse("{\"data\":" + content + "}").getList("data", Document.class);
+            return docs;
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to read data file: " + path, e);
+        }
+    }
+
+    /**
+     * Inserts {@code data} into {@code collection} in unordered batches.
+     *
+     * <p>Each document is copied and its {@code _id} removed before insertion, so the input
+     * list is left unchanged. Failures whose message mentions "duplicate key" are ignored;
+     * every other failure is rethrown.
+     *
+     * @param collection target collection
+     * @param data       documents to insert
+     * @param batchSize  maximum number of documents per {@code insertMany} call; must be positive
+     * @throws IllegalArgumentException if {@code batchSize} is not positive
+     */
+    public static void insertData(MongoCollection<Document> collection, List<Document> data, int batchSize) {
+        if (batchSize <= 0) {
+            throw new IllegalArgumentException("batchSize must be positive");
+        }
+        System.out.printf(" Inserting %d documents in batches of %d...%n", data.size(), batchSize);
+        InsertManyOptions options = new InsertManyOptions().ordered(false);
+
+        for (int start = 0; start < data.size(); start += batchSize) {
+            int end = Math.min(start + batchSize, data.size());
+            // Remove _id to avoid duplicate key errors on re-run
+            List<Document> cleaned = new ArrayList<>(end - start);
+            for (Document doc : data.subList(start, end)) {
+                Document copy = new Document(doc);
+                copy.remove("_id");
+                cleaned.add(copy);
+            }
+            try {
+                collection.insertMany(cleaned, options);
+            } catch (Exception e) {
+                // Ignore duplicate key errors on re-insert; getMessage() may be null,
+                // in which case the failure is not a known duplicate and is rethrown.
+                String message = e.getMessage();
+                if (message == null || !message.contains("duplicate key")) {
+                    throw e;
+                }
+            }
+            System.out.printf(" Inserted batch %d-%d%n", start + 1, end);
+        }
+        System.out.println(" Data insertion complete.");
+    }
+
+    /**
+     * Drops every index (other than {@code _id_}) whose name contains {@code vectorField}.
+     * This is best-effort: a failure is reported on stdout and not rethrown.
+     */
+    public static void dropVectorIndexes(MongoCollection<Document> collection, String vectorField) {
+        try {
+            for (Document idx : collection.listIndexes()) {
+                String name = idx.getString("name");
+                if (name != null && name.contains(vectorField) && !name.equals("_id_")) {
+                    System.out.printf(" Dropping existing index: %s%n", name);
+                    collection.dropIndex(name);
+                }
+            }
+        } catch (Exception e) {
+            // Report the actual cause rather than claiming there was nothing to drop
+            System.out.printf(" Warning: could not drop existing vector indexes: %s%n", e.getMessage());
+        }
+    }
+
+    /**
+     * Requests an embedding for {@code text} from the given model deployment.
+     *
+     * @return the embedding of the single input text
+     * @throws RuntimeException if the service returns no embedding
+     */
+    public static List<Float> getEmbedding(OpenAIClient client, String text, String model) {
+        EmbeddingsOptions options = new EmbeddingsOptions(List.of(text));
+        List<EmbeddingItem> embeddings = client.getEmbeddings(model, options).getData();
+        if (embeddings.isEmpty()) {
+            throw new RuntimeException("No embedding returned for query text");
+        }
+        return embeddings.get(0).getEmbedding();
+    }
+
+    /**
+     * Embeds {@code query} and runs a {@code $search} / {@code cosmosSearch} aggregation.
+     *
+     * @param collection  collection to search
+     * @param aiClient    client used to embed the query text
+     * @param query       natural-language query
+     * @param vectorField document field that holds the stored embeddings
+     * @param model       embedding model deployment name
+     * @param topK        value passed as {@code k} to {@code cosmosSearch}
+     * @return projected documents with {@code HotelName}, {@code Description} and {@code score}
+     */
+    public static List<Document> performVectorSearch(
+            MongoCollection<Document> collection,
+            OpenAIClient aiClient,
+            String query,
+            String vectorField,
+            String model,
+            int topK) {
+
+        System.out.printf(" Generating embedding for query: \"%s\"%n", query);
+        List<Float> queryVector = getEmbedding(aiClient, query, model);
+        System.out.printf(" Embedding generated (%d dimensions)%n", queryVector.size());
+
+        // Convert List<Float> to List<Double> for BSON
+        List<Double> vectorAsDoubles = queryVector.stream()
+                .map(Float::doubleValue)
+                .toList();
+
+        Document searchStage = new Document("$search", new Document("cosmosSearch", new Document()
+                .append("vector", vectorAsDoubles)
+                .append("path", vectorField)
+                .append("k", topK)));
+
+        Document projectStage = new Document("$project", new Document()
+                .append("_id", 0)
+                .append("HotelName", 1)
+                .append("Description", 1)
+                .append("score", new Document("$meta", "searchScore")));
+
+        List<Document> pipeline = List.of(searchStage, projectStage);
+        List<Document> results = new ArrayList<>();
+        collection.aggregate(pipeline).forEach(results::add);
+
+        return results;
+    }
+
+    /** Prints each result as a numbered line with hotel name and score, followed by its description. */
+    public static void printResults(List<Document> results) {
+        System.out.println("\n === Search Results ===");
+        for (int i = 0; i < results.size(); i++) {
+            Document doc = results.get(i);
+            System.out.printf(" %d. %s (score: %.4f)%n",
+                    i + 1,
+                    doc.getString("HotelName"),
+                    doc.getDouble("score"));
+            System.out.printf(" %s%n", doc.getString("Description"));
+        }
+        System.out.println();
+    }
+}
diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md
new file mode 100644
index 0000000..7e65211
--- /dev/null
+++ b/ai/select-algorithm-python/README.md
@@ -0,0 +1,69 @@
+
+# Select Vector Algorithm (Python)
+
+Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB. Each algorithm is optimized for different dataset sizes and performance requirements.
+ +## Algorithm Selection Guide + +| Algorithm | Dataset Size | Cluster Tier | Key Parameters | +|-----------|-------------|--------------|----------------| +| IVF | < 10K docs | M10+ | numLists | +| HNSW | 10K-50K | M30+ | m, efConstruction | +| DiskANN | 50K+ | M30+ | maxDegree, lBuild | + +## Prerequisites + +- Azure subscription +- Azure DocumentDB vCore cluster (M30+ for all algorithms, M10+ for IVF only) +- Azure OpenAI resource with `text-embedding-3-small` deployed +- Python 3.10+ +- Azure CLI (`az login` for passwordless auth) + +## Setup + +1. Copy environment configuration: + ```bash + cp .env.example .env + ``` + +2. Update `.env` with your resource values. + +3. Install dependencies: + ```bash + cd src + pip install -r ../requirements.txt + ``` + +4. Ensure you're logged in to Azure: + ```bash + az login + ``` + +## Run + +```bash +cd src + +# Run individual algorithms +python ivf.py +python hnsw.py +python diskann.py +``` + +## Configuration + +Edit `.env` to configure: +- `ALGORITHM` — Which algorithm to test: `all`, `ivf`, `hnsw`, `diskann` +- `SIMILARITY` — Similarity metric: `COS`, `L2`, `IP` +- `EMBEDDING_DIMENSIONS` — Must match your embedding model (1536 for text-embedding-3-small) diff --git a/ai/select-algorithm-python/requirements.txt b/ai/select-algorithm-python/requirements.txt new file mode 100644 index 0000000..c0a35e0 --- /dev/null +++ b/ai/select-algorithm-python/requirements.txt @@ -0,0 +1,11 @@ +# MongoDB driver for connecting to DocumentDB +pymongo>=4.6.0 + +# Azure OpenAI SDK for generating embeddings +openai>=1.0.0,<1.56.0 + +# Azure authentication library for passwordless connection +azure-identity>=1.15.0 + +# Environment variable management from .env files +python-dotenv>=1.0.0 diff --git a/ai/select-algorithm-python/src/diskann.py b/ai/select-algorithm-python/src/diskann.py new file mode 100644 index 0000000..5fac5cd --- /dev/null +++ b/ai/select-algorithm-python/src/diskann.py @@ -0,0 +1,90 @@ +""" +DiskANN vector index for 
Azure DocumentDB.
+
+Best for: Datasets with 50,000+ documents.
+Cluster tier: M30 or higher.
+Key parameters: maxDegree (graph edges), lBuild (construction quality).
+"""
+import os
+import time
+from utils import (
+    get_clients_passwordless, get_config, read_file_return_json,
+    insert_data, drop_vector_indexes, perform_vector_search, print_search_results
+)
+
+
+def create_diskann_vector_index(collection, vector_field: str, dimensions: int,
+                                similarity: str = "COS", max_degree: int = 20,
+                                l_build: int = 10) -> dict:
+    """Create a DiskANN vector index on the specified field.
+
+    Existing vector indexes on ``vector_field`` are dropped first, then a
+    ``createIndexes`` command is sent for an index named
+    ``diskann_index_<vector_field>``.
+
+    Args:
+        collection: Collection to index.
+        vector_field: Name of the document field that holds the embedding.
+        dimensions: Number of dimensions of the stored embeddings.
+        similarity: Similarity metric passed to ``cosmosSearchOptions``.
+        max_degree: Value sent as ``maxDegree``.
+        l_build: Value sent as ``lBuild``.
+
+    Returns:
+        The reply of the ``createIndexes`` database command.
+    """
+    print(f"Creating DiskANN vector index on field '{vector_field}'...")
+
+    drop_vector_indexes(collection, vector_field)
+
+    index_command = {
+        "createIndexes": collection.name,
+        "indexes": [
+            {
+                "name": f"diskann_index_{vector_field}",
+                "key": {vector_field: "cosmosSearch"},
+                "cosmosSearchOptions": {
+                    "kind": "vector-diskann",
+                    "dimensions": dimensions,
+                    "similarity": similarity,
+                    "maxDegree": max_degree,
+                    "lBuild": l_build
+                }
+            }
+        ]
+    }
+
+    result = collection.database.command(index_command)
+    print("DiskANN vector index created successfully")
+    return result
+
+
+def main():
+    print("=" * 60)
+    print(" DiskANN Vector Index - Select Algorithm Demo")
+    print(" Best for: 50,000+ documents")
+    print("=" * 60)
+
+    config = get_config()
+    mongo_client, azure_openai_client = get_clients_passwordless()
+
+    try:
+        database = mongo_client[config['database_name']]
+        collection = database["hotels_diskann"]
+
+        # Load and insert data
+        data = read_file_return_json(config['data_file'])
+        documents = [doc for doc in data if config['vector_field'] in doc]
+        print(f"\nLoaded {len(documents)} documents with embeddings")
+
+        stats = insert_data(collection, documents, config['batch_size'])
+
+        # Create DiskANN index
+        if not stats.get('skipped'):
+            create_diskann_vector_index(
+                collection,
+                config['vector_field'],
+                config['dimensions'],
+                config['similarity']
+            )
+            print("Waiting for 
index to build...")
+            time.sleep(5)
+
+        # Perform search
+        query = "quintessential lodging near running trails, eateries, retail"
+        results = perform_vector_search(
+            collection, azure_openai_client, query,
+            config['vector_field'], config['model_name']
+        )
+        print_search_results(results, "DiskANN")
+
+    finally:
+        mongo_client.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ai/select-algorithm-python/src/hnsw.py b/ai/select-algorithm-python/src/hnsw.py
new file mode 100644
index 0000000..568ef0b
--- /dev/null
+++ b/ai/select-algorithm-python/src/hnsw.py
@@ -0,0 +1,90 @@
+"""
+HNSW (Hierarchical Navigable Small World) vector index for Azure DocumentDB.
+
+Best for: Datasets between 10,000 and 50,000 documents.
+Cluster tier: M30 or higher.
+Key parameters: m (graph connectivity), efConstruction (build quality).
+"""
+import os
+import time
+from utils import (
+    get_clients_passwordless, get_config, read_file_return_json,
+    insert_data, drop_vector_indexes, perform_vector_search, print_search_results
+)
+
+
+def create_hnsw_vector_index(collection, vector_field: str, dimensions: int,
+                             similarity: str = "COS", m: int = 16,
+                             ef_construction: int = 64) -> dict:
+    """Create an HNSW vector index on the specified field.
+
+    Existing vector indexes on ``vector_field`` are dropped first, then a
+    ``createIndexes`` command is sent for an index named
+    ``hnsw_index_<vector_field>``.
+
+    Args:
+        collection: Collection to index.
+        vector_field: Name of the document field that holds the embedding.
+        dimensions: Number of dimensions of the stored embeddings.
+        similarity: Similarity metric passed to ``cosmosSearchOptions``.
+        m: Value sent as ``m``.
+        ef_construction: Value sent as ``efConstruction``.
+
+    Returns:
+        The reply of the ``createIndexes`` database command.
+    """
+    print(f"Creating HNSW vector index on field '{vector_field}'...")
+
+    drop_vector_indexes(collection, vector_field)
+
+    index_command = {
+        "createIndexes": collection.name,
+        "indexes": [
+            {
+                "name": f"hnsw_index_{vector_field}",
+                "key": {vector_field: "cosmosSearch"},
+                "cosmosSearchOptions": {
+                    "kind": "vector-hnsw",
+                    "dimensions": dimensions,
+                    "similarity": similarity,
+                    "m": m,
+                    "efConstruction": ef_construction
+                }
+            }
+        ]
+    }
+
+    result = collection.database.command(index_command)
+    print("HNSW vector index created successfully")
+    return result
+
+
+def main():
+    print("=" * 60)
+    print(" HNSW Vector Index - Select Algorithm Demo")
+    print(" Best for: 10,000 - 50,000 documents")
+    print("=" * 60)
+
+    config = 
get_config() + mongo_client, azure_openai_client = get_clients_passwordless() + + try: + database = mongo_client[config['database_name']] + collection = database["hotels_hnsw"] + + # Load and insert data + data = read_file_return_json(config['data_file']) + documents = [doc for doc in data if config['vector_field'] in doc] + print(f"\nLoaded {len(documents)} documents with embeddings") + + stats = insert_data(collection, documents, config['batch_size']) + + # Create HNSW index + if not stats.get('skipped'): + create_hnsw_vector_index( + collection, + config['vector_field'], + config['dimensions'], + config['similarity'] + ) + print("Waiting for index to build...") + time.sleep(5) + + # Perform search + query = "quintessential lodging near running trails, eateries, retail" + results = perform_vector_search( + collection, azure_openai_client, query, + config['vector_field'], config['model_name'] + ) + print_search_results(results, "HNSW") + + finally: + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-python/src/ivf.py b/ai/select-algorithm-python/src/ivf.py new file mode 100644 index 0000000..577f82b --- /dev/null +++ b/ai/select-algorithm-python/src/ivf.py @@ -0,0 +1,88 @@ +""" +IVF (Inverted File) vector index for Azure DocumentDB. + +Best for: Datasets with fewer than 10,000 documents. +Cluster tier: M10 or higher. +Key parameters: numLists (cluster count). 
+"""
+import os
+import time
+from utils import (
+    get_clients_passwordless, get_config, read_file_return_json,
+    insert_data, drop_vector_indexes, perform_vector_search, print_search_results
+)
+
+
+def create_ivf_vector_index(collection, vector_field: str, dimensions: int,
+                            similarity: str = "COS", num_lists: int = 10) -> dict:
+    """Create an IVF vector index on the specified field.
+
+    Existing vector indexes on ``vector_field`` are dropped first, then a
+    ``createIndexes`` command is sent for an index named
+    ``ivf_index_<vector_field>``.
+
+    Args:
+        collection: Collection to index.
+        vector_field: Name of the document field that holds the embedding.
+        dimensions: Number of dimensions of the stored embeddings.
+        similarity: Similarity metric passed to ``cosmosSearchOptions``.
+        num_lists: Value sent as ``numLists``.
+
+    Returns:
+        The reply of the ``createIndexes`` database command.
+    """
+    print(f"Creating IVF vector index on field '{vector_field}'...")
+
+    drop_vector_indexes(collection, vector_field)
+
+    index_command = {
+        "createIndexes": collection.name,
+        "indexes": [
+            {
+                "name": f"ivf_index_{vector_field}",
+                "key": {vector_field: "cosmosSearch"},
+                "cosmosSearchOptions": {
+                    "kind": "vector-ivf",
+                    "dimensions": dimensions,
+                    "similarity": similarity,
+                    "numLists": num_lists
+                }
+            }
+        ]
+    }
+
+    result = collection.database.command(index_command)
+    print("IVF vector index created successfully")
+    return result
+
+
+def main():
+    print("=" * 60)
+    print(" IVF Vector Index - Select Algorithm Demo")
+    print(" Best for: < 10,000 documents")
+    print("=" * 60)
+
+    config = get_config()
+    mongo_client, azure_openai_client = get_clients_passwordless()
+
+    try:
+        database = mongo_client[config['database_name']]
+        collection = database["hotels_ivf"]
+
+        # Load and insert data
+        data = read_file_return_json(config['data_file'])
+        documents = [doc for doc in data if config['vector_field'] in doc]
+        print(f"\nLoaded {len(documents)} documents with embeddings")
+
+        stats = insert_data(collection, documents, config['batch_size'])
+
+        # Create IVF index
+        if not stats.get('skipped'):
+            create_ivf_vector_index(
+                collection,
+                config['vector_field'],
+                config['dimensions'],
+                config['similarity']
+            )
+            print("Waiting for index to build...")
+            time.sleep(3)
+
+        # Perform search
+        query = "quintessential lodging near running trails, eateries, retail"
+        results = perform_vector_search(
+            collection, azure_openai_client, query,
+            config['vector_field'], config['model_name']
+        )
+        print_search_results(results, "IVF")
+
+    finally:
+        mongo_client.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ai/select-algorithm-python/src/utils.py b/ai/select-algorithm-python/src/utils.py
new file mode 100644
index 0000000..fe0fdaa
--- /dev/null
+++ b/ai/select-algorithm-python/src/utils.py
@@ -0,0 +1,172 @@
+import json
+import os
+import time
+import warnings
+from typing import Dict, List, Any, Optional, Tuple
+
+# Suppress the PyMongo CosmosDB cluster detection warning
+warnings.filterwarnings(
+    "ignore",
+    message="You appear to be connected to a CosmosDB cluster.*",
+)
+
+from pymongo import MongoClient, InsertOne
+from pymongo.collection import Collection
+from pymongo.errors import BulkWriteError
+from azure.identity import DefaultAzureCredential
+from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult
+from openai import AzureOpenAI
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+class AzureIdentityTokenCallback(OIDCCallback):
+    def __init__(self, credential):
+        self.credential = credential
+
+    def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult:
+        token = self.credential.get_token(
+            "https://ossrdbms-aad.database.windows.net/.default").token
+        return OIDCCallbackResult(access_token=token)
+
+
+def get_clients_passwordless() -> Tuple[MongoClient, AzureOpenAI]:
+    """Create MongoDB and Azure OpenAI clients using passwordless auth."""
+    cluster_name = os.getenv("MONGO_CLUSTER_NAME")
+    if not cluster_name:
+        raise ValueError("MONGO_CLUSTER_NAME environment variable is required")
+
+    credential = DefaultAzureCredential()
+
+    mongo_client = MongoClient(
+        f"mongodb+srv://{cluster_name}.global.mongocluster.cosmos.azure.com/",
+        connectTimeoutMS=120000,
+        tls=True,
+        retryWrites=True,
+        authMechanism="MONGODB-OIDC",
+        authMechanismProperties={"OIDC_CALLBACK": AzureIdentityTokenCallback(credential)}
+    )
+
+    azure_openai_endpoint = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT")
+    if not 
+ ) + print_search_results(results, "IVF") + + finally: + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-python/src/utils.py b/ai/select-algorithm-python/src/utils.py new file mode 100644 index 0000000..fe0fdaa --- /dev/null +++ b/ai/select-algorithm-python/src/utils.py @@ -0,0 +1,172 @@ +import json +import os +import time +import warnings +from typing import Dict, List, Any, Optional, Tuple + +# Suppress the PyMongo CosmosDB cluster detection warning +warnings.filterwarnings( + "ignore", + message="You appear to be connected to a CosmosDB cluster.*", +) + +from pymongo import MongoClient, InsertOne +from pymongo.collection import Collection +from pymongo.errors import BulkWriteError +from azure.identity import DefaultAzureCredential +from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult +from openai import AzureOpenAI +from dotenv import load_dotenv + +load_dotenv() + + +class AzureIdentityTokenCallback(OIDCCallback): + def __init__(self, credential): + self.credential = credential + + def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: + token = self.credential.get_token( + "https://ossrdbms-aad.database.windows.net/.default").token + return OIDCCallbackResult(access_token=token) + + +def get_clients_passwordless() -> Tuple[MongoClient, AzureOpenAI]: + """Create MongoDB and Azure OpenAI clients using passwordless auth.""" + cluster_name = os.getenv("MONGO_CLUSTER_NAME") + if not cluster_name: + raise ValueError("MONGO_CLUSTER_NAME environment variable is required") + + credential = DefaultAzureCredential() + + mongo_client = MongoClient( + f"mongodb+srv://{cluster_name}.global.mongocluster.cosmos.azure.com/", + connectTimeoutMS=120000, + tls=True, + retryWrites=True, + authMechanism="MONGODB-OIDC", + authMechanismProperties={"OIDC_CALLBACK": AzureIdentityTokenCallback(credential)} + ) + + azure_openai_endpoint = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if not 
azure_openai_endpoint: + raise ValueError("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required") + + azure_openai_client = AzureOpenAI( + azure_endpoint=azure_openai_endpoint, + azure_ad_token_provider=lambda: credential.get_token("https://cognitiveservices.azure.com/.default").token, + api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION", "2023-05-15") + ) + + return mongo_client, azure_openai_client + + +def get_config() -> Dict[str, Any]: + """Load configuration from environment variables.""" + return { + 'database_name': os.getenv('AZURE_DOCUMENTDB_DATABASENAME', 'Hotels'), + 'data_file': os.getenv('DATA_FILE_WITH_VECTORS', '../data/Hotels_Vector.json'), + 'vector_field': os.getenv('EMBEDDED_FIELD', 'contentVector'), + 'model_name': os.getenv('AZURE_OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'), + 'dimensions': int(os.getenv('EMBEDDING_DIMENSIONS', '1536')), + 'batch_size': int(os.getenv('LOAD_SIZE_BATCH', '100')), + 'similarity': os.getenv('SIMILARITY', 'COS'), + } + + +def read_file_return_json(file_path: str) -> List[Dict[str, Any]]: + """Read a JSON file and return the parsed data.""" + try: + with open(file_path, 'r', encoding='utf-8') as file: + return json.load(file) + except FileNotFoundError: + print(f"Error: File '{file_path}' not found") + raise + + +def insert_data(collection: Collection, data: List[Dict[str, Any]], + batch_size: int = 100) -> Dict[str, Any]: + """Insert data into collection in batches, skipping if already populated.""" + total_documents = len(data) + + existing_count = collection.count_documents({}) + if existing_count >= total_documents: + print(f"Collection already has {existing_count} documents, skipping insert") + return {'total': total_documents, 'inserted': 0, 'skipped': True} + + if existing_count > 0: + collection.delete_many({}) + + inserted_count = 0 + for i in range(0, total_documents, batch_size): + batch = data[i:i + batch_size] + try: + operations = [InsertOne(doc) for doc in batch] + result = 
collection.bulk_write(operations, ordered=False) + inserted_count += result.inserted_count + except BulkWriteError as e: + inserted_count += e.details.get('nInserted', 0) + time.sleep(0.1) + + print(f"Inserted {inserted_count}/{total_documents} documents") + return {'total': total_documents, 'inserted': inserted_count, 'skipped': False} + + +def drop_vector_indexes(collection: Collection, vector_field: str) -> None: + """Drop any existing vector indexes on the specified field.""" + try: + indexes = list(collection.list_indexes()) + for index in indexes: + if 'key' in index and vector_field in index['key']: + if index['key'][vector_field] == 'cosmosSearch': + collection.drop_index(index['name']) + print(f"Dropped existing vector index: {index['name']}") + except Exception as e: + print(f"Warning: Error dropping indexes: {e}") + + +def perform_vector_search(collection: Collection, + azure_openai_client: AzureOpenAI, + query_text: str, + vector_field: str, + model_name: str, + top_k: int = 5) -> List[Dict[str, Any]]: + """Perform vector search using the $search aggregation stage.""" + embedding_response = azure_openai_client.embeddings.create( + input=[query_text], + model=model_name + ) + query_embedding = embedding_response.data[0].embedding + + pipeline = [ + { + "$search": { + "cosmosSearch": { + "vector": query_embedding, + "path": vector_field, + "k": top_k + } + } + }, + { + "$project": { + "document": "$$ROOT", + "score": {"$meta": "searchScore"} + } + } + ] + + return list(collection.aggregate(pipeline)) + + +def print_search_results(results: List[Dict[str, Any]], algorithm: str) -> None: + """Print formatted search results.""" + print(f"\n{'='*60}") + print(f" {algorithm} Search Results ({len(results)} found)") + print(f"{'='*60}") + for i, result in enumerate(results, 1): + doc = result.get('document', result) + name = doc.get('HotelName', doc.get('name', 'Unknown')) + score = result.get('score', 0) + print(f" {i}. 
{name} (score: {score:.4f})") + print() diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md new file mode 100644 index 0000000..208e43d --- /dev/null +++ b/ai/select-algorithm-typescript/README.md @@ -0,0 +1,74 @@ +# Select Algorithm — TypeScript + +Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB using TypeScript. + +## Prerequisites + +- [Node.js 20+](https://nodejs.org/) +- [Azure CLI](https://learn.microsoft.com/cli/azure/install-azure-cli) (for `az login`) +- An Azure DocumentDB cluster with vector search enabled +- An Azure OpenAI resource with an embedding model deployed + +## Setup + +1. **Install dependencies:** + + ```bash + npm install + ``` + +2. **Sign in to Azure** (for passwordless authentication): + + ```bash + az login + ``` + +3. **Configure environment variables:** + + Copy `.env.example` to `.env` and fill in your values: + + ```bash + cp .env.example .env + ``` + + | Variable | Description | + |---|---| + | `MONGO_CLUSTER_NAME` | Your DocumentDB cluster name | + | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | + | `AZURE_OPENAI_EMBEDDING_MODEL` | Embedding model deployment name | + | `AZURE_OPENAI_EMBEDDING_API_VERSION` | Azure OpenAI API version | + | `AZURE_DOCUMENTDB_DATABASENAME` | Database name (default: `Hotels`) | + | `DATA_FILE_WITH_VECTORS` | Path to JSON data file with vectors | + | `EMBEDDED_FIELD` | Field name containing the vector (default: `contentVector`) | + | `EMBEDDING_DIMENSIONS` | Vector dimensions (default: `1536`) | + | `LOAD_SIZE_BATCH` | Batch size for data insertion | + | `SIMILARITY` | Similarity metric: `COS`, `L2`, or `IP` | + +4. **Build the project:** + + ```bash + npm run build + ``` + +## Run + +Each script creates a collection, inserts data, builds a vector index, and performs a similarity search. 
+ +```bash +# IVF (Inverted File Index) +npm run start:ivf + +# HNSW (Hierarchical Navigable Small World) +npm run start:hnsw + +# DiskANN +npm run start:diskann +``` + +## Algorithm comparison + +| Algorithm | Index type | Best for | +|---|---|---| +| **IVF** | `vector-ivf` | Smaller datasets, lower memory usage | +| **HNSW** | `vector-hnsw` | Fast approximate search, balanced recall/speed | +| **DiskANN** | `vector-diskann` | Large-scale datasets, disk-based search | diff --git a/ai/select-algorithm-typescript/package-lock.json b/ai/select-algorithm-typescript/package-lock.json new file mode 100644 index 0000000..f0ceb74 --- /dev/null +++ b/ai/select-algorithm-typescript/package-lock.json @@ -0,0 +1,735 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "select-algorithm-typescript", + "version": "1.0.0", + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } + }, + "node_modules/@azure/abort-controller": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/@azure/abort-controller/-/abort-controller-2.1.2.tgz", + "integrity": "sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-auth": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-auth/-/core-auth-1.10.1.tgz", + "integrity": "sha512-ykRMW8PjVAn+RS6ww5cmK9U2CyH9p4Q88YJwvUslfuMmN98w/2rdGRLPqJYObapBCdzBVeDgYWdJnFPFb7qzpg==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-util": "^1.13.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-client": { + "version": "1.10.1", + 
"resolved": "https://registry.npmjs.org/@azure/core-client/-/core-client-1.10.1.tgz", + "integrity": "sha512-Nh5PhEOeY6PrnxNPsEHRr9eimxLwgLlpmguQaHKBinFYA/RU9+kOYVOQqOrTsCL+KSxrLLl1gD8Dk5BFW/7l/w==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-rest-pipeline": "^1.22.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-rest-pipeline": { + "version": "1.23.0", + "resolved": "https://registry.npmjs.org/@azure/core-rest-pipeline/-/core-rest-pipeline-1.23.0.tgz", + "integrity": "sha512-Evs1INHo+jUjwHi1T6SG6Ua/LHOQBCLuKEEE6efIpt4ZOoNonaT1kP32GoOcdNDbfqsD2445CPri3MubBy5DEQ==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "@typespec/ts-http-runtime": "^0.3.4", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-tracing": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/@azure/core-tracing/-/core-tracing-1.3.1.tgz", + "integrity": "sha512-9MWKevR7Hz8kNzzPLfX4EAtGM2b8mr50HPDBvio96bURP/9C+HjdH3sBlLSNNrvRAr5/k/svoH457gB5IKpmwQ==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-util": { + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/@azure/core-util/-/core-util-1.13.1.tgz", + "integrity": "sha512-XPArKLzsvl0Hf0CaGyKHUyVgF7oDnhKoP85Xv6M4StF/1AhfORhZudHtOyf2s+FcbuQ9dPRAjB8J2KvRRMUK2A==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/identity": { + "version": 
"4.13.1", + "resolved": "https://registry.npmjs.org/@azure/identity/-/identity-4.13.1.tgz", + "integrity": "sha512-5C/2WD5Vb1lHnZS16dNQRPMjN6oV/Upba+C9nBIs15PmOi6A3ZGs4Lr2u60zw4S04gi+u3cEXiqTVP7M4Pz3kw==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "@azure/core-auth": "^1.9.0", + "@azure/core-client": "^1.9.2", + "@azure/core-rest-pipeline": "^1.17.0", + "@azure/core-tracing": "^1.0.0", + "@azure/core-util": "^1.11.0", + "@azure/logger": "^1.0.0", + "@azure/msal-browser": "^5.5.0", + "@azure/msal-node": "^5.1.0", + "open": "^10.1.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/logger": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@azure/logger/-/logger-1.3.0.tgz", + "integrity": "sha512-fCqPIfOcLE+CGqGPd66c8bZpwAji98tZ4JI9i/mlTNTlsIWslCfpg48s/ypyLxZTump5sypjrKn2/kY7q8oAbA==", + "license": "MIT", + "dependencies": { + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/msal-browser": { + "version": "5.9.0", + "resolved": "https://registry.npmjs.org/@azure/msal-browser/-/msal-browser-5.9.0.tgz", + "integrity": "sha512-CzE+4PefDSJWj26zU7G1bKchlGRRHMBFreG4tAlGuzyI8hAPiYGobaJvZBgZBf6L63iphX7VH+ityL8VgEQz9Q==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-common": { + "version": "16.5.2", + "resolved": "https://registry.npmjs.org/@azure/msal-common/-/msal-common-16.5.2.tgz", + "integrity": "sha512-GkDEL6TYo3HgT3UuqakdgE9PZfc1hMki6+Hwgy1uddb/EauvAKfu85vVhuofRSo22D1xTnWt8Ucwfg4vSCVwvA==", + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-node": { + "version": "5.1.5", + "resolved": "https://registry.npmjs.org/@azure/msal-node/-/msal-node-5.1.5.tgz", + "integrity": 
"sha512-ObTeMoNPmq19X3z40et9Xvs4ZoWVeJg43PZMRLG5iwVL+2nCtAerG3YTDItqPp1CfXNwmCXBbg8jn1DOx65c3g==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2", + "jsonwebtoken": "^9.0.0" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/@mongodb-js/saslprep": { + "version": "1.4.9", + "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.4.9.tgz", + "integrity": "sha512-RXSxsokhAF/4nWys8An8npsqOI33Ex1Hlzqjw2pZOO+GKtMAR2noGnUdsFiGwsaO/xXI+56mtjTmDA3JXJsvmA==", + "license": "MIT", + "dependencies": { + "sparse-bitfield": "^3.0.3" + } + }, + "node_modules/@types/node": { + "version": "24.12.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.2.tgz", + "integrity": "sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/@types/webidl-conversions": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", + "integrity": "sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA==", + "license": "MIT" + }, + "node_modules/@types/whatwg-url": { + "version": "11.0.5", + "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz", + "integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==", + "license": "MIT", + "dependencies": { + "@types/webidl-conversions": "*" + } + }, + "node_modules/@typespec/ts-http-runtime": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@typespec/ts-http-runtime/-/ts-http-runtime-0.3.5.tgz", + "integrity": "sha512-yURCknZhvywvQItHMMmFSo+fq5arCUIyz/CVk7jD89MSai7dkaX8ufjCWp3NttLojoTVbcE72ri+be/TnEbMHw==", + "license": "MIT", + "dependencies": { + "http-proxy-agent": "^7.0.0", + "https-proxy-agent": "^7.0.0", + "tslib": "^2.6.2" + }, + "engines": 
{ + "node": ">=20.0.0" + } + }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/bson": { + "version": "6.10.4", + "resolved": "https://registry.npmjs.org/bson/-/bson-6.10.4.tgz", + "integrity": "sha512-WIsKqkSC0ABoBJuT1LEX+2HEvNmNKKgnTAyd0fL8qzK4SH2i9NXg+t08YtdZp/V9IZ33cxe3iV4yM0qg8lMQng==", + "license": "Apache-2.0", + "engines": { + "node": ">=16.20.1" + } + }, + "node_modules/buffer-equal-constant-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", + "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", + "license": "BSD-3-Clause" + }, + "node_modules/bundle-name": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", + "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", + "license": "MIT", + "dependencies": { + "run-applescript": "^7.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/default-browser": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.5.0.tgz", + "integrity": 
"sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==", + "license": "MIT", + "dependencies": { + "bundle-name": "^4.1.0", + "default-browser-id": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/default-browser-id": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.1.tgz", + "integrity": "sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/define-lazy-prop": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", + "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ecdsa-sig-formatter": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", + "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + } + }, + "node_modules/http-proxy-agent": { + "version": "7.0.2", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", + "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.0", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": 
"https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/is-docker": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", + "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", + "license": "MIT", + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-inside-container": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", + "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", + "license": "MIT", + "dependencies": { + "is-docker": "^3.0.0" + }, + "bin": { + "is-inside-container": "cli.js" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-wsl": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.1.tgz", + "integrity": "sha512-e6rvdUCiQCAuumZslxRJWR/Doq4VpPR82kqclvcS0efgt430SlGIk05vdCN58+VrzgtIcfNODjozVielycD4Sw==", + "license": "MIT", + "dependencies": { + "is-inside-container": "^1.0.0" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jsonwebtoken": { + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz", + "integrity": "sha512-MT/xP0CrubFRNLNKvxJ2BYfy53Zkm++5bX9dtuPbqAeQpTVe0MQTFhao8+Cp//EmJp244xt6Drw/GVEGCUj40g==", + "license": "MIT", + "dependencies": { + "jws": 
"^4.0.1", + "lodash.includes": "^4.3.0", + "lodash.isboolean": "^3.0.3", + "lodash.isinteger": "^4.0.4", + "lodash.isnumber": "^3.0.3", + "lodash.isplainobject": "^4.0.6", + "lodash.isstring": "^4.0.1", + "lodash.once": "^4.0.0", + "ms": "^2.1.1", + "semver": "^7.5.4" + }, + "engines": { + "node": ">=12", + "npm": ">=6" + } + }, + "node_modules/jwa": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", + "integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", + "license": "MIT", + "dependencies": { + "buffer-equal-constant-time": "^1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/jws": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", + "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", + "license": "MIT", + "dependencies": { + "jwa": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/lodash.includes": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/lodash.includes/-/lodash.includes-4.3.0.tgz", + "integrity": "sha512-W3Bx6mdkRTGtlJISOvVD/lbqjTlPPUDTMnlXZFnVwi9NKJ6tiAk6LVdlhZMm17VZisqhKcgzpO5Wz91PCt5b0w==", + "license": "MIT" + }, + "node_modules/lodash.isboolean": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isboolean/-/lodash.isboolean-3.0.3.tgz", + "integrity": "sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==", + "license": "MIT" + }, + "node_modules/lodash.isinteger": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/lodash.isinteger/-/lodash.isinteger-4.0.4.tgz", + "integrity": "sha512-DBwtEWN2caHQ9/imiNeEA5ys1JoRtRfY3d7V9wkqtbycnAmTvRRmbHKDV4a0EYc678/dia0jrte4tjYwVBaZUA==", + "license": "MIT" + }, + "node_modules/lodash.isnumber": { + "version": "3.0.3", + "resolved": 
"https://registry.npmjs.org/lodash.isnumber/-/lodash.isnumber-3.0.3.tgz", + "integrity": "sha512-QYqzpfwO3/CWf3XP+Z+tkQsfaLL/EnUlXWVkIk5FUPc4sBdTehEqZONuyRt2P67PXAk+NXmTBcc97zw9t1FQrw==", + "license": "MIT" + }, + "node_modules/lodash.isplainobject": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/lodash.isplainobject/-/lodash.isplainobject-4.0.6.tgz", + "integrity": "sha512-oSXzaWypCMHkPC3NvBEaPHf0KsA5mvPrOPgQWDsbg8n7orZ290M0BmC/jgRZ4vcJ6DTAhjrsSYgdsW/F+MFOBA==", + "license": "MIT" + }, + "node_modules/lodash.isstring": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.isstring/-/lodash.isstring-4.0.1.tgz", + "integrity": "sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==", + "license": "MIT" + }, + "node_modules/lodash.once": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/lodash.once/-/lodash.once-4.1.1.tgz", + "integrity": "sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==", + "license": "MIT" + }, + "node_modules/memory-pager": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz", + "integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==", + "license": "MIT" + }, + "node_modules/mongodb": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.21.0.tgz", + "integrity": "sha512-URyb/VXMjJ4da46OeSXg+puO39XH9DeQpWCslifrRn9JWugy0D+DvvBvkm2WxmHe61O/H19JM66p1z7RHVkZ6A==", + "license": "Apache-2.0", + "dependencies": { + "@mongodb-js/saslprep": "^1.3.0", + "bson": "^6.10.4", + "mongodb-connection-string-url": "^3.0.2" + }, + "engines": { + "node": ">=16.20.1" + }, + "peerDependencies": { + "@aws-sdk/credential-providers": "^3.188.0", + "@mongodb-js/zstd": "^1.1.0 || ^2.0.0", + "gcp-metadata": "^5.2.0", + "kerberos": "^2.0.1", + "mongodb-client-encryption": ">=6.0.0 <7", + "snappy": 
"^7.3.2", + "socks": "^2.7.1" + }, + "peerDependenciesMeta": { + "@aws-sdk/credential-providers": { + "optional": true + }, + "@mongodb-js/zstd": { + "optional": true + }, + "gcp-metadata": { + "optional": true + }, + "kerberos": { + "optional": true + }, + "mongodb-client-encryption": { + "optional": true + }, + "snappy": { + "optional": true + }, + "socks": { + "optional": true + } + } + }, + "node_modules/mongodb-connection-string-url": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.2.tgz", + "integrity": "sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA==", + "license": "Apache-2.0", + "dependencies": { + "@types/whatwg-url": "^11.0.2", + "whatwg-url": "^14.1.0 || ^13.0.0" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/open": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/open/-/open-10.2.0.tgz", + "integrity": "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==", + "license": "MIT", + "dependencies": { + "default-browser": "^5.2.1", + "define-lazy-prop": "^3.0.0", + "is-inside-container": "^1.0.0", + "wsl-utils": "^0.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/openai": { + "version": "5.23.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-5.23.2.tgz", + "integrity": "sha512-MQBzmTulj+MM5O8SKEk/gL8a7s5mktS9zUtAkU257WjvobGc9nKcBuVwjyEEcb9SI8a8Y2G/mzn3vm9n1Jlleg==", + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": 
true + }, + "zod": { + "optional": true + } + } + }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/run-applescript": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.1.0.tgz", + "integrity": "sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/semver": { + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/sparse-bitfield": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz", + "integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==", + "license": "MIT", + "dependencies": { + "memory-pager": "^1.0.2" + } + }, + "node_modules/tr46": { + "version": "5.1.1", + "resolved": 
"https://registry.npmjs.org/tr46/-/tr46-5.1.1.tgz", + "integrity": "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==", + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "license": "0BSD" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "dev": true, + "license": "MIT" + }, + "node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/whatwg-url": { + "version": "14.2.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", + "integrity": "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==", + "license": "MIT", + "dependencies": { + "tr46": "^5.1.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/wsl-utils": { + "version": "0.1.0", + 
"resolved": "https://registry.npmjs.org/wsl-utils/-/wsl-utils-0.1.0.tgz", + "integrity": "sha512-h3Fbisa2nKGPxCpm89Hk33lBLsnaGBvctQopaBSOW/uIs6FTe1ATyAnKFJrzVs9vpGdsTe73WF3V4lIsk4Gacw==", + "license": "MIT", + "dependencies": { + "is-wsl": "^3.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + } + } +} diff --git a/ai/select-algorithm-typescript/package.json b/ai/select-algorithm-typescript/package.json new file mode 100644 index 0000000..bac0876 --- /dev/null +++ b/ai/select-algorithm-typescript/package.json @@ -0,0 +1,21 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "description": "Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB", + "type": "module", + "scripts": { + "build": "tsc", + "start:ivf": "node --env-file .env dist/ivf.js", + "start:hnsw": "node --env-file .env dist/hnsw.js", + "start:diskann": "node --env-file .env dist/diskann.js" + }, + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } +} diff --git a/ai/select-algorithm-typescript/src/diskann.ts b/ai/select-algorithm-typescript/src/diskann.ts new file mode 100644 index 0000000..bd0c84a --- /dev/null +++ b/ai/select-algorithm-typescript/src/diskann.ts @@ -0,0 +1,101 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js'; + +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +const baseConfig = getConfig(); + +const config = { + ...baseConfig, + query: "quintessential lodging near running trails, eateries, retail", + collectionName: "hotels_diskann", + indexName: "vectorIndex_diskann", +}; + +async function main() { + const { aiClient, dbClient } 
= getClientsPasswordless(); + + try { + if (!aiClient) { + throw new Error('AI client is not configured. Please check your environment variables.'); + } + if (!dbClient) { + throw new Error('Database client is not configured. Please check your environment variables.'); + } + + await dbClient.connect(); + const db = dbClient.db(config.dbName); + const collection = await db.createCollection(config.collectionName); + console.log('Created collection:', config.collectionName); + + const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); + const insertSummary = await insertData(config, collection, data); + + // Create the DiskANN vector index + const indexOptions = { + createIndexes: config.collectionName, + indexes: [ + { + name: config.indexName, + key: { + [config.embeddedField]: 'cosmosSearch' + }, + cosmosSearchOptions: { + kind: 'vector-diskann', + maxDegree: 20, + lBuild: 10, + similarity: config.similarity, + dimensions: config.embeddingDimensions + } + } + ] + }; + const vectorIndexSummary = await db.command(indexOptions); + console.log('Created vector index:', config.indexName); + + // Create embedding for the query + const createEmbeddedForQueryResponse = await aiClient.embeddings.create({ + model: config.deployment, + input: [config.query] + }); + + // Perform the vector similarity search + const searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: createEmbeddedForQueryResponse.data[0].embedding, + path: config.embeddedField, + k: 5 + } + } + }, + { + $project: { + score: { + $meta: "searchScore" + }, + document: "$$ROOT" + } + } + ]).toArray(); + + printSearchResults(insertSummary, vectorIndexSummary, searchResults); + + } catch (error) { + console.error('App failed:', error); + process.exitCode = 1; + } finally { + console.log('Closing database connection...'); + if (dbClient) await dbClient.close(); + console.log('Database connection closed'); + } +} + +main().catch(error => { + 
console.error('Unhandled error:', error);
+  process.exitCode = 1;
+});
diff --git a/ai/select-algorithm-typescript/src/hnsw.ts b/ai/select-algorithm-typescript/src/hnsw.ts
new file mode 100644
index 0000000..a44d4c1
--- /dev/null
+++ b/ai/select-algorithm-typescript/src/hnsw.ts
@@ -0,0 +1,101 @@
+import path from 'path';
+import { readFileReturnJson, getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js';
+
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+const __filename = fileURLToPath(import.meta.url); // derive this module's file path from its URL
+const __dirname = dirname(__filename);
+
+const baseConfig = getConfig();
+
+const config = { // shared settings from getConfig() plus the HNSW sample's query, collection and index names
+  ...baseConfig,
+  query: "quintessential lodging near running trails, eateries, retail",
+  collectionName: "hotels_hnsw",
+  indexName: "vectorIndex_hnsw",
+};
+
+async function main() { // sample flow: connect, create collection, load + insert data, build HNSW index, run one vector search, close
+  const { aiClient, dbClient } = getClientsPasswordless();
+
+  try {
+    if (!aiClient) { // getClientsPasswordless is typed to allow null clients, so narrow before use
+      throw new Error('AI client is not configured. Please check your environment variables.');
+    }
+    if (!dbClient) {
+      throw new Error('Database client is not configured. Please check your environment variables.');
+    }
+
+    await dbClient.connect();
+    const db = dbClient.db(config.dbName);
+    const collection = await db.createCollection(config.collectionName); // NOTE(review): presumably rejects if the collection already exists (re-runs) — confirm
+    console.log('Created collection:', config.collectionName);
+
+    const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); // data file path is resolved relative to the parent of this module's directory
+    const insertSummary = await insertData(config, collection, data);
+
+    // Create the HNSW vector index (sent as a raw createIndexes command via db.command)
+    const indexOptions = {
+      createIndexes: config.collectionName,
+      indexes: [
+        {
+          name: config.indexName,
+          key: {
+            [config.embeddedField]: 'cosmosSearch'
+          },
+          cosmosSearchOptions: {
+            kind: 'vector-hnsw',
+            m: 16, // HNSW-specific build parameter
+            efConstruction: 64, // HNSW-specific build parameter
+            similarity: config.similarity,
+            dimensions: config.embeddingDimensions
+          }
+        }
+      ]
+    };
+    const vectorIndexSummary = await db.command(indexOptions);
+    console.log('Created vector index:', config.indexName);
+
+    // Create embedding for the query
+    const createEmbeddedForQueryResponse = await aiClient.embeddings.create({
+      model: config.deployment,
+      input: [config.query]
+    });
+
+    // Perform the vector similarity search (top 5 matches against the embedded field)
+    const searchResults = await collection.aggregate([
+      {
+        $search: {
+          cosmosSearch: {
+            vector: createEmbeddedForQueryResponse.data[0].embedding,
+            path: config.embeddedField,
+            k: 5
+          }
+        }
+      },
+      {
+        $project: { // each result carries the search score plus the whole matched document
+          score: {
+            $meta: "searchScore"
+          },
+          document: "$$ROOT"
+        }
+      }
+    ]).toArray();
+
+    printSearchResults(insertSummary, vectorIndexSummary, searchResults);
+
+  } catch (error) {
+    console.error('App failed:', error);
+    process.exitCode = 1; // record the failure without process.exit(), so the finally block below still closes the client
+  } finally {
+    console.log('Closing database connection...');
+    if (dbClient) await dbClient.close();
+    console.log('Database connection closed');
+  }
+}
+
+main().catch(error => {
+  console.error('Unhandled error:', error);
+  process.exitCode = 1;
+});
diff --git a/ai/select-algorithm-typescript/src/ivf.ts b/ai/select-algorithm-typescript/src/ivf.ts
new file mode 100644
index 0000000..9beff65
--- /dev/null
+++ b/ai/select-algorithm-typescript/src/ivf.ts
@@ -0,0 +1,101 @@
+import path from 'path';
+import { readFileReturnJson, getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js';
+
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+const __filename = fileURLToPath(import.meta.url); // derive this module's file path from its URL
+const __dirname = dirname(__filename);
+
+const baseConfig = getConfig();
+
+const config = { // shared settings from getConfig() plus the IVF sample's query, collection and index names
+  ...baseConfig,
+  query: "quintessential lodging near running trails, eateries, retail",
+  collectionName: "hotels_ivf",
+  indexName: "vectorIndex_ivf",
+};
+
+async function main() { // sample flow: connect, create collection, load + insert data, build IVF index, run one vector search, close
+  const { aiClient, dbClient } = getClientsPasswordless();
+
+  try {
+    if (!aiClient) { // getClientsPasswordless is typed to allow null clients, so narrow before use
+      throw new Error('AI client is not configured. Please check your environment variables.');
+    }
+    if (!dbClient) {
+      throw new Error('Database client is not configured. Please check your environment variables.');
+    }
+
+    await dbClient.connect();
+    const db = dbClient.db(config.dbName);
+    const collection = await db.createCollection(config.collectionName); // NOTE(review): presumably rejects if the collection already exists (re-runs) — confirm
+    console.log('Created collection:', config.collectionName);
+
+    const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); // data file path is resolved relative to the parent of this module's directory
+    const insertSummary = await insertData(config, collection, data);
+
+    // Create the IVF vector index (sent as a raw createIndexes command via db.command)
+    const indexOptions = {
+      createIndexes: config.collectionName,
+      indexes: [
+        {
+          name: config.indexName,
+          key: {
+            [config.embeddedField]: 'cosmosSearch'
+          },
+          cosmosSearchOptions: {
+            kind: 'vector-ivf',
+            numLists: 10, // IVF-specific build parameter
+            similarity: config.similarity,
+            dimensions: config.embeddingDimensions
+          }
+        }
+      ]
+    };
+    const vectorIndexSummary = await db.command(indexOptions);
+    console.log('Created vector index:', config.indexName);
+
+    // Create embedding for the query
+    const createEmbeddedForQueryResponse = await aiClient.embeddings.create({
+      model: config.deployment,
+      input: [config.query]
+    });
+
+    // Perform the vector similarity search (top 5 matches against the embedded field)
+    const searchResults =
await collection.aggregate([
+      {
+        $search: {
+          cosmosSearch: {
+            vector: createEmbeddedForQueryResponse.data[0].embedding,
+            path: config.embeddedField,
+            k: 5
+          },
+          returnStoredSource: true // NOTE(review): only the IVF sample sets this; the HNSW and DiskANN samples omit it — confirm which is intended
+        }
+      },
+      {
+        $project: {
+          score: {
+            $meta: "searchScore"
+          },
+          document: "$$ROOT"
+        }
+      }
+    ]).toArray();
+
+    printSearchResults(insertSummary, vectorIndexSummary, searchResults);
+
+  } catch (error) {
+    console.error('App failed:', error);
+    process.exitCode = 1;
+  } finally {
+    console.log('Closing database connection...');
+    if (dbClient) await dbClient.close();
+    console.log('Database connection closed');
+  }
+}
+
+main().catch(error => {
+  console.error('Unhandled error:', error);
+  process.exitCode = 1;
+});
diff --git a/ai/select-algorithm-typescript/src/utils.ts b/ai/select-algorithm-typescript/src/utils.ts
new file mode 100644
index 0000000..37934da
--- /dev/null
+++ b/ai/select-algorithm-typescript/src/utils.ts
@@ -0,0 +1,135 @@
+import { MongoClient, OIDCResponse, OIDCCallbackParams } from 'mongodb';
+import { AzureOpenAI } from 'openai/index.js';
+import { promises as fs } from "fs";
+import { AccessToken, DefaultAzureCredential, TokenCredential, getBearerTokenProvider } from '@azure/identity';
+
+// A JSON document as loaded from the data file.
+// NOTE(review): the type arguments were missing here (`Record;` does not compile);
+// `Record<string, any>` is an assumption — confirm against the repository.
+export type JsonData = Record<string, any>;
+
+/**
+ * OIDC callback for the MongoDB driver: obtains an access token from the given
+ * Azure credential and returns it in the shape the driver expects.
+ *
+ * @param params - Callback parameters supplied by the driver (not used here).
+ * @param credential - Azure credential used to request the token.
+ * @returns The access token and its remaining lifetime in whole seconds.
+ * @throws Error if the credential does not return a token.
+ */
+export const AzureIdentityTokenCallback = async (params: OIDCCallbackParams, credential: TokenCredential): Promise<OIDCResponse> => {
+  const tokenResponse: AccessToken | null = await credential.getToken(['https://ossrdbms-aad.database.windows.net/.default']);
+  if (!tokenResponse?.token) {
+    // Fail here with a clear message instead of handing the driver an empty token.
+    throw new Error('Failed to acquire an access token for the database from the Azure credential.');
+  }
+  // expiresOnTimestamp is epoch *milliseconds*; convert the remaining lifetime to whole seconds
+  // (the previous code subtracted epoch seconds from it, yielding a nonsensical lifetime).
+  const expiresInSeconds = Math.max(0, Math.floor((tokenResponse.expiresOnTimestamp - Date.now()) / 1000));
+  return {
+    accessToken: tokenResponse.token,
+    expiresInSeconds
+  };
+};
+
+/**
+ * Builds the Azure OpenAI client and the MongoDB client, both authenticated through
+ * DefaultAzureCredential (no passwords or keys). Does not open the database connection.
+ *
+ * @throws Error if any of the required environment variables is missing.
+ */
+export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClient: MongoClient | null } {
+  let aiClient: AzureOpenAI | null = null;
+  let dbClient: MongoClient | null = null;
+
+  const apiVersion = process.env.AZURE_OPENAI_EMBEDDING_API_VERSION!;
+  const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!;
+
const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const clusterName = process.env.MONGO_CLUSTER_NAME!; + + if (!apiVersion || !endpoint || !deployment || !clusterName) { + throw new Error('Missing required environment variables: AZURE_OPENAI_EMBEDDING_API_VERSION, AZURE_OPENAI_EMBEDDING_ENDPOINT, AZURE_OPENAI_EMBEDDING_MODEL, MONGO_CLUSTER_NAME'); + } + + console.log(`Using Azure OpenAI Embedding API Version: ${apiVersion}`); + console.log(`Using Azure OpenAI Embedding Deployment/Model: ${deployment}`); + + const credential = new DefaultAzureCredential(); + + // Azure OpenAI with DefaultAzureCredential + { + const scope = "https://cognitiveservices.azure.com/.default"; + const azureADTokenProvider = getBearerTokenProvider(credential, scope); + aiClient = new AzureOpenAI({ + apiVersion, + endpoint, + deployment, + azureADTokenProvider + }); + } + + // DocumentDB with DefaultAzureCredential (OIDC) + { + dbClient = new MongoClient( + `mongodb+srv://${clusterName}.mongocluster.cosmos.azure.com/`, { + connectTimeoutMS: 120000, + tls: true, + retryWrites: false, + maxIdleTimeMS: 120000, + authMechanism: 'MONGODB-OIDC', + authMechanismProperties: { + OIDC_CALLBACK: (params: OIDCCallbackParams) => AzureIdentityTokenCallback(params, credential), + ALLOWED_HOSTS: ['*.azure.com'] + } + }); + } + + return { aiClient, dbClient }; +} + +export function getConfig() { + const dbName = process.env.AZURE_DOCUMENTDB_DATABASENAME || 'Hotels'; + const dataFile = process.env.DATA_FILE_WITH_VECTORS!; + const batchSize = parseInt(process.env.LOAD_SIZE_BATCH! 
|| '100', 10); + const embeddedField = process.env.EMBEDDED_FIELD!; + const embeddingDimensions = parseInt(process.env.EMBEDDING_DIMENSIONS!, 10); + const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const similarity = process.env.SIMILARITY || 'COS'; + + return { dbName, dataFile, batchSize, embeddedField, embeddingDimensions, deployment, similarity }; +} + +export async function readFileReturnJson(filePath: string): Promise { + console.log(`Reading JSON file from ${filePath}`); + const fileAsString = await fs.readFile(filePath, "utf-8"); + return JSON.parse(fileAsString); +} + +export async function insertData(config, collection, data) { + console.log(`Processing in batches of ${config.batchSize}...`); + const totalBatches = Math.ceil(data.length / config.batchSize); + + let inserted = 0; + let failed = 0; + + for (let i = 0; i < totalBatches; i++) { + const start = i * config.batchSize; + const end = Math.min(start + config.batchSize, data.length); + const batch = data.slice(start, end); + + try { + const result = await collection.insertMany(batch, { ordered: false }); + inserted += result.insertedCount || 0; + console.log(`Batch ${i + 1} complete: ${result.insertedCount} inserted`); + } catch (error: any) { + if (error?.writeErrors) { + console.error(`Error in batch ${i + 1}: ${error?.writeErrors.length} failures`); + failed += error?.writeErrors.length; + inserted += batch.length - error?.writeErrors.length; + } else { + console.error(`Error in batch ${i + 1}:`, error); + failed += batch.length; + } + } + + if (i < totalBatches - 1) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + } + + const indexColumns = ["HotelId", "Category", "Description", "Description_fr"]; + for (const col of indexColumns) { + const indexSpec = {}; + indexSpec[col] = 1; + await collection.createIndex(indexSpec); + } + + return { total: data.length, inserted, failed }; +} + +export function printSearchResults(insertSummary, indexSummary, searchResults) { + 
if (!searchResults || searchResults.length === 0) { + console.log('No search results found.'); + return; + } + + searchResults.map((result, index) => { + const { document, score } = result as any; + console.log(`${index + 1}. HotelName: ${document.HotelName}, Score: ${score.toFixed(4)}`); + }); +} diff --git a/ai/select-algorithm-typescript/tsconfig.json b/ai/select-algorithm-typescript/tsconfig.json new file mode 100644 index 0000000..3cb9aaa --- /dev/null +++ b/ai/select-algorithm-typescript/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "NodeNext", + "moduleResolution": "nodenext", + "declaration": true, + "outDir": "./dist", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "noImplicitAny": false, + "forceConsistentCasingInFileNames": true, + "sourceMap": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} From 511459128bae3ca6616615feeac09fb83617ffb9 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 29 Apr 2026 10:45:17 -0700 Subject: [PATCH 2/9] fix: review findings - auth scope, consistency, env vars - Java: Fix TOKEN_RESOURCE from cosmos.azure.com to ossrdbms-aad.database.windows.net - TypeScript IVF: Remove inconsistent returnStoredSource field - .NET .env.example: Fix vector field name to contentVector, remove unused AZURE_TENANT_ID - Java .env.example: Remove unused AZURE_MANAGED_IDENTITY_PRINCIPAL_ID - Python .env.example: Fix API version to 2023-05-15 for consistency Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/.env.example | 19 ++++++++++++ ai/select-algorithm-java/.env.example | 26 +++++++++++++++++ .../documentdb/selectalgorithm/Utils.java | 2 +- ai/select-algorithm-python/.env.example | 29 +++++++++++++++++++ ai/select-algorithm-typescript/src/ivf.ts | 3 +- 5 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 
ai/select-algorithm-dotnet/.env.example create mode 100644 ai/select-algorithm-java/.env.example create mode 100644 ai/select-algorithm-python/.env.example diff --git a/ai/select-algorithm-dotnet/.env.example b/ai/select-algorithm-dotnet/.env.example new file mode 100644 index 0000000..e21ac60 --- /dev/null +++ b/ai/select-algorithm-dotnet/.env.example @@ -0,0 +1,19 @@ +# Azure OpenAI Embedding Settings +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small +AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com + +# Data File Paths and Vector Configuration +DATA_FILE_WITH_VECTORS=../../data/Hotels_Vector.json +EMBEDDED_FIELD=contentVector +EMBEDDING_DIMENSIONS=1536 +LOAD_SIZE_BATCH=100 + +# MongoDB/DocumentDB Connection Settings +MONGO_CLUSTER_NAME= + +# Algorithm Selection +# ALGORITHM: all | diskann | hnsw | ivf +ALGORITHM=all +# SIMILARITY: all | COS | L2 | IP +SIMILARITY=COS diff --git a/ai/select-algorithm-java/.env.example b/ai/select-algorithm-java/.env.example new file mode 100644 index 0000000..30a037d --- /dev/null +++ b/ai/select-algorithm-java/.env.example @@ -0,0 +1,26 @@ +# Azure DocumentDB cluster name (find in Azure Portal > DocumentDB > Overview) +MONGO_CLUSTER_NAME=your-cluster-name + +# Azure OpenAI embedding endpoint(find in Azure Portal > Azure OpenAI > Keys and Endpoint) +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + +# Azure OpenAI embedding model deployment name +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + +# Path to pre-computed vectors JSON file +DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + +# Database name (default: Hotels) +AZURE_DOCUMENTDB_DATABASENAME=Hotels + +# Field name containing embeddings in the data file +EMBEDDED_FIELD=contentVector + +# Embedding dimensions (default: 1536) +EMBEDDING_DIMENSIONS=1536 + +# Algorithm to test: all, diskann, hnsw, ivf (default: all) +ALGORITHM=all + +# Similarity to test: COS, L2, IP (default: COS) 
+SIMILARITY=COS diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java index f72c9ad..eb10178 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java @@ -52,7 +52,7 @@ public static MongoClient getMongoClient() { MongoCredential mongoCredential = MongoCredential.createOidcCredential(null) .withMechanism(MongoCredential.MONGODB_OIDC_MECHANISM) .withMechanismProperty("ENVIRONMENT", "azure") - .withMechanismProperty("TOKEN_RESOURCE", "https://cosmos.azure.com"); + .withMechanismProperty("TOKEN_RESOURCE", "https://ossrdbms-aad.database.windows.net"); MongoClientSettings settings = MongoClientSettings.builder() .applyConnectionString(new ConnectionString(connectionUri)) diff --git a/ai/select-algorithm-python/.env.example b/ai/select-algorithm-python/.env.example new file mode 100644 index 0000000..3bf4f64 --- /dev/null +++ b/ai/select-algorithm-python/.env.example @@ -0,0 +1,29 @@ +# Azure DocumentDB cluster name (find in Azure Portal > DocumentDB > Overview) +MONGO_CLUSTER_NAME=your-cluster-name + +# Azure OpenAI embedding endpoint (find in Azure Portal > Azure OpenAI > Keys and Endpoint) +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + +# Azure OpenAI embedding model deployment name +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + +# Azure OpenAI API version (see: https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation) +AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 + +# Database name (default: Hotels) +AZURE_DOCUMENTDB_DATABASENAME=Hotels + +# Path to pre-computed vectors JSON file (default: ../data/Hotels_Vector.json) +DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + +# Field name containing embeddings in the data file 
+EMBEDDED_FIELD=contentVector + +# Embedding dimensions (default: 1536) +EMBEDDING_DIMENSIONS=1536 + +# Algorithm to test: all, diskann, hnsw, ivf (default: all) +ALGORITHM=all + +# Similarity to test: COS, L2, IP (default: COS) +SIMILARITY=COS \ No newline at end of file diff --git a/ai/select-algorithm-typescript/src/ivf.ts b/ai/select-algorithm-typescript/src/ivf.ts index 9beff65..7df1520 100644 --- a/ai/select-algorithm-typescript/src/ivf.ts +++ b/ai/select-algorithm-typescript/src/ivf.ts @@ -69,8 +69,7 @@ async function main() { vector: createEmbeddedForQueryResponse.data[0].embedding, path: config.embeddedField, k: 5 - }, - returnStoredSource: true + } } }, { From 7185bb97bce41b3538718d8bceafe7e7505d81e0 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 29 Apr 2026 13:33:16 -0700 Subject: [PATCH 3/9] refactor(.NET): replace DotNetEnv with appsettings.json + ConfigurationBuilder - Remove DotNetEnv package, add Microsoft.Extensions.Configuration packages - Add appsettings.json with strongly-typed config sections - Add Models/Configuration.cs with AppConfiguration classes - Update Program.cs to use ConfigurationBuilder (json + env var override) - Update Utils.cs to accept AppConfiguration parameter - Update all demo Run() methods to receive config from Program.cs - Delete .env.example (no longer needed) - Update README to reference appsettings.json + azd env get-values Matches Article 1 (vector-search-dotnet) configuration pattern. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/.env.example | 19 ------- ai/select-algorithm-dotnet/README.md | 57 ++++++++++++++----- ai/select-algorithm-dotnet/src/DiskannDemo.cs | 20 +++---- ai/select-algorithm-dotnet/src/HnswDemo.cs | 20 +++---- ai/select-algorithm-dotnet/src/IvfDemo.cs | 20 +++---- .../src/Models/Configuration.cs | 41 +++++++++++++ ai/select-algorithm-dotnet/src/Program.cs | 27 ++++++--- .../src/SelectAlgorithm.csproj | 10 +++- ai/select-algorithm-dotnet/src/Utils.cs | 19 ++++--- .../src/appsettings.json | 23 ++++++++ 10 files changed, 174 insertions(+), 82 deletions(-) delete mode 100644 ai/select-algorithm-dotnet/.env.example create mode 100644 ai/select-algorithm-dotnet/src/Models/Configuration.cs create mode 100644 ai/select-algorithm-dotnet/src/appsettings.json diff --git a/ai/select-algorithm-dotnet/.env.example b/ai/select-algorithm-dotnet/.env.example deleted file mode 100644 index e21ac60..0000000 --- a/ai/select-algorithm-dotnet/.env.example +++ /dev/null @@ -1,19 +0,0 @@ -# Azure OpenAI Embedding Settings -AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small -AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 -AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com - -# Data File Paths and Vector Configuration -DATA_FILE_WITH_VECTORS=../../data/Hotels_Vector.json -EMBEDDED_FIELD=contentVector -EMBEDDING_DIMENSIONS=1536 -LOAD_SIZE_BATCH=100 - -# MongoDB/DocumentDB Connection Settings -MONGO_CLUSTER_NAME= - -# Algorithm Selection -# ALGORITHM: all | diskann | hnsw | ivf -ALGORITHM=all -# SIMILARITY: all | COS | L2 | IP -SIMILARITY=COS diff --git a/ai/select-algorithm-dotnet/README.md b/ai/select-algorithm-dotnet/README.md index 78b12e7..78def8e 100644 --- a/ai/select-algorithm-dotnet/README.md +++ b/ai/select-algorithm-dotnet/README.md @@ -17,28 +17,50 @@ Demonstrates three vector index algorithms available in Azure DocumentDB (vCore) ## Setup -1. 
Copy the environment file and fill in your values: +1. Clone the repository: ```bash - cp .env.example .env + git clone https://github.com/documentdb-samples + cd ai/select-algorithm-dotnet ``` -2. Edit `.env` with your configuration: +2. Log in to Azure: - ```env - AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small - AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com - MONGO_CLUSTER_NAME= - AZURE_DOCUMENTDB_DATABASENAME=Hotels - ALGORITHM=all - SIMILARITY=COS + ```bash + az login + ``` + +3. Configure environment variables: + + The .NET sample reads configuration from `appsettings.json` and environment variables. After deploying with `azd up`, you can view your provisioned resource values: + + ```bash + azd env get-values + ``` + + Use these values to update `appsettings.json` or set them as environment variables. + +4. Update `appsettings.json` with your Azure service details: + + ```json + { + "AzureOpenAI": { + "Endpoint": "https://your-openai-service-name.openai.azure.com/", + "EmbeddingModel": "text-embedding-3-small" + }, + "DocumentDB": { + "ClusterName": "your-documentdb-cluster-name", + "DatabaseName": "Hotels" + } + } ``` -3. Restore packages: +5. 
Restore packages and run: ```bash cd src dotnet restore + dotnet run ``` ## Usage @@ -50,21 +72,26 @@ cd src dotnet run ``` -Run a specific algorithm: +Run a specific algorithm or similarity metric using environment variable overrides: ```bash -# Set in .env: ALGORITHM=ivf | hnsw | diskann | all -dotnet run +ALGORITHM=ivf dotnet run +ALGORITHM=hnsw VectorSearch__Similarity=L2 dotnet run +ALGORITHM=diskann dotnet run ``` +Valid values: +- `ALGORITHM`: `all` (default) | `ivf` | `hnsw` | `diskann` +- `VectorSearch__Similarity`: `COS` (default) | `L2` | `IP` + ## Project Structure ``` select-algorithm-dotnet/ -├── .env.example # Environment variable template ├── README.md # This file └── src/ ├── SelectAlgorithm.csproj # Project file + ├── appsettings.json # Configuration file ├── Program.cs # Entry point - dispatches by ALGORITHM env ├── Utils.cs # Shared helpers (connection, embedding, search) ├── IvfDemo.cs # IVF index creation and search diff --git a/ai/select-algorithm-dotnet/src/DiskannDemo.cs b/ai/select-algorithm-dotnet/src/DiskannDemo.cs index a52b1bb..a3e866b 100644 --- a/ai/select-algorithm-dotnet/src/DiskannDemo.cs +++ b/ai/select-algorithm-dotnet/src/DiskannDemo.cs @@ -43,23 +43,23 @@ public static void CreateDiskannIndex(IMongoCollection collection, Console.WriteLine("DiskANN vector index created successfully"); } - public static void Run() + public static void Run(Models.AppConfiguration config) { Console.WriteLine(new string('=', 60)); Console.WriteLine(" DiskANN Vector Index - Select Algorithm Demo"); Console.WriteLine(" Best for: 50,000+ documents"); Console.WriteLine(new string('=', 60)); - var databaseName = Environment.GetEnvironmentVariable("AZURE_DOCUMENTDB_DATABASENAME") ?? "Hotels"; - var dataFile = Environment.GetEnvironmentVariable("DATA_FILE_WITH_VECTORS") ?? "../../data/Hotels_Vector.json"; - var vectorField = Environment.GetEnvironmentVariable("EMBEDDED_FIELD") ?? "DescriptionVector"; - var model = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_MODEL") ?? 
"text-embedding-3-small"; - var dimensions = int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? "1536"); - var batchSize = int.Parse(Environment.GetEnvironmentVariable("LOAD_SIZE_BATCH") ?? "100"); - var similarity = Environment.GetEnvironmentVariable("SIMILARITY") ?? "COS"; + var databaseName = config.DocumentDB.DatabaseName; + var dataFile = config.DataFiles.WithVectors; + var vectorField = config.Embedding.EmbeddedField; + var model = config.AzureOpenAI.EmbeddingModel; + var dimensions = config.Embedding.Dimensions; + var batchSize = config.DocumentDB.LoadBatchSize; + var similarity = config.VectorSearch.Similarity; - var mongoClient = Utils.GetMongoClientPasswordless(); - var embeddingClient = Utils.GetEmbeddingClient(); + var mongoClient = Utils.GetMongoClientPasswordless(config); + var embeddingClient = Utils.GetEmbeddingClient(config); try { diff --git a/ai/select-algorithm-dotnet/src/HnswDemo.cs b/ai/select-algorithm-dotnet/src/HnswDemo.cs index acbeb81..20d48f0 100644 --- a/ai/select-algorithm-dotnet/src/HnswDemo.cs +++ b/ai/select-algorithm-dotnet/src/HnswDemo.cs @@ -43,23 +43,23 @@ public static void CreateHnswIndex(IMongoCollection collection, st Console.WriteLine("HNSW vector index created successfully"); } - public static void Run() + public static void Run(Models.AppConfiguration config) { Console.WriteLine(new string('=', 60)); Console.WriteLine(" HNSW Vector Index - Select Algorithm Demo"); Console.WriteLine(" Best for: 10,000 - 50,000 documents"); Console.WriteLine(new string('=', 60)); - var databaseName = Environment.GetEnvironmentVariable("AZURE_DOCUMENTDB_DATABASENAME") ?? "Hotels"; - var dataFile = Environment.GetEnvironmentVariable("DATA_FILE_WITH_VECTORS") ?? "../../data/Hotels_Vector.json"; - var vectorField = Environment.GetEnvironmentVariable("EMBEDDED_FIELD") ?? "DescriptionVector"; - var model = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_MODEL") ?? 
"text-embedding-3-small"; - var dimensions = int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? "1536"); - var batchSize = int.Parse(Environment.GetEnvironmentVariable("LOAD_SIZE_BATCH") ?? "100"); - var similarity = Environment.GetEnvironmentVariable("SIMILARITY") ?? "COS"; + var databaseName = config.DocumentDB.DatabaseName; + var dataFile = config.DataFiles.WithVectors; + var vectorField = config.Embedding.EmbeddedField; + var model = config.AzureOpenAI.EmbeddingModel; + var dimensions = config.Embedding.Dimensions; + var batchSize = config.DocumentDB.LoadBatchSize; + var similarity = config.VectorSearch.Similarity; - var mongoClient = Utils.GetMongoClientPasswordless(); - var embeddingClient = Utils.GetEmbeddingClient(); + var mongoClient = Utils.GetMongoClientPasswordless(config); + var embeddingClient = Utils.GetEmbeddingClient(config); try { diff --git a/ai/select-algorithm-dotnet/src/IvfDemo.cs b/ai/select-algorithm-dotnet/src/IvfDemo.cs index 01a1b74..5d9f6d5 100644 --- a/ai/select-algorithm-dotnet/src/IvfDemo.cs +++ b/ai/select-algorithm-dotnet/src/IvfDemo.cs @@ -42,23 +42,23 @@ public static void CreateIvfIndex(IMongoCollection collection, str Console.WriteLine("IVF vector index created successfully"); } - public static void Run() + public static void Run(Models.AppConfiguration config) { Console.WriteLine(new string('=', 60)); Console.WriteLine(" IVF Vector Index - Select Algorithm Demo"); Console.WriteLine(" Best for: < 10,000 documents"); Console.WriteLine(new string('=', 60)); - var databaseName = Environment.GetEnvironmentVariable("AZURE_DOCUMENTDB_DATABASENAME") ?? "Hotels"; - var dataFile = Environment.GetEnvironmentVariable("DATA_FILE_WITH_VECTORS") ?? "../../data/Hotels_Vector.json"; - var vectorField = Environment.GetEnvironmentVariable("EMBEDDED_FIELD") ?? "DescriptionVector"; - var model = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_MODEL") ?? 
"text-embedding-3-small"; - var dimensions = int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? "1536"); - var batchSize = int.Parse(Environment.GetEnvironmentVariable("LOAD_SIZE_BATCH") ?? "100"); - var similarity = Environment.GetEnvironmentVariable("SIMILARITY") ?? "COS"; + var databaseName = config.DocumentDB.DatabaseName; + var dataFile = config.DataFiles.WithVectors; + var vectorField = config.Embedding.EmbeddedField; + var model = config.AzureOpenAI.EmbeddingModel; + var dimensions = config.Embedding.Dimensions; + var batchSize = config.DocumentDB.LoadBatchSize; + var similarity = config.VectorSearch.Similarity; - var mongoClient = Utils.GetMongoClientPasswordless(); - var embeddingClient = Utils.GetEmbeddingClient(); + var mongoClient = Utils.GetMongoClientPasswordless(config); + var embeddingClient = Utils.GetEmbeddingClient(config); try { diff --git a/ai/select-algorithm-dotnet/src/Models/Configuration.cs b/ai/select-algorithm-dotnet/src/Models/Configuration.cs new file mode 100644 index 0000000..0c0600f --- /dev/null +++ b/ai/select-algorithm-dotnet/src/Models/Configuration.cs @@ -0,0 +1,41 @@ +namespace SelectAlgorithm.Models; + +public class AppConfiguration +{ + public AzureOpenAIConfiguration AzureOpenAI { get; set; } = new(); + public DocumentDBConfiguration DocumentDB { get; set; } = new(); + public EmbeddingConfiguration Embedding { get; set; } = new(); + public VectorSearchConfiguration VectorSearch { get; set; } = new(); + public DataFilesConfiguration DataFiles { get; set; } = new(); +} + +public class AzureOpenAIConfiguration +{ + public string Endpoint { get; set; } = string.Empty; + public string EmbeddingModel { get; set; } = "text-embedding-3-small"; +} + +public class DocumentDBConfiguration +{ + public string ClusterName { get; set; } = string.Empty; + public string DatabaseName { get; set; } = "Hotels"; + public int LoadBatchSize { get; set; } = 100; +} + +public class EmbeddingConfiguration +{ + public string 
EmbeddedField { get; set; } = "DescriptionVector"; + public int Dimensions { get; set; } = 1536; +} + +public class VectorSearchConfiguration +{ + public string Query { get; set; } = "quintessential lodging near running trails, eateries, retail"; + public string Similarity { get; set; } = "COS"; + public int TopK { get; set; } = 5; +} + +public class DataFilesConfiguration +{ + public string WithVectors { get; set; } = "../../data/Hotels_Vector.json"; +} diff --git a/ai/select-algorithm-dotnet/src/Program.cs b/ai/select-algorithm-dotnet/src/Program.cs index 96fe4d3..f40896f 100644 --- a/ai/select-algorithm-dotnet/src/Program.cs +++ b/ai/select-algorithm-dotnet/src/Program.cs @@ -1,4 +1,5 @@ -using DotNetEnv; +using Microsoft.Extensions.Configuration; +using SelectAlgorithm.Models; namespace SelectAlgorithm; @@ -6,9 +7,16 @@ class Program { static void Main(string[] args) { - // Load .env file from parent directory - Env.Load("../.env"); + var configuration = new ConfigurationBuilder() + .SetBasePath(Directory.GetCurrentDirectory()) + .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true) + .AddEnvironmentVariables() + .Build(); + var appConfig = new AppConfiguration(); + configuration.Bind(appConfig); + + // ALGORITHM env var override for selecting which demo to run var algorithm = (Environment.GetEnvironmentVariable("ALGORITHM") ?? 
"all").ToLowerInvariant(); Console.WriteLine(); @@ -20,18 +28,18 @@ static void Main(string[] args) switch (algorithm) { case "ivf": - IvfDemo.Run(); + IvfDemo.Run(appConfig); break; case "hnsw": - HnswDemo.Run(); + HnswDemo.Run(appConfig); break; case "diskann": - DiskannDemo.Run(); + DiskannDemo.Run(appConfig); break; case "all": - IvfDemo.Run(); - HnswDemo.Run(); - DiskannDemo.Run(); + IvfDemo.Run(appConfig); + HnswDemo.Run(appConfig); + DiskannDemo.Run(appConfig); break; default: Console.WriteLine($"Unknown algorithm: {algorithm}"); @@ -43,3 +51,4 @@ static void Main(string[] args) Console.WriteLine("Done!"); } } + diff --git a/ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj b/ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj index 033f6c4..331e522 100644 --- a/ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj +++ b/ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj @@ -10,6 +10,14 @@ - + + + + + + + + PreserveNewest + diff --git a/ai/select-algorithm-dotnet/src/Utils.cs b/ai/select-algorithm-dotnet/src/Utils.cs index 0d6381d..30b9d5e 100644 --- a/ai/select-algorithm-dotnet/src/Utils.cs +++ b/ai/select-algorithm-dotnet/src/Utils.cs @@ -4,15 +4,17 @@ using Azure.Identity; using Azure.AI.OpenAI; using OpenAI.Embeddings; +using SelectAlgorithm.Models; namespace SelectAlgorithm; public static class Utils { - public static IMongoClient GetMongoClientPasswordless() + public static IMongoClient GetMongoClientPasswordless(AppConfiguration config) { - var clusterName = Environment.GetEnvironmentVariable("MONGO_CLUSTER_NAME") - ?? 
throw new InvalidOperationException("MONGO_CLUSTER_NAME environment variable is required"); + var clusterName = config.DocumentDB.ClusterName; + if (string.IsNullOrEmpty(clusterName)) + throw new InvalidOperationException("DocumentDB:ClusterName is required in appsettings.json"); var credential = new DefaultAzureCredential(); @@ -27,12 +29,13 @@ public static IMongoClient GetMongoClientPasswordless() return new MongoClient(settings); } - public static EmbeddingClient GetEmbeddingClient() + public static EmbeddingClient GetEmbeddingClient(AppConfiguration config) { - var endpoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_ENDPOINT") - ?? throw new InvalidOperationException("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required"); - var model = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_MODEL") - ?? "text-embedding-3-small"; + var endpoint = config.AzureOpenAI.Endpoint; + if (string.IsNullOrEmpty(endpoint)) + throw new InvalidOperationException("AzureOpenAI:Endpoint is required in appsettings.json"); + + var model = config.AzureOpenAI.EmbeddingModel; var credential = new DefaultAzureCredential(); var azureClient = new AzureOpenAIClient(new Uri(endpoint), credential); diff --git a/ai/select-algorithm-dotnet/src/appsettings.json b/ai/select-algorithm-dotnet/src/appsettings.json new file mode 100644 index 0000000..fc68d44 --- /dev/null +++ b/ai/select-algorithm-dotnet/src/appsettings.json @@ -0,0 +1,23 @@ +{ + "AzureOpenAI": { + "Endpoint": "https://.openai.azure.com/", + "EmbeddingModel": "text-embedding-3-small" + }, + "DocumentDB": { + "ClusterName": "", + "DatabaseName": "Hotels", + "LoadBatchSize": 100 + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536 + }, + "VectorSearch": { + "Query": "quintessential lodging near running trails, eateries, retail", + "Similarity": "COS", + "TopK": 5 + }, + "DataFiles": { + "WithVectors": "../../data/Hotels_Vector.json" + } +} From 
f9d5f10cc5beebc5376e1980c9cda71338cffe02 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 29 Apr 2026 13:40:49 -0700 Subject: [PATCH 4/9] docs: add azd env get-values config section to Article 2 READMEs All non-.NET Article 2 READMEs now show azd env get-values > .env as the primary config method after azd up, with manual cp .env.example as fallback. Matches Article 1 README pattern. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-go/README.md | 14 +++++++++++--- ai/select-algorithm-java/README.md | 14 ++++++++++++-- ai/select-algorithm-python/README.md | 15 ++++++++++++--- ai/select-algorithm-typescript/README.md | 10 +++++++++- 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/ai/select-algorithm-go/README.md b/ai/select-algorithm-go/README.md index cec698a..baa4065 100644 --- a/ai/select-algorithm-go/README.md +++ b/ai/select-algorithm-go/README.md @@ -18,13 +18,21 @@ This sample demonstrates how to use different vector search algorithms (IVF, HNS cd ai/select-algorithm-go ``` -2. **Configure environment variables** by copying the example file: +2. **Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: ```bash - cp .env.example .env + azd env get-values > .env ``` - Edit `.env` with your Azure resource values. + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` 3. **Install dependencies**: diff --git a/ai/select-algorithm-java/README.md b/ai/select-algorithm-java/README.md index 72ba7cc..fee137d 100644 --- a/ai/select-algorithm-java/README.md +++ b/ai/select-algorithm-java/README.md @@ -12,13 +12,23 @@ This sample demonstrates how to create and use different vector search index alg ## Setup -1. 
Copy the environment file and fill in your values: +1. **Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: ```bash cp .env.example .env ``` -2. Update `.env` with your Azure resource details: +2. Update `.env` with your Azure resource details (if not using `azd`): - `MONGO_CLUSTER_NAME` — your DocumentDB cluster name - `AZURE_OPENAI_EMBEDDING_ENDPOINT` — your Azure OpenAI endpoint - `AZURE_OPENAI_EMBEDDING_MODEL` — deployment name (e.g., `text-embedding-3-small`) diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md index 7e65211..6057aa0 100644 --- a/ai/select-algorithm-python/README.md +++ b/ai/select-algorithm-python/README.md @@ -32,12 +32,21 @@ Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB. Each ## Setup -1. Copy environment configuration: +1. **Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + ```bash - cp .env.example .env + azd env get-values > .env ``` -2. Update `.env` with your resource values. + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` 3. Install dependencies: ```bash diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md index 208e43d..df6b45d 100644 --- a/ai/select-algorithm-typescript/README.md +++ b/ai/select-algorithm-typescript/README.md @@ -25,7 +25,15 @@ Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB using 3. 
**Configure environment variables:** - Copy `.env.example` to `.env` and fill in your values: + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: ```bash cp .env.example .env From 2cde68acb344ee8811c815940ea6c79947ee0d01 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 29 Apr 2026 14:52:44 -0700 Subject: [PATCH 5/9] feat: add compare-all runner for all 5 languages Runs all 9 combinations (3 algorithms x 3 metrics) in a single execution with formatted comparison output. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/README.md | 40 ++- ai/select-algorithm-dotnet/src/CompareAll.cs | 265 ++++++++++++++ ai/select-algorithm-dotnet/src/Program.cs | 5 +- ai/select-algorithm-go/README.md | 27 +- ai/select-algorithm-go/go.mod | 25 ++ ai/select-algorithm-go/go.sum | 81 +++++ ai/select-algorithm-go/src/compare_all.go | 338 ++++++++++++++++++ ai/select-algorithm-go/src/main.go | 7 +- ai/select-algorithm-java/README.md | 47 ++- ai/select-algorithm-java/pom.xml | 18 + .../selectalgorithm/CompareAll.java | 231 ++++++++++++ .../documentdb/selectalgorithm/Main.java | 3 +- ai/select-algorithm-python/README.md | 19 + ai/select-algorithm-python/requirements.txt | 3 + ai/select-algorithm-python/src/compare_all.py | 234 ++++++++++++ ai/select-algorithm-typescript/README.md | 18 + ai/select-algorithm-typescript/package.json | 3 +- .../src/compare-all.ts | 205 +++++++++++ 18 files changed, 1562 insertions(+), 7 deletions(-) create mode 100644 ai/select-algorithm-dotnet/src/CompareAll.cs create mode 100644 ai/select-algorithm-go/go.sum create mode 100644 ai/select-algorithm-go/src/compare_all.go create mode 100644 
ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java create mode 100644 ai/select-algorithm-python/src/compare_all.py create mode 100644 ai/select-algorithm-typescript/src/compare-all.ts diff --git a/ai/select-algorithm-dotnet/README.md b/ai/select-algorithm-dotnet/README.md index 78b12e7..ba26f52 100644 --- a/ai/select-algorithm-dotnet/README.md +++ b/ai/select-algorithm-dotnet/README.md @@ -57,6 +57,43 @@ Run a specific algorithm: dotnet run ``` +## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation with a formatted comparison table: + +```bash +# Set in .env: ALGORITHM=compare +dotnet run +``` + +This mode: +- Uses a **single collection** (`hotels`) with 9 vector indexes +- Generates **one embedding** for the query, reused across all searches +- Runs searches **sequentially** with `Stopwatch` timing for fair comparison +- Prints a formatted table with latency, top result, and scores + +**Additional environment variables for compare mode:** + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `3` | Number of results per search | +| `VERBOSE` | `false` | Show detailed per-result output | + +**9 Index Combinations:** + +| Index Name | Algorithm | Metric | Parameters | +|------------|-----------|--------|------------| +| `vector_ivf_cos` | IVF | COS | numLists=1 | +| `vector_hnsw_cos` | HNSW | COS | m=16, efConstruction=64 | +| `vector_diskann_cos` | DiskANN | COS | maxDegree=32, lBuild=50 | +| `vector_ivf_l2` | IVF | L2 | numLists=1 | +| `vector_hnsw_l2` | HNSW | L2 | m=16, efConstruction=64 | +| `vector_diskann_l2` | DiskANN | L2 | maxDegree=32, lBuild=50 | +| `vector_ivf_ip` | IVF | IP | numLists=1 | +| `vector_hnsw_ip` | HNSW | IP | m=16, efConstruction=64 | +| `vector_diskann_ip` | DiskANN | IP | maxDegree=32, lBuild=50 | + ## Project Structure ``` @@ -69,7 
+106,8 @@ select-algorithm-dotnet/ ├── Utils.cs # Shared helpers (connection, embedding, search) ├── IvfDemo.cs # IVF index creation and search ├── HnswDemo.cs # HNSW index creation and search - └── DiskannDemo.cs # DiskANN index creation and search + ├── DiskannDemo.cs # DiskANN index creation and search + └── CompareAll.cs # Unified 9-combination comparison runner ``` ## How It Works diff --git a/ai/select-algorithm-dotnet/src/CompareAll.cs b/ai/select-algorithm-dotnet/src/CompareAll.cs new file mode 100644 index 0000000..d575d3e --- /dev/null +++ b/ai/select-algorithm-dotnet/src/CompareAll.cs @@ -0,0 +1,265 @@ +/// Unified comparison runner for all 9 combinations (3 algorithms × 3 similarity metrics). +/// Executes vector searches sequentially for fair timing and prints a formatted comparison table. + +namespace SelectAlgorithm; + +using System.Diagnostics; +using MongoDB.Driver; +using MongoDB.Bson; +using OpenAI.Embeddings; + +public static class CompareAll +{ + private record IndexConfig(string Name, string Kind, string Similarity, BsonDocument ExtraParams); + + private record SearchResult(string IndexName, string Algorithm, string Metric, long LatencyMs, List Results); + + public static void Run() + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine(" Compare All Algorithms × Metrics"); + Console.WriteLine(" 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP"); + Console.WriteLine(new string('=', 60)); + + var databaseName = Environment.GetEnvironmentVariable("AZURE_DOCUMENTDB_DATABASENAME") ?? "Hotels"; + var dataFile = Environment.GetEnvironmentVariable("DATA_FILE_WITH_VECTORS") ?? "../../data/Hotels_Vector.json"; + var vectorField = Environment.GetEnvironmentVariable("EMBEDDED_FIELD") ?? "DescriptionVector"; + var dimensions = int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? "1536"); + var batchSize = int.Parse(Environment.GetEnvironmentVariable("LOAD_SIZE_BATCH") ?? 
"100"); + var queryText = Environment.GetEnvironmentVariable("QUERY_TEXT") ?? "luxury hotel near the beach"; + var topK = int.Parse(Environment.GetEnvironmentVariable("TOP_K") ?? "3"); + var verbose = (Environment.GetEnvironmentVariable("VERBOSE") ?? "false").Equals("true", StringComparison.OrdinalIgnoreCase); + + var mongoClient = Utils.GetMongoClientPasswordless(); + var embeddingClient = Utils.GetEmbeddingClient(); + + try + { + var database = mongoClient.GetDatabase(databaseName); + var collection = database.GetCollection("hotels"); + + // Load data once into single collection + var data = Utils.ReadJsonFile(dataFile); + var documents = data.Where(d => d.Contains(vectorField)).ToList(); + Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); + Utils.InsertData(collection, documents, batchSize); + + // Generate ONE embedding for the query (reused for all 9 searches) + Console.WriteLine($"\nQuery: \"{queryText}\""); + Console.WriteLine($"Top K: {topK}"); + var embeddingResult = embeddingClient.GenerateEmbedding(queryText); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + Console.WriteLine("Embedding generated (reused for all searches)\n"); + + // Define 9 index configurations + var configs = BuildIndexConfigs(dimensions); + + // Create all 9 indexes (idempotent) + Console.WriteLine("Creating 9 vector indexes..."); + foreach (var config in configs) + { + CreateIndex(collection, vectorField, config); + } + Console.WriteLine("Waiting for indexes to build..."); + Thread.Sleep(5000); + + // Run searches sequentially for fair timing + Console.WriteLine("\nRunning searches...\n"); + var results = new List(); + foreach (var config in configs) + { + var sw = Stopwatch.StartNew(); + var searchResults = RunVectorSearch(collection, queryVector, vectorField, config.Name, topK); + sw.Stop(); + + results.Add(new SearchResult(config.Name, config.Kind, config.Similarity, sw.ElapsedMilliseconds, searchResults)); + + if (verbose) + { + 
Console.WriteLine($" {config.Name}: {sw.ElapsedMilliseconds}ms ({searchResults.Count} results)"); + } + } + + // Print comparison table + PrintComparisonTable(results, verbose); + } + finally + { + mongoClient.Cluster.Dispose(); + } + } + + private static List BuildIndexConfigs(int dimensions) + { + string[] metrics = ["COS", "L2", "IP"]; + var configs = new List(); + + foreach (var metric in metrics) + { + configs.Add(new IndexConfig( + $"vector_ivf_{metric.ToLower()}", + "vector-ivf", + metric, + new BsonDocument { { "numLists", 1 } } + )); + + configs.Add(new IndexConfig( + $"vector_hnsw_{metric.ToLower()}", + "vector-hnsw", + metric, + new BsonDocument { { "m", 16 }, { "efConstruction", 64 } } + )); + + configs.Add(new IndexConfig( + $"vector_diskann_{metric.ToLower()}", + "vector-diskann", + metric, + new BsonDocument { { "maxDegree", 32 }, { "lBuild", 50 } } + )); + } + + return configs; + } + + private static void CreateIndex(IMongoCollection collection, string vectorField, IndexConfig config) + { + // Drop existing index with same name if present + try + { + collection.Indexes.DropOne(config.Name); + } + catch (MongoCommandException) + { + // Index doesn't exist, that's fine + } + + var cosmosSearchOptions = new BsonDocument + { + { "kind", config.Kind }, + { "dimensions", int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? 
"1536") }, + { "similarity", config.Similarity } + }; + + foreach (var param in config.ExtraParams) + { + cosmosSearchOptions.Add(param); + } + + var command = new BsonDocument + { + { "createIndexes", collection.CollectionNamespace.CollectionName }, + { "indexes", new BsonArray + { + new BsonDocument + { + { "name", config.Name }, + { "key", new BsonDocument(vectorField, "cosmosSearch") }, + { "cosmosSearchOptions", cosmosSearchOptions } + } + } + } + }; + + try + { + collection.Database.RunCommand(command); + } + catch (MongoCommandException ex) when (ex.Message.Contains("already exists")) + { + // Index already exists with same config — idempotent + } + } + + private static List RunVectorSearch( + IMongoCollection collection, + float[] queryVector, + string vectorField, + string indexName, + int topK) + { + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + })), + new BsonDocument("$project", new BsonDocument + { + { "HotelName", 1 }, + { "score", new BsonDocument("$meta", "searchScore") } + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + private static void PrintComparisonTable(List results, bool verbose) + { + Console.WriteLine(); + Console.WriteLine(new string('=', 78)); + Console.WriteLine(" COMPARISON RESULTS"); + Console.WriteLine(new string('=', 78)); + Console.WriteLine(); + + // Header + var header = "Index Name".PadRight(24) + + "Algorithm".PadRight(14) + + "Metric".PadRight(8) + + "Latency".PadRight(10) + + "Top Result".PadRight(22); + Console.WriteLine(header); + Console.WriteLine(new string('-', 78)); + + foreach (var result in results) + { + var topResult = "—"; + var topScore = ""; + if (result.Results.Count > 0) + { + var doc = result.Results[0]; + topResult = doc.Contains("HotelName") ? 
doc["HotelName"].AsString : "Unknown"; + if (topResult.Length > 18) topResult = topResult[..18] + "..."; + var score = doc.Contains("score") ? doc["score"].ToDouble() : 0.0; + topScore = $" ({score:F3})"; + } + + var algoDisplay = result.Algorithm.Replace("vector-", "").ToUpper(); + var row = result.IndexName.PadRight(24) + + algoDisplay.PadRight(14) + + result.Metric.PadRight(8) + + $"{result.LatencyMs}ms".PadRight(10) + + $"{topResult}{topScore}"; + Console.WriteLine(row); + } + + Console.WriteLine(new string('-', 78)); + Console.WriteLine(); + + // Summary stats + var fastest = results.MinBy(r => r.LatencyMs)!; + var slowest = results.MaxBy(r => r.LatencyMs)!; + Console.WriteLine($" Fastest: {fastest.IndexName} ({fastest.LatencyMs}ms)"); + Console.WriteLine($" Slowest: {slowest.IndexName} ({slowest.LatencyMs}ms)"); + Console.WriteLine(); + + if (verbose) + { + Console.WriteLine(" DETAILED RESULTS:"); + Console.WriteLine(); + foreach (var result in results) + { + Console.WriteLine($" [{result.IndexName}]"); + for (var i = 0; i < result.Results.Count; i++) + { + var doc = result.Results[i]; + var name = doc.Contains("HotelName") ? doc["HotelName"].AsString : "Unknown"; + var score = doc.Contains("score") ? doc["score"].ToDouble() : 0.0; + Console.WriteLine($" {i + 1}. 
{name} (score: {score:F4})"); + } + Console.WriteLine(); + } + } + } +} diff --git a/ai/select-algorithm-dotnet/src/Program.cs b/ai/select-algorithm-dotnet/src/Program.cs index 96fe4d3..34a1cc3 100644 --- a/ai/select-algorithm-dotnet/src/Program.cs +++ b/ai/select-algorithm-dotnet/src/Program.cs @@ -28,6 +28,9 @@ static void Main(string[] args) case "diskann": DiskannDemo.Run(); break; + case "compare": + CompareAll.Run(); + break; case "all": IvfDemo.Run(); HnswDemo.Run(); @@ -35,7 +38,7 @@ static void Main(string[] args) break; default: Console.WriteLine($"Unknown algorithm: {algorithm}"); - Console.WriteLine("Valid options: ivf, hnsw, diskann, all"); + Console.WriteLine("Valid options: ivf, hnsw, diskann, compare, all"); Environment.Exit(1); break; } diff --git a/ai/select-algorithm-go/README.md b/ai/select-algorithm-go/README.md index cec698a..9832123 100644 --- a/ai/select-algorithm-go/README.md +++ b/ai/select-algorithm-go/README.md @@ -72,6 +72,30 @@ ALGORITHM=diskann go run . $env:ALGORITHM="ivf"; go run . ``` +## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and print a formatted comparison table: + +```bash +ALGORITHM=compare-all go run . +``` + +### Environment variables for compare-all + +| Variable | Default | Description | +|--------------|----------------------------------|---------------------------------| +| `QUERY_TEXT` | `luxury hotel near the beach` | Text to generate the query embedding | +| `TOP_K` | `3` | Number of results per search | +| `VERBOSE` | `false` | Show per-index result details | + +On Windows (PowerShell): + +```powershell +$env:ALGORITHM="compare-all"; $env:VERBOSE="true"; go run . +``` + +The comparison uses a **single `hotels` collection** with 9 named indexes (`vector_ivf_cos`, `vector_hnsw_l2`, `vector_diskann_ip`, etc.), generates one embedding for the query text, and runs each search sequentially for fair timing. 
+ ## Algorithm comparison | Algorithm | Kind | Key Parameters | Best For | @@ -92,7 +116,8 @@ select-algorithm-go/ ├── utils.go # Shared config, auth, data, and search helpers ├── ivf.go # IVF index creation and search workflow ├── hnsw.go # HNSW index creation and search workflow - └── diskann.go # DiskANN index creation and search workflow + ├── diskann.go # DiskANN index creation and search workflow + └── compare_all.go # Unified 9-combination comparison runner ``` ## Authentication diff --git a/ai/select-algorithm-go/go.mod b/ai/select-algorithm-go/go.mod index c25f589..53e0b34 100644 --- a/ai/select-algorithm-go/go.mod +++ b/ai/select-algorithm-go/go.mod @@ -9,3 +9,28 @@ require ( github.com/openai/openai-go/v3 v3.12.0 go.mongodb.org/mongo-driver v1.17.6 ) + +require ( + github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect + github.com/golang-jwt/jwt/v5 v5.3.0 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect + github.com/montanaflynn/stats v0.7.1 // indirect + github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect + github.com/tidwall/gjson v1.18.0 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.1 // indirect + github.com/tidwall/sjson v1.2.5 // indirect + github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.1.2 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect +) diff --git a/ai/select-algorithm-go/go.sum b/ai/select-algorithm-go/go.sum new file mode 100644 
index 0000000..7795605 --- /dev/null +++ b/ai/select-algorithm-go/go.sum @@ -0,0 +1,81 @@ +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= +github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod 
h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= +github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= +github.com/openai/openai-go/v3 v3.12.0 h1:NkrImaglFQeDycc/n/fEmpFV8kKr8snl9/8X2x4eHOg= +github.com/openai/openai-go/v3 v3.12.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod 
h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.mongodb.org/mongo-driver v1.17.6 h1:87JUG1wZfWsr6rIz3ZmpH90rL5tea7O3IHuSwHUpsss= +go.mongodb.org/mongo-driver v1.17.6/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go new file mode 100644 index 0000000..6dc9edc --- /dev/null +++ b/ai/select-algorithm-go/src/compare_all.go @@ -0,0 
+1,342 @@ +package main + +import ( + "context" + "fmt" + "os" + "strconv" + "strings" + "text/tabwriter" + "time" + + "github.com/openai/openai-go/v3" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" +) + +// CompareResult holds the result of a single algorithm+metric search +type CompareResult struct { + Algorithm string + Metric string + IndexName string + Latency time.Duration + Results []SearchResult + TopScore float64 + Error error +} + +// indexSpec defines one of the 9 combinations +type indexSpec struct { + Algorithm string + Kind string + Metric string + IndexName string + VectorField string // document field holding the embedding; used as the index key path + Options bson.D +} + +// RunCompareAll executes all 9 algorithm×metric combinations on a single collection +func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { + queryText := getEnvOrDefault("QUERY_TEXT", "luxury hotel near the beach") + topK, _ := strconv.Atoi(getEnvOrDefault("TOP_K", "3")) + verbose := strings.ToLower(getEnvOrDefault("VERBOSE", "false")) == "true" + + fmt.Println("\n" + strings.Repeat("=", 70)) + fmt.Println(" COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations)") + fmt.Println(strings.Repeat("=", 70)) + fmt.Printf("Query: %q\n", queryText) + fmt.Printf("Top-K: %d\n", topK) + fmt.Printf("Verbose: %v\n", verbose) + + // 1. 
Get collection and load data ONCE + collection := dbClient.Database(config.DatabaseName).Collection("hotels") + + fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + return fmt.Errorf("failed to load data: %v", err) + } + + documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) + if len(documentsWithEmbeddings) == 0 { + return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField) + } + fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) + + stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + return err + } + fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) + + // 2. Generate ONE embedding for the query (reused for all 9 searches) + fmt.Printf("\nGenerating embedding for query: %q\n", queryText) + queryEmbedding, err := GenerateEmbedding(ctx, aiClient, queryText, config.ModelName) + if err != nil { + return fmt.Errorf("failed to generate query embedding: %v", err) + } + fmt.Printf("Embedding generated (%d dimensions)\n", len(queryEmbedding)) + + // 3. Define all 9 index specs + metrics := []string{"COS", "L2", "IP"} + specs := buildIndexSpecs(config.VectorField, config.Dimensions, metrics) + + // 4. Create all 9 indexes (idempotent) + fmt.Printf("\nCreating %d vector indexes...\n", len(specs)) + for _, spec := range specs { + if err := createNamedVectorIndex(ctx, collection, spec); err != nil { + fmt.Printf(" ⚠ %s: %v\n", spec.IndexName, err) + } else { + fmt.Printf(" ✓ %s created\n", spec.IndexName) + } + } + + // Allow indexes to become ready + fmt.Println("\nWaiting for indexes to be ready...") + time.Sleep(3 * time.Second) + + // 5. 
Run searches SEQUENTIALLY and collect results + fmt.Println("\nRunning vector searches...") + var results []CompareResult + + for _, spec := range specs { + start := time.Now() + searchResults, searchErr := vectorSearchWithIndex(ctx, collection, queryEmbedding, config.VectorField, spec.IndexName, topK) + latency := time.Since(start) + + cr := CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + IndexName: spec.IndexName, + Latency: latency, + Results: searchResults, + Error: searchErr, + } + if len(searchResults) > 0 { + cr.TopScore = searchResults[0].Score + } + results = append(results, cr) + + status := "✓" + if searchErr != nil { + status = "✗" + } + fmt.Printf(" %s %s (%v)\n", status, spec.IndexName, latency.Round(time.Millisecond)) + } + + // 6. Print comparison table + fmt.Println() + printComparisonTable(results, verbose) + + return nil +} + +// buildIndexSpecs creates the 9 index specifications; every spec indexes vectorField +func buildIndexSpecs(vectorField string, dimensions int, metrics []string) []indexSpec { + var specs []indexSpec + + for _, metric := range metrics { + metricLower := strings.ToLower(metric) + + // IVF + specs = append(specs, indexSpec{ + Algorithm: "IVF", + Kind: "vector-ivf", + Metric: metric, + IndexName: fmt.Sprintf("vector_ivf_%s", metricLower), + VectorField: vectorField, + Options: bson.D{ + {"kind", "vector-ivf"}, + {"dimensions", dimensions}, + {"similarity", metric}, + {"numLists", 1}, + }, + }) + + // HNSW + specs = append(specs, indexSpec{ + Algorithm: "HNSW", + Kind: "vector-hnsw", + Metric: metric, + IndexName: fmt.Sprintf("vector_hnsw_%s", metricLower), + VectorField: vectorField, + Options: bson.D{ + {"kind", "vector-hnsw"}, + {"dimensions", dimensions}, + {"similarity", metric}, + {"m", 16}, + {"efConstruction", 64}, + }, + }) + + // DiskANN + specs = append(specs, indexSpec{ + Algorithm: "DiskANN", + Kind: "vector-diskann", + Metric: metric, + IndexName: fmt.Sprintf("vector_diskann_%s", metricLower), + VectorField: 
vectorField, + Options: bson.D{ + {"kind", "vector-diskann"}, + {"dimensions", dimensions}, + 
{"similarity", metric}, + {"maxDegree", 32}, + {"lBuild", 50}, + }, + }) + } + + return specs +} + +// createNamedVectorIndex creates a single named vector index on spec.VectorField (idempotent) +func createNamedVectorIndex(ctx context.Context, collection *mongo.Collection, spec indexSpec) error { + indexCommand := bson.D{ + {"createIndexes", collection.Name()}, + {"indexes", []bson.D{ + { + {"name", spec.IndexName}, + {"key", bson.D{ + {spec.VectorField, "cosmosSearch"}, + }}, + {"cosmosSearchOptions", spec.Options}, + }, + }}, + } + + var result bson.M + err := collection.Database().RunCommand(ctx, indexCommand).Decode(&result) + if err != nil { + // Treat "index already exists" as success (idempotent) + if strings.Contains(err.Error(), "already exists") || strings.Contains(err.Error(), "IndexAlreadyExists") { + return nil + } + return err + } + return nil +} + +// vectorSearchWithIndex performs a vector search targeting a specific named index +func vectorSearchWithIndex(ctx context.Context, collection *mongo.Collection, embedding []float64, vectorField, indexName string, topK int) ([]SearchResult, error) { + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": embedding, + "path": vectorField, + "k": topK, + }, + "cosmosSearchOptions": bson.M{ + "indexName": indexName, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, err + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, err + } + + return results, nil +} + +// printComparisonTable outputs a formatted table of results +func printComparisonTable(results []CompareResult, verbose bool) { + 
fmt.Println(strings.Repeat("=", 70)) + 
fmt.Println(" COMPARISON RESULTS") + fmt.Println(strings.Repeat("=", 70)) + + w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', tabwriter.AlignRight) + fmt.Fprintf(w, "ALGORITHM\tMETRIC\tLATENCY\tTOP SCORE\tRESULTS\tSTATUS\t\n") + fmt.Fprintf(w, "---------\t------\t-------\t---------\t-------\t------\t\n") + + for _, r := range results { + status := "OK" + scoreStr := fmt.Sprintf("%.4f", r.TopScore) + resultCount := fmt.Sprintf("%d", len(r.Results)) + + if r.Error != nil { + status = "ERROR" + scoreStr = "-" + resultCount = "-" + } + + fmt.Fprintf(w, "%s\t%s\t%v\t%s\t%s\t%s\t\n", + r.Algorithm, + r.Metric, + r.Latency.Round(time.Millisecond), + scoreStr, + resultCount, + status, + ) + } + w.Flush() + + // Print verbose details if requested + if verbose { + fmt.Println() + for _, r := range results { + if r.Error != nil { + fmt.Printf("\n[%s] Error: %v\n", r.IndexName, r.Error) + continue + } + if len(r.Results) > 0 { + fmt.Printf("\n[%s] Top results:\n", r.IndexName) + for i, res := range r.Results { + doc, _ := res.Document.(bson.D) // comma-ok: a value that is not bson.D yields an empty name instead of a panic + var hotelName string + for _, elem := range doc { + if elem.Key == "HotelName" { + hotelName = fmt.Sprintf("%v", elem.Value) + break + } + } + fmt.Printf(" %d. 
%s (score: %.4f)\n", i+1, hotelName, res.Score) + } + } + } + } + + // Summary + fmt.Println() + var fastest CompareResult + for _, r := range results { + if r.Error == nil && (fastest.Latency == 0 || r.Latency < fastest.Latency) { + fastest = r + } + } + if fastest.Latency > 0 { + fmt.Printf("⚡ Fastest: %s/%s (%v)\n", fastest.Algorithm, fastest.Metric, fastest.Latency.Round(time.Millisecond)) + } + + var highestScore CompareResult + for _, r := range results { + if r.Error == nil && r.TopScore > highestScore.TopScore { + highestScore = r + } + } + if highestScore.TopScore > 0 { + fmt.Printf("🎯 Highest score: %s/%s (%.4f)\n", highestScore.Algorithm, highestScore.Metric, highestScore.TopScore) + } +} diff --git a/ai/select-algorithm-go/src/main.go b/ai/select-algorithm-go/src/main.go index 0f10b77..8508846 100644 --- a/ai/select-algorithm-go/src/main.go +++ b/ai/select-algorithm-go/src/main.go @@ -60,8 +60,13 @@ func main() { log.Printf("DiskANN failed: %v", err) } + case "compare-all": + if err := RunCompareAll(ctx, config, mongoClient, aiClient); err != nil { + log.Fatalf("Compare-all failed: %v", err) + } + default: - log.Fatalf("Unknown algorithm: '%s'. Use 'all', 'ivf', 'hnsw', or 'diskann'", config.Algorithm) + log.Fatalf("Unknown algorithm: '%s'. Use 'all', 'ivf', 'hnsw', 'diskann', or 'compare-all'", config.Algorithm) } fmt.Println("\nDone!") diff --git a/ai/select-algorithm-java/README.md b/ai/select-algorithm-java/README.md index 72ba7cc..3c19570 100644 --- a/ai/select-algorithm-java/README.md +++ b/ai/select-algorithm-java/README.md @@ -78,6 +78,50 @@ This sample uses **passwordless authentication** via `DefaultAzureCredential`: Ensure your identity has the appropriate RBAC roles assigned on both resources. 
+## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and print a formatted comparison table: + +```bash +mvn exec:java -Pcompare +``` + +Or via the `ALGORITHM` environment variable: + +```bash +ALGORITHM=compare mvn exec:java +``` + +On Windows (PowerShell): + +```powershell +$env:ALGORITHM="compare"; mvn exec:java +``` + +### Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `3` | Number of results per search | +| `VERBOSE` | `false` | Print detailed per-index results | + +### What It Does + +1. Connects to DocumentDB and loads hotel data into a single `hotels` collection +2. Generates one embedding for the query text (reused for all searches) +3. Creates 9 vector indexes: `vector_{algo}_{metric}` (e.g., `vector_hnsw_cos`) +4. Runs vector search against each index sequentially with timing +5. Prints a comparison table with latency, result count, and top match + +### Index Parameters + +| Algorithm | Kind | Parameters | +|-----------|------|------------| +| IVF | `vector-ivf` | numLists=1 | +| HNSW | `vector-hnsw` | m=16, efConstruction=64 | +| DiskANN | `vector-diskann` | maxDegree=32, lBuild=50 | + ## Project Structure ``` @@ -86,5 +130,6 @@ src/main/java/com/azure/documentdb/selectalgorithm/ ├── Utils.java — Shared helpers (connection, embedding, data loading) ├── IvfDemo.java — IVF index creation and vector search ├── HnswDemo.java — HNSW index creation and vector search -└── DiskannDemo.java — DiskANN index creation and vector search +├── DiskannDemo.java — DiskANN index creation and vector search +└── CompareAll.java — Unified comparison runner (all 9 combinations) ``` diff --git a/ai/select-algorithm-java/pom.xml b/ai/select-algorithm-java/pom.xml index a91ea98..2414631 100644 --- a/ai/select-algorithm-java/pom.xml +++ b/ai/select-algorithm-java/pom.xml @@ -62,4 +62,22 @@ + + 
+ + compare + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.CompareAll + + + + + + diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java new file mode 100644 index 0000000..edd24a2 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -0,0 +1,231 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.ArrayList; +import java.util.List; + +/** + * Unified comparison runner that executes all 9 combinations + * (3 algorithms x 3 similarity metrics) and prints a formatted table. + */ +public class CompareAll { + + private static final String COLLECTION_NAME = "hotels"; + private static final String[] ALGORITHMS = {"ivf", "hnsw", "diskann"}; + private static final String[] METRICS = {"COS", "L2", "IP"}; + + public static void main(String[] args) { + run(); + } + + public static void run() { + String queryText = Utils.getEnv("QUERY_TEXT", "luxury hotel near the beach"); + int topK = Integer.parseInt(Utils.getEnv("TOP_K", "3")); + boolean verbose = Boolean.parseBoolean(Utils.getEnv("VERBOSE", "false")); + + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "contentVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + + System.out.println("=============================================="); + System.out.println(" Azure 
DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.printf(" Query: \"%s\"%n", queryText); + System.out.printf(" Top K: %d%n", topK); + System.out.printf(" Metrics: COS, L2, IP%n"); + System.out.printf(" Algos: IVF, HNSW, DiskANN%n"); + System.out.println(); + + List results = new ArrayList<>(); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + // Load data ONCE into the single collection + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + collection.drop(); + System.out.println(" Collection reset."); + Utils.insertData(collection, data, 100); + + // Generate ONE embedding for the query (reused for all 9 searches) + OpenAIClient aiClient = Utils.getOpenAIClient(); + System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); + List queryVector = Utils.getEmbedding(aiClient, queryText, model); + System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); + + // Convert to doubles for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + // Create all 9 indexes idempotently + System.out.println(" Creating 9 vector indexes..."); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + createIndex(collection, vectorField, dimensions, algo, metric); + } + } + System.out.println(" All indexes created.\n"); + + // Run searches sequentially for fair timing + System.out.println(" Running searches..."); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + long startNs = System.nanoTime(); + List searchResults = performSearch( + collection, vectorAsDoubles, 
vectorField, topK); + long elapsedNs = System.nanoTime() - startNs; + double elapsedMs = elapsedNs / 1_000_000.0; + + // Extract top result info + String topHotel = "-"; + double topScore = 0.0; + if (!searchResults.isEmpty()) { + Document top = searchResults.get(0); + topHotel = top.getString("HotelName") != null + ? top.getString("HotelName") : "-"; + topScore = top.getDouble("score") != null + ? top.getDouble("score") : 0.0; + } + + results.add(new SearchResult( + algo.toUpperCase(), metric, indexName, + elapsedMs, searchResults.size(), topHotel, topScore)); + + if (verbose) { + System.out.printf(" [%s] %d results in %.2f ms%n", + indexName, searchResults.size(), elapsedMs); + for (int i = 0; i < searchResults.size(); i++) { + Document doc = searchResults.get(i); + System.out.printf(" %d. %s (%.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + } + } + } + } + } + + // Print comparison table + printComparisonTable(results, topK); + } + + private static void createIndex(MongoCollection collection, + String vectorField, int dimensions, + String algo, String metric) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + Document cosmosSearchOptions = new Document() + .append("dimensions", dimensions) + .append("similarity", metric); + + switch (algo) { + case "ivf" -> cosmosSearchOptions + .append("kind", "vector-ivf") + .append("numLists", 1); + case "hnsw" -> cosmosSearchOptions + .append("kind", "vector-hnsw") + .append("m", 16) + .append("efConstruction", 64); + case "diskann" -> cosmosSearchOptions + .append("kind", "vector-diskann") + .append("maxDegree", 32) + .append("lBuild", 50); + } + + Document indexDefinition = new Document() + .append("name", indexName) + .append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", cosmosSearchOptions); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", 
List.of(indexDefinition)); + + try { + collection.getDatabase().runCommand(command); + } catch (Exception e) { + // Idempotent: ignore if index already exists + if (!e.getMessage().contains("already exists")) { + throw e; + } + } + } + + private static List performSearch(MongoCollection collection, + List vectorAsDoubles, + String vectorField, int topK) { + Document searchStage = new Document("$search", new Document("cosmosSearch", new Document() + .append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + return results; + } + + private static void printComparisonTable(List results, int topK) { + System.out.println(); + System.out.println(" ╔══════════════════════════════════════════════════════════════════════════════════╗"); + System.out.println(" ║ COMPARISON TABLE — All Algorithms × Metrics ║"); + System.out.println(" ╠══════════════════════════════════════════════════════════════════════════════════╣"); + System.out.printf(" ║ %-10s %-8s %-22s %10s %8s %-18s ║%n", + "ALGO", "METRIC", "INDEX NAME", "LATENCY", "RESULTS", "TOP MATCH"); + System.out.println(" ╠══════════════════════════════════════════════════════════════════════════════════╣"); + + for (SearchResult r : results) { + String topMatch = r.topHotel.length() > 16 + ? r.topHotel.substring(0, 16) + ".." 
+ : r.topHotel; + System.out.printf(" ║ %-10s %-8s %-22s %8.2f ms %5d %-18s ║%n", + r.algorithm, r.metric, r.indexName, + r.latencyMs, r.resultCount, topMatch); + } + + System.out.println(" ╠══════════════════════════════════════════════════════════════════════════════════╣"); + + // Summary stats + double fastest = results.stream().mapToDouble(r -> r.latencyMs).min().orElse(0); + double slowest = results.stream().mapToDouble(r -> r.latencyMs).max().orElse(0); + double avg = results.stream().mapToDouble(r -> r.latencyMs).average().orElse(0); + String fastestIdx = results.stream() + .filter(r -> r.latencyMs == fastest) + .findFirst().map(r -> r.indexName).orElse("-"); + + System.out.printf(" ║ Fastest: %-22s (%8.2f ms) ║%n", fastestIdx, fastest); + System.out.printf(" ║ Slowest: %8.2f ms | Average: %8.2f ms | Top K: %-3d ║%n", slowest, avg, topK); + System.out.println(" ╚══════════════════════════════════════════════════════════════════════════════════╝"); + System.out.println(); + } + + private record SearchResult( + String algorithm, + String metric, + String indexName, + double latencyMs, + int resultCount, + String topHotel, + double topScore) { + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java index 18fe5b9..982b698 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java @@ -15,6 +15,7 @@ public static void main(String[] args) { case "ivf" -> IvfDemo.run(); case "hnsw" -> HnswDemo.run(); case "diskann" -> DiskannDemo.run(); + case "compare" -> CompareAll.run(); case "all" -> { IvfDemo.run(); HnswDemo.run(); @@ -22,7 +23,7 @@ public static void main(String[] args) { } default -> { System.err.println("Unknown algorithm: " + algorithm); - System.err.println("Valid options: ivf, hnsw, 
diskann, all"); + System.err.println("Valid options: ivf, hnsw, diskann, compare, all"); System.exit(1); } } diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md index 7e65211..c3c11ab 100644 --- a/ai/select-algorithm-python/README.md +++ b/ai/select-algorithm-python/README.md @@ -61,6 +61,25 @@ python hnsw.py python diskann.py ``` +## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation: + +```bash +cd src +python compare_all.py +``` + +This creates a single `hotels` collection with 9 vector indexes and runs each search sequentially for fair timing comparison. Output is a formatted table showing latency, scores, and top results for every combination. + +**Environment variables:** + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | "luxury hotel near the beach" | Search query text | +| `TOP_K` | 3 | Number of results per search | +| `VERBOSE` | false | Print individual results per combo | + ## Configuration Edit `.env` to configure: diff --git a/ai/select-algorithm-python/requirements.txt b/ai/select-algorithm-python/requirements.txt index c0a35e0..20dbd9c 100644 --- a/ai/select-algorithm-python/requirements.txt +++ b/ai/select-algorithm-python/requirements.txt @@ -9,3 +9,6 @@ azure-identity>=1.15.0 # Environment variable management from .env files python-dotenv>=1.0.0 + +# Formatted table output for compare_all.py +tabulate>=0.9.0 diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py new file mode 100644 index 0000000..0703e77 --- /dev/null +++ b/ai/select-algorithm-python/src/compare_all.py @@ -0,0 +1,234 @@ +""" +Compare All Algorithms — Unified comparison runner. + +Executes all 9 combinations (3 algorithms × 3 similarity metrics) in a single +invocation and prints a formatted comparison table. 
+ +Algorithms: IVF, HNSW, DiskANN +Metrics: COS, L2, IP +""" +import os +import time +from typing import Dict, List, Any, Tuple + +from tabulate import tabulate +from utils import ( + get_clients_passwordless, get_config, read_file_return_json, + insert_data +) + +# Index definitions: (algo_label, kind, extra_params) +ALGORITHMS = [ + ("IVF", "vector-ivf", {"numLists": 1}), + ("HNSW", "vector-hnsw", {"m": 16, "efConstruction": 64}), + ("DiskANN", "vector-diskann", {"maxDegree": 32, "lBuild": 50}), +] + +METRICS = ["COS", "L2", "IP"] + + +def get_compare_config() -> Dict[str, Any]: + """Load comparison-specific configuration from environment variables.""" + config = get_config() + config["query_text"] = os.getenv("QUERY_TEXT", "luxury hotel near the beach") + config["top_k"] = int(os.getenv("TOP_K", "3")) + config["verbose"] = os.getenv("VERBOSE", "false").lower() in ("true", "1", "yes") + return config + + +def index_name(algo: str, metric: str) -> str: + """Generate canonical index name: vector_{algo}_{metric}.""" + return f"vector_{algo.lower()}_{metric.lower()}" + + +def get_existing_index_names(collection) -> List[str]: + """Return names of existing indexes on the collection.""" + return [idx["name"] for idx in collection.list_indexes()] + + +def create_vector_index(collection, name: str, kind: str, vector_field: str, + dimensions: int, similarity: str, + extra_params: Dict[str, Any]) -> None: + """Create a single vector index if it does not already exist.""" + existing = get_existing_index_names(collection) + if name in existing: + return + + cosmos_options = { + "kind": kind, + "dimensions": dimensions, + "similarity": similarity, + **extra_params, + } + + index_command = { + "createIndexes": collection.name, + "indexes": [ + { + "name": name, + "key": {vector_field: "cosmosSearch"}, + "cosmosSearchOptions": cosmos_options, + } + ], + } + collection.database.command(index_command) + + +def create_all_indexes(collection, vector_field: str, dimensions: int, + 
verbose: bool = False) -> None: + """Create all 9 vector indexes idempotently.""" + existing = get_existing_index_names(collection) + created = 0 + + for algo_label, kind, extra_params in ALGORITHMS: + for metric in METRICS: + name = index_name(algo_label, metric) + if name in existing: + if verbose: + print(f" Index '{name}' already exists, skipping") + continue + create_vector_index( + collection, name, kind, vector_field, dimensions, metric, extra_params + ) + created += 1 + if verbose: + print(f" Created index '{name}'") + + if created > 0: + print(f"Created {created} new index(es). Waiting for indexes to build...") + time.sleep(5) + else: + print("All 9 indexes already exist.") + + +def generate_embedding(azure_openai_client, query_text: str, + model_name: str) -> List[float]: + """Generate a single embedding for the query text.""" + response = azure_openai_client.embeddings.create( + input=[query_text], + model=model_name + ) + return response.data[0].embedding + + +def vector_search_with_index(collection, query_embedding: List[float], + vector_field: str, idx_name: str, + top_k: int) -> Tuple[List[Dict[str, Any]], float]: + """Run vector search against a specific index and return results + latency.""" + pipeline = [ + { + "$search": { + "cosmosSearch": { + "vector": query_embedding, + "path": vector_field, + "k": top_k + }, + "cosmosSearchOptions": { + "indexName": idx_name + } + } + }, + { + "$project": { + "document": "$$ROOT", + "score": {"$meta": "searchScore"} + } + } + ] + + start = time.perf_counter() + results = list(collection.aggregate(pipeline)) + elapsed_ms = (time.perf_counter() - start) * 1000 + + return results, elapsed_ms + + +def format_top_result(results: List[Dict[str, Any]]) -> str: + """Extract top result name for display.""" + if not results: + return "(no results)" + doc = results[0].get("document", results[0]) + return doc.get("HotelName", doc.get("name", "Unknown")) + + +def main(): + print("=" * 70) + print(" Compare All Algorithms 
— 9 Combinations") + print(" (3 Algorithms × 3 Similarity Metrics)") + print("=" * 70) + + config = get_compare_config() + query_text = config["query_text"] + top_k = config["top_k"] + verbose = config["verbose"] + + print(f"\n Query: \"{query_text}\"") + print(f" Top K: {top_k}") + print(f" Verbose: {verbose}\n") + + mongo_client, azure_openai_client = get_clients_passwordless() + + try: + database = mongo_client[config["database_name"]] + collection = database["hotels"] + + # Load data once + data = read_file_return_json(config["data_file"]) + documents = [doc for doc in data if config["vector_field"] in doc] + print(f"Loaded {len(documents)} documents with embeddings") + insert_data(collection, documents, config["batch_size"]) + + # Create all 9 indexes idempotently + print("\nEnsuring all 9 vector indexes exist...") + create_all_indexes( + collection, config["vector_field"], config["dimensions"], verbose + ) + + # Generate ONE embedding for the query + print(f"\nGenerating embedding for query...") + query_embedding = generate_embedding( + azure_openai_client, query_text, config["model_name"] + ) + + # Run all 9 searches sequentially + print("Running 9 vector searches...\n") + table_rows = [] + + for algo_label, _, _ in ALGORITHMS: + for metric in METRICS: + idx = index_name(algo_label, metric) + results, latency_ms = vector_search_with_index( + collection, query_embedding, config["vector_field"], idx, top_k + ) + + top_score = results[0].get("score", 0) if results else 0 + top_name = format_top_result(results) + + table_rows.append([ + algo_label, + metric, + idx, + f"{latency_ms:.1f} ms", + len(results), + f"{top_score:.4f}", + top_name, + ]) + + if verbose: + for i, r in enumerate(results, 1): + doc = r.get("document", r) + name = doc.get("HotelName", doc.get("name", "Unknown")) + score = r.get("score", 0) + print(f" {idx} #{i}: {name} (score: {score:.4f})") + + # Print comparison table + headers = ["Algorithm", "Metric", "Index Name", "Latency", + "Results", 
"Top Score", "Top Result"] + print(tabulate(table_rows, headers=headers, tablefmt="grid")) + + finally: + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md index 208e43d..40dcc7f 100644 --- a/ai/select-algorithm-typescript/README.md +++ b/ai/select-algorithm-typescript/README.md @@ -65,6 +65,24 @@ npm run start:hnsw npm run start:diskann ``` +## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and view a formatted comparison table: + +```bash +npm run start:compare-all +``` + +**Environment variables** (optional overrides): + +| Variable | Default | Description | +|---|---|---| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `3` | Number of results per combination | +| `VERBOSE` | `false` | When `true`, shows all k results per combo | + +The script creates a single `hotels` collection, loads data once, creates 9 vector indexes (one per algorithm/metric pair), and runs searches sequentially for fair timing comparison. 
+ ## Algorithm comparison | Algorithm | Index type | Best for | diff --git a/ai/select-algorithm-typescript/package.json b/ai/select-algorithm-typescript/package.json index bac0876..dcadb2f 100644 --- a/ai/select-algorithm-typescript/package.json +++ b/ai/select-algorithm-typescript/package.json @@ -7,7 +7,8 @@ "build": "tsc", "start:ivf": "node --env-file .env dist/ivf.js", "start:hnsw": "node --env-file .env dist/hnsw.js", - "start:diskann": "node --env-file .env dist/diskann.js" + "start:diskann": "node --env-file .env dist/diskann.js", + "start:compare-all": "node --env-file .env dist/compare-all.js" }, "dependencies": { "@azure/identity": "^4.11.1", diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts new file mode 100644 index 0000000..2d63984 --- /dev/null +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -0,0 +1,205 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, getConfig, insertData } from './utils.js'; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +interface AlgorithmConfig { + name: string; + kind: string; + options: Record; +} + +interface SearchResult { + algorithm: string; + similarity: string; + latencyMs: number; + topScore: number; + topResult: string; + results: Array<{ name: string; score: number }>; +} + +const ALGORITHMS: AlgorithmConfig[] = [ + { name: 'IVF', kind: 'vector-ivf', options: { numLists: 1 } }, + { name: 'HNSW', kind: 'vector-hnsw', options: { m: 16, efConstruction: 64 } }, + { name: 'DiskANN', kind: 'vector-diskann', options: { maxDegree: 32, lBuild: 50 } }, +]; + +const SIMILARITIES = ['COS', 'L2', 'IP']; + +async function main() { + const baseConfig = getConfig(); + const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; + const topK = parseInt(process.env.TOP_K || '3', 10); + 
const verbose = process.env.VERBOSE === 'true'; + const collectionName = 'hotels'; + + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) throw new Error('AI client is not configured.'); + if (!dbClient) throw new Error('Database client is not configured.'); + + await dbClient.connect(); + const db = dbClient.db(baseConfig.dbName); + + // Create collection and load data once + let collection; + const collections = await db.listCollections({ name: collectionName }).toArray(); + if (collections.length === 0) { + collection = await db.createCollection(collectionName); + console.log(`Created collection: ${collectionName}`); + const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); + const insertSummary = await insertData(baseConfig, collection, data); + console.log(`Inserted ${insertSummary.inserted}/${insertSummary.total} documents`); + } else { + collection = db.collection(collectionName); + console.log(`Collection "${collectionName}" already exists, skipping data load`); + } + + // Check existing indexes to avoid duplicates + const existingIndexes = await collection.listIndexes().toArray(); + const existingIndexNames = new Set(existingIndexes.map(idx => idx.name)); + + // Create all 9 indexes + console.log('\nCreating vector indexes...'); + for (const algo of ALGORITHMS) { + for (const sim of SIMILARITIES) { + const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + if (existingIndexNames.has(indexName)) { + console.log(` ✓ ${indexName} (already exists)`); + continue; + } + const indexOptions = { + createIndexes: collectionName, + indexes: [{ + name: indexName, + key: { [baseConfig.embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: { + kind: algo.kind, + ...algo.options, + similarity: sim, + dimensions: baseConfig.embeddingDimensions + } + }] + }; + await db.command(indexOptions); + console.log(` ✓ ${indexName} (created)`); + } + } + + // Generate one embedding for 
the query + console.log(`\nQuery: "${queryText}"`); + const embeddingResponse = await aiClient.embeddings.create({ + model: baseConfig.deployment, + input: [queryText] + }); + const queryVector = embeddingResponse.data[0].embedding; + console.log(`Embedding generated (${queryVector.length} dimensions)`); + + // Run all 9 searches sequentially + console.log(`\nRunning searches (top ${topK} results)...\n`); + const results: SearchResult[] = []; + + for (const algo of ALGORITHMS) { + for (const sim of SIMILARITIES) { + const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + + const start = performance.now(); + const searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: baseConfig.embeddedField, + k: topK + }, + cosmosSearchOptions: { + indexName: indexName + } + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]).toArray(); + const latencyMs = performance.now() - start; + + const topDoc = searchResults[0] as any; + results.push({ + algorithm: algo.name, + similarity: sim, + latencyMs, + topScore: topDoc?.score ?? 0, + topResult: topDoc?.document?.HotelName ?? '(none)', + results: searchResults.map((r: any) => ({ + name: r.document?.HotelName ?? '(none)', + score: r.score ?? 0 + })) + }); + } + } + + // Print comparison table + printComparisonTable(results, verbose); + + } catch (error) { + console.error('Compare-all failed:', error); + process.exitCode = 1; + } finally { + if (dbClient) await dbClient.close(); + console.log('\nDatabase connection closed'); + } +} + +function printComparisonTable(results: SearchResult[], verbose: boolean) { + const algoWidth = 10; + const simWidth = 10; + const latWidth = 8; + const scoreWidth = 10; + const nameWidth = 30; + + const pad = (s: string, w: number) => s.length >= w ? 
s.slice(0, w) : s + ' '.repeat(w - s.length); + + const topLine = `╔${'═'.repeat(algoWidth)}╤${'═'.repeat(simWidth)}╤${'═'.repeat(latWidth)}╤${'═'.repeat(scoreWidth)}╤${'═'.repeat(nameWidth)}╗`; + const headerSep = `╠${'═'.repeat(algoWidth)}╪${'═'.repeat(simWidth)}╪${'═'.repeat(latWidth)}╪${'═'.repeat(scoreWidth)}╪${'═'.repeat(nameWidth)}╣`; + const rowSep = `╟${'─'.repeat(algoWidth)}┼${'─'.repeat(simWidth)}┼${'─'.repeat(latWidth)}┼${'─'.repeat(scoreWidth)}┼${'─'.repeat(nameWidth)}╢`; + const bottomLine = `╚${'═'.repeat(algoWidth)}╧${'═'.repeat(simWidth)}╧${'═'.repeat(latWidth)}╧${'═'.repeat(scoreWidth)}╧${'═'.repeat(nameWidth)}╝`; + + console.log(topLine); + console.log(`║${pad(' Algorithm', algoWidth)}│${pad(' Similarity', simWidth)}│${pad(' Latency', latWidth)}│${pad(' Top Score', scoreWidth)}│${pad(' Top Result', nameWidth)}║`); + console.log(headerSep); + + results.forEach((r, i) => { + const latStr = `${Math.round(r.latencyMs)}ms`; + const scoreStr = r.topScore.toFixed(4); + console.log( + `║${pad(` ${r.algorithm}`, algoWidth)}│${pad(` ${r.similarity}`, simWidth)}│${pad(` ${latStr}`, latWidth)}│${pad(` ${scoreStr}`, scoreWidth)}│${pad(` ${r.topResult}`, nameWidth)}║` + ); + + if (verbose && r.results.length > 1) { + for (let j = 1; j < r.results.length; j++) { + const sub = r.results[j]; + console.log( + `║${pad('', algoWidth)}│${pad('', simWidth)}│${pad('', latWidth)}│${pad(` ${sub.score.toFixed(4)}`, scoreWidth)}│${pad(` ${sub.name}`, nameWidth)}║` + ); + } + } + + if (i < results.length - 1) { + console.log(rowSep); + } + }); + + console.log(bottomLine); +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); From 4d421ad2b6f0555415df353ab7c248db8e23f1fa Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 29 Apr 2026 15:12:22 -0700 Subject: [PATCH 6/9] refactor: make compare-all self-contained with create/cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit - All 5 runners now: drop collection → create fresh → upload data → create indexes → run comparisons → drop collection on exit - Removed 15 individual algorithm files (ivf/hnsw/diskann per language) - Updated entry points (main.go, Main.java, Program.cs) to only run compare-all - Simplified package.json scripts (TypeScript) - All languages use DefaultAzureCredential for auth Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/src/CompareAll.cs | 16 +++ ai/select-algorithm-dotnet/src/DiskannDemo.cs | 88 -------------- ai/select-algorithm-dotnet/src/HnswDemo.cs | 88 -------------- ai/select-algorithm-dotnet/src/IvfDemo.cs | 87 -------------- ai/select-algorithm-dotnet/src/Program.cs | 40 +------ ai/select-algorithm-go/src/compare_all.go | 22 +++- ai/select-algorithm-go/src/diskann.go | 112 ------------------ ai/select-algorithm-go/src/hnsw.go | 112 ------------------ ai/select-algorithm-go/src/ivf.go | 110 ----------------- ai/select-algorithm-go/src/main.go | 44 +------ .../selectalgorithm/CompareAll.java | 5 + .../selectalgorithm/DiskannDemo.java | 77 ------------ .../documentdb/selectalgorithm/HnswDemo.java | 77 ------------ .../documentdb/selectalgorithm/IvfDemo.java | 76 ------------ .../documentdb/selectalgorithm/Main.java | 24 +--- ai/select-algorithm-python/src/compare_all.py | 15 ++- ai/select-algorithm-python/src/diskann.py | 90 -------------- ai/select-algorithm-python/src/hnsw.py | 90 -------------- ai/select-algorithm-python/src/ivf.py | 88 -------------- ai/select-algorithm-typescript/package.json | 5 +- .../src/compare-all.ts | 44 +++---- ai/select-algorithm-typescript/src/diskann.ts | 101 ---------------- ai/select-algorithm-typescript/src/hnsw.ts | 101 ---------------- ai/select-algorithm-typescript/src/ivf.ts | 100 ---------------- 24 files changed, 85 insertions(+), 1527 deletions(-) delete mode 100644 ai/select-algorithm-dotnet/src/DiskannDemo.cs delete mode 100644 
ai/select-algorithm-dotnet/src/HnswDemo.cs delete mode 100644 ai/select-algorithm-dotnet/src/IvfDemo.cs delete mode 100644 ai/select-algorithm-go/src/diskann.go delete mode 100644 ai/select-algorithm-go/src/hnsw.go delete mode 100644 ai/select-algorithm-go/src/ivf.go delete mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskannDemo.java delete mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HnswDemo.java delete mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IvfDemo.java delete mode 100644 ai/select-algorithm-python/src/diskann.py delete mode 100644 ai/select-algorithm-python/src/hnsw.py delete mode 100644 ai/select-algorithm-python/src/ivf.py delete mode 100644 ai/select-algorithm-typescript/src/diskann.ts delete mode 100644 ai/select-algorithm-typescript/src/hnsw.ts delete mode 100644 ai/select-algorithm-typescript/src/ivf.ts diff --git a/ai/select-algorithm-dotnet/src/CompareAll.cs b/ai/select-algorithm-dotnet/src/CompareAll.cs index d575d3e..a29704c 100644 --- a/ai/select-algorithm-dotnet/src/CompareAll.cs +++ b/ai/select-algorithm-dotnet/src/CompareAll.cs @@ -36,6 +36,11 @@ public static void Run() try { var database = mongoClient.GetDatabase(databaseName); + + // Drop collection for a clean comparison + database.DropCollection("hotels"); + Console.WriteLine("Dropped existing 'hotels' collection (if any)"); + var collection = database.GetCollection("hotels"); // Load data once into single collection @@ -85,6 +90,17 @@ public static void Run() } finally { + // Cleanup: drop the comparison collection + try + { + var database = mongoClient.GetDatabase(databaseName); + database.DropCollection("hotels"); + Console.WriteLine("\nCleanup: dropped collection 'hotels'"); + } + catch (Exception ex) + { + Console.WriteLine($"Cleanup warning: {ex.Message}"); + } mongoClient.Cluster.Dispose(); } } diff --git a/ai/select-algorithm-dotnet/src/DiskannDemo.cs 
b/ai/select-algorithm-dotnet/src/DiskannDemo.cs deleted file mode 100644 index a3e866b..0000000 --- a/ai/select-algorithm-dotnet/src/DiskannDemo.cs +++ /dev/null @@ -1,88 +0,0 @@ -/// DiskANN vector index for Azure DocumentDB. -/// Best for: Datasets with 50,000+ documents. -/// Cluster tier: M30 or higher. -/// Key parameters: maxDegree (graph edges), lBuild (construction quality). - -namespace SelectAlgorithm; - -using MongoDB.Driver; -using MongoDB.Bson; - -public static class DiskannDemo -{ - public static void CreateDiskannIndex(IMongoCollection collection, string vectorField, int dimensions, string similarity, int maxDegree = 20, int lBuild = 10) - { - Console.WriteLine($"Creating DiskANN vector index on field '{vectorField}'..."); - - Utils.DropVectorIndexes(collection, vectorField); - - var command = new BsonDocument - { - { "createIndexes", collection.CollectionNamespace.CollectionName }, - { "indexes", new BsonArray - { - new BsonDocument - { - { "name", $"diskann_index_{vectorField}" }, - { "key", new BsonDocument(vectorField, "cosmosSearch") }, - { "cosmosSearchOptions", new BsonDocument - { - { "kind", "vector-diskann" }, - { "dimensions", dimensions }, - { "similarity", similarity }, - { "maxDegree", maxDegree }, - { "lBuild", lBuild } - } - } - } - } - } - }; - - collection.Database.RunCommand(command); - Console.WriteLine("DiskANN vector index created successfully"); - } - - public static void Run(Models.AppConfiguration config) - { - Console.WriteLine(new string('=', 60)); - Console.WriteLine(" DiskANN Vector Index - Select Algorithm Demo"); - Console.WriteLine(" Best for: 50,000+ documents"); - Console.WriteLine(new string('=', 60)); - - var databaseName = config.DocumentDB.DatabaseName; - var dataFile = config.DataFiles.WithVectors; - var vectorField = config.Embedding.EmbeddedField; - var model = config.AzureOpenAI.EmbeddingModel; - var dimensions = config.Embedding.Dimensions; - var batchSize = config.DocumentDB.LoadBatchSize; - var similarity 
= config.VectorSearch.Similarity; - - var mongoClient = Utils.GetMongoClientPasswordless(config); - var embeddingClient = Utils.GetEmbeddingClient(config); - - try - { - var database = mongoClient.GetDatabase(databaseName); - var collection = database.GetCollection("hotels_diskann"); - - var data = Utils.ReadJsonFile(dataFile); - var documents = data.Where(d => d.Contains(vectorField)).ToList(); - Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); - - Utils.InsertData(collection, documents, batchSize); - - CreateDiskannIndex(collection, vectorField, dimensions, similarity); - Console.WriteLine("Waiting for index to build..."); - Thread.Sleep(5000); - - var query = "quintessential lodging near running trails, eateries, retail"; - var results = Utils.PerformVectorSearch(collection, embeddingClient, query, vectorField, model); - Utils.PrintSearchResults(results, "DiskANN"); - } - finally - { - mongoClient.Cluster.Dispose(); - } - } -} diff --git a/ai/select-algorithm-dotnet/src/HnswDemo.cs b/ai/select-algorithm-dotnet/src/HnswDemo.cs deleted file mode 100644 index 20d48f0..0000000 --- a/ai/select-algorithm-dotnet/src/HnswDemo.cs +++ /dev/null @@ -1,88 +0,0 @@ -/// HNSW (Hierarchical Navigable Small World) vector index for Azure DocumentDB. -/// Best for: Datasets between 10,000 and 50,000 documents. -/// Cluster tier: M30 or higher. -/// Key parameters: m (graph connectivity), efConstruction (build quality). 
- -namespace SelectAlgorithm; - -using MongoDB.Driver; -using MongoDB.Bson; - -public static class HnswDemo -{ - public static void CreateHnswIndex(IMongoCollection collection, string vectorField, int dimensions, string similarity, int m = 16, int efConstruction = 64) - { - Console.WriteLine($"Creating HNSW vector index on field '{vectorField}'..."); - - Utils.DropVectorIndexes(collection, vectorField); - - var command = new BsonDocument - { - { "createIndexes", collection.CollectionNamespace.CollectionName }, - { "indexes", new BsonArray - { - new BsonDocument - { - { "name", $"hnsw_index_{vectorField}" }, - { "key", new BsonDocument(vectorField, "cosmosSearch") }, - { "cosmosSearchOptions", new BsonDocument - { - { "kind", "vector-hnsw" }, - { "dimensions", dimensions }, - { "similarity", similarity }, - { "m", m }, - { "efConstruction", efConstruction } - } - } - } - } - } - }; - - collection.Database.RunCommand(command); - Console.WriteLine("HNSW vector index created successfully"); - } - - public static void Run(Models.AppConfiguration config) - { - Console.WriteLine(new string('=', 60)); - Console.WriteLine(" HNSW Vector Index - Select Algorithm Demo"); - Console.WriteLine(" Best for: 10,000 - 50,000 documents"); - Console.WriteLine(new string('=', 60)); - - var databaseName = config.DocumentDB.DatabaseName; - var dataFile = config.DataFiles.WithVectors; - var vectorField = config.Embedding.EmbeddedField; - var model = config.AzureOpenAI.EmbeddingModel; - var dimensions = config.Embedding.Dimensions; - var batchSize = config.DocumentDB.LoadBatchSize; - var similarity = config.VectorSearch.Similarity; - - var mongoClient = Utils.GetMongoClientPasswordless(config); - var embeddingClient = Utils.GetEmbeddingClient(config); - - try - { - var database = mongoClient.GetDatabase(databaseName); - var collection = database.GetCollection("hotels_hnsw"); - - var data = Utils.ReadJsonFile(dataFile); - var documents = data.Where(d => d.Contains(vectorField)).ToList(); - 
Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); - - Utils.InsertData(collection, documents, batchSize); - - CreateHnswIndex(collection, vectorField, dimensions, similarity); - Console.WriteLine("Waiting for index to build..."); - Thread.Sleep(5000); - - var query = "quintessential lodging near running trails, eateries, retail"; - var results = Utils.PerformVectorSearch(collection, embeddingClient, query, vectorField, model); - Utils.PrintSearchResults(results, "HNSW"); - } - finally - { - mongoClient.Cluster.Dispose(); - } - } -} diff --git a/ai/select-algorithm-dotnet/src/IvfDemo.cs b/ai/select-algorithm-dotnet/src/IvfDemo.cs deleted file mode 100644 index 5d9f6d5..0000000 --- a/ai/select-algorithm-dotnet/src/IvfDemo.cs +++ /dev/null @@ -1,87 +0,0 @@ -/// IVF (Inverted File) vector index for Azure DocumentDB. -/// Best for: Datasets with fewer than 10,000 documents. -/// Cluster tier: M10 or higher. -/// Key parameters: numLists (cluster count). - -namespace SelectAlgorithm; - -using MongoDB.Driver; -using MongoDB.Bson; - -public static class IvfDemo -{ - public static void CreateIvfIndex(IMongoCollection collection, string vectorField, int dimensions, string similarity, int numLists = 10) - { - Console.WriteLine($"Creating IVF vector index on field '{vectorField}'..."); - - Utils.DropVectorIndexes(collection, vectorField); - - var command = new BsonDocument - { - { "createIndexes", collection.CollectionNamespace.CollectionName }, - { "indexes", new BsonArray - { - new BsonDocument - { - { "name", $"ivf_index_{vectorField}" }, - { "key", new BsonDocument(vectorField, "cosmosSearch") }, - { "cosmosSearchOptions", new BsonDocument - { - { "kind", "vector-ivf" }, - { "dimensions", dimensions }, - { "similarity", similarity }, - { "numLists", numLists } - } - } - } - } - } - }; - - collection.Database.RunCommand(command); - Console.WriteLine("IVF vector index created successfully"); - } - - public static void Run(Models.AppConfiguration 
config) - { - Console.WriteLine(new string('=', 60)); - Console.WriteLine(" IVF Vector Index - Select Algorithm Demo"); - Console.WriteLine(" Best for: < 10,000 documents"); - Console.WriteLine(new string('=', 60)); - - var databaseName = config.DocumentDB.DatabaseName; - var dataFile = config.DataFiles.WithVectors; - var vectorField = config.Embedding.EmbeddedField; - var model = config.AzureOpenAI.EmbeddingModel; - var dimensions = config.Embedding.Dimensions; - var batchSize = config.DocumentDB.LoadBatchSize; - var similarity = config.VectorSearch.Similarity; - - var mongoClient = Utils.GetMongoClientPasswordless(config); - var embeddingClient = Utils.GetEmbeddingClient(config); - - try - { - var database = mongoClient.GetDatabase(databaseName); - var collection = database.GetCollection("hotels_ivf"); - - var data = Utils.ReadJsonFile(dataFile); - var documents = data.Where(d => d.Contains(vectorField)).ToList(); - Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); - - Utils.InsertData(collection, documents, batchSize); - - CreateIvfIndex(collection, vectorField, dimensions, similarity); - Console.WriteLine("Waiting for index to build..."); - Thread.Sleep(3000); - - var query = "quintessential lodging near running trails, eateries, retail"; - var results = Utils.PerformVectorSearch(collection, embeddingClient, query, vectorField, model); - Utils.PrintSearchResults(results, "IVF"); - } - finally - { - mongoClient.Cluster.Dispose(); - } - } -} diff --git a/ai/select-algorithm-dotnet/src/Program.cs b/ai/select-algorithm-dotnet/src/Program.cs index 6513684..a05ec57 100644 --- a/ai/select-algorithm-dotnet/src/Program.cs +++ b/ai/select-algorithm-dotnet/src/Program.cs @@ -7,51 +7,13 @@ class Program { static void Main(string[] args) { - var configuration = new ConfigurationBuilder() - .SetBasePath(Directory.GetCurrentDirectory()) - .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true) - .AddEnvironmentVariables() - .Build(); 
- - var appConfig = new AppConfiguration(); - configuration.Bind(appConfig); - - // ALGORITHM env var override for selecting which demo to run - var algorithm = (Environment.GetEnvironmentVariable("ALGORITHM") ?? "all").ToLowerInvariant(); - Console.WriteLine(); Console.WriteLine("Select Algorithm Demo - Azure DocumentDB Vector Search (.NET)"); Console.WriteLine(new string('-', 60)); - Console.WriteLine($"Algorithm: {algorithm}"); Console.WriteLine(); - switch (algorithm) - { - case "ivf": - IvfDemo.Run(appConfig); - break; - case "hnsw": - HnswDemo.Run(appConfig); - break; - case "diskann": - DiskannDemo.Run(appConfig); - break; - case "compare": - CompareAll.Run(); - break; - case "all": - IvfDemo.Run(appConfig); - HnswDemo.Run(appConfig); - DiskannDemo.Run(appConfig); - break; - default: - Console.WriteLine($"Unknown algorithm: {algorithm}"); - Console.WriteLine("Valid options: ivf, hnsw, diskann, compare, all"); - Environment.Exit(1); - break; - } + CompareAll.Run(); Console.WriteLine("Done!"); } } - diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go index 6dc9edc..463e55d 100644 --- a/ai/select-algorithm-go/src/compare_all.go +++ b/ai/select-algorithm-go/src/compare_all.go @@ -47,8 +47,26 @@ func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, fmt.Printf("Top-K: %d\n", topK) fmt.Printf("Verbose: %v\n", verbose) - // 1. Get collection and load data ONCE - collection := dbClient.Database(config.DatabaseName).Collection("hotels") + // 1. 
Drop collection for clean comparison, then load data + database := dbClient.Database(config.DatabaseName) + collection := database.Collection("hotels") + + // Drop existing collection for a clean comparison + if err := collection.Drop(ctx); err != nil { + fmt.Printf("Note: could not drop collection (may not exist): %v\n", err) + } else { + fmt.Println("Dropped existing 'hotels' collection") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("\nCleanup: dropping comparison collection...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels'") + } + }() fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) diff --git a/ai/select-algorithm-go/src/diskann.go b/ai/select-algorithm-go/src/diskann.go deleted file mode 100644 index ca157fa..0000000 --- a/ai/select-algorithm-go/src/diskann.go +++ /dev/null @@ -1,112 +0,0 @@ -package main - -import ( - "context" - "fmt" - "log" - "strings" - "time" - - "github.com/openai/openai-go/v3" - "go.mongodb.org/mongo-driver/bson" - "go.mongodb.org/mongo-driver/mongo" -) - -// CreateDiskANNVectorIndex creates a DiskANN vector index on the specified field -func CreateDiskANNVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, dimensions int, similarity string) error { - fmt.Printf("Creating DiskANN vector index on field '%s'...\n", vectorField) - - err := DropVectorIndexes(ctx, collection, vectorField) - if err != nil { - fmt.Printf("Warning: Could not drop existing indexes: %v\n", err) - } - - // Must use bson.D for commands to preserve order and avoid "multi-key map" errors - indexCommand := bson.D{ - {"createIndexes", collection.Name()}, - {"indexes", []bson.D{ - { - {"name", fmt.Sprintf("diskann_index_%s", vectorField)}, - {"key", bson.D{ - {vectorField, "cosmosSearch"}, - }}, - {"cosmosSearchOptions", bson.D{ - {"kind", 
"vector-diskann"}, - {"dimensions", dimensions}, - {"similarity", similarity}, - // Maximum degree: number of edges per node in the graph - {"maxDegree", 20}, - // Candidates evaluated during index construction - {"lBuild", 10}, - }}, - }, - }}, - } - - var result bson.M - err = collection.Database().RunCommand(ctx, indexCommand).Decode(&result) - if err != nil { - if strings.Contains(err.Error(), "not enabled for this cluster tier") { - fmt.Println("\nDiskANN indexes require a higher cluster tier.") - fmt.Println("Try upgrading your DocumentDB cluster or use a different algorithm.") - } - return fmt.Errorf("error creating DiskANN vector index: %v", err) - } - - fmt.Println("DiskANN vector index created successfully") - return nil -} - -// RunDiskANN executes the full DiskANN vector search workflow -func RunDiskANN(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { - fmt.Println("\n" + strings.Repeat("=", 60)) - fmt.Println("DiskANN Vector Search") - fmt.Println(strings.Repeat("=", 60)) - - collection := dbClient.Database(config.DatabaseName).Collection("hotels_diskann") - - // Load data - fmt.Printf("\nLoading data from %s...\n", config.DataFile) - data, err := ReadFileReturnJSON(config.DataFile) - if err != nil { - return fmt.Errorf("failed to load data: %v", err) - } - - documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) - if len(documentsWithEmbeddings) == 0 { - return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField) - } - fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) - - // Insert data - stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) - if err != nil { - return err - } - if stats.Inserted == 0 { - return fmt.Errorf("no documents were inserted successfully") - } - fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) - - // Create DiskANN 
vector index - fmt.Println("\nCreating DiskANN vector index...") - err = CreateDiskANNVectorIndex(ctx, collection, config.VectorField, config.Dimensions, config.Similarity) - if err != nil { - return fmt.Errorf("failed to create DiskANN vector index: %v", err) - } - - fmt.Println("Waiting for index to be ready...") - time.Sleep(2 * time.Second) - - // Perform vector search - query := "quintessential lodging near running trails, eateries, retail" - results, err := PerformVectorSearch(ctx, collection, aiClient, query, config.VectorField, config.ModelName, 5) - if err != nil { - return fmt.Errorf("failed to perform DiskANN vector search: %v", err) - } - - PrintSearchResults(results, "diskann") - - log.Println("DiskANN demonstration completed successfully!") - return nil -} diff --git a/ai/select-algorithm-go/src/hnsw.go b/ai/select-algorithm-go/src/hnsw.go deleted file mode 100644 index def5aff..0000000 --- a/ai/select-algorithm-go/src/hnsw.go +++ /dev/null @@ -1,112 +0,0 @@ -package main - -import ( - "context" - "fmt" - "log" - "strings" - "time" - - "github.com/openai/openai-go/v3" - "go.mongodb.org/mongo-driver/bson" - "go.mongodb.org/mongo-driver/mongo" -) - -// CreateHNSWVectorIndex creates an HNSW (Hierarchical Navigable Small World) vector index on the specified field -func CreateHNSWVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, dimensions int, similarity string) error { - fmt.Printf("Creating HNSW vector index on field '%s'...\n", vectorField) - - err := DropVectorIndexes(ctx, collection, vectorField) - if err != nil { - fmt.Printf("Warning: Could not drop existing indexes: %v\n", err) - } - - // Must use bson.D for commands to preserve order and avoid "multi-key map" errors - indexCommand := bson.D{ - {"createIndexes", collection.Name()}, - {"indexes", []bson.D{ - { - {"name", fmt.Sprintf("hnsw_index_%s", vectorField)}, - {"key", bson.D{ - {vectorField, "cosmosSearch"}, - }}, - {"cosmosSearchOptions", bson.D{ - {"kind", 
"vector-hnsw"}, - {"dimensions", dimensions}, - {"similarity", similarity}, - // Maximum connections per node in the graph - {"m", 16}, - // Candidate list size during construction - {"efConstruction", 64}, - }}, - }, - }}, - } - - var result bson.M - err = collection.Database().RunCommand(ctx, indexCommand).Decode(&result) - if err != nil { - if strings.Contains(err.Error(), "not enabled for this cluster tier") { - fmt.Println("\nHNSW indexes require a higher cluster tier.") - fmt.Println("Try upgrading your DocumentDB cluster or use a different algorithm.") - } - return fmt.Errorf("error creating HNSW vector index: %v", err) - } - - fmt.Println("HNSW vector index created successfully") - return nil -} - -// RunHNSW executes the full HNSW vector search workflow -func RunHNSW(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { - fmt.Println("\n" + strings.Repeat("=", 60)) - fmt.Println("HNSW (Hierarchical Navigable Small World) Vector Search") - fmt.Println(strings.Repeat("=", 60)) - - collection := dbClient.Database(config.DatabaseName).Collection("hotels_hnsw") - - // Load data - fmt.Printf("\nLoading data from %s...\n", config.DataFile) - data, err := ReadFileReturnJSON(config.DataFile) - if err != nil { - return fmt.Errorf("failed to load data: %v", err) - } - - documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) - if len(documentsWithEmbeddings) == 0 { - return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField) - } - fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) - - // Insert data - stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) - if err != nil { - return err - } - if stats.Inserted == 0 { - return fmt.Errorf("no documents were inserted successfully") - } - fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) - - // Create HNSW vector index - 
fmt.Println("\nCreating HNSW vector index...") - err = CreateHNSWVectorIndex(ctx, collection, config.VectorField, config.Dimensions, config.Similarity) - if err != nil { - return fmt.Errorf("failed to create HNSW vector index: %v", err) - } - - fmt.Println("Waiting for index to be ready...") - time.Sleep(2 * time.Second) - - // Perform vector search - query := "quintessential lodging near running trails, eateries, retail" - results, err := PerformVectorSearch(ctx, collection, aiClient, query, config.VectorField, config.ModelName, 5) - if err != nil { - return fmt.Errorf("failed to perform HNSW vector search: %v", err) - } - - PrintSearchResults(results, "hnsw") - - log.Println("HNSW demonstration completed successfully!") - return nil -} diff --git a/ai/select-algorithm-go/src/ivf.go b/ai/select-algorithm-go/src/ivf.go deleted file mode 100644 index 3da7cba..0000000 --- a/ai/select-algorithm-go/src/ivf.go +++ /dev/null @@ -1,110 +0,0 @@ -package main - -import ( - "context" - "fmt" - "log" - "strings" - "time" - - "github.com/openai/openai-go/v3" - "go.mongodb.org/mongo-driver/bson" - "go.mongodb.org/mongo-driver/mongo" -) - -// CreateIVFVectorIndex creates an IVF (Inverted File) vector index on the specified field -func CreateIVFVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, dimensions int, similarity string) error { - fmt.Printf("Creating IVF vector index on field '%s'...\n", vectorField) - - err := DropVectorIndexes(ctx, collection, vectorField) - if err != nil { - fmt.Printf("Warning: Could not drop existing indexes: %v\n", err) - } - - // Must use bson.D for commands to preserve order and avoid "multi-key map" errors - indexCommand := bson.D{ - {"createIndexes", collection.Name()}, - {"indexes", []bson.D{ - { - {"name", fmt.Sprintf("ivf_index_%s", vectorField)}, - {"key", bson.D{ - {vectorField, "cosmosSearch"}, - }}, - {"cosmosSearchOptions", bson.D{ - {"kind", "vector-ivf"}, - {"dimensions", dimensions}, - {"similarity", 
similarity}, - // Number of clusters to partition vectors into - {"numLists", 10}, - }}, - }, - }}, - } - - var result bson.M - err = collection.Database().RunCommand(ctx, indexCommand).Decode(&result) - if err != nil { - if strings.Contains(err.Error(), "not enabled for this cluster tier") { - fmt.Println("\nIVF indexes require a higher cluster tier.") - fmt.Println("Try upgrading your DocumentDB cluster or use a different algorithm.") - } - return fmt.Errorf("error creating IVF vector index: %v", err) - } - - fmt.Println("IVF vector index created successfully") - return nil -} - -// RunIVF executes the full IVF vector search workflow -func RunIVF(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { - fmt.Println("\n" + strings.Repeat("=", 60)) - fmt.Println("IVF (Inverted File) Vector Search") - fmt.Println(strings.Repeat("=", 60)) - - collection := dbClient.Database(config.DatabaseName).Collection("hotels_ivf") - - // Load data - fmt.Printf("\nLoading data from %s...\n", config.DataFile) - data, err := ReadFileReturnJSON(config.DataFile) - if err != nil { - return fmt.Errorf("failed to load data: %v", err) - } - - documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) - if len(documentsWithEmbeddings) == 0 { - return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField) - } - fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) - - // Insert data - stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) - if err != nil { - return err - } - if stats.Inserted == 0 { - return fmt.Errorf("no documents were inserted successfully") - } - fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) - - // Create IVF vector index - fmt.Println("\nCreating IVF vector index...") - err = CreateIVFVectorIndex(ctx, collection, config.VectorField, config.Dimensions, config.Similarity) - 
if err != nil { - return fmt.Errorf("failed to create IVF vector index: %v", err) - } - - fmt.Println("Waiting for index clustering to complete...") - time.Sleep(3 * time.Second) - - // Perform vector search - query := "quintessential lodging near running trails, eateries, retail" - results, err := PerformVectorSearch(ctx, collection, aiClient, query, config.VectorField, config.ModelName, 5) - if err != nil { - return fmt.Errorf("failed to perform IVF vector search: %v", err) - } - - PrintSearchResults(results, "ivf") - - log.Println("IVF demonstration completed successfully!") - return nil -} diff --git a/ai/select-algorithm-go/src/main.go b/ai/select-algorithm-go/src/main.go index 8508846..10b6d65 100644 --- a/ai/select-algorithm-go/src/main.go +++ b/ai/select-algorithm-go/src/main.go @@ -15,9 +15,7 @@ func main() { // Load configuration from environment variables config := LoadConfig() - fmt.Printf("Algorithm: %s\n", config.Algorithm) fmt.Printf("Database: %s\n", config.DatabaseName) - fmt.Printf("Similarity: %s\n", config.Similarity) fmt.Printf("Dimensions: %d\n", config.Dimensions) // Initialize MongoDB and Azure OpenAI clients @@ -28,45 +26,9 @@ func main() { } defer mongoClient.Disconnect(ctx) - // Dispatch based on selected algorithm - switch config.Algorithm { - case "ivf": - if err := RunIVF(ctx, config, mongoClient, aiClient); err != nil { - log.Fatalf("IVF failed: %v", err) - } - - case "hnsw": - if err := RunHNSW(ctx, config, mongoClient, aiClient); err != nil { - log.Fatalf("HNSW failed: %v", err) - } - - case "diskann": - if err := RunDiskANN(ctx, config, mongoClient, aiClient); err != nil { - log.Fatalf("DiskANN failed: %v", err) - } - - case "all": - fmt.Println("\nRunning all algorithms...") - - if err := RunIVF(ctx, config, mongoClient, aiClient); err != nil { - log.Printf("IVF failed: %v", err) - } - - if err := RunHNSW(ctx, config, mongoClient, aiClient); err != nil { - log.Printf("HNSW failed: %v", err) - } - - if err := RunDiskANN(ctx, 
config, mongoClient, aiClient); err != nil { - log.Printf("DiskANN failed: %v", err) - } - - case "compare-all": - if err := RunCompareAll(ctx, config, mongoClient, aiClient); err != nil { - log.Fatalf("Compare-all failed: %v", err) - } - - default: - log.Fatalf("Unknown algorithm: '%s'. Use 'all', 'ivf', 'hnsw', 'diskann', or 'compare-all'", config.Algorithm) + // Run the comparison runner + if err := RunCompareAll(ctx, config, mongoClient, aiClient); err != nil { + log.Fatalf("Compare-all failed: %v", err) } fmt.Println("\nDone!") diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java index edd24a2..ef8d55a 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -118,6 +118,11 @@ public static void run() { } } } + + // Cleanup: drop the comparison collection + System.out.println("\n Cleanup: dropping comparison collection..."); + collection.drop(); + System.out.println(" Cleanup: dropped collection 'hotels'"); } // Print comparison table diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskannDemo.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskannDemo.java deleted file mode 100644 index 0b12686..0000000 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskannDemo.java +++ /dev/null @@ -1,77 +0,0 @@ -package com.azure.documentdb.selectalgorithm; - -import com.azure.ai.openai.OpenAIClient; -import com.mongodb.client.MongoClient; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; -import org.bson.Document; - -import java.util.List; - -public class DiskannDemo { - - private static final String COLLECTION_NAME = "hotels_diskann"; - 
private static final String QUERY = "quintessential lodging near running trails, eateries, retail"; - - public static void createDiskannIndex(MongoCollection collection, String vectorField, int dimensions, String similarity) { - System.out.println(" Creating DiskANN vector index..."); - - Document indexDefinition = new Document() - .append("name", "diskann_index_" + vectorField) - .append("key", new Document(vectorField, "cosmosSearch")) - .append("cosmosSearchOptions", new Document() - .append("kind", "vector-diskann") - .append("dimensions", dimensions) - .append("similarity", similarity) - .append("maxDegree", 20) - .append("lBuild", 10)); - - Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) - .append("indexes", List.of(indexDefinition)); - - collection.getDatabase().runCommand(command); - System.out.println(" DiskANN index created successfully."); - } - - public static void run() { - System.out.println("\n========================================"); - System.out.println(" DiskANN Index Demo"); - System.out.println("========================================\n"); - - String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); - String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"); - String vectorField = Utils.getEnv("EMBEDDED_FIELD", "contentVector"); - int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); - String similarity = Utils.getEnv("SIMILARITY", "COS"); - String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); - - try (MongoClient mongoClient = Utils.getMongoClient()) { - MongoDatabase database = mongoClient.getDatabase(databaseName); - MongoCollection collection = database.getCollection(COLLECTION_NAME); - - // Load and insert data - System.out.println(" Loading data from: " + dataFile); - List data = Utils.readJsonFile(dataFile); - System.out.printf(" Loaded %d documents%n", data.size()); - - // 
Drop existing collection to start fresh - collection.drop(); - System.out.println(" Collection reset."); - - Utils.insertData(collection, data, 100); - - // Create DiskANN index - createDiskannIndex(collection, vectorField, dimensions, similarity); - - // Perform vector search - OpenAIClient aiClient = Utils.getOpenAIClient(); - System.out.println("\n Performing vector search with DiskANN index..."); - List results = Utils.performVectorSearch( - collection, aiClient, QUERY, vectorField, model, 5); - - Utils.printResults(results); - } - - System.out.println(" DiskANN Demo complete.\n"); - } -} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HnswDemo.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HnswDemo.java deleted file mode 100644 index 09d436a..0000000 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HnswDemo.java +++ /dev/null @@ -1,77 +0,0 @@ -package com.azure.documentdb.selectalgorithm; - -import com.azure.ai.openai.OpenAIClient; -import com.mongodb.client.MongoClient; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; -import org.bson.Document; - -import java.util.List; - -public class HnswDemo { - - private static final String COLLECTION_NAME = "hotels_hnsw"; - private static final String QUERY = "quintessential lodging near running trails, eateries, retail"; - - public static void createHnswIndex(MongoCollection collection, String vectorField, int dimensions, String similarity) { - System.out.println(" Creating HNSW vector index..."); - - Document indexDefinition = new Document() - .append("name", "hnsw_index_" + vectorField) - .append("key", new Document(vectorField, "cosmosSearch")) - .append("cosmosSearchOptions", new Document() - .append("kind", "vector-hnsw") - .append("dimensions", dimensions) - .append("similarity", similarity) - .append("m", 16) - .append("efConstruction", 64)); - - Document command = new 
Document("createIndexes", collection.getNamespace().getCollectionName()) - .append("indexes", List.of(indexDefinition)); - - collection.getDatabase().runCommand(command); - System.out.println(" HNSW index created successfully."); - } - - public static void run() { - System.out.println("\n========================================"); - System.out.println(" HNSW (Hierarchical Navigable Small World) Index Demo"); - System.out.println("========================================\n"); - - String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); - String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"); - String vectorField = Utils.getEnv("EMBEDDED_FIELD", "contentVector"); - int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); - String similarity = Utils.getEnv("SIMILARITY", "COS"); - String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); - - try (MongoClient mongoClient = Utils.getMongoClient()) { - MongoDatabase database = mongoClient.getDatabase(databaseName); - MongoCollection collection = database.getCollection(COLLECTION_NAME); - - // Load and insert data - System.out.println(" Loading data from: " + dataFile); - List data = Utils.readJsonFile(dataFile); - System.out.printf(" Loaded %d documents%n", data.size()); - - // Drop existing collection to start fresh - collection.drop(); - System.out.println(" Collection reset."); - - Utils.insertData(collection, data, 100); - - // Create HNSW index - createHnswIndex(collection, vectorField, dimensions, similarity); - - // Perform vector search - OpenAIClient aiClient = Utils.getOpenAIClient(); - System.out.println("\n Performing vector search with HNSW index..."); - List results = Utils.performVectorSearch( - collection, aiClient, QUERY, vectorField, model, 5); - - Utils.printResults(results); - } - - System.out.println(" HNSW Demo complete.\n"); - } -} diff --git 
a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IvfDemo.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IvfDemo.java deleted file mode 100644 index 5baad0b..0000000 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IvfDemo.java +++ /dev/null @@ -1,76 +0,0 @@ -package com.azure.documentdb.selectalgorithm; - -import com.azure.ai.openai.OpenAIClient; -import com.mongodb.client.MongoClient; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; -import org.bson.Document; - -import java.util.List; - -public class IvfDemo { - - private static final String COLLECTION_NAME = "hotels_ivf"; - private static final String QUERY = "quintessential lodging near running trails, eateries, retail"; - - public static void createIvfIndex(MongoCollection collection, String vectorField, int dimensions, String similarity) { - System.out.println(" Creating IVF vector index..."); - - Document indexDefinition = new Document() - .append("name", "ivf_index_" + vectorField) - .append("key", new Document(vectorField, "cosmosSearch")) - .append("cosmosSearchOptions", new Document() - .append("kind", "vector-ivf") - .append("dimensions", dimensions) - .append("similarity", similarity) - .append("numLists", 10)); - - Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) - .append("indexes", List.of(indexDefinition)); - - collection.getDatabase().runCommand(command); - System.out.println(" IVF index created successfully."); - } - - public static void run() { - System.out.println("\n========================================"); - System.out.println(" IVF (Inverted File) Index Demo"); - System.out.println("========================================\n"); - - String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); - String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"); - String 
vectorField = Utils.getEnv("EMBEDDED_FIELD", "contentVector"); - int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); - String similarity = Utils.getEnv("SIMILARITY", "COS"); - String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); - - try (MongoClient mongoClient = Utils.getMongoClient()) { - MongoDatabase database = mongoClient.getDatabase(databaseName); - MongoCollection collection = database.getCollection(COLLECTION_NAME); - - // Load and insert data - System.out.println(" Loading data from: " + dataFile); - List data = Utils.readJsonFile(dataFile); - System.out.printf(" Loaded %d documents%n", data.size()); - - // Drop existing collection to start fresh - collection.drop(); - System.out.println(" Collection reset."); - - Utils.insertData(collection, data, 100); - - // Create IVF index - createIvfIndex(collection, vectorField, dimensions, similarity); - - // Perform vector search - OpenAIClient aiClient = Utils.getOpenAIClient(); - System.out.println("\n Performing vector search with IVF index..."); - List results = Utils.performVectorSearch( - collection, aiClient, QUERY, vectorField, model, 5); - - Utils.printResults(results); - } - - System.out.println(" IVF Demo complete.\n"); - } -} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java index 982b698..5a9d54c 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java @@ -3,33 +3,15 @@ public class Main { public static void main(String[] args) { - String algorithm = Utils.getEnv("ALGORITHM", "all").toLowerCase().trim(); - System.out.println("=============================================="); - System.out.println(" Azure DocumentDB - Vector Search Algorithms"); + System.out.println(" Azure 
DocumentDB - Compare All Algorithms"); System.out.println("=============================================="); - System.out.println(" Algorithm: " + algorithm); System.out.println(); - switch (algorithm) { - case "ivf" -> IvfDemo.run(); - case "hnsw" -> HnswDemo.run(); - case "diskann" -> DiskannDemo.run(); - case "compare" -> CompareAll.run(); - case "all" -> { - IvfDemo.run(); - HnswDemo.run(); - DiskannDemo.run(); - } - default -> { - System.err.println("Unknown algorithm: " + algorithm); - System.err.println("Valid options: ivf, hnsw, diskann, compare, all"); - System.exit(1); - } - } + CompareAll.run(); System.out.println("=============================================="); - System.out.println(" All demos complete."); + System.out.println(" Comparison complete."); System.out.println("=============================================="); } } diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py index 0703e77..1aac549 100644 --- a/ai/select-algorithm-python/src/compare_all.py +++ b/ai/select-algorithm-python/src/compare_all.py @@ -170,9 +170,13 @@ def main(): try: database = mongo_client[config["database_name"]] - collection = database["hotels"] - # Load data once + # Drop collection for a clean comparison + database.drop_collection("hotels") + print("Dropped existing 'hotels' collection (if any)") + + # Create fresh collection and load data + collection = database["hotels"] data = read_file_return_json(config["data_file"]) documents = [doc for doc in data if config["vector_field"] in doc] print(f"Loaded {len(documents)} documents with embeddings") @@ -227,6 +231,13 @@ def main(): print(tabulate(table_rows, headers=headers, tablefmt="grid")) finally: + # Cleanup: drop the comparison collection + try: + database = mongo_client[config["database_name"]] + database.drop_collection("hotels") + print("\nCleanup: dropped collection 'hotels'") + except Exception as e: + print(f"Cleanup warning: {e}") mongo_client.close() 
diff --git a/ai/select-algorithm-python/src/diskann.py b/ai/select-algorithm-python/src/diskann.py deleted file mode 100644 index 5fac5cd..0000000 --- a/ai/select-algorithm-python/src/diskann.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -DiskANN vector index for Azure DocumentDB. - -Best for: Datasets with 50,000+ documents. -Cluster tier: M30 or higher. -Key parameters: maxDegree (graph edges), lBuild (construction quality). -""" -import os -import time -from utils import ( - get_clients_passwordless, get_config, read_file_return_json, - insert_data, drop_vector_indexes, perform_vector_search, print_search_results -) - - -def create_diskann_vector_index(collection, vector_field: str, dimensions: int, - similarity: str = "COS", max_degree: int = 20, - l_build: int = 10) -> None: - """Create a DiskANN vector index on the specified field.""" - print(f"Creating DiskANN vector index on field '{vector_field}'...") - - drop_vector_indexes(collection, vector_field) - - index_command = { - "createIndexes": collection.name, - "indexes": [ - { - "name": f"diskann_index_{vector_field}", - "key": {vector_field: "cosmosSearch"}, - "cosmosSearchOptions": { - "kind": "vector-diskann", - "dimensions": dimensions, - "similarity": similarity, - "maxDegree": max_degree, - "lBuild": l_build - } - } - ] - } - - result = collection.database.command(index_command) - print(f"DiskANN vector index created successfully") - return result - - -def main(): - print("=" * 60) - print(" DiskANN Vector Index - Select Algorithm Demo") - print(" Best for: 50,000+ documents") - print("=" * 60) - - config = get_config() - mongo_client, azure_openai_client = get_clients_passwordless() - - try: - database = mongo_client[config['database_name']] - collection = database["hotels_diskann"] - - # Load and insert data - data = read_file_return_json(config['data_file']) - documents = [doc for doc in data if config['vector_field'] in doc] - print(f"\nLoaded {len(documents)} documents with embeddings") - - stats = 
insert_data(collection, documents, config['batch_size']) - - # Create DiskANN index - if not stats.get('skipped'): - create_diskann_vector_index( - collection, - config['vector_field'], - config['dimensions'], - config['similarity'] - ) - print("Waiting for index to build...") - time.sleep(5) - - # Perform search - query = "quintessential lodging near running trails, eateries, retail" - results = perform_vector_search( - collection, azure_openai_client, query, - config['vector_field'], config['model_name'] - ) - print_search_results(results, "DiskANN") - - finally: - mongo_client.close() - - -if __name__ == "__main__": - main() diff --git a/ai/select-algorithm-python/src/hnsw.py b/ai/select-algorithm-python/src/hnsw.py deleted file mode 100644 index 568ef0b..0000000 --- a/ai/select-algorithm-python/src/hnsw.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -HNSW (Hierarchical Navigable Small World) vector index for Azure DocumentDB. - -Best for: Datasets between 10,000 and 50,000 documents. -Cluster tier: M30 or higher. -Key parameters: m (graph connectivity), efConstruction (build quality). 
-""" -import os -import time -from utils import ( - get_clients_passwordless, get_config, read_file_return_json, - insert_data, drop_vector_indexes, perform_vector_search, print_search_results -) - - -def create_hnsw_vector_index(collection, vector_field: str, dimensions: int, - similarity: str = "COS", m: int = 16, - ef_construction: int = 64) -> None: - """Create an HNSW vector index on the specified field.""" - print(f"Creating HNSW vector index on field '{vector_field}'...") - - drop_vector_indexes(collection, vector_field) - - index_command = { - "createIndexes": collection.name, - "indexes": [ - { - "name": f"hnsw_index_{vector_field}", - "key": {vector_field: "cosmosSearch"}, - "cosmosSearchOptions": { - "kind": "vector-hnsw", - "dimensions": dimensions, - "similarity": similarity, - "m": m, - "efConstruction": ef_construction - } - } - ] - } - - result = collection.database.command(index_command) - print(f"HNSW vector index created successfully") - return result - - -def main(): - print("=" * 60) - print(" HNSW Vector Index - Select Algorithm Demo") - print(" Best for: 10,000 - 50,000 documents") - print("=" * 60) - - config = get_config() - mongo_client, azure_openai_client = get_clients_passwordless() - - try: - database = mongo_client[config['database_name']] - collection = database["hotels_hnsw"] - - # Load and insert data - data = read_file_return_json(config['data_file']) - documents = [doc for doc in data if config['vector_field'] in doc] - print(f"\nLoaded {len(documents)} documents with embeddings") - - stats = insert_data(collection, documents, config['batch_size']) - - # Create HNSW index - if not stats.get('skipped'): - create_hnsw_vector_index( - collection, - config['vector_field'], - config['dimensions'], - config['similarity'] - ) - print("Waiting for index to build...") - time.sleep(5) - - # Perform search - query = "quintessential lodging near running trails, eateries, retail" - results = perform_vector_search( - collection, 
azure_openai_client, query, - config['vector_field'], config['model_name'] - ) - print_search_results(results, "HNSW") - - finally: - mongo_client.close() - - -if __name__ == "__main__": - main() diff --git a/ai/select-algorithm-python/src/ivf.py b/ai/select-algorithm-python/src/ivf.py deleted file mode 100644 index 577f82b..0000000 --- a/ai/select-algorithm-python/src/ivf.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -IVF (Inverted File) vector index for Azure DocumentDB. - -Best for: Datasets with fewer than 10,000 documents. -Cluster tier: M10 or higher. -Key parameters: numLists (cluster count). -""" -import os -import time -from utils import ( - get_clients_passwordless, get_config, read_file_return_json, - insert_data, drop_vector_indexes, perform_vector_search, print_search_results -) - - -def create_ivf_vector_index(collection, vector_field: str, dimensions: int, - similarity: str = "COS", num_lists: int = 10) -> None: - """Create an IVF vector index on the specified field.""" - print(f"Creating IVF vector index on field '{vector_field}'...") - - drop_vector_indexes(collection, vector_field) - - index_command = { - "createIndexes": collection.name, - "indexes": [ - { - "name": f"ivf_index_{vector_field}", - "key": {vector_field: "cosmosSearch"}, - "cosmosSearchOptions": { - "kind": "vector-ivf", - "dimensions": dimensions, - "similarity": similarity, - "numLists": num_lists - } - } - ] - } - - result = collection.database.command(index_command) - print(f"IVF vector index created successfully") - return result - - -def main(): - print("=" * 60) - print(" IVF Vector Index - Select Algorithm Demo") - print(" Best for: < 10,000 documents") - print("=" * 60) - - config = get_config() - mongo_client, azure_openai_client = get_clients_passwordless() - - try: - database = mongo_client[config['database_name']] - collection = database["hotels_ivf"] - - # Load and insert data - data = read_file_return_json(config['data_file']) - documents = [doc for doc in data if 
config['vector_field'] in doc] - print(f"\nLoaded {len(documents)} documents with embeddings") - - stats = insert_data(collection, documents, config['batch_size']) - - # Create IVF index - if not stats.get('skipped'): - create_ivf_vector_index( - collection, - config['vector_field'], - config['dimensions'], - config['similarity'] - ) - print("Waiting for index to build...") - time.sleep(3) - - # Perform search - query = "quintessential lodging near running trails, eateries, retail" - results = perform_vector_search( - collection, azure_openai_client, query, - config['vector_field'], config['model_name'] - ) - print_search_results(results, "IVF") - - finally: - mongo_client.close() - - -if __name__ == "__main__": - main() diff --git a/ai/select-algorithm-typescript/package.json b/ai/select-algorithm-typescript/package.json index dcadb2f..e8176ec 100644 --- a/ai/select-algorithm-typescript/package.json +++ b/ai/select-algorithm-typescript/package.json @@ -5,10 +5,7 @@ "type": "module", "scripts": { "build": "tsc", - "start:ivf": "node --env-file .env dist/ivf.js", - "start:hnsw": "node --env-file .env dist/hnsw.js", - "start:diskann": "node --env-file .env dist/diskann.js", - "start:compare-all": "node --env-file .env dist/compare-all.js" + "start": "node --env-file .env dist/compare-all.js" }, "dependencies": { "@azure/identity": "^4.11.1", diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts index 2d63984..53c54aa 100644 --- a/ai/select-algorithm-typescript/src/compare-all.ts +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -45,33 +45,25 @@ async function main() { await dbClient.connect(); const db = dbClient.db(baseConfig.dbName); - // Create collection and load data once - let collection; - const collections = await db.listCollections({ name: collectionName }).toArray(); - if (collections.length === 0) { - collection = await db.createCollection(collectionName); - console.log(`Created 
collection: ${collectionName}`); - const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); - const insertSummary = await insertData(baseConfig, collection, data); - console.log(`Inserted ${insertSummary.inserted}/${insertSummary.total} documents`); - } else { - collection = db.collection(collectionName); - console.log(`Collection "${collectionName}" already exists, skipping data load`); + // Drop collection if it exists for a clean comparison + const existingCollections = await db.listCollections({ name: collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(collectionName); + console.log(`Dropped existing collection: ${collectionName}`); } - // Check existing indexes to avoid duplicates - const existingIndexes = await collection.listIndexes().toArray(); - const existingIndexNames = new Set(existingIndexes.map(idx => idx.name)); + // Create collection and load data + const collection = await db.createCollection(collectionName); + console.log(`Created collection: ${collectionName}`); + const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); + const insertSummary = await insertData(baseConfig, collection, data); + console.log(`Inserted ${insertSummary.inserted}/${insertSummary.total} documents`); // Create all 9 indexes console.log('\nCreating vector indexes...'); for (const algo of ALGORITHMS) { for (const sim of SIMILARITIES) { const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; - if (existingIndexNames.has(indexName)) { - console.log(` ✓ ${indexName} (already exists)`); - continue; - } const indexOptions = { createIndexes: collectionName, indexes: [{ @@ -152,8 +144,18 @@ async function main() { console.error('Compare-all failed:', error); process.exitCode = 1; } finally { - if (dbClient) await dbClient.close(); - console.log('\nDatabase connection closed'); + // Cleanup: drop the comparison collection + if (dbClient) { + try { + 
const db = dbClient.db(baseConfig.dbName); + await db.dropCollection(collectionName); + console.log(`\nCleanup: dropped collection "${collectionName}"`); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } diff --git a/ai/select-algorithm-typescript/src/diskann.ts b/ai/select-algorithm-typescript/src/diskann.ts deleted file mode 100644 index bd0c84a..0000000 --- a/ai/select-algorithm-typescript/src/diskann.ts +++ /dev/null @@ -1,101 +0,0 @@ -import path from 'path'; -import { readFileReturnJson, getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js'; - -import { fileURLToPath } from "node:url"; -import { dirname } from "node:path"; -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -const baseConfig = getConfig(); - -const config = { - ...baseConfig, - query: "quintessential lodging near running trails, eateries, retail", - collectionName: "hotels_diskann", - indexName: "vectorIndex_diskann", -}; - -async function main() { - const { aiClient, dbClient } = getClientsPasswordless(); - - try { - if (!aiClient) { - throw new Error('AI client is not configured. Please check your environment variables.'); - } - if (!dbClient) { - throw new Error('Database client is not configured. 
Please check your environment variables.'); - } - - await dbClient.connect(); - const db = dbClient.db(config.dbName); - const collection = await db.createCollection(config.collectionName); - console.log('Created collection:', config.collectionName); - - const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); - const insertSummary = await insertData(config, collection, data); - - // Create the DiskANN vector index - const indexOptions = { - createIndexes: config.collectionName, - indexes: [ - { - name: config.indexName, - key: { - [config.embeddedField]: 'cosmosSearch' - }, - cosmosSearchOptions: { - kind: 'vector-diskann', - maxDegree: 20, - lBuild: 10, - similarity: config.similarity, - dimensions: config.embeddingDimensions - } - } - ] - }; - const vectorIndexSummary = await db.command(indexOptions); - console.log('Created vector index:', config.indexName); - - // Create embedding for the query - const createEmbeddedForQueryResponse = await aiClient.embeddings.create({ - model: config.deployment, - input: [config.query] - }); - - // Perform the vector similarity search - const searchResults = await collection.aggregate([ - { - $search: { - cosmosSearch: { - vector: createEmbeddedForQueryResponse.data[0].embedding, - path: config.embeddedField, - k: 5 - } - } - }, - { - $project: { - score: { - $meta: "searchScore" - }, - document: "$$ROOT" - } - } - ]).toArray(); - - printSearchResults(insertSummary, vectorIndexSummary, searchResults); - - } catch (error) { - console.error('App failed:', error); - process.exitCode = 1; - } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); - } -} - -main().catch(error => { - console.error('Unhandled error:', error); - process.exitCode = 1; -}); diff --git a/ai/select-algorithm-typescript/src/hnsw.ts b/ai/select-algorithm-typescript/src/hnsw.ts deleted file mode 100644 index a44d4c1..0000000 --- 
a/ai/select-algorithm-typescript/src/hnsw.ts +++ /dev/null @@ -1,101 +0,0 @@ -import path from 'path'; -import { readFileReturnJson, getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js'; - -import { fileURLToPath } from "node:url"; -import { dirname } from "node:path"; -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -const baseConfig = getConfig(); - -const config = { - ...baseConfig, - query: "quintessential lodging near running trails, eateries, retail", - collectionName: "hotels_hnsw", - indexName: "vectorIndex_hnsw", -}; - -async function main() { - const { aiClient, dbClient } = getClientsPasswordless(); - - try { - if (!aiClient) { - throw new Error('AI client is not configured. Please check your environment variables.'); - } - if (!dbClient) { - throw new Error('Database client is not configured. Please check your environment variables.'); - } - - await dbClient.connect(); - const db = dbClient.db(config.dbName); - const collection = await db.createCollection(config.collectionName); - console.log('Created collection:', config.collectionName); - - const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); - const insertSummary = await insertData(config, collection, data); - - // Create the HNSW vector index - const indexOptions = { - createIndexes: config.collectionName, - indexes: [ - { - name: config.indexName, - key: { - [config.embeddedField]: 'cosmosSearch' - }, - cosmosSearchOptions: { - kind: 'vector-hnsw', - m: 16, - efConstruction: 64, - similarity: config.similarity, - dimensions: config.embeddingDimensions - } - } - ] - }; - const vectorIndexSummary = await db.command(indexOptions); - console.log('Created vector index:', config.indexName); - - // Create embedding for the query - const createEmbeddedForQueryResponse = await aiClient.embeddings.create({ - model: config.deployment, - input: [config.query] - }); - - // Perform the vector similarity 
search - const searchResults = await collection.aggregate([ - { - $search: { - cosmosSearch: { - vector: createEmbeddedForQueryResponse.data[0].embedding, - path: config.embeddedField, - k: 5 - } - } - }, - { - $project: { - score: { - $meta: "searchScore" - }, - document: "$$ROOT" - } - } - ]).toArray(); - - printSearchResults(insertSummary, vectorIndexSummary, searchResults); - - } catch (error) { - console.error('App failed:', error); - process.exitCode = 1; - } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); - } -} - -main().catch(error => { - console.error('Unhandled error:', error); - process.exitCode = 1; -}); diff --git a/ai/select-algorithm-typescript/src/ivf.ts b/ai/select-algorithm-typescript/src/ivf.ts deleted file mode 100644 index 7df1520..0000000 --- a/ai/select-algorithm-typescript/src/ivf.ts +++ /dev/null @@ -1,100 +0,0 @@ -import path from 'path'; -import { readFileReturnJson, getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js'; - -import { fileURLToPath } from "node:url"; -import { dirname } from "node:path"; -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -const baseConfig = getConfig(); - -const config = { - ...baseConfig, - query: "quintessential lodging near running trails, eateries, retail", - collectionName: "hotels_ivf", - indexName: "vectorIndex_ivf", -}; - -async function main() { - const { aiClient, dbClient } = getClientsPasswordless(); - - try { - if (!aiClient) { - throw new Error('AI client is not configured. Please check your environment variables.'); - } - if (!dbClient) { - throw new Error('Database client is not configured. 
Please check your environment variables.'); - } - - await dbClient.connect(); - const db = dbClient.db(config.dbName); - const collection = await db.createCollection(config.collectionName); - console.log('Created collection:', config.collectionName); - - const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); - const insertSummary = await insertData(config, collection, data); - - // Create the IVF vector index - const indexOptions = { - createIndexes: config.collectionName, - indexes: [ - { - name: config.indexName, - key: { - [config.embeddedField]: 'cosmosSearch' - }, - cosmosSearchOptions: { - kind: 'vector-ivf', - numLists: 10, - similarity: config.similarity, - dimensions: config.embeddingDimensions - } - } - ] - }; - const vectorIndexSummary = await db.command(indexOptions); - console.log('Created vector index:', config.indexName); - - // Create embedding for the query - const createEmbeddedForQueryResponse = await aiClient.embeddings.create({ - model: config.deployment, - input: [config.query] - }); - - // Perform the vector similarity search - const searchResults = await collection.aggregate([ - { - $search: { - cosmosSearch: { - vector: createEmbeddedForQueryResponse.data[0].embedding, - path: config.embeddedField, - k: 5 - } - } - }, - { - $project: { - score: { - $meta: "searchScore" - }, - document: "$$ROOT" - } - } - ]).toArray(); - - printSearchResults(insertSummary, vectorIndexSummary, searchResults); - - } catch (error) { - console.error('App failed:', error); - process.exitCode = 1; - } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); - } -} - -main().catch(error => { - console.error('Unhandled error:', error); - process.exitCode = 1; -}); From edcfe2ab1d72219e72aa3564bf3876b50fcb6de3 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Thu, 30 Apr 2026 07:51:28 -0700 Subject: [PATCH 7/9] Standardize collection 
lifecycle: conditional drop at start, always drop at end All 10 sample directories now follow the same pattern: - START: conditionally drop collection only if it exists - END: always drop collection for cleanup (in finally/defer block) Languages updated: TypeScript, Python, Go, Java, .NET Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/src/CompareAll.cs | 10 +- ai/select-algorithm-go/src/compare_all.go | 15 +- .../selectalgorithm/CompareAll.java | 139 +++++++++--------- ai/select-algorithm-python/src/compare_all.py | 7 +- .../Services/VectorSearchService.cs | 48 ++++-- ai/vector-search-go/src/diskann.go | 31 ++-- ai/vector-search-go/src/hnsw.go | 31 ++-- ai/vector-search-go/src/ivf.go | 31 ++-- .../com/azure/documentdb/samples/DiskAnn.java | 33 +++-- .../com/azure/documentdb/samples/HNSW.java | 33 +++-- .../com/azure/documentdb/samples/IVF.java | 33 +++-- ai/vector-search-python/src/diskann.py | 14 +- ai/vector-search-python/src/hnsw.py | 14 +- ai/vector-search-python/src/ivf.py | 14 +- ai/vector-search-typescript/src/diskann.ts | 23 ++- ai/vector-search-typescript/src/hnsw.ts | 23 ++- ai/vector-search-typescript/src/ivf.ts | 23 ++- 17 files changed, 354 insertions(+), 168 deletions(-) diff --git a/ai/select-algorithm-dotnet/src/CompareAll.cs b/ai/select-algorithm-dotnet/src/CompareAll.cs index a29704c..d8af191 100644 --- a/ai/select-algorithm-dotnet/src/CompareAll.cs +++ b/ai/select-algorithm-dotnet/src/CompareAll.cs @@ -37,9 +37,13 @@ public static void Run() { var database = mongoClient.GetDatabase(databaseName); - // Drop collection for a clean comparison - database.DropCollection("hotels"); - Console.WriteLine("Dropped existing 'hotels' collection (if any)"); + // Drop collection if it already exists (clean start) + var collectionNames = database.ListCollectionNames().ToList(); + if (collectionNames.Contains("hotels")) + { + database.DropCollection("hotels"); + Console.WriteLine("Dropped existing 'hotels' 
collection."); + } var collection = database.GetCollection("hotels"); diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go index 463e55d..c873e18 100644 --- a/ai/select-algorithm-go/src/compare_all.go +++ b/ai/select-algorithm-go/src/compare_all.go @@ -47,15 +47,18 @@ func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, fmt.Printf("Top-K: %d\n", topK) fmt.Printf("Verbose: %v\n", verbose) - // 1. Drop collection for clean comparison, then load data + // 1. Drop collection if it exists for clean comparison, then load data database := dbClient.Database(config.DatabaseName) collection := database.Collection("hotels") - // Drop existing collection for a clean comparison - if err := collection.Drop(ctx); err != nil { - fmt.Printf("Note: could not drop collection (may not exist): %v\n", err) - } else { - fmt.Println("Dropped existing 'hotels' collection") + // Drop existing collection if it exists (clean start) + names, _ := database.ListCollectionNames(ctx, bson.M{"name": "hotels"}) + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + fmt.Printf("Note: could not drop collection: %v\n", err) + } else { + fmt.Println("Dropped existing 'hotels' collection") + } } // Ensure cleanup on exit diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java index ef8d55a..7cbf094 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -49,80 +49,85 @@ public static void run() { MongoDatabase database = mongoClient.getDatabase(databaseName); MongoCollection collection = database.getCollection(COLLECTION_NAME); - // Load data ONCE into the single collection - System.out.println(" Loading data from: " + dataFile); 
- List data = Utils.readJsonFile(dataFile); - System.out.printf(" Loaded %d documents%n", data.size()); - - collection.drop(); - System.out.println(" Collection reset."); - Utils.insertData(collection, data, 100); - - // Generate ONE embedding for the query (reused for all 9 searches) - OpenAIClient aiClient = Utils.getOpenAIClient(); - System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); - List queryVector = Utils.getEmbedding(aiClient, queryText, model); - System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); - - // Convert to doubles for BSON - List vectorAsDoubles = queryVector.stream() - .map(Float::doubleValue) - .toList(); - - // Create all 9 indexes idempotently - System.out.println(" Creating 9 vector indexes..."); - for (String algo : ALGORITHMS) { - for (String metric : METRICS) { - createIndex(collection, vectorField, dimensions, algo, metric); + try { + // Load data ONCE into the single collection + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println(" Dropped existing collection."); } - } - System.out.println(" All indexes created.\n"); - - // Run searches sequentially for fair timing - System.out.println(" Running searches..."); - for (String algo : ALGORITHMS) { - for (String metric : METRICS) { - String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); - - long startNs = System.nanoTime(); - List searchResults = performSearch( - collection, vectorAsDoubles, vectorField, topK); - long elapsedNs = System.nanoTime() - startNs; - double elapsedMs = elapsedNs / 1_000_000.0; - - // Extract top result info - String topHotel = "-"; - double topScore = 0.0; - if (!searchResults.isEmpty()) { - 
Document top = searchResults.get(0); - topHotel = top.getString("HotelName") != null - ? top.getString("HotelName") : "-"; - topScore = top.getDouble("score") != null - ? top.getDouble("score") : 0.0; + Utils.insertData(collection, data, 100); + + // Generate ONE embedding for the query (reused for all 9 searches) + OpenAIClient aiClient = Utils.getOpenAIClient(); + System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); + List queryVector = Utils.getEmbedding(aiClient, queryText, model); + System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); + + // Convert to doubles for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + // Create all 9 indexes idempotently + System.out.println(" Creating 9 vector indexes..."); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + createIndex(collection, vectorField, dimensions, algo, metric); } + } + System.out.println(" All indexes created.\n"); + + // Run searches sequentially for fair timing + System.out.println(" Running searches..."); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + long startNs = System.nanoTime(); + List searchResults = performSearch( + collection, vectorAsDoubles, vectorField, topK); + long elapsedNs = System.nanoTime() - startNs; + double elapsedMs = elapsedNs / 1_000_000.0; + + // Extract top result info + String topHotel = "-"; + double topScore = 0.0; + if (!searchResults.isEmpty()) { + Document top = searchResults.get(0); + topHotel = top.getString("HotelName") != null + ? top.getString("HotelName") : "-"; + topScore = top.getDouble("score") != null + ? 
top.getDouble("score") : 0.0; + } - results.add(new SearchResult( - algo.toUpperCase(), metric, indexName, - elapsedMs, searchResults.size(), topHotel, topScore)); - - if (verbose) { - System.out.printf(" [%s] %d results in %.2f ms%n", - indexName, searchResults.size(), elapsedMs); - for (int i = 0; i < searchResults.size(); i++) { - Document doc = searchResults.get(i); - System.out.printf(" %d. %s (%.4f)%n", - i + 1, - doc.getString("HotelName"), - doc.getDouble("score")); + results.add(new SearchResult( + algo.toUpperCase(), metric, indexName, + elapsedMs, searchResults.size(), topHotel, topScore)); + + if (verbose) { + System.out.printf(" [%s] %d results in %.2f ms%n", + indexName, searchResults.size(), elapsedMs); + for (int i = 0; i < searchResults.size(); i++) { + Document doc = searchResults.get(i); + System.out.printf(" %d. %s (%.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + } } } } + } finally { + // Cleanup: always drop the comparison collection + System.out.println("\n Cleanup: dropping comparison collection..."); + collection.drop(); + System.out.println(" Cleanup: dropped collection 'hotels'"); } - - // Cleanup: drop the comparison collection - System.out.println("\n Cleanup: dropping comparison collection..."); - collection.drop(); - System.out.println(" Cleanup: dropped collection 'hotels'"); } // Print comparison table diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py index 1aac549..8539898 100644 --- a/ai/select-algorithm-python/src/compare_all.py +++ b/ai/select-algorithm-python/src/compare_all.py @@ -171,9 +171,10 @@ def main(): try: database = mongo_client[config["database_name"]] - # Drop collection for a clean comparison - database.drop_collection("hotels") - print("Dropped existing 'hotels' collection (if any)") + # Drop collection if it already exists (clean start) + if "hotels" in database.list_collection_names(): + database.drop_collection("hotels") + 
print("Dropped existing 'hotels' collection") # Create fresh collection and load data collection = database["hotels"] diff --git a/ai/vector-search-dotnet/Services/VectorSearchService.cs b/ai/vector-search-dotnet/Services/VectorSearchService.cs index e8505a1..a1aa841 100644 --- a/ai/vector-search-dotnet/Services/VectorSearchService.cs +++ b/ai/vector-search-dotnet/Services/VectorSearchService.cs @@ -43,24 +43,32 @@ public VectorSearchService(ILogger logger, MongoDbService m /// The vector search algorithm to use (IVF, HNSW, or DiskANN) public async Task RunSearchAsync(VectorIndexType indexType) { + _logger.LogInformation($"Starting {indexType} vector search workflow"); + + // Setup collection + var collectionSuffix = indexType switch + { + VectorIndexType.IVF => "ivf", + VectorIndexType.HNSW => "hnsw", + VectorIndexType.DiskANN => "diskann", + _ => throw new ArgumentException($"Unknown index type: {indexType}") + }; + var collectionName = $"hotels_{collectionSuffix}"; + var indexName = $"vectorIndex_{collectionSuffix}"; + + // Drop collection if it already exists (clean start) + var database = _mongoService.GetDatabase(_config.VectorSearch.DatabaseName); + var existingCollections = (await database.ListCollectionNamesAsync()).ToList(); + if (existingCollections.Contains(collectionName)) + { + await _mongoService.DropCollectionAsync(_config.VectorSearch.DatabaseName, collectionName); + } + try { - _logger.LogInformation($"Starting {indexType} vector search workflow"); - - // Setup collection - var collectionSuffix = indexType switch - { - VectorIndexType.IVF => "ivf", - VectorIndexType.HNSW => "hnsw", - VectorIndexType.DiskANN => "diskann", - _ => throw new ArgumentException($"Unknown index type: {indexType}") - }; - var collectionName = $"hotels_{collectionSuffix}"; - var indexName = $"vectorIndex_{collectionSuffix}"; - var collection = _mongoService.GetCollection(_config.VectorSearch.DatabaseName, collectionName); - // Load data from file if collection is empty + 
// Load data from file var assemblyLocation = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location) ?? string.Empty; var dataFilePath = Path.Combine(assemblyLocation, _config.DataFiles.WithVectors); await _mongoService.LoadDataIfNeededAsync(collection, dataFilePath); @@ -137,6 +145,18 @@ await _mongoService.CreateVectorIndexAsync( _logger.LogError(ex, $"{indexType} vector search failed"); throw; } + finally + { + // Cleanup: always drop the collection + try + { + await _mongoService.DropCollectionAsync(_config.VectorSearch.DatabaseName, collectionName); + } + catch (Exception ex) + { + _logger.LogWarning(ex, $"Cleanup warning: failed to drop collection '{collectionName}'"); + } + } } /// diff --git a/ai/vector-search-go/src/diskann.go b/ai/vector-search-go/src/diskann.go index 8991f58..e4536a3 100644 --- a/ai/vector-search-go/src/diskann.go +++ b/ai/vector-search-go/src/diskann.go @@ -154,6 +154,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_diskann") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_diskann"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_diskann'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_diskann'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_diskann'") + } + }() + // Load data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -177,15 +199,6 @@ func main() { // Insert data into collection fmt.Printf("\nInserting data into 
collection '%s'...\n", config.CollectionName) - // Clear existing data to ensure clean state - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert the hotel data stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-go/src/hnsw.go b/ai/vector-search-go/src/hnsw.go index ab6977c..93bc5bd 100644 --- a/ai/vector-search-go/src/hnsw.go +++ b/ai/vector-search-go/src/hnsw.go @@ -155,6 +155,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_hnsw") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_hnsw"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_hnsw'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_hnsw'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_hnsw'") + } + }() + // Load hotel data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -178,15 +200,6 @@ func main() { // Insert data into MongoDB collection fmt.Printf("\nPreparing collection '%s'...\n", config.CollectionName) - // Clear any existing data to start fresh - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if 
deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert hotel data with embeddings stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-go/src/ivf.go b/ai/vector-search-go/src/ivf.go index 2aeddd8..2861845 100644 --- a/ai/vector-search-go/src/ivf.go +++ b/ai/vector-search-go/src/ivf.go @@ -152,6 +152,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_ivf") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_ivf"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_ivf'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_ivf'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_ivf'") + } + }() + // Load hotel data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -175,15 +197,6 @@ func main() { // Prepare collection with fresh data fmt.Printf("\nPreparing collection '%s'...\n", config.CollectionName) - // Remove any existing data for clean state - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert hotel data with embeddings stats, err := InsertData(ctx, collection, documentsWithEmbeddings, 
config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java index 676630b..14a37c6 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git 
a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java index 146fc27..a8b3be7 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java 
b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java index e800107..9c23aec 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git a/ai/vector-search-python/src/diskann.py b/ai/vector-search-python/src/diskann.py index 81720ab..fdef640 100644 --- a/ai/vector-search-python/src/diskann.py 
+++ b/ai/vector-search-python/src/diskann.py @@ -142,6 +142,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -200,8 +207,13 @@ def main(): raise finally: - # Close the MongoDB client + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-python/src/hnsw.py b/ai/vector-search-python/src/hnsw.py index 9352220..fcc9e72 100644 --- a/ai/vector-search-python/src/hnsw.py +++ b/ai/vector-search-python/src/hnsw.py @@ -136,6 +136,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load hotel data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -196,8 +203,13 @@ def main(): raise finally: - # Clean up MongoDB connection + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + 
print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-python/src/ivf.py b/ai/vector-search-python/src/ivf.py index f39c0d2..04a0794 100644 --- a/ai/vector-search-python/src/ivf.py +++ b/ai/vector-search-python/src/ivf.py @@ -133,6 +133,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load hotel data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -191,8 +198,13 @@ def main(): raise finally: - # Ensure MongoDB connection is properly closed + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-typescript/src/diskann.ts b/ai/vector-search-typescript/src/diskann.ts index 96b547c..b756405 100644 --- a/ai/vector-search-typescript/src/diskann.ts +++ b/ai/vector-search-typescript/src/diskann.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + 
} + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -95,9 +103,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } diff --git a/ai/vector-search-typescript/src/hnsw.ts b/ai/vector-search-typescript/src/hnsw.ts index 771146c..fede64e 100644 --- a/ai/vector-search-typescript/src/hnsw.ts +++ b/ai/vector-search-typescript/src/hnsw.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + } + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -95,9 +103,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection 
and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } diff --git a/ai/vector-search-typescript/src/ivf.ts b/ai/vector-search-typescript/src/ivf.ts index e81ace8..908ae1c 100644 --- a/ai/vector-search-typescript/src/ivf.ts +++ b/ai/vector-search-typescript/src/ivf.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + } + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -96,9 +104,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } From b7363505d215d461bae64f3d0464ef9422f438e5 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Thu, 30 Apr 2026 07:53:56 
-0700 Subject: [PATCH 8/9] Add validation workflow, copilot instructions, and collection lifecycle standardization - Add dual-mode GitHub Actions workflow (build-only for PR/push, full-run for manual dispatch) - Add copilot instruction files for all 5 languages (TypeScript, Python, Go, Java, .NET) - Document collection lifecycle convention (conditional drop-at-start, always-drop-at-end) - Document naming convention requirement for parallel CI safety - Update select-algorithm-typescript: remove IP metric, add multi-query support, add cleanup - Add bulk insert and env var documentation per language Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/copilot-instructions-dotnet.md | 135 ++++++ .github/copilot-instructions-go.md | 133 +++++ .github/copilot-instructions-java.md | 122 +++++ .github/copilot-instructions-python.md | 119 +++++ .github/copilot-instructions-typescript.md | 114 +++++ .github/copilot-instructions.md | 148 ++++++ .github/workflows/validate-samples.yml | 455 +++++++++++++++--- ai/select-algorithm-typescript/README.md | 29 +- .../src/compare-all.ts | 364 ++++++++++---- 9 files changed, 1437 insertions(+), 182 deletions(-) create mode 100644 .github/copilot-instructions-dotnet.md create mode 100644 .github/copilot-instructions-go.md create mode 100644 .github/copilot-instructions-java.md create mode 100644 .github/copilot-instructions-python.md create mode 100644 .github/copilot-instructions-typescript.md create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions-dotnet.md b/.github/copilot-instructions-dotnet.md new file mode 100644 index 0000000..4789eca --- /dev/null +++ b/.github/copilot-instructions-dotnet.md @@ -0,0 +1,135 @@ +# .NET (C#) Specific Instructions + +## Stack + +- .NET 8+ +- `MongoDB.Driver` for DocumentDB access +- `Azure.Identity` for DefaultAzureCredential +- `Azure.AI.OpenAI` for Azure OpenAI + +## File Structure + +``` +ai/select-algorithm-dotnet/ +├── src/ +│ 
├── CompareAll.cs +│ └── Utils.cs +├── select-algorithm-dotnet.csproj +└── README.md + +ai/vector-search-dotnet/ +├── src/ +│ ├── Ivf.cs +│ ├── Hnsw.cs +│ ├── Diskann.cs +│ └── Utils.cs +├── vector-search-dotnet.csproj +└── README.md +``` + +## Naming Conventions + +- Files: `PascalCase.cs` +- Methods: `PascalCase` +- Constants: `PascalCase` +- Private fields: `_camelCase` +- Local variables: `camelCase` +- Namespaces: `Azure.DocumentDB.Samples` + +## Authentication Pattern + +```csharp +using Azure.Identity; +using MongoDB.Driver; +using MongoDB.Driver.Authentication.Oidc; + +var credential = new DefaultAzureCredential(); +var oidcCallback = new OidcCallback(async (parameters, cancellationToken) => +{ + var token = await credential.GetTokenAsync( + new TokenRequestContext(new[] { "https://ossrdbms-aad.database.windows.net/.default" }), + cancellationToken); + return new OidcAccessToken(token.Token, token.ExpiresOn); +}); +``` + +## $search Syntax + +```csharp +// CORRECT +var searchStage = new BsonDocument("$search", + new BsonDocument("cosmosSearch", + new BsonDocument + { + { "vector", new BsonArray(queryVector) }, + { "path", embeddedField }, + { "k", topK } + })); + +// WRONG — do NOT add cosmosSearchOptions to the $search stage +``` + +## Bulk Insert + +Use `collection.InsertManyAsync()` with `InsertManyOptions { IsOrdered = false }`: + +```csharp +using MongoDB.Driver; + +try +{ + await collection.InsertManyAsync(batch, new InsertManyOptions { IsOrdered = false }); + insertedCount += batch.Count; +} +catch (MongoBulkWriteException e) +{ + // Partial failure — some docs inserted + insertedCount += (int)e.Result.InsertedCount; + failedCount += batch.Count - (int)e.Result.InsertedCount; +} +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches (`await Task.Delay(200)`) +- Catch `MongoBulkWriteException` for partial failure handling +- Always use the async variant (`InsertManyAsync`) + +## Key Patterns + +- 
Use `Environment.GetEnvironmentVariable("VAR") ?? "default"` for config +- Use `using` statements for disposable resources +- Use `try/finally` for collection cleanup +- Async/await throughout (use `Async` suffix on method names) +- Match TypeScript output format exactly + +## Environment Variables + +- Use `IConfiguration` with layered sources: `appsettings.json` → environment variables +- Provide `appsettings.json` with placeholder structure (committed) and gitignore `appsettings.local.json` +- Environment variables override JSON config values +- Bind to strongly-typed configuration classes (`AppConfiguration`, `AzureOpenAIConfiguration`, etc.) + +```csharp +var configuration = new ConfigurationBuilder() + .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true) + .AddEnvironmentVariables() + .Build(); + +var appConfig = configuration.Get<AppConfiguration>() + ?? throw new InvalidOperationException("Failed to load configuration"); +``` + +- Configuration class hierarchy: + - `AppConfiguration` → root + - `AzureOpenAIConfiguration` → endpoint, model, apiVersion + - `MongoDBConfiguration` → connectionString, clusterName, loadBatchSize + - `EmbeddingConfiguration` → fieldToEmbed, embeddedField, dimensions, batchSize + - `VectorSearchConfiguration` → query, databaseName, topK + +- Include `Microsoft.Extensions.Configuration` packages in `.csproj` + +## Build & Run + +```bash +dotnet run +``` diff --git a/.github/copilot-instructions-go.md b/.github/copilot-instructions-go.md new file mode 100644 index 0000000..16533ee --- /dev/null +++ b/.github/copilot-instructions-go.md @@ -0,0 +1,133 @@ +# Go-Specific Instructions + +## Stack + +- Go 1.21+ +- `go.mongodb.org/mongo-driver/v2` for DocumentDB access +- `github.com/Azure/azure-sdk-for-go/sdk/azidentity` for DefaultAzureCredential +- `github.com/openai/openai-go` for Azure OpenAI + +## File Structure + +``` +ai/select-algorithm-go/ +├── src/ +│ ├── compare_all.go # Multi-query comparison runner +│ └── utils.go # Shared
utilities +├── go.mod +├── go.sum +└── README.md + +ai/vector-search-go/ +├── src/ +│ ├── ivf.go +│ ├── hnsw.go +│ ├── diskann.go +│ └── utils.go +├── go.mod +├── go.sum +└── README.md +``` + +## Naming Conventions + +- Files: `snake_case.go` +- Functions: `PascalCase` (exported), `camelCase` (unexported) +- Constants: `PascalCase` or `camelCase` +- Packages: `lowercase` + +## Authentication Pattern + +```go +import ( + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" +) + +credential, _ := azidentity.NewDefaultAzureCredential(nil) +// Use OIDC callback with DocumentDB scope +``` + +## $search Syntax + +```go +// CORRECT +searchStage := bson.D{{Key: "$search", Value: bson.D{ + {Key: "cosmosSearch", Value: bson.D{ + {Key: "vector", Value: queryVector}, + {Key: "path", Value: embeddedField}, + {Key: "k", Value: topK}, + }}, +}}} + +// WRONG — do NOT include cosmosSearchOptions in the $search stage +``` + +## Bulk Insert + +Use `collection.InsertMany()` with `SetOrdered(false)` and handle `BulkWriteException`: + +```go +result, err := collection.InsertMany(ctx, batch, options.InsertMany().SetOrdered(false)) +if err != nil { + if bulkErr, ok := err.(mongo.BulkWriteException); ok { + // Partial failure — some docs inserted, some failed + failed := len(bulkErr.WriteErrors) + insertedCount += len(batch) - failed + } else { + return fmt.Errorf("batch insert failed: %w", err) + } +} else { + insertedCount += len(result.InsertedIDs) +} +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches (`time.Sleep(200 * time.Millisecond)`) +- Type-assert `mongo.BulkWriteException` for partial failure handling + +## Key Patterns + +- Use `os.Getenv("VAR")` with fallback helper for config +- Always check errors explicitly — no panic in sample code +- Use `context.Background()` or appropriate timeout contexts +- Use `defer` for cleanup (drop
collections) +- Match TypeScript output format exactly + +## Environment Variables + +- Use `github.com/joho/godotenv` to load from `.env` file at startup +- Provide a `.env.example` file in each sample directory +- Access pattern: `os.Getenv("VAR")` with a helper function for defaults +- Call `godotenv.Load()` early — log a warning if `.env` is missing but don't fail (env vars may be set externally) + +```go +import ( + "os" + "github.com/joho/godotenv" +) + +func init() { + err := godotenv.Load() + if err != nil { + fmt.Println("No .env file found, using environment variables") + } +} + +func getEnvOrDefault(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} +``` + +- Include `github.com/joho/godotenv` in `go.mod` + +## Build & Run + +```bash +cd src +go run . +``` diff --git a/.github/copilot-instructions-java.md b/.github/copilot-instructions-java.md new file mode 100644 index 0000000..35cbf11 --- /dev/null +++ b/.github/copilot-instructions-java.md @@ -0,0 +1,122 @@ +# Java-Specific Instructions + +## Stack + +- Java 17+ +- MongoDB Java Driver (`org.mongodb:mongodb-driver-sync`) +- Azure Identity (`com.azure:azure-identity`) +- Azure OpenAI (`com.azure:azure-ai-openai`) + +## File Structure + +``` +ai/select-algorithm-java/ +├── src/main/java/com/azure/documentdb/sample/ +│ ├── CompareAll.java +│ └── Utils.java +├── pom.xml +└── README.md + +ai/vector-search-java/ +├── src/main/java/com/azure/documentdb/sample/ +│ ├── Ivf.java +│ ├── Hnsw.java +│ ├── Diskann.java +│ └── Utils.java +├── pom.xml +└── README.md +``` + +## Naming Conventions + +- Files: `PascalCase.java` +- Methods: `camelCase` +- Constants: `UPPER_SNAKE_CASE` +- Classes: `PascalCase` +- Packages: `com.azure.documentdb.sample` + +## Authentication Pattern + +```java +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; + +DefaultAzureCredential 
credential = new DefaultAzureCredentialBuilder().build(); +MongoCredential mongoCredential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", (context) -> { + AccessToken token = credential.getToken( + new TokenRequestContext().addScopes("https://ossrdbms-aad.database.windows.net/.default") + ).block(); + return new OidcCallbackResult(token.getToken()); + }); +``` + +## $search Syntax + +```java +// CORRECT +Document searchStage = new Document("$search", + new Document("cosmosSearch", + new Document("vector", queryVector) + .append("path", embeddedField) + .append("k", topK))); + +// WRONG — do NOT add cosmosSearchOptions to the $search stage +``` + +## Bulk Insert + +Use `collection.insertMany()` with `InsertManyOptions().ordered(false)`: + +```java +import com.mongodb.client.model.InsertManyOptions; +import com.mongodb.MongoBulkWriteException; + +try { + collection.insertMany(documents, new InsertManyOptions().ordered(false)); + insertedCount += documents.size(); +} catch (MongoBulkWriteException e) { + // Partial failure — some docs inserted + insertedCount += e.getWriteResult().getInsertedCount(); + failedCount += documents.size() - e.getWriteResult().getInsertedCount(); +} +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches (`Thread.sleep(200)`) +- Catch `MongoBulkWriteException` for partial failure handling + +## Key Patterns + +- Use `System.getenv("VAR")` with null check for config +- Use try-with-resources for MongoClient +- Use `try/finally` for collection cleanup +- Match TypeScript output format exactly + +## Environment Variables + +- Read directly via `System.getenv("VAR")` — **no dotenv library** +- Provide a `.env.example` file in each sample directory for documentation purposes +- Access pattern: `System.getenv("VAR")` with null check or ternary for defaults +- Validate required vars early and fail with a clear message + +```java +var clusterName = 
System.getenv("MONGO_CLUSTER_NAME"); +var endpoint = System.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); +var model = System.getenv("AZURE_OPENAI_EMBEDDING_MODEL"); +var batchSizeStr = System.getenv("LOAD_SIZE_BATCH"); +var batchSize = batchSizeStr != null ? Integer.parseInt(batchSizeStr) : 100; + +if (clusterName == null || endpoint == null) { + throw new IllegalStateException("Missing required environment variables: MONGO_CLUSTER_NAME, AZURE_OPENAI_EMBEDDING_ENDPOINT"); +} +``` + +- Users set env vars via shell export, IDE run configuration, or azd-provided `.env` + +## Build & Run + +```bash +mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.sample.CompareAll" +``` diff --git a/.github/copilot-instructions-python.md b/.github/copilot-instructions-python.md new file mode 100644 index 0000000..2605d13 --- /dev/null +++ b/.github/copilot-instructions-python.md @@ -0,0 +1,119 @@ +# Python-Specific Instructions + +## Stack + +- Python 3.10+ +- `pymongo` for DocumentDB access +- `openai` SDK (AzureOpenAI class) +- `azure-identity` for DefaultAzureCredential + +## File Structure + +``` +ai/select-algorithm-python/ +├── src/ +│ ├── compare_all.py # Multi-query comparison runner +│ └── utils.py # Shared utilities +├── requirements.txt +└── README.md + +ai/vector-search-python/ +├── src/ +│ ├── ivf.py +│ ├── hnsw.py +│ ├── diskann.py +│ ├── create_embeddings.py +│ └── utils.py +├── requirements.txt +└── README.md +``` + +## Naming Conventions + +- Files: `snake_case.py` +- Functions: `snake_case` +- Constants: `UPPER_SNAKE_CASE` +- Classes: `PascalCase` + +## Authentication Pattern + +```python +from azure.identity import DefaultAzureCredential +from pymongo import MongoClient +from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult + +class AzureIdentityCallback(OIDCCallback): + def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: + credential = DefaultAzureCredential() + token = 
credential.get_token("https://ossrdbms-aad.database.windows.net/.default") + return OIDCCallbackResult(access_token=token.token, expires_in_seconds=300) +``` + +## $search Syntax + +```python +# CORRECT +pipeline = [ + {"$search": {"cosmosSearch": {"vector": query_vector, "path": field, "k": top_k}}}, + {"$project": {"similarityScore": {"$meta": "searchScore"}, "document": "$$ROOT"}} +] + +# WRONG — do NOT use cosmosSearchOptions in $search +# pipeline = [{"$search": {"cosmosSearch": {...}, "cosmosSearchOptions": {...}}}] +``` + +## Bulk Insert + +Use `collection.bulk_write()` with `InsertOne` operations and `ordered=False`: + +```python +from pymongo import InsertOne +from pymongo.errors import BulkWriteError + +operations = [InsertOne(document) for document in batch] +try: + result = collection.bulk_write(operations, ordered=False) + inserted_count += result.inserted_count +except BulkWriteError as e: + inserted_count += e.details.get('nInserted', 0) + failed_count += len(batch) - e.details.get('nInserted', 0) +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches (`time.sleep(0.2)`) +- Handle `BulkWriteError` for partial failures + +## Key Patterns + +- Use `os.environ.get("VAR", "default")` for config +- Type hints on all function signatures +- Use `try/finally` for collection cleanup +- Match TypeScript output format exactly (table layout, emoji, section headers) + +## Environment Variables + +- Use `python-dotenv` to load from `.env` file at startup +- Provide a `.env.example` file in each sample directory +- Access pattern: `os.environ.get("VAR", "default")` for optional, `os.environ["VAR"]` for required +- Call `load_dotenv()` at the top of the entry point before accessing any env vars + +```python +from dotenv import load_dotenv +import os + +load_dotenv() + +endpoint = os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"] +model = os.environ["AZURE_OPENAI_EMBEDDING_MODEL"] +cluster_name = 
os.environ["MONGO_CLUSTER_NAME"] +batch_size = int(os.environ.get("LOAD_SIZE_BATCH", "100")) +``` + +- Include `python-dotenv` in `requirements.txt` + +## Build & Run + +```bash +pip install -r requirements.txt +python src/compare_all.py +``` diff --git a/.github/copilot-instructions-typescript.md b/.github/copilot-instructions-typescript.md new file mode 100644 index 0000000..8d944b1 --- /dev/null +++ b/.github/copilot-instructions-typescript.md @@ -0,0 +1,114 @@ +# TypeScript-Specific Instructions + +> This is the **reference implementation**. Other languages must match its behavior. + +## Stack + +- Node.js with ESM modules (`"type": "module"` in package.json) +- TypeScript 5+ with strict mode +- `mongodb` driver (native MongoDB client) +- `openai` SDK (AzureOpenAI class) +- `@azure/identity` for DefaultAzureCredential + +## File Structure + +``` +ai/select-algorithm-typescript/ +├── src/ +│ ├── compare-all.ts # Multi-query comparison runner +│ ├── utils.ts # Shared utilities (auth, config, insert, print) +│ └── ... 
+├── package.json +├── tsconfig.json +└── README.md + +ai/vector-search-typescript/ +├── src/ +│ ├── ivf.ts # Individual IVF example +│ ├── hnsw.ts # Individual HNSW example +│ ├── diskann.ts # Individual DiskANN example +│ ├── create-embeddings.ts +│ ├── utils.ts +│ └── showIndexes.ts +├── package.json +├── tsconfig.json +└── README.md +``` + +## Authentication Pattern + +```typescript +import { DefaultAzureCredential, getBearerTokenProvider, TokenCredential } from '@azure/identity'; +import { MongoClient, OIDCCallbackParams, OIDCResponse } from 'mongodb'; + +// OIDC callback for passwordless auth +const AzureIdentityTokenCallback = async ( + params: OIDCCallbackParams, + credential: TokenCredential +): Promise<OIDCResponse> => { + const tokenResponse = await credential.getToken([ + 'https://ossrdbms-aad.database.windows.net/.default' + ]); + return { + accessToken: tokenResponse?.token || '', + expiresInSeconds: Math.max(0, Math.floor(((tokenResponse?.expiresOnTimestamp ?? 0) - Date.now()) / 1000)) + }; +}; +``` + +## ESM Considerations + +```typescript +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); +``` + +## Environment Variables + +- Loaded via `process.env` directly — **no dotenv library** in production code +- Provide a `.env.example` file in each sample directory showing all required vars with placeholder values +- A `.env` file at the sample root is used for local development (gitignored) +- Access pattern: `process.env.VAR_NAME!` (non-null assertion) for required vars +- For optional vars with defaults: `process.env.VAR_NAME || 'default'` +- Validate all required vars at startup — throw with a clear error listing missing vars + +```typescript +const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!; +const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; +const clusterName = process.env.MONGO_CLUSTER_NAME!; + +if (!endpoint || !deployment || !clusterName) { +
throw new Error('Missing required environment variables: ...'); +} +``` + +## Build & Run + +```bash +npm install +npm run build # tsc +npm start # node dist/compare-all.js +``` + +## Bulk Insert + +Use `collection.insertMany()` with `ordered: false` for batch inserts: + +```typescript +const result = await collection.insertMany(batch, { ordered: false }); +inserted += result.insertedCount || 0; +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches to avoid rate limiting +- Handle partial failures gracefully (log failed count, continue) + +## Key Patterns + +- Use `interface` for data shapes (SearchResult, AlgorithmConfig) +- Use `const` arrays for ALGORITHMS and SIMILARITIES definitions +- Clean up collections in `finally` block +- Template literal strings for console output formatting diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..3474847 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,148 @@ +# Copilot Instructions for DocumentDB Samples + +## Repository Purpose + +This repo contains Azure DocumentDB (vCore) code samples demonstrating vector search capabilities across multiple languages. Each sample must work identically across all supported languages. 
+ +## Supported Languages + +- [TypeScript](.github/copilot-instructions-typescript.md) (reference implementation) +- [Python](.github/copilot-instructions-python.md) +- [Go](.github/copilot-instructions-go.md) +- [Java](.github/copilot-instructions-java.md) +- [.NET (C#)](.github/copilot-instructions-dotnet.md) + +## Architecture Rules + +### Authentication + +- **Always support two auth modes**: passwordless (DefaultAzureCredential with OIDC callback) AND connection string +- Passwordless is the primary path; connection string is fallback +- DocumentDB vCore uses MongoDB wire protocol — auth token scope is `https://ossrdbms-aad.database.windows.net/.default` + +### Azure OpenAI Integration + +- Use `text-embedding-3-small` (1536 dimensions) as the default embedding model +- Model deployment name comes from env var `AZURE_OPENAI_EMBEDDING_MODEL` +- Support both API key and DefaultAzureCredential for OpenAI client + +### DocumentDB Vector Search + +- **One vector index per field per collection** — this is a hard platform constraint +- When comparing multiple index types, use separate collections (one per algorithm×metric combination) +- Collection naming: `compare_{algorithm}_{metric}` (e.g., `compare_hnsw_cos`) +- Supported algorithms: `vector-ivf`, `vector-hnsw`, `vector-diskann` +- Supported metrics: `COS`, `L2` (IP is omitted — see below) + +### Why No Inner Product (IP) + +`text-embedding-3-small` produces unit-normalized vectors (magnitude ≈ 1). For normalized vectors: +- cosine similarity = dot(a,b) / (||a|| × ||b||) = dot(a,b) = inner product +- COS and IP always return identical results + +Including IP adds no insight and doubles comparison time. All samples use only COS and L2. + +### $search Query Syntax + +The correct MongoDB `$search` syntax for DocumentDB vector search is: + +``` +{ $search: { cosmosSearch: { vector: <query_vector>, path: "<vector_field>", k: <top_k> } } } +``` + +**DO NOT** use `cosmosSearchOptions` as a key in the `$search` stage.
That key is only valid in index creation commands. + +### Data + +- Shared dataset: `ai/data/Hotels_Vector.json` (50 documents with pre-computed embeddings) +- All samples reference this shared data file — do not duplicate data per language +- The `DescriptionVector` field contains the 1536-dimension embedding + +### Batch Insert + +- Always use bulk/batch insert (`insertMany` or equivalent) with `ordered: false` +- Default batch size: 100 (configurable via `LOAD_SIZE_BATCH` env var) +- Add a small delay between batches (200ms) to avoid rate limiting +- Handle partial failures gracefully (log failed count, continue) + +### Environment Variables + +All samples must support these env vars: + +| Variable | Purpose | +|----------|---------| +| `AZURE_DOCUMENTDB_CONNECTION_STRING` | MongoDB connection string | +| `AZURE_DOCUMENTDB_DATABASENAME` | Database name (default: `Hotels`) | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | +| `AZURE_OPENAI_EMBEDDING_MODEL` | Deployment name (e.g., `text-embedding-3-small`) | +| `AZURE_OPENAI_EMBEDDING_KEY` | API key (optional if using DefaultAzureCredential) | +| `AZURE_OPENAI_EMBEDDING_API_VERSION` | API version | +| `TOP_K` | Number of results to return (default: 5) | +| `LOAD_SIZE_BATCH` | Batch size for bulk insert (default: 100) | +| `QUERY_TEXT` | Single query override (optional) | +| `VERBOSE` | Enable verbose output (default: false) | + +### Sample Categories + +1. **vector-search-{lang}**: Basic vector search with individual algorithm samples (ivf.ts, hnsw.ts, diskann.ts) +2. **select-algorithm-{lang}**: Comparison runner that tests all algorithms × metrics with multi-query support + +### select-algorithm Comparison Runner Requirements + +The comparison runner (`compare-all`) must: + +1. **Multi-query support**: Run 5 diverse default queries (overridable via `QUERY_TEXT` for single) +2. 
**Adaptive table collapse**: When all algorithms return the same #1 result for a query, show collapsed metric-only view. When they disagree, show expanded algorithm×metric grid. +3. **Gap analysis**: Show the score gap between #1 and #2 results +4. **Per-query output**: Header with query text, then comparison table +5. **Summary**: Final divergence summary across all queries + +### Console Output Style + +- Use clear section headers with `\n` separation +- Tables with aligned columns (use padding) +- Emoji indicators: ✅ (agreement), ⚠️ (disagreement) +- Show document counts, embedding dimensions, and collection names during setup + +### Collection Lifecycle (REQUIRED) + +Every sample must follow this exact lifecycle — the validation workflow depends on it: + +1. **Start**: Check if collection exists → drop only if it does (defensive, handles prior crashes) +2. **End**: Always drop the collection in a `finally`/`defer` block (cleanup for next run) + +Language-specific patterns: + +| Language | Conditional drop at start | Always drop at end | +|----------|--------------------------|-------------------| +| TypeScript | `db.listCollections({name}).toArray()` → `db.dropCollection(name)` | `finally { db.dropCollection(name) }` | +| Python | `name in database.list_collection_names()` → `database.drop_collection(name)` | `finally: database.drop_collection(name)` | +| Go | `database.ListCollectionNames(ctx, bson.M{"name": name})` → `collection.Drop(ctx)` | `defer func() { collection.Drop(ctx) }()` | +| Java | `database.listCollectionNames().into(list).contains(name)` → `collection.drop()` | `finally { collection.drop() }` | +| .NET | `ListCollectionNamesAsync(filter)` → `DropCollectionAsync(name)` | `finally { DropCollectionAsync(name) }` | + +**Why this matters**: The CI workflow runs samples in parallel across languages. Without end-of-run cleanup, leftover collections cause name conflicts and flaky test failures. 
+ +### Collection Naming Convention (REQUIRED) + +Collection names must be unique per algorithm to avoid conflicts: + +- **vector-search samples**: `hotels_{algorithm}` (e.g., `hotels_diskann`, `hotels_hnsw`, `hotels_ivf`) +- **select-algorithm samples**: `compare_{algorithm}_{metric}` (e.g., `compare_hnsw_cos`, `compare_ivf_l2`) +- **Database**: Always `Hotels` +- **Index names**: `vectorIndex_{algorithm}` (e.g., `vectorIndex_diskann`) + +All languages must use identical collection/index names for a given algorithm. This enables the shared validation workflow to verify behavior consistency. + +### Error Handling + +- Graceful cleanup: drop created collections on error (use try/finally) +- Log but don't crash on individual batch insert failures +- Validate all required env vars at startup with clear error messages + +### Code Style + +- No unnecessary comments — only comment non-obvious decisions (like why IP is omitted) +- Use descriptive variable names over comments +- Keep functions focused — extract helpers for repeated patterns +- TypeScript is the reference implementation — other languages should match its behavior exactly diff --git a/.github/workflows/validate-samples.yml b/.github/workflows/validate-samples.yml index 7bd29ec..d35e962 100644 --- a/.github/workflows/validate-samples.yml +++ b/.github/workflows/validate-samples.yml @@ -1,100 +1,135 @@ +# ============================================================================= +# Validate Samples — End-to-end validation for all DocumentDB AI samples +# ============================================================================= +# +# PURPOSE: +# Validates that every sample in this repo compiles and (optionally) runs +# correctly against a live Azure DocumentDB + Azure OpenAI deployment. +# +# TWO MODES: +# 1. BUILD-ONLY (automatic) — Triggered on PR/push to ai/** paths. +# Compiles all 5 languages (TypeScript, Python, Go, Java, .NET) to catch +# syntax errors, missing imports, and type issues. 
No secrets needed. +# +# 2. FULL RUN (manual) — Triggered via workflow_dispatch ("Run workflow" button). +# Builds AND executes every sample against real Azure resources. +# Requires the SAMPLES_ENV_FILE repo secret (see setup below). +# Captures all stdout/stderr as downloadable artifacts. +# +# SETUP — Creating the SAMPLES_ENV_FILE secret: +# 1. Go to repo Settings > Secrets and variables > Actions +# 2. Click "New repository secret" +# 3. Name: SAMPLES_ENV_FILE +# 4. Value: paste your entire .env file contents, e.g.: +# AZURE_DOCUMENTDB_CONNECTION_STRING=mongodb+srv://... +# AZURE_DOCUMENTDB_DATABASENAME=quickstart_db +# AZURE_OPENAI_EMBEDDING_ENDPOINT=https://...openai.azure.com +# AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small +# AZURE_OPENAI_EMBEDDING_KEY=abc123... +# AZURE_OPENAI_EMBEDDING_API_VERSION=2024-06-01 +# TOP_K=3 +# LOAD_SIZE_BATCH=25 +# 5. Click "Add secret" +# +# ARTIFACTS: +# Full-run jobs upload output-*.log files as workflow artifacts (7-day retention). +# Download them from the workflow run's "Artifacts" section to inspect sample output. 
+# +# ============================================================================= + name: Validate Samples on: + # Build-only on PR and push pull_request: paths: - 'ai/**' - '.github/workflows/validate-samples.yml' push: - branches: - - main + branches: [main] paths: - 'ai/**' - '.github/workflows/validate-samples.yml' + # Manual trigger for full validation (build + run) + workflow_dispatch: + inputs: + run_mode: + description: 'build-only = compile check only; full = compile + execute against Azure' + required: true + default: 'full' + type: choice + options: + - full + - build-only + permissions: contents: read concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + # Separate concurrency groups for auto (PR/push) vs manual full-run + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ inputs.run_mode || 'auto' }} + cancel-in-progress: ${{ github.event_name != 'workflow_dispatch' }} jobs: - validate-typescript: - name: TypeScript - ${{ matrix.sample }} + # ============================================================ + # BUILD JOBS — Always run (PR, push, and workflow_dispatch) + # Validates that code compiles without needing any secrets. 
+ # ============================================================ + + build-typescript: + name: Build TypeScript - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false strategy: fail-fast: false matrix: sample: - vector-search-typescript - - vector-search-agent-typescript - + - select-algorithm-typescript steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Node.js - uses: actions/setup-node@v6 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: node-version: '20' cache: 'npm' cache-dependency-path: ai/${{ matrix.sample }}/package-lock.json - - - name: Install dependencies + - run: npm ci working-directory: ai/${{ matrix.sample }} - run: npm ci - - - name: Build TypeScript + - run: npm run build working-directory: ai/${{ matrix.sample }} - run: npm run build - validate-dotnet: - name: .NET + build-dotnet: + name: Build .NET runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false - steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup .NET - uses: actions/setup-dotnet@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-dotnet@v4 with: dotnet-version: '8.0.x' - - - name: Build solution - run: dotnet build documentdb-samples.sln + - run: dotnet build documentdb-samples.sln - validate-go: - name: Go - ${{ matrix.sample }} + build-go: + name: Build Go - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false strategy: fail-fast: false matrix: sample: - vector-search-go - - vector-search-agent-go - + - select-algorithm-go steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Go - uses: actions/setup-go@v6 + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 with: go-version: '1.24' cache-dependency-path: ai/${{ matrix.sample }}/go.sum - - - name: Validate Go + - name: Build Go working-directory: ai/${{ matrix.sample }} + # Go samples have multiple main() files sharing utils.go — build each 
independently run: | - # Check if src/ has multiple main() declarations (independent programs sharing utils) if [ -d "src" ] && [ "$(grep -rl '^func main()' src/*.go 2>/dev/null | wc -l)" -gt 1 ]; then cd src for f in $(grep -l '^func main()' *.go); do @@ -105,47 +140,315 @@ jobs: go build ./... fi - validate-python: - name: Python + build-python: + name: Build Python - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false - + strategy: + fail-fast: false + matrix: + sample: + - vector-search-python + - select-algorithm-python steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Python - uses: actions/setup-python@v6 + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 with: python-version: '3.11' - - - name: Install dependencies - working-directory: ai/vector-search-python - run: pip install -r requirements.txt - - - name: Validate Python syntax - working-directory: ai/vector-search-python - run: | - find . -name "*.py" -exec python -m py_compile {} + + - run: pip install -r requirements.txt + working-directory: ai/${{ matrix.sample }} + - name: Validate syntax + working-directory: ai/${{ matrix.sample }} + run: find . -name "*.py" -exec python -m py_compile {} + - validate-java: - name: Java + build-java: + name: Build Java - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false - + strategy: + fail-fast: false + matrix: + sample: + - vector-search-java + - select-algorithm-java + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '21' + cache: 'maven' + - run: mvn compile -DskipTests + working-directory: ai/${{ matrix.sample }} + + # ============================================================ + # FULL-RUN JOBS — Only on workflow_dispatch with run_mode=full + # Executes samples against live Azure resources using the + # SAMPLES_ENV_FILE repo secret. Captures output as artifacts. 
+ # ============================================================ + + preflight: + name: Preflight — Verify secret exists + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + runs-on: ubuntu-latest + steps: + - name: Check SAMPLES_ENV_FILE secret + run: | + if [ -z "$ENV_CONTENT" ]; then + echo "::error::SAMPLES_ENV_FILE secret is not set. See workflow header for setup instructions." + exit 1 + fi + echo "✅ SAMPLES_ENV_FILE secret is configured ($(echo "$ENV_CONTENT" | wc -l) lines)" + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + + run-typescript: + name: Run TypeScript - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [build-typescript, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-typescript + scripts: | + node --env-file .env dist/create-embeddings.js 2>&1 | tee output-embed.log + node --env-file .env dist/ivf.js 2>&1 | tee output-ivf.log + node --env-file .env dist/hnsw.js 2>&1 | tee output-hnsw.log + node --env-file .env dist/diskann.js 2>&1 | tee output-diskann.log + - sample: select-algorithm-typescript + scripts: | + node --env-file .env dist/compare-all.js 2>&1 | tee output-compare.log + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: ai/${{ matrix.sample }}/package-lock.json + - run: npm ci + working-directory: ai/${{ matrix.sample }} + - run: npm run build + working-directory: ai/${{ matrix.sample }} + - name: Write .env from secret + working-directory: ai/${{ matrix.sample }} + run: printf '%s\n' "$ENV_CONTENT" > .env + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + working-directory: ai/${{ matrix.sample }} + run: | + set -euo pipefail + ${{ matrix.scripts }} + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: 
logs-typescript-${{ matrix.sample }} + path: ai/${{ matrix.sample }}/output-*.log + retention-days: 7 + + run-python: + name: Run Python - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [build-python, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-python + scripts: | + python src/create_embeddings.py 2>&1 | tee output-embed.log + python src/ivf.py 2>&1 | tee output-ivf.log + python src/hnsw.py 2>&1 | tee output-hnsw.log + python src/diskann.py 2>&1 | tee output-diskann.log + - sample: select-algorithm-python + scripts: | + python src/compare_all.py 2>&1 | tee output-compare.log + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: '3.11' + - run: pip install -r requirements.txt + working-directory: ai/${{ matrix.sample }} + - name: Write .env from secret + working-directory: ai/${{ matrix.sample }} + run: printf '%s\n' "$ENV_CONTENT" > .env + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + working-directory: ai/${{ matrix.sample }} + run: | + set -euo pipefail + ${{ matrix.scripts }} + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-python-${{ matrix.sample }} + path: ai/${{ matrix.sample }}/output-*.log + retention-days: 7 + + run-go: + name: Run Go - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [build-go, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-go + scripts: | + go run create_embeddings.go utils.go 2>&1 | tee output-embed.log + go run ivf.go utils.go 2>&1 | tee output-ivf.log + go run hnsw.go utils.go 2>&1 | tee output-hnsw.log + go run diskann.go utils.go 2>&1 | tee output-diskann.log + workdir: ai/vector-search-go/src + - sample: select-algorithm-go 
+ scripts: | + go run compare_all.go utils.go 2>&1 | tee output-compare.log + workdir: ai/select-algorithm-go/src + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: '1.24' + cache-dependency-path: ai/${{ matrix.sample }}/go.sum + - name: Write .env from secret + working-directory: ${{ matrix.workdir }} + run: printf '%s\n' "$ENV_CONTENT" > .env + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + working-directory: ${{ matrix.workdir }} + run: | + set -euo pipefail + ${{ matrix.scripts }} + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-go-${{ matrix.sample }} + path: ${{ matrix.workdir }}/output-*.log + retention-days: 7 + + run-java: + name: Run Java - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [build-java, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-java + classes: DiskAnn HNSW IVF + package: com.azure.documentdb.samples + - sample: select-algorithm-java + classes: CompareAll + package: com.azure.documentdb.selectalgorithm steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Java - uses: actions/setup-java@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '21' cache: 'maven' - - - name: Compile Java - working-directory: ai/vector-search-java - run: mvn compile -DskipTests + - run: mvn compile -DskipTests + working-directory: ai/${{ matrix.sample }} + - name: Export env vars from secret + # Java uses System.getenv() — write .env then source it into GITHUB_ENV + run: | + while IFS='=' read -r key value; do + [[ -z "$key" || "$key" == \#* ]] && continue + echo "$key=$value" >> "$GITHUB_ENV" + done <<< "$ENV_CONTENT" + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + working-directory: ai/${{ 
matrix.sample }} + run: | + set -euo pipefail + for class in ${{ matrix.classes }}; do + echo "=== Running $class ===" + mvn exec:java -Dexec.mainClass="${{ matrix.package }}.$class" 2>&1 | tee "output-${class,,}.log" + done + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-java-${{ matrix.sample }} + path: ai/${{ matrix.sample }}/output-*.log + retention-days: 7 + + run-dotnet: + name: Run .NET - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [build-dotnet, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-dotnet + project: ai/vector-search-dotnet/DocumentDBVectorSearch.csproj + - sample: select-algorithm-dotnet + project: ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '8.0.x' + - name: Export env vars from secret + # .NET uses Environment.GetEnvironmentVariable() — parse .env into GITHUB_ENV + run: | + while IFS='=' read -r key value; do + [[ -z "$key" || "$key" == \#* ]] && continue + echo "$key=$value" >> "$GITHUB_ENV" + done <<< "$ENV_CONTENT" + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + run: | + set -euo pipefail + dotnet run --project ${{ matrix.project }} 2>&1 | tee output-run.log + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-dotnet-${{ matrix.sample }} + path: output-run.log + retention-days: 7 + + # ============================================================ + # SUMMARY — Aggregates pass/fail status across all languages + # ============================================================ + + summary: + name: Results Summary + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' && always() + needs: [preflight, run-typescript, run-python, run-go, run-java, run-dotnet] + 
runs-on: ubuntu-latest + steps: + - name: Generate summary table + run: | + echo "## 🧪 Full Validation Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Language | Status |" >> $GITHUB_STEP_SUMMARY + echo "|----------|--------|" >> $GITHUB_STEP_SUMMARY + echo "| TypeScript | ${{ needs.run-typescript.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Python | ${{ needs.run-python.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Go | ${{ needs.run-go.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Java | ${{ needs.run-java.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| .NET | ${{ needs.run-dotnet.result }} |" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "📦 Download artifacts for full output logs." >> $GITHUB_STEP_SUMMARY diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md index 16e0b67..8d1c37d 100644 --- a/ai/select-algorithm-typescript/README.md +++ b/ai/select-algorithm-typescript/README.md @@ -75,21 +75,42 @@ npm run start:diskann ## Compare All Algorithms -Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and view a formatted comparison table: +Run every algorithm × similarity-metric combination (3 algorithms × 2 metrics: COS and L2) across multiple diverse queries and view formatted comparison tables with a ranking divergence summary: ```bash npm run start:compare-all ``` +By default, the script runs **5 diverse queries** designed to stress different aspects of similarity ranking: + +1. `outdoor adventure with family activities` +2. `quiet romantic getaway with ocean view` +3. `budget-friendly downtown hotel with free WiFi` +4. `historic building with fine dining and spa` +5. 
`ski resort with yoga and winter sports` + **Environment variables** (optional overrides): | Variable | Default | Description | |---|---|---| -| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | -| `TOP_K` | `3` | Number of results per combination | +| `QUERY_TEXT` | *(5 built-in queries)* | Override with a single custom query | +| `TOP_K` | `5` | Number of results per combination | | `VERBOSE` | `false` | When `true`, shows all k results per combo | -The script creates a single `hotels` collection, loads data once, creates 9 vector indexes (one per algorithm/metric pair), and runs searches sequentially for fair timing comparison. +### Architecture + +> **DocumentDB limitation:** Only ONE vector index per field per collection is allowed. The script creates a separate collection for each algorithm×metric pair (3 algorithms × 2 metrics = 6 collections), loads data into each, creates one index per collection, runs searches, and cleans up all collections on exit. + +### Output + +The script produces: +- **Per-query comparison table** — shows the #1 and #2 results with their scores for each algorithm×metric combination, collapsed to one row per metric when all algorithms agree +- **Ranking divergence summary** — highlights queries where algorithms/metrics disagreed on the #1 result +- **Score gap analysis** — shows the confidence margin between #1 and #2 results + +### Small dataset caveat + +With ~50 hotel documents, all algorithms typically return identical rankings. This is expected — the dataset is too small for algorithmic differences to surface. For meaningful differentiation, use 1000+ documents with varied embeddings. The diverse queries help by combining attributes that no single hotel perfectly satisfies, which can reveal metric-level differences (COS vs L2) even on small data. 
## Algorithm comparison diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts index 53c54aa..e634d8a 100644 --- a/ai/select-algorithm-typescript/src/compare-all.ts +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -13,9 +13,9 @@ interface AlgorithmConfig { } interface SearchResult { + query: string; algorithm: string; similarity: string; - latencyMs: number; topScore: number; topResult: string; results: Array<{ name: string; score: number }>; @@ -27,16 +27,41 @@ const ALGORITHMS: AlgorithmConfig[] = [ { name: 'DiskANN', kind: 'vector-diskann', options: { maxDegree: 32, lBuild: 50 } }, ]; -const SIMILARITIES = ['COS', 'L2', 'IP']; +// Only COS and L2 — Inner Product (IP) is omitted because text-embedding-3-small +// produces unit-normalized vectors (magnitude = 1). For normalized vectors, +// cosine similarity = dot(a,b)/(||a||·||b||) = dot(a,b) = inner product. +// COS and IP always return identical results, so comparing both adds no insight. +const SIMILARITIES = ['COS', 'L2']; + +// Diverse queries designed to stress-test ranking differences: +// Each combines attributes that no single hotel perfectly satisfies, +// forcing similarity metrics to disagree on partial matches. +const DEFAULT_QUERIES = [ + 'outdoor adventure with family activities', + 'quiet romantic getaway with ocean view', + 'budget-friendly downtown hotel with free WiFi', + 'historic building with fine dining and spa', + 'ski resort with yoga and winter sports', +]; + +// DocumentDB allows only ONE vector index per field per collection, +// so we use a separate collection for each algorithm×metric combination. 
+function collectionNameFor(algo: AlgorithmConfig, sim: string): string { + return `compare_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; +} async function main() { const baseConfig = getConfig(); - const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; - const topK = parseInt(process.env.TOP_K || '3', 10); + const topK = parseInt(process.env.TOP_K || '5', 10); const verbose = process.env.VERBOSE === 'true'; - const collectionName = 'hotels'; + + // Support single query override via env, otherwise use all default queries + const queries: string[] = process.env.QUERY_TEXT + ? [process.env.QUERY_TEXT] + : DEFAULT_QUERIES; const { aiClient, dbClient } = getClientsPasswordless(); + const createdCollections: string[] = []; try { if (!aiClient) throw new Error('AI client is not configured.'); @@ -45,27 +70,39 @@ async function main() { await dbClient.connect(); const db = dbClient.db(baseConfig.dbName); - // Drop collection if it exists for a clean comparison - const existingCollections = await db.listCollections({ name: collectionName }).toArray(); - if (existingCollections.length > 0) { - await db.dropCollection(collectionName); - console.log(`Dropped existing collection: ${collectionName}`); - } - - // Create collection and load data - const collection = await db.createCollection(collectionName); - console.log(`Created collection: ${collectionName}`); + // Load data from file once (held in memory, inserted per collection) const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); - const insertSummary = await insertData(baseConfig, collection, data); - console.log(`Inserted ${insertSummary.inserted}/${insertSummary.total} documents`); + console.log(`Loaded ${data.length} documents from ${baseConfig.dataFile}`); + + // Generate embeddings for all queries upfront + console.log(`\nGenerating embeddings for ${queries.length} query(ies)...`); + const embeddingResponse = await aiClient.embeddings.create({ + 
model: baseConfig.deployment, + input: queries + }); + const queryVectors = embeddingResponse.data.map(d => d.embedding); + console.log(`Embeddings generated (${queryVectors[0].length} dimensions each)`); - // Create all 9 indexes - console.log('\nCreating vector indexes...'); + // Create 9 collections, each with its own vector index + console.log('\nSetting up 9 collections (1 per algorithm×metric)...'); for (const algo of ALGORITHMS) { for (const sim of SIMILARITIES) { + const colName = collectionNameFor(algo, sim); const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + + // Drop if leftover from a prior run + const existing = await db.listCollections({ name: colName }).toArray(); + if (existing.length > 0) { + await db.dropCollection(colName); + } + + const collection = await db.createCollection(colName); + createdCollections.push(colName); + + await insertData(baseConfig, collection, data); + const indexOptions = { - createIndexes: collectionName, + createIndexes: colName, indexes: [{ name: indexName, key: { [baseConfig.embeddedField]: 'cosmosSearch' }, @@ -78,78 +115,85 @@ async function main() { }] }; await db.command(indexOptions); - console.log(` ✓ ${indexName} (created)`); + console.log(` ✓ ${colName} → index ${indexName}`); } } - // Generate one embedding for the query - console.log(`\nQuery: "${queryText}"`); - const embeddingResponse = await aiClient.embeddings.create({ - model: baseConfig.deployment, - input: [queryText] - }); - const queryVector = embeddingResponse.data[0].embedding; - console.log(`Embedding generated (${queryVector.length} dimensions)`); + // Brief pause for indexes to become queryable + console.log('\nWaiting for indexes to be ready...'); + await new Promise(resolve => setTimeout(resolve, 3000)); - // Run all 9 searches sequentially - console.log(`\nRunning searches (top ${topK} results)...\n`); - const results: SearchResult[] = []; + // Run all queries × all 9 combinations + const allResults: 
SearchResult[] = []; - for (const algo of ALGORITHMS) { - for (const sim of SIMILARITIES) { - const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + for (let qi = 0; qi < queries.length; qi++) { + const queryText = queries[qi]; + const queryVector = queryVectors[qi]; + console.log(`\n━━━ Query ${qi + 1}/${queries.length}: "${queryText}" (top ${topK}) ━━━`); + + for (const algo of ALGORITHMS) { + for (const sim of SIMILARITIES) { + const colName = collectionNameFor(algo, sim); + const collection = db.collection(colName); - const start = performance.now(); - const searchResults = await collection.aggregate([ - { - $search: { - cosmosSearch: { - vector: queryVector, - path: baseConfig.embeddedField, - k: topK - }, - cosmosSearchOptions: { - indexName: indexName + const searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: baseConfig.embeddedField, + k: topK + } + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' } } - }, - { - $project: { - score: { $meta: 'searchScore' }, - document: '$$ROOT' - } - } - ]).toArray(); - const latencyMs = performance.now() - start; - - const topDoc = searchResults[0] as any; - results.push({ - algorithm: algo.name, - similarity: sim, - latencyMs, - topScore: topDoc?.score ?? 0, - topResult: topDoc?.document?.HotelName ?? '(none)', - results: searchResults.map((r: any) => ({ - name: r.document?.HotelName ?? '(none)', - score: r.score ?? 0 - })) - }); + ]).toArray(); + + const topDoc = searchResults[0] as any; + allResults.push({ + query: queryText, + algorithm: algo.name, + similarity: sim, + topScore: topDoc?.score ?? 0, + topResult: topDoc?.document?.HotelName ?? '(none)', + results: searchResults.map((r: any) => ({ + name: r.document?.HotelName ?? '(none)', + score: r.score ?? 
0 + })) + }); + } } } - // Print comparison table - printComparisonTable(results, verbose); + // Print per-query comparison tables + for (const queryText of queries) { + const queryResults = allResults.filter(r => r.query === queryText); + printComparisonTable(queryText, queryResults, verbose); + } + + // Print cross-query ranking divergence summary + if (queries.length > 1) { + printDivergenceSummary(allResults, queries); + } } catch (error) { console.error('Compare-all failed:', error); process.exitCode = 1; } finally { - // Cleanup: drop the comparison collection + // Cleanup: drop all comparison collections if (dbClient) { try { const db = dbClient.db(baseConfig.dbName); - await db.dropCollection(collectionName); - console.log(`\nCleanup: dropped collection "${collectionName}"`); + console.log(`\nCleanup: dropping ${createdCollections.length} comparison collections...`); + for (const colName of createdCollections) { + await db.dropCollection(colName); + } + console.log('Cleanup complete'); } catch (cleanupErr) { console.error('Cleanup warning:', cleanupErr); } @@ -159,46 +203,162 @@ async function main() { } } -function printComparisonTable(results: SearchResult[], verbose: boolean) { - const algoWidth = 10; - const simWidth = 10; - const latWidth = 8; - const scoreWidth = 10; - const nameWidth = 30; - +function printComparisonTable(queryText: string, results: SearchResult[], _verbose: boolean) { const pad = (s: string, w: number) => s.length >= w ? 
s.slice(0, w) : s + ' '.repeat(w - s.length); - const topLine = `╔${'═'.repeat(algoWidth)}╤${'═'.repeat(simWidth)}╤${'═'.repeat(latWidth)}╤${'═'.repeat(scoreWidth)}╤${'═'.repeat(nameWidth)}╗`; - const headerSep = `╠${'═'.repeat(algoWidth)}╪${'═'.repeat(simWidth)}╪${'═'.repeat(latWidth)}╪${'═'.repeat(scoreWidth)}╪${'═'.repeat(nameWidth)}╣`; - const rowSep = `╟${'─'.repeat(algoWidth)}┼${'─'.repeat(simWidth)}┼${'─'.repeat(latWidth)}┼${'─'.repeat(scoreWidth)}┼${'─'.repeat(nameWidth)}╢`; - const bottomLine = `╚${'═'.repeat(algoWidth)}╧${'═'.repeat(simWidth)}╧${'═'.repeat(latWidth)}╧${'═'.repeat(scoreWidth)}╧${'═'.repeat(nameWidth)}╝`; + // Group by similarity metric to check if algorithms agree + const byMetric = new Map(); + for (const r of results) { + const group = byMetric.get(r.similarity) ?? []; + group.push(r); + byMetric.set(r.similarity, group); + } + + // Check if all algorithms agree (same #1 and #2 per metric) + const allAgree = [...byMetric.values()].every(group => { + const first = group[0]; + return group.every(r => + r.results[0]?.name === first.results[0]?.name && + r.results[1]?.name === first.results[1]?.name + ); + }); - console.log(topLine); - console.log(`║${pad(' Algorithm', algoWidth)}│${pad(' Similarity', simWidth)}│${pad(' Latency', latWidth)}│${pad(' Top Score', scoreWidth)}│${pad(' Top Result', nameWidth)}║`); - console.log(headerSep); + console.log(`\n┌─ Query: "${queryText}"`); - results.forEach((r, i) => { - const latStr = `${Math.round(r.latencyMs)}ms`; - const scoreStr = r.topScore.toFixed(4); + if (allAgree) { + // Collapsed view: one row per metric (algorithms all agree) + const simWidth = 8; + const nameWidth = 26; + const scoreWidth = 9; + const gapWidth = 8; + const colWidths = [simWidth, nameWidth, scoreWidth, scoreWidth, gapWidth, nameWidth]; + const topLine = `╔${colWidths.map(w => '═'.repeat(w)).join('╤')}╗`; + const headerSep = `╠${colWidths.map(w => '═'.repeat(w)).join('╪')}╣`; + const rowSep = `╟${colWidths.map(w => 
'─'.repeat(w)).join('┼')}╢`; + const bottomLine = `╚${colWidths.map(w => '═'.repeat(w)).join('╧')}╝`; + + console.log(`│ ✅ All algorithms agree (IVF, HNSW, DiskANN) — showing by metric only`); + console.log(topLine); console.log( - `║${pad(` ${r.algorithm}`, algoWidth)}│${pad(` ${r.similarity}`, simWidth)}│${pad(` ${latStr}`, latWidth)}│${pad(` ${scoreStr}`, scoreWidth)}│${pad(` ${r.topResult}`, nameWidth)}║` + `║${pad(' Metric', simWidth)}│${pad(' #1 Result', nameWidth)}│${pad(' #1 Score', scoreWidth)}│${pad(' #2 Score', scoreWidth)}│${pad(' Gap', gapWidth)}│${pad(' #2 Result', nameWidth)}║` ); + console.log(headerSep); + + const metrics = [...byMetric.entries()]; + metrics.forEach(([metric, group], i) => { + const r = group[0]; + const score1 = r.results[0]?.score.toFixed(4) ?? '-'; + const name1 = r.results[0]?.name ?? '(none)'; + const score2 = r.results[1]?.score.toFixed(4) ?? '-'; + const name2 = r.results[1]?.name ?? '(none)'; + const gap = (r.results[0] && r.results[1]) + ? Math.abs(r.results[0].score - r.results[1].score).toFixed(4) + : '-'; - if (verbose && r.results.length > 1) { - for (let j = 1; j < r.results.length; j++) { - const sub = r.results[j]; - console.log( - `║${pad('', algoWidth)}│${pad('', simWidth)}│${pad('', latWidth)}│${pad(` ${sub.score.toFixed(4)}`, scoreWidth)}│${pad(` ${sub.name}`, nameWidth)}║` - ); + console.log( + `║${pad(` ${metric}`, simWidth)}│${pad(` ${name1}`, nameWidth)}│${pad(` ${score1}`, scoreWidth)}│${pad(` ${score2}`, scoreWidth)}│${pad(` ${gap}`, gapWidth)}│${pad(` ${name2}`, nameWidth)}║` + ); + + if (i < metrics.length - 1) { + console.log(rowSep); } - } + }); + + console.log(bottomLine); + } else { + // Expanded view: show full algo×metric grid (algorithms disagree) + const algoWidth = 10; + const simWidth = 6; + const scoreWidth = 8; + const nameWidth = 26; + const colWidths = [algoWidth, simWidth, nameWidth, scoreWidth, scoreWidth, nameWidth]; + const topLine = `╔${colWidths.map(w => '═'.repeat(w)).join('╤')}╗`; + 
const headerSep = `╠${colWidths.map(w => '═'.repeat(w)).join('╪')}╣`; + const rowSep = `╟${colWidths.map(w => '─'.repeat(w)).join('┼')}╢`; + const bottomLine = `╚${colWidths.map(w => '═'.repeat(w)).join('╧')}╝`; + + console.log(`│ ⚠️ Algorithms DISAGREE — showing full breakdown`); + console.log(topLine); + console.log( + `║${pad(' Algo', algoWidth)}│${pad(' Sim', simWidth)}│${pad(' #1 Result', nameWidth)}│${pad(' #1 Score', scoreWidth)}│${pad(' #2 Score', scoreWidth)}│${pad(' #2 Result', nameWidth)}║` + ); + console.log(headerSep); + + results.forEach((r, i) => { + const score1 = r.results[0]?.score.toFixed(4) ?? '-'; + const name1 = r.results[0]?.name ?? '(none)'; + const score2 = r.results[1]?.score.toFixed(4) ?? '-'; + const name2 = r.results[1]?.name ?? '(none)'; + + console.log( + `║${pad(` ${r.algorithm}`, algoWidth)}│${pad(` ${r.similarity}`, simWidth)}│${pad(` ${name1}`, nameWidth)}│${pad(` ${score1}`, scoreWidth)}│${pad(` ${score2}`, scoreWidth)}│${pad(` ${name2}`, nameWidth)}║` + ); + + if (i < results.length - 1) { + console.log(rowSep); + } + }); - if (i < results.length - 1) { - console.log(rowSep); + console.log(bottomLine); + } +} + +// Show where algorithms/metrics disagree on rankings across queries +function printDivergenceSummary(allResults: SearchResult[], queries: string[]) { + console.log('\n\n╔══════════════════════════════════════════════════════════════════╗'); + console.log('║ RANKING DIVERGENCE SUMMARY ║'); + console.log('╚══════════════════════════════════════════════════════════════════╝'); + console.log('Shows queries where algorithms or metrics produced DIFFERENT #1 results.\n'); + + let divergenceCount = 0; + + for (const queryText of queries) { + const queryResults = allResults.filter(r => r.query === queryText); + const topResults = new Set(queryResults.map(r => r.topResult)); + + if (topResults.size > 1) { + divergenceCount++; + console.log(` ⚡ "${queryText}"`); + + // Group by top result to show which combos picked what + const 
groups = new Map(); + for (const r of queryResults) { + const key = r.topResult; + if (!groups.has(key)) groups.set(key, []); + groups.get(key)!.push(`${r.algorithm}/${r.similarity}`); + } + for (const [hotel, combos] of groups) { + console.log(` → ${hotel}: ${combos.join(', ')}`); + } + console.log(''); } - }); + } - console.log(bottomLine); + if (divergenceCount === 0) { + console.log(' All algorithms returned identical #1 results for every query.'); + console.log(' This is expected with small datasets (~50 docs). For meaningful'); + console.log(' differentiation, use 1000+ documents with varied embeddings.\n'); + } else { + console.log(` ${divergenceCount}/${queries.length} queries showed ranking divergence.`); + } + + // Score gap analysis — show how "confident" the top result is + console.log('\n Score Gaps (top score − 2nd score):'); + console.log(' ─────────────────────────────────────'); + for (const queryText of queries) { + const queryResults = allResults.filter(r => r.query === queryText); + const gaps = queryResults.map(r => { + if (r.results.length < 2) return 0; + return r.results[0].score - r.results[1].score; + }); + const avgGap = gaps.reduce((a, b) => a + b, 0) / gaps.length; + const maxGap = Math.max(...gaps); + const minGap = Math.min(...gaps); + const shortQuery = queryText.length > 40 ? queryText.slice(0, 37) + '...' 
: queryText; + console.log(` "${shortQuery}"`); + console.log(` avg: ${avgGap.toFixed(4)} | min: ${minGap.toFixed(4)} | max: ${maxGap.toFixed(4)}`); + } + console.log(''); } main().catch(error => { From e070073cd448c07694286369b1307f63c33d4c15 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Thu, 30 Apr 2026 08:12:00 -0700 Subject: [PATCH 9/9] fix: address review findings (serialize, masking, types, env docs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Serialize run jobs (TS→Py→Go→Java→.NET) to prevent collection collisions - Add ::add-mask:: for secret values, fix IFS parsing for connection strings - Fix Go version 1.24→1.23 - Add timeout-minutes: 2 to preflight job - TypeScript: add MongoSearchResult interface, env validation, safe cleanup - Add MONGO_CLUSTER_NAME to env var table (required for passwordless auth) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/copilot-instructions.md | 3 +- .github/workflows/validate-samples.yml | 33 +++++++----- .../src/compare-all.ts | 43 ++++++++++++--- infra/main.json | 54 +++++++++++++++++++ 4 files changed, 111 insertions(+), 22 deletions(-) create mode 100644 infra/main.json diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 3474847..8ab46b3 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -71,7 +71,8 @@ All samples must support these env vars: | Variable | Purpose | |----------|---------| -| `AZURE_DOCUMENTDB_CONNECTION_STRING` | MongoDB connection string | +| `MONGO_CLUSTER_NAME` | DocumentDB cluster name (required for passwordless/OIDC auth) | +| `AZURE_DOCUMENTDB_CONNECTION_STRING` | MongoDB connection string (fallback when not using passwordless) | | `AZURE_DOCUMENTDB_DATABASENAME` | Database name (default: `Hotels`) | | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | | `AZURE_OPENAI_EMBEDDING_MODEL` | Deployment name (e.g., 
`text-embedding-3-small`) | diff --git a/.github/workflows/validate-samples.yml b/.github/workflows/validate-samples.yml index d35e962..5defcfe 100644 --- a/.github/workflows/validate-samples.yml +++ b/.github/workflows/validate-samples.yml @@ -124,7 +124,7 @@ jobs: - uses: actions/checkout@v6 - uses: actions/setup-go@v6 with: - go-version: '1.24' + go-version: '1.23' cache-dependency-path: ai/${{ matrix.sample }}/go.sum - name: Build Go working-directory: ai/${{ matrix.sample }} @@ -191,6 +191,7 @@ jobs: name: Preflight — Verify secret exists if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' runs-on: ubuntu-latest + timeout-minutes: 2 steps: - name: Check SAMPLES_ENV_FILE secret run: | @@ -198,14 +199,14 @@ jobs: echo "::error::SAMPLES_ENV_FILE secret is not set. See workflow header for setup instructions." exit 1 fi - echo "✅ SAMPLES_ENV_FILE secret is configured ($(echo "$ENV_CONTENT" | wc -l) lines)" + echo "✅ SAMPLES_ENV_FILE secret is configured" env: ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} run-typescript: name: Run TypeScript - ${{ matrix.sample }} if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' - needs: [build-typescript, preflight] + needs: [preflight] runs-on: ubuntu-latest timeout-minutes: 20 strategy: @@ -253,7 +254,7 @@ jobs: run-python: name: Run Python - ${{ matrix.sample }} if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' - needs: [build-python, preflight] + needs: [run-typescript, build-python, preflight] runs-on: ubuntu-latest timeout-minutes: 20 strategy: @@ -297,7 +298,7 @@ jobs: run-go: name: Run Go - ${{ matrix.sample }} if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' - needs: [build-go, preflight] + needs: [run-python, build-go, preflight] runs-on: ubuntu-latest timeout-minutes: 20 strategy: @@ -319,7 +320,7 @@ jobs: - uses: actions/checkout@v6 - uses: actions/setup-go@v6 with: - go-version: '1.24' + go-version: '1.23' 
cache-dependency-path: ai/${{ matrix.sample }}/go.sum - name: Write .env from secret working-directory: ${{ matrix.workdir }} @@ -342,7 +343,7 @@ jobs: run-java: name: Run Java - ${{ matrix.sample }} if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' - needs: [build-java, preflight] + needs: [run-go, build-java, preflight] runs-on: ubuntu-latest timeout-minutes: 20 strategy: @@ -365,10 +366,12 @@ jobs: - run: mvn compile -DskipTests working-directory: ai/${{ matrix.sample }} - name: Export env vars from secret - # Java uses System.getenv() — write .env then source it into GITHUB_ENV run: | - while IFS='=' read -r key value; do - [[ -z "$key" || "$key" == \#* ]] && continue + while IFS= read -r line; do + [[ -z "$line" || "$line" == \#* ]] && continue + key="${line%%=*}" + value="${line#*=}" + echo "::add-mask::$value" echo "$key=$value" >> "$GITHUB_ENV" done <<< "$ENV_CONTENT" env: @@ -392,7 +395,7 @@ jobs: run-dotnet: name: Run .NET - ${{ matrix.sample }} if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' - needs: [build-dotnet, preflight] + needs: [run-java, build-dotnet, preflight] runs-on: ubuntu-latest timeout-minutes: 20 strategy: @@ -409,10 +412,12 @@ jobs: with: dotnet-version: '8.0.x' - name: Export env vars from secret - # .NET uses Environment.GetEnvironmentVariable() — parse .env into GITHUB_ENV run: | - while IFS='=' read -r key value; do - [[ -z "$key" || "$key" == \#* ]] && continue + while IFS= read -r line; do + [[ -z "$line" || "$line" == \#* ]] && continue + key="${line%%=*}" + value="${line#*=}" + echo "::add-mask::$value" echo "$key=$value" >> "$GITHUB_ENV" done <<< "$ENV_CONTENT" env: diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts index e634d8a..39cadfb 100644 --- a/ai/select-algorithm-typescript/src/compare-all.ts +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -12,6 +12,11 @@ interface AlgorithmConfig { options: 
Record; } +interface MongoSearchResult { + document: { name: string; [key: string]: unknown }; + score: number; +} + interface SearchResult { query: string; algorithm: string; @@ -154,15 +159,16 @@ async function main() { } ]).toArray(); - const topDoc = searchResults[0] as any; + const typedResults = searchResults as unknown as MongoSearchResult[]; + const topDoc = typedResults[0]; allResults.push({ query: queryText, algorithm: algo.name, similarity: sim, topScore: topDoc?.score ?? 0, - topResult: topDoc?.document?.HotelName ?? '(none)', - results: searchResults.map((r: any) => ({ - name: r.document?.HotelName ?? '(none)', + topResult: (topDoc?.document?.HotelName as string) ?? '(none)', + results: typedResults.map((r) => ({ + name: (r.document?.HotelName as string) ?? '(none)', score: r.score ?? 0 })) }); @@ -191,14 +197,22 @@ async function main() { const db = dbClient.db(baseConfig.dbName); console.log(`\nCleanup: dropping ${createdCollections.length} comparison collections...`); for (const colName of createdCollections) { - await db.dropCollection(colName); + try { + await db.dropCollection(colName); + } catch (dropErr) { + console.error(`Cleanup warning (drop ${colName}):`, dropErr); + } } console.log('Cleanup complete'); } catch (cleanupErr) { console.error('Cleanup warning:', cleanupErr); } - await dbClient.close(); - console.log('Database connection closed'); + try { + await dbClient.close(); + console.log('Database connection closed'); + } catch (closeErr) { + console.error('Warning closing connection:', closeErr); + } } } } @@ -361,6 +375,21 @@ function printDivergenceSummary(allResults: SearchResult[], queries: string[]) { console.log(''); } +// Validate required environment variables before starting +const REQUIRED_ENV_VARS = [ + 'AZURE_OPENAI_EMBEDDING_ENDPOINT', + 'AZURE_OPENAI_EMBEDDING_MODEL', + 'AZURE_OPENAI_EMBEDDING_API_VERSION', +]; + +const missing = REQUIRED_ENV_VARS.filter(v => !process.env[v]); +if 
(!process.env.AZURE_DOCUMENTDB_CONNECTION_STRING && !process.env.MONGO_CLUSTER_NAME) { + missing.push('AZURE_DOCUMENTDB_CONNECTION_STRING or MONGO_CLUSTER_NAME'); +} +if (missing.length > 0) { + throw new Error(`Missing required environment variables:\n - ${missing.join('\n - ')}`); +} + main().catch(error => { console.error('Unhandled error:', error); process.exitCode = 1; diff --git a/infra/main.json b/infra/main.json new file mode 100644 index 0000000..f47cac0 --- /dev/null +++ b/infra/main.json @@ -0,0 +1,54 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "environmentName": { + "value": "development" + }, + "location": { + "value": "eastus2" + }, + "openAiLocation": { + "value": "eastus2" + }, + "deploymentUserPrincipalId": { + "value": "" + }, + "currentUserPrincipalId": { + "value": "" + }, + "documentDbAdminUsername": { + "value": "docdbadmin" + }, + "documentDbAdminPassword": { + "value": "TempP@ss123!" + }, + "chatModelName": { + "value": "gpt-4.1-mini" + }, + "chatModelVersion": { + "value": "2025-04-14" + }, + "chatModelType": { + "value": "Standard" + }, + "synthModelName": { + "value": "gpt-4.1" + }, + "synthModelVersion": { + "value": "2025-04-14" + }, + "synthModelType": { + "value": "Standard" + }, + "embeddingModelName": { + "value": "text-embedding-3-small" + }, + "embeddingModelVersion": { + "value": "1" + }, + "embeddingModelType": { + "value": "Standard" + } + } +} \ No newline at end of file