diff --git a/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json new file mode 100644 index 0000000..aafd623 --- /dev/null +++ b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json @@ -0,0 +1,48 @@ +{ + "name": "Azure DocumentDB Select Algorithm - .NET 8", + "image": "mcr.microsoft.com/devcontainers/dotnet:1-8.0-bookworm", + + "features": { + "ghcr.io/devcontainers/features/azure-cli:1": {}, + "ghcr.io/devcontainers/features/github-cli:1": {}, + "ghcr.io/devcontainers/features/common-utils:2": { + "installZsh": true, + "configureZshAsDefaultShell": true, + "installOhMyZsh": true + } + }, + + "customizations": { + "vscode": { + "extensions": [ + "ms-dotnettools.csdevkit", + "ms-dotnettools.vscodeintellicode-csharp", + "ms-azuretools.vscode-azureresourcegroups", + "ms-azuretools.vscode-cosmosdb", + "mongodb.mongodb-vscode" + ], + "settings": { + "dotnet.completion.showCompletionItemsFromUnimportedNamespaces": true, + "files.exclude": { + "**/bin": true, + "**/obj": true + } + } + } + }, + + "postCreateCommand": "dotnet restore && dotnet build", + "remoteUser": "vscode", + + "containerEnv": { + "DOTNET_CLI_TELEMETRY_OPTOUT": "1", + "DOTNET_NOLOGO": "1" + }, + + "mounts": [ + "source=${localEnv:HOME}${localEnv:USERPROFILE}/.azure,target=/home/vscode/.azure,type=bind,consistency=cached" + ], + + "capAdd": ["SYS_PTRACE"], + "securityOpt": ["seccomp:unconfined"] +} diff --git a/ai/select-algorithm-dotnet/.gitignore b/ai/select-algorithm-dotnet/.gitignore new file mode 100644 index 0000000..de285c3 --- /dev/null +++ b/ai/select-algorithm-dotnet/.gitignore @@ -0,0 +1,7 @@ +bin/ +obj/ +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-dotnet/AlgorithmRunner.cs b/ai/select-algorithm-dotnet/AlgorithmRunner.cs new file mode 100644 index 0000000..193eeaf --- /dev/null +++ b/ai/select-algorithm-dotnet/AlgorithmRunner.cs @@ -0,0 +1,197 @@ 
+using System.Diagnostics; +using MongoDB.Driver; +using MongoDB.Bson; +using OpenAI.Embeddings; +using SelectAlgorithm.Models; + +namespace SelectAlgorithm; + +public static class AlgorithmRunner +{ + private record IndexConfig(string Name, string Kind, string Similarity, BsonDocument ExtraParams); + + public static void RunSingleAlgorithm(AppConfiguration config, string algorithm) + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine($" {algorithm.ToUpper()} Vector Search"); + Console.WriteLine(new string('=', 60)); + + var mongoClient = Utils.GetMongoClientPasswordless(config); + var embeddingClient = Utils.GetEmbeddingClient(config); + + try + { + var database = mongoClient.GetDatabase(config.DocumentDB.DatabaseName); + + var collectionName = $"hotels_{algorithm}"; + var collectionNames = database.ListCollectionNames().ToList(); + if (collectionNames.Contains(collectionName)) + { + database.DropCollection(collectionName); + Console.WriteLine($"Dropped existing '{collectionName}' collection."); + } + + var collection = database.GetCollection(collectionName); + + var data = Utils.ReadJsonFile(config.DataFiles.WithVectors); + var documents = data.Where(d => d.Contains(config.Embedding.EmbeddedField)).ToList(); + Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); + Utils.InsertData(collection, documents, config.DocumentDB.LoadBatchSize); + + Console.WriteLine($"\nQuery: \"{config.VectorSearch.Query}\""); + var embeddingResult = embeddingClient.GenerateEmbedding(config.VectorSearch.Query); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + Console.WriteLine("Embedding generated\n"); + + var indexConfig = BuildIndexConfig(algorithm, config.Embedding.Dimensions); + Console.WriteLine($"Creating {algorithm} index..."); + CreateIndex(collection, config.Embedding.EmbeddedField, indexConfig); + + Console.WriteLine("Waiting for index to build..."); + Thread.Sleep(5000); + + Console.WriteLine("Running search...\n"); + var 
sw = Stopwatch.StartNew(); + var results = RunVectorSearch(collection, queryVector, config.Embedding.EmbeddedField, indexConfig.Name, config.VectorSearch.TopK, algorithm); + sw.Stop(); + + PrintResults(results, algorithm, sw.ElapsedMilliseconds); + } + finally + { + mongoClient.Cluster.Dispose(); + } + } + + private static IndexConfig BuildIndexConfig(string algorithm, int dimensions) + { + var algo = algorithm.ToLower(); + return algo switch + { + "ivf" => new IndexConfig( + $"vector_ivf", + "vector-ivf", + "COS", + new BsonDocument { { "numLists", 1 } } + ), + "hnsw" => new IndexConfig( + $"vector_hnsw", + "vector-hnsw", + "COS", + new BsonDocument { { "m", 16 }, { "efConstruction", 64 } } + ), + "diskann" => new IndexConfig( + $"vector_diskann", + "vector-diskann", + "COS", + new BsonDocument { { "maxDegree", 32 }, { "lBuild", 50 } } + ), + _ => throw new ArgumentException($"Unknown algorithm: {algorithm}") + }; + } + + private static void CreateIndex(IMongoCollection collection, string vectorField, IndexConfig config) + { + try + { + collection.Indexes.DropOne(config.Name); + } + catch (MongoCommandException) + { + } + + var cosmosSearchOptions = new BsonDocument + { + { "kind", config.Kind }, + { "dimensions", int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? 
"1536") }, + { "similarity", config.Similarity } + }; + + foreach (var param in config.ExtraParams) + { + cosmosSearchOptions.Add(param); + } + + var command = new BsonDocument + { + { "createIndexes", collection.CollectionNamespace.CollectionName }, + { "indexes", new BsonArray + { + new BsonDocument + { + { "name", config.Name }, + { "key", new BsonDocument(vectorField, "cosmosSearch") }, + { "cosmosSearchOptions", cosmosSearchOptions } + } + } + } + }; + + try + { + collection.Database.RunCommand(command); + } + catch (MongoCommandException ex) when (ex.Message.Contains("already exists")) + { + } + } + + private static List RunVectorSearch( + IMongoCollection collection, + float[] queryVector, + string vectorField, + string indexName, + int topK, + string algorithm) + { + var cosmosSearch = new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + }; + + switch (algorithm.ToLower()) + { + case "diskann": + cosmosSearch.Add("lSearch", 100); + break; + case "hnsw": + cosmosSearch.Add("efSearch", 80); + break; + case "ivf": + cosmosSearch.Add("nProbes", 1); + break; + } + + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", cosmosSearch)), + new BsonDocument("$project", new BsonDocument + { + { "HotelName", 1 }, + { "score", new BsonDocument("$meta", "searchScore") } + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + private static void PrintResults(List results, string algorithm, long latencyMs) + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine($" {algorithm.ToUpper()} Results ({results.Count} found, {latencyMs}ms)"); + Console.WriteLine(new string('=', 60)); + Console.WriteLine(); + + for (var i = 0; i < results.Count; i++) + { + var doc = results[i]; + var name = doc.Contains("HotelName") ? doc["HotelName"].AsString : "Unknown"; + var score = doc.Contains("score") ? 
doc["score"].ToDouble() : 0.0; + Console.WriteLine($" {i + 1}. {name} (score: {score:F4})"); + } + + Console.WriteLine(); + } +} diff --git a/ai/select-algorithm-dotnet/CompareAll.cs b/ai/select-algorithm-dotnet/CompareAll.cs new file mode 100644 index 0000000..639b20e --- /dev/null +++ b/ai/select-algorithm-dotnet/CompareAll.cs @@ -0,0 +1,305 @@ +/// Unified comparison runner for all 9 combinations (3 algorithms × 3 similarity metrics). +/// Executes vector searches sequentially for fair timing and prints a formatted comparison table. + +namespace SelectAlgorithm; + +using System.Diagnostics; +using MongoDB.Driver; +using MongoDB.Bson; +using OpenAI.Embeddings; +using SelectAlgorithm.Models; + +public static class CompareAll +{ + private record IndexConfig(string Name, string Kind, string Similarity, BsonDocument ExtraParams); + + private record SearchResult(string IndexName, string Algorithm, string Metric, long LatencyMs, List Results); + + public static void Run(AppConfiguration appConfig) + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine(" Compare All Algorithms × Metrics"); + Console.WriteLine(" 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP"); + Console.WriteLine(new string('=', 60)); + + // Use config values with env var overrides for compare-specific settings + var databaseName = appConfig.DocumentDB.DatabaseName; + var dataFile = appConfig.DataFiles.WithVectors; + var vectorField = appConfig.Embedding.EmbeddedField; + var dimensions = appConfig.Embedding.Dimensions; + var batchSize = appConfig.DocumentDB.LoadBatchSize; + var queryText = Environment.GetEnvironmentVariable("QUERY_TEXT") ?? "luxury hotel near the beach"; + var topK = int.Parse(Environment.GetEnvironmentVariable("TOP_K") ?? "3"); + var verbose = (Environment.GetEnvironmentVariable("VERBOSE") ?? 
"false").Equals("true", StringComparison.OrdinalIgnoreCase); + + var mongoClient = Utils.GetMongoClientPasswordless(appConfig); + var embeddingClient = Utils.GetEmbeddingClient(appConfig); + + try + { + var database = mongoClient.GetDatabase(databaseName); + + // Drop collection for a clean comparison + database.DropCollection("hotels"); + Console.WriteLine("Dropped existing 'hotels' collection (if any)"); + + var collection = database.GetCollection("hotels"); + + // Load data once into single collection + var data = Utils.ReadJsonFile(dataFile); + var documents = data.Where(d => d.Contains(vectorField)).ToList(); + Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); + Utils.InsertData(collection, documents, batchSize); + + // Generate ONE embedding for the query (reused for all 9 searches) + Console.WriteLine($"\nQuery: \"{queryText}\""); + Console.WriteLine($"Top K: {topK}"); + var embeddingResult = embeddingClient.GenerateEmbedding(queryText); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + Console.WriteLine("Embedding generated (reused for all searches)\n"); + + // Define 9 index configurations + var configs = BuildIndexConfigs(dimensions); + + // Run each config sequentially: drop→create→wait→search + // DocumentDB doesn't allow multiple vector indexes of the same kind on the same field + Console.WriteLine("Running 9 algorithm × metric combinations...\n"); + var results = new List(); + foreach (var config in configs) + { + // 1. Drop all existing vector indexes + DropVectorIndexes(collection, vectorField); + + // 2. Create this specific index + CreateIndex(collection, vectorField, config); + Console.WriteLine($" ✓ {config.Name} created"); + + // 3. Wait for index to build + Thread.Sleep(5000); + + // 4. Search + var sw = Stopwatch.StartNew(); + var searchResults = RunVectorSearch(collection, queryVector, vectorField, config.Name, topK); + sw.Stop(); + + // 5. 
Record result + results.Add(new SearchResult(config.Name, config.Kind, config.Similarity, sw.ElapsedMilliseconds, searchResults)); + + if (verbose) + { + Console.WriteLine($" {config.Name}: {sw.ElapsedMilliseconds}ms ({searchResults.Count} results)"); + } + } + + // Print comparison table + PrintComparisonTable(results, verbose); + } + finally + { + // Cleanup: drop the comparison collection + try + { + var database = mongoClient.GetDatabase(databaseName); + database.DropCollection("hotels"); + Console.WriteLine("\nCleanup: dropped collection 'hotels'"); + } + catch (Exception ex) + { + Console.WriteLine($"Cleanup warning: {ex.Message}"); + } + mongoClient.Cluster.Dispose(); + } + } + + private static List BuildIndexConfigs(int dimensions) + { + string[] metrics = ["COS", "L2", "IP"]; + var configs = new List(); + + foreach (var metric in metrics) + { + configs.Add(new IndexConfig( + $"vector_ivf_{metric.ToLower()}", + "vector-ivf", + metric, + new BsonDocument { { "numLists", 1 } } + )); + + configs.Add(new IndexConfig( + $"vector_hnsw_{metric.ToLower()}", + "vector-hnsw", + metric, + new BsonDocument { { "m", 16 }, { "efConstruction", 64 } } + )); + + configs.Add(new IndexConfig( + $"vector_diskann_{metric.ToLower()}", + "vector-diskann", + metric, + new BsonDocument { { "maxDegree", 32 }, { "lBuild", 50 } } + )); + } + + return configs; + } + + private static void DropVectorIndexes(IMongoCollection collection, string vectorField) + { + try + { + using var cursor = collection.Indexes.List(); + foreach (var idx in cursor.ToList()) + { + var name = idx.GetValue("name", "").AsString; + var key = idx.GetValue("key", new BsonDocument()).AsBsonDocument; + if (key.Contains(vectorField) && key[vectorField].AsString == "cosmosSearch") + { + try { collection.Indexes.DropOne(name); } catch { } + } + } + } + catch { } + } + + private static void CreateIndex(IMongoCollection collection, string vectorField, IndexConfig config) + { + // Drop existing index with same name if 
present + try + { + collection.Indexes.DropOne(config.Name); + } + catch (MongoCommandException) + { + // Index doesn't exist, that's fine + } + + var cosmosSearchOptions = new BsonDocument + { + { "kind", config.Kind }, + { "dimensions", int.Parse(Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS") ?? "1536") }, + { "similarity", config.Similarity } + }; + + foreach (var param in config.ExtraParams) + { + cosmosSearchOptions.Add(param); + } + + var command = new BsonDocument + { + { "createIndexes", collection.CollectionNamespace.CollectionName }, + { "indexes", new BsonArray + { + new BsonDocument + { + { "name", config.Name }, + { "key", new BsonDocument(vectorField, "cosmosSearch") }, + { "cosmosSearchOptions", cosmosSearchOptions } + } + } + } + }; + + try + { + collection.Database.RunCommand(command); + } + catch (MongoCommandException ex) when (ex.Message.Contains("already exists")) + { + // Index already exists with same config — idempotent + } + } + + private static List RunVectorSearch( + IMongoCollection collection, + float[] queryVector, + string vectorField, + string indexName, + int topK) + { + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + })), + new BsonDocument("$project", new BsonDocument + { + { "HotelName", 1 }, + { "score", new BsonDocument("$meta", "searchScore") } + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + private static void PrintComparisonTable(List results, bool verbose) + { + Console.WriteLine(); + Console.WriteLine(new string('=', 78)); + Console.WriteLine(" COMPARISON RESULTS"); + Console.WriteLine(new string('=', 78)); + Console.WriteLine(); + + // Header + var header = "Index Name".PadRight(24) + + "Algorithm".PadRight(14) + + "Metric".PadRight(8) + + "Latency".PadRight(10) + + "Top Result".PadRight(22); + 
Console.WriteLine(header); + Console.WriteLine(new string('-', 78)); + + foreach (var result in results) + { + var topResult = "—"; + var topScore = ""; + if (result.Results.Count > 0) + { + var doc = result.Results[0]; + topResult = doc.Contains("HotelName") ? doc["HotelName"].AsString : "Unknown"; + if (topResult.Length > 18) topResult = topResult[..18] + "..."; + var score = doc.Contains("score") ? doc["score"].ToDouble() : 0.0; + topScore = $" ({score:F3})"; + } + + var algoDisplay = result.Algorithm.Replace("vector-", "").ToUpper(); + var row = result.IndexName.PadRight(24) + + algoDisplay.PadRight(14) + + result.Metric.PadRight(8) + + $"{result.LatencyMs}ms".PadRight(10) + + $"{topResult}{topScore}"; + Console.WriteLine(row); + } + + Console.WriteLine(new string('-', 78)); + Console.WriteLine(); + + // Summary stats + var fastest = results.MinBy(r => r.LatencyMs)!; + var slowest = results.MaxBy(r => r.LatencyMs)!; + Console.WriteLine($" Fastest: {fastest.IndexName} ({fastest.LatencyMs}ms)"); + Console.WriteLine($" Slowest: {slowest.IndexName} ({slowest.LatencyMs}ms)"); + Console.WriteLine(); + + if (verbose) + { + Console.WriteLine(" DETAILED RESULTS:"); + Console.WriteLine(); + foreach (var result in results) + { + Console.WriteLine($" [{result.IndexName}]"); + for (var i = 0; i < result.Results.Count; i++) + { + var doc = result.Results[i]; + var name = doc.Contains("HotelName") ? doc["HotelName"].AsString : "Unknown"; + var score = doc.Contains("score") ? doc["score"].ToDouble() : 0.0; + Console.WriteLine($" {i + 1}. 
{name} (score: {score:F4})"); + } + Console.WriteLine(); + } + } + } +} diff --git a/ai/select-algorithm-dotnet/Models/Configuration.cs b/ai/select-algorithm-dotnet/Models/Configuration.cs new file mode 100644 index 0000000..cd223d0 --- /dev/null +++ b/ai/select-algorithm-dotnet/Models/Configuration.cs @@ -0,0 +1,40 @@ +namespace SelectAlgorithm.Models; + +public class AppConfiguration +{ + public AzureOpenAIConfiguration AzureOpenAI { get; set; } = new(); + public DocumentDBConfiguration DocumentDB { get; set; } = new(); + public EmbeddingConfiguration Embedding { get; set; } = new(); + public VectorSearchConfiguration VectorSearch { get; set; } = new(); + public DataFilesConfiguration DataFiles { get; set; } = new(); +} + +public class AzureOpenAIConfiguration +{ + public string Endpoint { get; set; } = string.Empty; + public string EmbeddingModel { get; set; } = "text-embedding-3-small"; +} + +public class DocumentDBConfiguration +{ + public string ClusterName { get; set; } = string.Empty; + public string DatabaseName { get; set; } = "Hotels"; + public int LoadBatchSize { get; set; } = 100; +} + +public class EmbeddingConfiguration +{ + public string EmbeddedField { get; set; } = "DescriptionVector"; + public int Dimensions { get; set; } = 1536; +} + +public class VectorSearchConfiguration +{ + public string Query { get; set; } = "luxury hotel near the beach"; + public int TopK { get; set; } = 3; +} + +public class DataFilesConfiguration +{ + public string WithVectors { get; set; } = "./data/Hotels_Vector.json"; +} diff --git a/ai/select-algorithm-dotnet/Models/HotelData.cs b/ai/select-algorithm-dotnet/Models/HotelData.cs new file mode 100644 index 0000000..4821ee3 --- /dev/null +++ b/ai/select-algorithm-dotnet/Models/HotelData.cs @@ -0,0 +1,19 @@ +using MongoDB.Bson; +using MongoDB.Bson.Serialization.Attributes; + +namespace SelectAlgorithm.Models; + +public class HotelData +{ + [BsonId] + [BsonRepresentation(BsonType.ObjectId)] + public string? 
Id { get; set; } + + public string HotelId { get; set; } = string.Empty; + public string HotelName { get; set; } = string.Empty; + public string Description { get; set; } = string.Empty; + public string Category { get; set; } = string.Empty; + + [BsonExtraElements] + public BsonDocument? ExtraElements { get; set; } +} diff --git a/ai/select-algorithm-dotnet/Program.cs b/ai/select-algorithm-dotnet/Program.cs new file mode 100644 index 0000000..f0e7a04 --- /dev/null +++ b/ai/select-algorithm-dotnet/Program.cs @@ -0,0 +1,49 @@ +using Microsoft.Extensions.Configuration; +using SelectAlgorithm.Models; + +namespace SelectAlgorithm; + +class Program +{ + static void Main(string[] args) + { + Console.WriteLine(); + Console.WriteLine("Select Algorithm Demo - Azure DocumentDB Vector Search (.NET)"); + Console.WriteLine(new string('-', 60)); + Console.WriteLine(); + + var configuration = new ConfigurationBuilder() + .SetBasePath(Directory.GetCurrentDirectory()) + .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true) + .AddEnvironmentVariables() + .Build(); + + var appConfig = new AppConfiguration(); + configuration.Bind(appConfig); + + var command = args.Length > 0 ? 
args[0].ToLower() : "compare-all"; + + switch (command) + { + case "ivf": + AlgorithmRunner.RunSingleAlgorithm(appConfig, "ivf"); + break; + case "hnsw": + AlgorithmRunner.RunSingleAlgorithm(appConfig, "hnsw"); + break; + case "diskann": + AlgorithmRunner.RunSingleAlgorithm(appConfig, "diskann"); + break; + case "compare-all": + CompareAll.Run(appConfig); + break; + default: + Console.WriteLine($"Unknown command: {command}"); + Console.WriteLine("Usage: dotnet run -- [ivf|hnsw|diskann|compare-all]"); + return; + } + + Console.WriteLine(); + Console.WriteLine("Done!"); + } +} diff --git a/ai/select-algorithm-dotnet/README.md b/ai/select-algorithm-dotnet/README.md new file mode 100644 index 0000000..c56154b --- /dev/null +++ b/ai/select-algorithm-dotnet/README.md @@ -0,0 +1,159 @@ +# Select Algorithm - .NET (C#) + +Demonstrates three vector index algorithms available in Azure DocumentDB (vCore): + +| Algorithm | Best For | Cluster Tier | Key Parameters | +|-----------|----------|--------------|----------------| +| **IVF** | < 10,000 documents | M10+ | `numLists` | +| **HNSW** | 10,000–50,000 documents | M30+ | `m`, `efConstruction` | +| **DiskANN** | 50,000+ documents | M40+ | `maxDegree`, `lBuild` | + +## Prerequisites + +- [.NET 8 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) +- Azure DocumentDB (vCore) cluster +- Azure OpenAI resource with an embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. **Configure environment:** + + The .NET sample uses `appsettings.json` for configuration. After deploying with `azd up`, you can export values: + + ```bash + azd env get-values + ``` + + Then update `appsettings.json` with your Azure resource values. + +2. 
Edit `appsettings.json` with your configuration: + + ```json + { + "AzureOpenAI": { + "EmbeddingModel": "text-embedding-3-small", + "EmbeddingEndpoint": "https://.openai.azure.com" + }, + "DocumentDB": { + "ClusterName": "", + "DatabaseName": "Hotels" + }, + "Algorithm": "all", + "Similarity": "COS" + } + ``` + +3. Copy the data file: + + Copy `Hotels_Vector.json` from the repository's `ai/data/` folder into this project's `data/` folder: + + ```bash + cp ../../data/Hotels_Vector.json ./data/ + ``` + +4. Restore packages: + + ```bash + dotnet restore + ``` + +## Usage + +Run all 9 combinations (default): + +```bash +dotnet run +``` + +Run a specific algorithm: + +```bash +dotnet run -- ivf +dotnet run -- hnsw +dotnet run -- diskann +``` + +## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation with a formatted comparison table: + +```bash +dotnet run -- compare-all +``` + +This mode: +- Uses a **single collection** (`hotels`) +- Generates **one embedding** for the query, reused across all searches +- For each of 9 algorithm/metric combinations: creates the index → searches → drops the index +- DocumentDB only allows one vector index per kind per field, so indexes are created sequentially +- Prints a formatted comparison table with scores, top results, and key insights + +**Additional environment variables for compare mode:** + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `3` | Number of results per search | +| `VERBOSE` | `false` | Show detailed per-result output | + +**Output:** +``` +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +Algorithm Similarity #1 Result #1 Score #2 Result #2 Score Diff 
+---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. +==================================================================================================== +``` + +## Project Structure + +``` +select-algorithm-dotnet/ +├── .devcontainer/ +│ └── devcontainer.json # Dev container configuration +├── Models/ +│ ├── Configuration.cs # App configuration model +│ └── HotelData.cs # Hotel document model +├── Utilities/ +│ └── AzureIdentityTokenHandler.cs # OIDC token handler +├── output/ +│ └── compare_all.txt # Sample comparison output +├── AlgorithmRunner.cs # Per-algorithm index + search runner +├── appsettings.json # Configuration file +├── CompareAll.cs # Unified 9-combination comparison runner +├── Program.cs # Entry point - dispatches by ALGORITHM setting +├── README.md # This file +├── SelectAlgorithm.csproj # Project file +└── Utils.cs # Shared helpers (connection, embedding, search) +``` + +## How It Works + +1. **Connect** to DocumentDB using Microsoft Entra ID (OIDC) passwordless authentication +2. **Load** hotel documents with pre-computed embeddings from `Hotels_Vector.json` +3. **Create** a vector index using the selected algorithm +4. 
**Search** using a natural language query converted to an embedding via Azure OpenAI +5. **Display** ranked results with similarity scores + +## Authentication + +This sample uses `DefaultAzureCredential` for both: +- **DocumentDB**: OIDC-based MongoDB authentication +- **Azure OpenAI**: Token-based authentication with `https://cognitiveservices.azure.com/.default` scope + +Ensure you are logged in with `az login` and have appropriate RBAC roles assigned. diff --git a/ai/select-algorithm-dotnet/SelectAlgorithm.csproj b/ai/select-algorithm-dotnet/SelectAlgorithm.csproj new file mode 100644 index 0000000..331e522 --- /dev/null +++ b/ai/select-algorithm-dotnet/SelectAlgorithm.csproj @@ -0,0 +1,23 @@ + + + Exe + net8.0 + enable + enable + SelectAlgorithm + + + + + + + + + + + + + PreserveNewest + + + diff --git a/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs b/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs new file mode 100644 index 0000000..eca94fd --- /dev/null +++ b/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs @@ -0,0 +1,32 @@ +using Azure.Core; +using MongoDB.Driver.Authentication.Oidc; + +namespace SelectAlgorithm.Utilities; + +internal sealed class AzureIdentityTokenHandler( + TokenCredential credential, + string? 
tenantId +) : IOidcCallback +{ + private readonly string[] scopes = ["https://ossrdbms-aad.database.windows.net/.default"]; + + public OidcAccessToken GetOidcAccessToken(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + AccessToken token = credential.GetToken( + new TokenRequestContext(scopes, tenantId: tenantId), + cancellationToken + ); + + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } + + public async Task GetOidcAccessTokenAsync(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + AccessToken token = await credential.GetTokenAsync( + new TokenRequestContext(scopes, parentRequestId: null, tenantId: tenantId), + cancellationToken + ); + + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } +} diff --git a/ai/select-algorithm-dotnet/Utils.cs b/ai/select-algorithm-dotnet/Utils.cs new file mode 100644 index 0000000..02a187e --- /dev/null +++ b/ai/select-algorithm-dotnet/Utils.cs @@ -0,0 +1,190 @@ +using MongoDB.Driver; +using MongoDB.Driver.Authentication.Oidc; +using MongoDB.Bson; +using MongoDB.Bson.Serialization; +using Azure.Identity; +using Azure.Core; +using Azure.AI.OpenAI; +using OpenAI.Embeddings; +using SelectAlgorithm.Models; + +namespace SelectAlgorithm; + +public class AzureOidcCallback : IOidcCallback +{ + private readonly DefaultAzureCredential _credential; + private static readonly string[] Scopes = { "https://ossrdbms-aad.database.windows.net/.default" }; + + public AzureOidcCallback(DefaultAzureCredential credential) => _credential = credential; + + public OidcAccessToken GetOidcAccessToken(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + var token = _credential.GetToken(new TokenRequestContext(Scopes), cancellationToken); + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } + + public async Task GetOidcAccessTokenAsync(OidcCallbackParameters parameters, 
CancellationToken cancellationToken) + { + var token = await _credential.GetTokenAsync(new TokenRequestContext(Scopes), cancellationToken); + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } +} + +public static class Utils +{ + public static IMongoClient GetMongoClientPasswordless(AppConfiguration config) + { + var clusterName = config.DocumentDB.ClusterName; + if (string.IsNullOrEmpty(clusterName)) + throw new InvalidOperationException("DocumentDB:ClusterName is required in appsettings.json"); + + var credential = new DefaultAzureCredential(); + + var connectionString = $"mongodb+srv://{clusterName}.global.mongocluster.cosmos.azure.com/"; + var settings = MongoClientSettings.FromConnectionString(connectionString); + settings.ConnectTimeout = TimeSpan.FromSeconds(120); + settings.UseTls = true; + settings.RetryWrites = true; + + // Custom OIDC callback using DefaultAzureCredential + // Chains through CLI, managed identity, etc. + var oidcCallback = new AzureOidcCallback(credential); + settings.Credential = MongoCredential.CreateOidcCredential(oidcCallback, null); + + return new MongoClient(settings); + } + + public static EmbeddingClient GetEmbeddingClient(AppConfiguration config) + { + var endpoint = config.AzureOpenAI.Endpoint; + if (string.IsNullOrEmpty(endpoint)) + throw new InvalidOperationException("AzureOpenAI:Endpoint is required in appsettings.json"); + + var model = config.AzureOpenAI.EmbeddingModel; + + var credential = new DefaultAzureCredential(); + var azureClient = new AzureOpenAIClient(new Uri(endpoint), credential); + return azureClient.GetEmbeddingClient(model); + } + + public static List ReadJsonFile(string path) + { + if (!File.Exists(path)) + throw new FileNotFoundException($"Data file not found: {path}"); + + var json = File.ReadAllText(path); + return BsonSerializer.Deserialize>(json); + } + + public static void InsertData(IMongoCollection collection, List data, int batchSize) + { + var totalDocuments = 
data.Count; + var existingCount = collection.CountDocuments(new BsonDocument()); + + if (existingCount >= totalDocuments) + { + Console.WriteLine($"Collection already has {existingCount} documents, skipping insert"); + return; + } + + if (existingCount > 0) + { + collection.DeleteMany(new BsonDocument()); + } + + var insertedCount = 0; + for (var i = 0; i < totalDocuments; i += batchSize) + { + var batch = data.Skip(i).Take(batchSize).ToList(); + try + { + collection.InsertMany(batch, new InsertManyOptions { IsOrdered = false }); + insertedCount += batch.Count; + } + catch (MongoBulkWriteException) + { + // Some documents may have been inserted before the error + insertedCount += batch.Count; + } + Thread.Sleep(100); + } + + Console.WriteLine($"Inserted {insertedCount}/{totalDocuments} documents"); + } + + public static void DropVectorIndexes(IMongoCollection collection, string vectorField) + { + try + { + using var cursor = collection.Indexes.List(); + var indexes = cursor.ToList(); + foreach (var index in indexes) + { + if (index.Contains("key")) + { + var key = index["key"].AsBsonDocument; + if (key.Contains(vectorField) && key[vectorField].AsString == "cosmosSearch") + { + var indexName = index["name"].AsString; + collection.Indexes.DropOne(indexName); + Console.WriteLine($"Dropped existing vector index: {indexName}"); + } + } + } + } + catch (Exception ex) + { + Console.WriteLine($"Warning: Error dropping indexes: {ex.Message}"); + } + } + + public static List PerformVectorSearch( + IMongoCollection collection, + EmbeddingClient client, + string query, + string vectorField, + string model, + int topK = 5) + { + var embeddingResult = client.GenerateEmbedding(query); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + })), + new 
BsonDocument("$project", new BsonDocument + { + { "document", "$$ROOT" }, + { "score", new BsonDocument("$meta", "searchScore") } + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + public static void PrintSearchResults(List results, string algorithm) + { + Console.WriteLine(); + Console.WriteLine(new string('=', 60)); + Console.WriteLine($" {algorithm} Search Results ({results.Count} found)"); + Console.WriteLine(new string('=', 60)); + + for (var i = 0; i < results.Count; i++) + { + var result = results[i]; + var doc = result.Contains("document") ? result["document"].AsBsonDocument : result; + var name = doc.Contains("HotelName") ? doc["HotelName"].AsString + : doc.Contains("name") ? doc["name"].AsString + : "Unknown"; + var score = result.Contains("score") ? result["score"].ToDouble() : 0.0; + Console.WriteLine($" {i + 1}. {name} (score: {score:F4})"); + } + + Console.WriteLine(); + } +} diff --git a/ai/select-algorithm-dotnet/appsettings.json b/ai/select-algorithm-dotnet/appsettings.json new file mode 100644 index 0000000..015fc8f --- /dev/null +++ b/ai/select-algorithm-dotnet/appsettings.json @@ -0,0 +1,23 @@ +{ + "AzureOpenAI": { + "Endpoint": "https://.openai.azure.com/", + "EmbeddingModel": "text-embedding-3-small" + }, + "DocumentDB": { + "ClusterName": "", + "DatabaseName": "Hotels", + "LoadBatchSize": 100 + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536 + }, + "VectorSearch": { + "Query": "quintessential lodging near running trails, eateries, retail", + "Similarity": "COS", + "TopK": 5 + }, + "DataFiles": { + "WithVectors": "./data/Hotels_Vector.json" + } +} diff --git a/ai/select-algorithm-dotnet/data/README.md b/ai/select-algorithm-dotnet/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-dotnet/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. 
+ +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-dotnet/output/compare_all.txt b/ai/select-algorithm-dotnet/output/compare_all.txt new file mode 100644 index 0000000..8cbd861 --- /dev/null +++ b/ai/select-algorithm-dotnet/output/compare_all.txt @@ -0,0 +1,45 @@ +Select Algorithm Demo - Azure DocumentDB Vector Search (.NET) +------------------------------------------------------------ +============================================================ + Compare All Algorithms x Metrics + 9 combinations: IVF, HNSW, DiskANN x COS, L2, IP +============================================================ +Dropped existing 'hotels' collection (if any) + +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Query: "luxury hotel near the beach" +Top K: 3 +Embedding generated (reused for all searches) + +Running 9 algorithm x metric combinations... + vector_ivf_cos created + vector_hnsw_cos created + vector_diskann_cos created + vector_ivf_l2 created + vector_hnsw_l2 created + vector_diskann_l2 created + vector_ivf_ip created + vector_hnsw_ip created + vector_diskann_ip created + +============================================================================== + COMPARISON RESULTS +============================================================================== +Index Name Algorithm Metric Latency Top Result +------------------------------------------------------------------------------ +vector_ivf_cos IVF COS 77ms Ocean Water Resort... (0.618) +vector_hnsw_cos HNSW COS 71ms Ocean Water Resort... (0.618) +vector_diskann_cos DISKANN COS 70ms Ocean Water Resort... (0.618) +vector_ivf_l2 IVF L2 70ms Ocean Water Resort... (0.874) +vector_hnsw_l2 HNSW L2 69ms Ocean Water Resort... (0.874) +vector_diskann_l2 DISKANN L2 76ms Ocean Water Resort... (0.874) +vector_ivf_ip IVF IP 69ms Ocean Water Resort... (0.618) +vector_hnsw_ip HNSW IP 69ms Ocean Water Resort... (0.618) +vector_diskann_ip DISKANN IP 70ms Ocean Water Resort... 
(0.618) +------------------------------------------------------------------------------ + Fastest: vector_hnsw_l2 (69ms) + Slowest: vector_ivf_cos (77ms) +Cleanup: dropped collection 'hotels' +Done! \ No newline at end of file diff --git a/ai/select-algorithm-go/.env.example b/ai/select-algorithm-go/.env.example new file mode 100644 index 0000000..3e6f3c1 --- /dev/null +++ b/ai/select-algorithm-go/.env.example @@ -0,0 +1,43 @@ +# DocumentDB Configuration +# Name of the DocumentDB cluster (used for passwordless OIDC authentication) +MONGO_CLUSTER_NAME=your-cluster-name + +# Azure OpenAI Embedding Configuration +# Azure OpenAI service endpoint URL +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + +# Azure OpenAI embedding model name +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + +# Azure OpenAI API version for embeddings +AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 + +# Database name in DocumentDB +AZURE_DOCUMENTDB_DATABASENAME=Hotels + +# Path to JSON file with generated vector embeddings +DATA_FILE_WITH_VECTORS=./data/Hotels_Vector.json + +# Name of the field where embeddings are stored +EMBEDDED_FIELD=DescriptionVector + +# Number of dimensions in the embedding vectors (1536 for text-embedding-3-small) +EMBEDDING_DIMENSIONS=1536 + +# Number of records to load per batch during data insertion +LOAD_SIZE_BATCH=100 + +# Algorithm to run: "all", "ivf", "hnsw", or "diskann" +ALGORITHM=all + +# Vector similarity metric: "COS" (cosine), "L2" (Euclidean), or "IP" (inner product) +SIMILARITY=COS + +# Notes: +# 1. Replace all placeholder values with your actual Azure resource information +# 2. For production, use Azure Key Vault or environment variables instead of storing keys in files +# 3. The EMBEDDING_DIMENSIONS must match your chosen embedding model: +# - text-embedding-3-small: 1536 dimensions +# - text-embedding-3-large: 3072 dimensions +# 4. Adjust batch sizes based on your API rate limits and performance requirements +# 5. 
For passwordless authentication, ensure your Azure identity has appropriate RBAC permissions diff --git a/ai/select-algorithm-go/.gitignore b/ai/select-algorithm-go/.gitignore new file mode 100644 index 0000000..cbdc8a2 --- /dev/null +++ b/ai/select-algorithm-go/.gitignore @@ -0,0 +1,6 @@ +*.exe +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-go/README.md b/ai/select-algorithm-go/README.md new file mode 100644 index 0000000..ab05f6f --- /dev/null +++ b/ai/select-algorithm-go/README.md @@ -0,0 +1,227 @@ +# DocumentDB Vector Search - Go Algorithm Comparison Sample + +This sample demonstrates how to compare different vector search algorithms (IVF, HNSW, DiskANN) and similarity metrics (Cosine, L2, Inner Product) with Azure Cosmos DB for MongoDB (DocumentDB). + +## Prerequisites + +- [Go 1.24+](https://golang.org/dl/) +- [Azure DocumentDB (vCore) cluster](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/) (M40+ tier for DiskANN) +- [Azure OpenAI resource](https://learn.microsoft.com/azure/ai-services/openai/) with an embedding model deployed +- [Azure CLI](https://learn.microsoft.com/cli/azure/) (for passwordless authentication) +- Pre-generated embeddings file (`Hotels_Vector.json`) — see the `vector-search-go` sample + +## Setup + +1. **Clone the repository** and navigate to this directory: + + ```bash + cd ai/select-algorithm-go + ``` + +2. 
**Configure environment variables:**
+
+   After deploying with `azd up`, create a `.env` file with your provisioned resource values:
+
+   ```bash
+   azd env get-values > .env
+   ```
+
+   Alternatively, copy the example and fill in values manually:
+
+   ```bash
+   cp .env.example .env
+   ```
+
+   Required variables:
+   ```env
+   MONGO_CLUSTER_NAME=your-cluster-name
+   AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com
+   AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+   AZURE_DOCUMENTDB_DATABASENAME=Hotels
+   DATA_FILE_WITH_VECTORS=./data/Hotels_Vector.json
+   EMBEDDED_FIELD=DescriptionVector
+   EMBEDDING_DIMENSIONS=1536
+   ```
+
+3. **Copy the data file:**
+
+   Copy `Hotels_Vector.json` from the repository's `ai/data/` folder into this project's `data/` folder:
+
+   ```bash
+   cp ../data/Hotels_Vector.json ./data/
+   ```
+
+4. **Install dependencies**:
+
+   ```bash
+   go mod download
+   ```
+
+5. **Sign in to Azure** (for passwordless authentication):
+
+   ```bash
+   az login
+   ```
+
+## Usage
+
+### Compare All Algorithms
+
+Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single execution:
+
+```bash
+go run ./src/...
+```
+
+This creates indexes sequentially (create/search/drop per combo — DocumentDB allows one vector index per kind per field) and prints a comparison table showing scores and top results.
+
+**Output:**
+```
+======================================================================
+  COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations)
+======================================================================
+  ... 
+==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +ALGORITHM SIMILARITY #1 RESULT #1 SCORE #2 RESULT #2 SCORE DIFF +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. +==================================================================================================== +``` + +### Run Individual Algorithms + +Test a specific algorithm with cosine similarity: + +```bash +# IVF (Inverted File) — clustering-based, works on all tiers +go run src/ivf.go src/utils.go + +# HNSW (Hierarchical Navigable Small World) — graph-based, higher recall +go run src/hnsw.go src/utils.go + +# DiskANN — disk-optimized, best for large datasets (requires M40+ tier) +go run src/diskann.go src/utils.go +``` + +### On Windows (PowerShell) + +```powershell +go run ./src/... 
+go run src/ivf.go src/utils.go
+go run src/hnsw.go src/utils.go
+go run src/diskann.go src/utils.go
+```
+
+## Environment Variables
+
+| Variable | Default | Description |
+|--------------|----------------------------------|---------------------------------|
+| `MONGO_CLUSTER_NAME` | *(required)* | DocumentDB cluster name |
+| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | *(required)* | Azure OpenAI endpoint |
+| `AZURE_OPENAI_EMBEDDING_MODEL` | `text-embedding-3-small` | Embedding model name |
+| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Database name |
+| `DATA_FILE_WITH_VECTORS` | `./data/Hotels_Vector.json` | Path to data file |
+| `EMBEDDED_FIELD` | `DescriptionVector` | Field containing embeddings |
+| `EMBEDDING_DIMENSIONS` | `1536` | Embedding vector dimensions |
+| `QUERY_TEXT` | `luxury hotel near the beach` | Search query |
+| `TOP_K` | `3` | Number of results to return |
+| `VERBOSE` | `false` | Show full results (compare_all only) |
+
+## How It Works
+
+### Comparison Mode (`compare_all.go`)
+
+1. **Data Loading:** Loads hotel data with pre-generated embeddings
+2. **Index Creation:** Creates vector indexes sequentially (one at a time):
+   - For each algorithm (IVF, HNSW, DiskANN) × each metric (COS, L2, IP):
+   - Create the index → wait for readiness → search → drop the index
+   - DocumentDB only allows one vector index per kind per field
+3. **Query Execution:** Generates embedding once, reuses for all 9 searches
+4. 
**Result Comparison:** Prints formatted table with #1/#2 results, scores, and diff + +### Individual Mode (`ivf.go`, `hnsw.go`, `diskann.go`) + +Each file demonstrates a single algorithm with cosine similarity: +- Creates a dedicated collection for that algorithm +- Creates the appropriate vector index +- Performs a search and displays results +- Cleans up the collection on exit + +## Index Parameters + +| Algorithm | Kind | Key Parameters | Values Used | +|-----------|-----------------|-----------------------------|-----------------------------| +| IVF | `vector-ivf` | `numLists` | 1 (optimized for small datasets) | +| HNSW | `vector-hnsw` | `m`, `efConstruction` | 16, 64 | +| DiskANN | `vector-diskann`| `maxDegree`, `lBuild` | 32, 50 | + +## Project Structure + +``` +select-algorithm-go/ +├── .env.example # Environment variable template +├── go.mod # Go module dependencies +├── go.sum # Go module checksums +├── output/ # Sample output files +├── README.md # This file +└── src/ + ├── utils.go # Shared config, auth, data, and search helpers + ├── compare_all.go # Unified 9-combination comparison runner (create/search/drop) + ├── ivf.go # IVF algorithm demonstration + ├── hnsw.go # HNSW algorithm demonstration + └── diskann.go # DiskANN algorithm demonstration +``` + +## Authentication + +This sample uses **passwordless (OIDC) authentication** with `DefaultAzureCredential`. Ensure your Azure identity has: + +- **DocumentDB**: Appropriate RBAC role on the cluster +- **Azure OpenAI**: `Cognitive Services OpenAI User` role on the OpenAI resource + +The MongoDB OIDC auth uses the `https://ossrdbms-aad.database.windows.net/.default` scope, and the OpenAI client uses Azure token credentials. 
+ +## Important Notes + +- **COS/IP scores:** Higher = more similar (0–1 range) +- **L2 scores:** Lower = more similar (distance metric) +- **Sequential indexing:** DocumentDB requires create/search/drop per combo (one vector index per kind per field) +- **Cleanup:** All samples automatically drop their collections on exit +- **Collection strategy:** `compare_all.go` uses a single collection with sequential index rotation; individual runners use separate collections +- **bson.D ordering:** All MongoDB commands use `bson.D` (ordered) instead of `bson.M` (unordered) to avoid "multi-key map" errors + +## Troubleshooting + +**"OIDC authentication failed"** +- Run `az login` and ensure you're authenticated +- Verify your Azure identity has RBAC permissions on the DocumentDB cluster +- Check that `MONGO_CLUSTER_NAME` matches your cluster name + +**"DiskANN indexes require a higher cluster tier"** +- DiskANN requires M40+ cluster tier +- Try IVF or HNSW instead, or upgrade your cluster + +**"No documents found with embeddings"** +- Ensure `DATA_FILE_WITH_VECTORS` points to the correct file +- Verify the file contains the field specified in `EMBEDDED_FIELD` +- Check that embeddings were generated with the correct dimensions + +## Learn More + +- [Azure Cosmos DB for MongoDB Documentation](https://learn.microsoft.com/azure/cosmos-db/mongodb/) +- [Vector Search in DocumentDB](https://learn.microsoft.com/azure/cosmos-db/mongodb/vector-search) +- [Choosing a Vector Index Algorithm](https://learn.microsoft.com/azure/cosmos-db/mongodb/vector-search-algorithms) +- [Go MongoDB driver](https://pkg.go.dev/go.mongodb.org/mongo-driver) diff --git a/ai/select-algorithm-go/data/README.md b/ai/select-algorithm-go/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-go/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. 
+ +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-go/go.mod b/ai/select-algorithm-go/go.mod new file mode 100644 index 0000000..53e0b34 --- /dev/null +++ b/ai/select-algorithm-go/go.mod @@ -0,0 +1,36 @@ +module documentdb-select-algorithm + +go 1.24.0 + +require ( + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 + github.com/joho/godotenv v1.5.1 + github.com/openai/openai-go/v3 v3.12.0 + go.mongodb.org/mongo-driver v1.17.6 +) + +require ( + github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect + github.com/golang-jwt/jwt/v5 v5.3.0 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect + github.com/montanaflynn/stats v0.7.1 // indirect + github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect + github.com/tidwall/gjson v1.18.0 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.1 // indirect + github.com/tidwall/sjson v1.2.5 // indirect + github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.1.2 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect +) diff --git a/ai/select-algorithm-go/go.sum b/ai/select-algorithm-go/go.sum new file mode 100644 index 0000000..5ff90f3 --- /dev/null +++ b/ai/select-algorithm-go/go.sum @@ -0,0 +1,97 @@ +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc= 
+github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 
h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= +github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= +github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= +github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= +github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= +github.com/openai/openai-go/v3 v3.12.0 h1:NkrImaglFQeDycc/n/fEmpFV8kKr8snl9/8X2x4eHOg= +github.com/openai/openai-go/v3 v3.12.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod 
h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.mongodb.org/mongo-driver v1.17.6 h1:87JUG1wZfWsr6rIz3ZmpH90rL5tea7O3IHuSwHUpsss= +go.mongodb.org/mongo-driver v1.17.6/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod 
h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/ai/select-algorithm-go/output/compare_all.txt b/ai/select-algorithm-go/output/compare_all.txt new file mode 100644 index 0000000..bfb5406 --- /dev/null +++ b/ai/select-algorithm-go/output/compare_all.txt @@ -0,0 +1,70 @@ +DocumentDB Select Algorithm - Go Sample +======================================== +Database: Hotels +Dimensions: 1536 + +Initializing MongoDB and Azure OpenAI clients... +Attempting OIDC authentication... +OIDC authentication successful! + +====================================================================== + COMPARE ALL: 3 Algorithms x 3 Similarity Metrics (9 combinations) +====================================================================== +Query: "luxury hotel near the beach" +Top-K: 3 +Verbose: false + +Dropped existing 'hotels' collection + +Loading data from ../data/Hotels_Vector.json... 
+Loaded 50 documents with embeddings +Preparing collection 'hotels'... +Starting batch insertion of 50 documents... +Batch 1 completed: 50 documents inserted +Insertion completed: 50 inserted, 0 failed + +Generating embedding for query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running 9 vector index comparisons (create->search->drop)... + vector_ivf_cos created + vector_ivf_cos (214ms) + vector_hnsw_cos created + vector_hnsw_cos (111ms) + vector_diskann_cos created + vector_diskann_cos (107ms) + vector_ivf_l2 created + vector_ivf_l2 (103ms) + vector_hnsw_l2 created + vector_hnsw_l2 (103ms) + vector_diskann_l2 created + vector_diskann_l2 (103ms) + vector_ivf_ip created + vector_ivf_ip (102ms) + vector_hnsw_ip created + vector_hnsw_ip (104ms) + vector_diskann_ip created + vector_diskann_ip (104ms) + +====================================================================== + COMPARISON RESULTS +====================================================================== + ALGORITHM METRIC LATENCY TOP SCORE RESULTS STATUS + --------- ------ ------- --------- ------- ------ + IVF COS 214ms 0.6184 3 OK + HNSW COS 111ms 0.6184 3 OK + DiskANN COS 107ms 0.6184 3 OK + IVF L2 103ms 0.8736 3 OK + HNSW L2 103ms 0.8736 3 OK + DiskANN L2 103ms 0.8736 3 OK + IVF IP 102ms 0.6184 3 OK + HNSW IP 104ms 0.6184 3 OK + DiskANN IP 104ms 0.6184 3 OK + +Fastest: IVF/IP (102ms) +Highest score: IVF/L2 (0.8736) + +Cleanup: dropping comparison collection... +Cleanup: dropped collection 'hotels' + +Done! 
\ No newline at end of file diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go new file mode 100644 index 0000000..85f8ddd --- /dev/null +++ b/ai/select-algorithm-go/src/compare_all.go @@ -0,0 +1,379 @@ +package main + +import ( + "context" + "fmt" + "os" + "strconv" + "strings" + "text/tabwriter" + "time" + + "github.com/openai/openai-go/v3" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" +) + +// CompareResult holds the result of a single algorithm+metric search +type CompareResult struct { + Algorithm string + Metric string + IndexName string + Latency time.Duration + Results []SearchResult + TopScore float64 + Error error +} + +// indexSpec defines one of the 9 combinations +type indexSpec struct { + Algorithm string + Kind string + Metric string + IndexName string + Options bson.D +} + +// RunCompareAll executes all 9 algorithm×metric combinations on a single collection +func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { + queryText := getEnvOrDefault("QUERY_TEXT", "luxury hotel near the beach") + topK, _ := strconv.Atoi(getEnvOrDefault("TOP_K", "3")) + verbose := strings.ToLower(getEnvOrDefault("VERBOSE", "false")) == "true" + + fmt.Println("\n" + strings.Repeat("=", 70)) + fmt.Println(" COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations)") + fmt.Println(strings.Repeat("=", 70)) + fmt.Printf("Query: %q\n", queryText) + fmt.Printf("Top-K: %d\n", topK) + fmt.Printf("Verbose: %v\n", verbose) + + // 1. 
Drop collection for clean comparison, then load data + database := dbClient.Database(config.DatabaseName) + collection := database.Collection("hotels") + + // Drop existing collection for a clean comparison + if err := collection.Drop(ctx); err != nil { + fmt.Printf("Note: could not drop collection (may not exist): %v\n", err) + } else { + fmt.Println("Dropped existing 'hotels' collection") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("\nCleanup: dropping comparison collection...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels'") + } + }() + + fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + return fmt.Errorf("failed to load data: %v", err) + } + + documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) + if len(documentsWithEmbeddings) == 0 { + return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField) + } + fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) + + stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + return err + } + fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) + + // 2. Generate ONE embedding for the query (reused for all 9 searches) + fmt.Printf("\nGenerating embedding for query: %q\n", queryText) + queryEmbedding, err := GenerateEmbedding(ctx, aiClient, queryText, config.ModelName) + if err != nil { + return fmt.Errorf("failed to generate query embedding: %v", err) + } + fmt.Printf("Embedding generated (%d dimensions)\n", len(queryEmbedding)) + + // 3. Define all 9 index specs + metrics := []string{"COS", "L2", "IP"} + specs := buildIndexSpecs(config.VectorField, config.Dimensions, metrics) + + // 4. 
Create→search→drop each index sequentially (DocumentDB only allows one vector index per field) + fmt.Printf("\nRunning %d vector index comparisons (create→search→drop)...\n", len(specs)) + var results []CompareResult + + for _, spec := range specs { + // Drop all existing vector indexes on this field + DropVectorIndexes(ctx, collection, config.VectorField) + + // Create this specific index with retry (drop may still be in progress) + var createErr error + for attempt := 0; attempt < 3; attempt++ { + if attempt > 0 { + time.Sleep(3 * time.Second) + } + createErr = createNamedVectorIndex(ctx, collection, config.VectorField, spec) + if createErr == nil { + break + } + } + if createErr != nil { + results = append(results, CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + IndexName: spec.IndexName, + Error: createErr, + }) + fmt.Printf(" ⚠ %s: %v\n", spec.IndexName, createErr) + continue + } + fmt.Printf(" ✓ %s created\n", spec.IndexName) + + // Wait for index to become ready + time.Sleep(10 * time.Second) + + // Search using simple cosmosSearch (with retry for index readiness) + var searchResults []SearchResult + var searchErr error + var latency time.Duration + for searchAttempt := 0; searchAttempt < 3; searchAttempt++ { + if searchAttempt > 0 { + time.Sleep(5 * time.Second) + } + start := time.Now() + searchResults, searchErr = vectorSearchSimple(ctx, collection, queryEmbedding, config.VectorField, topK) + latency = time.Since(start) + if searchErr == nil && len(searchResults) > 0 { + break + } + } + + cr := CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + IndexName: spec.IndexName, + Latency: latency, + Results: searchResults, + Error: searchErr, + } + if len(searchResults) > 0 { + cr.TopScore = searchResults[0].Score + } + results = append(results, cr) + + status := "✓" + if searchErr != nil { + status = "✗" + } + fmt.Printf(" %s %s (%v)\n", status, spec.IndexName, latency.Round(time.Millisecond)) + } + + // 6. 
Print comparison table + fmt.Println() + printComparisonTable(results, verbose) + + return nil +} + +// buildIndexSpecs creates the 9 index specifications +func buildIndexSpecs(vectorField string, dimensions int, metrics []string) []indexSpec { + var specs []indexSpec + + for _, metric := range metrics { + metricLower := strings.ToLower(metric) + + // IVF + specs = append(specs, indexSpec{ + Algorithm: "IVF", + Kind: "vector-ivf", + Metric: metric, + IndexName: fmt.Sprintf("vector_ivf_%s", metricLower), + Options: bson.D{ + {"kind", "vector-ivf"}, + {"dimensions", dimensions}, + {"similarity", metric}, + {"numLists", 1}, + }, + }) + + // HNSW + specs = append(specs, indexSpec{ + Algorithm: "HNSW", + Kind: "vector-hnsw", + Metric: metric, + IndexName: fmt.Sprintf("vector_hnsw_%s", metricLower), + Options: bson.D{ + {"kind", "vector-hnsw"}, + {"dimensions", dimensions}, + {"similarity", metric}, + {"m", 16}, + {"efConstruction", 64}, + }, + }) + + // DiskANN + specs = append(specs, indexSpec{ + Algorithm: "DiskANN", + Kind: "vector-diskann", + Metric: metric, + IndexName: fmt.Sprintf("vector_diskann_%s", metricLower), + Options: bson.D{ + {"kind", "vector-diskann"}, + {"dimensions", dimensions}, + {"similarity", metric}, + {"maxDegree", 32}, + {"lBuild", 50}, + }, + }) + } + + return specs +} + +// createNamedVectorIndex creates a single named vector index +func createNamedVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, spec indexSpec) error { + indexCommand := bson.D{ + {"createIndexes", collection.Name()}, + {"indexes", []bson.D{ + { + {"name", spec.IndexName}, + {"key", bson.D{ + {vectorField, "cosmosSearch"}, + }}, + {"cosmosSearchOptions", spec.Options}, + }, + }}, + } + + var result bson.M + err := collection.Database().RunCommand(ctx, indexCommand).Decode(&result) + if err != nil { + if strings.Contains(err.Error(), "already exists") || strings.Contains(err.Error(), "IndexAlreadyExists") { + return nil + } + return err + } + 
return nil +} + +// vectorSearchSimple performs a vector search using the active vector index +func vectorSearchSimple(ctx context.Context, collection *mongo.Collection, embedding []float64, vectorField string, topK int) ([]SearchResult, error) { + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": embedding, + "path": vectorField, + "k": topK, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, err + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, err + } + + return results, nil +} + +// printComparisonTable outputs a formatted table of results +func printComparisonTable(results []CompareResult, verbose bool) { + fmt.Println(strings.Repeat("=", 70)) + fmt.Println(" COMPARISON RESULTS") + fmt.Println(strings.Repeat("=", 70)) + + w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', tabwriter.AlignRight) + fmt.Fprintf(w, "ALGORITHM\tMETRIC\tLATENCY\tTOP SCORE\tRESULTS\tSTATUS\t\n") + fmt.Fprintf(w, "---------\t------\t-------\t---------\t-------\t------\t\n") + + for _, r := range results { + status := "OK" + scoreStr := fmt.Sprintf("%.4f", r.TopScore) + resultCount := fmt.Sprintf("%d", len(r.Results)) + + if r.Error != nil { + status = "ERROR" + scoreStr = "-" + resultCount = "-" + } + + fmt.Fprintf(w, "%s\t%s\t%v\t%s\t%s\t%s\t\n", + r.Algorithm, + r.Metric, + r.Latency.Round(time.Millisecond), + scoreStr, + resultCount, + status, + ) + } + w.Flush() + + // Print verbose details if requested + if verbose { + fmt.Println() + for _, r := range results { + if r.Error != nil { + fmt.Printf("\n[%s] Error: %v\n", r.IndexName, r.Error) + continue + } + if len(r.Results) > 
0 { + fmt.Printf("\n[%s] Top results:\n", r.IndexName) + for i, res := range r.Results { + doc := res.Document.(bson.D) + var hotelName string + for _, elem := range doc { + if elem.Key == "HotelName" { + hotelName = fmt.Sprintf("%v", elem.Value) + break + } + } + fmt.Printf(" %d. %s (score: %.4f)\n", i+1, hotelName, res.Score) + } + } + } + } + + // Summary + fmt.Println() + var fastest CompareResult + for _, r := range results { + if r.Error == nil && (fastest.Latency == 0 || r.Latency < fastest.Latency) { + fastest = r + } + } + if fastest.Latency > 0 { + fmt.Printf("⚡ Fastest: %s/%s (%v)\n", fastest.Algorithm, fastest.Metric, fastest.Latency.Round(time.Millisecond)) + } + + var highestScore CompareResult + for _, r := range results { + if r.Error == nil && r.TopScore > highestScore.TopScore { + highestScore = r + } + } + if highestScore.TopScore > 0 { + fmt.Printf("🎯 Highest score: %s/%s (%.4f)\n", highestScore.Algorithm, highestScore.Metric, highestScore.TopScore) + } +} diff --git a/ai/select-algorithm-go/src/diskann.go b/ai/select-algorithm-go/src/diskann.go new file mode 100644 index 0000000..c83ed15 --- /dev/null +++ b/ai/select-algorithm-go/src/diskann.go @@ -0,0 +1,116 @@ +package main + +import ( + "context" + "fmt" + "log" + "time" + + "go.mongodb.org/mongo-driver/bson" +) + +func main() { + fmt.Println("Starting DiskANN vector search demonstration...") + + ctx := context.Background() + config := LoadConfig() + + fmt.Println("\nInitializing clients with passwordless authentication...") + mongoClient, azureOpenAIClient, err := GetClientsPasswordless() + if err != nil { + log.Fatalf("Failed to initialize clients: %v", err) + } + defer mongoClient.Disconnect(ctx) + + database := mongoClient.Database(config.DatabaseName) + collection := database.Collection("hotels_diskann") + + // Drop collection if exists + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_diskann"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", 
err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_diskann'") + } + + defer func() { + fmt.Println("\nCleanup: dropping collection 'hotels_diskann'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: collection dropped") + } + }() + + // Load data + fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + log.Fatalf("Failed to load data: %v", err) + } + + var documentsWithEmbeddings []map[string]interface{} + for _, doc := range data { + if _, exists := doc[config.VectorField]; exists { + documentsWithEmbeddings = append(documentsWithEmbeddings, doc) + } + } + + if len(documentsWithEmbeddings) == 0 { + log.Fatalf("No documents found with embeddings in field '%s'", config.VectorField) + } + + fmt.Printf("Loaded %d documents\n", len(documentsWithEmbeddings)) + + // Insert data + fmt.Println("\nInserting data...") + stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + log.Fatalf("Failed to insert data: %v", err) + } + + if stats.Inserted == 0 { + log.Fatalf("No documents were inserted successfully") + } + + fmt.Printf("Inserted %d documents\n", stats.Inserted) + + // Create DiskANN index + indexName := "vector_diskann_cos" + fmt.Printf("\nCreating %s index...\n", indexName) + err = CreateVectorIndex(ctx, collection, indexName, config.VectorField, "diskann", "COS", config.Dimensions) + if err != nil { + log.Fatalf("Failed to create DiskANN vector index: %v", err) + } + + fmt.Println("Waiting for index to build...") + time.Sleep(2 * time.Second) + + // Perform search + query := getEnvOrDefault("QUERY_TEXT", "luxury hotel near the beach") + fmt.Printf("\nSearching for: '%s'\n", query) + + queryEmbedding, err := 
GenerateEmbedding(ctx, azureOpenAIClient, query, config.ModelName) + if err != nil { + log.Fatalf("Failed to generate embedding: %v", err) + } + + results, err := PerformVectorSearch(ctx, collection, queryEmbedding, config.VectorField, 5) + if err != nil { + log.Fatalf("Failed to perform vector search: %v", err) + } + + // Display results + fmt.Println("\nSearch Results:") + fmt.Println("===============") + for i, result := range results { + hotelName := GetHotelName(result) + fmt.Printf("%d. %s (Score: %.4f)\n", i+1, hotelName, result.Score) + } + + fmt.Println("\n✓ DiskANN demonstration completed successfully!") +} diff --git a/ai/select-algorithm-go/src/hnsw.go b/ai/select-algorithm-go/src/hnsw.go new file mode 100644 index 0000000..727529e --- /dev/null +++ b/ai/select-algorithm-go/src/hnsw.go @@ -0,0 +1,116 @@ +package main + +import ( + "context" + "fmt" + "log" + "time" + + "go.mongodb.org/mongo-driver/bson" +) + +func main() { + fmt.Println("Starting HNSW vector search demonstration...") + + ctx := context.Background() + config := LoadConfig() + + fmt.Println("\nInitializing clients with passwordless authentication...") + mongoClient, azureOpenAIClient, err := GetClientsPasswordless() + if err != nil { + log.Fatalf("Failed to initialize clients: %v", err) + } + defer mongoClient.Disconnect(ctx) + + database := mongoClient.Database(config.DatabaseName) + collection := database.Collection("hotels_hnsw") + + // Drop collection if exists + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_hnsw"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_hnsw'") + } + + defer func() { + fmt.Println("\nCleanup: dropping collection 'hotels_hnsw'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", 
dropErr) + } else { + fmt.Println("Cleanup: collection dropped") + } + }() + + // Load data + fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + log.Fatalf("Failed to load data: %v", err) + } + + var documentsWithEmbeddings []map[string]interface{} + for _, doc := range data { + if _, exists := doc[config.VectorField]; exists { + documentsWithEmbeddings = append(documentsWithEmbeddings, doc) + } + } + + if len(documentsWithEmbeddings) == 0 { + log.Fatalf("No documents found with embeddings in field '%s'", config.VectorField) + } + + fmt.Printf("Loaded %d documents\n", len(documentsWithEmbeddings)) + + // Insert data + fmt.Println("\nInserting data...") + stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + log.Fatalf("Failed to insert data: %v", err) + } + + if stats.Inserted == 0 { + log.Fatalf("No documents were inserted successfully") + } + + fmt.Printf("Inserted %d documents\n", stats.Inserted) + + // Create HNSW index + indexName := "vector_hnsw_cos" + fmt.Printf("\nCreating %s index...\n", indexName) + err = CreateVectorIndex(ctx, collection, indexName, config.VectorField, "hnsw", "COS", config.Dimensions) + if err != nil { + log.Fatalf("Failed to create HNSW vector index: %v", err) + } + + fmt.Println("Waiting for index to build...") + time.Sleep(2 * time.Second) + + // Perform search + query := getEnvOrDefault("QUERY_TEXT", "luxury hotel near the beach") + fmt.Printf("\nSearching for: '%s'\n", query) + + queryEmbedding, err := GenerateEmbedding(ctx, azureOpenAIClient, query, config.ModelName) + if err != nil { + log.Fatalf("Failed to generate embedding: %v", err) + } + + results, err := PerformVectorSearch(ctx, collection, queryEmbedding, config.VectorField, 5) + if err != nil { + log.Fatalf("Failed to perform vector search: %v", err) + } + + // Display results + fmt.Println("\nSearch Results:") + fmt.Println("===============") 
+ for i, result := range results { + hotelName := GetHotelName(result) + fmt.Printf("%d. %s (Score: %.4f)\n", i+1, hotelName, result.Score) + } + + fmt.Println("\n✓ HNSW demonstration completed successfully!") +} diff --git a/ai/select-algorithm-go/src/ivf.go b/ai/select-algorithm-go/src/ivf.go new file mode 100644 index 0000000..8f89f28 --- /dev/null +++ b/ai/select-algorithm-go/src/ivf.go @@ -0,0 +1,116 @@ +package main + +import ( + "context" + "fmt" + "log" + "time" + + "go.mongodb.org/mongo-driver/bson" +) + +func main() { + fmt.Println("Starting IVF vector search demonstration...") + + ctx := context.Background() + config := LoadConfig() + + fmt.Println("\nInitializing clients with passwordless authentication...") + mongoClient, azureOpenAIClient, err := GetClientsPasswordless() + if err != nil { + log.Fatalf("Failed to initialize clients: %v", err) + } + defer mongoClient.Disconnect(ctx) + + database := mongoClient.Database(config.DatabaseName) + collection := database.Collection("hotels_ivf") + + // Drop collection if exists + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_ivf"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_ivf'") + } + + defer func() { + fmt.Println("\nCleanup: dropping collection 'hotels_ivf'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: collection dropped") + } + }() + + // Load data + fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + log.Fatalf("Failed to load data: %v", err) + } + + var documentsWithEmbeddings []map[string]interface{} + for _, doc := range data { + if _, exists := doc[config.VectorField]; exists { + 
documentsWithEmbeddings = append(documentsWithEmbeddings, doc) + } + } + + if len(documentsWithEmbeddings) == 0 { + log.Fatalf("No documents found with embeddings in field '%s'", config.VectorField) + } + + fmt.Printf("Loaded %d documents\n", len(documentsWithEmbeddings)) + + // Insert data + fmt.Println("\nInserting data...") + stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + log.Fatalf("Failed to insert data: %v", err) + } + + if stats.Inserted == 0 { + log.Fatalf("No documents were inserted successfully") + } + + fmt.Printf("Inserted %d documents\n", stats.Inserted) + + // Create IVF index + indexName := "vector_ivf_cos" + fmt.Printf("\nCreating %s index...\n", indexName) + err = CreateVectorIndex(ctx, collection, indexName, config.VectorField, "ivf", "COS", config.Dimensions) + if err != nil { + log.Fatalf("Failed to create IVF vector index: %v", err) + } + + fmt.Println("Waiting for index to build...") + time.Sleep(3 * time.Second) + + // Perform search + query := getEnvOrDefault("QUERY_TEXT", "luxury hotel near the beach") + fmt.Printf("\nSearching for: '%s'\n", query) + + queryEmbedding, err := GenerateEmbedding(ctx, azureOpenAIClient, query, config.ModelName) + if err != nil { + log.Fatalf("Failed to generate embedding: %v", err) + } + + results, err := PerformVectorSearch(ctx, collection, queryEmbedding, config.VectorField, 5) + if err != nil { + log.Fatalf("Failed to perform vector search: %v", err) + } + + // Display results + fmt.Println("\nSearch Results:") + fmt.Println("===============") + for i, result := range results { + hotelName := GetHotelName(result) + fmt.Printf("%d. 
%s (Score: %.4f)\n", i+1, hotelName, result.Score) + } + + fmt.Println("\n✓ IVF demonstration completed successfully!") +} diff --git a/ai/select-algorithm-go/src/utils.go b/ai/select-algorithm-go/src/utils.go new file mode 100644 index 0000000..6e6a8d4 --- /dev/null +++ b/ai/select-algorithm-go/src/utils.go @@ -0,0 +1,395 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os" + "strconv" + "strings" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/joho/godotenv" + "github.com/openai/openai-go/v3" + "github.com/openai/openai-go/v3/azure" + "github.com/openai/openai-go/v3/option" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" + "go.mongodb.org/mongo-driver/mongo/options" +) + +// Config holds the application configuration +type Config struct { + ClusterName string + DatabaseName string + DataFile string + VectorField string + ModelName string + Dimensions int + BatchSize int + Similarity string + Algorithm string +} + +// SearchResult represents a search result document +type SearchResult struct { + Document interface{} `bson:"document"` + Score float64 `bson:"score"` +} + +// InsertStats holds statistics about data insertion +type InsertStats struct { + Total int `json:"total"` + Inserted int `json:"inserted"` + Failed int `json:"failed"` +} + +// LoadConfig loads configuration from environment variables +func LoadConfig() *Config { + // Load environment variables from .env file + // For production use, prefer Azure Key Vault or similar secret management + // services instead of .env files. For development/demo purposes only. 
+ err := godotenv.Load() + if err != nil { + log.Printf("Warning: Error loading .env file: %v", err) + } + + dimensions, _ := strconv.Atoi(getEnvOrDefault("EMBEDDING_DIMENSIONS", "1536")) + batchSize, _ := strconv.Atoi(getEnvOrDefault("LOAD_SIZE_BATCH", "100")) + + return &Config{ + ClusterName: getEnvOrDefault("MONGO_CLUSTER_NAME", ""), + DatabaseName: getEnvOrDefault("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"), + DataFile: getEnvOrDefault("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"), + VectorField: getEnvOrDefault("EMBEDDED_FIELD", "contentVector"), + ModelName: getEnvOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"), + Dimensions: dimensions, + BatchSize: batchSize, + Similarity: getEnvOrDefault("SIMILARITY", "COS"), + Algorithm: strings.ToLower(getEnvOrDefault("ALGORITHM", "all")), + } +} + +// getEnvOrDefault returns environment variable value or default if not set +func getEnvOrDefault(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} + +// GetClientsPasswordless creates MongoDB and Azure OpenAI clients with passwordless authentication +func GetClientsPasswordless(ctx context.Context, config *Config) (*mongo.Client, openai.Client, error) { + if config.ClusterName == "" { + return nil, openai.Client{}, fmt.Errorf("MONGO_CLUSTER_NAME environment variable is required") + } + + // Create Azure credential + credential, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return nil, openai.Client{}, fmt.Errorf("failed to create Azure credential: %v", err) + } + + // Connect to DocumentDB with OIDC authentication + mongoURI := fmt.Sprintf("mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", config.ClusterName) + + fmt.Println("Attempting OIDC authentication...") + mongoClient, err := connectWithOIDC(ctx, mongoURI, credential) + if err != nil { + return nil, openai.Client{}, fmt.Errorf("OIDC authentication failed: %v", err) + } + 
fmt.Println("OIDC authentication successful!") + + // Get Azure OpenAI endpoint + azureOpenAIEndpoint := os.Getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if azureOpenAIEndpoint == "" { + return nil, openai.Client{}, fmt.Errorf("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required") + } + + // Create Azure OpenAI client with credential-based authentication + openAIClient := openai.NewClient( + option.WithBaseURL(fmt.Sprintf("%s/openai/v1", azureOpenAIEndpoint)), + azure.WithTokenCredential(credential)) + + return mongoClient, openAIClient, nil +} + +// connectWithOIDC attempts to connect using OIDC authentication +func connectWithOIDC(ctx context.Context, mongoURI string, credential *azidentity.DefaultAzureCredential) (*mongo.Client, error) { + oidcCallback := func(ctx context.Context, args *options.OIDCArgs) (*options.OIDCCredential, error) { + scope := "https://ossrdbms-aad.database.windows.net/.default" + fmt.Printf("Getting token with scope: %s\n", scope) + token, err := credential.GetToken(ctx, policy.TokenRequestOptions{ + Scopes: []string{scope}, + }) + if err != nil { + return nil, fmt.Errorf("failed to get token with scope %s: %v", scope, err) + } + + fmt.Printf("Successfully obtained token\n") + + return &options.OIDCCredential{ + AccessToken: token.Token, + }, nil + } + + clientOptions := options.Client(). + ApplyURI(mongoURI). + SetConnectTimeout(30 * time.Second). + SetServerSelectionTimeout(30 * time.Second). + SetRetryWrites(true). 
+ SetAuth(options.Credential{ + AuthMechanism: "MONGODB-OIDC", + AuthMechanismProperties: map[string]string{ + "TOKEN_RESOURCE": "https://ossrdbms-aad.database.windows.net", + }, + OIDCMachineCallback: oidcCallback, + }) + + mongoClient, err := mongo.Connect(ctx, clientOptions) + if err != nil { + return nil, err + } + + return mongoClient, nil +} + +// ReadFileReturnJSON reads a JSON file and returns the data as a slice of maps +func ReadFileReturnJSON(filePath string) ([]map[string]interface{}, error) { + file, err := os.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("error reading file '%s': %v", filePath, err) + } + + var data []map[string]interface{} + err = json.Unmarshal(file, &data) + if err != nil { + return nil, fmt.Errorf("error parsing JSON in file '%s': %v", filePath, err) + } + + return data, nil +} + +// InsertData inserts data into a MongoDB collection in batches +func InsertData(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) { + totalDocuments := len(data) + insertedCount := 0 + failedCount := 0 + + fmt.Printf("Starting batch insertion of %d documents...\n", totalDocuments) + + for i := 0; i < totalDocuments; i += batchSize { + end := i + batchSize + if end > totalDocuments { + end = totalDocuments + } + + batch := data[i:end] + batchNum := (i / batchSize) + 1 + + documents := make([]interface{}, len(batch)) + for j, doc := range batch { + documents[j] = doc + } + + result, err := collection.InsertMany(ctx, documents, options.InsertMany().SetOrdered(false)) + if err != nil { + if bulkErr, ok := err.(mongo.BulkWriteException); ok { + errorCount := len(bulkErr.WriteErrors) + insertedCount += len(batch) - errorCount + failedCount += errorCount + fmt.Printf("Batch %d had errors: %d inserted, %d failed\n", batchNum, len(batch)-errorCount, errorCount) + for _, writeErr := range bulkErr.WriteErrors { + fmt.Printf(" Error: %s\n", writeErr.Message) + } + } else { + 
failedCount += len(batch) + fmt.Printf("Batch %d failed completely: %v\n", batchNum, err) + } + } else { + insertedCount += len(result.InsertedIDs) + fmt.Printf("Batch %d completed: %d documents inserted\n", batchNum, len(result.InsertedIDs)) + } + + time.Sleep(100 * time.Millisecond) + } + + return &InsertStats{ + Total: totalDocuments, + Inserted: insertedCount, + Failed: failedCount, + }, nil +} + +// DropVectorIndexes drops existing vector indexes on the specified field +func DropVectorIndexes(ctx context.Context, collection *mongo.Collection, vectorField string) error { + cursor, err := collection.Indexes().List(ctx) + if err != nil { + return fmt.Errorf("could not list indexes: %v", err) + } + defer cursor.Close(ctx) + + var vectorIndexes []string + for cursor.Next(ctx) { + var index bson.M + if err := cursor.Decode(&index); err != nil { + continue + } + + if key, ok := index["key"].(bson.M); ok { + if indexType, exists := key[vectorField]; exists && indexType == "cosmosSearch" { + if name, ok := index["name"].(string); ok { + vectorIndexes = append(vectorIndexes, name) + } + } + } + } + + for _, indexName := range vectorIndexes { + fmt.Printf("Dropping existing vector index: %s\n", indexName) + _, err := collection.Indexes().DropOne(ctx, indexName) + if err != nil { + fmt.Printf("Warning: Could not drop index %s: %v\n", indexName, err) + } + } + + if len(vectorIndexes) > 0 { + fmt.Printf("Dropped %d existing vector index(es)\n", len(vectorIndexes)) + } else { + fmt.Println("No existing vector indexes found to drop") + } + + return nil +} + +// PerformVectorSearch performs a vector search using the cosmosSearch aggregation pipeline +func PerformVectorSearch(ctx context.Context, collection *mongo.Collection, client openai.Client, query, vectorField, model string, topK int) ([]SearchResult, error) { + fmt.Printf("Performing vector search for: '%s'\n", query) + + queryEmbedding, err := GenerateEmbedding(ctx, client, query, model) + if err != nil { + return nil, 
fmt.Errorf("error generating embedding: %v", err) + } + + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": queryEmbedding, + "path": vectorField, + "k": topK, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, fmt.Errorf("error performing vector search: %v", err) + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + fmt.Printf("Warning: Could not decode result: %v\n", err) + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, fmt.Errorf("cursor error: %v", err) + } + + return results, nil +} + +// GenerateEmbedding generates an embedding for the given text using Azure OpenAI +func GenerateEmbedding(ctx context.Context, client openai.Client, text, modelName string) ([]float64, error) { + resp, err := client.Embeddings.New(ctx, openai.EmbeddingNewParams{ + Input: openai.EmbeddingNewParamsInputUnion{ + OfString: openai.String(text), + }, + Model: modelName, + }) + if err != nil { + return nil, fmt.Errorf("failed to generate embedding: %v", err) + } + + if len(resp.Data) == 0 { + return nil, fmt.Errorf("no embedding data received") + } + + embedding := make([]float64, len(resp.Data[0].Embedding)) + for i, v := range resp.Data[0].Embedding { + embedding[i] = float64(v) + } + + return embedding, nil +} + +// PrintSearchResults prints search results in a formatted way +func PrintSearchResults(results []SearchResult, algorithm string) { + if len(results) == 0 { + fmt.Println("No search results found.") + return + } + + fmt.Printf("\n%s Search Results (top %d):\n", strings.ToUpper(algorithm), len(results)) + fmt.Println(strings.Repeat("=", 80)) + + for i, result := range results { + doc := result.Document.(bson.D) + var 
hotelName string + for _, elem := range doc { + if elem.Key == "HotelName" { + hotelName = fmt.Sprintf("%v", elem.Value) + break + } + } + + fmt.Printf("%d. HotelName: %s, Score: %.4f\n", i+1, hotelName, result.Score) + } +} + +// FilterDocumentsWithEmbeddings returns only documents that contain the vector field +func FilterDocumentsWithEmbeddings(data []map[string]interface{}, vectorField string) []map[string]interface{} { + var filtered []map[string]interface{} + for _, doc := range data { + if _, exists := doc[vectorField]; exists { + filtered = append(filtered, doc) + } + } + return filtered +} + +// PrepareCollection clears existing data and inserts new documents +func PrepareCollection(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) { + fmt.Printf("Preparing collection '%s'...\n", collection.Name()) + + deleteResult, err := collection.DeleteMany(ctx, bson.M{}) + if err != nil { + return nil, fmt.Errorf("failed to clear existing data: %v", err) + } + if deleteResult.DeletedCount > 0 { + fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) + } + + stats, err := InsertData(ctx, collection, data, batchSize) + if err != nil { + return nil, fmt.Errorf("failed to insert data: %v", err) + } + + return stats, nil +} diff --git a/ai/select-algorithm-java/.env.example b/ai/select-algorithm-java/.env.example new file mode 100644 index 0000000..3e6b531 --- /dev/null +++ b/ai/select-algorithm-java/.env.example @@ -0,0 +1,26 @@ +# Azure DocumentDB cluster name (find in Azure Portal > DocumentDB > Overview) +MONGO_CLUSTER_NAME=your-cluster-name + +# Azure OpenAI embedding endpoint(find in Azure Portal > Azure OpenAI > Keys and Endpoint) +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + +# Azure OpenAI embedding model deployment name +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + +# Path to pre-computed vectors JSON file 
+DATA_FILE_WITH_VECTORS=./data/Hotels_Vector.json + +# Database name (default: Hotels) +AZURE_DOCUMENTDB_DATABASENAME=Hotels + +# Field name containing embeddings in the data file +EMBEDDED_FIELD=DescriptionVector + +# Embedding dimensions (default: 1536) +EMBEDDING_DIMENSIONS=1536 + +# Algorithm to test: all, diskann, hnsw, ivf (default: all) +ALGORITHM=all + +# Similarity to test: COS, L2, IP (default: COS) +SIMILARITY=COS diff --git a/ai/select-algorithm-java/.gitignore b/ai/select-algorithm-java/.gitignore new file mode 100644 index 0000000..9ae5e73 --- /dev/null +++ b/ai/select-algorithm-java/.gitignore @@ -0,0 +1,7 @@ +target/ +.env +*.class + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-java/README.md b/ai/select-algorithm-java/README.md new file mode 100644 index 0000000..69d735b --- /dev/null +++ b/ai/select-algorithm-java/README.md @@ -0,0 +1,184 @@ +# Select Algorithm - Java + +This sample demonstrates how to create and use different vector search index algorithms (IVF, HNSW, DiskANN) with Azure DocumentDB using the MongoDB Java driver. + +## Prerequisites + +- Java 17 or later +- Maven 3.8+ +- Azure DocumentDB cluster with vector search enabled +- Azure OpenAI resource with an embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. ### Configure environment variables + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + +2. 
Update `.env` with your Azure resource details (if not using `azd`): + - `MONGO_CLUSTER_NAME` — your DocumentDB cluster name + - `AZURE_OPENAI_EMBEDDING_ENDPOINT` — your Azure OpenAI endpoint + - `AZURE_OPENAI_EMBEDDING_MODEL` — deployment name (e.g., `text-embedding-3-small`) + - `DATA_FILE_WITH_VECTORS` — path to the pre-computed vectors JSON file + +3. Copy the data file: + + Copy `Hotels_Vector.json` from the repository's `ai/data/` folder into this project's `data/` folder: + + ```bash + cp ../../data/Hotels_Vector.json ./data/ + ``` + +## Build + +```bash +mvn clean compile +``` + +## Run + +### Run Individual Algorithms + +Run a specific algorithm with its own collection and index: + +```bash +# IVF (Inverted File) - best for large datasets with batch queries +mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.IVF" + +# HNSW (Hierarchical Navigable Small World) - best for low-latency, high-recall searches +mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.HNSW" + +# DiskANN - best for very large datasets (50K+ docs), requires M40+ cluster +mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.DiskANN" +``` + +Each individual algorithm demo: +- Creates its own collection (`hotels_ivf`, `hotels_hnsw`, `hotels_diskann`) +- Inserts the hotel data +- Creates a single vector index +- Runs one search query +- Cleans up (drops collection) at the end + +### Run Comparison Mode + +Compare all 9 algorithm × similarity combinations: + +## Algorithms + +| Algorithm | Description | Best For | +|-----------|-------------|----------| +| **IVF** | Inverted File index — partitions vectors into clusters | Large datasets with batch queries | +| **HNSW** | Hierarchical Navigable Small World graph | Low-latency, high-recall searches | +| **DiskANN** | Disk-based Approximate Nearest Neighbor | Very large datasets that exceed memory | + +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| 
+| `ALGORITHM` | `all` | Which algorithm to run: `ivf`, `hnsw`, `diskann`, `all` | +| `SIMILARITY` | `COS` | Similarity metric: `COS`, `L2`, `IP` | +| `EMBEDDING_DIMENSIONS` | `1536` | Vector dimensions | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Target database name | +| `EMBEDDED_FIELD` | `contentVector` | Field name containing embeddings | + +## Authentication + +This sample uses **passwordless authentication** via `DefaultAzureCredential`: + +- **DocumentDB**: OIDC mechanism with Azure identity +- **Azure OpenAI**: Entra ID token-based auth + +Ensure your identity has the appropriate RBAC roles assigned on both resources. + +## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and print a formatted comparison table: + +```bash +mvn exec:java -Pcompare +``` + +Or via the `ALGORITHM` environment variable: + +```bash +ALGORITHM=compare mvn exec:java +``` + +On Windows (PowerShell): + +```powershell +$env:ALGORITHM="compare"; mvn exec:java +``` + +### Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `3` | Number of results per search | +| `VERBOSE` | `false` | Print detailed per-index results | + +### What It Does + +1. Connects to DocumentDB and loads hotel data into a single `hotels` collection +2. Generates one embedding for the query text (reused for all searches) +3. For each of the 9 algorithm/metric combinations: creates the index → searches → drops the index +4. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially +5. 
Prints a formatted comparison table with scores, top results, and key insights + +### Output + +``` +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +Algorithm Similarity #1 Result #1 Score #2 Result #2 Score Diff +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. 
+==================================================================================================== +``` + +### Index Parameters + +| Algorithm | Kind | Parameters | +|-----------|------|------------| +| IVF | `vector-ivf` | numLists=1 | +| HNSW | `vector-hnsw` | m=16, efConstruction=64 | +| DiskANN | `vector-diskann` | maxDegree=32, lBuild=50 | + +## Project Structure + +``` +src/main/java/com/azure/documentdb/selectalgorithm/ +├── Main.java — Entry point, runs CompareAll +├── Utils.java — Shared helpers (connection, embedding, data loading) +├── IVF.java — IVF index demo (single algorithm) +├── HNSW.java — HNSW index demo (single algorithm) +├── DiskANN.java — DiskANN index demo (single algorithm) +└── CompareAll.java — Unified comparison runner (all 9 combinations) +``` diff --git a/ai/select-algorithm-java/data/README.md b/ai/select-algorithm-java/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-java/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-java/output/compare_all.txt b/ai/select-algorithm-java/output/compare_all.txt new file mode 100644 index 0000000..9a6b312 --- /dev/null +++ b/ai/select-algorithm-java/output/compare_all.txt @@ -0,0 +1,52 @@ +============================================== + Azure DocumentDB - Compare All Algorithms +============================================== + Query: "luxury hotel near the beach" + Top K: 3 + Metrics: COS, L2, IP + Algos: IVF, HNSW, DiskANN + + Loading data from: ../data/Hotels_Vector.json + Loaded 50 documents + Collection reset. + Inserting 50 documents in batches of 100... + Inserted batch 1-50 + Data insertion complete. + + Generating embedding for: "luxury hotel near the beach" + Embedding generated (1536 dimensions) + + Running 9 algorithm x metric combinations... 
+ + vector_ivf_cos created + vector_ivf_l2 created + vector_ivf_ip created + vector_hnsw_cos created + vector_hnsw_l2 created + vector_hnsw_ip created + vector_diskann_cos created + vector_diskann_l2 created + vector_diskann_ip created + + Cleanup: dropping comparison collection... + Cleanup: dropped collection 'hotels' + + COMPARISON TABLE - All Algorithms x Metrics + + ALGO METRIC INDEX NAME LATENCY RESULTS TOP MATCH + IVF COS vector_ivf_cos 97.79 ms 3 Ocean Water Reso.. + IVF L2 vector_ivf_l2 72.51 ms 3 Ocean Water Reso.. + IVF IP vector_ivf_ip 71.64 ms 3 Ocean Water Reso.. + HNSW COS vector_hnsw_cos 73.44 ms 3 Ocean Water Reso.. + HNSW L2 vector_hnsw_l2 71.36 ms 3 Ocean Water Reso.. + HNSW IP vector_hnsw_ip 71.81 ms 3 Ocean Water Reso.. + DISKANN COS vector_diskann_cos 73.67 ms 3 Ocean Water Reso.. + DISKANN L2 vector_diskann_l2 73.81 ms 3 Ocean Water Reso.. + DISKANN IP vector_diskann_ip 72.19 ms 3 Ocean Water Reso.. + + Fastest: vector_hnsw_l2 ( 71.36 ms) + Slowest: 97.79 ms | Average: 75.36 ms | Top K: 3 + +============================================== + Comparison complete. 
+============================================== \ No newline at end of file diff --git a/ai/select-algorithm-java/pom.xml b/ai/select-algorithm-java/pom.xml new file mode 100644 index 0000000..2414631 --- /dev/null +++ b/ai/select-algorithm-java/pom.xml @@ -0,0 +1,83 @@ + + + 4.0.0 + + com.azure.documentdb + select-algorithm-java + 1.0.0 + jar + + DocumentDB Select Algorithm - Java + Demonstrates IVF, HNSW, and DiskANN vector search indexes with Azure DocumentDB + + + 17 + 17 + UTF-8 + + + + + org.mongodb + mongodb-driver-sync + 5.4.0 + + + com.azure + azure-identity + 1.16.0 + + + com.azure + azure-ai-openai + 1.0.0-beta.16 + + + io.github.cdimascio + dotenv-java + 3.1.0 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 17 + 17 + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.Main + + + + + + + + compare + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.CompareAll + + + + + + + diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java new file mode 100644 index 0000000..45e4261 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -0,0 +1,254 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.ArrayList; +import java.util.List; + +/** + * Unified comparison runner that executes all 9 combinations + * (3 algorithms x 3 similarity metrics) and prints a formatted table. 
+ */ +public class CompareAll { + + private static final String COLLECTION_NAME = "hotels"; + private static final String[] ALGORITHMS = {"ivf", "hnsw", "diskann"}; + private static final String[] METRICS = {"COS", "L2", "IP"}; + + public static void main(String[] args) { + run(); + } + + public static void run() { + String queryText = Utils.getEnv("QUERY_TEXT", "luxury hotel near the beach"); + int topK = Integer.parseInt(Utils.getEnv("TOP_K", "3")); + boolean verbose = Boolean.parseBoolean(Utils.getEnv("VERBOSE", "false")); + + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "../data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "contentVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.printf(" Query: \"%s\"%n", queryText); + System.out.printf(" Top K: %d%n", topK); + System.out.printf(" Metrics: COS, L2, IP%n"); + System.out.printf(" Algos: IVF, HNSW, DiskANN%n"); + System.out.println(); + + List results = new ArrayList<>(); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + // Load data ONCE into the single collection + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + collection.drop(); + System.out.println(" Collection reset."); + Utils.insertData(collection, data, 100); + + // Generate ONE embedding for the query (reused for all 
9 searches) + OpenAIClient aiClient = Utils.getOpenAIClient(); + System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); + List queryVector = Utils.getEmbedding(aiClient, queryText, model); + System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); + + // Convert to doubles for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + // Run 9 algorithm × metric combinations sequentially (create→search→drop) + // DocumentDB does not allow multiple vector indexes of the same kind + // on the same field path simultaneously. + System.out.println(" Running 9 algorithm × metric combinations...\n"); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + // 1. Drop all existing vector indexes + dropVectorIndexes(collection, vectorField); + + // 2. Create this specific index + createIndex(database, collection, vectorField, dimensions, algo, metric); + System.out.printf(" ✓ %s created%n", indexName); + + // 3. Wait for index to build + try { Thread.sleep(5000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } + + // 4. Search + long startNs = System.nanoTime(); + List searchResults = performSearch( + collection, vectorAsDoubles, vectorField, topK); + long elapsedNs = System.nanoTime() - startNs; + double elapsedMs = elapsedNs / 1_000_000.0; + + // 5. Extract top result info + String topHotel = "-"; + double topScore = 0.0; + if (!searchResults.isEmpty()) { + Document top = searchResults.get(0); + topHotel = top.getString("HotelName") != null + ? top.getString("HotelName") : "-"; + topScore = top.getDouble("score") != null + ? 
top.getDouble("score") : 0.0; + } + + results.add(new SearchResult( + algo.toUpperCase(), metric, indexName, + elapsedMs, searchResults.size(), topHotel, topScore)); + + if (verbose) { + System.out.printf(" [%s] %d results in %.2f ms%n", + indexName, searchResults.size(), elapsedMs); + for (int i = 0; i < searchResults.size(); i++) { + Document doc = searchResults.get(i); + System.out.printf(" %d. %s (%.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + } + } + } + } + + // Cleanup: drop the comparison collection + System.out.println("\n Cleanup: dropping comparison collection..."); + collection.drop(); + System.out.println(" Cleanup: dropped collection 'hotels'"); + } + + // Print comparison table + printComparisonTable(results, topK); + } + + private static void dropVectorIndexes(MongoCollection collection, String vectorField) { + for (Document idx : collection.listIndexes()) { + String name = idx.getString("name"); + Document key = idx.get("key", Document.class); + if (key != null && "cosmosSearch".equals(key.getString(vectorField))) { + try { + collection.dropIndex(name); + } catch (Exception e) { + // Ignore if index doesn't exist + } + } + } + } + + private static void createIndex(MongoDatabase database, MongoCollection collection, + String vectorField, int dimensions, + String algo, String metric) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + Document cosmosSearchOptions = new Document() + .append("dimensions", dimensions) + .append("similarity", metric); + + switch (algo) { + case "ivf" -> cosmosSearchOptions + .append("kind", "vector-ivf") + .append("numLists", 1); + case "hnsw" -> cosmosSearchOptions + .append("kind", "vector-hnsw") + .append("m", 16) + .append("efConstruction", 64); + case "diskann" -> cosmosSearchOptions + .append("kind", "vector-diskann") + .append("maxDegree", 32) + .append("lBuild", 50); + } + + Document indexDefinition = new Document() + .append("name", indexName) + 
.append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", cosmosSearchOptions); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + try { + database.runCommand(command); + } catch (Exception e) { + // Idempotent: ignore if index already exists + if (!e.getMessage().contains("already exists")) { + throw e; + } + } + } + + private static List performSearch(MongoCollection collection, + List vectorAsDoubles, + String vectorField, int topK) { + Document searchStage = new Document("$search", new Document("cosmosSearch", new Document() + .append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + return results; + } + + private static void printComparisonTable(List results, int topK) { + System.out.println(); + System.out.println(" ╔══════════════════════════════════════════════════════════════════════════════════╗"); + System.out.println(" ║ COMPARISON TABLE — All Algorithms × Metrics ║"); + System.out.println(" ╠══════════════════════════════════════════════════════════════════════════════════╣"); + System.out.printf(" ║ %-10s %-8s %-22s %10s %8s %-18s ║%n", + "ALGO", "METRIC", "INDEX NAME", "LATENCY", "RESULTS", "TOP MATCH"); + System.out.println(" ╠══════════════════════════════════════════════════════════════════════════════════╣"); + + for (SearchResult r : results) { + String topMatch = r.topHotel.length() > 16 + ? r.topHotel.substring(0, 16) + ".." 
+ : r.topHotel; + System.out.printf(" ║ %-10s %-8s %-22s %8.2f ms %5d %-18s ║%n", + r.algorithm, r.metric, r.indexName, + r.latencyMs, r.resultCount, topMatch); + } + + System.out.println(" ╠══════════════════════════════════════════════════════════════════════════════════╣"); + + // Summary stats + double fastest = results.stream().mapToDouble(r -> r.latencyMs).min().orElse(0); + double slowest = results.stream().mapToDouble(r -> r.latencyMs).max().orElse(0); + double avg = results.stream().mapToDouble(r -> r.latencyMs).average().orElse(0); + String fastestIdx = results.stream() + .filter(r -> r.latencyMs == fastest) + .findFirst().map(r -> r.indexName).orElse("-"); + + System.out.printf(" ║ Fastest: %-22s (%8.2f ms) ║%n", fastestIdx, fastest); + System.out.printf(" ║ Slowest: %8.2f ms | Average: %8.2f ms | Top K: %-3d ║%n", slowest, avg, topK); + System.out.println(" ╚══════════════════════════════════════════════════════════════════════════════════╝"); + System.out.println(); + } + + private record SearchResult( + String algorithm, + String metric, + String indexName, + double latencyMs, + int resultCount, + String topHotel, + double topScore) { + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskANN.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskANN.java new file mode 100644 index 0000000..1fc1430 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/DiskANN.java @@ -0,0 +1,113 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.ArrayList; +import java.util.List; + +/** + * DiskANN (Disk-based Approximate Nearest Neighbor) vector index demonstration. + * Best for: Very large datasets (50K+ documents) that exceed memory. 
+ * Requires M40+ cluster tier. + */ +public class DiskANN { + + private static final String COLLECTION_NAME = "hotels_diskann"; + private static final String INDEX_NAME = "vectorIndex_diskann"; + + public static void main(String[] args) { + run(); + } + + public static void run() { + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "./data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "DescriptionVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + String similarity = Utils.getEnv("SIMILARITY", "COS"); + String queryText = "luxury hotel near the beach"; + int topK = 5; + + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - DiskANN Vector Index"); + System.out.println("=============================================="); + System.out.printf(" Query: \"%s\"%n", queryText); + System.out.printf(" Similarity: %s%n", similarity); + System.out.printf(" Top K: %d%n", topK); + System.out.println(); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + try { + // Load data + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + // Drop collection if it exists + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println(" Dropped existing collection."); + } + + // Create collection + database.createCollection(COLLECTION_NAME); + collection = database.getCollection(COLLECTION_NAME); + System.out.printf(" Created collection: %s%n", COLLECTION_NAME); + 
+ // Insert data + Utils.insertData(collection, data, 100); + + // Create DiskANN vector index + System.out.println("\n Creating DiskANN vector index..."); + createDiskAnnIndex(database, collection, vectorField, dimensions, similarity); + System.out.printf(" Created index: %s%n", INDEX_NAME); + + // Perform vector search + OpenAIClient aiClient = Utils.getOpenAIClient(); + List results = Utils.performVectorSearch( + collection, aiClient, queryText, vectorField, model, topK); + + // Print results + Utils.printResults(results); + + } finally { + // Cleanup + System.out.println(" Cleanup: dropping collection..."); + collection.drop(); + System.out.println(" Cleanup complete."); + } + } + + System.out.println("=============================================="); + System.out.println(" DiskANN demo complete."); + System.out.println("=============================================="); + } + + private static void createDiskAnnIndex(MongoDatabase database, MongoCollection collection, + String vectorField, int dimensions, String similarity) { + Document cosmosSearchOptions = new Document() + .append("kind", "vector-diskann") + .append("dimensions", dimensions) + .append("similarity", similarity) + .append("maxDegree", 32) + .append("lBuild", 50); + + Document indexDefinition = new Document() + .append("name", INDEX_NAME) + .append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", cosmosSearchOptions); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + database.runCommand(command); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HNSW.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HNSW.java new file mode 100644 index 0000000..d29b4ed --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/HNSW.java @@ -0,0 +1,112 @@ +package 
com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.ArrayList; +import java.util.List; + +/** + * HNSW (Hierarchical Navigable Small World) vector index demonstration. + * Best for: Low-latency, high-recall searches with 10K-50K documents. + */ +public class HNSW { + + private static final String COLLECTION_NAME = "hotels_hnsw"; + private static final String INDEX_NAME = "vectorIndex_hnsw"; + + public static void main(String[] args) { + run(); + } + + public static void run() { + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "./data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "DescriptionVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + String similarity = Utils.getEnv("SIMILARITY", "COS"); + String queryText = "luxury hotel near the beach"; + int topK = 5; + + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - HNSW Vector Index"); + System.out.println("=============================================="); + System.out.printf(" Query: \"%s\"%n", queryText); + System.out.printf(" Similarity: %s%n", similarity); + System.out.printf(" Top K: %d%n", topK); + System.out.println(); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + try { + // Load data + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + 
// Drop collection if it exists + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println(" Dropped existing collection."); + } + + // Create collection + database.createCollection(COLLECTION_NAME); + collection = database.getCollection(COLLECTION_NAME); + System.out.printf(" Created collection: %s%n", COLLECTION_NAME); + + // Insert data + Utils.insertData(collection, data, 100); + + // Create HNSW vector index + System.out.println("\n Creating HNSW vector index..."); + createHnswIndex(database, collection, vectorField, dimensions, similarity); + System.out.printf(" Created index: %s%n", INDEX_NAME); + + // Perform vector search + OpenAIClient aiClient = Utils.getOpenAIClient(); + List results = Utils.performVectorSearch( + collection, aiClient, queryText, vectorField, model, topK); + + // Print results + Utils.printResults(results); + + } finally { + // Cleanup + System.out.println(" Cleanup: dropping collection..."); + collection.drop(); + System.out.println(" Cleanup complete."); + } + } + + System.out.println("=============================================="); + System.out.println(" HNSW demo complete."); + System.out.println("=============================================="); + } + + private static void createHnswIndex(MongoDatabase database, MongoCollection collection, + String vectorField, int dimensions, String similarity) { + Document cosmosSearchOptions = new Document() + .append("kind", "vector-hnsw") + .append("dimensions", dimensions) + .append("similarity", similarity) + .append("m", 16) + .append("efConstruction", 64); + + Document indexDefinition = new Document() + .append("name", INDEX_NAME) + .append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", cosmosSearchOptions); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + 
database.runCommand(command); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IVF.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IVF.java new file mode 100644 index 0000000..635cdd7 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/IVF.java @@ -0,0 +1,111 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.ArrayList; +import java.util.List; + +/** + * IVF (Inverted File) vector index demonstration. + * Best for: Large datasets with batch queries. + */ +public class IVF { + + private static final String COLLECTION_NAME = "hotels_ivf"; + private static final String INDEX_NAME = "vectorIndex_ivf"; + + public static void main(String[] args) { + run(); + } + + public static void run() { + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "./data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "DescriptionVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + String similarity = Utils.getEnv("SIMILARITY", "COS"); + String queryText = "luxury hotel near the beach"; + int topK = 5; + + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - IVF Vector Index"); + System.out.println("=============================================="); + System.out.printf(" Query: \"%s\"%n", queryText); + System.out.printf(" Similarity: %s%n", similarity); + System.out.printf(" Top K: %d%n", topK); + System.out.println(); + + try (MongoClient mongoClient = 
Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + try { + // Load data + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + // Drop collection if it exists + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println(" Dropped existing collection."); + } + + // Create collection + database.createCollection(COLLECTION_NAME); + collection = database.getCollection(COLLECTION_NAME); + System.out.printf(" Created collection: %s%n", COLLECTION_NAME); + + // Insert data + Utils.insertData(collection, data, 100); + + // Create IVF vector index + System.out.println("\n Creating IVF vector index..."); + createIvfIndex(database, collection, vectorField, dimensions, similarity); + System.out.printf(" Created index: %s%n", INDEX_NAME); + + // Perform vector search + OpenAIClient aiClient = Utils.getOpenAIClient(); + List results = Utils.performVectorSearch( + collection, aiClient, queryText, vectorField, model, topK); + + // Print results + Utils.printResults(results); + + } finally { + // Cleanup + System.out.println(" Cleanup: dropping collection..."); + collection.drop(); + System.out.println(" Cleanup complete."); + } + } + + System.out.println("=============================================="); + System.out.println(" IVF demo complete."); + System.out.println("=============================================="); + } + + private static void createIvfIndex(MongoDatabase database, MongoCollection collection, + String vectorField, int dimensions, String similarity) { + Document cosmosSearchOptions = new Document() + .append("kind", "vector-ivf") + .append("dimensions", dimensions) + .append("similarity", similarity) + .append("numLists", 1); + + Document indexDefinition = new 
Document() + .append("name", INDEX_NAME) + .append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", cosmosSearchOptions); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + database.runCommand(command); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java new file mode 100644 index 0000000..5a9d54c --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java @@ -0,0 +1,17 @@ +package com.azure.documentdb.selectalgorithm; + +public class Main { + + public static void main(String[] args) { + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.println(); + + CompareAll.run(); + + System.out.println("=============================================="); + System.out.println(" Comparison complete."); + System.out.println("=============================================="); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java new file mode 100644 index 0000000..c79102b --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java @@ -0,0 +1,195 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.azure.ai.openai.OpenAIClientBuilder; +import com.azure.ai.openai.models.EmbeddingItem; +import com.azure.ai.openai.models.EmbeddingsOptions; +import com.azure.core.credential.AccessToken; +import com.azure.identity.DefaultAzureCredential; +import 
com.azure.identity.DefaultAzureCredentialBuilder; +import com.mongodb.ConnectionString; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.model.InsertManyOptions; +import io.github.cdimascio.dotenv.Dotenv; +import org.bson.Document; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class Utils { + + private static final Dotenv dotenv = Dotenv.configure().ignoreIfMissing().load(); + + public static String getEnv(String key, String defaultValue) { + String value = dotenv.get(key); + if (value == null || value.isBlank()) { + value = System.getenv(key); + } + return (value != null && !value.isBlank()) ? value : defaultValue; + } + + public static String getEnv(String key) { + return getEnv(key, null); + } + + public static MongoClient getMongoClient() { + String clusterName = getEnv("MONGO_CLUSTER_NAME"); + if (clusterName == null) { + throw new IllegalStateException("MONGO_CLUSTER_NAME environment variable is required"); + } + + String connectionUri = String.format( + "mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", clusterName); + + // Use custom OIDC callback with DefaultAzureCredential + // This chains through CLI, managed identity, etc. 
+ DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build(); + String tokenResource = "https://ossrdbms-aad.database.windows.net/.default"; + + MongoCredential mongoCredential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", (MongoCredential.OidcCallback) context -> { + AccessToken token = credential.getToken( + new com.azure.core.credential.TokenRequestContext() + .addScopes(tokenResource)).block(); + return new MongoCredential.OidcCallbackResult(token.getToken()); + }); + + MongoClientSettings settings = MongoClientSettings.builder() + .applyConnectionString(new ConnectionString(connectionUri)) + .credential(mongoCredential) + .build(); + + return MongoClients.create(settings); + } + + public static OpenAIClient getOpenAIClient() { + String endpoint = getEnv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + if (endpoint == null) { + throw new IllegalStateException("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required"); + } + + DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build(); + + return new OpenAIClientBuilder() + .endpoint(endpoint) + .credential(credential) + .buildClient(); + } + + public static List readJsonFile(String path) { + try { + String content = Files.readString(Path.of(path)); + // Parse JSON array of documents + @SuppressWarnings("unchecked") + List docs = Document.parse("{\"data\":" + content + "}").getList("data", Document.class); + return docs; + } catch (IOException e) { + throw new RuntimeException("Failed to read data file: " + path, e); + } + } + + public static void insertData(MongoCollection collection, List data, int batchSize) { + System.out.printf(" Inserting %d documents in batches of %d...%n", data.size(), batchSize); + InsertManyOptions options = new InsertManyOptions().ordered(false); + + for (int i = 0; i < data.size(); i += batchSize) { + List batch = data.subList(i, Math.min(i + batchSize, data.size())); + // Remove _id to avoid duplicate 
key errors on re-run + List cleaned = new ArrayList<>(); + for (Document doc : batch) { + Document copy = new Document(doc); + copy.remove("_id"); + cleaned.add(copy); + } + try { + collection.insertMany(cleaned, options); + } catch (Exception e) { + // Ignore duplicate key errors on re-insert + if (!e.getMessage().contains("duplicate key")) { + throw e; + } + } + System.out.printf(" Inserted batch %d-%d%n", i + 1, Math.min(i + batchSize, data.size())); + } + System.out.println(" Data insertion complete."); + } + + public static void dropVectorIndexes(MongoCollection collection, String vectorField) { + try { + for (Document idx : collection.listIndexes()) { + String name = idx.getString("name"); + if (name != null && name.contains(vectorField) && !name.equals("_id_")) { + System.out.printf(" Dropping existing index: %s%n", name); + collection.dropIndex(name); + } + } + } catch (Exception e) { + // Ignore errors when indexes don't exist + System.out.println(" No existing vector indexes to drop."); + } + } + + public static List getEmbedding(OpenAIClient client, String text, String model) { + EmbeddingsOptions options = new EmbeddingsOptions(List.of(text)); + List embeddings = client.getEmbeddings(model, options).getData(); + if (embeddings.isEmpty()) { + throw new RuntimeException("No embedding returned for query text"); + } + return embeddings.get(0).getEmbedding(); + } + + public static List performVectorSearch( + MongoCollection collection, + OpenAIClient aiClient, + String query, + String vectorField, + String model, + int topK) { + + System.out.printf(" Generating embedding for query: \"%s\"%n", query); + List queryVector = getEmbedding(aiClient, query, model); + System.out.printf(" Embedding generated (%d dimensions)%n", queryVector.size()); + + // Convert List to List for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + Document searchStage = new Document("$search", new Document("cosmosSearch", new Document() + 
.append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + + return results; + } + + public static void printResults(List results) { + System.out.println("\n === Search Results ==="); + for (int i = 0; i < results.size(); i++) { + Document doc = results.get(i); + System.out.printf(" %d. %s (score: %.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + System.out.printf(" %s%n", doc.getString("Description")); + } + System.out.println(); + } +} diff --git a/ai/select-algorithm-python/.env.example b/ai/select-algorithm-python/.env.example new file mode 100644 index 0000000..a0164f0 --- /dev/null +++ b/ai/select-algorithm-python/.env.example @@ -0,0 +1,29 @@ +# Azure DocumentDB cluster name (find in Azure Portal > DocumentDB > Overview) +MONGO_CLUSTER_NAME=your-cluster-name + +# Azure OpenAI embedding endpoint (find in Azure Portal > Azure OpenAI > Keys and Endpoint) +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + +# Azure OpenAI embedding model deployment name +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + +# Azure OpenAI API version (see: https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation) +AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 + +# Database name (default: Hotels) +AZURE_DOCUMENTDB_DATABASENAME=Hotels + +# Path to pre-computed vectors JSON file (default: ../data/Hotels_Vector.json) +DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + +# Field name containing embeddings in the data file +EMBEDDED_FIELD=DescriptionVector + +# Embedding dimensions (default: 1536) +EMBEDDING_DIMENSIONS=1536 + +# Algorithm 
to test: all, diskann, hnsw, ivf (default: all) +ALGORITHM=all + +# Similarity to test: COS, L2, IP (default: COS) +SIMILARITY=COS \ No newline at end of file diff --git a/ai/select-algorithm-python/.gitignore b/ai/select-algorithm-python/.gitignore new file mode 100644 index 0000000..87965ce --- /dev/null +++ b/ai/select-algorithm-python/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.pyc +.env +.venv/ + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md new file mode 100644 index 0000000..94a1905 --- /dev/null +++ b/ai/select-algorithm-python/README.md @@ -0,0 +1,126 @@ + +# Select Vector Algorithm (Python) + +Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB. Each algorithm is optimized for different dataset sizes and performance requirements. + +## Algorithm Selection Guide + +| Algorithm | Dataset Size | Cluster Tier | Key Parameters | +|-----------|-------------|--------------|----------------| +| IVF | < 10K docs | M10+ | numLists | +| HNSW | 10K-50K | M30+ | m, efConstruction | +| DiskANN | 50K+ | M40+ | maxDegree, lBuild | + +## Prerequisites + +- Azure subscription +- Azure DocumentDB vCore cluster (M40+ for all algorithms, M10+ for IVF only) +- Azure OpenAI resource with `text-embedding-3-small` deployed +- Python 3.10+ +- Azure CLI (`az login` for passwordless auth) + +## Setup + +1. ### Configure environment variables + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + +3. Install dependencies: + ```bash + cd src + pip install -r ../requirements.txt + ``` + +4. 
Copy the data file: + + Copy `Hotels_Vector.json` from the repository's `ai/data/` folder into this project's `data/` folder: + + ```bash + cp ../../data/Hotels_Vector.json ./data/ + ``` + +5. Ensure you're logged in to Azure: + ```bash + az login + ``` + +## Run + +```bash +cd src + +# Run individual algorithms +python ivf.py +python hnsw.py +python diskann.py +``` + +## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation: + +```bash +cd src +python compare_all.py +``` + +The script creates a single `hotels` collection, loads data once, then for each of the 9 algorithm/metric combinations: creates the index → searches → drops the index. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially. + +**Environment variables:** + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `3` | Number of results per search | +| `VERBOSE` | `false` | Show all k results per combo | + +**Output:** +``` +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +Algorithm Similarity #1 Result #1 Score #2 Result #2 Score Diff +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +HNSW COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +HNSW L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +HNSW IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +DiskANN COS Ocean Water Resort &.. 
0.6184 Windy Ocean Motel 0.5056 0.1128 +DiskANN L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +DiskANN IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. +==================================================================================================== +``` diff --git a/ai/select-algorithm-python/data/README.md b/ai/select-algorithm-python/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-python/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-python/output/compare_all.txt b/ai/select-algorithm-python/output/compare_all.txt new file mode 100644 index 0000000..bcf32dc --- /dev/null +++ b/ai/select-algorithm-python/output/compare_all.txt @@ -0,0 +1,48 @@ +====================================================================== + Compare All Algorithms - 9 Combinations + (3 Algorithms x 3 Similarity Metrics) +====================================================================== + + Query: "luxury hotel near the beach" + Top K: 3 + Verbose: False + +Dropped existing 'hotels' collection (if any) +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Generating embedding for query... +Running 9 vector searches... 
+ + Created index 'vector_ivf_cos' + Created index 'vector_ivf_l2' + Created index 'vector_ivf_ip' + Created index 'vector_hnsw_cos' + Created index 'vector_hnsw_l2' + Created index 'vector_hnsw_ip' + Created index 'vector_diskann_cos' + Created index 'vector_diskann_l2' + Created index 'vector_diskann_ip' ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| Algorithm | Metric | Index Name | Latency | Results | Top Score | Top Result | ++=============+==========+====================+===========+===========+=============+==========================+ +| IVF | COS | vector_ivf_cos | 213.9 ms | 3 | 0.6184 | Ocean Water Resort & Spa | ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| IVF | L2 | vector_ivf_l2 | 109.3 ms | 3 | 0.8736 | Ocean Water Resort & Spa | ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| IVF | IP | vector_ivf_ip | 104.8 ms | 3 | 0.6184 | Ocean Water Resort & Spa | ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| HNSW | COS | vector_hnsw_cos | 103.0 ms | 3 | 0.6184 | Ocean Water Resort & Spa | ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| HNSW | L2 | vector_hnsw_l2 | 103.1 ms | 3 | 0.8736 | Ocean Water Resort & Spa | ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| HNSW | IP | vector_hnsw_ip | 102.5 ms | 3 | 0.6184 | Ocean Water Resort & Spa | ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| DiskANN | COS | vector_diskann_cos | 102.6 ms | 3 | 0.6184 | Ocean Water Resort & Spa | 
++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| DiskANN | L2 | vector_diskann_l2 | 102.4 ms | 3 | 0.8736 | Ocean Water Resort & Spa | ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ +| DiskANN | IP | vector_diskann_ip | 102.7 ms | 3 | 0.6184 | Ocean Water Resort & Spa | ++-------------+----------+--------------------+-----------+-----------+-------------+--------------------------+ + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-python/requirements.txt b/ai/select-algorithm-python/requirements.txt new file mode 100644 index 0000000..63bdeb8 --- /dev/null +++ b/ai/select-algorithm-python/requirements.txt @@ -0,0 +1,14 @@ +# MongoDB driver for connecting to DocumentDB +pymongo>=4.7.0 + +# Azure OpenAI SDK for generating embeddings +openai>=1.0.0,<2.0.0 + +# Azure authentication library for passwordless connection +azure-identity>=1.15.0 + +# Environment variable management from .env files +python-dotenv>=1.0.0 + +# Formatted table output for compare_all.py +tabulate>=0.9.0 diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py new file mode 100644 index 0000000..2d04b3a --- /dev/null +++ b/ai/select-algorithm-python/src/compare_all.py @@ -0,0 +1,224 @@ +""" +Compare All Algorithms — Unified comparison runner. + +Executes all 9 combinations (3 algorithms × 3 similarity metrics) in a single +invocation and prints a formatted comparison table. 
+ +Algorithms: IVF, HNSW, DiskANN +Metrics: COS, L2, IP +""" +import os +import time +from typing import Dict, List, Any, Tuple + +from tabulate import tabulate +from utils import ( + get_clients_passwordless, get_config, read_file_return_json, + insert_data +) + +# Index definitions: (algo_label, kind, extra_params) +ALGORITHMS = [ + ("IVF", "vector-ivf", {"numLists": 1}), + ("HNSW", "vector-hnsw", {"m": 16, "efConstruction": 64}), + ("DiskANN", "vector-diskann", {"maxDegree": 32, "lBuild": 50}), +] + +METRICS = ["COS", "L2", "IP"] + + +def get_compare_config() -> Dict[str, Any]: + """Load comparison-specific configuration from environment variables.""" + config = get_config() + config["query_text"] = os.getenv("QUERY_TEXT", "luxury hotel near the beach") + config["top_k"] = int(os.getenv("TOP_K", "3")) + config["verbose"] = os.getenv("VERBOSE", "false").lower() in ("true", "1", "yes") + return config + + +def index_name(algo: str, metric: str) -> str: + """Generate canonical index name: vector_{algo}_{metric}.""" + return f"vector_{algo.lower()}_{metric.lower()}" + + +def get_existing_index_names(collection) -> List[str]: + """Return names of existing indexes on the collection.""" + return [idx["name"] for idx in collection.list_indexes()] + + +def drop_vector_indexes(collection, vector_field: str) -> None: + """Drop all existing vector indexes on *vector_field*.""" + for idx in collection.list_indexes(): + name = idx.get("name", "") + key = idx.get("key", {}) + if vector_field in key and key[vector_field] == "cosmosSearch": + collection.drop_index(name) + + +def create_vector_index(collection, name: str, kind: str, vector_field: str, + dimensions: int, similarity: str, + extra_params: Dict[str, Any]) -> None: + """Create a single vector index.""" + cosmos_options = { + "kind": kind, + "dimensions": dimensions, + "similarity": similarity, + **extra_params, + } + + index_command = { + "createIndexes": collection.name, + "indexes": [ + { + "name": name, + "key": 
{vector_field: "cosmosSearch"}, + "cosmosSearchOptions": cosmos_options, + } + ], + } + collection.database.command(index_command) + + +def generate_embedding(azure_openai_client, query_text: str, + model_name: str) -> List[float]: + """Generate a single embedding for the query text.""" + response = azure_openai_client.embeddings.create( + input=[query_text], + model=model_name + ) + return response.data[0].embedding + + +def vector_search_with_index(collection, query_embedding: List[float], + vector_field: str, + top_k: int) -> Tuple[List[Dict[str, Any]], float]: + """Run vector search using the single active index and return results + latency.""" + pipeline = [ + { + "$search": { + "cosmosSearch": { + "vector": query_embedding, + "path": vector_field, + "k": top_k + } + } + }, + { + "$project": { + "document": "$$ROOT", + "score": {"$meta": "searchScore"} + } + } + ] + + start = time.perf_counter() + results = list(collection.aggregate(pipeline)) + elapsed_ms = (time.perf_counter() - start) * 1000 + + return results, elapsed_ms + + +def format_top_result(results: List[Dict[str, Any]]) -> str: + """Extract top result name for display.""" + if not results: + return "(no results)" + doc = results[0].get("document", results[0]) + return doc.get("HotelName", doc.get("name", "Unknown")) + + +def main(): + print("=" * 70) + print(" Compare All Algorithms — 9 Combinations") + print(" (3 Algorithms × 3 Similarity Metrics)") + print("=" * 70) + + config = get_compare_config() + query_text = config["query_text"] + top_k = config["top_k"] + verbose = config["verbose"] + + print(f"\n Query: \"{query_text}\"") + print(f" Top K: {top_k}") + print(f" Verbose: {verbose}\n") + + mongo_client, azure_openai_client = get_clients_passwordless() + + try: + database = mongo_client[config["database_name"]] + + # Drop collection for a clean comparison + database.drop_collection("hotels") + print("Dropped existing 'hotels' collection (if any)") + + # Create fresh collection and load data + 
collection = database["hotels"] + data = read_file_return_json(config["data_file"]) + documents = [doc for doc in data if config["vector_field"] in doc] + print(f"Loaded {len(documents)} documents with embeddings") + insert_data(collection, documents, config["batch_size"]) + + # Generate ONE embedding for the query + print("\nGenerating embedding for query...") + query_embedding = generate_embedding( + azure_openai_client, query_text, config["model_name"] + ) + + # Run all 9 searches sequentially (create→search→drop for each) + print("Running 9 vector searches...\n") + table_rows = [] + + for algo_label, kind, extra_params in ALGORITHMS: + for metric in METRICS: + name = index_name(algo_label, metric) + # Drop all vector indexes first + drop_vector_indexes(collection, config["vector_field"]) + # Create this specific index + create_vector_index( + collection, name, kind, config["vector_field"], + config["dimensions"], metric, extra_params + ) + print(f" Created index '{name}'") + time.sleep(5) # Increased wait time + # Search (no index name needed) + results, latency_ms = vector_search_with_index( + collection, query_embedding, config["vector_field"], top_k + ) + + top_score = results[0].get("score", 0) if results else 0 + top_name = format_top_result(results) + + table_rows.append([ + algo_label, + metric, + name, + f"{latency_ms:.1f} ms", + len(results), + f"{top_score:.4f}", + top_name, + ]) + + if verbose: + for i, r in enumerate(results, 1): + doc = r.get("document", r) + hotel = doc.get("HotelName", doc.get("name", "Unknown")) + score = r.get("score", 0) + print(f" {name} #{i}: {hotel} (score: {score:.4f})") + + # Print comparison table + headers = ["Algorithm", "Metric", "Index Name", "Latency", + "Results", "Top Score", "Top Result"] + print(tabulate(table_rows, headers=headers, tablefmt="grid")) + + finally: + # Cleanup: drop the comparison collection + try: + database = mongo_client[config["database_name"]] + database.drop_collection("hotels") + 
print("\nCleanup: dropped collection 'hotels'") + except Exception as e: + print(f"Cleanup warning: {e}") + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-python/src/diskann.py b/ai/select-algorithm-python/src/diskann.py new file mode 100644 index 0000000..db7b108 --- /dev/null +++ b/ai/select-algorithm-python/src/diskann.py @@ -0,0 +1,126 @@ +import os +from typing import List, Dict, Any +from utils import get_clients_passwordless, get_config, read_file_return_json, insert_data, drop_vector_indexes, perform_vector_search, print_search_results +from dotenv import load_dotenv + +load_dotenv() + + +def create_diskann_vector_index(collection, vector_field: str, dimensions: int, similarity: str = "COS") -> None: + """Create DiskANN vector index with specified similarity metric.""" + print(f"Creating DiskANN vector index (similarity={similarity})...") + + # Drop any existing vector indexes on this field first + drop_vector_indexes(collection, vector_field) + + index_command = { + "createIndexes": collection.name, + "indexes": [ + { + "name": f"diskann_index_{vector_field}_{similarity.lower()}", + "key": {vector_field: "cosmosSearch"}, + "cosmosSearchOptions": { + "kind": "vector-diskann", + "dimensions": dimensions, + "similarity": similarity, + "maxDegree": 32, + "lBuild": 50 + } + } + ] + } + + try: + collection.database.command(index_command) + print("DiskANN vector index created successfully") + except Exception as e: + error_msg = str(e) + print(f"Error creating DiskANN vector index: {e}") + + if "not enabled for this cluster tier" in error_msg or "M40" in error_msg: + print("\n⚠️ DiskANN requires Azure DocumentDB cluster tier M40 or higher.") + print(" Try HNSW or IVF instead, or upgrade your cluster tier.") + + raise + + +def main(): + print("=" * 60) + print(" DiskANN Vector Search — Select Algorithm Sample") + print("=" * 60) + + config = get_config() + similarity = config.get('similarity', 'COS').upper() + + 
print(f"\n Algorithm: DiskANN") + print(f" Similarity: {similarity}") + print(f" Database: {config['database_name']}") + print(f" ⚠️ Requires cluster tier M40 or higher\n") + + mongo_client, azure_openai_client = get_clients_passwordless() + + try: + database = mongo_client[config['database_name']] + collection_name = f"hotels_diskann_{similarity.lower()}" + + # Drop collection if exists (clean start) + if collection_name in database.list_collection_names(): + database.drop_collection(collection_name) + print(f"Dropped existing collection '{collection_name}'") + + collection = database[collection_name] + + # Load hotel data with embeddings + print(f"Loading data from {config['data_file']}...") + data = read_file_return_json(config['data_file']) + documents = [doc for doc in data if config['vector_field'] in doc] + print(f"Loaded {len(documents)} documents with embeddings") + + # Insert data + insert_data(collection, documents, config['batch_size']) + + # Create DiskANN index + create_diskann_vector_index( + collection, + config['vector_field'], + config['dimensions'], + similarity + ) + + # Wait for index to be ready + import time + print("Waiting for index to be ready...") + time.sleep(3) + + # Perform vector search + query = os.getenv("QUERY_TEXT", "quintessential lodging near running trails, eateries, retail") + print(f'\nQuery: "{query}"\n') + + results = perform_vector_search( + collection, + azure_openai_client, + query, + config['vector_field'], + config['model_name'], + top_k=5 + ) + + print_search_results(results, f"DiskANN ({similarity})") + + except Exception as e: + print(f"\nError: {e}") + raise + + finally: + # Cleanup + try: + database = mongo_client[config['database_name']] + database.drop_collection(collection_name) + print(f"\nCleanup: dropped collection '{collection_name}'") + except Exception: + pass + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-python/src/hnsw.py 
b/ai/select-algorithm-python/src/hnsw.py new file mode 100644 index 0000000..1371462 --- /dev/null +++ b/ai/select-algorithm-python/src/hnsw.py @@ -0,0 +1,119 @@ +import os +from typing import List, Dict, Any +from utils import get_clients_passwordless, get_config, read_file_return_json, insert_data, drop_vector_indexes, perform_vector_search, print_search_results +from dotenv import load_dotenv + +load_dotenv() + + +def create_hnsw_vector_index(collection, vector_field: str, dimensions: int, similarity: str = "COS") -> None: + """Create HNSW vector index with specified similarity metric.""" + print(f"Creating HNSW vector index (similarity={similarity})...") + + # Drop any existing vector indexes on this field first + drop_vector_indexes(collection, vector_field) + + index_command = { + "createIndexes": collection.name, + "indexes": [ + { + "name": f"hnsw_index_{vector_field}_{similarity.lower()}", + "key": {vector_field: "cosmosSearch"}, + "cosmosSearchOptions": { + "kind": "vector-hnsw", + "dimensions": dimensions, + "similarity": similarity, + "m": 16, + "efConstruction": 64 + } + } + ] + } + + try: + collection.database.command(index_command) + print("HNSW vector index created successfully") + except Exception as e: + print(f"Error creating HNSW vector index: {e}") + raise + + +def main(): + print("=" * 60) + print(" HNSW Vector Search — Select Algorithm Sample") + print("=" * 60) + + config = get_config() + similarity = config.get('similarity', 'COS').upper() + + print(f"\n Algorithm: HNSW") + print(f" Similarity: {similarity}") + print(f" Database: {config['database_name']}\n") + + mongo_client, azure_openai_client = get_clients_passwordless() + + try: + database = mongo_client[config['database_name']] + collection_name = f"hotels_hnsw_{similarity.lower()}" + + # Drop collection if exists (clean start) + if collection_name in database.list_collection_names(): + database.drop_collection(collection_name) + print(f"Dropped existing collection 
'{collection_name}'") + + collection = database[collection_name] + + # Load hotel data with embeddings + print(f"Loading data from {config['data_file']}...") + data = read_file_return_json(config['data_file']) + documents = [doc for doc in data if config['vector_field'] in doc] + print(f"Loaded {len(documents)} documents with embeddings") + + # Insert data + insert_data(collection, documents, config['batch_size']) + + # Create HNSW index + create_hnsw_vector_index( + collection, + config['vector_field'], + config['dimensions'], + similarity + ) + + # Wait for index to be ready + import time + print("Waiting for index to be ready...") + time.sleep(2) + + # Perform vector search + query = os.getenv("QUERY_TEXT", "quintessential lodging near running trails, eateries, retail") + print(f'\nQuery: "{query}"\n') + + results = perform_vector_search( + collection, + azure_openai_client, + query, + config['vector_field'], + config['model_name'], + top_k=5 + ) + + print_search_results(results, f"HNSW ({similarity})") + + except Exception as e: + print(f"\nError: {e}") + raise + + finally: + # Cleanup + try: + database = mongo_client[config['database_name']] + database.drop_collection(collection_name) + print(f"\nCleanup: dropped collection '{collection_name}'") + except Exception: + pass + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-python/src/ivf.py b/ai/select-algorithm-python/src/ivf.py new file mode 100644 index 0000000..44416a9 --- /dev/null +++ b/ai/select-algorithm-python/src/ivf.py @@ -0,0 +1,118 @@ +import os +from typing import List, Dict, Any +from utils import get_clients_passwordless, get_config, read_file_return_json, insert_data, drop_vector_indexes, perform_vector_search, print_search_results +from dotenv import load_dotenv + +load_dotenv() + + +def create_ivf_vector_index(collection, vector_field: str, dimensions: int, similarity: str = "COS") -> None: + """Create IVF vector index with specified similarity 
metric.""" + print(f"Creating IVF vector index (similarity={similarity})...") + + # Drop any existing vector indexes on this field first + drop_vector_indexes(collection, vector_field) + + index_command = { + "createIndexes": collection.name, + "indexes": [ + { + "name": f"ivf_index_{vector_field}_{similarity.lower()}", + "key": {vector_field: "cosmosSearch"}, + "cosmosSearchOptions": { + "kind": "vector-ivf", + "dimensions": dimensions, + "similarity": similarity, + "numLists": 1 # Small dataset + } + } + ] + } + + try: + collection.database.command(index_command) + print("IVF vector index created successfully") + except Exception as e: + print(f"Error creating IVF vector index: {e}") + raise + + +def main(): + print("=" * 60) + print(" IVF Vector Search — Select Algorithm Sample") + print("=" * 60) + + config = get_config() + similarity = config.get('similarity', 'COS').upper() + + print(f"\n Algorithm: IVF") + print(f" Similarity: {similarity}") + print(f" Database: {config['database_name']}\n") + + mongo_client, azure_openai_client = get_clients_passwordless() + + try: + database = mongo_client[config['database_name']] + collection_name = f"hotels_ivf_{similarity.lower()}" + + # Drop collection if exists (clean start) + if collection_name in database.list_collection_names(): + database.drop_collection(collection_name) + print(f"Dropped existing collection '{collection_name}'") + + collection = database[collection_name] + + # Load hotel data with embeddings + print(f"Loading data from {config['data_file']}...") + data = read_file_return_json(config['data_file']) + documents = [doc for doc in data if config['vector_field'] in doc] + print(f"Loaded {len(documents)} documents with embeddings") + + # Insert data + insert_data(collection, documents, config['batch_size']) + + # Create IVF index + create_ivf_vector_index( + collection, + config['vector_field'], + config['dimensions'], + similarity + ) + + # Wait for index to be ready + import time + print("Waiting for 
index to be ready...") + time.sleep(3) + + # Perform vector search + query = os.getenv("QUERY_TEXT", "quintessential lodging near running trails, eateries, retail") + print(f'\nQuery: "{query}"\n') + + results = perform_vector_search( + collection, + azure_openai_client, + query, + config['vector_field'], + config['model_name'], + top_k=5 + ) + + print_search_results(results, f"IVF ({similarity})") + + except Exception as e: + print(f"\nError: {e}") + raise + + finally: + # Cleanup + try: + database = mongo_client[config['database_name']] + database.drop_collection(collection_name) + print(f"\nCleanup: dropped collection '{collection_name}'") + except Exception: + pass + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-python/src/utils.py b/ai/select-algorithm-python/src/utils.py new file mode 100644 index 0000000..09d1386 --- /dev/null +++ b/ai/select-algorithm-python/src/utils.py @@ -0,0 +1,176 @@ +import json +import os +import time +import warnings +from typing import Dict, List, Any, Optional, Tuple + +# Suppress the PyMongo CosmosDB cluster detection warning +warnings.filterwarnings( + "ignore", + message="You appear to be connected to a CosmosDB cluster.*", +) + +from pymongo import MongoClient, InsertOne +from pymongo.collection import Collection +from pymongo.errors import BulkWriteError +from azure.identity import DefaultAzureCredential, get_bearer_token_provider +from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult +from openai import AzureOpenAI +from dotenv import load_dotenv + +# Load environment variables from .env file in project root +# After azd up, run: azd env get-values > .env +load_dotenv() + + +class AzureIdentityTokenCallback(OIDCCallback): + def __init__(self, credential): + self.credential = credential + + def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: + token = self.credential.get_token( + 
"https://ossrdbms-aad.database.windows.net/.default").token + return OIDCCallbackResult(access_token=token) + + +def get_clients_passwordless() -> Tuple[MongoClient, AzureOpenAI]: + """Create MongoDB and Azure OpenAI clients using passwordless auth.""" + cluster_name = os.getenv("MONGO_CLUSTER_NAME") + if not cluster_name: + raise ValueError("MONGO_CLUSTER_NAME environment variable is required") + + credential = DefaultAzureCredential() + + mongo_client = MongoClient( + f"mongodb+srv://{cluster_name}.mongocluster.cosmos.azure.com/", + connectTimeoutMS=120000, + tls=True, + retryWrites=False, + authMechanism="MONGODB-OIDC", + authMechanismProperties={"OIDC_CALLBACK": AzureIdentityTokenCallback(credential)} + ) + + azure_openai_endpoint = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if not azure_openai_endpoint: + raise ValueError("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required") + + token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default") + + azure_openai_client = AzureOpenAI( + azure_endpoint=azure_openai_endpoint, + azure_ad_token_provider=token_provider, + api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION", "2024-10-21") + ) + + return mongo_client, azure_openai_client + + +def get_config() -> Dict[str, Any]: + """Load configuration from environment variables.""" + return { + 'database_name': os.getenv('AZURE_DOCUMENTDB_DATABASENAME', 'Hotels'), + 'data_file': os.getenv('DATA_FILE_WITH_VECTORS', './data/Hotels_Vector.json'), + 'vector_field': os.getenv('EMBEDDED_FIELD', 'DescriptionVector'), + 'model_name': os.getenv('AZURE_OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'), + 'dimensions': int(os.getenv('EMBEDDING_DIMENSIONS', '1536')), + 'batch_size': int(os.getenv('LOAD_SIZE_BATCH', '100')), + 'similarity': os.getenv('SIMILARITY', 'COS'), + } + + +def read_file_return_json(file_path: str) -> List[Dict[str, Any]]: + """Read a JSON file and return the parsed data.""" + try: + with 
open(file_path, 'r', encoding='utf-8') as file: + return json.load(file) + except FileNotFoundError: + print(f"Error: File '{file_path}' not found") + raise + + +def insert_data(collection: Collection, data: List[Dict[str, Any]], + batch_size: int = 100) -> Dict[str, Any]: + """Insert data into collection in batches, skipping if already populated.""" + total_documents = len(data) + + existing_count = collection.count_documents({}) + if existing_count >= total_documents: + print(f"Collection already has {existing_count} documents, skipping insert") + return {'total': total_documents, 'inserted': 0, 'skipped': True} + + if existing_count > 0: + collection.delete_many({}) + + inserted_count = 0 + for i in range(0, total_documents, batch_size): + batch = data[i:i + batch_size] + try: + operations = [InsertOne(doc) for doc in batch] + result = collection.bulk_write(operations, ordered=False) + inserted_count += result.inserted_count + except BulkWriteError as e: + inserted_count += e.details.get('nInserted', 0) + time.sleep(0.1) + + print(f"Inserted {inserted_count}/{total_documents} documents") + return {'total': total_documents, 'inserted': inserted_count, 'skipped': False} + + +def drop_vector_indexes(collection: Collection, vector_field: str) -> None: + """Drop any existing vector indexes on the specified field.""" + try: + indexes = list(collection.list_indexes()) + for index in indexes: + if 'key' in index and vector_field in index['key']: + if index['key'][vector_field] == 'cosmosSearch': + collection.drop_index(index['name']) + print(f"Dropped existing vector index: {index['name']}") + except Exception as e: + print(f"Warning: Error dropping indexes: {e}") + + +def perform_vector_search(collection: Collection, + azure_openai_client: AzureOpenAI, + query_text: str, + vector_field: str, + model_name: str, + top_k: int = 5) -> List[Dict[str, Any]]: + """Perform vector search using the $search aggregation stage.""" + embedding_response = 
azure_openai_client.embeddings.create( + input=[query_text], + model=model_name + ) + query_embedding = embedding_response.data[0].embedding + + pipeline = [ + { + "$search": { + "cosmosSearch": { + "vector": query_embedding, + "path": vector_field, + "k": top_k + } + } + }, + { + "$project": { + "document": "$$ROOT", + "score": {"$meta": "searchScore"} + } + } + ] + + return list(collection.aggregate(pipeline)) + + +def print_search_results(results: List[Dict[str, Any]], algorithm: str) -> None: + """Print formatted search results.""" + print(f"\n{'='*60}") + print(f" {algorithm} Search Results ({len(results)} found)") + print(f"{'='*60}") + for i, result in enumerate(results, 1): + doc = result.get('document', result) + name = doc.get('HotelName', doc.get('name', 'Unknown')) + score = result.get('score', 0) + print(f" {i}. {name} (score: {score:.4f})") + print() diff --git a/ai/select-algorithm-typescript/.env.example b/ai/select-algorithm-typescript/.env.example new file mode 100644 index 0000000..73e1fa9 --- /dev/null +++ b/ai/select-algorithm-typescript/.env.example @@ -0,0 +1,10 @@ +MONGO_CLUSTER_NAME=your-cluster-name +AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com +AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small +AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 +AZURE_DOCUMENTDB_DATABASENAME=Hotels +DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json +EMBEDDED_FIELD=DescriptionVector +EMBEDDING_DIMENSIONS=1536 +LOAD_SIZE_BATCH=100 +SIMILARITY=COS diff --git a/ai/select-algorithm-typescript/.gitignore b/ai/select-algorithm-typescript/.gitignore new file mode 100644 index 0000000..4477a63 --- /dev/null +++ b/ai/select-algorithm-typescript/.gitignore @@ -0,0 +1,7 @@ +node_modules/ +dist/ +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md new file mode 100644 index 0000000..19ce353 --- /dev/null +++ 
b/ai/select-algorithm-typescript/README.md @@ -0,0 +1,131 @@ +# Select Algorithm — TypeScript + +Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB using TypeScript. + +## Prerequisites + +- [Node.js 20+](https://nodejs.org/) +- [Azure CLI](https://learn.microsoft.com/cli/azure/install-azure-cli) (for `az login`) +- An Azure DocumentDB cluster with vector search enabled +- An Azure OpenAI resource with an embedding model deployed + +## Setup + +1. **Install dependencies:** + + ```bash + npm install + ``` + +2. **Sign in to Azure** (for passwordless authentication): + + ```bash + az login + ``` + +3. **Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file in the project folder with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + + | Variable | Description | + |---|---| + | `MONGO_CLUSTER_NAME` | Your DocumentDB cluster name | + | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | + | `AZURE_OPENAI_EMBEDDING_MODEL` | Embedding model deployment name | + | `AZURE_OPENAI_EMBEDDING_API_VERSION` | Azure OpenAI API version | + | `AZURE_DOCUMENTDB_DATABASENAME` | Database name (default: `Hotels`) | + | `DATA_FILE_WITH_VECTORS` | Path to JSON data file with vectors | + | `EMBEDDED_FIELD` | Field name containing the vector (default: `DescriptionVector`) | + | `EMBEDDING_DIMENSIONS` | Vector dimensions (default: `1536`) | + | `LOAD_SIZE_BATCH` | Batch size for data insertion | + | `SIMILARITY` | Similarity metric: `COS`, `L2`, or `IP` | + +4. **Copy the data file:** + + Copy `Hotels_Vector.json` from the repository's `ai/data/` folder into this project's `data/` folder: + + ```bash + cp ../../data/Hotels_Vector.json ./data/ + ``` + +5. 
**Build the project:** + + ```bash + npm run build + ``` + +## Run + +Each script creates a collection, inserts data, builds a vector index, and performs a similarity search. + +```bash +# IVF (Inverted File Index) +npm run start:ivf + +# HNSW (Hierarchical Navigable Small World) +npm run start:hnsw + +# DiskANN +npm run start:diskann +``` + +## Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and view a formatted comparison table: + +```bash +npm start +``` + +**Environment variables** (optional overrides): + +| Variable | Default | Description | +|---|---|---| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `3` | Number of results per combination | +| `VERBOSE` | `false` | When `true`, shows all k results per combo | + +The script creates a single `hotels` collection, loads data once, then for each of the 9 algorithm/metric combinations: creates the index → searches → drops the index. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially. + +**Output:** +``` +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +Algorithm Similarity #1 Result #1 Score #2 Result #2 Score Diff +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... 
+==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. +==================================================================================================== +``` + +## Algorithm comparison + +| Algorithm | Index type | Best for | +|---|---|---| +| **IVF** | `vector-ivf` | Smaller datasets, lower memory usage | +| **HNSW** | `vector-hnsw` | Fast approximate search, balanced recall/speed | +| **DiskANN** | `vector-diskann` | Large-scale datasets, disk-based search | diff --git a/ai/select-algorithm-typescript/data/README.md b/ai/select-algorithm-typescript/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-typescript/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. 
diff --git a/ai/select-algorithm-typescript/output/compare_all.txt b/ai/select-algorithm-typescript/output/compare_all.txt new file mode 100644 index 0000000..aa0ccab --- /dev/null +++ b/ai/select-algorithm-typescript/output/compare_all.txt @@ -0,0 +1,38 @@ +Select Algorithm Demo - Azure DocumentDB Vector Search (TypeScript) +------------------------------------------------------------------- +Using Azure OpenAI Embedding Deployment/Model: text-embedding-3-small + +Loaded 50 documents +Inserted 50 documents + +Query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running searches (top 3 results)... + + vector_ivf_cos created + vector_ivf_l2 created + vector_ivf_ip created + vector_hnsw_cos created + vector_hnsw_l2 created + vector_hnsw_ip created + vector_diskann_cos created + vector_diskann_l2 created + vector_diskann_ip created + + COMPARISON RESULTS + + Algorithm Similarity Latency Top Score Top Result + --------- ---------- ------- --------- ---------------------------- + IVF COS 217ms 0.6184 Ocean Water Resort & Spa + IVF L2 110ms 0.8735 Ocean Water Resort & Spa + IVF IP 106ms 0.6183 Ocean Water Resort & Spa + HNSW COS 104ms 0.6184 Ocean Water Resort & Spa + HNSW L2 104ms 0.8735 Ocean Water Resort & Spa + HNSW IP 103ms 0.6183 Ocean Water Resort & Spa + DiskANN COS 104ms 0.6184 Ocean Water Resort & Spa + DiskANN L2 104ms 0.8735 Ocean Water Resort & Spa + DiskANN IP 103ms 0.6183 Ocean Water Resort & Spa + +Cleanup: dropped collection "hotels" +Database connection closed \ No newline at end of file diff --git a/ai/select-algorithm-typescript/package-lock.json b/ai/select-algorithm-typescript/package-lock.json new file mode 100644 index 0000000..f0ceb74 --- /dev/null +++ b/ai/select-algorithm-typescript/package-lock.json @@ -0,0 +1,735 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "select-algorithm-typescript", + "version": "1.0.0", + 
"dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } + }, + "node_modules/@azure/abort-controller": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/@azure/abort-controller/-/abort-controller-2.1.2.tgz", + "integrity": "sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-auth": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-auth/-/core-auth-1.10.1.tgz", + "integrity": "sha512-ykRMW8PjVAn+RS6ww5cmK9U2CyH9p4Q88YJwvUslfuMmN98w/2rdGRLPqJYObapBCdzBVeDgYWdJnFPFb7qzpg==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-util": "^1.13.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-client": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-client/-/core-client-1.10.1.tgz", + "integrity": "sha512-Nh5PhEOeY6PrnxNPsEHRr9eimxLwgLlpmguQaHKBinFYA/RU9+kOYVOQqOrTsCL+KSxrLLl1gD8Dk5BFW/7l/w==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-rest-pipeline": "^1.22.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-rest-pipeline": { + "version": "1.23.0", + "resolved": "https://registry.npmjs.org/@azure/core-rest-pipeline/-/core-rest-pipeline-1.23.0.tgz", + "integrity": "sha512-Evs1INHo+jUjwHi1T6SG6Ua/LHOQBCLuKEEE6efIpt4ZOoNonaT1kP32GoOcdNDbfqsD2445CPri3MubBy5DEQ==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + 
"@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "@typespec/ts-http-runtime": "^0.3.4", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-tracing": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/@azure/core-tracing/-/core-tracing-1.3.1.tgz", + "integrity": "sha512-9MWKevR7Hz8kNzzPLfX4EAtGM2b8mr50HPDBvio96bURP/9C+HjdH3sBlLSNNrvRAr5/k/svoH457gB5IKpmwQ==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-util": { + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/@azure/core-util/-/core-util-1.13.1.tgz", + "integrity": "sha512-XPArKLzsvl0Hf0CaGyKHUyVgF7oDnhKoP85Xv6M4StF/1AhfORhZudHtOyf2s+FcbuQ9dPRAjB8J2KvRRMUK2A==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/identity": { + "version": "4.13.1", + "resolved": "https://registry.npmjs.org/@azure/identity/-/identity-4.13.1.tgz", + "integrity": "sha512-5C/2WD5Vb1lHnZS16dNQRPMjN6oV/Upba+C9nBIs15PmOi6A3ZGs4Lr2u60zw4S04gi+u3cEXiqTVP7M4Pz3kw==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "@azure/core-auth": "^1.9.0", + "@azure/core-client": "^1.9.2", + "@azure/core-rest-pipeline": "^1.17.0", + "@azure/core-tracing": "^1.0.0", + "@azure/core-util": "^1.11.0", + "@azure/logger": "^1.0.0", + "@azure/msal-browser": "^5.5.0", + "@azure/msal-node": "^5.1.0", + "open": "^10.1.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/logger": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@azure/logger/-/logger-1.3.0.tgz", + "integrity": "sha512-fCqPIfOcLE+CGqGPd66c8bZpwAji98tZ4JI9i/mlTNTlsIWslCfpg48s/ypyLxZTump5sypjrKn2/kY7q8oAbA==", + "license": "MIT", + 
"dependencies": { + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/msal-browser": { + "version": "5.9.0", + "resolved": "https://registry.npmjs.org/@azure/msal-browser/-/msal-browser-5.9.0.tgz", + "integrity": "sha512-CzE+4PefDSJWj26zU7G1bKchlGRRHMBFreG4tAlGuzyI8hAPiYGobaJvZBgZBf6L63iphX7VH+ityL8VgEQz9Q==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-common": { + "version": "16.5.2", + "resolved": "https://registry.npmjs.org/@azure/msal-common/-/msal-common-16.5.2.tgz", + "integrity": "sha512-GkDEL6TYo3HgT3UuqakdgE9PZfc1hMki6+Hwgy1uddb/EauvAKfu85vVhuofRSo22D1xTnWt8Ucwfg4vSCVwvA==", + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-node": { + "version": "5.1.5", + "resolved": "https://registry.npmjs.org/@azure/msal-node/-/msal-node-5.1.5.tgz", + "integrity": "sha512-ObTeMoNPmq19X3z40et9Xvs4ZoWVeJg43PZMRLG5iwVL+2nCtAerG3YTDItqPp1CfXNwmCXBbg8jn1DOx65c3g==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2", + "jsonwebtoken": "^9.0.0" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/@mongodb-js/saslprep": { + "version": "1.4.9", + "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.4.9.tgz", + "integrity": "sha512-RXSxsokhAF/4nWys8An8npsqOI33Ex1Hlzqjw2pZOO+GKtMAR2noGnUdsFiGwsaO/xXI+56mtjTmDA3JXJsvmA==", + "license": "MIT", + "dependencies": { + "sparse-bitfield": "^3.0.3" + } + }, + "node_modules/@types/node": { + "version": "24.12.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.2.tgz", + "integrity": "sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/@types/webidl-conversions": { + "version": "7.0.3", + 
"resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", + "integrity": "sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA==", + "license": "MIT" + }, + "node_modules/@types/whatwg-url": { + "version": "11.0.5", + "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz", + "integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==", + "license": "MIT", + "dependencies": { + "@types/webidl-conversions": "*" + } + }, + "node_modules/@typespec/ts-http-runtime": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@typespec/ts-http-runtime/-/ts-http-runtime-0.3.5.tgz", + "integrity": "sha512-yURCknZhvywvQItHMMmFSo+fq5arCUIyz/CVk7jD89MSai7dkaX8ufjCWp3NttLojoTVbcE72ri+be/TnEbMHw==", + "license": "MIT", + "dependencies": { + "http-proxy-agent": "^7.0.0", + "https-proxy-agent": "^7.0.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/bson": { + "version": "6.10.4", + "resolved": "https://registry.npmjs.org/bson/-/bson-6.10.4.tgz", + "integrity": "sha512-WIsKqkSC0ABoBJuT1LEX+2HEvNmNKKgnTAyd0fL8qzK4SH2i9NXg+t08YtdZp/V9IZ33cxe3iV4yM0qg8lMQng==", + "license": "Apache-2.0", + "engines": { + "node": ">=16.20.1" + } + }, + "node_modules/buffer-equal-constant-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", + "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", + "license": "BSD-3-Clause" + }, + 
"node_modules/bundle-name": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", + "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", + "license": "MIT", + "dependencies": { + "run-applescript": "^7.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/default-browser": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.5.0.tgz", + "integrity": "sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==", + "license": "MIT", + "dependencies": { + "bundle-name": "^4.1.0", + "default-browser-id": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/default-browser-id": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.1.tgz", + "integrity": "sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/define-lazy-prop": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", + "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", + 
"license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ecdsa-sig-formatter": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", + "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + } + }, + "node_modules/http-proxy-agent": { + "version": "7.0.2", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", + "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.0", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/is-docker": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", + "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", + "license": "MIT", + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-inside-container": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", + "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", + 
"license": "MIT", + "dependencies": { + "is-docker": "^3.0.0" + }, + "bin": { + "is-inside-container": "cli.js" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-wsl": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.1.tgz", + "integrity": "sha512-e6rvdUCiQCAuumZslxRJWR/Doq4VpPR82kqclvcS0efgt430SlGIk05vdCN58+VrzgtIcfNODjozVielycD4Sw==", + "license": "MIT", + "dependencies": { + "is-inside-container": "^1.0.0" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jsonwebtoken": { + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz", + "integrity": "sha512-MT/xP0CrubFRNLNKvxJ2BYfy53Zkm++5bX9dtuPbqAeQpTVe0MQTFhao8+Cp//EmJp244xt6Drw/GVEGCUj40g==", + "license": "MIT", + "dependencies": { + "jws": "^4.0.1", + "lodash.includes": "^4.3.0", + "lodash.isboolean": "^3.0.3", + "lodash.isinteger": "^4.0.4", + "lodash.isnumber": "^3.0.3", + "lodash.isplainobject": "^4.0.6", + "lodash.isstring": "^4.0.1", + "lodash.once": "^4.0.0", + "ms": "^2.1.1", + "semver": "^7.5.4" + }, + "engines": { + "node": ">=12", + "npm": ">=6" + } + }, + "node_modules/jwa": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", + "integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", + "license": "MIT", + "dependencies": { + "buffer-equal-constant-time": "^1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/jws": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", + "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", + "license": "MIT", + "dependencies": { + "jwa": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + 
"node_modules/lodash.includes": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/lodash.includes/-/lodash.includes-4.3.0.tgz", + "integrity": "sha512-W3Bx6mdkRTGtlJISOvVD/lbqjTlPPUDTMnlXZFnVwi9NKJ6tiAk6LVdlhZMm17VZisqhKcgzpO5Wz91PCt5b0w==", + "license": "MIT" + }, + "node_modules/lodash.isboolean": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isboolean/-/lodash.isboolean-3.0.3.tgz", + "integrity": "sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==", + "license": "MIT" + }, + "node_modules/lodash.isinteger": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/lodash.isinteger/-/lodash.isinteger-4.0.4.tgz", + "integrity": "sha512-DBwtEWN2caHQ9/imiNeEA5ys1JoRtRfY3d7V9wkqtbycnAmTvRRmbHKDV4a0EYc678/dia0jrte4tjYwVBaZUA==", + "license": "MIT" + }, + "node_modules/lodash.isnumber": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isnumber/-/lodash.isnumber-3.0.3.tgz", + "integrity": "sha512-QYqzpfwO3/CWf3XP+Z+tkQsfaLL/EnUlXWVkIk5FUPc4sBdTehEqZONuyRt2P67PXAk+NXmTBcc97zw9t1FQrw==", + "license": "MIT" + }, + "node_modules/lodash.isplainobject": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/lodash.isplainobject/-/lodash.isplainobject-4.0.6.tgz", + "integrity": "sha512-oSXzaWypCMHkPC3NvBEaPHf0KsA5mvPrOPgQWDsbg8n7orZ290M0BmC/jgRZ4vcJ6DTAhjrsSYgdsW/F+MFOBA==", + "license": "MIT" + }, + "node_modules/lodash.isstring": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.isstring/-/lodash.isstring-4.0.1.tgz", + "integrity": "sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==", + "license": "MIT" + }, + "node_modules/lodash.once": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/lodash.once/-/lodash.once-4.1.1.tgz", + "integrity": "sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==", + "license": "MIT" + }, + 
"node_modules/memory-pager": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz", + "integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==", + "license": "MIT" + }, + "node_modules/mongodb": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.21.0.tgz", + "integrity": "sha512-URyb/VXMjJ4da46OeSXg+puO39XH9DeQpWCslifrRn9JWugy0D+DvvBvkm2WxmHe61O/H19JM66p1z7RHVkZ6A==", + "license": "Apache-2.0", + "dependencies": { + "@mongodb-js/saslprep": "^1.3.0", + "bson": "^6.10.4", + "mongodb-connection-string-url": "^3.0.2" + }, + "engines": { + "node": ">=16.20.1" + }, + "peerDependencies": { + "@aws-sdk/credential-providers": "^3.188.0", + "@mongodb-js/zstd": "^1.1.0 || ^2.0.0", + "gcp-metadata": "^5.2.0", + "kerberos": "^2.0.1", + "mongodb-client-encryption": ">=6.0.0 <7", + "snappy": "^7.3.2", + "socks": "^2.7.1" + }, + "peerDependenciesMeta": { + "@aws-sdk/credential-providers": { + "optional": true + }, + "@mongodb-js/zstd": { + "optional": true + }, + "gcp-metadata": { + "optional": true + }, + "kerberos": { + "optional": true + }, + "mongodb-client-encryption": { + "optional": true + }, + "snappy": { + "optional": true + }, + "socks": { + "optional": true + } + } + }, + "node_modules/mongodb-connection-string-url": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.2.tgz", + "integrity": "sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA==", + "license": "Apache-2.0", + "dependencies": { + "@types/whatwg-url": "^11.0.2", + "whatwg-url": "^14.1.0 || ^13.0.0" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + 
"node_modules/open": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/open/-/open-10.2.0.tgz", + "integrity": "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==", + "license": "MIT", + "dependencies": { + "default-browser": "^5.2.1", + "define-lazy-prop": "^3.0.0", + "is-inside-container": "^1.0.0", + "wsl-utils": "^0.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/openai": { + "version": "5.23.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-5.23.2.tgz", + "integrity": "sha512-MQBzmTulj+MM5O8SKEk/gL8a7s5mktS9zUtAkU257WjvobGc9nKcBuVwjyEEcb9SI8a8Y2G/mzn3vm9n1Jlleg==", + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/run-applescript": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.1.0.tgz", + "integrity": "sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": 
"https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/semver": { + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/sparse-bitfield": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz", + "integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==", + "license": "MIT", + "dependencies": { + "memory-pager": "^1.0.2" + } + }, + "node_modules/tr46": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.1.1.tgz", + "integrity": "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==", + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "license": "0BSD" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.16.0", + "resolved": 
"https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "dev": true, + "license": "MIT" + }, + "node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/whatwg-url": { + "version": "14.2.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", + "integrity": "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==", + "license": "MIT", + "dependencies": { + "tr46": "^5.1.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/wsl-utils": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/wsl-utils/-/wsl-utils-0.1.0.tgz", + "integrity": "sha512-h3Fbisa2nKGPxCpm89Hk33lBLsnaGBvctQopaBSOW/uIs6FTe1ATyAnKFJrzVs9vpGdsTe73WF3V4lIsk4Gacw==", + "license": "MIT", + "dependencies": { + "is-wsl": "^3.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + } + } +} diff --git a/ai/select-algorithm-typescript/package.json b/ai/select-algorithm-typescript/package.json new file mode 100644 index 0000000..49fb408 --- /dev/null +++ b/ai/select-algorithm-typescript/package.json @@ -0,0 +1,23 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "description": "Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB", + "type": "module", + "scripts": { + "env:init": "azd env get-values > .env", + "build": "tsc", + "start": "node --env-file .env dist/compare-all.js", + "start:ivf": "node --env-file .env dist/ivf.js", + "start:hnsw": "node --env-file 
.env dist/hnsw.js", + "start:diskann": "node --env-file .env dist/diskann.js" + }, + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } +} diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts new file mode 100644 index 0000000..6d03508 --- /dev/null +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -0,0 +1,244 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, getConfig, insertData } from './utils.js'; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +interface AlgorithmConfig { + name: string; + kind: string; + options: Record; +} + +interface SearchResult { + algorithm: string; + similarity: string; + latencyMs: number; + topScore: number; + topResult: string; + results: Array<{ name: string; score: number }>; +} + +const ALGORITHMS: AlgorithmConfig[] = [ + { name: 'IVF', kind: 'vector-ivf', options: { numLists: 1 } }, + { name: 'HNSW', kind: 'vector-hnsw', options: { m: 16, efConstruction: 64 } }, + { name: 'DiskANN', kind: 'vector-diskann', options: { maxDegree: 32, lBuild: 50 } }, +]; + +const SIMILARITIES = ['COS', 'L2', 'IP']; + +async function main() { + const baseConfig = getConfig(); + const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; + const topK = parseInt(process.env.TOP_K || '3', 10); + const verbose = process.env.VERBOSE === 'true'; + const collectionName = 'hotels'; + + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) throw new Error('AI client is not configured.'); + if (!dbClient) throw new Error('Database client is not configured.'); + + await dbClient.connect(); + const db = dbClient.db(baseConfig.dbName); + + // Drop collection if 
it exists for a clean start + let collections = await db.listCollections({ name: collectionName }).toArray(); + if (collections.length > 0) { + try { + const col = db.collection(collectionName); + const existingIndexes = await col.listIndexes().toArray(); + for (const idx of existingIndexes) { + if (idx.name !== '_id_') { + try { + await col.dropIndex(idx.name); + } catch {} + } + } + await new Promise(r => setTimeout(r, 2000)); + await db.dropCollection(collectionName); + console.log(`Dropped existing collection: ${collectionName}`); + } catch (e: any) { + console.log(`Cleanup note: ${e.message.split('\n')[0]}`); + } + await new Promise(r => setTimeout(r, 10000)); + } + + // Load data once for reuse + const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); + console.log(`Loaded ${data.length} documents`); + + // Insert data into collection + const collection = db.collection(collectionName); + await insertData(baseConfig, collection, data); + + // Generate one embedding for the query + console.log(`\nQuery: "${queryText}"`); + const embeddingResponse = await aiClient.embeddings.create({ + model: baseConfig.deployment, + input: [queryText] + }); + const queryVector = embeddingResponse.data[0].embedding; + console.log(`Embedding generated (${queryVector.length} dimensions)`); + + // Sequential create→search→drop for each algorithm+similarity combo + // DocumentDB does not allow multiple vector indexes of the same kind on the same field + console.log(`\nRunning searches (top ${topK} results)...\n`); + const results: SearchResult[] = []; + + for (const algo of ALGORITHMS) { + for (const sim of SIMILARITIES) { + const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + + // 1. 
Drop all existing vector indexes + const indexes = await collection.listIndexes().toArray(); + let droppedAny = false; + for (const idx of indexes) { + if (idx.key && idx.key[baseConfig.embeddedField] === 'cosmosSearch') { + try { await collection.dropIndex(idx.name); droppedAny = true; } catch {} + } + } + if (droppedAny) { + await new Promise(r => setTimeout(r, 2000)); + } + + // 2. Create this specific index + const indexOptions = { + createIndexes: collectionName, + indexes: [{ + name: indexName, + key: { [baseConfig.embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: { + kind: algo.kind, + ...algo.options, + similarity: sim, + dimensions: baseConfig.embeddingDimensions + } + }] + }; + await db.command(indexOptions); + console.log(` ✓ ${indexName} created`); + + // 3. Wait for index to be ready + await new Promise(r => setTimeout(r, 5000)); + + // 4. Search with retry (index may need more time) + let searchResults: any[] = []; + let latencyMs = 0; + for (let attempt = 0; attempt < 3; attempt++) { + if (attempt > 0) { + await new Promise(r => setTimeout(r, 5000)); + } + try { + const start = performance.now(); + searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: baseConfig.embeddedField, + k: topK + } + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]).toArray(); + latencyMs = performance.now() - start; + if (searchResults.length > 0) break; + } catch (e) { + if (attempt === 2) throw e; + } + } + + // Record result + const topDoc = searchResults[0] as any; + results.push({ + algorithm: algo.name, + similarity: sim, + latencyMs, + topScore: topDoc?.score ?? 0, + topResult: topDoc?.document?.HotelName ?? '(none)', + results: searchResults.map((r: any) => ({ + name: r.document?.HotelName ?? '(none)', + score: r.score ?? 
0 + })) + }); + } + } + + // Print comparison table + printComparisonTable(results, verbose); + + } catch (error) { + console.error('Compare-all failed:', error); + process.exitCode = 1; + } finally { + // Cleanup: drop the comparison collection + if (dbClient) { + try { + const db = dbClient.db(baseConfig.dbName); + await db.dropCollection(collectionName); + console.log(`\nCleanup: dropped collection "${collectionName}"`); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } + } +} + +function printComparisonTable(results: SearchResult[], verbose: boolean) { + const algoWidth = 10; + const simWidth = 10; + const latWidth = 8; + const scoreWidth = 10; + const nameWidth = 30; + + const pad = (s: string, w: number) => s.length >= w ? s.slice(0, w) : s + ' '.repeat(w - s.length); + + const topLine = `╔${'═'.repeat(algoWidth)}╤${'═'.repeat(simWidth)}╤${'═'.repeat(latWidth)}╤${'═'.repeat(scoreWidth)}╤${'═'.repeat(nameWidth)}╗`; + const headerSep = `╠${'═'.repeat(algoWidth)}╪${'═'.repeat(simWidth)}╪${'═'.repeat(latWidth)}╪${'═'.repeat(scoreWidth)}╪${'═'.repeat(nameWidth)}╣`; + const rowSep = `╟${'─'.repeat(algoWidth)}┼${'─'.repeat(simWidth)}┼${'─'.repeat(latWidth)}┼${'─'.repeat(scoreWidth)}┼${'─'.repeat(nameWidth)}╢`; + const bottomLine = `╚${'═'.repeat(algoWidth)}╧${'═'.repeat(simWidth)}╧${'═'.repeat(latWidth)}╧${'═'.repeat(scoreWidth)}╧${'═'.repeat(nameWidth)}╝`; + + console.log(topLine); + console.log(`║${pad(' Algorithm', algoWidth)}│${pad(' Similarity', simWidth)}│${pad(' Latency', latWidth)}│${pad(' Top Score', scoreWidth)}│${pad(' Top Result', nameWidth)}║`); + console.log(headerSep); + + results.forEach((r, i) => { + const latStr = `${Math.round(r.latencyMs)}ms`; + const scoreStr = r.topScore.toFixed(4); + console.log( + `║${pad(` ${r.algorithm}`, algoWidth)}│${pad(` ${r.similarity}`, simWidth)}│${pad(` ${latStr}`, latWidth)}│${pad(` ${scoreStr}`, 
scoreWidth)}│${pad(` ${r.topResult}`, nameWidth)}║` + ); + + if (verbose && r.results.length > 1) { + for (let j = 1; j < r.results.length; j++) { + const sub = r.results[j]; + console.log( + `║${pad('', algoWidth)}│${pad('', simWidth)}│${pad('', latWidth)}│${pad(` ${sub.score.toFixed(4)}`, scoreWidth)}│${pad(` ${sub.name}`, nameWidth)}║` + ); + } + } + + if (i < results.length - 1) { + console.log(rowSep); + } + }); + + console.log(bottomLine); +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/diskann.ts b/ai/select-algorithm-typescript/src/diskann.ts new file mode 100644 index 0000000..fd130cd --- /dev/null +++ b/ai/select-algorithm-typescript/src/diskann.ts @@ -0,0 +1,100 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js'; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +async function main() { + const config = getConfig(); + const collectionName = 'hotels_diskann'; + const indexName = 'vectorIndex_diskann'; + const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; + const topK = parseInt(process.env.TOP_K || '3', 10); + + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) throw new Error('AI client is not configured.'); + if (!dbClient) throw new Error('Database client is not configured.'); + + await dbClient.connect(); + const db = dbClient.db(config.dbName); + + // Drop collection if it exists + const existingCollections = await db.listCollections({ name: collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(collectionName); + console.log(`Dropped existing collection: ${collectionName}`); + } + + // Create collection and load data + const collection 
= await db.createCollection(collectionName); + console.log(`Created collection: ${collectionName}`); + const data = await readFileReturnJson(path.join(__dirname, '..', config.dataFile)); + const insertSummary = await insertData(config, collection, data); + + // Create DiskANN vector index + const indexOptions = { + createIndexes: collectionName, + indexes: [{ + name: indexName, + key: { [config.embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: { + kind: 'vector-diskann', + maxDegree: 32, + lBuild: 50, + similarity: config.similarity, + dimensions: config.embeddingDimensions + } + }] + }; + const vectorIndexSummary = await db.command(indexOptions); + console.log(`Created vector index: ${indexName} (DiskANN, ${config.similarity})`); + + // Generate embedding for query + console.log(`\nQuery: "${queryText}"`); + const embeddingResponse = await aiClient.embeddings.create({ + model: config.deployment, + input: [queryText] + }); + const queryVector = embeddingResponse.data[0].embedding; + + // Perform vector search + const searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: config.embeddedField, + k: topK + }, + returnStoredSource: true + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]).toArray(); + + printSearchResults(insertSummary, vectorIndexSummary, searchResults); + + } catch (error) { + console.error('DiskANN search failed:', error); + process.exitCode = 1; + } finally { + if (dbClient) { + await dbClient.close(); + console.log('Database connection closed'); + } + } +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/hnsw.ts b/ai/select-algorithm-typescript/src/hnsw.ts new file mode 100644 index 0000000..d6e2659 --- /dev/null +++ b/ai/select-algorithm-typescript/src/hnsw.ts @@ -0,0 +1,100 @@ +import path from 'path'; +import { readFileReturnJson, 
getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js'; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +async function main() { + const config = getConfig(); + const collectionName = 'hotels_hnsw'; + const indexName = 'vectorIndex_hnsw'; + const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; + const topK = parseInt(process.env.TOP_K || '3', 10); + + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) throw new Error('AI client is not configured.'); + if (!dbClient) throw new Error('Database client is not configured.'); + + await dbClient.connect(); + const db = dbClient.db(config.dbName); + + // Drop collection if it exists + const existingCollections = await db.listCollections({ name: collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(collectionName); + console.log(`Dropped existing collection: ${collectionName}`); + } + + // Create collection and load data + const collection = await db.createCollection(collectionName); + console.log(`Created collection: ${collectionName}`); + const data = await readFileReturnJson(path.join(__dirname, '..', config.dataFile)); + const insertSummary = await insertData(config, collection, data); + + // Create HNSW vector index + const indexOptions = { + createIndexes: collectionName, + indexes: [{ + name: indexName, + key: { [config.embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: { + kind: 'vector-hnsw', + m: 16, + efConstruction: 64, + similarity: config.similarity, + dimensions: config.embeddingDimensions + } + }] + }; + const vectorIndexSummary = await db.command(indexOptions); + console.log(`Created vector index: ${indexName} (HNSW, ${config.similarity})`); + + // Generate embedding for query + console.log(`\nQuery: "${queryText}"`); + const embeddingResponse = await 
aiClient.embeddings.create({ + model: config.deployment, + input: [queryText] + }); + const queryVector = embeddingResponse.data[0].embedding; + + // Perform vector search + const searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: config.embeddedField, + k: topK + }, + returnStoredSource: true + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]).toArray(); + + printSearchResults(insertSummary, vectorIndexSummary, searchResults); + + } catch (error) { + console.error('HNSW search failed:', error); + process.exitCode = 1; + } finally { + if (dbClient) { + await dbClient.close(); + console.log('Database connection closed'); + } + } +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/ivf.ts b/ai/select-algorithm-typescript/src/ivf.ts new file mode 100644 index 0000000..8704ef7 --- /dev/null +++ b/ai/select-algorithm-typescript/src/ivf.ts @@ -0,0 +1,99 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, getConfig, insertData, printSearchResults } from './utils.js'; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +async function main() { + const config = getConfig(); + const collectionName = 'hotels_ivf'; + const indexName = 'vectorIndex_ivf'; + const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; + const topK = parseInt(process.env.TOP_K || '3', 10); + + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) throw new Error('AI client is not configured.'); + if (!dbClient) throw new Error('Database client is not configured.'); + + await dbClient.connect(); + const db = dbClient.db(config.dbName); + + // Drop collection if it exists + const existingCollections = 
await db.listCollections({ name: collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(collectionName); + console.log(`Dropped existing collection: ${collectionName}`); + } + + // Create collection and load data + const collection = await db.createCollection(collectionName); + console.log(`Created collection: ${collectionName}`); + const data = await readFileReturnJson(path.join(__dirname, '..', config.dataFile)); + const insertSummary = await insertData(config, collection, data); + + // Create IVF vector index + const indexOptions = { + createIndexes: collectionName, + indexes: [{ + name: indexName, + key: { [config.embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: { + kind: 'vector-ivf', + numLists: 1, + similarity: config.similarity, + dimensions: config.embeddingDimensions + } + }] + }; + const vectorIndexSummary = await db.command(indexOptions); + console.log(`Created vector index: ${indexName} (IVF, ${config.similarity})`); + + // Generate embedding for query + console.log(`\nQuery: "${queryText}"`); + const embeddingResponse = await aiClient.embeddings.create({ + model: config.deployment, + input: [queryText] + }); + const queryVector = embeddingResponse.data[0].embedding; + + // Perform vector search + const searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: config.embeddedField, + k: topK + }, + returnStoredSource: true + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]).toArray(); + + printSearchResults(insertSummary, vectorIndexSummary, searchResults); + + } catch (error) { + console.error('IVF search failed:', error); + process.exitCode = 1; + } finally { + if (dbClient) { + await dbClient.close(); + console.log('Database connection closed'); + } + } +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git 
a/ai/select-algorithm-typescript/src/select-algorithm.ts b/ai/select-algorithm-typescript/src/select-algorithm.ts new file mode 100644 index 0000000..0ffe49f --- /dev/null +++ b/ai/select-algorithm-typescript/src/select-algorithm.ts @@ -0,0 +1,287 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, insertData, printComparisonTable } from './utils.js'; + +// ESM specific features - create __dirname equivalent +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +// Validate required environment variables at startup +const requiredEnvVars = [ + 'MONGO_CLUSTER_NAME', + 'AZURE_OPENAI_EMBEDDING_ENDPOINT', + 'AZURE_OPENAI_EMBEDDING_MODEL', + 'DATA_FILE_WITH_VECTORS' +]; + +const missing = requiredEnvVars.filter(v => !process.env[v]); +if (missing.length > 0) { + console.error(`Missing required environment variables: ${missing.join(', ')}`); + console.error('See .env.example for required values.'); + process.exit(1); +} + +type Algorithm = 'diskann' | 'hnsw' | 'ivf'; +type Similarity = 'COS' | 'L2' | 'IP'; + +const ALGORITHMS: Algorithm[] = ['diskann', 'hnsw', 'ivf']; +const SIMILARITIES: Similarity[] = ['COS', 'L2', 'IP']; + +const ALGORITHM_LABELS: Record = { + diskann: 'DiskANN', + hnsw: 'HNSW', + ivf: 'IVF', +}; + +// Index creation configs per algorithm +function getIndexOptions( + collectionName: string, + indexName: string, + embeddedField: string, + dimensions: number, + algorithm: Algorithm, + similarity: Similarity +) { + const base = { + createIndexes: collectionName, + indexes: [ + { + name: indexName, + key: { [embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: {} as Record, + }, + ], + }; + + switch (algorithm) { + case 'diskann': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-diskann', + dimensions, + similarity, + maxDegree: 32, + lBuild: 50, + }; + break; + case 'hnsw': + 
base.indexes[0].cosmosSearchOptions = { + kind: 'vector-hnsw', + dimensions, + similarity, + m: 16, + efConstruction: 64, + }; + break; + case 'ivf': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-ivf', + dimensions, + similarity, + numLists: 1, + }; + break; + } + + return base; +} + +// Algorithm-specific query params +function getSearchPipeline( + queryEmbedding: number[], + embeddedField: string, + k: number, + algorithm: Algorithm +) { + const cosmosSearch: Record = { + vector: queryEmbedding, + path: embeddedField, + k, + }; + + // Add algorithm-specific search params + switch (algorithm) { + case 'diskann': + cosmosSearch.lSearch = 100; + break; + case 'hnsw': + cosmosSearch.efSearch = 80; + break; + case 'ivf': + cosmosSearch.nProbes = 1; + break; + } + + return [ + { $search: { cosmosSearch } }, + { $project: { score: { $meta: "searchScore" }, document: "$$ROOT" } }, + ]; +} + +/** + * Determine which collections to create/query based on ALGORITHM and SIMILARITY env vars. + * Collection naming: hotels_{algorithm}_{similarity} + */ +function getTargetCollections( + algorithmEnv: string, + similarityEnv: string +): Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> { + const algorithms: Algorithm[] = + algorithmEnv === 'all' ? ALGORITHMS : [algorithmEnv as Algorithm]; + const similarities: Similarity[] = + similarityEnv === 'all' ? SIMILARITIES : [similarityEnv as Similarity]; + + const targets: Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> = []; + + for (const alg of algorithms) { + if (!ALGORITHMS.includes(alg)) { + throw new Error(`Invalid ALGORITHM '${alg}'. Must be one of: all, ${ALGORITHMS.join(', ')}`); + } + for (const sim of similarities) { + if (!SIMILARITIES.includes(sim)) { + throw new Error(`Invalid SIMILARITY '${sim}'. 
Must be one of: all, ${SIMILARITIES.join(', ')}`); + } + targets.push({ + collectionName: `hotels_${alg}_${sim.toLowerCase()}`, + algorithm: alg, + similarity: sim, + }); + } + } + + return targets; +} + +async function main() { + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) { + throw new Error('Azure OpenAI client is not configured. Please check your environment variables.'); + } + if (!dbClient) { + throw new Error('Database client is not configured. Please check your environment variables.'); + } + + const dbName = process.env.AZURE_DOCUMENTDB_DATABASENAME || 'Hotels'; + const embeddedField = process.env.EMBEDDED_FIELD || 'DescriptionVector'; + const embeddingDimensions = parseInt(process.env.EMBEDDING_DIMENSIONS || '1536', 10); + const dataFile = process.env.DATA_FILE_WITH_VECTORS || 'data/Hotels_Vector.json'; + const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const batchSize = parseInt(process.env.LOAD_SIZE_BATCH || '100', 10); + const algorithmEnv = (process.env.ALGORITHM || 'all').trim().toLowerCase(); + const similarityEnv = (process.env.SIMILARITY || 'COS').trim().toUpperCase(); + const searchQuery = 'quintessential lodging near running trails, eateries, retail'; + + const targets = getTargetCollections(algorithmEnv, similarityEnv); + + console.log(`\n🔬 Vector Algorithm Comparison`); + console.log(` Database: ${dbName}`); + console.log(` Algorithms: ${algorithmEnv}`); + console.log(` Similarity: ${similarityEnv}`); + console.log(` Collections to query: ${targets.map(t => t.collectionName).join(', ')}`); + console.log(` Search query: "${searchQuery}"\n`); + + await dbClient.connect(); + const db = dbClient.db(dbName); + + // Load data once (shared across collections) + const data = await readFileReturnJson(path.join(__dirname, '..', dataFile)); + + // Generate query embedding once (reuse across collections) + console.log('Generating query embedding...'); + const embeddingResponse = await 
aiClient.embeddings.create({ + model: deployment, + input: [searchQuery], + }); + const queryEmbedding = embeddingResponse.data[0].embedding; + if (queryEmbedding.length !== embeddingDimensions) { + throw new Error( + `Embedding dimension mismatch: expected ${embeddingDimensions}, got ${queryEmbedding.length}. ` + + `Verify AZURE_OPENAI_EMBEDDING_MODEL matches the configured EMBEDDING_DIMENSIONS.` + ); + } + console.log(`Query embedding: ${queryEmbedding.length} dimensions\n`); + + const config = { batchSize }; + + const comparisonResults: Array<{ + collectionName: string; + algorithm: string; + similarity: string; + searchResults: any[]; + latencyMs: number; + }> = []; + + for (const target of targets) { + console.log(`\n━━━ ${ALGORITHM_LABELS[target.algorithm]} / ${target.similarity} ━━━`); + console.log(`Collection: ${target.collectionName}`); + + try { + // Create collection (drops existing to ensure clean state) + try { + await db.dropCollection(target.collectionName); + } catch { + // Collection may not exist yet + } + const collection = await db.createCollection(target.collectionName); + console.log('Created collection:', target.collectionName); + + // Insert data + const insertSummary = await insertData(config, collection, data); + console.log(`Inserted: ${insertSummary.inserted}/${insertSummary.total}`); + + // Create vector index + const indexName = `vectorIndex_${target.algorithm}_${target.similarity.toLowerCase()}`; + const indexOptions = getIndexOptions( + target.collectionName, + indexName, + embeddedField, + embeddingDimensions, + target.algorithm, + target.similarity + ); + await db.command(indexOptions); + console.log('Created vector index:', indexName); + + // Run vector search + console.log('Executing vector search...'); + const startTime = Date.now(); + + const pipeline = getSearchPipeline(queryEmbedding, embeddedField, 5, target.algorithm); + const searchResults = await collection.aggregate(pipeline).toArray(); + + const latencyMs = Date.now() 
- startTime; + + comparisonResults.push({ + collectionName: target.collectionName, + algorithm: ALGORITHM_LABELS[target.algorithm], + similarity: target.similarity, + searchResults, + latencyMs, + }); + + console.log(`✓ ${searchResults.length} results, ${latencyMs}ms`); + } catch (error) { + console.error(`✗ Error with ${target.collectionName}:`, (error as Error).message); + } + } + + // Print comparison table + if (comparisonResults.length > 0) { + printComparisonTable(comparisonResults); + } + } catch (error) { + console.error('App failed:', error); + process.exitCode = 1; + } finally { + console.log('\nClosing database connection...'); + if (dbClient) await dbClient.close(); + console.log('Database connection closed'); + } +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/utils.ts b/ai/select-algorithm-typescript/src/utils.ts new file mode 100644 index 0000000..5099c32 --- /dev/null +++ b/ai/select-algorithm-typescript/src/utils.ts @@ -0,0 +1,205 @@ +import { Collection, Document, MongoClient, OIDCResponse, OIDCCallbackParams } from 'mongodb'; +import { AzureOpenAI } from 'openai/index.js'; +import { promises as fs } from "fs"; +import { AccessToken, DefaultAzureCredential, TokenCredential, getBearerTokenProvider } from '@azure/identity'; + +// Define a type for JSON data +export type JsonData = Record; + +export function getConfig() { + return { + dbName: process.env.MONGO_DB_NAME || 'documentdb_demo', + dataFile: process.env.DATA_FILE_WITH_VECTORS || 'data/Hotels_Vector.json', + embeddedField: process.env.EMBEDDED_FIELD || 'DescriptionVector', + similarity: process.env.SIMILARITY || 'COS', + embeddingDimensions: parseInt(process.env.EMBEDDING_DIMENSIONS || '1536', 10), + deployment: process.env.AZURE_OPENAI_EMBEDDING_MODEL || 'text-embedding-ada-002', + batchSize: parseInt(process.env.BATCH_SIZE || '25', 10) + }; +} + +export const AzureIdentityTokenCallback = 
async (params: OIDCCallbackParams, credential: TokenCredential): Promise => { + const tokenResponse: AccessToken | null = await credential.getToken(['https://ossrdbms-aad.database.windows.net/.default']); + return { + accessToken: tokenResponse?.token || '', + expiresInSeconds: (tokenResponse?.expiresOnTimestamp || 0) - Math.floor(Date.now() / 1000) + }; +}; + +export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClient: MongoClient | null } { + let aiClient: AzureOpenAI | null = null; + let dbClient: MongoClient | null = null; + + // Validate all required environment variables upfront + const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!; + const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const clusterName = process.env.MONGO_CLUSTER_NAME!; + + if (!endpoint || !deployment || !clusterName) { + throw new Error('Missing required environment variables: AZURE_OPENAI_EMBEDDING_ENDPOINT, AZURE_OPENAI_EMBEDDING_MODEL, MONGO_CLUSTER_NAME'); + } + + console.log(`Using Azure OpenAI Embedding Deployment/Model: ${deployment}`); + + const credential = new DefaultAzureCredential(); + + // For Azure OpenAI with DefaultAzureCredential + { + const scope = "https://cognitiveservices.azure.com/.default"; + const azureADTokenProvider = getBearerTokenProvider(credential, scope); + aiClient = new AzureOpenAI({ + apiVersion: "2024-10-21", + endpoint, + deployment, + azureADTokenProvider, + timeout: 30000, + maxRetries: 3, + }); + } + + // For DocumentDB with DefaultAzureCredential (uses signed-in user) + { + dbClient = new MongoClient( + `mongodb+srv://${clusterName}.mongocluster.cosmos.azure.com/`, { + connectTimeoutMS: 120000, + tls: true, + retryWrites: false, + maxIdleTimeMS: 120000, + authMechanism: 'MONGODB-OIDC', + authMechanismProperties: { + OIDC_CALLBACK: (params: OIDCCallbackParams) => AzureIdentityTokenCallback(params, credential), + ALLOWED_HOSTS: ['*.azure.com'] + } + } + ); + } + + return { aiClient, dbClient }; +} + +export 
async function readFileReturnJson(filePath: string): Promise { + + console.log(`Reading JSON file from ${filePath}`); + + const fileAsString = await fs.readFile(filePath, "utf-8"); + return JSON.parse(fileAsString); +} + +export async function insertData(config: { batchSize: number }, collection: Collection, data: Document[]) { + console.log(`Processing in batches of ${config.batchSize}...`); + const totalBatches = Math.ceil(data.length / config.batchSize); + + let inserted = 0; + let failed = 0; + + for (let i = 0; i < totalBatches; i++) { + const start = i * config.batchSize; + const end = Math.min(start + config.batchSize, data.length); + const batch = data.slice(start, end); + + try { + const result = await collection.insertMany(batch, { ordered: false }); + inserted += result.insertedCount || 0; + console.log(`Batch ${i + 1} complete: ${result.insertedCount} inserted`); + } catch (error: any) { + if (error?.writeErrors) { + console.error(`Error in batch ${i + 1}: ${error?.writeErrors.length} failures`); + failed += error?.writeErrors.length; + inserted += batch.length - error?.writeErrors.length; + } else { + console.error(`Error in batch ${i + 1}:`, error); + failed += batch.length; + } + } + + // Small pause between batches to reduce resource contention + if (i < totalBatches - 1) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + } + + // Create standard field indexes + const indexColumns = ["HotelId", "Category", "Description", "Description_fr"]; + for (const col of indexColumns) { + const indexSpec: Record = {}; + indexSpec[col] = 1; + await collection.createIndex(indexSpec); + } + + return { total: data.length, inserted, failed }; +} + +export function printSearchResults(insertSummary: any, vectorIndexSummary: any, searchResults: Document[]) { + console.log(`\nInsert summary: ${JSON.stringify(insertSummary)}`); + console.log(`Vector index: ${JSON.stringify(vectorIndexSummary)}`); + + if (!searchResults || searchResults.length === 0) { + 
console.log('No search results found.'); + return; + } + + searchResults.map((result: Document, index: number) => { + const { document, score } = result; + console.log(`${index + 1}. HotelName: ${document.HotelName}, Score: ${score.toFixed(4)}`); + }); +} + +/** + * Print a side-by-side comparison table of vector search results across collections + */ +export function printComparisonTable( + results: Array<{ + collectionName: string; + algorithm: string; + similarity: string; + searchResults: any[]; + latencyMs: number; + }> +): void { + console.log('\n╔══════════════════════════════════════════════════════════════════════════════════╗'); + console.log('║ Vector Algorithm Comparison Results ║'); + console.log('╠══════════════════════════════════════════════════════════════════════════════════╣'); + + // Header + console.log( + '║ ' + + 'Algorithm'.padEnd(12) + + 'Similarity'.padEnd(14) + + 'Top Result'.padEnd(24) + + 'Score'.padEnd(12) + + 'Latency(ms)'.padEnd(14) + + '║' + ); + console.log('╠══════════════════════════════════════════════════════════════════════════════════╣'); + + for (const r of results) { + const topResult = r.searchResults[0]; + const topName = topResult ? (topResult.document.HotelName as string).substring(0, 22) : 'N/A'; + const topScore = topResult ? topResult.score.toFixed(4) : 'N/A'; + + console.log( + '║ ' + + r.algorithm.padEnd(12) + + r.similarity.padEnd(14) + + topName.padEnd(24) + + topScore.padEnd(12) + + r.latencyMs.toFixed(0).padEnd(14) + + '║' + ); + } + + console.log('╚══════════════════════════════════════════════════════════════════════════════════╝'); + + // Detailed results per collection + for (const r of results) { + console.log(`\n--- ${r.algorithm} / ${r.similarity} (${r.collectionName}) ---`); + if (r.searchResults.length === 0) { + console.log(' No results.'); + continue; + } + r.searchResults.forEach((item: Document, i: number) => { + console.log(` ${i + 1}. 
${item.document.HotelName}, Score: ${item.score.toFixed(4)}`); + }); + console.log(` Latency: ${r.latencyMs.toFixed(0)}ms`); + } +} diff --git a/ai/select-algorithm-typescript/tsconfig.json b/ai/select-algorithm-typescript/tsconfig.json new file mode 100644 index 0000000..3cb9aaa --- /dev/null +++ b/ai/select-algorithm-typescript/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "NodeNext", + "moduleResolution": "nodenext", + "declaration": true, + "outDir": "./dist", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "noImplicitAny": false, + "forceConsistentCasingInFileNames": true, + "sourceMap": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +}