From edcfe2ab1d72219e72aa3564bf3876b50fcb6de3 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Thu, 30 Apr 2026 07:51:28 -0700 Subject: [PATCH 1/2] Standardize collection lifecycle: conditional drop at start, always drop at end All 10 sample directories now follow the same pattern: - START: conditionally drop collection only if it exists - END: always drop collection for cleanup (in finally/defer block) Languages updated: TypeScript, Python, Go, Java, .NET Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/src/CompareAll.cs | 10 +- ai/select-algorithm-go/src/compare_all.go | 15 +- .../selectalgorithm/CompareAll.java | 139 +++++++++--------- ai/select-algorithm-python/src/compare_all.py | 7 +- .../Services/VectorSearchService.cs | 48 ++++-- ai/vector-search-go/src/diskann.go | 31 ++-- ai/vector-search-go/src/hnsw.go | 31 ++-- ai/vector-search-go/src/ivf.go | 31 ++-- .../com/azure/documentdb/samples/DiskAnn.java | 33 +++-- .../com/azure/documentdb/samples/HNSW.java | 33 +++-- .../com/azure/documentdb/samples/IVF.java | 33 +++-- ai/vector-search-python/src/diskann.py | 14 +- ai/vector-search-python/src/hnsw.py | 14 +- ai/vector-search-python/src/ivf.py | 14 +- ai/vector-search-typescript/src/diskann.ts | 23 ++- ai/vector-search-typescript/src/hnsw.ts | 23 ++- ai/vector-search-typescript/src/ivf.ts | 23 ++- 17 files changed, 354 insertions(+), 168 deletions(-) diff --git a/ai/select-algorithm-dotnet/src/CompareAll.cs b/ai/select-algorithm-dotnet/src/CompareAll.cs index a29704c..d8af191 100644 --- a/ai/select-algorithm-dotnet/src/CompareAll.cs +++ b/ai/select-algorithm-dotnet/src/CompareAll.cs @@ -37,9 +37,13 @@ public static void Run() { var database = mongoClient.GetDatabase(databaseName); - // Drop collection for a clean comparison - database.DropCollection("hotels"); - Console.WriteLine("Dropped existing 'hotels' collection (if any)"); + // Drop collection if it already exists (clean start) + var 
collectionNames = database.ListCollectionNames().ToList(); + if (collectionNames.Contains("hotels")) + { + database.DropCollection("hotels"); + Console.WriteLine("Dropped existing 'hotels' collection."); + } var collection = database.GetCollection("hotels"); diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go index 463e55d..c873e18 100644 --- a/ai/select-algorithm-go/src/compare_all.go +++ b/ai/select-algorithm-go/src/compare_all.go @@ -47,15 +47,18 @@ func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, fmt.Printf("Top-K: %d\n", topK) fmt.Printf("Verbose: %v\n", verbose) - // 1. Drop collection for clean comparison, then load data + // 1. Drop collection if it exists for clean comparison, then load data database := dbClient.Database(config.DatabaseName) collection := database.Collection("hotels") - // Drop existing collection for a clean comparison - if err := collection.Drop(ctx); err != nil { - fmt.Printf("Note: could not drop collection (may not exist): %v\n", err) - } else { - fmt.Println("Dropped existing 'hotels' collection") + // Drop existing collection if it exists (clean start) + names, _ := database.ListCollectionNames(ctx, bson.M{"name": "hotels"}) + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + fmt.Printf("Note: could not drop collection: %v\n", err) + } else { + fmt.Println("Dropped existing 'hotels' collection") + } } // Ensure cleanup on exit diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java index ef8d55a..7cbf094 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -49,80 +49,85 @@ public static void run() { MongoDatabase database = 
mongoClient.getDatabase(databaseName); MongoCollection collection = database.getCollection(COLLECTION_NAME); - // Load data ONCE into the single collection - System.out.println(" Loading data from: " + dataFile); - List data = Utils.readJsonFile(dataFile); - System.out.printf(" Loaded %d documents%n", data.size()); - - collection.drop(); - System.out.println(" Collection reset."); - Utils.insertData(collection, data, 100); - - // Generate ONE embedding for the query (reused for all 9 searches) - OpenAIClient aiClient = Utils.getOpenAIClient(); - System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); - List queryVector = Utils.getEmbedding(aiClient, queryText, model); - System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); - - // Convert to doubles for BSON - List vectorAsDoubles = queryVector.stream() - .map(Float::doubleValue) - .toList(); - - // Create all 9 indexes idempotently - System.out.println(" Creating 9 vector indexes..."); - for (String algo : ALGORITHMS) { - for (String metric : METRICS) { - createIndex(collection, vectorField, dimensions, algo, metric); + try { + // Load data ONCE into the single collection + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println(" Dropped existing collection."); } - } - System.out.println(" All indexes created.\n"); - - // Run searches sequentially for fair timing - System.out.println(" Running searches..."); - for (String algo : ALGORITHMS) { - for (String metric : METRICS) { - String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); - - long startNs = System.nanoTime(); - List searchResults = performSearch( - collection, vectorAsDoubles, vectorField, topK); - 
long elapsedNs = System.nanoTime() - startNs; - double elapsedMs = elapsedNs / 1_000_000.0; - - // Extract top result info - String topHotel = "-"; - double topScore = 0.0; - if (!searchResults.isEmpty()) { - Document top = searchResults.get(0); - topHotel = top.getString("HotelName") != null - ? top.getString("HotelName") : "-"; - topScore = top.getDouble("score") != null - ? top.getDouble("score") : 0.0; + Utils.insertData(collection, data, 100); + + // Generate ONE embedding for the query (reused for all 9 searches) + OpenAIClient aiClient = Utils.getOpenAIClient(); + System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); + List queryVector = Utils.getEmbedding(aiClient, queryText, model); + System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); + + // Convert to doubles for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + // Create all 9 indexes idempotently + System.out.println(" Creating 9 vector indexes..."); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + createIndex(collection, vectorField, dimensions, algo, metric); } + } + System.out.println(" All indexes created.\n"); + + // Run searches sequentially for fair timing + System.out.println(" Running searches..."); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + long startNs = System.nanoTime(); + List searchResults = performSearch( + collection, vectorAsDoubles, vectorField, topK); + long elapsedNs = System.nanoTime() - startNs; + double elapsedMs = elapsedNs / 1_000_000.0; + + // Extract top result info + String topHotel = "-"; + double topScore = 0.0; + if (!searchResults.isEmpty()) { + Document top = searchResults.get(0); + topHotel = top.getString("HotelName") != null + ? top.getString("HotelName") : "-"; + topScore = top.getDouble("score") != null + ? 
top.getDouble("score") : 0.0; + } - results.add(new SearchResult( - algo.toUpperCase(), metric, indexName, - elapsedMs, searchResults.size(), topHotel, topScore)); - - if (verbose) { - System.out.printf(" [%s] %d results in %.2f ms%n", - indexName, searchResults.size(), elapsedMs); - for (int i = 0; i < searchResults.size(); i++) { - Document doc = searchResults.get(i); - System.out.printf(" %d. %s (%.4f)%n", - i + 1, - doc.getString("HotelName"), - doc.getDouble("score")); + results.add(new SearchResult( + algo.toUpperCase(), metric, indexName, + elapsedMs, searchResults.size(), topHotel, topScore)); + + if (verbose) { + System.out.printf(" [%s] %d results in %.2f ms%n", + indexName, searchResults.size(), elapsedMs); + for (int i = 0; i < searchResults.size(); i++) { + Document doc = searchResults.get(i); + System.out.printf(" %d. %s (%.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + } } } } + } finally { + // Cleanup: always drop the comparison collection + System.out.println("\n Cleanup: dropping comparison collection..."); + collection.drop(); + System.out.println(" Cleanup: dropped collection 'hotels'"); } - - // Cleanup: drop the comparison collection - System.out.println("\n Cleanup: dropping comparison collection..."); - collection.drop(); - System.out.println(" Cleanup: dropped collection 'hotels'"); } // Print comparison table diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py index 1aac549..8539898 100644 --- a/ai/select-algorithm-python/src/compare_all.py +++ b/ai/select-algorithm-python/src/compare_all.py @@ -171,9 +171,10 @@ def main(): try: database = mongo_client[config["database_name"]] - # Drop collection for a clean comparison - database.drop_collection("hotels") - print("Dropped existing 'hotels' collection (if any)") + # Drop collection if it already exists (clean start) + if "hotels" in database.list_collection_names(): + database.drop_collection("hotels") + 
print("Dropped existing 'hotels' collection") # Create fresh collection and load data collection = database["hotels"] diff --git a/ai/vector-search-dotnet/Services/VectorSearchService.cs b/ai/vector-search-dotnet/Services/VectorSearchService.cs index e8505a1..a1aa841 100644 --- a/ai/vector-search-dotnet/Services/VectorSearchService.cs +++ b/ai/vector-search-dotnet/Services/VectorSearchService.cs @@ -43,24 +43,32 @@ public VectorSearchService(ILogger logger, MongoDbService m /// The vector search algorithm to use (IVF, HNSW, or DiskANN) public async Task RunSearchAsync(VectorIndexType indexType) { + _logger.LogInformation($"Starting {indexType} vector search workflow"); + + // Setup collection + var collectionSuffix = indexType switch + { + VectorIndexType.IVF => "ivf", + VectorIndexType.HNSW => "hnsw", + VectorIndexType.DiskANN => "diskann", + _ => throw new ArgumentException($"Unknown index type: {indexType}") + }; + var collectionName = $"hotels_{collectionSuffix}"; + var indexName = $"vectorIndex_{collectionSuffix}"; + + // Drop collection if it already exists (clean start) + var database = _mongoService.GetDatabase(_config.VectorSearch.DatabaseName); + var existingCollections = (await database.ListCollectionNamesAsync()).ToList(); + if (existingCollections.Contains(collectionName)) + { + await _mongoService.DropCollectionAsync(_config.VectorSearch.DatabaseName, collectionName); + } + try { - _logger.LogInformation($"Starting {indexType} vector search workflow"); - - // Setup collection - var collectionSuffix = indexType switch - { - VectorIndexType.IVF => "ivf", - VectorIndexType.HNSW => "hnsw", - VectorIndexType.DiskANN => "diskann", - _ => throw new ArgumentException($"Unknown index type: {indexType}") - }; - var collectionName = $"hotels_{collectionSuffix}"; - var indexName = $"vectorIndex_{collectionSuffix}"; - var collection = _mongoService.GetCollection(_config.VectorSearch.DatabaseName, collectionName); - // Load data from file if collection is empty + 
// Load data from file var assemblyLocation = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location) ?? string.Empty; var dataFilePath = Path.Combine(assemblyLocation, _config.DataFiles.WithVectors); await _mongoService.LoadDataIfNeededAsync(collection, dataFilePath); @@ -137,6 +145,18 @@ await _mongoService.CreateVectorIndexAsync( _logger.LogError(ex, $"{indexType} vector search failed"); throw; } + finally + { + // Cleanup: always drop the collection + try + { + await _mongoService.DropCollectionAsync(_config.VectorSearch.DatabaseName, collectionName); + } + catch (Exception ex) + { + _logger.LogWarning(ex, $"Cleanup warning: failed to drop collection '{collectionName}'"); + } + } } /// diff --git a/ai/vector-search-go/src/diskann.go b/ai/vector-search-go/src/diskann.go index 8991f58..e4536a3 100644 --- a/ai/vector-search-go/src/diskann.go +++ b/ai/vector-search-go/src/diskann.go @@ -154,6 +154,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_diskann") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_diskann"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_diskann'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_diskann'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_diskann'") + } + }() + // Load data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -177,15 +199,6 @@ func main() { // Insert data into collection fmt.Printf("\nInserting data into 
collection '%s'...\n", config.CollectionName) - // Clear existing data to ensure clean state - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert the hotel data stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-go/src/hnsw.go b/ai/vector-search-go/src/hnsw.go index ab6977c..93bc5bd 100644 --- a/ai/vector-search-go/src/hnsw.go +++ b/ai/vector-search-go/src/hnsw.go @@ -155,6 +155,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_hnsw") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_hnsw"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_hnsw'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_hnsw'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_hnsw'") + } + }() + // Load hotel data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -178,15 +200,6 @@ func main() { // Insert data into MongoDB collection fmt.Printf("\nPreparing collection '%s'...\n", config.CollectionName) - // Clear any existing data to start fresh - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if 
deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert hotel data with embeddings stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-go/src/ivf.go b/ai/vector-search-go/src/ivf.go index 2aeddd8..2861845 100644 --- a/ai/vector-search-go/src/ivf.go +++ b/ai/vector-search-go/src/ivf.go @@ -152,6 +152,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_ivf") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_ivf"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_ivf'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_ivf'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_ivf'") + } + }() + // Load hotel data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -175,15 +197,6 @@ func main() { // Prepare collection with fresh data fmt.Printf("\nPreparing collection '%s'...\n", config.CollectionName) - // Remove any existing data for clean state - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert hotel data with embeddings stats, err := InsertData(ctx, collection, documentsWithEmbeddings, 
config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java index 676630b..14a37c6 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git 
a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java index 146fc27..a8b3be7 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java 
b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java index e800107..9c23aec 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git a/ai/vector-search-python/src/diskann.py b/ai/vector-search-python/src/diskann.py index 81720ab..fdef640 100644 --- a/ai/vector-search-python/src/diskann.py 
+++ b/ai/vector-search-python/src/diskann.py @@ -142,6 +142,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -200,8 +207,13 @@ def main(): raise finally: - # Close the MongoDB client + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-python/src/hnsw.py b/ai/vector-search-python/src/hnsw.py index 9352220..fcc9e72 100644 --- a/ai/vector-search-python/src/hnsw.py +++ b/ai/vector-search-python/src/hnsw.py @@ -136,6 +136,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load hotel data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -196,8 +203,13 @@ def main(): raise finally: - # Clean up MongoDB connection + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + 
print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-python/src/ivf.py b/ai/vector-search-python/src/ivf.py index f39c0d2..04a0794 100644 --- a/ai/vector-search-python/src/ivf.py +++ b/ai/vector-search-python/src/ivf.py @@ -133,6 +133,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load hotel data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -191,8 +198,13 @@ def main(): raise finally: - # Ensure MongoDB connection is properly closed + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-typescript/src/diskann.ts b/ai/vector-search-typescript/src/diskann.ts index 96b547c..b756405 100644 --- a/ai/vector-search-typescript/src/diskann.ts +++ b/ai/vector-search-typescript/src/diskann.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + 
} + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -95,9 +103,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } diff --git a/ai/vector-search-typescript/src/hnsw.ts b/ai/vector-search-typescript/src/hnsw.ts index 771146c..fede64e 100644 --- a/ai/vector-search-typescript/src/hnsw.ts +++ b/ai/vector-search-typescript/src/hnsw.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + } + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -95,9 +103,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection 
and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } diff --git a/ai/vector-search-typescript/src/ivf.ts b/ai/vector-search-typescript/src/ivf.ts index e81ace8..908ae1c 100644 --- a/ai/vector-search-typescript/src/ivf.ts +++ b/ai/vector-search-typescript/src/ivf.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + } + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -96,9 +104,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } From 44371c6af4dc087bab823aab3180ad799df600bc Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Thu, 30 Apr 2026 08:34:09 
-0700 Subject: [PATCH 2/2] Add copilot instruction files for sample conventions - Main instructions: naming, env vars, collection lifecycle, bulk insert - Language-specific files for TypeScript, Python, Go, Java, .NET - Documents how samples should be built for CI compatibility Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/copilot-instructions-dotnet.md | 135 +++++++++++++++++++ .github/copilot-instructions-go.md | 133 ++++++++++++++++++ .github/copilot-instructions-java.md | 122 +++++++++++++++++ .github/copilot-instructions-python.md | 119 ++++++++++++++++ .github/copilot-instructions-typescript.md | 114 ++++++++++++++++ .github/copilot-instructions.md | 149 +++++++++++++++++++++ 6 files changed, 772 insertions(+) create mode 100644 .github/copilot-instructions-dotnet.md create mode 100644 .github/copilot-instructions-go.md create mode 100644 .github/copilot-instructions-java.md create mode 100644 .github/copilot-instructions-python.md create mode 100644 .github/copilot-instructions-typescript.md create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions-dotnet.md b/.github/copilot-instructions-dotnet.md new file mode 100644 index 0000000..4789eca --- /dev/null +++ b/.github/copilot-instructions-dotnet.md @@ -0,0 +1,135 @@ +# .NET (C#) Specific Instructions + +## Stack + +- .NET 8+ +- `MongoDB.Driver` for DocumentDB access +- `Azure.Identity` for DefaultAzureCredential +- `Azure.AI.OpenAI` for Azure OpenAI + +## File Structure + +``` +ai/select-algorithm-dotnet/ +├── src/ +│ ├── CompareAll.cs +│ └── Utils.cs +├── select-algorithm-dotnet.csproj +└── README.md + +ai/vector-search-dotnet/ +├── src/ +│ ├── Ivf.cs +│ ├── Hnsw.cs +│ ├── Diskann.cs +│ └── Utils.cs +├── vector-search-dotnet.csproj +└── README.md +``` + +## Naming Conventions + +- Files: `PascalCase.cs` +- Methods: `PascalCase` +- Constants: `PascalCase` +- Private fields: `_camelCase` +- Local variables: `camelCase` +- Namespaces: 
`Azure.DocumentDB.Samples` + +## Authentication Pattern + +```csharp +using Azure.Identity; +using MongoDB.Driver; +using MongoDB.Driver.Authentication.Oidc; + +var credential = new DefaultAzureCredential(); +var oidcCallback = new OidcCallback(async (parameters, cancellationToken) => +{ + var token = await credential.GetTokenAsync( + new TokenRequestContext(new[] { "https://ossrdbms-aad.database.windows.net/.default" }), + cancellationToken); + return new OidcAccessToken(token.Token, token.ExpiresOn); +}); +``` + +## $search Syntax + +```csharp +// CORRECT +var searchStage = new BsonDocument("$search", + new BsonDocument("cosmosSearch", + new BsonDocument + { + { "vector", new BsonArray(queryVector) }, + { "path", embeddedField }, + { "k", topK } + })); + +// WRONG — do NOT add cosmosSearchOptions to the $search stage +``` + +## Bulk Insert + +Use `collection.InsertManyAsync()` with `InsertManyOptions { IsOrdered = false }`: + +```csharp +using MongoDB.Driver; + +try +{ + await collection.InsertManyAsync(batch, new InsertManyOptions { IsOrdered = false }); + insertedCount += batch.Count; +} +catch (MongoBulkWriteException e) +{ + // Partial failure — some docs inserted + insertedCount += (int)e.Result.InsertedCount; + failedCount += batch.Count - (int)e.Result.InsertedCount; +} +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches (`await Task.Delay(200)`) +- Catch `MongoBulkWriteException` for partial failure handling +- Always use the async variant (`InsertManyAsync`) + +## Key Patterns + +- Use `Environment.GetEnvironmentVariable("VAR") ?? 
"default"` for config +- Use `using` statements for disposable resources +- Use `try/finally` for collection cleanup +- Async/await throughout (use `Async` suffix on method names) +- Match TypeScript output format exactly + +## Environment Variables + +- Use `IConfiguration` with layered sources: `appsettings.json` → environment variables +- Provide `appsettings.json` with placeholder structure (committed) and gitignore `appsettings.local.json` +- Environment variables override JSON config values +- Bind to strongly-typed configuration classes (`AppConfiguration`, `AzureOpenAIConfiguration`, etc.) + +```csharp +var configuration = new ConfigurationBuilder() + .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true) + .AddEnvironmentVariables() + .Build(); + +var appConfig = configuration.Get<AppConfiguration>() + ?? throw new InvalidOperationException("Failed to load configuration"); +``` + +- Configuration class hierarchy: + - `AppConfiguration` → root + - `AzureOpenAIConfiguration` → endpoint, model, apiVersion + - `MongoDBConfiguration` → connectionString, clusterName, loadBatchSize + - `EmbeddingConfiguration` → fieldToEmbed, embeddedField, dimensions, batchSize + - `VectorSearchConfiguration` → query, databaseName, topK + +- Include `Microsoft.Extensions.Configuration` packages in `.csproj` + +## Build & Run + +```bash +dotnet run +``` diff --git a/.github/copilot-instructions-go.md b/.github/copilot-instructions-go.md new file mode 100644 index 0000000..16533ee --- /dev/null +++ b/.github/copilot-instructions-go.md @@ -0,0 +1,133 @@ +# Go-Specific Instructions + +## Stack + +- Go 1.21+ +- `go.mongodb.org/mongo-driver/v2` for DocumentDB access +- `github.com/Azure/azure-sdk-for-go/sdk/azidentity` for DefaultAzureCredential +- `github.com/openai/openai-go` for Azure OpenAI + +## File Structure + +``` +ai/select-algorithm-go/ +├── src/ +│ ├── compare_all.go # Multi-query comparison runner +│ └── utils.go # Shared utilities +├── go.mod +├── go.sum +└── README.md + 
+ai/vector-search-go/ +├── src/ +│ ├── ivf.go +│ ├── hnsw.go +│ ├── diskann.go +│ └── utils.go +├── go.mod +├── go.sum +└── README.md +``` + +## Naming Conventions + +- Files: `snake_case.go` +- Functions: `PascalCase` (exported), `camelCase` (unexported) +- Constants: `PascalCase` or `camelCase` +- Packages: `lowercase` + +## Authentication Pattern + +```go +import ( + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" +) + +credential, _ := azidentity.NewDefaultAzureCredential(nil) +// Use OIDC callback with DocumentDB scope +``` + +## $search Syntax + +```go +// CORRECT +searchStage := bson.D{{Key: "$search", Value: bson.D{ + {Key: "cosmosSearch", Value: bson.D{ + {Key: "vector", Value: queryVector}, + {Key: "path", Value: embeddedField}, + {Key: "k", Value: topK}, + }}, +}}} + +// WRONG — do NOT include cosmosSearchOptions in the $search stage +``` + +## Bulk Insert + +Use `collection.InsertMany()` with `SetOrdered(false)` and handle `BulkWriteException`: + +```go +result, err := collection.InsertMany(ctx, documents, options.InsertMany().SetOrdered(false)) +if err != nil { + if bulkErr, ok := err.(mongo.BulkWriteException); ok { + // Partial failure — some docs inserted, some failed + failed := len(bulkErr.WriteErrors) + insertedCount += len(batch) - failed + } else { + return fmt.Errorf("batch insert failed: %w", err) + } +} else { + insertedCount += len(result.InsertedIDs) +} +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches (`time.Sleep(200 * time.Millisecond)`) +- Type-assert `mongo.BulkWriteException` for partial failure handling + +## Key Patterns + +- Use `os.Getenv("VAR")` with fallback helper for config +- Always check errors explicitly — no panic in sample code +- Use `context.Background()` or appropriate timeout contexts +- Use `defer` for cleanup (drop collections) +- Match TypeScript output format 
exactly + +## Environment Variables + +- Use `github.com/joho/godotenv` to load from `.env` file at startup +- Provide a `.env.example` file in each sample directory +- Access pattern: `os.Getenv("VAR")` with a helper function for defaults +- Call `godotenv.Load()` early — log a warning if `.env` is missing but don't fail (env vars may be set externally) + +```go +import ( + "os" + "github.com/joho/godotenv" +) + +func init() { + err := godotenv.Load() + if err != nil { + fmt.Println("No .env file found, using environment variables") + } +} + +func getEnvOrDefault(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} +``` + +- Include `github.com/joho/godotenv` in `go.mod` + +## Build & Run + +```bash +cd src +go run . +``` diff --git a/.github/copilot-instructions-java.md b/.github/copilot-instructions-java.md new file mode 100644 index 0000000..35cbf11 --- /dev/null +++ b/.github/copilot-instructions-java.md @@ -0,0 +1,122 @@ +# Java-Specific Instructions + +## Stack + +- Java 17+ +- MongoDB Java Driver (`org.mongodb:mongodb-driver-sync`) +- Azure Identity (`com.azure:azure-identity`) +- Azure OpenAI (`com.azure:azure-ai-openai`) + +## File Structure + +``` +ai/select-algorithm-java/ +├── src/main/java/com/azure/documentdb/sample/ +│ ├── CompareAll.java +│ └── Utils.java +├── pom.xml +└── README.md + +ai/vector-search-java/ +├── src/main/java/com/azure/documentdb/sample/ +│ ├── Ivf.java +│ ├── Hnsw.java +│ ├── Diskann.java +│ └── Utils.java +├── pom.xml +└── README.md +``` + +## Naming Conventions + +- Files: `PascalCase.java` +- Methods: `camelCase` +- Constants: `UPPER_SNAKE_CASE` +- Classes: `PascalCase` +- Packages: `com.azure.documentdb.sample` + +## Authentication Pattern + +```java +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; + +DefaultAzureCredential credential = new 
DefaultAzureCredentialBuilder().build(); +MongoCredential mongoCredential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", (context) -> { + AccessToken token = credential.getToken( + new TokenRequestContext().addScopes("https://ossrdbms-aad.database.windows.net/.default") + ).block(); + return new OidcCallbackResult(token.getToken()); + }); +``` + +## $search Syntax + +```java +// CORRECT +Document searchStage = new Document("$search", + new Document("cosmosSearch", + new Document("vector", queryVector) + .append("path", embeddedField) + .append("k", topK))); + +// WRONG — do NOT add cosmosSearchOptions to the $search stage +``` + +## Bulk Insert + +Use `collection.insertMany()` with `InsertManyOptions().ordered(false)`: + +```java +import com.mongodb.client.model.InsertManyOptions; +import com.mongodb.MongoBulkWriteException; + +try { + collection.insertMany(documents, new InsertManyOptions().ordered(false)); + insertedCount += documents.size(); +} catch (MongoBulkWriteException e) { + // Partial failure — some docs inserted + insertedCount += e.getWriteResult().getInsertedCount(); + failedCount += documents.size() - e.getWriteResult().getInsertedCount(); +} +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches (`Thread.sleep(200)`) +- Catch `MongoBulkWriteException` for partial failure handling + +## Key Patterns + +- Use `System.getenv("VAR")` with null check for config +- Use try-with-resources for MongoClient +- Use `try/finally` for collection cleanup +- Match TypeScript output format exactly + +## Environment Variables + +- Read directly via `System.getenv("VAR")` — **no dotenv library** +- Provide a `.env.example` file in each sample directory for documentation purposes +- Access pattern: `System.getenv("VAR")` with null check or ternary for defaults +- Validate required vars early and fail with a clear message + +```java +var clusterName = 
System.getenv("MONGO_CLUSTER_NAME"); +var endpoint = System.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); +var model = System.getenv("AZURE_OPENAI_EMBEDDING_MODEL"); +var batchSizeStr = System.getenv("LOAD_SIZE_BATCH"); +var batchSize = batchSizeStr != null ? Integer.parseInt(batchSizeStr) : 100; + +if (clusterName == null || endpoint == null) { + throw new IllegalStateException("Missing required environment variables: MONGO_CLUSTER_NAME, AZURE_OPENAI_EMBEDDING_ENDPOINT"); +} +``` + +- Users set env vars via shell export, IDE run configuration, or azd-provided `.env` + +## Build & Run + +```bash +mvn compile exec:java -Dexec.mainClass="com.azure.documentdb.sample.CompareAll" +``` diff --git a/.github/copilot-instructions-python.md b/.github/copilot-instructions-python.md new file mode 100644 index 0000000..2605d13 --- /dev/null +++ b/.github/copilot-instructions-python.md @@ -0,0 +1,119 @@ +# Python-Specific Instructions + +## Stack + +- Python 3.10+ +- `pymongo` for DocumentDB access +- `openai` SDK (AzureOpenAI class) +- `azure-identity` for DefaultAzureCredential + +## File Structure + +``` +ai/select-algorithm-python/ +├── src/ +│ ├── compare_all.py # Multi-query comparison runner +│ └── utils.py # Shared utilities +├── requirements.txt +└── README.md + +ai/vector-search-python/ +├── src/ +│ ├── ivf.py +│ ├── hnsw.py +│ ├── diskann.py +│ ├── create_embeddings.py +│ └── utils.py +├── requirements.txt +└── README.md +``` + +## Naming Conventions + +- Files: `snake_case.py` +- Functions: `snake_case` +- Constants: `UPPER_SNAKE_CASE` +- Classes: `PascalCase` + +## Authentication Pattern + +```python +from azure.identity import DefaultAzureCredential +from pymongo import MongoClient +from pymongo.auth_oidc import OIDCCallback, OIDCCallbackContext, OIDCCallbackResult + +class AzureIdentityCallback(OIDCCallback): + def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: + credential = DefaultAzureCredential() + token = 
credential.get_token("https://ossrdbms-aad.database.windows.net/.default") + return OIDCCallbackResult(access_token=token.token, expires_in_seconds=300) +``` + +## $search Syntax + +```python +# CORRECT +pipeline = [ + {"$search": {"cosmosSearch": {"vector": query_vector, "path": field, "k": top_k}}}, + {"$project": {"similarityScore": {"$meta": "searchScore"}, "document": "$$ROOT"}} +] + +# WRONG — do NOT use cosmosSearchOptions in $search +# pipeline = [{"$search": {"cosmosSearch": {...}, "cosmosSearchOptions": {...}}}] +``` + +## Bulk Insert + +Use `collection.bulk_write()` with `InsertOne` operations and `ordered=False`: + +```python +from pymongo import InsertOne +from pymongo.errors import BulkWriteError + +operations = [InsertOne(document) for document in batch] +try: + result = collection.bulk_write(operations, ordered=False) + inserted_count += result.inserted_count +except BulkWriteError as e: + inserted_count += e.details.get('nInserted', 0) + failed_count += len(batch) - e.details.get('nInserted', 0) +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches (`time.sleep(0.2)`) +- Handle `BulkWriteError` for partial failures + +## Key Patterns + +- Use `os.environ.get("VAR", "default")` for config +- Type hints on all function signatures +- Use `try/finally` for collection cleanup +- Match TypeScript output format exactly (table layout, emoji, section headers) + +## Environment Variables + +- Use `python-dotenv` to load from `.env` file at startup +- Provide a `.env.example` file in each sample directory +- Access pattern: `os.environ.get("VAR", "default")` for optional, `os.environ["VAR"]` for required +- Call `load_dotenv()` at the top of the entry point before accessing any env vars + +```python +from dotenv import load_dotenv +import os + +load_dotenv() + +endpoint = os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"] +model = os.environ["AZURE_OPENAI_EMBEDDING_MODEL"] +cluster_name = 
os.environ["MONGO_CLUSTER_NAME"] +batch_size = int(os.environ.get("LOAD_SIZE_BATCH", "100")) +``` + +- Include `python-dotenv` in `requirements.txt` + +## Build & Run + +```bash +pip install -r requirements.txt +python src/compare_all.py +``` diff --git a/.github/copilot-instructions-typescript.md b/.github/copilot-instructions-typescript.md new file mode 100644 index 0000000..8d944b1 --- /dev/null +++ b/.github/copilot-instructions-typescript.md @@ -0,0 +1,114 @@ +# TypeScript-Specific Instructions + +> This is the **reference implementation**. Other languages must match its behavior. + +## Stack + +- Node.js with ESM modules (`"type": "module"` in package.json) +- TypeScript 5+ with strict mode +- `mongodb` driver (native MongoDB client) +- `openai` SDK (AzureOpenAI class) +- `@azure/identity` for DefaultAzureCredential + +## File Structure + +``` +ai/select-algorithm-typescript/ +├── src/ +│ ├── compare-all.ts # Multi-query comparison runner +│ ├── utils.ts # Shared utilities (auth, config, insert, print) +│ └── ... 
+├── package.json +├── tsconfig.json +└── README.md + +ai/vector-search-typescript/ +├── src/ +│ ├── ivf.ts # Individual IVF example +│ ├── hnsw.ts # Individual HNSW example +│ ├── diskann.ts # Individual DiskANN example +│ ├── create-embeddings.ts +│ ├── utils.ts +│ └── showIndexes.ts +├── package.json +├── tsconfig.json +└── README.md +``` + +## Authentication Pattern + +```typescript +import { DefaultAzureCredential, getBearerTokenProvider, TokenCredential } from '@azure/identity'; +import { MongoClient, OIDCCallbackParams, OIDCResponse } from 'mongodb'; + +// OIDC callback for passwordless auth +const AzureIdentityTokenCallback = async ( + params: OIDCCallbackParams, + credential: TokenCredential +): Promise<OIDCResponse> => { + const tokenResponse = await credential.getToken([ + 'https://ossrdbms-aad.database.windows.net/.default' + ]); + return { + accessToken: tokenResponse?.token || '', + expiresInSeconds: (tokenResponse?.expiresOnTimestamp || 0) - Math.floor(Date.now() / 1000) + }; +}; +``` + +## ESM Considerations + +```typescript +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); +``` + +## Environment Variables + +- Loaded via `process.env` directly — **no dotenv library** in production code +- Provide a `.env.example` file in each sample directory showing all required vars with placeholder values +- A `.env` file at the sample root is used for local development (gitignored) +- Access pattern: `process.env.VAR_NAME!` (non-null assertion) for required vars +- For optional vars with defaults: `process.env.VAR_NAME || 'default'` +- Validate all required vars at startup — throw with a clear error listing missing vars + +```typescript +const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!; +const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; +const clusterName = process.env.MONGO_CLUSTER_NAME!; + +if (!endpoint || !deployment || !clusterName) { + 
throw new Error('Missing required environment variables: ...'); +} +``` + +## Build & Run + +```bash +npm install +npm run build # tsc +npm start # node dist/compare-all.js +``` + +## Bulk Insert + +Use `collection.insertMany()` with `ordered: false` for batch inserts: + +```typescript +const result = await collection.insertMany(batch, { ordered: false }); +inserted += result.insertedCount || 0; +``` + +- Batch size configurable via `LOAD_SIZE_BATCH` env var (default: 100) +- 200ms delay between batches to avoid rate limiting +- Handle partial failures gracefully (log failed count, continue) + +## Key Patterns + +- Use `interface` for data shapes (SearchResult, AlgorithmConfig) +- Use `const` arrays for ALGORITHMS and SIMILARITIES definitions +- Clean up collections in `finally` block +- Template literal strings for console output formatting diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..8ab46b3 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,149 @@ +# Copilot Instructions for DocumentDB Samples + +## Repository Purpose + +This repo contains Azure DocumentDB (vCore) code samples demonstrating vector search capabilities across multiple languages. Each sample must work identically across all supported languages. 
+ +## Supported Languages + +- [TypeScript](.github/copilot-instructions-typescript.md) (reference implementation) +- [Python](.github/copilot-instructions-python.md) +- [Go](.github/copilot-instructions-go.md) +- [Java](.github/copilot-instructions-java.md) +- [.NET (C#)](.github/copilot-instructions-dotnet.md) + +## Architecture Rules + +### Authentication + +- **Always support two auth modes**: passwordless (DefaultAzureCredential with OIDC callback) AND connection string +- Passwordless is the primary path; connection string is fallback +- DocumentDB vCore uses MongoDB wire protocol — auth token scope is `https://ossrdbms-aad.database.windows.net/.default` + +### Azure OpenAI Integration + +- Use `text-embedding-3-small` (1536 dimensions) as the default embedding model +- Model deployment name comes from env var `AZURE_OPENAI_EMBEDDING_MODEL` +- Support both API key and DefaultAzureCredential for OpenAI client + +### DocumentDB Vector Search + +- **One vector index per field per collection** — this is a hard platform constraint +- When comparing multiple index types, use separate collections (one per algorithm×metric combination) +- Collection naming: `compare_{algorithm}_{metric}` (e.g., `compare_hnsw_cos`) +- Supported algorithms: `vector-ivf`, `vector-hnsw`, `vector-diskann` +- Supported metrics: `COS`, `L2` (IP is omitted — see below) + +### Why No Inner Product (IP) + +`text-embedding-3-small` produces unit-normalized vectors (magnitude ≈ 1). For normalized vectors: +- cosine similarity = dot(a,b) / (||a|| × ||b||) = dot(a,b) = inner product +- COS and IP always return identical results + +Including IP adds no insight and doubles comparison time. All samples use only COS and L2. + +### $search Query Syntax + +The correct MongoDB `$search` syntax for DocumentDB vector search is: + +``` +{ $search: { cosmosSearch: { vector: <queryVector>, path: "<fieldPath>", k: <topK> } } } +``` + +**DO NOT** use `cosmosSearchOptions` as a key in the `$search` stage. 
That key is only valid in index creation commands. + +### Data + +- Shared dataset: `ai/data/Hotels_Vector.json` (50 documents with pre-computed embeddings) +- All samples reference this shared data file — do not duplicate data per language +- The `DescriptionVector` field contains the 1536-dimension embedding + +### Batch Insert + +- Always use bulk/batch insert (`insertMany` or equivalent) with `ordered: false` +- Default batch size: 100 (configurable via `LOAD_SIZE_BATCH` env var) +- Add a small delay between batches (200ms) to avoid rate limiting +- Handle partial failures gracefully (log failed count, continue) + +### Environment Variables + +All samples must support these env vars: + +| Variable | Purpose | +|----------|---------| +| `MONGO_CLUSTER_NAME` | DocumentDB cluster name (required for passwordless/OIDC auth) | +| `AZURE_DOCUMENTDB_CONNECTION_STRING` | MongoDB connection string (fallback when not using passwordless) | +| `AZURE_DOCUMENTDB_DATABASENAME` | Database name (default: `Hotels`) | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | +| `AZURE_OPENAI_EMBEDDING_MODEL` | Deployment name (e.g., `text-embedding-3-small`) | +| `AZURE_OPENAI_EMBEDDING_KEY` | API key (optional if using DefaultAzureCredential) | +| `AZURE_OPENAI_EMBEDDING_API_VERSION` | API version | +| `TOP_K` | Number of results to return (default: 5) | +| `LOAD_SIZE_BATCH` | Batch size for bulk insert (default: 100) | +| `QUERY_TEXT` | Single query override (optional) | +| `VERBOSE` | Enable verbose output (default: false) | + +### Sample Categories + +1. **vector-search-{lang}**: Basic vector search with individual algorithm samples (ivf.ts, hnsw.ts, diskann.ts) +2. **select-algorithm-{lang}**: Comparison runner that tests all algorithms × metrics with multi-query support + +### select-algorithm Comparison Runner Requirements + +The comparison runner (`compare-all`) must: + +1. 
**Multi-query support**: Run 5 diverse default queries (overridable via `QUERY_TEXT` for single) +2. **Adaptive table collapse**: When all algorithms return the same #1 result for a query, show collapsed metric-only view. When they disagree, show expanded algorithm×metric grid. +3. **Gap analysis**: Show the score gap between #1 and #2 results +4. **Per-query output**: Header with query text, then comparison table +5. **Summary**: Final divergence summary across all queries + +### Console Output Style + +- Use clear section headers with `\n` separation +- Tables with aligned columns (use padding) +- Emoji indicators: ✅ (agreement), ⚠️ (disagreement) +- Show document counts, embedding dimensions, and collection names during setup + +### Collection Lifecycle (REQUIRED) + +Every sample must follow this exact lifecycle — the validation workflow depends on it: + +1. **Start**: Check if collection exists → drop only if it does (defensive, handles prior crashes) +2. **End**: Always drop the collection in a `finally`/`defer` block (cleanup for next run) + +Language-specific patterns: + +| Language | Conditional drop at start | Always drop at end | +|----------|--------------------------|-------------------| +| TypeScript | `db.listCollections({name}).toArray()` → `db.dropCollection(name)` | `finally { db.dropCollection(name) }` | +| Python | `name in database.list_collection_names()` → `database.drop_collection(name)` | `finally: database.drop_collection(name)` | +| Go | `database.ListCollectionNames(ctx, bson.M{"name": name})` → `collection.Drop(ctx)` | `defer func() { collection.Drop(ctx) }()` | +| Java | `database.listCollectionNames().into(list).contains(name)` → `collection.drop()` | `finally { collection.drop() }` | +| .NET | `ListCollectionNamesAsync(filter)` → `DropCollectionAsync(name)` | `finally { DropCollectionAsync(name) }` | + +**Why this matters**: The CI workflow runs samples in parallel across languages. 
Without end-of-run cleanup, leftover collections cause name conflicts and flaky test failures. + +### Collection Naming Convention (REQUIRED) + +Collection names must be unique per algorithm to avoid conflicts: + +- **vector-search samples**: `hotels_{algorithm}` (e.g., `hotels_diskann`, `hotels_hnsw`, `hotels_ivf`) +- **select-algorithm samples**: `compare_{algorithm}_{metric}` (e.g., `compare_hnsw_cos`, `compare_ivf_l2`) +- **Database**: Always `Hotels` +- **Index names**: `vectorIndex_{algorithm}` (e.g., `vectorIndex_diskann`) + +All languages must use identical collection/index names for a given algorithm. This enables the shared validation workflow to verify behavior consistency. + +### Error Handling + +- Graceful cleanup: drop created collections on error (use try/finally) +- Log but don't crash on individual batch insert failures +- Validate all required env vars at startup with clear error messages + +### Code Style + +- No unnecessary comments — only comment non-obvious decisions (like why IP is omitted) +- Use descriptive variable names over comments +- Keep functions focused — extract helpers for repeated patterns +- TypeScript is the reference implementation — other languages should match its behavior exactly