From edcfe2ab1d72219e72aa3564bf3876b50fcb6de3 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Thu, 30 Apr 2026 07:51:28 -0700 Subject: [PATCH 1/2] Standardize collection lifecycle: conditional drop at start, always drop at end All 10 sample directories now follow the same pattern: - START: conditionally drop collection only if it exists - END: always drop collection for cleanup (in finally/defer block) Languages updated: TypeScript, Python, Go, Java, .NET Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/src/CompareAll.cs | 10 +- ai/select-algorithm-go/src/compare_all.go | 15 +- .../selectalgorithm/CompareAll.java | 139 +++++++++--------- ai/select-algorithm-python/src/compare_all.py | 7 +- .../Services/VectorSearchService.cs | 48 ++++-- ai/vector-search-go/src/diskann.go | 31 ++-- ai/vector-search-go/src/hnsw.go | 31 ++-- ai/vector-search-go/src/ivf.go | 31 ++-- .../com/azure/documentdb/samples/DiskAnn.java | 33 +++-- .../com/azure/documentdb/samples/HNSW.java | 33 +++-- .../com/azure/documentdb/samples/IVF.java | 33 +++-- ai/vector-search-python/src/diskann.py | 14 +- ai/vector-search-python/src/hnsw.py | 14 +- ai/vector-search-python/src/ivf.py | 14 +- ai/vector-search-typescript/src/diskann.ts | 23 ++- ai/vector-search-typescript/src/hnsw.ts | 23 ++- ai/vector-search-typescript/src/ivf.ts | 23 ++- 17 files changed, 354 insertions(+), 168 deletions(-) diff --git a/ai/select-algorithm-dotnet/src/CompareAll.cs b/ai/select-algorithm-dotnet/src/CompareAll.cs index a29704c..d8af191 100644 --- a/ai/select-algorithm-dotnet/src/CompareAll.cs +++ b/ai/select-algorithm-dotnet/src/CompareAll.cs @@ -37,9 +37,13 @@ public static void Run() { var database = mongoClient.GetDatabase(databaseName); - // Drop collection for a clean comparison - database.DropCollection("hotels"); - Console.WriteLine("Dropped existing 'hotels' collection (if any)"); + // Drop collection if it already exists (clean start) + var 
collectionNames = database.ListCollectionNames().ToList(); + if (collectionNames.Contains("hotels")) + { + database.DropCollection("hotels"); + Console.WriteLine("Dropped existing 'hotels' collection."); + } var collection = database.GetCollection("hotels"); diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go index 463e55d..c873e18 100644 --- a/ai/select-algorithm-go/src/compare_all.go +++ b/ai/select-algorithm-go/src/compare_all.go @@ -47,15 +47,18 @@ func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, fmt.Printf("Top-K: %d\n", topK) fmt.Printf("Verbose: %v\n", verbose) - // 1. Drop collection for clean comparison, then load data + // 1. Drop collection if it exists for clean comparison, then load data database := dbClient.Database(config.DatabaseName) collection := database.Collection("hotels") - // Drop existing collection for a clean comparison - if err := collection.Drop(ctx); err != nil { - fmt.Printf("Note: could not drop collection (may not exist): %v\n", err) - } else { - fmt.Println("Dropped existing 'hotels' collection") + // Drop existing collection if it exists (clean start) + names, _ := database.ListCollectionNames(ctx, bson.M{"name": "hotels"}) + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + fmt.Printf("Note: could not drop collection: %v\n", err) + } else { + fmt.Println("Dropped existing 'hotels' collection") + } } // Ensure cleanup on exit diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java index ef8d55a..7cbf094 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -49,80 +49,85 @@ public static void run() { MongoDatabase database = 
mongoClient.getDatabase(databaseName); MongoCollection collection = database.getCollection(COLLECTION_NAME); - // Load data ONCE into the single collection - System.out.println(" Loading data from: " + dataFile); - List data = Utils.readJsonFile(dataFile); - System.out.printf(" Loaded %d documents%n", data.size()); - - collection.drop(); - System.out.println(" Collection reset."); - Utils.insertData(collection, data, 100); - - // Generate ONE embedding for the query (reused for all 9 searches) - OpenAIClient aiClient = Utils.getOpenAIClient(); - System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); - List queryVector = Utils.getEmbedding(aiClient, queryText, model); - System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); - - // Convert to doubles for BSON - List vectorAsDoubles = queryVector.stream() - .map(Float::doubleValue) - .toList(); - - // Create all 9 indexes idempotently - System.out.println(" Creating 9 vector indexes..."); - for (String algo : ALGORITHMS) { - for (String metric : METRICS) { - createIndex(collection, vectorField, dimensions, algo, metric); + try { + // Load data ONCE into the single collection + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println(" Dropped existing collection."); } - } - System.out.println(" All indexes created.\n"); - - // Run searches sequentially for fair timing - System.out.println(" Running searches..."); - for (String algo : ALGORITHMS) { - for (String metric : METRICS) { - String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); - - long startNs = System.nanoTime(); - List searchResults = performSearch( - collection, vectorAsDoubles, vectorField, topK); - 
long elapsedNs = System.nanoTime() - startNs; - double elapsedMs = elapsedNs / 1_000_000.0; - - // Extract top result info - String topHotel = "-"; - double topScore = 0.0; - if (!searchResults.isEmpty()) { - Document top = searchResults.get(0); - topHotel = top.getString("HotelName") != null - ? top.getString("HotelName") : "-"; - topScore = top.getDouble("score") != null - ? top.getDouble("score") : 0.0; + Utils.insertData(collection, data, 100); + + // Generate ONE embedding for the query (reused for all 9 searches) + OpenAIClient aiClient = Utils.getOpenAIClient(); + System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); + List queryVector = Utils.getEmbedding(aiClient, queryText, model); + System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); + + // Convert to doubles for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + // Create all 9 indexes idempotently + System.out.println(" Creating 9 vector indexes..."); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + createIndex(collection, vectorField, dimensions, algo, metric); } + } + System.out.println(" All indexes created.\n"); + + // Run searches sequentially for fair timing + System.out.println(" Running searches..."); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + long startNs = System.nanoTime(); + List searchResults = performSearch( + collection, vectorAsDoubles, vectorField, topK); + long elapsedNs = System.nanoTime() - startNs; + double elapsedMs = elapsedNs / 1_000_000.0; + + // Extract top result info + String topHotel = "-"; + double topScore = 0.0; + if (!searchResults.isEmpty()) { + Document top = searchResults.get(0); + topHotel = top.getString("HotelName") != null + ? top.getString("HotelName") : "-"; + topScore = top.getDouble("score") != null + ? 
top.getDouble("score") : 0.0; + } - results.add(new SearchResult( - algo.toUpperCase(), metric, indexName, - elapsedMs, searchResults.size(), topHotel, topScore)); - - if (verbose) { - System.out.printf(" [%s] %d results in %.2f ms%n", - indexName, searchResults.size(), elapsedMs); - for (int i = 0; i < searchResults.size(); i++) { - Document doc = searchResults.get(i); - System.out.printf(" %d. %s (%.4f)%n", - i + 1, - doc.getString("HotelName"), - doc.getDouble("score")); + results.add(new SearchResult( + algo.toUpperCase(), metric, indexName, + elapsedMs, searchResults.size(), topHotel, topScore)); + + if (verbose) { + System.out.printf(" [%s] %d results in %.2f ms%n", + indexName, searchResults.size(), elapsedMs); + for (int i = 0; i < searchResults.size(); i++) { + Document doc = searchResults.get(i); + System.out.printf(" %d. %s (%.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + } } } } + } finally { + // Cleanup: always drop the comparison collection + System.out.println("\n Cleanup: dropping comparison collection..."); + collection.drop(); + System.out.println(" Cleanup: dropped collection 'hotels'"); } - - // Cleanup: drop the comparison collection - System.out.println("\n Cleanup: dropping comparison collection..."); - collection.drop(); - System.out.println(" Cleanup: dropped collection 'hotels'"); } // Print comparison table diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py index 1aac549..8539898 100644 --- a/ai/select-algorithm-python/src/compare_all.py +++ b/ai/select-algorithm-python/src/compare_all.py @@ -171,9 +171,10 @@ def main(): try: database = mongo_client[config["database_name"]] - # Drop collection for a clean comparison - database.drop_collection("hotels") - print("Dropped existing 'hotels' collection (if any)") + # Drop collection if it already exists (clean start) + if "hotels" in database.list_collection_names(): + database.drop_collection("hotels") + 
print("Dropped existing 'hotels' collection") # Create fresh collection and load data collection = database["hotels"] diff --git a/ai/vector-search-dotnet/Services/VectorSearchService.cs b/ai/vector-search-dotnet/Services/VectorSearchService.cs index e8505a1..a1aa841 100644 --- a/ai/vector-search-dotnet/Services/VectorSearchService.cs +++ b/ai/vector-search-dotnet/Services/VectorSearchService.cs @@ -43,24 +43,32 @@ public VectorSearchService(ILogger logger, MongoDbService m /// The vector search algorithm to use (IVF, HNSW, or DiskANN) public async Task RunSearchAsync(VectorIndexType indexType) { + _logger.LogInformation($"Starting {indexType} vector search workflow"); + + // Setup collection + var collectionSuffix = indexType switch + { + VectorIndexType.IVF => "ivf", + VectorIndexType.HNSW => "hnsw", + VectorIndexType.DiskANN => "diskann", + _ => throw new ArgumentException($"Unknown index type: {indexType}") + }; + var collectionName = $"hotels_{collectionSuffix}"; + var indexName = $"vectorIndex_{collectionSuffix}"; + + // Drop collection if it already exists (clean start) + var database = _mongoService.GetDatabase(_config.VectorSearch.DatabaseName); + var existingCollections = (await database.ListCollectionNamesAsync()).ToList(); + if (existingCollections.Contains(collectionName)) + { + await _mongoService.DropCollectionAsync(_config.VectorSearch.DatabaseName, collectionName); + } + try { - _logger.LogInformation($"Starting {indexType} vector search workflow"); - - // Setup collection - var collectionSuffix = indexType switch - { - VectorIndexType.IVF => "ivf", - VectorIndexType.HNSW => "hnsw", - VectorIndexType.DiskANN => "diskann", - _ => throw new ArgumentException($"Unknown index type: {indexType}") - }; - var collectionName = $"hotels_{collectionSuffix}"; - var indexName = $"vectorIndex_{collectionSuffix}"; - var collection = _mongoService.GetCollection(_config.VectorSearch.DatabaseName, collectionName); - // Load data from file if collection is empty + 
// Load data from file var assemblyLocation = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location) ?? string.Empty; var dataFilePath = Path.Combine(assemblyLocation, _config.DataFiles.WithVectors); await _mongoService.LoadDataIfNeededAsync(collection, dataFilePath); @@ -137,6 +145,18 @@ await _mongoService.CreateVectorIndexAsync( _logger.LogError(ex, $"{indexType} vector search failed"); throw; } + finally + { + // Cleanup: always drop the collection + try + { + await _mongoService.DropCollectionAsync(_config.VectorSearch.DatabaseName, collectionName); + } + catch (Exception ex) + { + _logger.LogWarning(ex, $"Cleanup warning: failed to drop collection '{collectionName}'"); + } + } } /// diff --git a/ai/vector-search-go/src/diskann.go b/ai/vector-search-go/src/diskann.go index 8991f58..e4536a3 100644 --- a/ai/vector-search-go/src/diskann.go +++ b/ai/vector-search-go/src/diskann.go @@ -154,6 +154,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_diskann") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_diskann"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_diskann'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_diskann'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_diskann'") + } + }() + // Load data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -177,15 +199,6 @@ func main() { // Insert data into collection fmt.Printf("\nInserting data into 
collection '%s'...\n", config.CollectionName) - // Clear existing data to ensure clean state - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert the hotel data stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-go/src/hnsw.go b/ai/vector-search-go/src/hnsw.go index ab6977c..93bc5bd 100644 --- a/ai/vector-search-go/src/hnsw.go +++ b/ai/vector-search-go/src/hnsw.go @@ -155,6 +155,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_hnsw") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_hnsw"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_hnsw'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_hnsw'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_hnsw'") + } + }() + // Load hotel data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -178,15 +200,6 @@ func main() { // Insert data into MongoDB collection fmt.Printf("\nPreparing collection '%s'...\n", config.CollectionName) - // Clear any existing data to start fresh - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if 
deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert hotel data with embeddings stats, err := InsertData(ctx, collection, documentsWithEmbeddings, config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-go/src/ivf.go b/ai/vector-search-go/src/ivf.go index 2aeddd8..2861845 100644 --- a/ai/vector-search-go/src/ivf.go +++ b/ai/vector-search-go/src/ivf.go @@ -152,6 +152,28 @@ func main() { database := mongoClient.Database(config.DatabaseName) collection := database.Collection("hotels_ivf") + // Drop collection if it already exists (clean start) + names, err := database.ListCollectionNames(ctx, bson.M{"name": "hotels_ivf"}) + if err != nil { + log.Fatalf("Failed to list collections: %v", err) + } + if len(names) > 0 { + if err := collection.Drop(ctx); err != nil { + log.Fatalf("Failed to drop existing collection: %v", err) + } + fmt.Println("Dropped existing collection 'hotels_ivf'") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("Cleanup: dropping collection 'hotels_ivf'...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels_ivf'") + } + }() + // Load hotel data with embeddings fmt.Printf("\nLoading data from %s...\n", config.DataFile) data, err := ReadFileReturnJSON(config.DataFile) @@ -175,15 +197,6 @@ func main() { // Prepare collection with fresh data fmt.Printf("\nPreparing collection '%s'...\n", config.CollectionName) - // Remove any existing data for clean state - deleteResult, err := collection.DeleteMany(ctx, bson.M{}) - if err != nil { - log.Fatalf("Failed to clear existing data: %v", err) - } - if deleteResult.DeletedCount > 0 { - fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) - } - // Insert hotel data with embeddings stats, err := InsertData(ctx, collection, documentsWithEmbeddings, 
config.BatchSize, nil) if err != nil { diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java index 676630b..14a37c6 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git 
a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java index 146fc27..a8b3be7 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java 
b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java index e800107..9c23aec 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java @@ -47,24 +47,33 @@ public void run() { var database = mongoClient.getDatabase(DATABASE_NAME); var collection = database.getCollection(COLLECTION_NAME, Document.class); - // Drop and recreate collection - collection.drop(); + // Drop collection if it already exists (clean start) + if (database.listCollectionNames().into(new ArrayList<>()).contains(COLLECTION_NAME)) { + collection.drop(); + System.out.println("Dropped existing collection: " + COLLECTION_NAME); + } database.createCollection(COLLECTION_NAME); System.out.println("Created collection: " + COLLECTION_NAME); - // Load and insert data - var hotelData = loadHotelData(); - insertDataInBatches(collection, hotelData); + try { + // Load and insert data + var hotelData = loadHotelData(); + insertDataInBatches(collection, hotelData); - // Create standard indexes - createStandardIndexes(collection); + // Create standard indexes + createStandardIndexes(collection); - // Create vector index - createVectorIndex(database); + // Create vector index + createVectorIndex(database); - // Perform vector search - var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); - performVectorSearch(collection, queryEmbedding); + // Perform vector search + var queryEmbedding = createEmbedding(openAIClient, SAMPLE_QUERY); + performVectorSearch(collection, queryEmbedding); + } finally { + // Cleanup: always drop collection at end + collection.drop(); + System.out.println("Cleanup: dropped collection '" + COLLECTION_NAME + "'"); + } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); diff --git a/ai/vector-search-python/src/diskann.py b/ai/vector-search-python/src/diskann.py index 81720ab..fdef640 100644 --- a/ai/vector-search-python/src/diskann.py 
+++ b/ai/vector-search-python/src/diskann.py @@ -142,6 +142,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -200,8 +207,13 @@ def main(): raise finally: - # Close the MongoDB client + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-python/src/hnsw.py b/ai/vector-search-python/src/hnsw.py index 9352220..fcc9e72 100644 --- a/ai/vector-search-python/src/hnsw.py +++ b/ai/vector-search-python/src/hnsw.py @@ -136,6 +136,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load hotel data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -196,8 +203,13 @@ def main(): raise finally: - # Clean up MongoDB connection + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + 
print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-python/src/ivf.py b/ai/vector-search-python/src/ivf.py index f39c0d2..04a0794 100644 --- a/ai/vector-search-python/src/ivf.py +++ b/ai/vector-search-python/src/ivf.py @@ -133,6 +133,13 @@ def main(): database = mongo_client[config['database_name']] collection = database[config['collection_name']] + # Drop collection if it already exists (clean start) + if config['collection_name'] in database.list_collection_names(): + database.drop_collection(config['collection_name']) + print(f"Dropped existing collection '{config['collection_name']}'") + + collection = database[config['collection_name']] + # Load hotel data with embeddings print(f"\nLoading data from {config['data_file']}...") data = read_file_return_json(config['data_file']) @@ -191,8 +198,13 @@ def main(): raise finally: - # Ensure MongoDB connection is properly closed + # Cleanup: drop collection and close connection if 'mongo_client' in locals(): + try: + database.drop_collection(config['collection_name']) + print(f"Cleanup: dropped collection '{config['collection_name']}'") + except Exception as cleanup_err: + print(f"Cleanup warning: {cleanup_err}") mongo_client.close() diff --git a/ai/vector-search-typescript/src/diskann.ts b/ai/vector-search-typescript/src/diskann.ts index 96b547c..b756405 100644 --- a/ai/vector-search-typescript/src/diskann.ts +++ b/ai/vector-search-typescript/src/diskann.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + 
} + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -95,9 +103,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } diff --git a/ai/vector-search-typescript/src/hnsw.ts b/ai/vector-search-typescript/src/hnsw.ts index 771146c..fede64e 100644 --- a/ai/vector-search-typescript/src/hnsw.ts +++ b/ai/vector-search-typescript/src/hnsw.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + } + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -95,9 +103,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection 
and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } diff --git a/ai/vector-search-typescript/src/ivf.ts b/ai/vector-search-typescript/src/ivf.ts index e81ace8..908ae1c 100644 --- a/ai/vector-search-typescript/src/ivf.ts +++ b/ai/vector-search-typescript/src/ivf.ts @@ -34,6 +34,14 @@ async function main() { await dbClient.connect(); const db = dbClient.db(config.dbName); + + // Drop collection if it already exists (clean start) + const existingCollections = await db.listCollections({ name: config.collectionName }).toArray(); + if (existingCollections.length > 0) { + await db.dropCollection(config.collectionName); + console.log('Dropped existing collection:', config.collectionName); + } + const collection = await db.createCollection(config.collectionName); console.log('Created collection:', config.collectionName); const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile)); @@ -96,9 +104,18 @@ async function main() { console.error('App failed:', error); process.exitCode = 1; } finally { - console.log('Closing database connection...'); - if (dbClient) await dbClient.close(); - console.log('Database connection closed'); + // Cleanup: drop collection and close connection + if (dbClient) { + try { + const db = dbClient.db(config.dbName); + await db.dropCollection(config.collectionName); + console.log('Cleanup: dropped collection', config.collectionName); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } } } From 4d0b00301e69478e47cebe2c483010ee4ea0bb48 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Thu, 30 Apr 2026 08:33:50 
-0700 Subject: [PATCH 2/2] Add CI validation workflow and collection lifecycle standardization - Add validate-samples.yml workflow (dual-mode: build-only + full-run) - Serialized execution: TS -> Py -> Go -> Java -> .NET - Secret masking for all env vars - Collection lifecycle: drop-if-exists at start, always drop at end - TypeScript type safety improvements in compare-all.ts Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/validate-samples.yml | 462 +++++++++++++++--- ai/select-algorithm-typescript/README.md | 29 +- .../src/compare-all.ts | 397 +++++++++++---- 3 files changed, 703 insertions(+), 185 deletions(-) diff --git a/.github/workflows/validate-samples.yml b/.github/workflows/validate-samples.yml index 7bd29ec..5defcfe 100644 --- a/.github/workflows/validate-samples.yml +++ b/.github/workflows/validate-samples.yml @@ -1,100 +1,135 @@ +# ============================================================================= +# Validate Samples — End-to-end validation for all DocumentDB AI samples +# ============================================================================= +# +# PURPOSE: +# Validates that every sample in this repo compiles and (optionally) runs +# correctly against a live Azure DocumentDB + Azure OpenAI deployment. +# +# TWO MODES: +# 1. BUILD-ONLY (automatic) — Triggered on PR/push to ai/** paths. +# Compiles all 5 languages (TypeScript, Python, Go, Java, .NET) to catch +# syntax errors, missing imports, and type issues. No secrets needed. +# +# 2. FULL RUN (manual) — Triggered via workflow_dispatch ("Run workflow" button). +# Builds AND executes every sample against real Azure resources. +# Requires the SAMPLES_ENV_FILE repo secret (see setup below). +# Captures all stdout/stderr as downloadable artifacts. +# +# SETUP — Creating the SAMPLES_ENV_FILE secret: +# 1. Go to repo Settings > Secrets and variables > Actions +# 2. Click "New repository secret" +# 3. Name: SAMPLES_ENV_FILE +# 4. 
Value: paste your entire .env file contents, e.g.: +# AZURE_DOCUMENTDB_CONNECTION_STRING=mongodb+srv://... +# AZURE_DOCUMENTDB_DATABASENAME=quickstart_db +# AZURE_OPENAI_EMBEDDING_ENDPOINT=https://...openai.azure.com +# AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small +# AZURE_OPENAI_EMBEDDING_KEY=abc123... +# AZURE_OPENAI_EMBEDDING_API_VERSION=2024-06-01 +# TOP_K=3 +# LOAD_SIZE_BATCH=25 +# 5. Click "Add secret" +# +# ARTIFACTS: +# Full-run jobs upload output-*.log files as workflow artifacts (7-day retention). +# Download them from the workflow run's "Artifacts" section to inspect sample output. +# +# ============================================================================= + name: Validate Samples on: + # Build-only on PR and push pull_request: paths: - 'ai/**' - '.github/workflows/validate-samples.yml' push: - branches: - - main + branches: [main] paths: - 'ai/**' - '.github/workflows/validate-samples.yml' + # Manual trigger for full validation (build + run) + workflow_dispatch: + inputs: + run_mode: + description: 'build-only = compile check only; full = compile + execute against Azure' + required: true + default: 'full' + type: choice + options: + - full + - build-only + permissions: contents: read concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + # Separate concurrency groups for auto (PR/push) vs manual full-run + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ inputs.run_mode || 'auto' }} + cancel-in-progress: ${{ github.event_name != 'workflow_dispatch' }} jobs: - validate-typescript: - name: TypeScript - ${{ matrix.sample }} + # ============================================================ + # BUILD JOBS — Always run (PR, push, and workflow_dispatch) + # Validates that code compiles without needing any secrets. 
+ # ============================================================ + + build-typescript: + name: Build TypeScript - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false strategy: fail-fast: false matrix: sample: - vector-search-typescript - - vector-search-agent-typescript - + - select-algorithm-typescript steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Node.js - uses: actions/setup-node@v6 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: node-version: '20' cache: 'npm' cache-dependency-path: ai/${{ matrix.sample }}/package-lock.json - - - name: Install dependencies + - run: npm ci working-directory: ai/${{ matrix.sample }} - run: npm ci - - - name: Build TypeScript + - run: npm run build working-directory: ai/${{ matrix.sample }} - run: npm run build - validate-dotnet: - name: .NET + build-dotnet: + name: Build .NET runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false - steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup .NET - uses: actions/setup-dotnet@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-dotnet@v4 with: dotnet-version: '8.0.x' - - - name: Build solution - run: dotnet build documentdb-samples.sln + - run: dotnet build documentdb-samples.sln - validate-go: - name: Go - ${{ matrix.sample }} + build-go: + name: Build Go - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false strategy: fail-fast: false matrix: sample: - vector-search-go - - vector-search-agent-go - + - select-algorithm-go steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Go - uses: actions/setup-go@v6 + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 with: - go-version: '1.24' + go-version: '1.23' cache-dependency-path: ai/${{ matrix.sample }}/go.sum - - - name: Validate Go + - name: Build Go working-directory: ai/${{ matrix.sample }} + # Go samples have multiple main() files sharing 
utils.go — build each independently run: | - # Check if src/ has multiple main() declarations (independent programs sharing utils) if [ -d "src" ] && [ "$(grep -rl '^func main()' src/*.go 2>/dev/null | wc -l)" -gt 1 ]; then cd src for f in $(grep -l '^func main()' *.go); do @@ -105,47 +140,320 @@ jobs: go build ./... fi - validate-python: - name: Python + build-python: + name: Build Python - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false - + strategy: + fail-fast: false + matrix: + sample: + - vector-search-python + - select-algorithm-python steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Python - uses: actions/setup-python@v6 + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 with: python-version: '3.11' - - - name: Install dependencies - working-directory: ai/vector-search-python - run: pip install -r requirements.txt - - - name: Validate Python syntax - working-directory: ai/vector-search-python - run: | - find . -name "*.py" -exec python -m py_compile {} + + - run: pip install -r requirements.txt + working-directory: ai/${{ matrix.sample }} + - name: Validate syntax + working-directory: ai/${{ matrix.sample }} + run: find . 
-name "*.py" -exec python -m py_compile {} + - validate-java: - name: Java + build-java: + name: Build Java - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 - continue-on-error: false - + strategy: + fail-fast: false + matrix: + sample: + - vector-search-java + - select-algorithm-java steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Setup Java - uses: actions/setup-java@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '21' cache: 'maven' - - - name: Compile Java - working-directory: ai/vector-search-java - run: mvn compile -DskipTests + - run: mvn compile -DskipTests + working-directory: ai/${{ matrix.sample }} + + # ============================================================ + # FULL-RUN JOBS — Only on workflow_dispatch with run_mode=full + # Executes samples against live Azure resources using the + # SAMPLES_ENV_FILE repo secret. Captures output as artifacts. + # ============================================================ + + preflight: + name: Preflight — Verify secret exists + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + runs-on: ubuntu-latest + timeout-minutes: 2 + steps: + - name: Check SAMPLES_ENV_FILE secret + run: | + if [ -z "$ENV_CONTENT" ]; then + echo "::error::SAMPLES_ENV_FILE secret is not set. See workflow header for setup instructions." 
+ exit 1 + fi + echo "✅ SAMPLES_ENV_FILE secret is configured" + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + + run-typescript: + name: Run TypeScript - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-typescript + scripts: | + node --env-file .env dist/create-embeddings.js 2>&1 | tee output-embed.log + node --env-file .env dist/ivf.js 2>&1 | tee output-ivf.log + node --env-file .env dist/hnsw.js 2>&1 | tee output-hnsw.log + node --env-file .env dist/diskann.js 2>&1 | tee output-diskann.log + - sample: select-algorithm-typescript + scripts: | + node --env-file .env dist/compare-all.js 2>&1 | tee output-compare.log + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: ai/${{ matrix.sample }}/package-lock.json + - run: npm ci + working-directory: ai/${{ matrix.sample }} + - run: npm run build + working-directory: ai/${{ matrix.sample }} + - name: Write .env from secret + working-directory: ai/${{ matrix.sample }} + run: printf '%s\n' "$ENV_CONTENT" > .env + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + working-directory: ai/${{ matrix.sample }} + run: | + set -euo pipefail + ${{ matrix.scripts }} + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-typescript-${{ matrix.sample }} + path: ai/${{ matrix.sample }}/output-*.log + retention-days: 7 + + run-python: + name: Run Python - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [run-typescript, build-python, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-python + scripts: | + python src/create_embeddings.py 2>&1 | tee 
output-embed.log + python src/ivf.py 2>&1 | tee output-ivf.log + python src/hnsw.py 2>&1 | tee output-hnsw.log + python src/diskann.py 2>&1 | tee output-diskann.log + - sample: select-algorithm-python + scripts: | + python src/compare_all.py 2>&1 | tee output-compare.log + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: '3.11' + - run: pip install -r requirements.txt + working-directory: ai/${{ matrix.sample }} + - name: Write .env from secret + working-directory: ai/${{ matrix.sample }} + run: printf '%s\n' "$ENV_CONTENT" > .env + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + working-directory: ai/${{ matrix.sample }} + run: | + set -euo pipefail + ${{ matrix.scripts }} + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-python-${{ matrix.sample }} + path: ai/${{ matrix.sample }}/output-*.log + retention-days: 7 + + run-go: + name: Run Go - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [run-python, build-go, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-go + scripts: | + go run create_embeddings.go utils.go 2>&1 | tee output-embed.log + go run ivf.go utils.go 2>&1 | tee output-ivf.log + go run hnsw.go utils.go 2>&1 | tee output-hnsw.log + go run diskann.go utils.go 2>&1 | tee output-diskann.log + workdir: ai/vector-search-go/src + - sample: select-algorithm-go + scripts: | + go run compare_all.go utils.go 2>&1 | tee output-compare.log + workdir: ai/select-algorithm-go/src + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: '1.23' + cache-dependency-path: ai/${{ matrix.sample }}/go.sum + - name: Write .env from secret + working-directory: ${{ matrix.workdir }} + run: printf '%s\n' "$ENV_CONTENT" > .env + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - 
name: Run sample + working-directory: ${{ matrix.workdir }} + run: | + set -euo pipefail + ${{ matrix.scripts }} + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-go-${{ matrix.sample }} + path: ${{ matrix.workdir }}/output-*.log + retention-days: 7 + + run-java: + name: Run Java - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [run-go, build-java, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + fail-fast: false + matrix: + include: + - sample: vector-search-java + classes: DiskAnn HNSW IVF + package: com.azure.documentdb.samples + - sample: select-algorithm-java + classes: CompareAll + package: com.azure.documentdb.selectalgorithm + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '21' + cache: 'maven' + - run: mvn compile -DskipTests + working-directory: ai/${{ matrix.sample }} + - name: Export env vars from secret + run: | + while IFS= read -r line; do + [[ -z "$line" || "$line" == \#* ]] && continue + key="${line%%=*}" + value="${line#*=}" + echo "::add-mask::$value" + echo "$key=$value" >> "$GITHUB_ENV" + done <<< "$ENV_CONTENT" + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + working-directory: ai/${{ matrix.sample }} + run: | + set -euo pipefail + for class in ${{ matrix.classes }}; do + echo "=== Running $class ===" + mvn exec:java -Dexec.mainClass="${{ matrix.package }}.$class" 2>&1 | tee "output-${class,,}.log" + done + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-java-${{ matrix.sample }} + path: ai/${{ matrix.sample }}/output-*.log + retention-days: 7 + + run-dotnet: + name: Run .NET - ${{ matrix.sample }} + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' + needs: [run-java, build-dotnet, preflight] + runs-on: ubuntu-latest + timeout-minutes: 20 + strategy: + 
fail-fast: false + matrix: + include: + - sample: vector-search-dotnet + project: ai/vector-search-dotnet/DocumentDBVectorSearch.csproj + - sample: select-algorithm-dotnet + project: ai/select-algorithm-dotnet/src/SelectAlgorithm.csproj + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '8.0.x' + - name: Export env vars from secret + run: | + while IFS= read -r line; do + [[ -z "$line" || "$line" == \#* ]] && continue + key="${line%%=*}" + value="${line#*=}" + echo "::add-mask::$value" + echo "$key=$value" >> "$GITHUB_ENV" + done <<< "$ENV_CONTENT" + env: + ENV_CONTENT: ${{ secrets.SAMPLES_ENV_FILE }} + - name: Run sample + run: | + set -euo pipefail + dotnet run --project ${{ matrix.project }} 2>&1 | tee output-run.log + - name: Upload logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-dotnet-${{ matrix.sample }} + path: output-run.log + retention-days: 7 + + # ============================================================ + # SUMMARY — Aggregates pass/fail status across all languages + # ============================================================ + + summary: + name: Results Summary + if: github.event_name == 'workflow_dispatch' && inputs.run_mode == 'full' && always() + needs: [preflight, run-typescript, run-python, run-go, run-java, run-dotnet] + runs-on: ubuntu-latest + steps: + - name: Generate summary table + run: | + echo "## 🧪 Full Validation Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Language | Status |" >> $GITHUB_STEP_SUMMARY + echo "|----------|--------|" >> $GITHUB_STEP_SUMMARY + echo "| TypeScript | ${{ needs.run-typescript.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Python | ${{ needs.run-python.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Go | ${{ needs.run-go.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Java | ${{ needs.run-java.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| .NET | ${{ needs.run-dotnet.result }} |" >> 
$GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "📦 Download artifacts for full output logs." >> $GITHUB_STEP_SUMMARY diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md index 16e0b67..8d1c37d 100644 --- a/ai/select-algorithm-typescript/README.md +++ b/ai/select-algorithm-typescript/README.md @@ -75,21 +75,42 @@ npm run start:diskann ## Compare All Algorithms -Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and view a formatted comparison table: +Run all 6 combinations (3 algorithms × 2 similarity metrics) across multiple diverse queries and view formatted comparison tables with a ranking divergence summary: ```bash npm run start:compare-all ``` +By default, the script runs **5 diverse queries** designed to stress different aspects of similarity ranking: + +1. `outdoor adventure with family activities` +2. `quiet romantic getaway with ocean view` +3. `budget-friendly downtown hotel with free WiFi` +4. `historic building with fine dining and spa` +5. `ski resort with yoga and winter sports` + **Environment variables** (optional overrides): | Variable | Default | Description | |---|---|---| -| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | -| `TOP_K` | `3` | Number of results per combination | +| `QUERY_TEXT` | *(5 built-in queries)* | Override with a single custom query | +| `TOP_K` | `5` | Number of results per combination | | `VERBOSE` | `false` | When `true`, shows all k results per combo | -The script creates a single `hotels` collection, loads data once, creates 9 vector indexes (one per algorithm/metric pair), and runs searches sequentially for fair timing comparison. +### Architecture + +> **DocumentDB limitation:** Only ONE vector index per field per collection is allowed. 
The script creates 6 separate collections (one per algorithm×metric pair), loads data into each, creates one index per collection, runs searches, and cleans up all collections on exit. + +### Output + +The script produces: +- **Per-query comparison table** — shows the #1 and #2 results with their scores, either one row per metric (when all algorithms agree) or one row for each of the 6 algorithm×metric combinations (when they disagree) +- **Ranking divergence summary** — highlights queries where algorithms/metrics disagreed on the #1 result +- **Score gap analysis** — shows the confidence margin between #1 and #2 results + +### Small dataset caveat + +With ~50 hotel documents, all algorithms typically return identical rankings. This is expected — the dataset is too small for algorithmic differences to surface. For meaningful differentiation, use 1000+ documents with varied embeddings. The diverse queries help by combining attributes that no single hotel perfectly satisfies, which can reveal metric-level differences (COS vs L2) even on small data. ## Algorithm comparison diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts index 53c54aa..39cadfb 100644 --- a/ai/select-algorithm-typescript/src/compare-all.ts +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -12,10 +12,15 @@ interface AlgorithmConfig { options: Record; } +interface MongoSearchResult { + document: { name: string; [key: string]: unknown }; + score: number; +} + interface SearchResult { + query: string; algorithm: string; similarity: string; - latencyMs: number; topScore: number; topResult: string; results: Array<{ name: string; score: number }>; @@ -27,16 +32,41 @@ const ALGORITHMS: AlgorithmConfig[] = [ { name: 'DiskANN', kind: 'vector-diskann', options: { maxDegree: 32, lBuild: 50 } }, ]; -const SIMILARITIES = ['COS', 'L2', 'IP']; +// Only COS and L2 — Inner Product (IP) is omitted because text-embedding-3-small +// produces unit-normalized vectors (magnitude = 1). 
For normalized vectors, +// cosine similarity = dot(a,b)/(||a||·||b||) = dot(a,b) = inner product. +// COS and IP always return identical results, so comparing both adds no insight. +const SIMILARITIES = ['COS', 'L2']; + +// Diverse queries designed to stress-test ranking differences: +// Each combines attributes that no single hotel perfectly satisfies, +// forcing similarity metrics to disagree on partial matches. +const DEFAULT_QUERIES = [ + 'outdoor adventure with family activities', + 'quiet romantic getaway with ocean view', + 'budget-friendly downtown hotel with free WiFi', + 'historic building with fine dining and spa', + 'ski resort with yoga and winter sports', +]; + +// DocumentDB allows only ONE vector index per field per collection, +// so we use a separate collection for each algorithm×metric combination. +function collectionNameFor(algo: AlgorithmConfig, sim: string): string { + return `compare_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; +} async function main() { const baseConfig = getConfig(); - const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; - const topK = parseInt(process.env.TOP_K || '3', 10); + const topK = parseInt(process.env.TOP_K || '5', 10); const verbose = process.env.VERBOSE === 'true'; - const collectionName = 'hotels'; + + // Support single query override via env, otherwise use all default queries + const queries: string[] = process.env.QUERY_TEXT + ? 
[process.env.QUERY_TEXT] + : DEFAULT_QUERIES; const { aiClient, dbClient } = getClientsPasswordless(); + const createdCollections: string[] = []; try { if (!aiClient) throw new Error('AI client is not configured.'); @@ -45,27 +75,39 @@ async function main() { await dbClient.connect(); const db = dbClient.db(baseConfig.dbName); - // Drop collection if it exists for a clean comparison - const existingCollections = await db.listCollections({ name: collectionName }).toArray(); - if (existingCollections.length > 0) { - await db.dropCollection(collectionName); - console.log(`Dropped existing collection: ${collectionName}`); - } - - // Create collection and load data - const collection = await db.createCollection(collectionName); - console.log(`Created collection: ${collectionName}`); + // Load data from file once (held in memory, inserted per collection) const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); - const insertSummary = await insertData(baseConfig, collection, data); - console.log(`Inserted ${insertSummary.inserted}/${insertSummary.total} documents`); + console.log(`Loaded ${data.length} documents from ${baseConfig.dataFile}`); - // Create all 9 indexes - console.log('\nCreating vector indexes...'); + // Generate embeddings for all queries upfront + console.log(`\nGenerating embeddings for ${queries.length} query(ies)...`); + const embeddingResponse = await aiClient.embeddings.create({ + model: baseConfig.deployment, + input: queries + }); + const queryVectors = embeddingResponse.data.map(d => d.embedding); + console.log(`Embeddings generated (${queryVectors[0].length} dimensions each)`); + + // Create 9 collections, each with its own vector index + console.log('\nSetting up 9 collections (1 per algorithm×metric)...'); for (const algo of ALGORITHMS) { for (const sim of SIMILARITIES) { + const colName = collectionNameFor(algo, sim); const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + + // Drop 
if leftover from a prior run + const existing = await db.listCollections({ name: colName }).toArray(); + if (existing.length > 0) { + await db.dropCollection(colName); + } + + const collection = await db.createCollection(colName); + createdCollections.push(colName); + + await insertData(baseConfig, collection, data); + const indexOptions = { - createIndexes: collectionName, + createIndexes: colName, indexes: [{ name: indexName, key: { [baseConfig.embeddedField]: 'cosmosSearch' }, @@ -78,127 +120,274 @@ async function main() { }] }; await db.command(indexOptions); - console.log(` ✓ ${indexName} (created)`); + console.log(` ✓ ${colName} → index ${indexName}`); } } - // Generate one embedding for the query - console.log(`\nQuery: "${queryText}"`); - const embeddingResponse = await aiClient.embeddings.create({ - model: baseConfig.deployment, - input: [queryText] - }); - const queryVector = embeddingResponse.data[0].embedding; - console.log(`Embedding generated (${queryVector.length} dimensions)`); + // Brief pause for indexes to become queryable + console.log('\nWaiting for indexes to be ready...'); + await new Promise(resolve => setTimeout(resolve, 3000)); - // Run all 9 searches sequentially - console.log(`\nRunning searches (top ${topK} results)...\n`); - const results: SearchResult[] = []; + // Run all queries × all 9 combinations + const allResults: SearchResult[] = []; - for (const algo of ALGORITHMS) { - for (const sim of SIMILARITIES) { - const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + for (let qi = 0; qi < queries.length; qi++) { + const queryText = queries[qi]; + const queryVector = queryVectors[qi]; + console.log(`\n━━━ Query ${qi + 1}/${queries.length}: "${queryText}" (top ${topK}) ━━━`); + + for (const algo of ALGORITHMS) { + for (const sim of SIMILARITIES) { + const colName = collectionNameFor(algo, sim); + const collection = db.collection(colName); - const start = performance.now(); - const searchResults = await 
collection.aggregate([ - { - $search: { - cosmosSearch: { - vector: queryVector, - path: baseConfig.embeddedField, - k: topK - }, - cosmosSearchOptions: { - indexName: indexName + const searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: baseConfig.embeddedField, + k: topK + } + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' } } - }, - { - $project: { - score: { $meta: 'searchScore' }, - document: '$$ROOT' - } - } - ]).toArray(); - const latencyMs = performance.now() - start; - - const topDoc = searchResults[0] as any; - results.push({ - algorithm: algo.name, - similarity: sim, - latencyMs, - topScore: topDoc?.score ?? 0, - topResult: topDoc?.document?.HotelName ?? '(none)', - results: searchResults.map((r: any) => ({ - name: r.document?.HotelName ?? '(none)', - score: r.score ?? 0 - })) - }); + ]).toArray(); + + const typedResults = searchResults as unknown as MongoSearchResult[]; + const topDoc = typedResults[0]; + allResults.push({ + query: queryText, + algorithm: algo.name, + similarity: sim, + topScore: topDoc?.score ?? 0, + topResult: (topDoc?.document?.HotelName as string) ?? '(none)', + results: typedResults.map((r) => ({ + name: (r.document?.HotelName as string) ?? '(none)', + score: r.score ?? 
0 + })) + }); + } } } - // Print comparison table - printComparisonTable(results, verbose); + // Print per-query comparison tables + for (const queryText of queries) { + const queryResults = allResults.filter(r => r.query === queryText); + printComparisonTable(queryText, queryResults, verbose); + } + + // Print cross-query ranking divergence summary + if (queries.length > 1) { + printDivergenceSummary(allResults, queries); + } } catch (error) { console.error('Compare-all failed:', error); process.exitCode = 1; } finally { - // Cleanup: drop the comparison collection + // Cleanup: drop all comparison collections if (dbClient) { try { const db = dbClient.db(baseConfig.dbName); - await db.dropCollection(collectionName); - console.log(`\nCleanup: dropped collection "${collectionName}"`); + console.log(`\nCleanup: dropping ${createdCollections.length} comparison collections...`); + for (const colName of createdCollections) { + try { + await db.dropCollection(colName); + } catch (dropErr) { + console.error(`Cleanup warning (drop ${colName}):`, dropErr); + } + } + console.log('Cleanup complete'); } catch (cleanupErr) { console.error('Cleanup warning:', cleanupErr); } - await dbClient.close(); - console.log('Database connection closed'); + try { + await dbClient.close(); + console.log('Database connection closed'); + } catch (closeErr) { + console.error('Warning closing connection:', closeErr); + } } } } -function printComparisonTable(results: SearchResult[], verbose: boolean) { - const algoWidth = 10; - const simWidth = 10; - const latWidth = 8; - const scoreWidth = 10; - const nameWidth = 30; - +function printComparisonTable(queryText: string, results: SearchResult[], _verbose: boolean) { const pad = (s: string, w: number) => s.length >= w ? 
s.slice(0, w) : s + ' '.repeat(w - s.length); - const topLine = `╔${'═'.repeat(algoWidth)}╤${'═'.repeat(simWidth)}╤${'═'.repeat(latWidth)}╤${'═'.repeat(scoreWidth)}╤${'═'.repeat(nameWidth)}╗`; - const headerSep = `╠${'═'.repeat(algoWidth)}╪${'═'.repeat(simWidth)}╪${'═'.repeat(latWidth)}╪${'═'.repeat(scoreWidth)}╪${'═'.repeat(nameWidth)}╣`; - const rowSep = `╟${'─'.repeat(algoWidth)}┼${'─'.repeat(simWidth)}┼${'─'.repeat(latWidth)}┼${'─'.repeat(scoreWidth)}┼${'─'.repeat(nameWidth)}╢`; - const bottomLine = `╚${'═'.repeat(algoWidth)}╧${'═'.repeat(simWidth)}╧${'═'.repeat(latWidth)}╧${'═'.repeat(scoreWidth)}╧${'═'.repeat(nameWidth)}╝`; + // Group by similarity metric to check if algorithms agree + const byMetric = new Map(); + for (const r of results) { + const group = byMetric.get(r.similarity) ?? []; + group.push(r); + byMetric.set(r.similarity, group); + } - console.log(topLine); - console.log(`║${pad(' Algorithm', algoWidth)}│${pad(' Similarity', simWidth)}│${pad(' Latency', latWidth)}│${pad(' Top Score', scoreWidth)}│${pad(' Top Result', nameWidth)}║`); - console.log(headerSep); + // Check if all algorithms agree (same #1 and #2 per metric) + const allAgree = [...byMetric.values()].every(group => { + const first = group[0]; + return group.every(r => + r.results[0]?.name === first.results[0]?.name && + r.results[1]?.name === first.results[1]?.name + ); + }); + + console.log(`\n┌─ Query: "${queryText}"`); + + if (allAgree) { + // Collapsed view: one row per metric (algorithms all agree) + const simWidth = 8; + const nameWidth = 26; + const scoreWidth = 9; + const gapWidth = 8; + const colWidths = [simWidth, nameWidth, scoreWidth, scoreWidth, gapWidth, nameWidth]; + const topLine = `╔${colWidths.map(w => '═'.repeat(w)).join('╤')}╗`; + const headerSep = `╠${colWidths.map(w => '═'.repeat(w)).join('╪')}╣`; + const rowSep = `╟${colWidths.map(w => '─'.repeat(w)).join('┼')}╢`; + const bottomLine = `╚${colWidths.map(w => '═'.repeat(w)).join('╧')}╝`; - results.forEach((r, i) 
=> { - const latStr = `${Math.round(r.latencyMs)}ms`; - const scoreStr = r.topScore.toFixed(4); + console.log(`│ ✅ All algorithms agree (IVF, HNSW, DiskANN) — showing by metric only`); + console.log(topLine); console.log( - `║${pad(` ${r.algorithm}`, algoWidth)}│${pad(` ${r.similarity}`, simWidth)}│${pad(` ${latStr}`, latWidth)}│${pad(` ${scoreStr}`, scoreWidth)}│${pad(` ${r.topResult}`, nameWidth)}║` + `║${pad(' Metric', simWidth)}│${pad(' #1 Result', nameWidth)}│${pad(' #1 Score', scoreWidth)}│${pad(' #2 Score', scoreWidth)}│${pad(' Gap', gapWidth)}│${pad(' #2 Result', nameWidth)}║` ); + console.log(headerSep); + + const metrics = [...byMetric.entries()]; + metrics.forEach(([metric, group], i) => { + const r = group[0]; + const score1 = r.results[0]?.score.toFixed(4) ?? '-'; + const name1 = r.results[0]?.name ?? '(none)'; + const score2 = r.results[1]?.score.toFixed(4) ?? '-'; + const name2 = r.results[1]?.name ?? '(none)'; + const gap = (r.results[0] && r.results[1]) + ? Math.abs(r.results[0].score - r.results[1].score).toFixed(4) + : '-'; + + console.log( + `║${pad(` ${metric}`, simWidth)}│${pad(` ${name1}`, nameWidth)}│${pad(` ${score1}`, scoreWidth)}│${pad(` ${score2}`, scoreWidth)}│${pad(` ${gap}`, gapWidth)}│${pad(` ${name2}`, nameWidth)}║` + ); - if (verbose && r.results.length > 1) { - for (let j = 1; j < r.results.length; j++) { - const sub = r.results[j]; - console.log( - `║${pad('', algoWidth)}│${pad('', simWidth)}│${pad('', latWidth)}│${pad(` ${sub.score.toFixed(4)}`, scoreWidth)}│${pad(` ${sub.name}`, nameWidth)}║` - ); + if (i < metrics.length - 1) { + console.log(rowSep); } - } + }); + + console.log(bottomLine); + } else { + // Expanded view: show full algo×metric grid (algorithms disagree) + const algoWidth = 10; + const simWidth = 6; + const scoreWidth = 8; + const nameWidth = 26; + const colWidths = [algoWidth, simWidth, nameWidth, scoreWidth, scoreWidth, nameWidth]; + const topLine = `╔${colWidths.map(w => '═'.repeat(w)).join('╤')}╗`; + const 
headerSep = `╠${colWidths.map(w => '═'.repeat(w)).join('╪')}╣`; + const rowSep = `╟${colWidths.map(w => '─'.repeat(w)).join('┼')}╢`; + const bottomLine = `╚${colWidths.map(w => '═'.repeat(w)).join('╧')}╝`; + + console.log(`│ ⚠️ Algorithms DISAGREE — showing full breakdown`); + console.log(topLine); + console.log( + `║${pad(' Algo', algoWidth)}│${pad(' Sim', simWidth)}│${pad(' #1 Result', nameWidth)}│${pad(' #1 Score', scoreWidth)}│${pad(' #2 Score', scoreWidth)}│${pad(' #2 Result', nameWidth)}║` + ); + console.log(headerSep); + + results.forEach((r, i) => { + const score1 = r.results[0]?.score.toFixed(4) ?? '-'; + const name1 = r.results[0]?.name ?? '(none)'; + const score2 = r.results[1]?.score.toFixed(4) ?? '-'; + const name2 = r.results[1]?.name ?? '(none)'; + + console.log( + `║${pad(` ${r.algorithm}`, algoWidth)}│${pad(` ${r.similarity}`, simWidth)}│${pad(` ${name1}`, nameWidth)}│${pad(` ${score1}`, scoreWidth)}│${pad(` ${score2}`, scoreWidth)}│${pad(` ${name2}`, nameWidth)}║` + ); + + if (i < results.length - 1) { + console.log(rowSep); + } + }); + + console.log(bottomLine); + } +} + +// Show where algorithms/metrics disagree on rankings across queries +function printDivergenceSummary(allResults: SearchResult[], queries: string[]) { + console.log('\n\n╔══════════════════════════════════════════════════════════════════╗'); + console.log('║ RANKING DIVERGENCE SUMMARY ║'); + console.log('╚══════════════════════════════════════════════════════════════════╝'); + console.log('Shows queries where algorithms or metrics produced DIFFERENT #1 results.\n'); + + let divergenceCount = 0; + + for (const queryText of queries) { + const queryResults = allResults.filter(r => r.query === queryText); + const topResults = new Set(queryResults.map(r => r.topResult)); + + if (topResults.size > 1) { + divergenceCount++; + console.log(` ⚡ "${queryText}"`); - if (i < results.length - 1) { - console.log(rowSep); + // Group by top result to show which combos picked what + const groups 
= new Map(); + for (const r of queryResults) { + const key = r.topResult; + if (!groups.has(key)) groups.set(key, []); + groups.get(key)!.push(`${r.algorithm}/${r.similarity}`); + } + for (const [hotel, combos] of groups) { + console.log(` → ${hotel}: ${combos.join(', ')}`); + } + console.log(''); } - }); + } - console.log(bottomLine); + if (divergenceCount === 0) { + console.log(' All algorithms returned identical #1 results for every query.'); + console.log(' This is expected with small datasets (~50 docs). For meaningful'); + console.log(' differentiation, use 1000+ documents with varied embeddings.\n'); + } else { + console.log(` ${divergenceCount}/${queries.length} queries showed ranking divergence.`); + } + + // Score gap analysis — show how "confident" the top result is + console.log('\n Score Gaps (top score − 2nd score):'); + console.log(' ─────────────────────────────────────'); + for (const queryText of queries) { + const queryResults = allResults.filter(r => r.query === queryText); + const gaps = queryResults.map(r => { + if (r.results.length < 2) return 0; + return r.results[0].score - r.results[1].score; + }); + const avgGap = gaps.reduce((a, b) => a + b, 0) / gaps.length; + const maxGap = Math.max(...gaps); + const minGap = Math.min(...gaps); + const shortQuery = queryText.length > 40 ? queryText.slice(0, 37) + '...' 
: queryText; + console.log(` "${shortQuery}"`); + console.log(` avg: ${avgGap.toFixed(4)} | min: ${minGap.toFixed(4)} | max: ${maxGap.toFixed(4)}`); + } + console.log(''); +} + +// Validate required environment variables before starting +const REQUIRED_ENV_VARS = [ + 'AZURE_OPENAI_EMBEDDING_ENDPOINT', + 'AZURE_OPENAI_EMBEDDING_MODEL', + 'AZURE_OPENAI_EMBEDDING_API_VERSION', +]; + +const missing = REQUIRED_ENV_VARS.filter(v => !process.env[v]); +if (!process.env.AZURE_DOCUMENTDB_CONNECTION_STRING && !process.env.MONGO_CLUSTER_NAME) { + missing.push('AZURE_DOCUMENTDB_CONNECTION_STRING or MONGO_CLUSTER_NAME'); +} +if (missing.length > 0) { + throw new Error(`Missing required environment variables:\n - ${missing.join('\n - ')}`); } main().catch(error => {