diff --git a/ai/select-algorithm-python/.gitignore b/ai/select-algorithm-python/.gitignore
new file mode 100644
index 0000000..2117ca6
--- /dev/null
+++ b/ai/select-algorithm-python/.gitignore
@@ -0,0 +1,9 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+.venv/
+venv/
+
+# Environment
+.env
diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md
new file mode 100644
index 0000000..37839ef
--- /dev/null
+++ b/ai/select-algorithm-python/README.md
@@ -0,0 +1,136 @@
+# DocumentDB Vector Algorithm Comparison - Python
+
+Compare DocumentDB vector index algorithms (DiskANN, HNSW, IVF) across similarity functions (cosine, L2, inner product).
+
+## Prerequisites
+
+- Python 3.10+
+- Azure DocumentDB cluster with vector search enabled
+- Azure OpenAI service with an embedding model deployed
+- Azure CLI authenticated (`az login`)
+- DocumentDB `dbOwner` role on the target database
+- `Cognitive Services OpenAI User` role on the Azure OpenAI resource
+
+## Setup
+
+1. Create and activate a virtual environment:
+```bash
+python -m venv .venv
+
+# Windows
+.venv\Scripts\activate
+
+# macOS/Linux
+source .venv/bin/activate
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+3. Configure environment variables:
+```bash
+cp .env.example .env
+# Edit .env with your Azure resource details
+```
+
+4. Ensure vector data exists:
+```bash
+# Data file should be at: ../../data/Hotels_Vector.json
+```
+
+## Usage
+
+Run all algorithms with the default similarity function (cosine):
+```bash
+python src/select_algorithm.py
+```
+
+Run a specific algorithm:
+```bash
+ALGORITHM=diskann python src/select_algorithm.py
+```
+
+Run a specific similarity function:
+```bash
+SIMILARITY=L2 python src/select_algorithm.py
+```
+
+Run a specific combination:
+```bash
+ALGORITHM=hnsw SIMILARITY=IP python src/select_algorithm.py
+```
+
+Both variables also accept `all`; for example, `SIMILARITY=all` runs every similarity function.
+
+## Environment Variables
+
+- `ALGORITHM`: Algorithm to test (`all`, `diskann`, `hnsw`, `ivf`); defaults to `all`
+- `SIMILARITY`: Similarity function (`all`, `COS`, `L2`, `IP`); defaults to `COS`
+- `MONGO_CLUSTER_NAME`: DocumentDB cluster name
+- `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Azure OpenAI endpoint
+- `AZURE_OPENAI_EMBEDDING_MODEL`: Embedding model deployment name
+
+## Expected output
+
+The script creates a collection per algorithm/similarity combination, runs the vector search against each, and prints a comparison table showing:
+- Algorithm and similarity function used
+- Top search result
+- Search score
+- Query latency in milliseconds
+
+```
+Vector Algorithm Comparison
+  Database: Hotels
+  Algorithms: all
+  Similarity: COS
+  Collections to query: hotels_diskann_cos, hotels_hnsw_cos, hotels_ivf_cos
+  Search query: "quintessential lodging near running trails, eateries, retail"
+
+Initializing MongoDB and Azure OpenAI clients...
+Loading data from ...
+Loaded [N] documents
+Generating query embedding...
+Query embedding: 1536 dimensions
+
+--- DiskANN / COS ---
+Collection: hotels_diskann_cos
+Created collection: hotels_diskann_cos
+Inserted: [N]/[N]
+Created vector index: vectorIndex_diskann_cos
+Executing vector search...
+Success: 5 results, [time]ms
+
+--- HNSW / COS ---
+...
+
+--- IVF / COS ---
+...
+
+==========================================================================
+  Vector Algorithm Comparison Results
+==========================================================================
+[Table of results with Algorithm, Similarity, Top Result, Score, and Latency(ms) columns]
+[Results vary based on data and cluster configuration]
+```
+
+## Troubleshooting
+
+### Authentication failures
+
+This sample uses `DefaultAzureCredential` for passwordless authentication. If you see authentication errors:
+
+- Run `az login` to authenticate with Azure CLI.
+- Verify your account has access to the DocumentDB cluster and Azure OpenAI resource.
+- If using a managed identity, ensure the identity is assigned to the resource.
+
+### Missing environment variables
+
+If the script fails at startup, verify all required variables are set in your `.env` file. Copy `.env.example` as a starting point and fill in each value.
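+
+For reference, a populated `.env` might look like the following sketch (the values are placeholders; substitute your own resource names, and see `.env.example` for the full list of optional settings):
+
+```bash
+MONGO_CLUSTER_NAME=my-documentdb-cluster
+AZURE_OPENAI_EMBEDDING_ENDPOINT=https://my-openai-resource.openai.azure.com/
+AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+ALGORITHM=all
+SIMILARITY=COS
+```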
+
+### pymongo connection issues
+
+- Verify the `MONGO_CLUSTER_NAME` value matches your DocumentDB cluster name.
+- Ensure network access is enabled for your IP in the DocumentDB cluster firewall settings.
+- Check that the cluster is running and not paused.
+
+### DocumentDB RBAC permissions
+
+Vector search requires read/write permissions on the target database. Ensure your identity has the appropriate DocumentDB RBAC role assigned, such as `dbOwner` or a custom role with `createCollection`, `createIndex`, and `find` actions.
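+
+### Quick connection check
+
+If errors persist, a minimal check like the following sketch (which reuses this sample's `get_clients_passwordless` helper and the same environment variables) can confirm that passwordless authentication and cluster connectivity work before running the full comparison. Run it from the `src` directory:
+
+```python
+# Connectivity probe: build the same passwordless clients the sample uses,
+# then issue a MongoDB "ping" command against the cluster.
+from utils import get_clients_passwordless
+
+mongo_client, openai_client, credential = get_clients_passwordless()
+try:
+    print(mongo_client.admin.command("ping"))  # prints {'ok': 1.0, ...} on success
+finally:
+    mongo_client.close()
+    openai_client.close()
+    credential.close()
+```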
"kind": "vector-ivf", + "dimensions": dimensions, + "similarity": similarity, + "numLists": 1 + } + + return base + + +def get_search_pipeline( + query_embedding: list[float], + embedded_field: str, + k: int, + algorithm: Algorithm +) -> list[dict[str, Any]]: + cosmos_search = { + "vector": query_embedding, + "path": embedded_field, + "k": k + } + + if algorithm == 'diskann': + cosmos_search["lSearch"] = 100 + elif algorithm == 'hnsw': + cosmos_search["efSearch"] = 80 + elif algorithm == 'ivf': + cosmos_search["nProbes"] = 1 + + return [ + {"$search": {"cosmosSearch": cosmos_search}}, + {"$project": {"score": {"$meta": "searchScore"}, "document": "$$ROOT"}} + ] + + +def get_target_collections( + algorithm_env: str, + similarity_env: str +) -> list[dict[str, Any]]: + algorithms = ALGORITHMS if algorithm_env == 'all' else [algorithm_env] + similarities = SIMILARITIES if similarity_env == 'all' else [similarity_env] + + targets = [] + + for alg in algorithms: + if alg not in ALGORITHMS: + raise ValueError(f"Invalid ALGORITHM '{alg}'. Must be one of: all, {', '.join(ALGORITHMS)}") + + for sim in similarities: + if sim not in SIMILARITIES: + raise ValueError(f"Invalid SIMILARITY '{sim}'. Must be one of: all, {', '.join(SIMILARITIES)}") + + targets.append({ + 'collection_name': f"hotels_{alg}_{sim.lower()}", + 'algorithm': alg, + 'similarity': sim + }) + + return targets + + +def main() -> None: + db_name = os.getenv('AZURE_DOCUMENTDB_DATABASENAME', 'Hotels') + embedded_field = os.getenv('EMBEDDED_FIELD', 'DescriptionVector') + embedding_dimensions = int(os.getenv('EMBEDDING_DIMENSIONS', '1536')) + data_file = os.getenv('DATA_FILE_WITH_VECTORS', '../../data/Hotels_Vector.json') + model_name = os.getenv('AZURE_OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small') + batch_size = int(os.getenv('LOAD_SIZE_BATCH', '100')) + algorithm_env = os.getenv('ALGORITHM', 'all').strip().lower() + similarity_env = os.getenv('SIMILARITY', 'COS').strip().upper() + search_query = 'quintessential lodging near running trails, eateries, retail' + + try: + targets = get_target_collections(algorithm_env, similarity_env) + + print("\nVector Algorithm Comparison") + print(f" Database: {db_name}") + print(f" Algorithms: {algorithm_env}") + print(f" Similarity: {similarity_env}") + print(f" Collections to query: {', '.join([t['collection_name'] for t in targets])}") + print(f' Search query: "{search_query}"\n') + + print("\nInitializing MongoDB and Azure OpenAI clients...") + mongo_client, azure_openai_client, credential = get_clients_passwordless() + + database = mongo_client[db_name] + + script_dir = Path(__file__).parent + data_path = script_dir / '..' / data_file + print(f"\nLoading data from {data_path}...") + data = read_file_return_json(str(data_path)) + print(f"Loaded {len(data)} documents") + + documents_with_embeddings = [doc for doc in data if embedded_field in doc] + if not documents_with_embeddings: + raise ValueError(f"No documents found with embeddings in field '{embedded_field}'") + + print('Generating query embedding...') + embedding_response = azure_openai_client.embeddings.create( + model=model_name, + input=[search_query] + ) + query_embedding = embedding_response.data[0].embedding + if len(query_embedding) != embedding_dimensions: + raise ValueError( + f"Embedding dimension mismatch: expected {embedding_dimensions}, " + f"got {len(query_embedding)}. Check EMBEDDING_DIMENSIONS env var " + f"matches your model's output dimensions." 
+
+
+def get_search_pipeline(
+    query_embedding: list[float],
+    embedded_field: str,
+    k: int,
+    algorithm: Algorithm
+) -> list[dict[str, Any]]:
+    cosmos_search = {
+        "vector": query_embedding,
+        "path": embedded_field,
+        "k": k
+    }
+
+    if algorithm == 'diskann':
+        cosmos_search["lSearch"] = 100  # search-time candidate list size
+    elif algorithm == 'hnsw':
+        cosmos_search["efSearch"] = 80  # search-time candidate list size
+    elif algorithm == 'ivf':
+        cosmos_search["nProbes"] = 1  # number of clusters probed at query time
+
+    return [
+        {"$search": {"cosmosSearch": cosmos_search}},
+        {"$project": {"score": {"$meta": "searchScore"}, "document": "$$ROOT"}}
+    ]
+
+
+def get_target_collections(
+    algorithm_env: str,
+    similarity_env: str
+) -> list[dict[str, Any]]:
+    algorithms = ALGORITHMS if algorithm_env == 'all' else [algorithm_env]
+    similarities = SIMILARITIES if similarity_env == 'all' else [similarity_env]
+
+    targets = []
+
+    for alg in algorithms:
+        if alg not in ALGORITHMS:
+            raise ValueError(f"Invalid ALGORITHM '{alg}'. Must be one of: all, {', '.join(ALGORITHMS)}")
+
+        for sim in similarities:
+            if sim not in SIMILARITIES:
+                raise ValueError(f"Invalid SIMILARITY '{sim}'. Must be one of: all, {', '.join(SIMILARITIES)}")
+
+            targets.append({
+                'collection_name': f"hotels_{alg}_{sim.lower()}",
+                'algorithm': alg,
+                'similarity': sim
+            })
+
+    return targets
+
+
+def main() -> None:
+    db_name = os.getenv('AZURE_DOCUMENTDB_DATABASENAME', 'Hotels')
+    embedded_field = os.getenv('EMBEDDED_FIELD', 'DescriptionVector')
+    embedding_dimensions = int(os.getenv('EMBEDDING_DIMENSIONS', '1536'))
+    data_file = os.getenv('DATA_FILE_WITH_VECTORS', '../../data/Hotels_Vector.json')
+    model_name = os.getenv('AZURE_OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small')
+    batch_size = int(os.getenv('LOAD_SIZE_BATCH', '100'))
+    algorithm_env = os.getenv('ALGORITHM', 'all').strip().lower()
+    # Uppercase the similarity name, but keep the 'all' sentinel lowercase
+    # so it matches the check in get_target_collections.
+    similarity_env = os.getenv('SIMILARITY', 'COS').strip().upper()
+    if similarity_env == 'ALL':
+        similarity_env = 'all'
+    search_query = 'quintessential lodging near running trails, eateries, retail'
+
+    try:
+        targets = get_target_collections(algorithm_env, similarity_env)
+
+        print("\nVector Algorithm Comparison")
+        print(f"  Database: {db_name}")
+        print(f"  Algorithms: {algorithm_env}")
+        print(f"  Similarity: {similarity_env}")
+        print(f"  Collections to query: {', '.join([t['collection_name'] for t in targets])}")
+        print(f'  Search query: "{search_query}"\n')
+
+        print("\nInitializing MongoDB and Azure OpenAI clients...")
+        mongo_client, azure_openai_client, credential = get_clients_passwordless()
+
+        database = mongo_client[db_name]
+
+        script_dir = Path(__file__).parent
+        data_path = script_dir / '..' / data_file
+        print(f"\nLoading data from {data_path}...")
+        data = read_file_return_json(str(data_path))
+        print(f"Loaded {len(data)} documents")
+
+        documents_with_embeddings = [doc for doc in data if embedded_field in doc]
+        if not documents_with_embeddings:
+            raise ValueError(f"No documents found with embeddings in field '{embedded_field}'")
+
+        print('Generating query embedding...')
+        embedding_response = azure_openai_client.embeddings.create(
+            model=model_name,
+            input=[search_query]
+        )
+        query_embedding = embedding_response.data[0].embedding
+        if len(query_embedding) != embedding_dimensions:
+            raise ValueError(
+                f"Embedding dimension mismatch: expected {embedding_dimensions}, "
+                f"got {len(query_embedding)}. Check EMBEDDING_DIMENSIONS env var "
+                f"matches your model's output dimensions."
+            )
+        print(f"Query embedding: {len(query_embedding)} dimensions\n")
+
+        comparison_results = []
+
+        for target in targets:
+            print(f"\n--- {ALGORITHM_LABELS[target['algorithm']]} / {target['similarity']} ---")
+            print(f"Collection: {target['collection_name']}")
+
+            try:
+                # Start from a clean collection so each run builds a fresh index
+                try:
+                    database.drop_collection(target['collection_name'])
+                except Exception as e:
+                    print(f"  Note: could not drop existing collection: {e}")
+
+                collection = database.create_collection(target['collection_name'])
+                print(f"Created collection: {target['collection_name']}")
+
+                insert_summary = insert_data(collection, documents_with_embeddings, batch_size)
+                print(f"Inserted: {insert_summary['inserted']}/{insert_summary['total']}")
+
+                index_name = f"vectorIndex_{target['algorithm']}_{target['similarity'].lower()}"
+                index_options = get_index_options(
+                    target['collection_name'],
+                    index_name,
+                    embedded_field,
+                    embedding_dimensions,
+                    target['algorithm'],
+                    target['similarity']
+                )
+                database.command(index_options)
+                print(f"Created vector index: {index_name}")
+
+                print('Executing vector search...')
+                start_time = time.time()
+
+                pipeline = get_search_pipeline(query_embedding, embedded_field, 5, target['algorithm'])
+                # aggregate() returns a cursor (iterator); list() consumes all pages
+                search_results = list(collection.aggregate(pipeline))
+
+                latency_ms = (time.time() - start_time) * 1000
+
+                comparison_results.append({
+                    'collection_name': target['collection_name'],
+                    'algorithm': ALGORITHM_LABELS[target['algorithm']],
+                    'similarity': target['similarity'],
+                    'search_results': search_results,
+                    'latency_ms': latency_ms
+                })
+
+                print(f"Success: {len(search_results)} results, {latency_ms:.0f}ms")
+
+            except (pymongo.errors.PyMongoError, openai.APIError) as error:
+                print(f"Error with {target['collection_name']}: {error}")
+
+        if comparison_results:
+            print_comparison_table(comparison_results)
+
+    except Exception as error:
+        print(f"\nApp failed: {error}")
+        raise
+
+    finally:
+        print('\nClosing connections...')
+        if 'azure_openai_client' in locals():
+            azure_openai_client.close()
+        if 'credential' in locals():
+            credential.close()
+        if 'mongo_client' in locals():
+            mongo_client.close()
+        print('Connections closed')
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
os.getenv("MONGO_CLUSTER_NAME") + if not cluster_name: + raise ValueError( + "MONGO_CLUSTER_NAME environment variable is required.\n" + "Create a .env file based on .env.example or set it in your environment." + ) + + credential = DefaultAzureCredential() + + auth_properties = {"OIDC_CALLBACK": AzureIdentityTokenCallback(credential)} + + mongo_client = MongoClient( + f"mongodb+srv://{cluster_name}.mongocluster.cosmos.azure.com/", + # 120s connect timeout accommodates cold-start latency on DocumentDB clusters + connectTimeoutMS=120000, + tls=True, + retryWrites=False, + authMechanism="MONGODB-OIDC", + authMechanismProperties=auth_properties + ) + + azure_openai_endpoint = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if not azure_openai_endpoint: + raise ValueError( + "AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required.\n" + "Create a .env file based on .env.example or set it in your environment." + ) + + token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default") + + azure_openai_client = AzureOpenAI( + azure_endpoint=azure_openai_endpoint, + azure_ad_token_provider=token_provider, + # See Azure OpenAI API version lifecycle: + # https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation + api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION", "2024-10-21"), + timeout=30.0, + max_retries=3, + ) + + return mongo_client, azure_openai_client, credential + + +def read_file_return_json(file_path: str) -> list[dict[str, Any]]: + try: + with open(file_path, 'r', encoding='utf-8') as file: + return json.load(file) + except FileNotFoundError: + print(f"Error: File '{file_path}' not found") + raise + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in file '{file_path}': {e}") + raise + + +def insert_data(collection: Collection, data: list[dict[str, Any]], batch_size: int = 100) -> dict[str, int]: + """Insert documents using bulk_write in batches. + + batch_size defaults to 100 to stay within the DocumentDB 16 MB command + payload limit while keeping round-trip overhead reasonable. 
+ """ + total_documents = len(data) + inserted_count = 0 + failed_count = 0 + + print(f"Inserting {total_documents} documents in batches of {batch_size}...") + + for i in range(0, total_documents, batch_size): + batch = data[i:i + batch_size] + batch_num = (i // batch_size) + 1 + + try: + operations = [InsertOne(document) for document in batch] + result = collection.bulk_write(operations, ordered=False) + inserted_count += result.inserted_count + print(f"Batch {batch_num} completed: {result.inserted_count} documents inserted") + + except BulkWriteError as e: + inserted_count += e.details.get('nInserted', 0) + failed_count += len(batch) - e.details.get('nInserted', 0) + print(f"Batch {batch_num} had errors: {e.details.get('nInserted', 0)} inserted, {failed_count} failed") + + except Exception as e: + failed_count += len(batch) + print(f"Batch {batch_num} failed completely: {e}") + + return { + 'total': total_documents, + 'inserted': inserted_count, + 'failed': failed_count + } + + +def print_comparison_table(results: list[dict[str, Any]]) -> None: + if not results: + print("No comparison results to display.") + return + + print("\n" + "=" * 90) + print(" Vector Algorithm Comparison Results") + print("=" * 90) + + header = ( + f"{'Algorithm':<12} " + f"{'Similarity':<14} " + f"{'Top Result':<24} " + f"{'Score':<12} " + f"{'Latency(ms)':<14}" + ) + print(header) + print("-" * 90) + + for r in results: + top_result = r['search_results'][0] if r['search_results'] else None + if top_result: + doc = top_result.get('document', top_result) + top_name = doc.get('HotelName', 'N/A')[:22] + top_score = f"{top_result['score']:.4f}" + else: + top_name = 'N/A' + top_score = 'N/A' + + row = ( + f"{r['algorithm']:<12} " + f"{r['similarity']:<14} " + f"{top_name:<24} " + f"{top_score:<12} " + f"{r['latency_ms']:<14.0f}" + ) + print(row) + + print("=" * 90) + + for r in results: + print(f"\n--- {r['algorithm']} / {r['similarity']} ({r['collection_name']}) ---") + if not r['search_results']: + print(" No results.") + continue + + for i, item in enumerate(r['search_results'], 1): + doc = item.get('document', item) + score = item['score'] + print(f" {i}. {doc.get('HotelName', 'N/A')}, Score: {score:.4f}") + + print(f" Latency: {r['latency_ms']:.0f}ms")