diff --git a/.github/workflows/bloodhound_ops.yml b/.github/workflows/bloodhound_ops.yml new file mode 100644 index 0000000..841cb4e --- /dev/null +++ b/.github/workflows/bloodhound_ops.yml @@ -0,0 +1,343 @@ +name: Bloodhound Operations +run-name: Bloodhound Operations — ${{ github.event.inputs.mode }} + +# ------------------------------------------------------------ +# Manual Bloodhound operations workflow +# +# This workflow lets an operator manually run Bloodhound +# tasks from the GitHub Actions UI. +# +# Available operations: +# scan → run infrastructure scan immediately +# validation → check configuration +# status → quick system health check +# validate_scheduler → simulate scheduled EventBridge execution (debug only) +# +# NOTE: +# validate_scheduler is a controlled validation mode used to +# test the scheduled Lambda execution path without relying +# on cron or EventBridge triggers. +# ------------------------------------------------------------ + +permissions: + id-token: write + contents: read + + +# ------------------------------------------------------------ +# Concurrency Guard +# +# Prevents multiple Bloodhound scans running at the same time. +# +# Example: +# scheduled scan running +# operator triggers manual scan +# +# → GitHub will queue the second run instead of running both. +# ------------------------------------------------------------ +concurrency: + group: bloodhound-scan + cancel-in-progress: false + + +on: + + # ------------------------------------------------------- + # Manual trigger from GitHub Actions UI + # ------------------------------------------------------- + workflow_dispatch: + inputs: + mode: + description: "Bloodhound operation" + required: true + default: "status" + type: choice + options: + - scan + # ------------------------------------------------------- + # NOTE: + # Validation workflow is temporarily disabled in CI. 
+ # + # The validation pipeline provisions disposable Terraform + # infrastructure and performs controlled teardown tests. + # + # This workflow currently runs correctly in local + # environments but requires additional CI hardening + # (Terraform variable injection and account guard handling). + # + # The implementation remains in this workflow and will be + # re-enabled prior to GA once the CI validation pipeline + # is fully stabilized. + # + # To re-enable: + # simply add "validation" back to the options list. + # ------------------------------------------------------- + #- validation + - status + # ------------------------------------------------------- + # Scheduler Validation Mode + # + # validate_scheduler simulates an EventBridge scheduled + # invocation using: + # { "source": "scheduled" } + # + # This is used to validate the scheduled execution path + # before changes are promoted to main. + # + # This is NOT the production scheduler. + # + # The real scheduler is defined in: + # invoke_lambda.yml + # ------------------------------------------------------- + - validate_scheduler + + +jobs: + + invoke-lambda: + runs-on: ubuntu-latest + + steps: + + # ------------------------------------------------------- + # Pull repository contents + # ------------------------------------------------------- + - name: Checkout repository + uses: actions/checkout@v4 + + + # ------------------------------------------------------- + # Authenticate to AWS using GitHub OIDC + # + # This assumes the IAM role configured for GitHub + # so the workflow can invoke the Lambda securely. 
+ # ------------------------------------------------------- + - name: Configure AWS credentials via OIDC + uses: aws-actions/configure-aws-credentials@v5 + with: + role-to-assume: arn:aws:iam::388691194728:role/BloodhoundGitHubInvokeRole + aws-region: us-west-2 + + # Install Terraform for validation workflow + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: 1.6.6 + + + # ------------------------------------------------------- + # Invoke Bloodhound Lambda + # + # The selected operation from the UI is passed into + # the Lambda payload as JSON. + # + # For standard operations: + # + # scan → { "source": "scan" } + # status → { "source": "status" } + # + # Validation requires additional guard fields because + # the validation handler enforces strict safety checks. + # + # validation → + # { + # "source": "validation", + # "mode": "seek_destroy_validation", + # "target_ids": ["validation-instance"] + # } + # + # ------------------------------------------------------- + - name: Invoke Bloodhound Lambda operation + run: | + # fail fast on script errors + set -euo pipefail + + echo "::group::Bloodhound Lambda Invocation" + + # operation selected in UI + MODE="${{ github.event.inputs.mode }}" + + echo "" + echo "========================================" + echo "Bloodhound Operation: $MODE" + echo "========================================" + echo "" + + # ------------------------------------------------------- + # Validation Mode + # + # Run the full teardown validation workflow instead of + # manually constructing a Lambda payload. 
+ # + # The validation script handles: + # • Terraform test instance creation + # • Lambda invocation + # • teardown verification + # • safety checks + # ------------------------------------------------------- + if [ "$MODE" = "validation" ]; then + + echo "Running full teardown validation workflow" + + chmod +x tools/run_validation_workflow.sh + ./tools/run_validation_workflow.sh + + exit 0 + + fi + + + # ------------------------------------------------------- + # Standard Lambda operations (scan / status) + scheduler validation + # ------------------------------------------------------- + # validate_scheduler simulates EventBridge scheduled trigger + if [ "$MODE" = "validate_scheduler" ]; then + echo "===== VALIDATING SCHEDULER PATH =====" + echo "Simulating EventBridge scheduled event" + echo '{"source":"scheduled"}' > event.json + else + echo "{\"source\":\"$MODE\"}" > event.json + fi + + # show payload being sent to Lambda + cat event.json + + echo "" + echo "Invoking Bloodhound Lambda" + + # invoke Lambda + aws lambda invoke \ + --function-name BloodhoundLambdaV2 \ + --cli-binary-format raw-in-base64-out \ + --payload file://event.json \ + --log-type Tail \ + output.json \ + --region us-west-2 \ + > lambda_meta.json + + echo "::endgroup::" + + + # ------------------------------------------------------- + # Print Lambda response payload + # ------------------------------------------------------- + echo "::group::Lambda Response Payload" + cat output.json + echo "::endgroup::" + + + # ------------------------------------------------------- + # Print Bloodhound operational summary results returned from Lambda + # ------------------------------------------------------- + echo "::group::Bloodhound Summary" + + echo "Scan Summary:" + jq '.scan' output.json || true + + echo "" + echo "Budget Snapshot:" + jq '.budget' output.json || true + + echo "" + echo "Teardown Plan:" + jq '.teardown' output.json || true + + echo "::endgroup::" + + + # 
------------------------------------------------------- + # AWS invocation metadata + # + # Shows request ID, status code, and other + # Lambda invocation information. + # ------------------------------------------------------- + echo "::group::Lambda Invocation Metadata" + cat lambda_meta.json + echo "::endgroup::" + + + # ------------------------------------------------------- + # Decode Lambda logs returned in the invocation + # + # AWS embeds the last ~4KB of logs in base64. + # ------------------------------------------------------- + echo "::group::Decoded Lambda Logs" + + if jq -e '.LogResult' lambda_meta.json > /dev/null; then + cat lambda_meta.json | jq -r '.LogResult' | base64 -d + else + echo "No embedded logs returned from Lambda." + fi + + echo "::endgroup::" + + + # ------------------------------------------------------- + # Detect Lambda failures + # + # Ensures CI fails if Lambda execution fails. + # ------------------------------------------------------- + echo "::group::Lambda Error Check" + + if grep -q "FunctionError" lambda_meta.json; then + echo "ERROR: Lambda reported FunctionError" + exit 1 + fi + + if ! 
grep -q '"StatusCode": 200' lambda_meta.json; then + echo "ERROR: Lambda invocation returned non-200 status" + exit 1 + fi + + echo "Lambda invocation completed successfully" + + echo "::endgroup::" + + + # ------------------------------------------------------- + # Pull recent logs from CloudWatch + # ------------------------------------------------------- + - name: Fetch Lambda Logs + run: | + + echo "::group::Lambda CloudWatch Logs" + + LOG_GROUP="/aws/lambda/BloodhoundLambdaV2" + + # find most recent log stream + LOG_STREAM=$(aws logs describe-log-streams \ + --log-group-name $LOG_GROUP \ + --order-by LastEventTime \ + --descending \ + --limit 1 \ + --query 'logStreams[0].logStreamName' \ + --output text) + + echo "Latest log stream:" + echo $LOG_STREAM + + echo "" + echo "Recent log events:" + + aws logs get-log-events \ + --log-group-name $LOG_GROUP \ + --log-stream-name $LOG_STREAM \ + --limit 50 \ + --query 'events[*].message' \ + --output text + + echo "::endgroup::" + + # ------------------------------------------------------- + # Upload validation logs + # + # If validation mode ran, upload the generated + # validation logs as CI artifacts so engineers + # can download them from the workflow run page. + # ------------------------------------------------------- + - name: Upload validation logs + if: always() && github.event.inputs.mode == 'validation' + uses: actions/upload-artifact@v4 + with: + name: validation-logs + path: logs/validation \ No newline at end of file diff --git a/.github/workflows/invoke_lambda.yml b/.github/workflows/invoke_lambda.yml index 1cf329b..0e4c851 100644 --- a/.github/workflows/invoke_lambda.yml +++ b/.github/workflows/invoke_lambda.yml @@ -1,28 +1,253 @@ name: Invoke Bloodhound Lambda +run-name: Bloodhound Operations — ${{ github.event.inputs.mode }} + +# ------------------------------------------------------------ +# GitHub Actions workflow for invoking the Bloodhound Lambda. 
+# +# This workflow runs ONLY as a scheduled scanner. +# +# The purpose of this workflow is to execute the automated +# Bloodhound infrastructure scan on a fixed schedule. +# +# Manual operations (scan / validation / status) are handled +# by a separate workflow: bloodhound_ops.yml +# +# Lambda behavior is determined by: +# event["source"] +# +# Example payload used here: +# { "source": "scheduled" } + +# ------------------------------------------------------------ + +permissions: + id-token: write + contents: read + +# Prevent multiple scans running at the same time +concurrency: + group: bloodhound-scan + cancel-in-progress: false on: + + # ---------------------------------------------------------- + # Scheduled runs (UTC) + # + # NOTE: + # GitHub cron jobs only run from the repository DEFAULT + # branch (usually "main"). If this workflow exists only + # in another branch, the schedule will NOT trigger. + # + # Bloodhound executes twice per day: + # + # 16:00 UTC → 11 AM EST + # 04:00 UTC → 11 PM EST + # ---------------------------------------------------------- schedule: - # Run at 11 AM and 11 PM EST (4 PM and 4 AM UTC) - cron: '0 16,4 * * *' - workflow_dispatch: jobs: + + # GitHub runner job that invokes the Lambda invoke-lambda: runs-on: ubuntu-latest steps: + + # Temporary debug step to see what GitHub sends in the OIDC token context + - name: Print GitHub OIDC context + run: | + echo "Repository: $GITHUB_REPOSITORY" + echo "Ref: $GITHUB_REF" + echo "Ref Name: $GITHUB_REF_NAME" + echo "Event: $GITHUB_EVENT_NAME" + + # Pull repository contents - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - - name: Set up AWS CLI - uses: aws-actions/configure-aws-credentials@v1 + # Configure AWS credentials via GitHub OIDC + - name: Configure AWS credentials via OIDC + uses: aws-actions/configure-aws-credentials@v5 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + 
role-to-assume: arn:aws:iam::388691194728:role/BloodhoundGitHubInvokeRole aws-region: us-west-2 + # Invoke Bloodhound Lambda - name: Invoke Bloodhound Lambda function run: | - echo '{}' > test_event.json - aws lambda invoke --function-name BloodhoundLambda --payload file://test_event.json output.txt --region us-west-2 - cat output.txt + + set -euo pipefail # fail fast: exit on errors, undefined vars, or pipeline failures + + # ------------------------------------------------------- + # Group: Invocation Setup + # ------------------------------------------------------- + echo "::group::Bloodhound Lambda Invocation" + + echo "Building Lambda event payload" + + # ------------------------------------------------------- + # Build Lambda event payload + # + # This workflow always runs as part of the scheduled + # infrastructure scan. + # + # Because manual execution has been moved to a separate + # workflow (bloodhound_ops.yml), we no longer need to + # inspect GitHub workflow inputs. + # + # The Lambda event payload is therefore deterministic: + # + # { "source": "scheduled" } + # + # This ensures the Lambda pipeline runs in automated + # scan mode every time the cron job executes. 
+ # ------------------------------------------------------- + + echo '{"source":"scheduled"}' > event.json + cat event.json + + echo "" + echo "Invoking Bloodhound Lambda" + + # Invoke Lambda + # output.json -> Lambda response payload + # lambda_meta.json -> AWS CLI invocation metadata + aws lambda invoke \ + --function-name BloodhoundLambdaV2 \ + --cli-binary-format raw-in-base64-out \ + --payload file://event.json \ + --log-type Tail \ + output.json \ + --region us-west-2 \ + > lambda_meta.json + + echo "::endgroup::" + + + # ------------------------------------------------------- + # Group: Lambda Response + # ------------------------------------------------------- + echo "::group::Lambda Response Payload" + cat output.json + echo "::endgroup::" + + + # ------------------------------------------------------- + # Group: Bloodhound Summary + # + # This prints a concise operational summary extracted + # from the Lambda response so CI logs clearly show what + # Bloodhound discovered during the scan. + # + # These values come directly from the structured response + # returned by the Bloodhound pipeline. + # + # Sections displayed: + # • scan → resource discovery summary + # • budget → cost projection snapshot + # • teardown → planned/allowed cleanup actions + # + # This improves observability without failing the CI job. 
+ # ------------------------------------------------------- + echo "::group::Bloodhound Summary" + + echo "Scan Summary:" + jq '.scan' output.json + + echo "" + echo "Budget Snapshot:" + jq '.budget' output.json + + echo "" + echo "Teardown Plan:" + jq '.teardown' output.json + + echo "::endgroup::" + + + # ------------------------------------------------------- + # Group: Invocation Metadata + # ------------------------------------------------------- + echo "::group::Lambda Invocation Metadata" + cat lambda_meta.json + echo "::endgroup::" + + + # ------------------------------------------------------- + # Group: Decoded Lambda Logs + # + # When --log-type Tail is used, AWS embeds the last 4 KB + # of Lambda logs inside the LogResult field of the invoke + # response. The logs are base64 encoded. + # + # This step decodes them so they appear directly in the + # GitHub Actions output for easier debugging. + # ------------------------------------------------------- + echo "::group::Decoded Lambda Logs" + + if jq -e '.LogResult' lambda_meta.json > /dev/null; then + cat lambda_meta.json | jq -r '.LogResult' | base64 -d + else + echo "No embedded logs returned from Lambda." + fi + + echo "::endgroup::" + + + # ------------------------------------------------------- + # Group: Lambda Failure Detection + # + # Prevents CI pipelines from silently succeeding if + # Lambda execution fails. + # ------------------------------------------------------- + echo "::group::Lambda Error Check" + + # Detect runtime Lambda errors + if grep -q "FunctionError" lambda_meta.json; then + echo "ERROR: Lambda reported FunctionError" + exit 1 + fi + + # Detect unexpected status codes + if ! 
grep -q '"StatusCode": 200' lambda_meta.json; then + echo "ERROR: Lambda invocation returned non-200 status" + exit 1 + fi + + echo "Lambda invocation completed successfully" + echo "===== Invocation Complete =====" + + echo "::endgroup::" + + - name: Fetch Lambda Logs + run: | + + echo "::group::Lambda CloudWatch Logs" + + LOG_GROUP="/aws/lambda/BloodhoundLambdaV2" + + # Find latest log stream + LOG_STREAM=$(aws logs describe-log-streams \ + --log-group-name $LOG_GROUP \ + --order-by LastEventTime \ + --descending \ + --limit 1 \ + --query 'logStreams[0].logStreamName' \ + --output text) + + echo "Latest log stream:" + echo $LOG_STREAM + + echo "" + echo "Recent log events:" + + aws logs get-log-events \ + --log-group-name $LOG_GROUP \ + --log-stream-name $LOG_STREAM \ + --limit 50 \ + --query 'events[*].message' \ + --output text + + echo "::endgroup::" \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2762ee0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,92 @@ +# ========================================================= +# Python +# ========================================================= +__pycache__/ +*.py[cod] +*.pyo +*.pyd +.Python +*.egg-info/ +.eggs/ +dist/ +build/ +pip-wheel-metadata/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ + +# ========================================================= +# Virtual Environments +# ========================================================= +.venv/ +venv/ +ENV/ +myenv/ + +# ========================================================= +# Environment Files / Secrets +# ========================================================= +.env +.env.* +**/.env +**/.env.* + +# ========================================================= +# Project Build Artifacts +# ========================================================= +.build/ +**/.build/ +*.zip + +# ========================================================= +# Logs +# ========================================================= +*.log + +# 
========================================================= +# OS / Editor Files +# ========================================================= +.DS_Store +Thumbs.db +.idea/ +.vscode/ + +# ========================================================= +# Terraform +# ========================================================= +.terraform/ +**/.terraform/ +infra/terraform.tfvars + +# Terraform state files (NEVER commit) +*.tfstate +*.tfstate.* + +# Terraform variable overrides (often contain secrets) +*.tfvars +*.tfvars.json + +# Terraform crash logs +crash.log + +# Terraform CLI config files +.terraformrc +terraform.rc + +# ========================================================= +# GitHub Actions (temporarily disabled for safety) +# ========================================================= +#/.github/workflows/ + +# Lambda local test output +output.txt + +# Ignore Validation logs (these can be large and are not needed in version control) +logs/validation/*.log + +# However, we want to keep the validation history log, which tracks past validation runs and their results. +!logs/validation_history.log + +# Bootstrap script artifacts +trust-policy.json +lambda-policy.json \ No newline at end of file diff --git a/FEATURES.md b/FEATURES.md new file mode 100644 index 0000000..d865856 --- /dev/null +++ b/FEATURES.md @@ -0,0 +1,305 @@ +# Bloodhound V2 — Features + +This document summarizes the major capabilities of the Bloodhound +cloud cost monitoring and automated cleanup system. + +--- + +## Interactive Demo Guide + +A step-by-step operational walkthrough of Bloodhound is available in: + +docs/quick_demo.md + +The demo guide shows how engineers interact with Bloodhound using: + +1. Local execution +2. Slack scan commands +3. Teardown planning +4. Controlled teardown execution +5. Scheduled GitHub Actions scans +6. Manual GitHub Actions operations +7. CloudWatch log inspection +8. 
Automated teardown validation workflow + +The guide includes real screenshots and validation artifacts. + +--- + +## Core Capabilities + +Bloodhound scans AWS infrastructure to identify unused resources +and optionally remove them to control cloud costs. + +Key capabilities include: + +- multi-region AWS resource scanning +- cost monitoring using AWS Cost Explorer +- Slack-based reporting and operations +- automated teardown planning +- controlled resource deletion +- infrastructure validation workflows + +--- + +## Resource Scanning + +Bloodhound scans the following AWS resources: + +- EC2 instances +- EBS volumes +- Elastic IPs +- NAT Gateways +- RDS instances +- ELBv2 load balancers + +The scanner automatically evaluates multiple AWS regions. + +--- + +## Cost Monitoring + +Bloodhound integrates with AWS Cost Explorer to calculate: + +- cohort-to-date spend +- projected monthly cost +- dynamic monthly allowance + +Budget alerts can be sent to Slack when spending exceeds thresholds. + +--- + +## Teardown Planning + +Bloodhound generates a **teardown plan** showing resources that +could be deleted to reduce cost. + +This plan is always generated before any destructive actions occur. + +--- + +## Controlled Resource Deletion + +Deletion is optional and disabled by default. + +Multiple safety controls protect against accidental deletion: + +- dry-run mode +- simulation mode +- explicit confirmation tokens +- optional allowlists +- resource tagging protection + +--- + +## Slack Integration + +Bloodhound supports the following Slack commands: + +/v2_seek +/v2_seek_destroy_plan +/v2_seek_destroy CONFIRM +/v2_status + +Slack provides operational visibility and safe control of teardown actions. + +--- + +## GitHub Automation + +Bloodhound includes GitHub Actions workflows that automate +infrastructure scanning and operational control. 
+ +Two workflows are included in the repository: + +### Scheduled Scan Workflow + +File: + +.github/workflows/invoke_lambda.yml + +This workflow runs automatically on a fixed schedule and performs +regular AWS infrastructure scans. + +Schedule: + +16:00 UTC → 11 AM EST +04:00 UTC → 11 PM EST + +Authentication and execution flow: + +```text +GitHub Actions + ↓ +OIDC Authentication + ↓ +AWS STS AssumeRoleWithWebIdentity + ↓ +BloodhoundGitHubInvokeRole + ↓ +BloodhoundLambdaV2 + ↓ +AWS Infrastructure Scan + Cleanup +``` + +The workflow: + +- authenticates to AWS using GitHub OIDC +- assumes the `BloodhoundGitHubInvokeRole` +- invokes the `BloodhoundLambdaV2` function +- prints scan summaries and recent CloudWatch logs + +The Lambda event payload used for scheduled scans is: + +```json +{ "source": "scheduled" } +``` + +### Manual Operations Workflow + +File: + +`.github/workflows/bloodhound_ops.yml` + +This workflow allows engineers to manually run Bloodhound +operations from the GitHub Actions UI. + +Supported modes: +- scan — run an immediate infrastructure scan +- status — return system health information +- validation — implemented in the workflow but currently disabled in the GitHub Actions UI + +Validation mode is currently disabled in CI but remains available +for local testing. + +## Automated Validation Workflow + +Bloodhound includes automated validation workflows that verify: + +- Lambda deployment +- infrastructure scanning +- teardown logic +- controlled deletion behavior + +Validation uses disposable test resources to ensure safe testing. + +Validation can be executed locally using: + +`tools/run_validation_workflow.sh` + +This script orchestrates infrastructure smoke tests and controlled +teardown validation using disposable AWS resources. 
+ +--- + +## Safety Architecture + +Bloodhound implements layered safety protections: + +- destructive actions disabled by default +- explicit confirmation required +- resource whitelist tagging +- Terraform deployment guards +- Lambda version rollback capability + +These safeguards ensure the system cannot accidentally delete +production resources. + +--- + +## Deployment Architecture + +Bloodhound runs as an AWS Lambda function deployed using Terraform. + +Key components: + +- AWS Lambda +- CloudWatch Logs +- Slack integration +- Terraform infrastructure management +- GitHub Actions automation +- GitHub OIDC authentication for AWS access +- validation automation scripts +- deterministic Lambda event routing + +## Deterministic Lambda Build Pipeline + +Bloodhound uses a deterministic Docker-based build pipeline to +construct the Lambda deployment artifact. + +Instead of building dependencies directly on the host machine, +Terraform invokes a build script that performs the packaging +inside a Docker container that mirrors the AWS Lambda runtime. + +Build process: + +1. Terraform triggers the Lambda build script +2. The script launches the AWS Lambda runtime container +3. Python dependencies are installed from `requirements.txt` +4. Bloodhound application source code is copied into the package +5. Terraform archives the package into the Lambda deployment artifact +6. 
The Lambda function is updated with the new version + +Build flow: + +Terraform +↓ +build_lambda.sh +↓ +Docker (Amazon Lambda runtime container) +↓ +.build/lambda_pkg +↓ +archive_file +↓ +Lambda deployment artifact +↓ +AWS Lambda version publish + + +### Benefits of this approach: + +- guarantees dependency compatibility with the AWS Lambda runtime +- produces deterministic and reproducible builds +- prevents environment-specific packaging issues +- separates infrastructure management from packaging logic +- simplifies future CI/CD integration +- allows users to force Lambda rebuilds when packaging logic changes + +Docker builds are used by default to ensure production-safe and +runtime-compatible Lambda artifacts. + +If the build pipeline or packaging script changes, users can force +Terraform to rebuild the Lambda package using: + +```bash +terraform apply -replace=terraform_data.build_lambda_pkg +``` + +## Lambda Event Routing + +The Bloodhound Lambda entrypoint routes events through +deterministic execution paths to prevent recursion and +ensure predictable behavior. + +Event types: + +- Slack HTTP events → Slack command handler +- Scheduled events → dedicated scheduled handler +- Default events → main execution pipeline + +Example routing: + +```text +Lambda handler +↓ +event routing (Slack / scheduled / default) +↓ +scheduled_handler → run_scheduled_scan() +or +bloodhound.app.run() +``` + +This architecture prevents recursive execution loops +and ensures scheduled runs do not re-enter the main +execution pipeline. diff --git a/README.md b/README.md index 0e09461..b85521b 100644 --- a/README.md +++ b/README.md @@ -1,286 +1,1034 @@ -# Bloodhound Lambda Function +# Bloodhound v2 (AWS resource scanner + Slack alerts + optional teardown) + +## 📌 Quick Overview + +Engineers working on Lambda packaging or Terraform deployment should +review `docs/lambda_packaging.md` before modifying the build pipeline. + +## 🚀 Getting Started + +New to Bloodhound?
+ +Start with these documents: + +📊 **Architecture Overview** +→ [docs/architecture_overview.md](docs/architecture_overview.md) + +🎬 **Quick Demo Guide (Slack, GitHub Actions, Validation)** +→ [docs/quick_demo.md](docs/quick_demo.md) + +📘 **Feature Overview** +→ [FEATURES.md](FEATURES.md) + +These documents explain how Bloodhound works, how to operate it, and how +to run common workflows. + +For deeper engineering documentation: + +📚 [docs/](docs/) + +## Architecture Summary + +Bloodhound uses a single Lambda entrypoint with deterministic +event routing to support multiple execution paths: + +Slack commands → Slack handler +Scheduled events → scheduled handler +Validation events → validation handler +Manual operations → main pipeline + +All executions emit structured CloudWatch logs: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +This allows every invocation to be traced end-to-end. ## Table of Contents -- [Overview](#overview) -- [Prerequisites](#prerequisites) -- [Setup Steps](#setup-steps) - - [1. Create a Slack App](#1-create-a-slack-app) - - [2. Add Bot Permissions](#2-add-bot-permissions) - - [3. Get the Slack Channel ID](#3-get-the-slack-channel-id) - - [4. Invite the Bot to the Channel](#4-invite-the-bot-to-the-channel) - - [5. Prepare the Lambda Function Code](#5-prepare-the-lambda-function-code) - - [6. Package and Deploy the Lambda Function](#6-package-and-deploy-the-lambda-function) - - [7. Create an IAM Role for Lambda](#7-create-an-iam-role-for-lambda) - - [8. Set Up Environment Variables in AWS Lambda](#8-set-up-environment-variables-in-aws-lambda) - - [9. Test the Lambda Function](#9-test-the-lambda-function) - - [10. 
GitHub Actions Setup](#10-github-actions-setup) -- [Conclusion](#conclusion) +- [Stop — Read This Before Running Bloodhound](#️-stop--read-this-before-running-bloodhound) +- [Documentation](#documentation) +- [Quick Demo Guide](docs/quick_demo.md) +- [Requirements](#requirements) +- [Local Setup and Testing](#local-setup--testing) +- [Dependency Management](#dependency-management) +- [Configure .env](#configure-env) +- [Run Locally](#run-locally) +- [Whitelisting](#whitelisting) +- [Teardown Controls](#teardown-controls-important) +- [Build the Lambda Deployment Zip](#build-the-lambda-deployment-zip-v2) +- [Deploy to AWS Lambda](#deploy-to-aws-lambda-v2) +- [Terraform Deployment Workflow](#terraform-deployment-workflow) +- [Slack Slash Commands](#slack-slash-commands-v2) +- [Validation Scripts](#validation-scripts) +- [GitHub Automation](#️-github-automation) + - [Scheduled Scan Workflow](#scheduled-scan-workflow) + - [Manual Operations Workflow](#manual-operations-workflow) +- [GitHub OIDC Authentication Bootstrap](#github-oidc-authentication-bootstrap) +- [Bootstrap Script](#bootstrap-script) +- [Run the bootstrap script](#run-the-bootstrap-script) +- [Trust Policy](#trust-policy) +- [Security Notes](#security-notes) + +Bloodhound v2 scans selected AWS regions for common cost-leak resources, posts results to Slack, and can optionally delete resources that are **not** whitelisted. + + +- Clone this repo +- Configure `.env` for local testing +- Terraform automatically builds the Lambda deployment zip locally (the `.build/` directory is not committed) +- Configure Lambda env vars to match your `.env` + +If you need to create a Slack bot from scratch, see `docs/SLACK_SETUP.md`. 
+ +### Project docs: + +- v2 plan: `docs/bloodhound_v2_plan.md` +- Lambda packaging and dependency strategy: `docs/lambda_packaging.md` +- Terraform troubleshooting: `docs/troubleshooting_terraform_lambda.md` + +![AWS Architecture Diagram (v2)](assets/bloodhound_lambda_architecture_v2.svg) + + +--- + +### Lambda Packaging Pipeline -## Overview +Bloodhound builds the Lambda deployment package automatically during +`terraform apply`. -The Bloodhound Lambda function is designed to scan AWS regions for EC2 and RDS instances and post a summary to a Slack channel. This README provides detailed steps to set up, deploy, and test the Lambda function, including Slack integration. +Terraform invokes the build script: -![AWS Architecture Diagram](assets/bloodhound_lambda_architecture.png) +`scripts/build_lambda.sh` -## Prerequisites +The script prepares the Lambda package directory: -- AWS account with permissions to create IAM roles, Lambda functions, and access to EC2 and RDS. -- Slack workspace with permission to create and install Slack apps. -- Python 3.8 installed locally for testing and packaging the Lambda function. +`.build/lambda_pkg` -## Setup Steps +Terraform then archives the package and deploys the Lambda. -### 1. Create a Slack App +```text +terraform apply + ↓ +terraform_data.build_lambda_pkg + ↓ +scripts/build_lambda.sh + ↓ +.build/lambda_pkg + ↓ +archive_file + ↓ +.build/bloodhound_lambda_v2.zip + ↓ +Lambda deployment +``` + +For detailed build pipeline documentation see: + +infra/README.md +docs/lambda_packaging.md + +## ⚠️ Safety Notice — Read Before Running Bloodhound + +Bloodhound can delete AWS infrastructure when `APPLY_CHANGES=true`.
+ +Before running validation scripts or enabling destructive mode, review: + +📘 [Bloodhound v2 Configuration Guide](docs/configuration_system.md) + +This document explains: + +- teardown mode configuration +- deletion safety limits +- AWS account validation guards +- Terraform deployment protections +- Bloodhound safety architecture + +--- + +## Documentation + +Configuration and safety model: + +📘 [docs/configuration_system.md](docs/configuration_system.md) + +Slack command validation: + +📘 [docs/slack_and_lambda_validation.md](docs/slack_and_lambda_validation.md) -1. Go to the Slack API website: [Slack API: Applications](https://api.slack.com/apps). -2. Click on "Create New App". -3. Choose "From scratch". -4. Give your app a name and select the Slack workspace where you have permissions to install the app. -5. Click "Create App". +Controlled teardown validation: + +📘 [docs/validate_teardown.md](docs/validate_teardown.md) + +System architecture: + +📘 [docs/bloodhound_v2_plan.md](docs/bloodhound_v2_plan.md) + +--- -### 2. Add Bot Permissions +## Requirements -1. In your Slack app settings, go to "OAuth & Permissions". -2. Under "OAuth Tokens & Redirect URLs", scroll down to "Scopes". -3. Add the following bot token scopes: - - `chat:write`: To post messages in channels. - - `channels:read`: To read information about channels. - - `groups:read`: To read information about private channels. -4. At the top of the "OAuth & Permissions" page, click "Install App to Workspace". -5. Review the permissions and click "Allow". -6. Copy the OAuth Access Token; you'll need this as the `SLACK_BOT_TOKEN`. +- Python 3.10+ for local dev (or match your Lambda runtime) +- AWS CLI configured (use `AWS_PROFILE=...` as needed). To set it up the first time: `aws configure --profile ` (or `aws configure` for the default profile). 
To see what profiles you have: `aws configure list-profiles`; your local config/creds live in `~/.aws/config` and `~/.aws/credentials` (view with `cat ~/.aws/config` and `cat ~/.aws/credentials`). +- Slack bot token and channel IDs -### 3. Get the Slack Channel ID +--- -1. Open Slack and navigate to the channel where you want the bot to post messages. -2. Click on the channel name to open the channel details. -3. Copy the channel ID from the URL or the channel details pane. The channel ID starts with `C` for public channels or `G` for private channels. +## Local setup + testing -### 4. Invite the Bot to the Channel +### Create a venv and install dependencies -1. In Slack, go to the channel where you want the bot to post messages. -2. Type `/invite @your-bot-name` to invite the bot to the channel. Replace `your-bot-name` with the actual name of your bot. +From this directory: -### 5. Prepare the Lambda Function Code +```bash +python3 -m venv .venv +.venv/bin/python -m pip install --upgrade pip -1. Create a directory for your Lambda function code: +# Runtime dependencies (used by Lambda) +.venv/bin/python -m pip install -r requirements.txt -```sh -mkdir bloodhound_lambda -cd bloodhound_lambda +# Development dependencies (used for local testing) +.venv/bin/python -m pip install -r requirements-dev.txt ``` +### Dependency Management + +Bloodhound separates Lambda runtime dependencies from local development +dependencies. This keeps the Lambda deployment package small and avoids +dependency conflicts during Terraform builds. + +Dependency files: -2. 
Create a file named `lambda_function.py` and add the following code: - -```python -import boto3 -import os -from slack_sdk import WebClient -from slack_sdk.errors import SlackApiError - -# Adjust based on the regions you want to scan -STUDENT_REGIONS = [ - "us-east-1", - "us-east-2", - "us-west-1", - "us-west-2", -] - -def create_session(region): - return boto3.Session(region_name=region) - -def search_regions_for_rds_resources(session): - rds_instances = [] - print(f"Sniffing out rds resources in {session.region_name}...") - rds = session.client("rds") - response = rds.describe_db_instances() - for dbinstance in response["DBInstances"]: - rds_instances.append(dbinstance["DBInstanceIdentifier"]) - return {"rds": rds_instances} - -def search_regions_for_ec2_resources(session): - ec2_instances = [] - print(f"Sniffing out ec2 resources in {session.region_name}...") - ec2 = session.client("ec2") - response = ec2.describe_instances() - try: - if len(response["Reservations"]) == 1: - for ec2instance in response["Reservations"][0]["Instances"]: - if ec2instance["State"]["Name"] in ("stopped", "terminated"): - continue - ec2_instances.append(ec2instance["InstanceId"]) - else: - for ec2instance in response["Reservations"]: - if ec2instance["Instances"][0]["State"]["Name"] in ("stopped", "terminated"): - continue - ec2_instances.append(ec2instance["Instances"][0]["InstanceId"]) - except IndexError: - return {"ec2": []} - return {"ec2": ec2_instances} - -def format_message(resources): - message = ":dog2: Woof! Woof! :dog2:\n Bloodhound found the following resources in use: \n" - for region in resources.keys(): - if len(resources[region]["ec2"]) == 0 and len(resources[region]["rds"]) == 0: - continue - message += f'- *{region.upper()}*: {len(resources[region]["ec2"])} ec2 instances, {len(resources[region]["rds"])} rds instances\n' - message += f"Please stop or terminate all unneeded resources!" 
- return message - -def send_slack_message(message): - try: - slack_bot_token = os.environ.get("SLACK_BOT_TOKEN") - channel_id = os.environ.get("CHANNEL_ID") - - client = WebClient(token=slack_bot_token) - response = client.chat_postMessage( - channel=channel_id, - text=message - ) - print(f"Slack message response: {response}") - except SlackApiError as e: - print(f"Error sending message to Slack: {e.response['error']}") - -def lambda_handler(event, context): - resources_in_regions = {} - print(f"Going hunting in regions {STUDENT_REGIONS}") - for region in STUDENT_REGIONS: - session = create_session(region) - resources_in_regions[region] = search_regions_for_ec2_resources(session) - resources_in_regions[region].update(search_regions_for_rds_resources(session)) - message = format_message(resources_in_regions) - print(message) - send_slack_message(message) - return message +```md + +File Purpose +--------------------- -------------------------------------------------------- +requirements.txt Dependencies packaged into the Lambda deployment. + +requirements-dev.txt Dependencies used only for local development and + testing. These are not included in the Lambda package. ``` -3. Create a `requirements.txt` file with the following content: +AWS Lambda already provides several AWS SDK libraries in the runtime +environment (including boto3 and botocore). Because of this, these libraries +are not bundled into the Lambda deployment package. + +For a detailed explanation of the Lambda packaging architecture, +dependency rules, and build pipeline, see: + +`docs/lambda_packaging.md` + +AWS Lambda already includes several AWS SDK libraries in the runtime environment, including: -```text -slack_sdk boto3 + +botocore + +s3transfer + +jmespath + +Because of this, these libraries are not included in the Lambda deployment package, but they may still be installed locally through requirements-dev.txt. 
+ +This keeps the Lambda package smaller and avoids dependency conflicts during Terraform builds. + +See docs/lambda_packaging.md for a deeper explanation of the packaging strategy. + +### Configure `.env` + +Create `.env` from `env.example` and fill it in: + +```bash +cp env.example .env +``` + +Then edit `.env` and replace the placeholder values. + +At minimum you must configure the following values: + +Variable Source +- SLACK_BOT_TOKEN Slack App → OAuth & Permissions → Bot User OAuth Token +- SLACK_SIGNING_SECRET Slack App → Basic Information → Signing Secret +- SLACK_SCAN_CHANNEL_ID Slack channel ID where scan summaries will post +- SLACK_ALERT_CHANNEL_ID Slack channel ID where alerts/teardown notices will post + +Example Slack channel link: + +https://workspace.slack.com/archives/C000Y0V0HNY + +Channel ID: + +C000Y0V0HNY + +Bloodhound v2 automatically loads .env for local runs. + +When deploying to AWS Lambda, these same variables must be configured in: + +Lambda → Configuration → Environment Variables +If you need to create or configure the Slack app, see +`docs/SLACK_SETUP.md` for the full setup guide. +### Environment loading +Bloodhound v2 automatically loads `.env` for local runs. + +### Run locally + +Run the following commands from the repository root: + +**Recommended (module execution):** + +```bash +AWS_PROFILE=<your-profile> python -m tools.run_local ``` -### 6. Package and Deploy the Lambda Function +If your virtual environment is activated: -1. Install the dependencies and create a deployment package: +`AWS_PROFILE=geekstar python -m tools.run_local` -```sh -pip install -r requirements.txt -t . -zip -r9 ../bloodhound_lambda.zip . +If you prefer using the virtual environment interpreter explicitly: + +`AWS_PROFILE=<your-profile> .venv/bin/python -m tools.run_local` + +Note: +The runner must be executed as a module (python -m tools.run_local). +Running python tools/run_local.py may fail with ModuleNotFoundError +because the project uses absolute package imports (from bloodhound...), which only resolve when the repository root is on Python's import path.
+ +**Legacy script execution (may work in some environments):** + +```bash +# Choose the AWS profile you want to test with: +AWS_PROFILE=<your-profile> .venv/bin/python tools/run_local.py +AWS_PROFILE=geekstar .venv/bin/python tools/run_local.py ``` -2. Create the Lambda function using the AWS CLI: +AWS Credentials + +Bloodhound uses boto3, which follows the standard AWS credential resolution chain. + +If AWS_PROFILE is not specified, boto3 will automatically use the default profile from: -```sh -aws lambda create-function --function-name BloodhoundLambda \ ---zip-file fileb://bloodhound_lambda.zip --handler lambda_function.lambda_handler --runtime python3.8 \ ---role arn:aws:iam:::role/BloodhoundLambdaRole --region us-west-2 +`~/.aws/credentials` + +You can verify which AWS account your local run will use with: + +`aws sts get-caller-identity` + +Example output: +```json +{ + "UserId": "...", + "Account": "123456789012", + "Arn": "arn:aws:iam::123456789012:user/..." +} ``` -Replace `` with your actual AWS account ID. +This helps confirm you are scanning the expected AWS account before running Bloodhound. +--- -### 7. Create an IAM Role for Lambda +## Whitelisting -1. Go to the [IAM console](https://console.aws.amazon.com/iam/). -2. Create a new role: - - Choose the "Lambda" service. - - Attach the following policies: - - `AWSLambdaBasicExecutionRole` - - `AmazonEC2ReadOnlyAccess` - - `AmazonRDSReadOnlyAccess` -3. Note the ARN of the created role. +Resources tagged with: -### 8. Set Up Environment Variables in AWS Lambda +- key: `bloodhound:keep` +- value: `true` -1. Navigate to the [AWS Lambda Console](https://console.aws.amazon.com/lambda/). -2. Select your BloodhoundLambda function. -3. Go to the "Configuration" tab and then "Environment variables". -4. Add the following environment variables: - - `SLACK_BOT_TOKEN`: Your Slack bot token. - - `CHANNEL_ID`: Your Slack channel ID. -5. Save the changes. +are treated as **kept (whitelisted)** and are excluded from teardown.
+ +--- + +## Teardown controls (important) + +By default, Bloodhound runs in **dry-run mode** and only posts +a teardown plan: + +- `APPLY_CHANGES=false` + +To allow Bloodhound to delete resources: + +- `APPLY_CHANGES=true` + +### Deletion safety limit + +Bloodhound includes a protection that limits how many resources can +be deleted in a single run. + +Environment variable: + +TEARDOWN_MAX_DELETE_COUNT + +Default value: + +5 + +If a teardown plan contains more resources than this limit, +Bloodhound will abort execution and refuse to delete anything. + +Example: + +Plan: 25 resources +Limit: 10 + +Result: + +Teardown aborted. + +This protection prevents unexpected scanning behavior or configuration +errors from deleting large amounts of infrastructure in a single run. + +### Runtime Safety Guards + +Bloodhound includes several runtime safeguards: + +- **simulate (no deletes)**: `TEARDOWN_SIMULATE=true` +- **only delete explicit IDs/ARNs**: set `TEARDOWN_TARGET_IDS=...` +- **delete everything not whitelisted**: `TEARDOWN_ALLOW_ALL=true` + +### Infrastructure safety guard + +Terraform includes an additional **deployment safety guard**. + +If the environment variable contains: + +APPLY_CHANGES=true + +Terraform will **block the deployment** unless the engineer +explicitly confirms destructive mode. + +Example error: + +Deployment blocked: APPLY_CHANGES=true requires -var allow_apply_mode=true + +To intentionally deploy Bloodhound with destructive mode enabled: + +terraform apply -var allow_apply_mode=true + +This prevents accidental infrastructure deletion caused by +misconfigured environment variables or commits. + +--- + +## Build the Lambda deployment zip (v2) + +The `.build/` directory is intentionally not committed to the repository. + +Terraform automatically constructs the Lambda deployment package during +`terraform apply`. 
+ +Packaging flow: + +```text +terraform apply + ↓ +terraform_data.build_lambda_pkg + ↓ +scripts/build_lambda.sh + ↓ +.build/ + deps/ cached Python dependencies + src/ copied application source + lambda_pkg/ final Lambda package + ↓ +archive_file + ↓ +.build/bloodhound_lambda_v2.zip + ↓ +Lambda deployment +``` + +This layered build system allows Terraform to rebuild the Lambda package +quickly when application code changes while avoiding unnecessary dependency +reinstallation. + +For a deeper explanation of the packaging architecture see: + +`docs/lambda_packaging.md` + +### Forcing a Lambda rebuild + +Terraform only rebuilds the Lambda package when runtime source code +changes are detected. + +If you modify packaging logic or the build script, you may need to +force Terraform to rebuild the Lambda package. + +Run: + +```bash +terraform apply -replace=terraform_data.build_lambda_pkg +``` +This forces Terraform to rerun the build step and recreate the +Lambda deployment package. +--- + +## Deploy to AWS Lambda (v2) + +Deploy to a new function name so you do not touch your existing v1 Lambda: + +- Function name: `BloodhoundLambdaV2` + +Handler: `handlers.lambda_function.lambda_handler` + +The Lambda handler delegates execution to the Bloodhound +orchestration entrypoint: + +`bloodhound.app.run()` + +The `run()` function prepares the runtime environment for non-scheduled +invocations (Slack commands and validation harnesses) and then executes +the core pipeline. + +Scheduled executions are handled separately via +`scheduled_handler → run_scheduled_scan()` to prevent recursion. + +### ⚠️ Scheduled Execution + +Scheduled events do not pass through `run()`. + +They are routed to a dedicated execution path: + +scheduled_handler → run_scheduled_scan() → execute_pipeline() + +This prevents recursive execution and ensures deterministic behavior. 
+ +Execution flow: + +```text +Lambda handler + ↓ +event routing (Slack / validation / scheduled) + ↓ + +Scheduled: + scheduled_handler → run_scheduled_scan() + +Default / Slack / validation: + bloodhound.app.run() + + ↓ +execute_pipeline() + ↓ +scan → budget → teardown → reporting +``` + + +### Configure Lambda environment variables + +In Lambda Console → **Configuration → Environment variables**, copy the values from your local `.env`. + +At minimum: + +- Slack: `SLACK_BOT_TOKEN`, `SLACK_SCAN_CHANNEL_ID`, `SLACK_ALERT_CHANNEL_ID` +- Regions: `REGION_MODE`, `REGIONS` +- Budget: `COHORT_START_YYYY_MM`, `COHORT_TOTAL_BUDGET_USD`, `COHORT_LENGTH_MONTHS`, `BUDGET_OVER_DAYS` +- Teardown: `APPLY_CHANGES`, `TEARDOWN_SIMULATE`, `TEARDOWN_ALLOW_ALL`, `TEARDOWN_TARGET_IDS` + +### Test the Lambda via AWS CLI + +```bash +aws lambda invoke \ + --function-name BloodhoundLambdaV2 \ + --payload file://tools/test_event.json \ + output.txt \ + --region us-west-2 + +cat output.txt +``` + +Note: + +The AWS region used for CLI invocation must match the region +where Terraform deployed the Lambda function. + +The default deployment region used by the Terraform configuration is: + +us-west-2 + +If this region changes in Terraform, the CLI commands and GitHub +workflow configuration must also be updated. + +--- +## Terraform Deployment Workflow -### 9. Test the Lambda Function +Bloodhound v2 infrastructure is deployed using Terraform. -1. Create a test event named `test_event.json` with the following content: +Run Terraform from the `infra/` directory. + +### Step 1 — Bootstrap Existing Resources (first run only) + +If the AWS account already contains Bloodhound IAM resources +(for example from a previous manual deployment), Terraform must +import them into state before the first apply. + +This repository includes a helper script to automate this process. 
+ +```bash +cd infra +./bootstrap_imports.sh +``` + +The script will: + +detect existing IAM role bloodhound-v2-role + +detect existing IAM policy bloodhound-v2-policy + +import them into Terraform state if they already exist + +This step only needs to be performed once when introducing Terraform +into an existing AWS account. + +### Step 2 — Deploy Infrastructure + +```bash +cd infra +terraform init +terraform apply +``` + +Terraform will automatically: + +Build the Lambda deployment package using `scripts/build_lambda.sh` + +Install runtime dependencies from requirements.txt + +Create or update IAM roles and policies + +Deploy the Lambda function + +Create the Lambda Function URL + +After deployment Terraform will output the Lambda URL used by Slack commands. + +Example output: + +`lambda_function_url = https://xxxxx.lambda-url.us-west-2.on.aws/` + + +## GitHub Actions (invoke v2) + +This repo includes a separate workflow for v2: + +- `.github/workflows/invoke_lambda_v2.yml` + +It invokes: + +- `BloodhoundLambdaV2` + +--- + +## ⚙️ GitHub Automation + +Bloodhound includes **two GitHub Actions workflows**: + +1. **Scheduled infrastructure scans** +2. **Manual operator workflows** + +--- + +## Scheduled Scan Workflow + +Workflow file: + +`.github/workflows/invoke_lambda.yml` + +This workflow runs automatically on a fixed schedule and performs the +regular Bloodhound infrastructure scan. + +Schedule: +16:00 UTC → 11 AM EST +04:00 UTC → 11 PM EST +The workflow: + +- authenticates to AWS using GitHub OIDC +- invokes the `BloodhoundLambdaV2` Lambda +- runs the full scan pipeline +- prints the Lambda response and CloudWatch logs + +The Lambda event payload used for scheduled runs is: ```json -{} +{ "source": "scheduled" } ``` +This ensures the Lambda pipeline executes in scheduled scan mode. + +## Manual Operations Workflow -2. 
Invoke the Lambda function: +Workflow file: +`.github/workflows/bloodhound_ops.yml` +This workflow allows engineers to manually run Bloodhound tasks from the +GitHub Actions UI. -```sh -aws lambda invoke --function-name BloodhoundLambda --payload file://test_event.json output.txt --region us-west-2 +Available operations: + +```text +| Mode | Description | +|------|-------------| +| scan | Run an immediate infrastructure scan | +| status | Return system health information | +| validate_scheduler | Simulate an EventBridge scheduled invocation for validation | +| validation | Reserved for teardown validation workflow (currently disabled in CI) | ``` -3. Check the contents of `output.txt` and review the CloudWatch logs to ensure the function executed correctly. +Example usage: +GitHub → Actions → Bloodhound Operations → Run Workflow +The workflow will: -### 10. GitHub Actions Setup +- authenticate to AWS using GitHub OIDC +- invoke BloodhoundLambdaV2 +- print the Lambda response +- display structured scan results +- stream recent CloudWatch logs + +## Validation Workflow Status -#### 1. Create the Workflow File +The validation workflow is temporarily disabled in GitHub Actions. -Create the file `.github/workflows/invoke_lambda.yml` in your repository. +The validation pipeline provisions disposable Terraform infrastructure +and performs controlled teardown tests. -#### 2. Add the Workflow Configuration +This workflow runs correctly in local environments but requires +additional CI hardening before being enabled in GitHub Actions. -Add the following content to the `invoke_lambda.yml` file: +Validation testing can still be executed locally using: +`tools/run_validation_workflow.sh` -```yaml -name: Invoke Bloodhound Lambda +--- + +## Lambda Logging and Request Tracing + +Bloodhound uses standardized CloudWatch log markers to make +Lambda execution paths easy to identify during debugging. 
+ +Each invocation emits a structured log header: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Examples: -on: - schedule: - # Run at 11 AM and 11 PM EST (4 PM and 4 AM UTC) - - cron: "0 16,4 * * *" - workflow_dispatch: +[BLOODHOUND][SCHEDULED][request_id=abc123] +[BLOODHOUND][SCAN][request_id=xyz456] +[BLOODHOUND][STATUS][request_id=def789] -jobs: invoke +This logging format provides: --lambda: - runs-on: ubuntu-latest +- clear identification of invocation type +- request-level traceability +- easier debugging of scheduled and manual executions - steps: - - name: Checkout repository - uses: actions/checkout@v2 +--- - - name: Set up AWS CLI - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-west-2 +### GitHub OIDC Authentication Bootstrap - - name: Invoke Bloodhound Lambda function - run: | - echo '{}' > test_event.json - aws lambda invoke --function-name BloodhoundLambda --payload file://test_event.json output.txt --region us-west-2 - cat output.txt +The GitHub workflow authenticates to AWS using **OIDC role assumption**. + +This avoids storing long-lived AWS credentials in GitHub secrets. + +Instead, GitHub obtains **temporary AWS credentials** during workflow execution. + +Authentication flow: + +```text +GitHub Actions +↓ +OIDC identity token +↓ +AWS STS AssumeRoleWithWebIdentity +↓ +BloodhoundGitHubInvokeRole +↓ +Temporary AWS credentials +↓ +Invoke BloodhoundLambdaV2 ``` -#### 3. Add AWS Credentials to GitHub Secrets +### Bootstrap Script + +The repository includes a helper script that configures the AWS IAM +resources required for **GitHub Actions OIDC authentication**. + +Script: + +`scripts/bootstrap_github_oidc.sh` + +This script prepares the AWS account so GitHub Actions workflows +can securely invoke the Bloodhound Lambda using OIDC role +assumption instead of long-lived AWS access keys. 
+ +After running the script, GitHub workflows will assume the role: + +BloodhoundGitHubInvokeRole + +This role allows GitHub Actions to invoke: + +BloodhoundLambdaV2 + +This script performs the following tasks: + +1. Detects the GitHub OIDC identity provider +2. Creates it if missing +3. Creates the IAM role `BloodhoundGitHubInvokeRole` +4. Configures the trust policy for the repository +5. Attaches Lambda invocation permissions +6. Tags IAM resources for governance and auditing + +Temporary Policy Artifacts + +The bootstrap script generates temporary JSON files during execution: + +trust-policy.json +lambda-policy.json + +These files are required by the AWS CLI when creating IAM roles and attaching policies. + +They are temporary artifacts only and are automatically removed when the script exits. + +This cleanup behavior is implemented using a Bash trap: + +trap "rm -f trust-policy.json lambda-policy.json" EXIT + +This ensures that: + +temporary policy files are never accidentally committed to the repository + +local workspaces remain clean after script execution + +CI environments do not accumulate artifacts + +These files should not be added to Git. + +If they appear locally (for example if the script is interrupted), they can safely be deleted. + +--- + +### Run the bootstrap script + +Run once when setting up CI access for a new AWS account. + +```bash +chmod +x scripts/bootstrap_github_oidc.sh +./scripts/bootstrap_github_oidc.sh +``` + +The script will output the IAM role ARN used by the GitHub workflow. + +Example: + +arn:aws:iam::<account-id>:role/BloodhoundGitHubInvokeRole -1. Go to your repository on GitHub. -2. Click on "Settings". -3. Click on "Secrets and variables" in the left sidebar, then click on "Actions". -4. Click "New repository secret". -5. Add the following secrets: - - `AWS_ACCESS_KEY_ID`: Your AWS access key ID. - - `AWS_SECRET_ACCESS_KEY`: Your AWS secret access key.
+--- + +### Trust Policy + +The IAM role restricts access to workflows originating from the +official repository: -#### 4. Push the Workflow to GitHub +`repo:codeplatoon-devops/Bloodhound:*` -Commit and push the `.github/workflows/invoke_lambda.yml` file to your repository. +This allows engineers to run workflows from **any branch** within the +repository, enabling CI testing for feature branches while still +preventing external repositories from assuming the role. -#### 5. Verify the Workflow +--- -1. Go to the "Actions" tab in your GitHub repository. -2. You should see the new workflow listed. It will run according to the schedule and can also be manually triggered. +### Security Notes -## Conclusion +This architecture provides several advantages: -By following these steps, you have successfully set up, deployed, and tested the Bloodhound Lambda function with Slack integration. The function scans AWS regions for EC2 and RDS instances and posts a summary to the specified Slack channel. Additionally, you have set up a GitHub Action to invoke the Lambda function twice a day at 11 AM and 11 PM EST. Ensure to review the logs and Slack messages to confirm the function's correct behavior. +• no AWS access keys stored in GitHub +• temporary credentials issued per workflow run +• access restricted to a specific repository +• IAM role tagged for governance and auditing --- + +## Slack slash commands (v2) + +Slash commands require a publicly reachable HTTPS endpoint. For v2 we recommend a **Lambda Function URL** (one endpoint) and route based on the Slack `command` field. 
+ +- `/v2_seek` runs scan + reports (non-destructive) +- `/v2_seek_destroy_plan` previews the teardown plan +- `/v2_seek_destroy CONFIRM` runs destructive cleanup (deletes all non-whitelisted candidates) +- `/v2_status` shows the current Bloodhound system status + +To enable slash commands you must set these env vars in Lambda: + +- `SLACK_SIGNING_SECRET` +- `SLACK_ALLOWED_USER_IDS` (optional) +- `SLACK_ALLOWED_CHANNEL_IDS` (optional) +- `SLACK_DESTROY_CONFIRM_TOKEN` (default `CONFIRM`) + +## Validation Scripts + +Bloodhound includes automation scripts that help engineers quickly +verify the infrastructure deployment and teardown pipeline. + +These scripts are located in: + +`tools/` + +### Validation Workflow + +Bloodhound also provides an automated validation workflow that runs the +available validation tools in the correct order. + +Run: + +```bash +tools/run_validation_workflow.sh +``` + +This script orchestrates the following validation stages: + +1. Lambda infrastructure smoke test +2. Controlled teardown validation + +The workflow verifies that: + +- the Lambda deployment is healthy +- environment variables match the expected configuration +- Slack commands are correctly routed to Lambda +- the teardown pipeline can safely delete resources + +This provides a fast way to confirm that the full Bloodhound deployment +is functioning correctly after infrastructure changes. + +Typical usage after deploying infrastructure: + +terraform apply +tools/run_validation_workflow.sh + +### Smoke Test + +Script: + +```bash +tools/smoke_test_lambda.sh +``` + +This script performs a quick health check of the deployed Lambda. + +It verifies: + +Lambda function exists + +environment variables are present + +CloudWatch log group exists + +Lambda Function URL is configured + +This script is useful immediately after running: + +terraform apply + +It detects most deployment problems within seconds. 
+ +### Controlled Teardown Validation + +Script: + +```bash +tools/validate_teardown.sh +``` + +This script automates the teardown validation procedure described in: + +`docs/validate_teardown.md` + +The script: + +creates a disposable EC2 instance + +captures the instance ID + +prompts the engineer to run the scan command + +Legacy: + +/seek + +Current: + +/v2_seek + +Then prompts the engineer to run the teardown command + +Legacy: + +/seek_destroy CONFIRM + +Current: + +/v2_seek_destroy CONFIRM + +verifies that the instance was deleted + +restores Bloodhound to safe mode + +This test confirms the full teardown pipeline: + +Execution path for destructive operations: + +Operator workflow: +Slack → Lambda → AWS API → resource deletion + +Validation workflow: +Validation script → Lambda → AWS API → resource deletion + +Optional Log Streaming + +During validation, Lambda execution logs can be streamed live using: + +```bash +aws logs tail /aws/lambda/BloodhoundLambdaV2 \ +--region us-west-2 \ +--follow +``` + +This allows engineers to observe the execution path of: + +Legacy commands: + +/seek +/seek_destroy + +Current commands: + +/v2_seek +/v2_seek_destroy + +in real time. + +Environment configuration validation + +The smoke test also verifies that critical Lambda environment variables +match the local `.env` configuration. + +This prevents common deployment mistakes such as: + +- Terraform not applied after `.env` changes +- Lambda environment variables edited manually +- CI/CD deploying outdated configuration + +If a mismatch is detected, the script will stop immediately and +display the conflicting values. + +5. Example Failure Output + +Example when Terraform is stale: + +Checking Lambda environment variables... + +Comparing critical environment variables with .env... + +ERROR: Environment variable mismatch + +Variable: APPLY_CHANGES +Expected: false +Actual: true + +Terraform deployment may be out of sync. 
+ +If Slack commands stop responding after deployment, see: + +`docs/troubleshooting_slack_commands.md` diff --git a/assets/bloodhound_lambda_architecture_v2.svg b/assets/bloodhound_lambda_architecture_v2.svg new file mode 100644 index 0000000..515c5d7 --- /dev/null +++ b/assets/bloodhound_lambda_architecture_v2.svg @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + Bloodhound v2 — Architecture and Logic Flow + AWS resource scan + whitelist filter + budget projection + Slack alerts + optional teardown (with safety rails) + + + + Triggers + + + Scheduled (GitHub Actions or scheduler) + Invokes Lambda for scan + budget report + Recommended for regular visibility + + + On-demand (Slack slash commands) + Slack → Lambda Function URL (HTTPS) + Signature verification + replay protection + Responds quickly; runs worker async + + + + AWS Account (single account) — BloodhoundLambdaV2 + Runtime: python3.10 • Handler: handlers.lambda_function.lambda_handler • Deployed via Terraform + + + + HTTP Entry (Function URL) + + bloodhound.slack_commands: verify + route (/seek, /seek_destroy) + + Immediately returns 200 to Slack + Worker invoked async for full run + + + Scheduled / Direct Invoke + + aws lambda invoke (CLI) or scheduler + + Runs full scan + budget + teardown plan + Optionally executes teardown (apply mode) + + + + Core Orchestrator + + bloodhound.app: load config → scan_all → whitelist → budget → messages → Slack posts + + Config + safety rails + APPLY_CHANGES (default false) + TEARDOWN_SIMULATE, TEARDOWN_TARGET_IDS + TEARDOWN_ALLOW_ALL (dangerous) + + Slack destinations + Scan channel: SLACK_SCAN_CHANNEL_ID + Alert channel: SLACK_ALERT_CHANNEL_ID + Posting via chat.postMessage + + + + Multi-region Scanning + bloodhound.scanner.scan_all + per-service scanners + Regions: explicit list or discovery mode + Resources scanned (examples): + - EC2 instances, EBS volumes, Elastic IPs + - RDS instances + - ELBv2 load balancers + + + + Budget Projection + bloodhound.budget → 
AWS Cost Explorer + Cohort config: start YYYY-MM, total USD, length months + Dynamic monthly allowance using cohort-to-date spend + Over-budget threshold based on projected days over + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Whitelist Filter + bloodhound.whitelist: KEEP_TAG_KEY / KEEP_TAG_VALUE + Kept resources are excluded from teardown + + Teardown (optional) + bloodhound.teardown: planner + executor + Dry-run by default; apply requires explicit flags + + + + Slack Workspace + + Channels + Scan summaries + whitelisted list + Budget summaries + overage alerts + Teardown plan + results (when enabled) + + + + invoke + + + HTTPS request + + + + /seek, /seek_destroy + + + + async invoke worker + + + + direct run + + + + scan candidates + + + + + + + + fetch costs + + + + chat.postMessage + + + + Safety rails for destructive actions + Default behavior posts a teardown plan only (APPLY_CHANGES=false). + Apply-mode supports simulate, explicit targets, or allow-all (dangerous). + Slash destroy requires a confirm token and optional allowlists. 
+ diff --git a/bloodhound.py b/bloodhound.py deleted file mode 100755 index d6288cf..0000000 --- a/bloodhound.py +++ /dev/null @@ -1,94 +0,0 @@ -import boto3 -import os -import json -from dotenv import load_dotenv - -# Import WebClient from Python SDK (github.com/slackapi/python-slack-sdk) -from slack_sdk import WebClient - -load_dotenv() - -STUDENT_REGIONS = [ - "eu-central-1", - "eu-west-1", - "eu-west-2", - "us-east-1", - "us-east-2", - "us-west-1", - "us-west-2", -] - - -def create_session(region): - return boto3.Session(profile_name="bloodhound", region_name=region) - - -def search_regions_for_rds_resources(session): - rds_instances = [] - print(f"Sniffing out rds resources in {session.region_name}...") - rds = session.client("rds") - response = rds.describe_db_instances() - for dbinstance in response["DBInstances"]: - rds_instances.append(dbinstance["DBInstanceIdentifier"]) - return {"rds": rds_instances} - - -def search_regions_for_ec2_resources(session): - """ - TODO: modify the parsing logic here from the describe_instances() call, - https://serverfault.com/questions/749118/ - Instances spun up at the same time will be in the same reservationId more than likely. 
- Instances spun up individually will have their own reservationId - """ - ec2_instances = [] - print(f"Sniffing out ec2 resources in {session.region_name}...") - ec2 = session.client("ec2") - response = ec2.describe_instances() - try: - # Instances spun up at the same time will be listed in the same Reservation - if len(response["Reservations"]) == 1: - for ec2instance in response["Reservations"][0]["Instances"]: - if ec2instance["State"]["Name"] in ("stopped", "terminated"): - continue - ec2_instances.append(ec2instance["InstanceId"]) - # Indvidual instances will appear in their own Reservation - else: - for ec2instance in response["Reservations"]: - if ec2instance["Instances"][0]["State"]["Name"] in ("stopped", "terminated"): - continue - ec2_instances.append(ec2instance["Instances"][0]["InstanceId"]) - except IndexError: - return {"ec2": []} - return {"ec2": ec2_instances} - - -def format_message(resources): - message = ":dog2: Woof! Woof! :dog2:\n Bloodhound found the following resources in use: \n" - for region in resources.keys(): - if len(resources[region]["ec2"]) == 0 and len(resources[region]["rds"]) == 0: - continue - message += f'- *{region.upper()}*: {len(resources[region]["ec2"])} ec2 instances, {len(resources[region]["rds"])} rds instances\n' - message += f"Please stop or terminate all unneeded resources!" 
- return message - - -def send_slack_message(message): - channel_id = os.environ.get("CHANNEL_ID") - client = WebClient(token=os.environ.get("SLACK_BOT_TOKEN")) - client.chat_postMessage(channel=channel_id, text=message) - - -def main(): - resources_in_regions = {} - print(f"Going hunting in regions {STUDENT_REGIONS}") - for region in STUDENT_REGIONS: - session = create_session(region) - resources_in_regions[region] = search_regions_for_ec2_resources(session) - resources_in_regions[region].update(search_regions_for_rds_resources(session)) - message = format_message(resources_in_regions) - print(message) - send_slack_message(message) - - -if __name__ == "__main__": - main() diff --git a/bloodhound/__init__.py b/bloodhound/__init__.py new file mode 100644 index 0000000..e3c7188 --- /dev/null +++ b/bloodhound/__init__.py @@ -0,0 +1,5 @@ +__all__ = ["__version__"] + +__version__ = "2.0.0-dev" + + diff --git a/bloodhound/app.py b/bloodhound/app.py new file mode 100644 index 0000000..1b095b0 --- /dev/null +++ b/bloodhound/app.py @@ -0,0 +1,280 @@ +""" +bloodhound/app.py + +Orchestration entrypoint for Bloodhound v2. 
def compute_system_health(
    *,
    apply_changes: bool,
    allow_all_targets: bool,
    max_delete_count: int,
    budget_over_threshold: bool,
) -> str:
    """
    Derive the traffic-light indicator shown by /v2_status.

    Args:
        apply_changes:
            True when destructive mode is switched on.

        allow_all_targets:
            True when teardown is allowed to target every resource.

        max_delete_count:
            Per-run ceiling on the number of deletions.

        budget_over_threshold:
            True when the budget alert threshold has tripped.

    Returns:
        str:
            🔴 critical - budget threshold tripped, or destructive mode
               combined with allow-all targeting
            🟡 warning  - destructive mode on, or a delete ceiling above 20
            🟢 healthy  - none of the above
    """
    # Critical conditions are checked first so they always win over warnings.
    is_critical = budget_over_threshold or (apply_changes and allow_all_targets)
    if is_critical:
        return "🔴"

    is_warning = apply_changes or max_delete_count > 20
    return "🟡" if is_warning else "🟢"
+ clients = create_clients(profile=cfg.aws.profile) + slack = None + if cfg.slack.enabled: + slack = SlackNotifier.from_token( + bot_token=cfg.slack.bot_token, + scan_channel_id=cfg.slack.scan_channel_id, + alert_channel_id=cfg.slack.alert_channel_id, + ) + + status_response = handle_status_command(event, cfg, slack, compute_system_health) + if status_response: + return status_response + + scan_result = scan_resources(cfg, clients, slack) + + budget_snapshot = compute_budget(cfg, clients, slack) + + actions = plan_teardown(cfg, event, scan_result["all_candidates"], slack) + + exec_summary = execute_teardown( + cfg, + event, + clients, + actions, + slack, + resource_key_from_action + ) + + result = { + "ok": True, + "regions": scan_result["regions"], + "scan": { + "candidates_total": sum(len(v) for v in scan_result["candidates_by_region"].values()), + "kept_total": sum(len(v) for v in scan_result["kept_by_region"].values()), + }, + "budget": { + "projected_month_end_spend_usd": budget_snapshot.projected_month_end_spend_usd, + "dynamic_monthly_allowance_usd": budget_snapshot.dynamic_monthly_allowance_usd, + "over_budget_threshold_met": budget_snapshot.over_budget_threshold_met, + }, + "teardown": { + "apply_changes": cfg.teardown.apply_changes, + "simulate": cfg.teardown.simulate, + "targets_filter": sorted(cfg.teardown.target_ids) if cfg.teardown.target_ids else None, + "planned_actions": len(actions), + "execution": exec_summary, + }, + } + + # ------------------------------------------------------------ + # DEBUG LOGGING + # ------------------------------------------------------------ + print("Bloodhound pipeline completed") + + print("Scan summary:") + print( + json.dumps( + { + "candidates_total": result["scan"]["candidates_total"], + "kept_total": result["scan"]["kept_total"], + }, + indent=2, + ) + ) + + print("Budget summary:") + print( + json.dumps( + { + "projected_month_end_spend_usd": result["budget"]["projected_month_end_spend_usd"], + 
def run(event, context):
    """
    Main orchestration entrypoint for Lambda and local testing.
    Returns a small JSON-serializable summary for `aws lambda invoke`.

    run() is a thin router: it lets the source-specific handler adjust the
    runtime flags, then delegates to execute_pipeline().

    Args:
        event: Invocation payload. Only dict payloads are inspected for a
            "source" key; anything else goes straight to the pipeline.
        context: Lambda context object (unused here).

    Returns:
        Whatever execute_pipeline(event) returns.

    Raises:
        RuntimeError: propagated from handle_validation_event when a
            validation payload is malformed.

    Safety note:
        The Slack and validation handlers express their mode by writing
        APPLY_CHANGES / TEARDOWN_* into os.environ. The process environment
        outlives a single invocation when the runtime reuses the process, so
        without a reset a destructive override would leak into the next
        (e.g. scheduled) run. The overrides are therefore snapshotted before
        routing and restored afterwards, even if the pipeline raises.
    """

    # Keys that the handlers are known to override.
    override_keys = (
        "APPLY_CHANGES",
        "TEARDOWN_SIMULATE",
        "TEARDOWN_ALLOW_ALL",
        "TEARDOWN_TARGET_IDS",
    )
    saved_env = {key: os.environ.get(key) for key in override_keys}

    # Determine the event source safely.
    # Non-dict events (possible in some Lambda test cases) have no source,
    # so the routing below is simply skipped.
    source = event.get("source") if isinstance(event, dict) else None

    try:
        # --------------------------------------------------------
        # Route the event to the appropriate handler
        # --------------------------------------------------------
        if source == "slack_command":
            handle_slack_event(event)
        elif source == "validation":
            handle_validation_event(event)

        # --------------------------------------------------------
        # Execute the core Bloodhound pipeline
        # --------------------------------------------------------
        return execute_pipeline(event)
    finally:
        # Put the environment back exactly as it was before this invocation.
        for key, previous in saved_env.items():
            if previous is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = previous
def compute_budget_snapshot(
    clients: AwsClients,
    cohort_start_yyyy_mm: str,
    cohort_total_budget_usd: float,
    cohort_length_months: int,
    budget_over_days: int,
    today: date | None = None,
) -> BudgetSnapshot:
    """
    Build a BudgetSnapshot from Cost Explorer data.

    Args:
        clients: AWS client factory; used to obtain the "ce" client.
        cohort_start_yyyy_mm: Cohort start month as "YYYY-MM".
        cohort_total_budget_usd: Total budget for the whole cohort.
        cohort_length_months: Planned cohort duration in months.
        budget_over_days: Number of trailing days whose projections must all
            exceed the allowance for the alert threshold to be met.
        today: Reference date; defaults to date.today(). Injectable for tests.

    Returns:
        BudgetSnapshot with cohort-to-date spend, remaining budget/months,
        the dynamic monthly allowance, month-to-date spend, the month-end
        run-rate projection and the over-budget flag.
    """
    today = today or date.today()
    # Cost Explorer treats End as exclusive, so "through today" means tomorrow.
    period_end = today + timedelta(days=1)

    start_year, start_month = _parse_yyyy_mm(cohort_start_yyyy_mm)
    cohort_start = date(start_year, start_month, 1)

    # Cost Explorer is effectively "global"; us-east-1 is the common boto3 convention.
    ce = clients.client("ce", region="us-east-1")

    # Cohort-to-date spend (monthly granularity).
    # A cohort that starts in a future month would produce Start > End, which
    # Cost Explorer rejects; nothing has been spent against it yet anyway.
    if cohort_start > today:
        cohort_to_date = 0.0
    else:
        cohort_to_date = _get_cost_monthly_total(ce, start=cohort_start, end=period_end)

    # Current month daily spend for:
    # - month-to-date spend
    # - "over budget X days" calculation without needing a database
    month_start = date(today.year, today.month, 1)
    daily_costs = _get_cost_daily(ce, start=month_start, end=period_end)
    mtd_spend = sum(daily_costs.values())

    remaining_budget = max(0.0, cohort_total_budget_usd - cohort_to_date)
    remaining_months = _remaining_months(cohort_start, cohort_length_months, today)
    # Spread what is left evenly over the months that remain.
    allowance = remaining_budget / remaining_months if remaining_months > 0 else remaining_budget

    # Run-rate projection: assumes current pace continues to month end.
    projected_month_end = _project_month_end(mtd_spend, today)
    over_met = _over_budget_threshold_met(
        daily_costs=daily_costs,
        days_in_month=calendar.monthrange(today.year, today.month)[1],
        monthly_allowance_usd=allowance,
        budget_over_days=budget_over_days,
    )

    return BudgetSnapshot(
        cohort_start_yyyy_mm=cohort_start_yyyy_mm,
        cohort_total_budget_usd=cohort_total_budget_usd,
        cohort_length_months=cohort_length_months,
        cohort_to_date_spend_usd=cohort_to_date,
        remaining_cohort_budget_usd=remaining_budget,
        remaining_months=remaining_months,
        dynamic_monthly_allowance_usd=allowance,
        current_month_to_date_spend_usd=mtd_spend,
        projected_month_end_spend_usd=projected_month_end,
        budget_over_days=budget_over_days,
        over_budget_threshold_met=over_met,
    )
def _get_cost_monthly_total(ce_client, start: date, end: date) -> float:
    """
    Returns the sum of monthly UnblendedCost between [start, end) dates.
    Cost Explorer end is exclusive.

    Follows NextPageToken so that a response split across several pages is
    summed completely instead of being truncated to the first page.

    Args:
        ce_client: Client exposing get_cost_and_usage (boto3 "ce" client).
        start: First day included in the query.
        end: First day excluded from the query.

    Returns:
        Total spend as a float; entries with a missing or unparsable amount
        are skipped.
    """
    request = {
        "TimePeriod": {"Start": _to_ce_date(start), "End": _to_ce_date(end)},
        "Granularity": "MONTHLY",
        "Metrics": ["UnblendedCost"],
    }

    total = 0.0
    while True:
        resp = ce_client.get_cost_and_usage(**request)

        for r in resp.get("ResultsByTime", []) or []:
            amt = ((r.get("Total") or {}).get("UnblendedCost") or {}).get("Amount")
            if not amt:
                continue
            try:
                total += float(amt)
            except (TypeError, ValueError):
                # Best-effort: ignore amounts that are not numeric.
                continue

        next_token = resp.get("NextPageToken")
        if not next_token:
            return total
        # Same query, next page.
        request["NextPageToken"] = next_token
+ """ + months_elapsed = (today.year - cohort_start.year) * 12 + (today.month - cohort_start.month) + remaining = cohort_length_months - months_elapsed + return max(1, remaining) + + +def _project_month_end(mtd_spend_usd: float, today: date) -> float: + days_in_month = calendar.monthrange(today.year, today.month)[1] + days_elapsed = max(1, today.day) + return (mtd_spend_usd / days_elapsed) * days_in_month + + +def _over_budget_threshold_met( + daily_costs: dict[date, float], + days_in_month: int, + monthly_allowance_usd: float, + budget_over_days: int, +) -> bool: + """ + Minimal, state-free approach: + - Look at the last N days present in daily_costs. + - For each day, compute what the month-end projection would be at that point. + - If all N projections exceed monthly_allowance_usd, threshold is met. + """ + if budget_over_days <= 0: + return False + if not daily_costs: + return False + + days = sorted(daily_costs.keys()) + last_days = days[-budget_over_days:] + if len(last_days) < budget_over_days: + return False + + # cumulative spend up to each day (inclusive) + cumulative = 0.0 + day_to_cumulative: dict[date, float] = {} + for d in days: + cumulative += daily_costs.get(d, 0.0) + day_to_cumulative[d] = cumulative + + for d in last_days: + days_elapsed = d.day + if days_elapsed <= 0: + return False + projection = (day_to_cumulative[d] / days_elapsed) * days_in_month + if projection <= monthly_allowance_usd: + return False + return True + + diff --git a/bloodhound/config.py b/bloodhound/config.py new file mode 100644 index 0000000..13c0f47 --- /dev/null +++ b/bloodhound/config.py @@ -0,0 +1,217 @@ +""" +bloodhound/config.py + +Configuration loader for Bloodhound v2. + +Reads configuration from environment variables and (for local runs) a `.env` file. +All non-code “knobs” (channels, regions, whitelist tag, teardown flags, cohort config) +should live here so the rest of the code stays clean. 
+""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Optional + +from dotenv import load_dotenv + +def _env(name: str, default: Optional[str] = None) -> Optional[str]: + value = os.environ.get(name) + if value is None: + return default + value = value.strip() + return value if value != "" else default + + +def _env_bool(name: str, default: bool = False) -> bool: + raw = _env(name) + if raw is None: + return default + return raw.lower() in {"1", "true", "t", "yes", "y", "on"} + + +def _env_int(name: str, default: int) -> int: + raw = _env(name) + if raw is None: + return default + try: + return int(raw) + except ValueError: + return default + + +def _env_float(name: str, default: float) -> float: + raw = _env(name) + if raw is None: + return default + try: + return float(raw) + except ValueError: + return default + + +def _split_csv(raw: Optional[str]) -> list[str]: + if not raw: + return [] + return [x.strip() for x in raw.split(",") if x.strip()] + + +@dataclass(frozen=True) +class SlackConfig: + bot_token: str + scan_channel_id: str + alert_channel_id: str + enabled: bool + + +@dataclass(frozen=True) +class RegionConfig: + mode: str # explicit | discover + regions: list[str] + + +@dataclass(frozen=True) +class WhitelistConfig: + keep_tag_key: str + keep_tag_value: str + keep_resource_ids: set[str] + + +@dataclass(frozen=True) +class TeardownConfig: + apply_changes: bool + simulate: bool + target_ids: set[str] + allow_all_targets: bool + teardown_mode: str # delete (future: stop) + rds_final_snapshot: bool + max_delete_count: int # safety limit for number of deletions per run + + +@dataclass(frozen=True) +class BudgetConfig: + cohort_start_yyyy_mm: str + cohort_total_budget_usd: float + cohort_length_months: int + budget_over_days: int + + +@dataclass(frozen=True) +class AwsConfig: + # Optional local-only convenience; Lambda ignores this unless you explicitly set it. 
@dataclass(frozen=True)
class AppConfig:
    """
    Top-level, immutable configuration object for Bloodhound v2.

    Groups the per-concern config sections into one value. Instances are
    built by `load_config()` from environment variables and checked by
    `validate_config()`.
    """

    # Slack token, channel IDs and the enabled flag.
    slack: SlackConfig
    # Region selection mode plus the explicit region list.
    regions: RegionConfig
    # Keep-tag key/value and explicitly kept resource IDs.
    whitelist: WhitelistConfig
    # Teardown flags, target filter and the per-run delete limit.
    teardown: TeardownConfig
    # Cohort budget parameters and the over-budget day window.
    budget: BudgetConfig
    # Optional AWS profile name.
    aws: AwsConfig
def validate_config(cfg: AppConfig) -> list[str]:
    """
    Check a loaded configuration and collect human-readable problems.

    Returns:
        A list of error messages, in a fixed order (Slack, regions,
        teardown, budget). An empty list means the config is usable.
    """
    problems: list[str] = []

    # --- Slack: credentials are only required when posting is enabled ---
    slack = cfg.slack
    if slack.enabled:
        slack_checks = (
            (slack.bot_token, "Missing SLACK_BOT_TOKEN"),
            (
                slack.scan_channel_id,
                "Missing SLACK_SCAN_CHANNEL_ID (or CHANNEL_ID for v1 compatibility)",
            ),
        )
        problems.extend(message for value, message in slack_checks if not value)

    # --- Regions ---
    region_cfg = cfg.regions
    if region_cfg.mode not in {"explicit", "discover"}:
        problems.append("REGION_MODE must be 'explicit' or 'discover'")
    if region_cfg.mode == "explicit" and not region_cfg.regions:
        problems.append("REGION_MODE=explicit requires REGIONS to be set")

    # --- Teardown ---
    teardown = cfg.teardown
    if teardown.teardown_mode not in {"delete"}:
        problems.append("TEARDOWN_MODE currently supports only 'delete' in v2.0")

    # A real (non-simulated, non-allow-all) apply run must name its targets.
    targeted_apply = teardown.apply_changes and not (
        teardown.simulate or teardown.allow_all_targets
    )
    if targeted_apply and not teardown.target_ids:
        problems.append(
            "APPLY_CHANGES=true requires TEARDOWN_TARGET_IDS to be set (comma-separated IDs/ARNs) "
            "unless TEARDOWN_ALLOW_ALL=true"
        )

    # --- Budget ---
    budget = cfg.budget
    budget_checks = (
        (not budget.cohort_start_yyyy_mm, "Missing COHORT_START_YYYY_MM (e.g. 2026-01)"),
        (budget.cohort_length_months <= 0, "COHORT_LENGTH_MONTHS must be > 0"),
        (budget.cohort_total_budget_usd <= 0, "COHORT_TOTAL_BUDGET_USD must be > 0"),
    )
    problems.extend(message for failed, message in budget_checks if failed)

    return problems
+ + Parameters + ---------- + event : dict + Lambda event payload + context : object + Lambda execution context + + Returns + ------- + dict + Result of scheduled scan execution + """ + + print("Scheduled Bloodhound scan triggered") + print("Event payload:", event) + + # Import here to avoid circular imports + from bloodhound.app import run_scheduled_scan + + # Execute scheduled scan directly (no generic run path) + result = run_scheduled_scan() + + return { + "status": "scheduled_scan_executed", + "result": result + } \ No newline at end of file diff --git a/bloodhound/handlers/slack_handler.py b/bloodhound/handlers/slack_handler.py new file mode 100644 index 0000000..4b70330 --- /dev/null +++ b/bloodhound/handlers/slack_handler.py @@ -0,0 +1,60 @@ +""" +Slack command handler. + +Interprets Slack slash commands and configures runtime +environment flags so the Bloodhound pipeline executes +in the correct operational mode. +""" + +import os + + +def handle_slack_event(event): + """ + Apply Slack command overrides. + + Slack commands never directly execute infrastructure + operations. They only modify runtime flags before + the main pipeline runs. + """ + + mode = (event.get("mode") or "").strip() + + # ------------------------------------------------------------ + # /v2_seek + # + # Scan-only safe mode. + # - performs resource scan + # - generates teardown plan + # - no deletion allowed + # ------------------------------------------------------------ + if mode == "seek": + + os.environ["APPLY_CHANGES"] = "false" + os.environ["TEARDOWN_SIMULATE"] = "true" + os.environ["TEARDOWN_ALLOW_ALL"] = "false" + + # ------------------------------------------------------------ + # /v2_seek_destroy_plan + # + # Preview deletion(teardown) plan but do not execute. 
def handle_validation_event(event):
    """
    Prepare environment for validation teardown runs.

    Performs safety checks on the payload and then sets environment
    overrides so the rest of the pipeline runs as a real destructive run
    that is restricted to the given validation targets.

    Args:
        event: dict with
            "source"     - must be "validation"
            "mode"       - must be "seek_destroy_validation"
            "target_ids" - list/tuple/set of resource IDs/ARNs; a single
                           string is accepted and treated as one entry

    Raises:
        RuntimeError: if the source or mode is wrong, or if target_ids is
            missing, empty, not a collection of strings, or blank.

    Side effects:
        Sets APPLY_CHANGES, TEARDOWN_SIMULATE, TEARDOWN_ALLOW_ALL and
        TEARDOWN_TARGET_IDS in os.environ.
    """

    # ------------------------------------------------------------
    # Validation invocation guard
    #
    # Only payloads that declare the validation source may enter
    # this destructive-but-targeted mode.
    # ------------------------------------------------------------
    if event.get("source") != "validation":
        raise RuntimeError(
            "Validation events must originate from validation harness"
        )

    # ------------------------------------------------------------
    # Validate invocation mode
    # ------------------------------------------------------------
    if event.get("mode") != "seek_destroy_validation":
        raise RuntimeError(
            "Invalid validation invocation mode."
        )

    # ------------------------------------------------------------
    # Validation requires explicit resource targets
    # ------------------------------------------------------------
    raw_targets = event.get("target_ids", [])

    # A bare string must not be iterated character by character.
    if isinstance(raw_targets, str):
        raw_targets = [raw_targets]

    if not raw_targets:
        raise RuntimeError(
            "Validation mode requires target_ids"
        )

    if not isinstance(raw_targets, (list, tuple, set, frozenset)) or not all(
        isinstance(item, str) for item in raw_targets
    ):
        raise RuntimeError(
            "Validation target_ids must be a list of strings"
        )

    # Drop blank entries so an all-whitespace list cannot pass as "targets".
    target_ids = [item.strip() for item in raw_targets if item.strip()]

    if not target_ids:
        raise RuntimeError(
            "Validation mode requires target_ids"
        )

    # ------------------------------------------------------------
    # Override runtime configuration
    #
    # This forces Bloodhound into destructive execution mode
    # while restricting deletions to validation targets only.
    # ------------------------------------------------------------
    os.environ["APPLY_CHANGES"] = "true"
    os.environ["TEARDOWN_SIMULATE"] = "false"
    os.environ["TEARDOWN_ALLOW_ALL"] = "false"

    os.environ["TEARDOWN_TARGET_IDS"] = ",".join(target_ids)
def _et_now_time_str() -> str:
    """
    Current wall-clock time in the `_ET` zone, formatted for Slack.

    Example: 4:24 PM ET
    """
    # "%-I" (hour without zero padding) is a platform-specific strftime
    # extension and is rejected on Windows. "%I" is portable; the leading
    # zero is stripped by hand. "%I" ranges 01-12, so at most one is removed.
    return datetime.now(_ET).strftime("%I:%M %p ET").lstrip("0")
+ totals_candidates: dict[tuple[str, str], int] = defaultdict(int) + totals_kept: dict[tuple[str, str], int] = defaultdict(int) + + for region in sorted(resources_by_region.keys()): + resources = resources_by_region[region] + kept = whitelisted_by_region.get(region, []) + + total_found += len(resources) + total_whitelisted += len(kept) + + type_counts = defaultdict(int) + for r in resources: + type_counts[(r.service, r.resource_type)] += 1 + totals_candidates[(r.service, r.resource_type)] += 1 + + kept_type_counts = defaultdict(int) + for r in kept: + kept_type_counts[(r.service, r.resource_type)] += 1 + totals_kept[(r.service, r.resource_type)] += 1 + + lines.append("") + lines.append(f"*Region*: `{region}`") + lines.append(f"*Counts*: candidates `{len(resources)}` | kept `{len(kept)}`") + lines.append(f"- {_fmt_counts_line('Candidates:', type_counts)}") + lines.append(f"- {_fmt_counts_line('Kept:', kept_type_counts)}") + + lines.append("") + lines.append("*Totals*") + lines.append(f"*Counts*: candidates `{total_found}` | kept `{total_whitelisted}`") + lines.append(f"- {_fmt_counts_line('Candidates:', totals_candidates)}") + lines.append(f"- {_fmt_counts_line('Kept:', totals_kept)}") + return "\n".join(lines) + + +def format_whitelisted_resources_message( + whitelisted_by_region: dict[str, list[ResourceRecord]], + *, + max_items: int = 50, +) -> str: + """ + Separate report listing all whitelisted ("kept") resources. + Capped to max_items to keep Slack readable. 
+ """ + kept_all: list[ResourceRecord] = [] + for region in sorted(whitelisted_by_region.keys()): + kept_all.extend(whitelisted_by_region.get(region, [])) + + lines: list[str] = [] + lines.append("*Bloodhound v2 — Whitelisted Resources (Kept)*") + lines.append(_fmt_kv("time_et", _et_now_time_str())) + lines.append("") + + if not kept_all: + lines.append("No whitelisted resources found.") + return "\n".join(lines) + + lines.append(_fmt_kv("kept_total", f"`{len(kept_all)}`")) + lines.append("") + + shown = 0 + for region in sorted(whitelisted_by_region.keys()): + region_items = whitelisted_by_region.get(region, []) + if not region_items: + continue + lines.append(f"*Region*: `{region}`") + for r in region_items: + if shown >= max_items: + break + lines.append(_display_resource(r)) + shown += 1 + if shown >= max_items: + break + lines.append("") + + if shown < len(kept_all): + lines.append(f"... and `{len(kept_all) - shown}` more (increase max_items if needed)") + + return "\n".join(lines).rstrip() + + +def format_budget_message(b: BudgetSnapshot) -> str: + def usd(x: float) -> str: + return f"${x:,.2f}" + + lines: list[str] = [] + lines.append("*Bloodhound v2 — Budget Summary*") + lines.append(_fmt_kv("time_et", _et_now_time_str())) + lines.append("") + lines.append("*Cohort*") + lines.append(f"- {_fmt_kv('start', f'`{b.cohort_start_yyyy_mm}`')}") + lines.append(f"- {_fmt_kv('total_budget', f'`{usd(b.cohort_total_budget_usd)}` over `{b.cohort_length_months}` months')}") + lines.append(f"- {_fmt_kv('to_date_spend', f'`{usd(b.cohort_to_date_spend_usd)}`')}") + lines.append(f"- {_fmt_kv('remaining_budget', f'`{usd(b.remaining_cohort_budget_usd)}`')}") + lines.append(f"- {_fmt_kv('remaining_months', f'`{b.remaining_months}`')}") + lines.append(f"- {_fmt_kv('monthly_allowance', f'`{usd(b.dynamic_monthly_allowance_usd)}`')}") + lines.append("") + lines.append("*This month*") + lines.append(f"- {_fmt_kv('month_to_date', f'`{usd(b.current_month_to_date_spend_usd)}`')}") + 
lines.append(f"- {_fmt_kv('projected_month_end', f'`{usd(b.projected_month_end_spend_usd)}`')}") + lines.append(f"- {_fmt_kv('over_budget_days_required', f'`{b.budget_over_days}`')}") + lines.append(f"- {_fmt_kv('over_budget_threshold_met', f'`{str(b.over_budget_threshold_met).lower()}`')}") + return "\n".join(lines) + + +def format_teardown_plan_message( + actions: list[PlannedAction], + apply_changes: bool, + *, + simulate: bool, + targets_filter_count: int, + allow_all: bool, +) -> str: + # Determine execution mode for Slack output. + # This ensures the Header and Slack messages clearly shows whether Bloodhound + # is running in dry-run, simulation, or real deletion mode. + if not apply_changes: + header = "*Bloodhound v2 — Teardown Plan (DRY RUN)*" + mode_text = "DRY RUN — No AWS resources will be deleted" + + elif simulate: + header = "*Bloodhound v2 — Teardown Plan (SIMULATION)*" + mode_text = "SIMULATION — Deletion calls are simulated" + + else: + header = "*Bloodhound v2 — Teardown Plan (DESTRUCTIVE APPLY)*" + mode_text = "APPLY (DESTRUCTIVE) — Resources WILL be deleted" + + # Slack message header + #header = "*Bloodhound v2 — Teardown Plan*" + + # First lines of the Slack report + lines = [ + header, + _fmt_kv("mode", f"`{mode_text}`"), + _fmt_kv("time_et", _et_now_time_str()), + "" + ] + + # Extra safety visibility for dry-run + if not apply_changes: + lines.append("🚨 *DRY RUN MODE* — No AWS resources will be deleted") + + targets_filter_active = targets_filter_count > 0 + lines.append(_fmt_kv("simulate", f"`{str(simulate).lower()}`")) + lines.append(_fmt_kv("targets_filter_active", f"`{str(targets_filter_active).lower()}`")) + if targets_filter_active: + lines.append(_fmt_kv("targets_filter_count", f"`{targets_filter_count}`")) + lines.append(_fmt_kv("allow_all_targets", f"`{str(allow_all).lower()}`")) + lines.append("") + + # Visual divider between execution configuration and deletion summary + lines.append("────────") + lines.append("") + + if not 
actions: + lines.append("No deletions planned.") + return "\n".join(lines) + + # Total number of API operations Bloodhound plans to execute + lines.append(_fmt_kv("planned_actions", f"`{len(actions)}`")) + lines.append("") + + # Aggregate counts by AWS action, service, and region + by_action = defaultdict(int) + by_service = defaultdict(int) + by_region = defaultdict(int) + + # Count how many planned actions belong to each service, API action, and region + for a in actions: + by_action[a.action] += 1 + by_service[a.service] += 1 + by_region[a.region] += 1 + + # ------------------------------------------------------------ + # Regions most affected by the teardown plan + # ------------------------------------------------------------ + lines.append("*Top Regions Affected*") + + # Sort regions by number of actions (largest first) + for region, count in sorted(by_region.items(), key=lambda x: x[1], reverse=True): + lines.append(f"- `{region}`: `{count}`") + + lines.append("") + + + # ------------------------------------------------------------ + # Summary by AWS service (ec2, elbv2, rds, etc.) affected by the teardown plan. + # ------------------------------------------------------------ + lines.append("*Services affected*") + #lines.append("`" + ", ".join([f"{k}={by_service[k]}" for k in sorted(by_service.keys())]) + "`") + # Render services vertically for easier Slack scanning + for svc in sorted(by_service.keys()): + lines.append(f"- `{svc}`: `{by_service[svc]}`") + + lines.append("") + + + # ------------------------------------------------------------ + # Summary by AWS API action (terminate, delete, release, etc.) + # ------------------------------------------------------------ + lines.append("*Planned Actions*") + + # Render each action count on its own line for easier Slack scanning + for action in sorted(by_action.keys()): + lines.append(f"- `{action}`: `{by_action[action]}`") + + # Keep Slack output short: show a small sample. 
+ sample = actions[:15] + lines.append("") + lines.append("*Sample (first 15)*") + for a in sample: + lines.append(f"- `{a.region}` {a.service}.{a.resource_type} `{a.id}` → `{a.action}`") + if len(actions) > len(sample): + lines.append(f"- ... and `{len(actions) - len(sample)}` more") + + return "\n".join(lines) + + +def format_teardown_result_message(attempted: int, succeeded: int, failed: int, simulated: int) -> str: + lines: list[str] = [] + lines.append("*Bloodhound v2 — Teardown Results*") + lines.append(_fmt_kv("time_et", _et_now_time_str())) + lines.append("") + lines.append(_fmt_kv("attempted", f"`{attempted}`")) + lines.append(_fmt_kv("succeeded", f"`{succeeded}`")) + lines.append(_fmt_kv("failed", f"`{failed}`")) + lines.append(_fmt_kv("simulated", f"`{simulated}`")) + return "\n".join(lines) + +def format_status_message( + *, + health: str, + apply_changes: bool, + simulate: bool, + max_delete_count: int, + expected_account_id: str, + account_verified: bool, + last_scan_resource_total: int, + last_scan_whitelisted_total: int, + regions_scanned: int, +): + """ + Format a Slack status report for the Bloodhound system. + + This message is used by the `/v2_status` Slack command to provide + engineers with a quick operational overview of the system state. + + Args: + health: + System health indicator used by the operational dashboard. + + Possible values: + 🟢 healthy + 🟡 warning + 🔴 critical + + This value is computed in `app.py` based on destructive mode, + deletion safety limits, and other operational signals. + + apply_changes: + Whether Bloodhound is allowed to perform destructive operations. + + simulate: + Whether teardown operations are currently simulated. + + max_delete_count: + Maximum number of resources allowed to be deleted in a single run. + + expected_account_id: + AWS account ID that Bloodhound expects to operate within. + + account_verified: + Whether the runtime AWS account matches the expected account ID. 
+ + last_scan_resource_total: + Total number of resources discovered during the most recent scan. + + last_scan_whitelisted_total: + Number of resources that were protected (whitelisted) during the scan. + + regions_scanned: + Total number of AWS regions included in the most recent scan. + + Returns: + Slack-formatted string representing the Bloodhound operational + status dashboard used by the `/v2_status` command. + """ + + # ------------------------------------------------------------ + # Determine execution mode + # ------------------------------------------------------------ + if not apply_changes: + mode = "DRY RUN" + elif simulate: + mode = "SIMULATION" + else: + mode = "DESTRUCTIVE APPLY" + + candidate_for_deletion = last_scan_resource_total - last_scan_whitelisted_total + + lines: list[str] = [] + + lines.append("*Bloodhound v2 — System Status*") + lines.append(_fmt_kv("system_health", f"`{health}`")) + lines.append(_fmt_kv("time_et", _et_now_time_str())) + lines.append("") + + # ------------------------------------------------------------ + # System mode + # ------------------------------------------------------------ + lines.append("*System Mode*") + lines.append(_fmt_kv("mode", f"`{mode}`")) + lines.append(_fmt_kv("apply_changes", f"`{str(apply_changes).lower()}`")) + lines.append(_fmt_kv("simulate", f"`{str(simulate).lower()}`")) + lines.append("") + + lines.append("────────") + lines.append("") + + """ + # ------------------------------------------------------------ + # Last scan summary + # ------------------------------------------------------------ + lines.append("*Last Scan Summary*") + lines.append(_fmt_kv("regions_scanned", f"`{regions_scanned}`")) + lines.append(_fmt_kv("resources_found", f"`{last_scan_resource_total}`")) + lines.append(_fmt_kv("resources_whitelisted", f"`{last_scan_whitelisted_total}`")) + lines.append(_fmt_kv("resources_candidate_for_deletion", f"`{candidate_for_deletion}`")) + lines.append("") + + lines.append("────────") + 
lines.append("")""" + + # ------------------------------------------------------------ + # Safety guards + # ------------------------------------------------------------ + lines.append("*Safety Guards*") + lines.append(_fmt_kv("max_deletion_limit", f"`{max_delete_count}`")) + lines.append(_fmt_kv("expected_account_id", f"`{expected_account_id}`")) + lines.append(_fmt_kv("account_verified", f"`{str(account_verified).lower()}`")) + + return "\n".join(lines) + + diff --git a/bloodhound/scanner/__init__.py b/bloodhound/scanner/__init__.py new file mode 100644 index 0000000..5e0629b --- /dev/null +++ b/bloodhound/scanner/__init__.py @@ -0,0 +1,7 @@ +__all__ = [ + "scan_all", +] + +from bloodhound.scanner.scan_all import scan_all + + diff --git a/bloodhound/scanner/ec2.py b/bloodhound/scanner/ec2.py new file mode 100644 index 0000000..d1bc862 --- /dev/null +++ b/bloodhound/scanner/ec2.py @@ -0,0 +1,164 @@ +""" +bloodhound/scanner/ec2.py + +EC2-family scanners (instances, EBS volumes, EIPs, NAT gateways). +These produce `ResourceRecord` items which are later whitelisted and optionally torn down. 
+""" + +from __future__ import annotations + +from typing import Any + +from bloodhound.aws import AwsClients +from bloodhound.types import ResourceRecord + + +def scan_ec2_instances(clients: AwsClients, region: str) -> list[ResourceRecord]: + ec2 = clients.client("ec2", region=region) + paginator = ec2.get_paginator("describe_instances") + records: list[ResourceRecord] = [] + + for page in paginator.paginate(): + for reservation in page.get("Reservations", []): + for inst in reservation.get("Instances", []): + state = (inst.get("State") or {}).get("Name") or "unknown" + if state in {"stopped", "terminated", "shutting-down"}: + continue + tags = _tags_to_dict(inst.get("Tags")) + instance_id = inst.get("InstanceId") + if not instance_id: + continue + records.append( + ResourceRecord( + service="ec2", + resource_type="instance", + region=region, + id=instance_id, + arn=None, + state=state, + tags=tags, + delete_supported=True, + delete_action="terminate_instances", + delete_params={"InstanceIds": [instance_id]}, + ) + ) + + return records + + +def scan_ebs_unattached_volumes(clients: AwsClients, region: str) -> list[ResourceRecord]: + ec2 = clients.client("ec2", region=region) + paginator = ec2.get_paginator("describe_volumes") + records: list[ResourceRecord] = [] + + for page in paginator.paginate( + Filters=[ + {"Name": "status", "Values": ["available"]}, + ] + ): + for vol in page.get("Volumes", []): + vol_id = vol.get("VolumeId") + if not vol_id: + continue + state = vol.get("State") or "unknown" + tags = _tags_to_dict(vol.get("Tags")) + records.append( + ResourceRecord( + service="ec2", + resource_type="ebs_volume", + region=region, + id=vol_id, + arn=None, + state=state, + tags=tags, + delete_supported=True, + delete_action="delete_volume", + delete_params={"VolumeId": vol_id}, + ) + ) + return records + + +def scan_eips_unassociated(clients: AwsClients, region: str) -> list[ResourceRecord]: + ec2 = clients.client("ec2", region=region) + records: 
list[ResourceRecord] = [] + + # Note: describe_addresses is not pageable. + resp = ec2.describe_addresses() + for addr in resp.get("Addresses", []): + if addr.get("AssociationId") or addr.get("NetworkInterfaceId") or addr.get("InstanceId"): + continue + alloc_id = addr.get("AllocationId") + public_ip = addr.get("PublicIp") + # AllocationId is required to release in VPC; for classic, PublicIp can be used. + rid = alloc_id or public_ip + if not rid: + continue + tags = _tags_to_dict(addr.get("Tags")) + delete_params: dict[str, Any] + if alloc_id: + delete_params = {"AllocationId": alloc_id} + else: + delete_params = {"PublicIp": public_ip} + records.append( + ResourceRecord( + service="ec2", + resource_type="elastic_ip", + region=region, + id=rid, + arn=None, + state="unassociated", + tags=tags, + delete_supported=True, + delete_action="release_address", + delete_params=delete_params, + ) + ) + + return records + + +def scan_nat_gateways(clients: AwsClients, region: str) -> list[ResourceRecord]: + ec2 = clients.client("ec2", region=region) + paginator = ec2.get_paginator("describe_nat_gateways") + records: list[ResourceRecord] = [] + + for page in paginator.paginate(): + for nat in page.get("NatGateways", []): + nat_id = nat.get("NatGatewayId") + if not nat_id: + continue + state = nat.get("State") or "unknown" + if state in {"deleted", "deleting"}: + continue + tags = _tags_to_dict(nat.get("Tags")) + records.append( + ResourceRecord( + service="ec2", + resource_type="nat_gateway", + region=region, + id=nat_id, + arn=None, + state=state, + tags=tags, + delete_supported=True, + delete_action="delete_nat_gateway", + delete_params={"NatGatewayId": nat_id}, + ) + ) + return records + + +def _tags_to_dict(tags: Any) -> dict[str, str]: + if not tags: + return {} + out: dict[str, str] = {} + for t in tags: + k = t.get("Key") + v = t.get("Value") + if k is None or v is None: + continue + out[str(k)] = str(v) + return out + + diff --git a/bloodhound/scanner/elbv2.py 
b/bloodhound/scanner/elbv2.py new file mode 100644 index 0000000..8501fc6 --- /dev/null +++ b/bloodhound/scanner/elbv2.py @@ -0,0 +1,64 @@ +""" +bloodhound/scanner/elbv2.py + +ELBv2 scanner (ALB/NLB). Collects load balancers and tags (in batches). +""" + +from __future__ import annotations + +from bloodhound.aws import AwsClients +from bloodhound.types import ResourceRecord + + +def scan_elbv2_load_balancers(clients: AwsClients, region: str) -> list[ResourceRecord]: + elb = clients.client("elbv2", region=region) + paginator = elb.get_paginator("describe_load_balancers") + lbs: list[dict] = [] + for page in paginator.paginate(): + lbs.extend(page.get("LoadBalancers", []) or []) + + if not lbs: + return [] + + # Fetch tags in batches of 20 ARNs (API limit). + arn_to_tags: dict[str, dict[str, str]] = {} + arns = [lb.get("LoadBalancerArn") for lb in lbs if lb.get("LoadBalancerArn")] + for i in range(0, len(arns), 20): + batch = arns[i : i + 20] + try: + tag_resp = elb.describe_tags(ResourceArns=batch) + for desc in tag_resp.get("TagDescriptions", []) or []: + arn = desc.get("ResourceArn") + if not arn: + continue + arn_to_tags[arn] = {t.get("Key"): t.get("Value") for t in (desc.get("Tags") or []) if t.get("Key")} + except Exception: + # Best-effort: skip tags on failure. 
+ continue + + records: list[ResourceRecord] = [] + for lb in lbs: + arn = lb.get("LoadBalancerArn") + name = lb.get("LoadBalancerName") + if not arn or not name: + continue + state = (lb.get("State") or {}).get("Code") or "unknown" + if state in {"deleted", "deleting"}: + continue + records.append( + ResourceRecord( + service="elbv2", + resource_type="load_balancer", + region=region, + id=name, + arn=arn, + state=state, + tags=arn_to_tags.get(arn, {}), + delete_supported=True, + delete_action="delete_load_balancer", + delete_params={"LoadBalancerArn": arn}, + ) + ) + return records + + diff --git a/bloodhound/scanner/rds.py b/bloodhound/scanner/rds.py new file mode 100644 index 0000000..80767ea --- /dev/null +++ b/bloodhound/scanner/rds.py @@ -0,0 +1,73 @@ +""" +bloodhound/scanner/rds.py + +RDS scanner. Collects DB instances and best-effort tags (requires list_tags_for_resource). +""" + +from __future__ import annotations + +from typing import Any + +from bloodhound.aws import AwsClients +from bloodhound.types import ResourceRecord + + +def scan_rds_instances(clients: AwsClients, region: str, rds_final_snapshot: bool) -> list[ResourceRecord]: + rds = clients.client("rds", region=region) + paginator = rds.get_paginator("describe_db_instances") + records: list[ResourceRecord] = [] + + for page in paginator.paginate(): + for db in page.get("DBInstances", []): + db_id = db.get("DBInstanceIdentifier") + arn = db.get("DBInstanceArn") + status = db.get("DBInstanceStatus") or "unknown" + if not db_id: + continue + # Tags require a separate call; keep best-effort for now. 
+ tags = _tags_to_dict(_safe_list_tags(rds, arn)) + delete_params: dict[str, Any] = { + "DBInstanceIdentifier": db_id, + "SkipFinalSnapshot": not rds_final_snapshot, + } + records.append( + ResourceRecord( + service="rds", + resource_type="db_instance", + region=region, + id=db_id, + arn=arn, + state=status, + tags=tags, + delete_supported=True, + delete_action="delete_db_instance", + delete_params=delete_params, + ) + ) + + return records + + +def _safe_list_tags(rds_client, arn: str | None) -> list[dict[str, str]]: + if not arn: + return [] + try: + resp = rds_client.list_tags_for_resource(ResourceName=arn) + return resp.get("TagList", []) or [] + except Exception: + return [] + + +def _tags_to_dict(tags: Any) -> dict[str, str]: + if not tags: + return {} + out: dict[str, str] = {} + for t in tags: + k = t.get("Key") + v = t.get("Value") + if k is None or v is None: + continue + out[str(k)] = str(v) + return out + + diff --git a/bloodhound/scanner/regions.py b/bloodhound/scanner/regions.py new file mode 100644 index 0000000..ae52cba --- /dev/null +++ b/bloodhound/scanner/regions.py @@ -0,0 +1,39 @@ +""" +bloodhound/scanner/regions.py + +Region selection helpers. 
def discover_regions(clients: AwsClients, home_region: str = "us-east-1") -> list[str]:
    """
    Return the sorted names of the regions usable by this account.

    Regions are listed with EC2 ``DescribeRegions`` through a client created
    in ``home_region``. ``AllRegions=True`` also returns regions that are
    disabled for the account; those are reported with
    ``OptInStatus == "not-opted-in"`` and are filtered out here so that later
    per-region API calls are not sent to regions the account cannot use.
    Entries without an ``OptInStatus`` are kept.
    """
    ec2 = clients.client("ec2", region=home_region)
    resp = ec2.describe_regions(AllRegions=True)
    return sorted(
        entry["RegionName"]
        for entry in resp.get("Regions", [])
        if "RegionName" in entry and entry.get("OptInStatus") != "not-opted-in"
    )


def select_regions(mode: str, explicit_regions: list[str], discovered_regions: Optional[list[str]]) -> list[str]:
    """
    Pick the region list for the given mode and return it sorted.

    Args:
        mode: ``"discover"`` selects ``discovered_regions``; ``"explicit"``
            and any unrecognised value select ``explicit_regions``.
        explicit_regions: regions taken from configuration.
        discovered_regions: regions found by discovery, or None.

    Returns:
        A new sorted list; the inputs are not modified.
    """
    if mode == "discover":
        return sorted(discovered_regions or [])
    # "explicit", and the fallback for unknown modes.
    return sorted(explicit_regions)
@dataclass(frozen=True)
class ScanResult:
    # Immutable pairing of a region with the records found in it.
    region: str
    resources: list[ResourceRecord]


def scan_region(clients: AwsClients, region: str, rds_final_snapshot: bool) -> list[ResourceRecord]:
    """
    Run every per-service scanner against one region.

    Scanners run in a fixed order (EC2 instances, RDS instances, Elastic
    IPs, NAT gateways, unattached EBS volumes, ELBv2 load balancers) and
    their results are concatenated in that order. ``rds_final_snapshot`` is
    forwarded to the RDS scanner only.
    """
    per_scanner = (
        scan_ec2_instances(clients, region),
        scan_rds_instances(clients, region, rds_final_snapshot=rds_final_snapshot),
        scan_eips_unassociated(clients, region),
        scan_nat_gateways(clients, region),
        scan_ebs_unattached_volumes(clients, region),
        scan_elbv2_load_balancers(clients, region),
    )
    return [record for batch in per_scanner for record in batch]


def scan_all(clients: AwsClients, regions: list[str], rds_final_snapshot: bool) -> dict[str, list[ResourceRecord]]:
    """
    Scan each region in ``regions`` and return a region -> records mapping.

    Regions are scanned sequentially in the order given.
    """
    return {
        name: scan_region(clients, name, rds_final_snapshot=rds_final_snapshot)
        for name in regions
    }
# ---- bloodhound/services/budget_service.py ---------------------------------


def compute_budget(cfg, clients, slack):
    """
    Compute the budget snapshot and post its summary to Slack.

    Args:
        cfg: configuration object; ``cfg.budget`` supplies the cohort start,
            total budget, cohort length and over-budget day count.
        clients: AWS client container passed through to
            ``compute_budget_snapshot``.
        slack: notifier or a falsy value. When truthy, the formatted budget
            message is sent with ``post_alert``.

    Returns:
        The snapshot object returned by ``compute_budget_snapshot``.
    """
    budget_cfg = cfg.budget
    snapshot = compute_budget_snapshot(
        clients,
        cohort_start_yyyy_mm=budget_cfg.cohort_start_yyyy_mm,
        cohort_total_budget_usd=budget_cfg.cohort_total_budget_usd,
        cohort_length_months=budget_cfg.cohort_length_months,
        budget_over_days=budget_cfg.budget_over_days,
    )

    # The message is built even without a notifier, so formatting problems
    # surface regardless of Slack being configured.
    message = format_budget_message(snapshot)

    # Budget summaries go to the alert channel (keeps the scan channel quieter).
    if slack:
        slack.post_alert(message)

    return snapshot


# ---- bloodhound/services/scan_service.py -----------------------------------


def scan_resources(cfg, clients, slack):
    """
    Scan the selected regions, split results by whitelist, and report to Slack.

    Args:
        cfg: configuration object; reads ``cfg.regions.mode``,
            ``cfg.regions.regions``, ``cfg.teardown.rds_final_snapshot`` and
            ``cfg.whitelist``.
        clients: AWS client container used for discovery and scanning.
        slack: notifier or a falsy value. When truthy, the scan summary and
            the whitelist report are sent with ``post_scan``.

    Returns:
        dict with keys:
            ``regions``               regions that were scanned
            ``candidates_by_region``  non-whitelisted records per region
            ``kept_by_region``        whitelisted records per region
            ``all_candidates``        candidates flattened in sorted-region order
    """
    region_mode = cfg.regions.mode
    discovered = discover_regions(clients) if region_mode == "discover" else None
    regions = select_regions(
        region_mode,
        cfg.regions.regions,
        discovered_regions=discovered,
    )

    raw_by_region = scan_all(
        clients,
        regions=regions,
        rds_final_snapshot=cfg.teardown.rds_final_snapshot,
    )

    # Whitelist filtering happens here, before any teardown planning.
    candidates_by_region: dict[str, list] = {}
    kept_by_region: dict[str, list] = {}
    for region, records in raw_by_region.items():
        candidates_by_region[region], kept_by_region[region] = filter_whitelisted(
            records, cfg.whitelist
        )

    scan_msg = format_scan_message(candidates_by_region, kept_by_region)
    if slack:
        slack.post_scan(scan_msg)
        slack.post_scan(format_whitelisted_resources_message(kept_by_region))

    # Flatten candidates in sorted-region order; this list feeds the planner.
    all_candidates = []
    for region in sorted(candidates_by_region):
        all_candidates.extend(candidates_by_region[region])

    return {
        "regions": regions,
        "candidates_by_region": candidates_by_region,
        "kept_by_region": kept_by_region,
        "all_candidates": all_candidates,
    }


# ---- bloodhound/services/status_service.py ---------------------------------


def handle_status_command(event, cfg, slack, compute_system_health):
    """
    Handle the Slack status command, if ``event`` is one.

    Args:
        event: invocation payload. Handled only when it is a dict with
            ``source == "slack_command"`` and ``mode == "status"``.
        cfg: configuration object; reads ``cfg.teardown`` flags and, when
            present, ``cfg.aws.expected_account_id``.
        slack: notifier or a falsy value. When truthy, the status message is
            sent with ``post_alert``.
        compute_system_health: callable invoked with ``apply_changes``,
            ``allow_all_targets``, ``max_delete_count`` and
            ``budget_over_threshold`` keyword arguments.

    Returns:
        ``{"ok": True, "mode": "status"}`` when the command was handled,
        otherwise ``None`` so the caller can continue with other handling.
    """
    is_status_request = (
        isinstance(event, dict)
        and event.get("source") == "slack_command"
        and event.get("mode") == "status"
    )
    if not is_status_request:
        return None

    teardown_cfg = cfg.teardown

    # NOTE(review): budget_over_threshold is fixed to False here, so the
    # health value never reflects budget state for this command.
    health = compute_system_health(
        apply_changes=teardown_cfg.apply_changes,
        allow_all_targets=teardown_cfg.allow_all_targets,
        max_delete_count=teardown_cfg.max_delete_count,
        budget_over_threshold=False,
    )

    # NOTE(review): account_verified and the last-scan figures are constants,
    # not measured values; the report shows "account_verified: true"
    # regardless of the runtime account.
    status_msg = format_status_message(
        health=health,
        apply_changes=teardown_cfg.apply_changes,
        simulate=teardown_cfg.simulate,
        max_delete_count=teardown_cfg.max_delete_count,
        expected_account_id=getattr(cfg.aws, "expected_account_id", "not-configured"),
        account_verified=True,
        last_scan_resource_total=0,
        last_scan_whitelisted_total=0,
        regions_scanned=0,
    )

    if slack:
        slack.post_alert(status_msg)

    return {
        "ok": True,
        "mode": "status",
    }
+Teardown planning and execution service for Bloodhound v2. + +Responsibilities: +- Perform validation safety checks +- Plan deletion actions +- Execute AWS resource teardown +- Post Slack teardown reports + +This module contains logic extracted from execute_pipeline() +without altering behavior. + +# ------------------------------------------------------------ +# WARNING +# +# This module performs destructive AWS operations including +# permanent deletion of infrastructure resources. +# +# All safety guards MUST remain intact. +# ------------------------------------------------------------ +""" + +import json + +from bloodhound.teardown.planner import plan_deletions +from bloodhound.teardown.executor import execute_actions +from bloodhound.messages import ( + format_teardown_plan_message, + format_teardown_result_message, +) + + +def plan_teardown(cfg, event, all_candidates, slack): + """ + Generate the teardown plan and enforce validation safety checks. + + This function validates teardown targets (when running in validation + mode) and builds the list of resource deletion actions using the + teardown planner. + + Args: + cfg: + Bloodhound configuration object produced by `load_config()`. + Contains teardown safety parameters and target filters. + + event (dict): + Invocation event used to determine execution mode + (Slack command, validation harness, scheduled run). + + all_candidates (list): + Flattened list of candidate resources returned by the scan + phase. Each resource object must contain: + + id + arn + region + tags + + slack: + Optional SlackNotifier instance used to post the teardown + plan summary. + + Returns: + list: + List of planned teardown action objects returned by + `plan_deletions()`. + + Raises: + RuntimeError: + If validation safety checks fail or invalid teardown + targets are detected. 
def plan_teardown(cfg, event, all_candidates, slack):
    """
    Generate the teardown plan and enforce validation safety checks.

    Args:
        cfg:
            Bloodhound configuration object. Reads
            ``cfg.teardown.target_ids``, ``apply_changes``, ``simulate``
            and ``allow_all_targets``.

        event (dict):
            Invocation event. ``event["source"] == "validation"`` enables
            the validation safety guard.

        all_candidates (list):
            Candidate resources from the scan phase. Each must expose
            ``id`` and ``tags``.

        slack:
            Optional notifier; when truthy the plan summary is posted via
            ``slack.post_alert``.

    Returns:
        list: planned actions returned by ``plan_deletions()``.

    Raises:
        RuntimeError:
            In validation mode, when a target id is absent from the scan
            results or is not tagged ``bloodhound:test=true``.

    Side Effects:
        - Posts teardown plan summary to Slack alert channel
    """

    # ------------------------------------------------------------------
    # Validation Safety Guard
    #
    # When running validation mode, ensure that:
    #
    #   1. Target IDs exist in scan results
    #   2. Target IDs reference resources tagged for validation
    #
    # This prevents accidental deletion of unrelated infrastructure.
    # ------------------------------------------------------------------
    if isinstance(event, dict) and event.get("source") == "validation":

        validation_targets = set(cfg.teardown.target_ids)

        candidate_ids = {r.id for r in all_candidates}

        tagged_resources = {
            r.id
            for r in all_candidates
            if r.tags.get("bloodhound:test") == "true"
        }

        # Ensure targets exist in scan results
        if not validation_targets.issubset(candidate_ids):

            invalid_targets = validation_targets - candidate_ids

            raise RuntimeError(
                f"Validation safety check failed. "
                f"Targets not present in scan results: {sorted(invalid_targets)}"
            )

        # Ensure targets are validation-tagged resources
        if not validation_targets.issubset(tagged_resources):

            invalid_targets = validation_targets - tagged_resources

            raise RuntimeError(
                f"Validation safety check failed. "
                f"Targets missing validation tag: {sorted(invalid_targets)}"
            )

    actions, _manual = plan_deletions(all_candidates)

    if slack:
        slack.post_alert(
            format_teardown_plan_message(
                actions,
                apply_changes=cfg.teardown.apply_changes,
                simulate=cfg.teardown.simulate,
                targets_filter_count=len(cfg.teardown.target_ids),
                allow_all=cfg.teardown.allow_all_targets,
            )
        )

    return actions


def _is_validation_event(event):
    """Return True when the event marks a validation run.

    Accepts the same marker ``plan_teardown`` checks
    (``source == "validation"``) and the legacy ``validation: True`` flag,
    so planning and execution agree on the mode.
    """
    if not isinstance(event, dict):
        return False
    return event.get("source") == "validation" or event.get("validation") is True


def _select_actions_to_execute(cfg, event, actions, resource_key_from_action):
    """Apply the validation / explicit-target filters to the planned actions."""
    targets = set(cfg.teardown.target_ids or ())

    def is_targeted(action):
        # A target may be given as a bare id, an ARN, or the canonical key.
        return bool(
            action.id in targets
            or (action.arn and action.arn in targets)
            or resource_key_from_action(action) in targets
        )

    if _is_validation_event(event):
        selected = []
        for action in actions:
            # Planned actions may not carry tags at all; only trust the tag
            # check when the attribute is really there.
            tags = getattr(action, "tags", None)
            tagged = tags is not None and tags.get("bloodhound:test") == "true"
            if tags is not None and not tagged:
                continue  # explicitly not a validation resource
            if targets:
                if is_targeted(action):
                    selected.append(action)
            elif tagged:
                selected.append(action)
            # No targets and no validation tag: never delete in validation mode.
        return selected

    if targets:
        return [action for action in actions if is_targeted(action)]

    return actions


def execute_teardown(cfg, event, clients, actions, slack, resource_key_from_action):
    """
    Execute planned teardown actions against AWS resources.

    Nothing is executed unless ``cfg.teardown.apply_changes`` is truthy.
    Otherwise the planned actions are narrowed as follows and handed to
    ``execute_actions``:

    * Validation run (``event["source"] == "validation"`` or the legacy
      ``event["validation"] is True``): only actions matching
      ``cfg.teardown.target_ids`` are kept. If an action exposes ``tags``,
      it must also carry ``bloodhound:test=true``. With no targets
      configured, only explicitly tagged actions run; untagged, untargeted
      actions are never executed.
    * Otherwise, when ``cfg.teardown.target_ids`` is non-empty: only
      actions whose id, ARN or canonical resource key is listed.
    * Otherwise: every planned action.

    Args:
        cfg: Bloodhound configuration object (reads ``cfg.teardown``).
        event (dict): Invocation event used to detect validation mode.
        clients: AWS client container passed through to ``execute_actions``.
        actions (list): Planned actions produced by ``plan_teardown()``.
        slack: Optional notifier; when truthy the result is posted via
            ``slack.post_alert``.
        resource_key_from_action (callable): Maps an action to the
            canonical resource key used for target matching.

    Returns:
        dict | None: execution summary (attempted, succeeded, failed,
        simulated, failures, targets_filter, planned_actions_total,
        executed_actions_total, allow_all_targets), or ``None`` when
        ``apply_changes`` is disabled.

    Raises:
        Whatever ``execute_actions`` raises is propagated unchanged.

    Side Effects:
        - Invokes ``execute_actions`` (destructive unless simulate is set)
        - Posts execution summary to Slack alert channel
    """
    if not cfg.teardown.apply_changes:
        return None

    actions_to_execute = _select_actions_to_execute(
        cfg, event, actions, resource_key_from_action
    )

    exec_result = execute_actions(
        clients,
        actions_to_execute,
        simulate=cfg.teardown.simulate,
    )

    exec_summary = {
        "attempted": exec_result.attempted,
        "succeeded": exec_result.succeeded,
        "failed": exec_result.failed,
        "simulated": exec_result.simulated,
        "failures": exec_result.failures[:20],  # cap the Slack payload size
        "targets_filter": sorted(cfg.teardown.target_ids) if cfg.teardown.target_ids else None,
        "planned_actions_total": len(actions),
        "executed_actions_total": len(actions_to_execute),
        "allow_all_targets": cfg.teardown.allow_all_targets,
    }

    if slack:
        slack.post_alert(
            format_teardown_result_message(
                attempted=exec_result.attempted,
                succeeded=exec_result.succeeded,
                failed=exec_result.failed,
                simulated=exec_result.simulated,
            )
            + "\n"
            + json.dumps(exec_summary, indent=2)
        )

    return exec_summary
+""" + +from __future__ import annotations + +from dataclasses import dataclass + +from slack_sdk import WebClient + + +@dataclass(frozen=True) +class SlackNotifier: + client: WebClient + scan_channel_id: str + alert_channel_id: str + + @classmethod + def from_token(cls, bot_token: str, scan_channel_id: str, alert_channel_id: str) -> "SlackNotifier": + return cls( + client=WebClient(token=bot_token), + scan_channel_id=scan_channel_id, + alert_channel_id=alert_channel_id, + ) + + def post_scan(self, text: str) -> None: + self.client.chat_postMessage(channel=self.scan_channel_id, text=text) + + def post_alert(self, text: str) -> None: + self.client.chat_postMessage(channel=self.alert_channel_id, text=text) + + diff --git a/bloodhound/slack_commands.py b/bloodhound/slack_commands.py new file mode 100644 index 0000000..bdcf583 --- /dev/null +++ b/bloodhound/slack_commands.py @@ -0,0 +1,224 @@ +""" +bloodhound/slack_commands.py + +Slack slash command HTTP handler for Bloodhound v2. + +Why this exists: +- Slack needs a public HTTPS endpoint for slash commands (/seek, /seek_destroy) +- Slack requires a fast response; we return immediately and then self-invoke the Lambda + +Deployed via: +- Lambda Function URL (recommended) or API Gateway +""" + +from __future__ import annotations + +import base64 +#import cmd +import hashlib +import hmac +import os +import time +import urllib.parse +from dataclasses import dataclass +from typing import Any + +import boto3 + + +@dataclass(frozen=True) +class SlackCommand: + command: str + text: str + user_id: str + channel_id: str + + +def is_slack_http_event(event: dict[str, Any]) -> bool: + # Lambda Function URL / API Gateway both provide headers + body for HTTP requests. + return isinstance(event, dict) and "headers" in event and "body" in event + + +def handle_slack_command_http(event: dict[str, Any]) -> dict[str, Any]: + """ + Handles Slack slash commands via an HTTP-triggered Lambda (Function URL or API Gateway). 
+ + Responds immediately (Slack requires fast response), then asynchronously invokes the + same Lambda function to run the scan/teardown and post the normal Slack reports. + """ + + # ------------------------------------------------------------- + # Lightweight health endpoint for infrastructure validation + # Allows curl checks without requiring Slack signature headers + # ------------------------------------------------------------- + raw_path = event.get("rawPath") or event.get("path") or "" + if raw_path == "/health": + return { + "statusCode": 200, + "headers": {"Content-Type": "application/json"}, + "body": json_dumps({ + "ok": True, + "service": "BloodhoundLambdaV2", + "status": "healthy" + }), + } + + # Slack request verification (signature + timestamp) is mandatory. + signing_secret = os.environ.get("SLACK_SIGNING_SECRET", "").strip() + if not signing_secret: + return _http_text(500, "Missing SLACK_SIGNING_SECRET") + + headers = _lowercase_headers(event.get("headers") or {}) + raw_body = event.get("body") or "" + if event.get("isBase64Encoded"): + raw_body = base64.b64decode(raw_body).decode("utf-8") + + if not _verify_signature(headers, raw_body, signing_secret): + return _http_text(401, "Invalid Slack signature") + + form = urllib.parse.parse_qs(raw_body, keep_blank_values=True) + cmd = SlackCommand( + command=_first(form, "command"), + text=_first(form, "text"), + user_id=_first(form, "user_id"), + channel_id=_first(form, "channel_id"), + ) + + allowed_channels = _split_csv(os.environ.get("SLACK_ALLOWED_CHANNEL_IDS", "")) + if allowed_channels and cmd.channel_id not in allowed_channels: + # Slack surfaces non-200 responses as "dispatch_failed", so return 200 with a helpful message. + return _http_text(200, "Not allowed in this channel.") + + # Route based on Slack's `command` field. 
+ # Non-destructive scan command + if cmd.command in ("/seek", "/v2_seek"): + _invoke_worker(mode="seek", cmd=cmd) + return _http_text(200, "BloodHound is on the hunt...please stand by.") + + # ------------------------------------------------------------ + # Preview teardown plan (safe) + # ------------------------------------------------------------ + if cmd.command in ("/v2_seek_destroy_plan",): + _invoke_worker(mode="seek", cmd=cmd) + return _http_text(200, "Generating teardown preview...please stand by.") + + # ------------------------------------------------------------ + # System status command + # ------------------------------------------------------------ + if cmd.command in ("/v2_status",): + _invoke_worker(mode="status", cmd=cmd) + return _http_text(200, "Fetching Bloodhound system status...") + + # Destructive teardown command + if cmd.command in ("/seek_destroy", "/v2_seek_destroy"): + if not _destroy_allowed(cmd): + # Slack surfaces non-200 responses as "dispatch_failed", so return 200 with a helpful message. + return _http_text(200, "Not allowed. Use `/seek_destroy CONFIRM` (and ensure you are allowlisted).") + _invoke_worker(mode="seek_destroy", cmd=cmd) + return _http_text(200, "Uh oh someone let the dog out ---> Seek & Destroy Underway friendly assets whitelisted...please stand by.") + + # Unknown command (still return 200 so Slack doesn't show dispatch_failed). + return _http_text(200, f"Unknown command: {cmd.command}") + + +def _invoke_worker(mode: str, cmd: SlackCommand) -> None: + """ + Asynchronously invoke THIS lambda so the HTTP response can return immediately. + The worker invocation will run the normal scan/teardown and post to Slack. + """ + # We invoke THIS lambda asynchronously so Slack isn't waiting on a long scan. + function_name = os.environ.get("AWS_LAMBDA_FUNCTION_NAME") + if not function_name: + # Local runs won't have this; just no-op. 
def _invoke_worker(mode: str, cmd: SlackCommand) -> None:
    """
    Asynchronously invoke THIS lambda so the HTTP response can return immediately.
    The worker invocation will run the normal scan/teardown and post to Slack.
    """
    function_name = os.environ.get("AWS_LAMBDA_FUNCTION_NAME")
    if not function_name:
        # Local runs won't have this; just no-op.
        return

    payload = {
        "source": "slack_command",
        "mode": mode,
        "slack": {
            "command": cmd.command,
            "text": cmd.text,
            "user_id": cmd.user_id,
            "channel_id": cmd.channel_id,
        },
    }

    # InvocationType="Event" queues the call and returns without waiting.
    boto3.client("lambda").invoke(
        FunctionName=function_name,
        InvocationType="Event",
        Payload=str.encode(json_dumps(payload)),
    )


def _destroy_allowed(cmd: SlackCommand) -> bool:
    """
    Gate for the destructive command. Super-safe by default:
    - Require the confirmation token as the command text
    - Require user allowlist membership if an allowlist is configured

    The token comes from SLACK_DESTROY_CONFIRM_TOKEN. A missing OR blank
    value falls back to "CONFIRM", so an empty environment variable can
    never turn the gate into "empty text is accepted".
    """
    confirm_token = os.environ.get("SLACK_DESTROY_CONFIRM_TOKEN", "").strip() or "CONFIRM"
    if cmd.text.strip() != confirm_token:
        return False

    allowed_users = _split_csv(os.environ.get("SLACK_ALLOWED_USER_IDS", ""))
    if allowed_users and cmd.user_id not in allowed_users:
        return False

    return True


def _verify_signature(headers: dict[str, str], body: str, signing_secret: str) -> bool:
    """
    Verify Slack's v0 request signature.

    Args:
        headers: request headers with lower-cased names.
        body: raw (already base64-decoded) request body.
        signing_secret: the app's signing secret.

    Returns:
        True only when both signature headers are present, the timestamp is
        an integer within five minutes of now, and the HMAC-SHA256 of
        ``v0:{timestamp}:{body}`` matches the supplied signature.
    """
    ts = headers.get("x-slack-request-timestamp", "")
    sig = headers.get("x-slack-signature", "")
    if not ts or not sig:
        return False

    try:
        ts_int = int(ts)
    except ValueError:
        return False

    # Reject old requests (replay protection).
    if abs(int(time.time()) - ts_int) > 60 * 5:
        return False

    base = f"v0:{ts}:{body}".encode("utf-8")
    digest = hmac.new(signing_secret.encode("utf-8"), base, hashlib.sha256).hexdigest()
    expected = f"v0={digest}"

    # Compare as bytes: compare_digest() raises TypeError for str arguments
    # containing non-ASCII characters, and the header is attacker-controlled.
    # "backslashreplace" keeps the encode step from raising as well.
    return hmac.compare_digest(
        expected.encode("utf-8"),
        sig.encode("utf-8", "backslashreplace"),
    )


def _http_text(status_code: int, text: str) -> dict[str, Any]:
    """Build a plain-text HTTP response dict for the Lambda proxy integration."""
    return {
        "statusCode": status_code,
        "headers": {"Content-Type": "text/plain; charset=utf-8"},
        "body": text,
    }


def _lowercase_headers(headers: dict[str, Any]) -> dict[str, str]:
    """Return headers with lower-cased string keys and string values; None entries are dropped."""
    out: dict[str, str] = {}
    for k, v in headers.items():
        if k is None or v is None:
            continue
        out[str(k).lower()] = str(v)
    return out


def _first(form: dict[str, list[str]], key: str) -> str:
    """Return the first value for ``key`` in a parse_qs-style mapping, or ""."""
    vals = form.get(key) or []
    return vals[0] if vals else ""


def _split_csv(raw: str) -> list[str]:
    """Split a comma-separated string into stripped, non-empty items."""
    return [x.strip() for x in raw.split(",") if x.strip()]


def json_dumps(obj: Any) -> str:
    """Serialize ``obj`` to JSON."""
    # Tiny local wrapper to avoid importing json at module import time.
    import json

    return json.dumps(obj)
+ +Supports simulate mode (TEARDOWN_SIMULATE): +- EC2-family actions use DryRun=True where supported +- non-DryRun services are treated as no-op in simulate mode (safest) +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from bloodhound.aws import AwsClients +from bloodhound.teardown.planner import PlannedAction + + +@dataclass(frozen=True) +class ExecutionResult: + attempted: int + succeeded: int + failed: int + simulated: int + failures: list[str] + + +def execute_actions(clients: AwsClients, actions: list[PlannedAction], *, simulate: bool) -> ExecutionResult: + """ + Executes planned actions. This is intentionally minimal and best-effort. + Caller must gate this behind APPLY_CHANGES=true. + If simulate=True, no destructive calls are performed: + - For EC2 actions that support DryRun, we pass DryRun=True and treat DryRunOperation as success. + - For actions that do not support DryRun (e.g., RDS delete, ELB delete), we no-op and count as simulated success. + """ + attempted = 0 + succeeded = 0 + failed = 0 + simulated_count = 0 + failures: list[str] = [] + + # ------------------------------------------------------------------ + # Safety guard: prevent large accidental deletions + # + # If the teardown plan contains too many resources, abort execution. + # This protects against scanning bugs or unexpected AWS API results + # that could otherwise cause mass deletion. 
+ # ------------------------------------------------------------------ + from bloodhound.config import load_config + cfg = load_config() + + if len(actions) > cfg.teardown.max_delete_count: + raise RuntimeError( + f"Teardown aborted: plan contains {len(actions)} resources " + f"which exceeds TEARDOWN_MAX_DELETE_COUNT={cfg.teardown.max_delete_count}" + ) + + for a in actions: + attempted += 1 + try: + # simulate=True guarantees we do NOT destroy anything: + # - EC2-family: validate permissions/shape with DryRun=True + # - RDS/ELB deletes: no-op (safest) + did_simulate = _execute_one(clients, a, simulate=simulate) + if did_simulate: + simulated_count += 1 + succeeded += 1 + except Exception as e: + failed += 1 + failures.append(f"{a.service}/{a.region}/{a.resource_type}/{a.id} -> {a.action}: {e}") + + return ExecutionResult( + attempted=attempted, + succeeded=succeeded, + failed=failed, + simulated=simulated_count, + failures=failures, + ) + + +def _execute_one(clients: AwsClients, a: PlannedAction, *, simulate: bool) -> bool: + # Map actions to boto3 service clients. + if a.action in {"terminate_instances", "delete_volume", "release_address", "delete_nat_gateway"}: + ec2 = clients.client("ec2", region=a.region) + if simulate: + params = dict(a.params) + params["DryRun"] = True + try: + getattr(ec2, a.action)(**params) + except Exception as e: + if _is_dry_run_success(e): + return True + raise + # If the API actually performed the action, something is wrong; treat as error to be safe. + raise RuntimeError("DryRun did not raise; refusing to continue in simulate mode") + getattr(ec2, a.action)(**a.params) + return False + + if a.action == "delete_db_instance": + if simulate: + # RDS delete does not support DryRun. No-op for safety. + return True + rds = clients.client("rds", region=a.region) + getattr(rds, a.action)(**a.params) + return False + + if a.action == "delete_load_balancer": + if simulate: + # ELBv2 delete does not support DryRun. No-op for safety. 
+ return True + elb = clients.client("elbv2", region=a.region) + getattr(elb, a.action)(**a.params) + return False + + raise ValueError(f"Unsupported action: {a.action}") + + +def _is_dry_run_success(exc: Exception) -> bool: + """ + boto3 typically raises botocore.exceptions.ClientError with error code DryRunOperation. + We avoid importing botocore types here; we just pattern-match on known attributes. + """ + resp = getattr(exc, "response", None) + if not isinstance(resp, dict): + return False + err = resp.get("Error") or {} + code = err.get("Code") + return code == "DryRunOperation" + + diff --git a/bloodhound/teardown/planner.py b/bloodhound/teardown/planner.py new file mode 100644 index 0000000..84e675f --- /dev/null +++ b/bloodhound/teardown/planner.py @@ -0,0 +1,49 @@ +""" +bloodhound/teardown/planner.py + +Builds a teardown action plan from candidate ResourceRecords. +This is used for both dry-run reporting and apply-mode execution. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from bloodhound.types import ResourceRecord + + +@dataclass(frozen=True) +class PlannedAction: + service: str + region: str + resource_type: str + id: str + arn: str | None + action: str + params: dict + + +def plan_deletions(records: list[ResourceRecord]) -> tuple[list[PlannedAction], list[ResourceRecord]]: + """ + Returns (planned_actions, manual_resources). 
+ """ + actions: list[PlannedAction] = [] + manual: list[ResourceRecord] = [] + for r in records: + if r.delete_supported and r.delete_action and r.delete_params: + actions.append( + PlannedAction( + service=r.service, + region=r.region, + resource_type=r.resource_type, + id=r.id, + arn=r.arn, + action=r.delete_action, + params=r.delete_params, + ) + ) + else: + manual.append(r) + return actions, manual + + diff --git a/bloodhound/types.py b/bloodhound/types.py new file mode 100644 index 0000000..e734eaf --- /dev/null +++ b/bloodhound/types.py @@ -0,0 +1,35 @@ +""" +bloodhound/types.py + +Shared data structures used across scanners/whitelist/teardown. +Keeping these centralized avoids “dict soup” across the codebase. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Optional + + +@dataclass(frozen=True) +class ResourceRecord: + service: str + resource_type: str + region: str + id: str + arn: Optional[str] + state: str + tags: dict[str, str] + + delete_supported: bool = False + delete_action: Optional[str] = None + delete_params: Optional[dict[str, Any]] = None + + +def resource_key(r: ResourceRecord) -> str: + # Prefer ARN when present; else fall back to service/region/id. + if r.arn: + return r.arn + return f"{r.service}:{r.region}:{r.id}" + + diff --git a/bloodhound/whitelist.py b/bloodhound/whitelist.py new file mode 100644 index 0000000..4bc8b99 --- /dev/null +++ b/bloodhound/whitelist.py @@ -0,0 +1,43 @@ +""" +bloodhound/whitelist.py + +Whitelist (keep) logic for Bloodhound v2. + +Kept resources are excluded from teardown planning/execution. +Primary rule is a single tag (configurable via KEEP_TAG_KEY/KEEP_TAG_VALUE). +""" + +from __future__ import annotations + +from bloodhound.config import WhitelistConfig +from bloodhound.types import ResourceRecord, resource_key + + +def is_whitelisted(record: ResourceRecord, cfg: WhitelistConfig) -> bool: + # Explicit allowlist (IDs/ARNs) first. 
+ if record.id in cfg.keep_resource_ids: + return True + if record.arn and record.arn in cfg.keep_resource_ids: + return True + if resource_key(record) in cfg.keep_resource_ids: + return True + + # Tag-based keep (minimal, default). + val = record.tags.get(cfg.keep_tag_key) + if val is None: + return False + # Normalize for "TRUE", "true", etc. + return val.strip().lower() == cfg.keep_tag_value.strip().lower() + + +def filter_whitelisted(records: list[ResourceRecord], cfg: WhitelistConfig) -> tuple[list[ResourceRecord], list[ResourceRecord]]: + kept: list[ResourceRecord] = [] + candidates: list[ResourceRecord] = [] + for r in records: + if is_whitelisted(r, cfg): + kept.append(r) + else: + candidates.append(r) + return candidates, kept + + diff --git a/docs/SLACK_SETUP.md b/docs/SLACK_SETUP.md new file mode 100644 index 0000000..6bb997e --- /dev/null +++ b/docs/SLACK_SETUP.md @@ -0,0 +1,326 @@ +# Slack Setup (Bloodhound-V2) + +## Table of Contents + +- [Recommended Manifest Setup](#recommended-manifest-based-setup-v2) +- [Slack Manifest Design](#slack-manifest-design-architecture-notes) +- [Slack Commands](#slack-commands-bloodhound-v2) +- [Internal Command Mapping](#internal-command-mapping) +- [Teardown Safety Workflow](#teardown-safety-workflow) +- [Create or Update the Slack App](#1-create-or-update-the-slack-app) +- [Install the App](#2-install-the-app) +- [Retrieve Required Secrets](#3-retrieve-required-secrets-manual-step) +- [Invite Bot to Channel](#4-invite-bot-to-channel) +- [Capture Channel IDs](#5-capture-channel-ids) +- [Validate](#6-validate) +- [Legacy Manual Slack Setup](#legacy-manual-slack-setup-deprecated) +- [/v2_status Command](#v2_status--system-status-command) +- [Validate Slack Command Endpoint](#validate-slack-command-endpoint) + +This project uses a Slack App to post scan summaries, budget alerts, and handle slash commands. 
+ +Bloodhound V2 Slack Commands + +/v2_seek + Run AWS resource scan + +/v2_seek_destroy_plan + Preview teardown plan + +/v2_seek_destroy CONFIRM + Execute destructive teardown + +/v2_status + Show system health and configuration + +The preferred setup method is **manifest-based configuration**, which ensures Slack app settings are version-controlled and reproducible. + +--- + +## ✅ Recommended: Manifest-Based Setup (V2) + +The canonical Slack configuration lives in: + +`infra/slack/bloodhound_v2_manifest.json` + +## Slack Manifest Design (Architecture Notes) + +The Slack app configuration is managed via a JSON manifest to ensure reproducibility and version control. + +### Design Principles + +**Least privilege** +Only minimal bot scopes are requested: + +- `chat:write` — post scan and budget messages +- `commands` — enable `/v2_seek`, `/v2_seek_destroy`, `/v2_status` +- `channels:read`, `groups:read`, `im:read`, `mpim:read` — read channel metadata + +Note: Although the Slack commands are `/v2_seek` and `/v2_seek_destroy`, +the internal execution modes remain `seek` and `seek_destroy`. +This preserves compatibility with existing scripts, validation tools, +and documentation across the repository. + +No admin or elevated scopes are requested. + +## Slack Commands (Bloodhound V2) + +The Slack interface exposes the following commands: + +| Command | Description | +|-------|-------------| +| `/v2_seek` | Runs a non-destructive AWS scan and posts results | +| `/v2_seek_destroy_plan` | Generates a teardown preview of resources that would be deleted | +| `/v2_seek_destroy CONFIRM` | Executes destructive cleanup of non-whitelisted resources | +| `/v2_status` | Returns service status and health information | + +Example usage: + +/v2_seek + +/v2_seek_destroy_plan + +/v2_seek_destroy CONFIRM + +/v2_status + +## Internal Command Mapping + +Although Slack commands are versioned (`/v2_*`), the internal Lambda +execution modes remain unchanged. 
+ +```text +| Slack Command | Internal Mode | +|---------------|--------------| +| /v2_seek | seek | +| /v2_seek_destroy_plan | seek | +| /v2_seek_destroy | seek_destroy | +| /v2_status | status | +``` + +## Teardown Safety Workflow + +Bloodhound uses a three-step teardown workflow to prevent accidental +destructive operations. + +Typical workflow: + +1. /v2_seek + Perform a scan and generate a teardown preview. + +2. /v2_seek_destroy_plan + Review the full deletion plan for non-whitelisted resources. + +3. /v2_seek_destroy CONFIRM + Execute the teardown plan and delete resources. + +**Stateless HTTP integration** +- `socket_mode_enabled = false` +- Slash commands use a Lambda Function URL (HTTPS endpoint). + +Requests are received by the Lambda handler and routed through +the Bloodhound event router to the Slack command handler. + +This keeps the architecture simple and serverless. + +**Non-interactive design** +- No buttons, modals, or interactive components. +- All actions are explicit slash commands. + +**Deployment flow** +The `url` fields in slash commands are placeholders during setup. + +After deploying the infrastructure, retrieve the Lambda endpoint with: + +terraform output bloodhound_lambda_url + +Use this URL as the Request URL for all slash commands. + +### 1. Create or Update the Slack App + +1. Go to https://api.slack.com/apps +2. Click **Create New App** +3. Choose **From an app manifest** +4. Select your workspace +5. Paste the contents of: + `infra/slack/bloodhound_v2_manifest.json` +6. Click **Create** + +If the app already exists: +- Go to **App Manifest** +- Replace contents with the repo JSON +- Click **Save Changes** + +### 2. Install the App + +1. Go to **Install App** +2. Click **Install to Workspace** +3. Approve permissions + +### 3.
Retrieve Required Secrets (Manual Step) + +After installation: + +From **OAuth & Permissions**: +- Copy **Bot User OAuth Token** → `SLACK_BOT_TOKEN` + +From **Basic Information**: +- Copy **Signing Secret** → `SLACK_SIGNING_SECRET` + +Store securely. +Do NOT commit these to git. + +### 4. Invite Bot to Channel + +In Slack: + +`/invite @bloodhoundv2` + +### 5. Capture Channel IDs + +Right-click channel → Copy link + +Extract channel ID from URL. + +Set: + +```md +SLACK_SCAN_CHANNEL_ID= +SLACK_ALERT_CHANNEL_ID= + +``` + +### 6. Validate + +Run locally: + +`tools/run_local.py` + +Confirm Slack messages appear. + +# ⚠️ Legacy Manual Slack Setup (Deprecated) + +The steps below reflect the original manual Slack configuration +approach used prior to manifest-based setup. + +Use only if manifest configuration is unavailable. + +## Slack setup (from scratch) + +This project’s main `README.md` assumes you already have: + +- a Slack bot token (`SLACK_BOT_TOKEN`) +- Slack channel IDs for `SLACK_SCAN_CHANNEL_ID` and `SLACK_ALERT_CHANNEL_ID` + +Use this guide only if you need to create/configure the Slack app/bot from scratch. + +--- + +### 1) Create a Slack App + +1. Go to [Slack API: Applications](https://api.slack.com/apps). +2. Click **Create New App**. +3. Choose **From scratch**. +4. Name the app and select the workspace. +5. Click **Create App**. + +--- + +### 2) Add bot permissions + +1. In your Slack app settings, go to **OAuth & Permissions**. +2. Under **Scopes**, add bot token scopes: + - `chat:write` + - `channels:read` + - `groups:read` +3. Click **Install App to Workspace** and approve. +4. Copy the **Bot User OAuth Token** (this is your `SLACK_BOT_TOKEN`). + +--- + +### 3) Get the Slack Channel ID + +1. In Slack, open the channel. +2. Copy the channel ID from the channel details (or URL). +3. 
Set: + - `SLACK_SCAN_CHANNEL_ID=` + - `SLACK_ALERT_CHANNEL_ID=` (can be the same or different) + +--- + +### 4) Invite the bot to the channel + +In the channel, run: + +- `/invite @your-bot-name` + +--- + +## `/v2_status` — System Status Command + +The `/v2_status` command returns the current operational state of the +Bloodhound system. + +This command is read-only and does not perform any AWS actions. + +The status report includes: + +- current execution mode (DRY RUN, SIMULATION, or DESTRUCTIVE APPLY) +- deletion safety limits +- AWS account verification status +- summary of the most recent scan + +Example: + +/v2_status + +Example response: + +Bloodhound v2 — System Status + +System Mode +mode: DRY RUN +apply_changes: false +simulate: true + +Safety Guards +max_deletion_limit: 10 +expected_account_id: 123456789012 +account_verified: true + +## Validate Slack Command Endpoint + +After deployment, confirm the Lambda endpoint is active. + +Example: + +curl $(terraform output -raw bloodhound_lambda_url)/health + +Expected response: + +```json +{ + "ok": true, + "service": "BloodhoundLambdaV2", + "status": "healthy" +} +``` + +--- + +## Lambda Execution Logging + +Bloodhound Lambda executions emit structured log markers: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Examples: + +[BLOODHOUND][SLACK][request_id=...] +[BLOODHOUND][SCAN][request_id=...] +[BLOODHOUND][STATUS][request_id=...] + +The request_id corresponds to the AWS Lambda invocation ID +(context.aws_request_id) and allows engineers to trace +individual executions through CloudWatch logs. + diff --git a/docs/architecture_overview.md b/docs/architecture_overview.md new file mode 100644 index 0000000..dba0822 --- /dev/null +++ b/docs/architecture_overview.md @@ -0,0 +1,122 @@ +# Bloodhound Architecture Overview + +Bloodhound is a serverless automation system designed to detect and safely +remove unused AWS infrastructure. 
+ +The system is implemented as an AWS Lambda service with multiple execution +entry points. + +--- + +## System Entry Points + +Bloodhound can be triggered from several sources: + +```text +| Source | Purpose | +|------|------| +| Slack Commands | Interactive operations | +| GitHub Actions | CI/CD automation | +| Scheduled Events | Automatic scans | +| Validation Workflows | Safe teardown testing | +``` + +All entry points eventually invoke the same internal pipeline. + +--- + +## High-Level Execution Flow + +```text +Slack / GitHub Actions / Scheduled Event + │ + ▼ + AWS Lambda Function URL + │ + ▼ + Event Router + (handlers.lambda_function) + │ + ▼ + Bloodhound Execution Pipeline + (bloodhound.app) + │ + ▼ + ┌─────────────────────────────────────┐ + │ Pipeline Stages │ + │ │ + │ 1. Resource Scan │ + │ scanner/ │ + │ │ + │ 2. Budget Evaluation │ + │ budget_service │ + │ │ + │ 3. Teardown Planning │ + │ teardown/planner │ + │ │ + │ 4. Execution (optional) │ + │ teardown/executor │ + │ │ + └─────────────────────────────────────┘ + │ + ▼ + Slack / Logs / Reports +``` + +## Safety Architecture + +Bloodhound is designed to operate safely by default. + +Destructive actions require multiple conditions: + +- APPLY_CHANGES=true +- TEARDOWN_SIMULATE=false +- CONFIRM token in Slack command + +These guardrails ensure that infrastructure deletion cannot occur accidentally. + +## Resource Scanning + +The scanner layer currently supports: + +- EC2 +- ELBv2 +- RDS + +Future scanners may include: + +- EBS snapshots +- unused AMIs +- orphaned ENIs +- S3 buckets + +## Deployment Model + +Bloodhound is deployed as an AWS Lambda function using Terraform.
+ +Deployment pipeline: + +```text +scripts/build_lambda.sh + │ + ▼ +.build/lambda_pkg + │ + ▼ +archive_file + │ + ▼ +.build/bloodhound_lambda_v2.zip + │ + ▼ +AWS Lambda Deployment +``` + +## Key Design Principles + +Bloodhound was designed around several core principles: + +Safety-first infrastructure automation +Serverless operation +Slack-driven operations +Deterministic Lambda builds +Auditable infrastructure changes \ No newline at end of file diff --git a/docs/bloodhound_v2_plan.md b/docs/bloodhound_v2_plan.md new file mode 100644 index 0000000..ac25688 --- /dev/null +++ b/docs/bloodhound_v2_plan.md @@ -0,0 +1,479 @@ +# Bloodhound v2 Plan (tracked implementation checklist) + +## Table of Contents + +- [Current Reality (What v2 Does Today)](#0-current-reality-what-v2-does-today) +- [v2 Guiding Principles](#1-v2-guiding-principles-keep-it-minimal) +- [v2 Configuration](#2-v2-configuration-environment-variables) +- [Target Architecture](#3-target-architecture-whats-implemented) +- [Resource Record Schema](#4-resource-record-schema-current) +- [Resources to Scan](#5-resources-to-scan-status) +- [Teardown Delete Policy](#6-teardown-delete-policy-status) +- [Milestones](#7-milestones-tracked-checklist) +- [Open Questions / Future Improvements](#8-open-questions--next-improvements-optional) + +This document is the shared plan for evolving Bloodhound from v1.0 → v2.x while keeping the project simple and readable. + +Repo: `https://github.com/codeplatoon-devops/Bloodhound.git` + +--- + +## 0) Current reality (what v2 does today) + +Bloodhound v2 is already deployed as a **new** Lambda (`BloodhoundLambdaV2`) so it does not touch v1. + +### Invocation paths + +- **Scheduled**: A scheduled event invokes `BloodhoundLambdaV2`. 
+ + Scheduled events may originate from: + + - EventBridge (production scheduler) + - GitHub Actions validation workflows + + These events route through a dedicated scheduled handler: + + scheduled_handler → run_scheduled_scan() → execute_pipeline() + + Scheduled executions do NOT pass through the generic `run()` function + to prevent recursive execution. + +- **On-demand**: Slack slash commands (`/v2_seek`, `/v2_seek_destroy_plan`, `/v2_seek_destroy CONFIRM`, `/v2_status`) hit a **Lambda Function URL**. + +### What it scans (per region) + +- **EC2 instances** (skips stopped/terminated) +- **EBS volumes** (unattached/`available`) +- **Elastic IPs** (unassociated) +- **NAT Gateways** (active states) +- **RDS DB instances** +- **ELBv2 load balancers** + +### What it produces (Slack) + +- **Scan summary** (includes zero counts) +- **Whitelisted resources list** (separate message) +- **Budget summary** (7-month cohort, dynamic monthly allowance) +- **Teardown plan** (always produced) +- **Teardown results** (only when apply-mode executes) + +### Direct validation (CLI) + +The Lambda can be invoked directly using the AWS CLI to validate core +pipeline behavior without Slack or GitHub Actions. + +Example: +```bash +aws lambda invoke \ +--function-name BloodhoundLambdaV2 \ +--region us-west-2 \ +--payload '{"source":"scan"}' \ +--cli-binary-format raw-in-base64-out \ +out.json +``` + +Then: + +`cat out.json` + +Expected result: + +```json +{ + "ok": true, + ... +} +``` + +This confirms: + +- routing logic is correct +- recursion issues are resolved +- pipeline executes successfully + +### Scheduler Validation (GitHub Actions) + +The repository includes a GitHub Actions workflow mode that +simulates scheduled EventBridge invocations. 
+ +Mode: + +`validate_scheduler` + +Example payload used during validation: + +```json +{ + "source": "scheduled" +} +``` + +This validation ensures: + +- scheduled execution runs exactly once +- recursion does not occur +- structured logging is emitted +- request_id tracing appears in CloudWatch logs + +Each invocation produces a structured log marker: + +[BLOODHOUND][SCHEDULED][request_id=...] + +The request_id corresponds to the AWS Lambda invocation ID, +allowing a single execution to be traced across all CloudWatch log lines. + +This workflow allows engineers to validate scheduler behavior +before enabling production EventBridge triggers. + +### Teardown safety rails (v2) + +- Default is **dry-run** (`APPLY_CHANGES=false`) +- Safe testing of apply-mode: **simulate** (`TEARDOWN_SIMULATE=true`) + +Strong safety rails: + +- Explicit allowlist: `TEARDOWN_TARGET_IDS=...` +- Allow-all mode: `TEARDOWN_ALLOW_ALL=true` (dangerous; relies on whitelisting) + +Validation safety model: + +Validation runs are executed through a dedicated validation harness +rather than Slack commands. + +The validation workflow is: + +```text +Terraform creates validation resource + ↓ +Validation script captures resource ID + ↓ +Validation script invokes Lambda with validation payload + ↓ +Lambda validation handler enables controlled destructive mode + ↓ +Teardown restricted to explicit validation targets +``` + +Validation runs enforce three safety checks: + +1. Invocation source must be `validation` +2. Explicit `target_ids` must be provided +3. Targets must exist in scan results AND contain the tag `bloodhound:test=true` + +This ensures destructive validation testing cannot affect production resources. + +Deployment and Slack guards: + +- Terraform refuses to deploy with `APPLY_CHANGES=true` unless `allow_apply_mode=true` is passed explicitly. + This prevents accidental deployments that would enable automated deletion. +- Slash destroy requires: + - Confirm token: `/v2_seek_destroy CONFIRM` (configurable token) + - Optional allowlists: user/channel IDs + +Bloodhound always generates a teardown plan (a list of resources it would delete).
+If deletion is disabled, the plan is shown in Slack but no resources are removed. + +--- + +### Validation architecture (v2) + +Validation testing for teardown operations is automated and does not +use Slack commands. + +Validation events are invoked directly by the validation harness. + +Example validation payload: + +```json +{ + "source": "validation", + "mode": "seek_destroy_validation", + "target_ids": ["i-1234567890"] +} +``` + +The Lambda validation handler performs the following: + +- verifies the event originates from the validation harness +- verifies the correct validation mode +- requires explicit target_ids +- enables destructive execution internally +- restricts deletion scope to validation targets + +Additional safety checks occur during teardown planning: + +- target IDs must exist in scan results +- resources must contain tag `bloodhound:test=true` + +These safeguards ensure validation cannot accidentally delete +non-validation infrastructure. + +## 1) v2 guiding principles (keep it minimal) + +- **Configuration over code edits** + - Anything that varies cohort-to-cohort (start month, channels, regions) must be env-configurable. +- **Safe rollout for destructive actions** + - Default is dry-run. + - Apply-mode is explicit and layered with safety rails. +- **Simple whitelist** + - Primary mechanism is a single tag: `bloodhound:keep=true`. + - Optional escape hatch: env allowlist for IDs/ARNs (keep list) and explicit teardown targets. +- **No unnecessary infrastructure** + - Compute from AWS APIs each run when possible. + - Avoid DB/state unless it materially simplifies the system. + +--- + +## 2) v2 configuration (environment variables) + +This is the canonical list; `env.example` should be treated as the “source of truth” for local runs. 
+ +### Slack + +- **`SLACK_ENABLED`**: `true|false` +- **`SLACK_BOT_TOKEN`**: bot token used to post messages +- **`SLACK_SCAN_CHANNEL_ID`**: channel for scan summaries + whitelisted list +- **`SLACK_ALERT_CHANNEL_ID`**: channel for budget alerts + teardown plan/results + - If unset, defaults to `SLACK_SCAN_CHANNEL_ID` + +### Slack slash commands (Function URL) + +- **`SLACK_SIGNING_SECRET`**: required for Slack signature verification +- **`SLACK_DESTROY_CONFIRM_TOKEN`**: required argument text for `/v2_seek_destroy` (default `CONFIRM`) +- **`SLACK_ALLOWED_USER_IDS`** (optional): comma-separated list of Slack user IDs allowed to run destroy +- **`SLACK_ALLOWED_CHANNEL_IDS`** (optional): comma-separated list of Slack channel IDs allowed to run destroy + +### Regions + +- **`REGION_MODE`**: `explicit` or `discover` + - `explicit`: use `REGIONS` + - `discover`: discover regions dynamically +- **`REGIONS`**: comma-separated region list (only used in `explicit` mode) + +### Whitelist + +- **`KEEP_TAG_KEY`**: default `bloodhound:keep` +- **`KEEP_TAG_VALUE`**: default `true` +- **`KEEP_RESOURCE_IDS`** (optional): comma-separated IDs/ARNs always treated as kept + +### Teardown controls + +- **`APPLY_CHANGES`**: `true|false` (default `false`) +- **`TEARDOWN_SIMULATE`**: `true|false` (default `false` in config; we often use `true` for safe testing) +- **`TEARDOWN_TARGET_IDS`** (optional): comma-separated IDs/ARNs that are the only allowed targets +- **`TEARDOWN_ALLOW_ALL`**: `true|false` (dangerous; delete everything not whitelisted) +- **`TEARDOWN_MAX_DELETE_COUNT`**: maximum number of resources that may be deleted in a single run (see `docs/configuration_system.md`) +- **`RDS_FINAL_SNAPSHOT`**: `true|false` (default `false`) + +### Budget (dynamic cohort) + +- **`COHORT_START_YYYY_MM`**: e.g. `2026-01` +- **`COHORT_TOTAL_BUDGET_USD`**: default `3000` +- **`COHORT_LENGTH_MONTHS`**: default `7` +- **`BUDGET_OVER_DAYS`**: default `2` + +--- + +## 3) Target architecture (what’s implemented) + +### Execution Routing Model + +Bloodhound uses explicit event routing in the Lambda entrypoint.
+ +Event → Lambda Router → Handler → Execution Function + +Key rule: + +- Scheduled events must not pass through the generic `run()` function +- Slack HTTP events are handled by the Slack command handler +- Validation events are handled by the validation harness handler +- Manual invocations (scan/status) execute through the main `run()` pipeline + +This prevents: + +- recursive execution loops +- unintended handler re-entry +- ambiguous control flow + +The Lambda entrypoint performs deterministic event routing to ensure +each invocation type is handled by the correct execution path. + +Example routing flow: + +```text +Lambda handler + ↓ +event routing (Slack / scheduled / validation / default) + ↓ + +Slack: + handle_slack_command_http() + +Scheduled: + scheduled_handler → run_scheduled_scan() + +Validation: + validation handler → controlled teardown validation + +Default: + bloodhound.app.run() +``` + +### Structured Logging + +Bloodhound emits standardized CloudWatch log markers to simplify +debugging and operational tracing. + +Each Lambda invocation logs a structured header: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Examples: + +[BLOODHOUND][SCHEDULED][request_id=abc123] +[BLOODHOUND][SCAN][request_id=xyz456] +[BLOODHOUND][STATUS][request_id=def789] + +The request_id value comes from the AWS Lambda context object +(`context.aws_request_id`) and uniquely identifies each invocation. + +This allows engineers to quickly identify invocation types and +trace individual Lambda executions through CloudWatch logs. 
+ +### Code structure (current) + +- `bloodhound/` + - `app.py` (orchestrator) + - `config.py` (env parsing + defaults + validation) + - `scanner/` (`regions.py`, `scan_all.py`, `ec2.py`, `rds.py`, `elbv2.py`) + - `whitelist.py` + - `budget.py` + - `teardown/` (`planner.py`, `executor.py`) + - `messages.py` (Slack message formatting) + - `slack.py` (Slack API) + - `slack_commands.py` (Function URL entrypoint: verify + route) + - `types.py` (resource record schema) +- `handlers/lambda_function.py` (Lambda handler) +- `tools/run_local.py` (local runner) +- `infra/` (Terraform: build zip, IAM, Lambda, Function URL) + +### Logic flow (every run) + +- **Scan** + - Determine regions + - Collect resources per region/service + - Apply whitelist + - Post scan summary + whitelisted list +- **Budget** + - Use Cost Explorer to compute cohort-to-date spend + dynamic monthly allowance + - Compute run-rate month-end projection and “over budget streak” + - Post budget summary; post alert only when threshold is met +- **Teardown** + - Build a plan (always) + - If apply-mode: execute (or simulate) and post results + +--- + +## 4) Resource record schema (current) + +All scanners produce records with a shared shape (`bloodhound/types.py`): + +- **Required** + - `service`, `resource_type`, `region`, `id`, `state`, `tags` +- **Optional** + - `delete_supported`, `delete_action`, `delete_params` + +--- + +## 5) Resources to scan (status) + +Phase 1 (high value, low complexity) + +- ~~EC2 instances~~ +- ~~RDS instances~~ +- ~~Elastic IPs (unassociated)~~ +- ~~NAT gateways~~ +- ~~EBS volumes (unattached)~~ +- ~~ELBv2 load balancers~~ + +Phase 2 (later; only if needed) + +- EBS snapshots +- AMIs +- ElastiCache +- OpenSearch +- Redshift + +--- + +## 6) Teardown delete policy (status) + +Defaults (as implemented) + +- ~~EC2: terminate~~ +- ~~EBS: delete unattached volumes~~ +- ~~EIP: release if unassociated~~ +- ~~NAT gateway: delete~~ +- ~~Load balancer: delete~~ +- ~~RDS: delete without 
final snapshot by default (`RDS_FINAL_SNAPSHOT=false`)~~ + +--- + +## 7) Milestones (tracked checklist) + +### v2.0 (refactor + config + deploy) + +- ~~Archive v1.0 as runnable snapshot under `versions/v1_0/`~~ +- ~~Modularize into package structure (`bloodhound/`, `handlers/`, `tools/`, `docs/`)~~ +- ~~Env-driven Slack channels (scan vs alert)~~ +- ~~Env-driven region configuration (explicit + discover mode)~~ +- ~~Terraform deploy of a new Lambda (`BloodhoundLambdaV2`)~~ +- ~~Slack message formatting improvements (human-readable, consistent, includes zeros)~~ + +### v2.1 (resource coverage) + +- ~~EC2 instances~~ +- ~~EBS unattached volumes~~ +- ~~EIPs unassociated~~ +- ~~NAT gateways~~ +- ~~RDS instances~~ +- ~~ELBv2~~ + +### v2.2 (whitelist) + +- ~~Tag-based whitelist: `bloodhound:keep=true`~~ +- ~~Optional keep list: `KEEP_RESOURCE_IDS`~~ +- ~~Whitelisted resources posted as a separate list~~ + +### v2.3 (teardown dry-run) + +- ~~Planner produces proposed actions~~ +- ~~Plan posted to Slack~~ +- ~~Full plan logged as JSON~~ + +### v2.4 (teardown apply-mode + safety rails) + +- ~~Apply-mode gated by `APPLY_CHANGES=true`~~ +- ~~Simulate mode available (`TEARDOWN_SIMULATE=true`)~~ +- ~~Target filter safety rail (`TEARDOWN_TARGET_IDS`)~~ +- ~~Allow-all mode (`TEARDOWN_ALLOW_ALL=true`)~~ +- ~~Executor posts results to Slack~~ + +### v2.5 (budget + alerts) + +- ~~Cost Explorer cohort-to-date + dynamic monthly allowance~~ +- ~~Month-end run-rate projection~~ +- ~~Alert on `BUDGET_OVER_DAYS` consecutive days over allowance~~ + +### v2.6 (slash commands) + +- ~~Function URL entrypoint~~ +- ~~Signature verification + replay protection~~ +- ~~`/v2_seek` (non-destructive scan)~~ +- ~~`/v2_seek_destroy_plan` (teardown preview)~~ +- ~~`/v2_seek_destroy CONFIRM` (destructive, guarded)~~ +- ~~`/v2_status` (system status report)~~ +- ~~Optional allowlists for destroy (user/channel IDs)~~ + +--- + +## 8) Open questions / next improvements (optional) + +- Separate scan vs 
teardown IAM roles (read-only vs write) +- Add more resource types if needed (snapshots/AMIs/etc.) +- Optional: separate scheduled scan runs from destructive runs via dedicated workflow/job \ No newline at end of file diff --git a/docs/configuration_system.md b/docs/configuration_system.md new file mode 100644 index 0000000..4d65243 --- /dev/null +++ b/docs/configuration_system.md @@ -0,0 +1,298 @@ +# Bloodhound v2 Configuration Guide + +## Table of Contents + +- [Configuration Sources](#configuration-sources) +- [Teardown Mode Configuration](#teardown-mode-configuration) +- [Deletion Safety Limit](#deletion-safety-limit) +- [AWS Account Safety Guard](#aws-account-safety-guard) +- [Terraform Deployment Safety](#terraform-deployment-safety) +- [Bloodhound Safety Architecture](#bloodhound-safety-architecture) +- [Teardown Execution Flow](#teardown-execution-flow) +- [Related Documentation](#related-documentation) + +This document describes the configuration system used by **Bloodhound v2**, including environment variables, teardown behavior, and operational safety controls. + +Configuration is primarily provided through environment variables. + +For local development, variables are defined in: + +``` +.env +``` + +For AWS deployment, variables are configured in the Lambda **Environment Variables** section. + +An example configuration template is provided in: + +``` +env.example +``` + +--- + +# Configuration Sources + +Bloodhound loads configuration from the following sources: + +| Source | Purpose | +| ---------------------------- | ---------------------------------- | +| `.env` | Local development configuration | +| `env.example` | Template for creating `.env` | +| Lambda environment variables | Production configuration | +| Terraform variables | Infrastructure-level configuration | + +--- + +# Teardown Mode Configuration + +Bloodhound supports both **safe planning mode** and **real deletion mode**. 
+ +Two environment variables control how teardown operations behave: + +``` +APPLY_CHANGES +TEARDOWN_SIMULATE +``` + +These variables must follow specific combinations. + +| APPLY_CHANGES | TEARDOWN_SIMULATE | Meaning | +| ------------- | ----------------- | ------------------------------------- | +| false | true | Safe dry-run mode (default operation) | +| true | false | Real deletion mode | +| false | false | Allowed but uncommon configuration (builds a plan but performs no deletion) | +| true | true | ❌ Invalid configuration | + +If both values are set to `true`, the configuration becomes contradictory. + +Example: + +``` +APPLY_CHANGES=true +TEARDOWN_SIMULATE=true +``` + +This would instruct Bloodhound to: + +* perform destructive actions +* simulate destructive actions + +This configuration is invalid. + +The validation scripts will refuse to run if this state is detected. + +--- + +# Deletion Safety Limit + +Bloodhound includes a safety rail that limits how many resources may be deleted in a single run. + +Environment variable: + +``` +TEARDOWN_MAX_DELETE_COUNT +``` + +Example: + +``` +TEARDOWN_MAX_DELETE_COUNT=5 +``` + +If a teardown plan contains more resources than this limit, execution will stop. + +This protects against: + +* scanning bugs +* AWS API anomalies +* incorrect filtering logic +* accidental large-scale deletion events + +--- + +# AWS Account Safety Guard + +Validation scripts include a safety guard that verifies the AWS account ID before executing destructive tests. + +Environment variable: + +``` +EXPECTED_AWS_ACCOUNT_ID +``` + +Example: + +``` +EXPECTED_AWS_ACCOUNT_ID=123456789012 +``` + +When validation scripts run, they compare the current AWS credentials against this value. + +If the account does not match, execution stops. + +This prevents validation scripts from running against the wrong AWS account. 
+ +--- + +# Terraform Deployment Safety + +Terraform includes an additional safety guard that prevents deployments when destructive mode is enabled. + +Variable: + +``` +allow_apply_mode +``` + +Bloodhound can delete resources when: + +``` +APPLY_CHANGES=true +``` + +However Terraform will refuse deployment unless the engineer explicitly confirms the action. + +Example deployment command: + +``` +terraform apply -var allow_apply_mode=true +``` + +This prevents accidental enabling of destructive mode. + +--- + +# Bloodhound Safety Architecture + +## Execution Path Safety (Important) + +Bloodhound separates execution paths based on invocation type to ensure +safe and deterministic behavior. + +- Slack commands may use async self-invocation to return immediately +- Scheduled executions run synchronously through a dedicated path +- Validation harness invocations execute through a dedicated validation handler +- Manual operations (scan/status) execute through the main pipeline + +Scheduled events do NOT pass through the generic `run()` function. + +Instead they follow the dedicated execution path: + +scheduled_handler → run_scheduled_scan() → execute_pipeline() + +Scheduled events may originate from: + +- EventBridge (production scheduler) +- GitHub Actions scheduler validation workflows (`validate_scheduler` mode) + +This separation prevents recursive execution and ensures that scheduled +runs cannot re-enter the Lambda routing layer. + +Bloodhound includes multiple independent safety mechanisms designed to prevent accidental infrastructure deletion. + +These controls operate at different layers of the system. 
+ +| Safety Layer | Purpose | +| -------------------------------- | ---------------------------------------------------------- | +| Slack confirmation token | prevents accidental teardown commands | +| validation harness isolation | restricts automated destructive tests to validation runs | +| max deletion count | prevents mass deletion events | +| Terraform apply guard | prevents destructive deployment configuration | +| Terraform account guard | prevents deploying infrastructure in the wrong AWS account | +| validation script account guard | prevents running validation tests in the wrong AWS account | +| config consistency guard | prevents invalid teardown configuration | + +These protections are intentionally redundant to provide multiple layers of safety. + +If one safety mechanism fails or is bypassed, others remain in place. + +This **defense-in-depth model** is common in internal cloud automation systems. + +--- + +# Lambda Logging and Traceability + +Bloodhound emits standardized CloudWatch log markers for all Lambda +invocations. + +Each invocation includes a structured log header: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Examples: + +[BLOODHOUND][SCHEDULED][request_id=abc123] +[BLOODHOUND][SCAN][request_id=xyz456] +[BLOODHOUND][STATUS][request_id=def789] + +Including the Lambda `request_id` (from `context.aws_request_id`) +allows engineers to trace individual executions through CloudWatch +logs and quickly identify the event type being processed. + +This logging format significantly improves operational debugging and +scheduler validation. + +--- + +# Teardown Execution Flow + +The teardown process follows this sequence of safety checks. 
+ +```text +Engineer / Validation Harness + │ + ▼ +Lambda Invocation + ├─ Slack Command (/v2_seek_destroy CONFIRM) + └─ Validation Harness Invocation + │ + ▼ +Lambda Router + │ + ▼ +Teardown Execution Path + │ + ▼ +Slack Confirmation Guard + │ + ▼ +Configuration Consistency Guard + │ + ▼ +Deletion Limit Guard + │ + ▼ +AWS API Delete Calls + │ + ▼ +CloudWatch Logging +``` + +Each stage ensures that destructive operations occur only when explicitly intended. + +--- + +# Related Documentation + +Detailed validation procedures are documented in the following guides. + +Slack command validation: + +``` +docs/slack_and_lambda_validation.md +``` + +Teardown validation workflow: + +``` +docs/validate_teardown.md +``` + +System architecture overview: + +``` +docs/bloodhound_v2_plan.md +``` + +--- diff --git a/docs/demo/cloudwatch_log_navigation.pdf b/docs/demo/cloudwatch_log_navigation.pdf new file mode 100644 index 0000000..d367754 Binary files /dev/null and b/docs/demo/cloudwatch_log_navigation.pdf differ diff --git a/docs/demo/teardown_validation_workflow.pdf b/docs/demo/teardown_validation_workflow.pdf new file mode 100644 index 0000000..ba029ef Binary files /dev/null and b/docs/demo/teardown_validation_workflow.pdf differ diff --git a/docs/demo_build.sh b/docs/demo_build.sh new file mode 100644 index 0000000..a135702 --- /dev/null +++ b/docs/demo_build.sh @@ -0,0 +1,203 @@
+# Bloodhound demo fixture builder.
+#
+# Creates throwaway AWS resources for exercising Bloodhound scans:
+#   - two EC2 instances per region in REGIONS (one tagged as kept, one not)
+#   - two RDS PostgreSQL instances (kept in us-east-1, not kept in us-west-2)
+#
+# Replace "changeme" with the AWS CLI profile of the target account. The
+# script body is one single-quoted argument to bash, so it must not contain
+# a bare apostrophe (create_rds shows the close-escape-reopen form).
+env AWS_PROFILE=changeme bash -lc '
+set -euo pipefail
+
+# Tag that marks a resource as whitelisted ("kept") for Bloodhound.
+KEEP_TAG_KEY="bloodhound:keep"
+KEEP_TAG_VALUE="true"
+
+REGIONS=(us-east-1 us-east-2 us-west-1 us-west-2)
+# Timestamp suffix that keeps resource names unique across runs.
+TS="$(date +%Y%m%d%H%M%S)"
+
+# Print a message prefixed with the current wall-clock time.
+say() { echo "[$(date +%H:%M:%S)] $*"; }
+
+# Print the default VPC ID of a region (callers treat "None" as missing).
+get_default_vpc_id() {
+  local region="$1"
+  aws --region "$region" ec2 describe-vpcs \
+    --filters Name=isDefault,Values=true \
+    --query "Vpcs[0].VpcId" --output text
+}
+
+# Print the ID of the security group named "default" in the given VPC.
+get_default_sg_id() {
+  local region="$1" vpc_id="$2"
+  aws --region "$region" ec2 describe-security-groups \
+    --filters Name=vpc-id,Values="$vpc_id" Name=group-name,Values=default \
+    --query "SecurityGroups[0].GroupId" --output text
+}
+
+# Print the default-for-AZ subnet IDs of the given VPC on one line.
+get_default_subnets() {
+  local region="$1" vpc_id="$2"
+  aws --region "$region" ec2 describe-subnets \
+    --filters Name=vpc-id,Values="$vpc_id" Name=default-for-az,Values=true \
+    --query "Subnets[].SubnetId" --output text
+}
+
+# Print the AMI ID stored in the public Amazon Linux 2023 SSM parameter.
+get_al2023_ami() {
+  local region="$1"
+  aws --region "$region" ssm get-parameter \
+    --name "/aws/service/ami-amazon-linux-latest/al2023-ami-kernel-6.1-x86_64" \
+    --query "Parameter.Value" --output text
+}
+
+# Launch one t3.micro instance and print its instance ID.
+# Args: region subnet_id sg_id ami_id keep(true|false) name
+create_ec2() {
+  local region="$1" subnet_id="$2" sg_id="$3" ami_id="$4" keep="$5" name="$6"
+  local tag_spec
+
+  # Only the kept instance carries the whitelist tag.
+  if [[ "$keep" == "true" ]]; then
+    tag_spec="ResourceType=instance,Tags=[{Key=Name,Value=${name}},{Key=${KEEP_TAG_KEY},Value=${KEEP_TAG_VALUE}},{Key=bloodhound:demo,Value=true},{Key=bloodhound:created_by,Value=cursor}]"
+  else
+    tag_spec="ResourceType=instance,Tags=[{Key=Name,Value=${name}},{Key=bloodhound:demo,Value=true},{Key=bloodhound:created_by,Value=cursor}]"
+  fi
+
+  local vol_tag_spec="ResourceType=volume,Tags=[{Key=Name,Value=${name}-root},{Key=bloodhound:demo,Value=true},{Key=bloodhound:created_by,Value=cursor}]"
+
+  aws --region "$region" ec2 run-instances \
+    --image-id "$ami_id" \
+    --instance-type "t3.micro" \
+    --subnet-id "$subnet_id" \
+    --security-group-ids "$sg_id" \
+    --count 1 \
+    --tag-specifications "$tag_spec" "$vol_tag_spec" \
+    --query "Instances[0].InstanceId" --output text
+}
+
+# Create the named RDS DB subnet group from the first two subnets, unless it
+# already exists. Args: region name subnet_ids (whitespace-separated)
+ensure_rds_subnet_group() {
+  local region="$1" name="$2" subnet_ids_csv="$3"
+
+  if aws --region "$region" rds describe-db-subnet-groups --db-subnet-group-name "$name" >/dev/null 2>&1; then
+    return 0
+  fi
+
+  # Intentional word splitting: the IDs arrive whitespace-separated.
+  # shellcheck disable=SC2206
+  local subnet_ids=(${subnet_ids_csv})
+  if [[ ${#subnet_ids[@]} -lt 2 ]]; then
+    echo "ERROR: Need at least 2 default subnets for RDS subnet group in region=${region}, got: ${subnet_ids_csv}" >&2
+    return 1
+  fi
+
+  aws --region "$region" rds create-db-subnet-group \
+    --db-subnet-group-name "$name" \
+    --db-subnet-group-description "Bloodhound demo subnet group (${region})" \
+    --subnet-ids "${subnet_ids[0]}" "${subnet_ids[1]}" \
+    --tags Key=bloodhound:demo,Value=true Key=bloodhound:created_by,Value=cursor >/dev/null
+}
+
+# Create one db.t3.micro PostgreSQL instance and print its identifier.
+# Args: region subnet_group sg_id keep(true|false) identifier
+create_rds() {
+  local region="$1" subnet_group="$2" sg_id="$3" keep="$4" identifier="$5"
+
+  local username="bloodhound"
+  local password
+  # Random password of up to 20 characters; tr drops slash, at sign, double quote and space.
+  password="$(openssl rand -base64 24 | tr -d '\''/@" '\'' | cut -c1-20)"
+
+  local tags=(Key=bloodhound:demo,Value=true Key=bloodhound:created_by,Value=cursor)
+  # Only the kept instance carries the whitelist tag; both get a Name tag.
+  if [[ "$keep" == "true" ]]; then
+    tags+=(Key="${KEEP_TAG_KEY}",Value="${KEEP_TAG_VALUE}")
+  fi
+  tags+=(Key=Name,Value="${identifier}")
+
+  aws --region "$region" rds create-db-instance \
+    --db-instance-identifier "$identifier" \
+    --engine postgres \
+    --db-instance-class db.t3.micro \
+    --allocated-storage 20 \
+    --master-username "$username" \
+    --master-user-password "$password" \
+    --no-publicly-accessible \
+    --backup-retention-period 1 \
+    --storage-type gp2 \
+    --db-subnet-group-name "$subnet_group" \
+    --vpc-security-group-ids "$sg_id" \
+    --tags "${tags[@]}" \
+    --query "DBInstance.DBInstanceIdentifier" --output text
+}
+
+say "Confirming caller identity (profile=${AWS_PROFILE:-default})"
+aws sts get-caller-identity --output json
+
+say "Creating EC2 instances (2 per region: 1 whitelisted, 1 not)"
+for region in "${REGIONS[@]}"; do
+  say "Region: ${region}"
+
+  vpc_id="$(get_default_vpc_id "$region")"
+  if [[ -z "$vpc_id" || "$vpc_id" == "None" ]]; then
+    echo "ERROR: No default VPC found in region=${region}" >&2
+    exit 1
+  fi
+
+  sg_id="$(get_default_sg_id "$region" "$vpc_id")"
+  subnets="$(get_default_subnets "$region" "$vpc_id")"
+  # pick the first subnet for EC2 placement
+  subnet_id="$(awk "{print \$1}" <<<"$subnets")"
+  if [[ -z "$subnet_id" || "$subnet_id" == "None" ]]; then
+    echo "ERROR: No default subnet found in region=${region}" >&2
+    exit 1
+  fi
+
+  ami_id="$(get_al2023_ami "$region")"
+
+  keep_name="bloodhound-demo-ec2-keep-${region}-${TS}"
+  nokeep_name="bloodhound-demo-ec2-nokeep-${region}-${TS}"
+
+  keep_id="$(create_ec2 "$region" "$subnet_id" "$sg_id" "$ami_id" true "$keep_name")"
+  say " EC2 (kept) ${keep_id} name=${keep_name}"
+
+  nokeep_id="$(create_ec2 "$region" "$subnet_id" "$sg_id" "$ami_id" false "$nokeep_name")"
+  say " EC2 (not kept) ${nokeep_id} name=${nokeep_name}"
+done
+
+say "Creating 2 RDS instances (one kept, one not)"
+RDS_KEEP_REGION="us-east-1"
+RDS_NOKEEP_REGION="us-west-2"
+
+# Make sure both RDS regions have a subnet group before creating instances.
+for region in "$RDS_KEEP_REGION" "$RDS_NOKEEP_REGION"; do
+  say "RDS Region: ${region}"
+  vpc_id="$(get_default_vpc_id "$region")"
+  sg_id="$(get_default_sg_id "$region" "$vpc_id")"
+  subnets="$(get_default_subnets "$region" "$vpc_id")"
+
+  subnet_group="bloodhound-demo-subnetgrp-${region}"
+  ensure_rds_subnet_group "$region" "$subnet_group" "$subnets"
+done
+
+keep_rds_id="bloodhound-demo-rds-keep-${TS}"
+nokeep_rds_id="bloodhound-demo-rds-nokeep-${TS}"
+
+vpc_id="$(get_default_vpc_id "$RDS_KEEP_REGION")"
+sg_id="$(get_default_sg_id "$RDS_KEEP_REGION" "$vpc_id")"
+subnet_group="bloodhound-demo-subnetgrp-${RDS_KEEP_REGION}"
+created_keep_rds="$(create_rds "$RDS_KEEP_REGION" "$subnet_group" "$sg_id" true "$keep_rds_id")"
+say " RDS (kept) ${created_keep_rds} region=${RDS_KEEP_REGION}"
+
+vpc_id="$(get_default_vpc_id "$RDS_NOKEEP_REGION")"
+sg_id="$(get_default_sg_id "$RDS_NOKEEP_REGION" "$vpc_id")"
+subnet_group="bloodhound-demo-subnetgrp-${RDS_NOKEEP_REGION}"
+created_nokeep_rds="$(create_rds "$RDS_NOKEEP_REGION" "$subnet_group" "$sg_id" false "$nokeep_rds_id")"
+say " RDS (not kept) ${created_nokeep_rds} region=${RDS_NOKEEP_REGION}"
+
+say "Done. (RDS will take several minutes to become available.)"
+'
\ No newline at end of file diff --git a/docs/future_improvements.md b/docs/future_improvements.md new file mode 100644 index 0000000..2e5e78b --- /dev/null +++ b/docs/future_improvements.md @@ -0,0 +1,475 @@ +# Bloodhound V2 – Future Infrastructure Hardening (TODO) + +WIP ..
HERE AS A REMINDER + +This document records security and platform improvements that should be implemented **after the V2 migration is stable**. + +These items were intentionally deferred during the V2 takeover in order to follow the engineering principle: + +> One axis of change at a time. + +The initial goal was to successfully deploy and stabilize **BloodhoundLambdaV2** without introducing additional infrastructure changes that could complicate debugging. + +Once V2 has run successfully for several cycles, the following improvements should be implemented. + +--- + +## 1. Move Slack Secrets to AWS Secrets Manager + +### Current State + +Slack secrets are currently provided through Lambda environment variables via Terraform: + +* `SLACK_BOT_TOKEN` +* `SLACK_SIGNING_SECRET` + +These values are supplied through `terraform.tfvars` and therefore appear in Terraform state. + +### Risks + +* Secrets may be stored in Terraform state +* Secrets could be exposed if state storage is misconfigured +* Secrets exist outside AWS secret management systems + +### Target Architecture + +Secrets should be stored in **AWS Secrets Manager** and retrieved at runtime. + +Example secret: + +``` +bloodhound/slack +``` + +Secret value example: + +``` +{ + "bot_token": "...", + "signing_secret": "..." +} +``` + +### Implementation Steps + +1. Create secret in AWS Secrets Manager. +2. Grant Lambda IAM role permission: + +``` +secretsmanager:GetSecretValue +``` + +3. Pass secret ARN to Lambda via environment variable: + +``` +SLACK_SECRET_ARN +``` + +4. Update Lambda code to retrieve secret at runtime. + +Result: + +``` +Secrets Manager + ↓ +Lambda IAM Role + ↓ +Lambda runtime retrieval +``` + +Benefits: + +* Secrets are encrypted +* Access is auditable +* Secrets can be rotated without redeploying Lambda + +--- + +## 2. 
Remove Secrets from Terraform Variables + +Once Secrets Manager is implemented: + +Remove the following from `terraform.tfvars`: + +``` +SLACK_BOT_TOKEN +SLACK_SIGNING_SECRET +``` + +Terraform should only provide the **secret ARN**, not the secret value. + +Example: + +``` +lambda_env = { + SLACK_SECRET_ARN = "arn:aws:secretsmanager:..." +} +``` + +--- + +## 3. Introduce CI/CD Deployment Role (GitHub OIDC) + +### Current State + +Terraform is executed locally using developer AWS credentials. + +### Risks + +* Infrastructure depends on developer access +* Manual deployments increase risk of configuration drift + +### Target Architecture + +Deploy infrastructure through GitHub Actions using **OpenID Connect (OIDC)**. + +Flow: + +``` +GitHub Actions + ↓ +OIDC authentication + ↓ +Assume IAM role + ↓ +Terraform apply +``` + +Benefits: + +* No long-lived AWS credentials +* Infrastructure ownership belongs to the organization +* Deployments become auditable and reproducible + +--- + +## 4. Restrict Lambda IAM Permissions + +The Lambda role currently has broad permissions to support scanning and teardown operations. + +Future improvements should include: + +* splitting scanning and deletion permissions +* limiting actions to specific services used by Bloodhound +* adding explicit resource constraints where possible + +Example separation: + +``` +BloodhoundScanRole +BloodhoundDeleteRole +``` + +This reduces the blast radius of the Lambda. + +--- + +## 5. Configure Terraform Remote State + +Terraform state should be stored remotely instead of locally. + +Recommended backend: + +``` +S3 + DynamoDB state locking +``` + +Benefits: + +* prevents state corruption +* supports multiple maintainers +* enables CI/CD workflows + +--- + +## 6. Add Structured Logging + +Lambda logs should include structured JSON logs to improve observability. 
+ +Example fields: + +``` +event_type +command +region +resource_count +teardown_plan +execution_mode +``` + +This improves debugging and monitoring. + +--- + +## 7. Implement Slack Request Validation Monitoring + +Slack signature verification is already implemented. + +Future improvement: + +Log validation failures with rate limits to detect potential abuse. + +--- + +## Summary + +These improvements move Bloodhound toward a production-grade serverless architecture. + +Target state: + +``` +Slack + ↓ +Lambda Function URL + ↓ +BloodhoundLambdaV2 + ↓ +AWS APIs + +Secrets: +AWS Secrets Manager + +Deployments: +GitHub Actions (OIDC) + +Infrastructure: +Terraform with remote state +``` + +These changes should only be implemented **after the V2 migration is confirmed stable**. + +## Future Infrastructure Hardening + +Bloodhound v2 currently prioritizes **deployment stability and operational safety**. + +Several platform improvements were intentionally deferred during the V2 migration in order to follow the engineering principle: + +> One axis of change at a time. + +The current architecture already includes operational safeguards such as: + +* Lambda version publishing +* a production alias (`prod`) +* Terraform-controlled rollback support +* immutable Lambda versions +* Slack integration that does not depend on version changes + +These mechanisms ensure that deployments and rollbacks remain safe while the new V2 infrastructure stabilizes. + +Additional security and platform improvements are planned once the system has been validated in production. + +Examples include: + +* migrating Slack secrets to **AWS Secrets Manager** +* introducing **GitHub OIDC deployment roles** +* configuring **Terraform remote state (S3 + DynamoDB)** +* tightening Lambda IAM permissions +* improving observability and structured logging + + +The AWS account runtime guard is recorded below alongside the other **post-stabilization hardening tasks** because it is a **runtime safety control** and is not required to stabilize V2. 
+ +--- + +## 8. Implement AWS Account Runtime Guard + +### Current State + +Bloodhound v2 includes **documentation and validation-script guards** that verify the AWS account ID before executing destructive operations. + +However, the **Lambda runtime itself does not yet enforce this check**. + +The `/v2_status` command currently displays: + +``` +expected_account_id: not-configured +``` + +This indicates that the runtime guard has not yet been connected to the configuration system. + +The guard is partially implemented in the codebase but has not yet been integrated into: + +``` +config.py +app.py +``` + +--- + +### Risks + +Without a runtime account guard, it is theoretically possible for Bloodhound to execute against the wrong AWS account if credentials are misconfigured. + +Examples: + +``` +running teardown against production instead of development +running validation scripts against the wrong account +``` + +Other safeguards already exist (dry-run defaults, deletion limits, Terraform guards), but adding an account verification layer provides **additional defense-in-depth**. + +--- + +### Target Architecture + +Bloodhound should verify the AWS account identity during Lambda startup before performing any scan or teardown operations. + +Runtime flow: + +``` +Lambda start + ↓ +Load configuration + ↓ +Retrieve AWS caller identity (STS) + ↓ +Compare to EXPECTED_AWS_ACCOUNT_ID + ↓ +Abort execution if mismatch +``` + +--- + +### Implementation Steps + +1. Add environment variable: + +``` +EXPECTED_AWS_ACCOUNT_ID +``` + +Example: + +``` +EXPECTED_AWS_ACCOUNT_ID=123456789012 +``` + +--- + +2. Extend `AwsConfig` in `config.py`: + +``` +expected_account_id: Optional[str] +``` + +--- + +3. 
Load the variable during configuration loading. + +Example: + +``` +expected_account_id = _env("EXPECTED_AWS_ACCOUNT_ID") +``` + +--- + +4. Add runtime verification in `app.py` after configuration loading. + +Example logic: + +``` +sts = clients.sts +identity = sts.get_caller_identity() +runtime_account_id = identity["Account"] + +if expected_account_id and runtime_account_id != expected_account_id: + abort execution +``` + +--- + +5. Update `/v2_status` to display: + +``` +expected_account_id +runtime_account_id +account_verified +``` + +Example output: + +``` +Safety Guards + +max_deletion_limit: 5 +expected_account_id: 123456789012 +runtime_account_id: 123456789012 +account_verified: true +``` + +--- + +### Benefits + +This guard prevents infrastructure automation from operating in the wrong AWS account. + +It is particularly valuable in environments with multiple accounts such as: + +``` +development +staging +production +``` + +This mechanism is commonly used in internal cloud automation systems to provide an additional safety layer before destructive operations. + +--- + +### Deployment Timing + +This change should be implemented **after V2 stability is confirmed**. + +It introduces additional AWS API calls (`sts:GetCallerIdentity`) and runtime checks, which were intentionally deferred during the V2 migration to minimize moving parts. + +--- + +# Resulting Safety Architecture + +Once implemented, Bloodhound will include the following independent safety mechanisms: + +``` +Slack confirmation guard +Deletion limit guard +Terraform destructive-mode guard +Validation script account guard +Runtime AWS account verification +``` + +This creates a **defense-in-depth safety model** for automated infrastructure cleanup. + +## 9. Terraform Execution Timeout Guard + +The validation harness now runs Terraform through a timeout wrapper +to prevent CI/CD pipelines from hanging indefinitely. 
+ +Example: + +timeout 600 terraform apply + +If Terraform becomes stuck due to provider issues, network stalls, +or AWS API delays, the validation workflow will automatically abort. + +This protects CI systems from stuck jobs and ensures validation runs +fail fast when infrastructure operations do not complete in time. + + +## Observability Improvements (Future) + +Enhance Bloodhound runtime visibility and debugging support. + +Planned upgrades: + +- Replace stdout prints with structured logging +- Add log levels (INFO, WARNING, ERROR) +- Include request_id and execution context +- Emit scan/budget/teardown metrics as structured JSON +- Enable optional CloudWatch Insights queries +- Add CI pipeline visibility for scan summaries + +These improvements will make it easier to troubleshoot production +runs and analyze historical scans. + diff --git a/docs/github_actions.md b/docs/github_actions.md new file mode 100644 index 0000000..49bd967 --- /dev/null +++ b/docs/github_actions.md @@ -0,0 +1,326 @@ +# GitHub Actions Automation + +This document describes the GitHub Actions workflows used by the +Bloodhound system to invoke the AWS Lambda scanner and perform +automated operational checks. + +The workflows provide a safe and auditable mechanism for triggering +infrastructure scans and validation runs directly from the repository. + +--- + +## Overview + +The GitHub workflow invokes the Lambda function: + + BloodhoundLambdaV2 + +This version contains the V2 scanning logic and Slack-integrated +operational commands. + +The Lambda determines its execution behavior based on the event payload: + + event["source"] + +Example event payload: + + { "source": "scan" } + +This allows the same Lambda function to support multiple operational +execution modes. + +--- +## Architecture + +The GitHub workflow acts as an external trigger for the Bloodhound +Lambda scanner. 
+ +```text +Execution flow: + + GitHub Actions + ↓ + AWS Lambda (BloodhoundLambdaV2) + ↓ + Lambda event router + ↓ + Execution handler (scan / status / scheduled) + ↓ + AWS API scanning + ↓ + Slack reporting +``` + +--- + +## Workflow Location + +Bloodhound uses two GitHub Actions workflows. + +Scheduled automation workflow: + + .github/workflows/invoke_lambda.yml + +Manual operator workflow: + + .github/workflows/bloodhound_ops.yml + +The scheduled workflow runs automated infrastructure scans, +while the manual workflow allows engineers to invoke specific +Bloodhound operations directly from the GitHub Actions UI. + +## Workflow Triggers + +The workflow supports two trigger types. + +### Scheduled Runs + +Bloodhound automatically runs scheduled scans twice per day. + +Cron configuration: + + 0 16,4 * * * + +Schedule (UTC): + + 16:00 UTC — 11:00 AM EST + 04:00 UTC — 11:00 PM EST + +Scheduled runs allow Bloodhound to continuously monitor AWS +infrastructure for unused resources and cost anomalies. + +--- + +### Manual Runs + +Engineers can manually invoke the workflow from the GitHub Actions UI. + +Location: + + Repository → Actions → Bloodhound Operations → Run workflow + +Manual runs allow engineers to trigger specific execution modes. + +Available modes: + + scan + status + validate_scheduler + validation (currently disabled) + +Notes: + +- `validate_scheduler` simulates an EventBridge scheduled invocation using: +```json + { "source": "scheduled" } +``` + +- The validation workflow exists but is currently disabled in CI while + teardown validation infrastructure is being stabilized. + +--- + +## Execution Modes + +The selected mode is passed to Lambda as: + + { "source": "<mode>" } + +Each mode triggers a different execution path. + +### scan + +Runs a full infrastructure scan across supported AWS regions. + +Identifies unused resources and generates a teardown plan. 
+ +--- + +### validation + +Runs a safe validation workflow that verifies: + +- Lambda deployment +- scanner operation +- cost monitoring integration +- teardown safety logic + +Validation workflows use disposable test resources. + +--- + +### validate_scheduler + +Simulates the scheduled execution path. + +This mode sends the following payload to the Lambda: +```json + { "source": "scheduled" } +``` + +This allows engineers to validate the scheduled execution path +without waiting for the production scheduler or cron execution. + +--- + +### status + +Runs a system health check that verifies: + +- AWS API connectivity +- scanner configuration +- environment variables +- Slack integration status + +--- + +## Lambda Invocation + +The workflow invokes Lambda using the AWS CLI: + + aws lambda invoke + +Invocation artifacts are stored in two files: + +Response payload: + + output.json + +Invocation metadata: + + lambda_meta.json + +Metadata includes: + + StatusCode + ExecutedVersion + FunctionError + LogResult + +--- + +## CI Observability Improvements + +The workflow includes several improvements to make CI runs easier +to debug and monitor. + +--- + +### Structured CI Logs + +GitHub log groups are used to create collapsible sections in the +workflow logs. + +Example sections: + + Bloodhound Lambda Invocation + Lambda Response Payload + Lambda Invocation Metadata + Lambda Error Check + +This improves readability when analyzing CI runs. + +--- + +### Automatic Failure Detection + +The workflow performs explicit checks on the Lambda invocation metadata. + +CI will fail if either condition occurs: + +1. Lambda reports a runtime error + + FunctionError detected + +2. Lambda invocation returns a non-200 status code + +This prevents CI pipelines from silently succeeding when the Lambda +execution fails. 
+ +--- + +### Example Failure + +If Lambda throws an exception: + + FunctionError: Unhandled + +The GitHub workflow will terminate with: + + JOB FAILED + +--- + +## CloudWatch Log Streaming + +The workflow optionally retrieves recent Lambda logs directly from +CloudWatch and prints them into the GitHub workflow output. + +This allows engineers to debug Lambda behavior without leaving the +GitHub Actions interface. + +Log retrieval uses: + + aws logs describe-log-streams + aws logs get-log-events + +The workflow retrieves the most recent log stream from: + + /aws/lambda/BloodhoundLambdaV2 + +and prints recent log messages. + +--- + +## Operational Benefits + +The GitHub automation provides several operational advantages. + +### Auditable Operations + +All Lambda invocations are recorded in the repository's CI history. + +Engineers can see: + +- who triggered the workflow +- when it ran +- the resulting logs + +--- + +### Safe Operational Control + +Infrastructure scans and validation workflows can be executed +without requiring direct AWS CLI access. + +Engineers can safely trigger operations through GitHub. + +--- + +### Faster Debugging + +CI logs now include: + +- Lambda response payload +- invocation metadata +- runtime logs from CloudWatch + +This significantly reduces time required to diagnose failures. + +--- + + +## Potential Enhancements + +The current workflow already streams Lambda logs directly +into the GitHub Actions output for debugging. + +Future enhancements may include: + +- attaching Lambda logs as downloadable CI artifacts +- rendering scan summaries in GitHub workflow summaries +- automated cost anomaly alerts +- integration with security scanning pipelines + +These additions would further improve observability and +operational reporting. 
\ No newline at end of file diff --git a/docs/images/cloudwatch_logs.png b/docs/images/cloudwatch_logs.png new file mode 100644 index 0000000..2369fc8 Binary files /dev/null and b/docs/images/cloudwatch_logs.png differ diff --git a/docs/images/github_actions_invoke_lambda_scan.png b/docs/images/github_actions_invoke_lambda_scan.png new file mode 100644 index 0000000..a267bdd Binary files /dev/null and b/docs/images/github_actions_invoke_lambda_scan.png differ diff --git a/docs/images/github_actions_manual_ops.png b/docs/images/github_actions_manual_ops.png new file mode 100644 index 0000000..23ceaeb Binary files /dev/null and b/docs/images/github_actions_manual_ops.png differ diff --git a/docs/images/slack_command_preview.png b/docs/images/slack_command_preview.png new file mode 100644 index 0000000..86ed32a Binary files /dev/null and b/docs/images/slack_command_preview.png differ diff --git a/docs/lambda_packaging.md b/docs/lambda_packaging.md new file mode 100644 index 0000000..5411486 --- /dev/null +++ b/docs/lambda_packaging.md @@ -0,0 +1,638 @@ +# Lambda Dependency Management and Packaging Strategy + +## Table of Contents + +- [Why AWS Includes boto3 in Lambda](#why-aws-includes-boto3-in-lambda) +- [Why boto3 Should Not Be Bundled](#why-boto3-should-not-be-bundled) +- [Development Dependencies](#development-dependencies) +- [Build Environment vs Lambda Runtime](#build-environment-vs-lambda-runtime) +- [Common Packaging Failures](#common-packaging-failures) +- [Lambda Packaging Flow (Current Implementation)](#lambda-packaging-flow-current-implementation) +- [Future Packaging Flow (Docker-Based)](#future-packaging-flow-docker-based) +- [Recommended Build Best Practices](#recommended-build-best-practices) + +This document explains how Bloodhound packages dependencies for AWS Lambda +and why certain libraries should not be bundled with the deployment package. 
+ +It also describes the Docker-based build system used to ensure +deterministic Lambda packaging and runtime compatibility. + +Note: + +Some Lambda packaging failures may originate from dependency conflicts +during the pip installation step. Dependency management rules for the +Bloodhound Lambda are documented in this guide. + +Engineers encountering dependency resolution errors should review the +packaging guide before modifying `requirements.txt`. + +--- + +## Lambda Execution Routing (Important) + +Bloodhound uses explicit event routing in the Lambda entrypoint to ensure +safe execution across different invocation types. + +Execution model: + +`Event → Router → Handler → Pipeline` + +### Routing behavior + +- Slack HTTP events → handled immediately by Slack handler +- Scheduled events → routed to `scheduled_handler` +- Validation events → routed to the validation handler +- Default/manual events → routed to `run()` + +### Scheduled execution (critical behavior) + +Scheduled events do NOT pass through `run()`. + +Instead they follow: + +`scheduled_handler → run_scheduled_scan() → execute_pipeline()` + +This separation prevents: + +- recursive execution loops +- duplicate handler invocation +- unintended re-entry into routing logic + +Engineers modifying Lambda execution must ensure scheduled events +remain isolated from the generic run() path. + +--- +## Bloodhound Application Architecture + +The Bloodhound Lambda is organized using a layered architecture +to separate AWS event handling, orchestration logic, and AWS +resource scanning functionality. 
+ +Directory layout inside the Lambda package: + +bloodhound/ + config/ runtime configuration and environment handling + handlers/ internal request handlers + scanner/ AWS resource discovery logic + services/ orchestration and service layer logic + teardown/ infrastructure cleanup planning and execution + +Responsibilities: + +handlers + interpret events and route them into the service layer + +services + coordinate higher-level operations such as scanning, + status checks, and teardown workflows + +scanner + interact with AWS APIs to discover infrastructure resources + +teardown + plan and execute resource cleanup actions + +--- + +## Lambda Entry Point + +AWS Lambda invokes the function defined in: + +handlers/lambda_function.py + +This file acts as the Lambda entrypoint and is responsible for receiving +AWS events and routing them into the Bloodhound execution pipeline. + +Execution flow: + +AWS Lambda + │ + ▼ +handlers/lambda_function.py + │ + ▼ +bloodhound.handlers.* + │ + ▼ +service layer + +Separating the Lambda entrypoint from the application modules ensures +that AWS-specific logic remains isolated from the core Bloodhound +application code. + +--- + +## Lambda Logging and Traceability + +Bloodhound Lambda executions emit structured CloudWatch log markers: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Examples: + +[BLOODHOUND][SCHEDULED][request_id=...] +[BLOODHOUND][SCAN][request_id=...] +[BLOODHOUND][STATUS][request_id=...] + +The request_id corresponds to the AWS Lambda invocation ID +(context.aws_request_id). + +Including the request_id allows engineers to trace individual +executions across CloudWatch logs and GitHub Actions validation runs. + +--- + +# Why AWS Includes boto3 in Lambda + +AWS Lambda Python runtimes already include the AWS SDK libraries: + +- boto3 +- botocore +- s3transfer +- jmespath + +Because these are included in the runtime environment, Lambda functions can +import them directly without bundling them in the deployment package. 
+ +Example: + +```python +import boto3 +ec2 = boto3.client("ec2") +``` + +This works even if boto3 is not included in requirements.txt. + +# Why boto3 Should NOT Be Bundled + +AWS recommends not packaging boto3 unless you require a specific version. + +There are several reasons for this. + +Version Conflicts + +The Lambda runtime includes a specific version of boto3 and botocore. + +Example runtime versions: + +```text +boto3 1.34.x +botocore 1.34.x +``` + +If a deployment package includes different versions, Python may load +conflicting dependencies. + +This can cause subtle runtime failures. + +Larger Deployment Packages + +Lambda deployment limits: + +50 MB zipped +250 MB unzipped + +The boto3 dependency chain is large. + +Typical uncompressed size: + +boto3 + botocore ≈ 70MB + +Including it unnecessarily increases package size. + +Dependency Resolution Problems + +Packaging boto3 introduces additional dependencies. + +Example chain: + +```text +boto3 +└── botocore + └── urllib3 (< 1.27) +``` + +If a project forces a newer urllib3 version (for example urllib3==2.x) +pip will fail to resolve dependencies during packaging. + +This can break Terraform deployments. + +Recommended requirements.txt + +Bloodhound should only package dependencies not included in Lambda. + +Example: + +slack-sdk==3.26.1 +python-dotenv==1.0.0 + +Do NOT include: + +boto3 +botocore +urllib3 +s3transfer + +Lambda provides these automatically. + +## Development Dependencies + +Local development may require additional dependencies such as `boto3` +for testing AWS interactions. + +These dependencies are defined in `requirements-dev.txt`. + +Terraform packaging only uses `requirements.txt` to ensure that the +Lambda deployment package contains only the dependencies required +for runtime execution. + +## Python Packaging Metadata + +During dependency installation, pip may create additional metadata +directories inside the Lambda package. 
+ +Examples include: + +*.dist-info +bin/ + +These directories are normal artifacts created by Python packaging +and contain metadata such as version information and package records. + +They do not affect Lambda execution and are safe to include in the +deployment package. + +## Build Environment vs Lambda Runtime + +Lambda packaging happens in two separate environments. + +### Build Environment (Local Machine) + +Terraform builds the Lambda package locally using a `local-exec` provisioner. + +Example command executed during packaging: + +```bash +python3 -m pip install -r requirements.txt -t .build/lambda_pkg +``` + +The Python version used here is the Python version installed on the engineer's machine. + +Example: + +Local Python: 3.13 + +### Runtime Environment (AWS Lambda) + +The Lambda function itself runs inside the runtime defined in Terraform: + +`runtime = "python3.10"` + +This means AWS executes the code in its own environment: + +`AWS Lambda Runtime: Python 3.10` + +These environments are independent. + +Most of Bloodhound's dependencies are pure Python libraries, so builds created with +Python 3.12 or 3.13 usually run correctly in the Python 3.10 Lambda runtime. + +However, compiled libraries may fail if built using a different Python version. +This is one of the main reasons Docker-based packaging is recommended. + +## Common Packaging Failures + +Lambda packaging may fail during `terraform apply` if pip cannot resolve dependency conflicts. + +Example error: +`ResolutionImpossible` + +A common cause is manually pinning a dependency that is managed by another library. + +Example conflict: + +`botocore requires urllib3 < 1.27` + +If a project pins: + +`urllib3==2.x` + +pip cannot resolve the dependency tree and the packaging step fails. + +Best practice: + +Only specify top-level dependencies required by the Lambda function. 
+ +Example: + +```text +slack-sdk +python-dotenv +``` + +Avoid pinning dependencies managed by other libraries such as: + +```text +botocore +urllib3 +s3transfer +``` + +Allow pip to resolve those automatically. + +--- + +## Lambda Packaging Flow (Current Implementation) + +Terraform only rebuilds the Lambda package when runtime code changes. + +Source hashing is limited to: + +bloodhound/ +handlers/ + +Changes in other directories (docs, scripts, tests) will not trigger a +Lambda rebuild. This keeps Terraform deployments fast and avoids +unnecessary Lambda updates. + +```text +terraform apply + │ + ▼ +terraform_data.build_lambda_pkg + │ + ▼ +scripts/build_lambda.sh + │ + ▼ +Select build mode + ├─ Docker build (default) + │ ↓ + │ Docker container + │ public.ecr.aws/sam/build-python3.10 + │ ↓ + │ pip install dependencies + │ + └─ Local build (--local) + ↓ + python3 + pip + ↓ + pip install dependencies + │ + ▼ +Copy application source +(bloodhound/, handlers/) + │ + ▼ +Construct Lambda package +.build/lambda_pkg + │ + ▼ +archive_file provider + │ + ▼ +.build/bloodhound_lambda_v2.zip + │ + ▼ +Lambda deployment +``` + +## Lambda Deployment Artifact + +Terraform creates the Lambda deployment package by archiving: + +.build/lambda_pkg + +The resulting deployment artifact is: + +.build/bloodhound_lambda_v2.zip + +This ZIP file is the artifact uploaded to AWS Lambda. + +Terraform's archive_file provider constructs this archive automatically +during `terraform apply`. + +If this file is missing or empty, the Lambda build step likely failed +before the archive stage. + + +## Docker-Based Packaging (Optional) + +Bloodhound supports building Lambda dependencies inside a Docker container +that matches the Lambda runtime build environment. + +This mode is optional and can be enabled when deterministic builds are required +or when dependencies include compiled libraries. 
+ +Example: + +`terraform apply -var="use_docker_build=true"` + +Docker builds use the AWS SAM build container: + +`public.ecr.aws/sam/build-python3.10` + +This container includes the correct Python runtime, pip, and build tools +required to install dependencies compatible with the AWS Lambda Python 3.10 +runtime. + +When Docker mode is enabled, dependency installation runs inside the +container instead of the engineer's local Python environment. + +This prevents dependency inconsistencies caused by engineers using +different local Python versions. + +The AWS Lambda runtime itself remains: + +`public.ecr.aws/lambda/python:3.10` + +However, Bloodhound packages dependencies using the SAM build container +so that dependency installation occurs in a build environment aligned +with the Lambda Python 3.10 runtime. + + +Example: + +```text +terraform apply + │ + ▼ +Docker build container +(public.ecr.aws/sam/build-python3.10) + │ + ▼ +pip install runtime dependencies + │ + ▼ +.build/lambda_pkg + │ + ▼ +archive_file provider + │ + ▼ +bloodhound_lambda_v2.zip + │ + ▼ +Lambda deployment +``` + +## Recommended Build Best Practices + +Bloodhound supports two Lambda packaging methods: + +1) Docker-based build (recommended) +2) Local Python build (fallback) + +Docker builds are the preferred method because they ensure the +build environment matches the AWS Lambda runtime. + +Using Docker provides several advantages: + +- deterministic builds +- consistent dependency resolution +- matching Lambda runtime environment +- reduced risk of packaging failures +- consistent builds across different developer machines + +Docker builds use the AWS SAM build container: + +public.ecr.aws/sam/build-python3.10 + +This container provides the same runtime environment used by AWS +Lambda for dependency compilation. 
+ +Example Docker-enabled build: + +terraform apply -var="use_docker_build=true" + +When Docker mode is disabled, Bloodhound falls back to using the +developer's local Python environment to install dependencies. + +## Lambda Build Directory Structure + +The Bloodhound Lambda packaging process uses a structured build directory +to support dependency caching, deterministic builds, and reliable Terraform execution. + +Directory layout: + +```text +.build/ + lambda_pkg/ prepared Lambda deployment package + bloodhound_lambda_v2.zip final Lambda deployment artifact +``` + +Runtime dependencies are installed directly into: + +.build/lambda_pkg + +The build script installs dependencies from `requirements.txt` +before copying the Bloodhound application source. + +Lambda requires all modules to exist directly on the Python import path. + +For this reason, dependencies are installed directly into the root +of the deployment package rather than inside a nested site-packages +directory. + +Example structure: + +.build/lambda_pkg/ + bloodhound/ + handlers/ + slack_sdk/ + slack/ + dotenv/ + +This flattened layout ensures Python can resolve imports correctly +during Lambda execution. + + +lambda_pkg/ + +This directory contains the final Lambda deployment package. + +The build script installs runtime dependencies and then copies +the Bloodhound application source into this directory. + +Terraform's `archive_file` provider creates the Lambda deployment +archive from this directory. + +Why this structure exists + +This layered build design prevents several common Lambda packaging failures. + +Ensures deterministic Lambda packages + +Each Terraform run rebuilds the deployment package from scratch, +ensuring no stale dependencies or files remain from previous builds. + +This guarantees the Lambda deployment artifact always reflects +the current project state. 
+ +Prevents Terraform archive failures + +Terraform's archive_file provider cannot create an archive from an empty +directory. + +If the build process deletes the entire .build directory, Terraform may +attempt to archive a directory that does not yet exist. + +By maintaining a stable directory structure and only refreshing specific +layers, Terraform can reliably evaluate the archive step. + +Prevents inconsistent build environments + +Separating dependency installation from source copying ensures the final +deployment package is constructed in a predictable order. + +This improves build determinism and makes the packaging process easier +to debug. + +Improves CI reliability + +CI pipelines and concurrent Terraform runs are less likely to fail when +the build directory structure remains stable. + +Deleting the entire .build directory can cause race conditions or +incomplete builds. + +By refreshing only the necessary layers, the packaging process becomes +more robust and reproducible. + +This layered build structure provides: + +- faster rebuilds +- deterministic packaging +- safer Terraform execution +- reduced dependency installation time + +## Quick Troubleshooting + +If Terraform fails during Lambda packaging or deployment, consult the +Terraform troubleshooting guide: + +docs/troubleshooting_terraform_lambda.md + +That document covers common deployment failures including: + +- Lambda runtime import errors +- Terraform skipping the build step +- archive_file creation failures +- empty Lambda deployment packages +- `.build` directory inconsistencies + +Most packaging issues can be diagnosed quickly by inspecting: + +`.build/lambda_pkg` + +If this directory is missing files or empty, Terraform likely skipped +the build step or the packaging script failed to run. 
+ +The troubleshooting guide provides recovery procedures such as forcing +Terraform to rebuild the package: + +`terraform apply -replace=terraform_data.build_lambda_pkg` + diff --git a/docs/quick_demo.md b/docs/quick_demo.md new file mode 100644 index 0000000..063eb42 --- /dev/null +++ b/docs/quick_demo.md @@ -0,0 +1,1175 @@ +# Bloodhound Quick Demo Guide + +This guide demonstrates the primary capabilities of Bloodhound. + +## Operational Scenarios + +This guide walks through the following operational scenarios: + +1. [Run Bloodhound Locally](#scenario-1--run-bloodhound-locally) +2. [Slack Scan Command](#scenario-2--slack-scan-command) +3. [Generate a Teardown Plan](#scenario-3--generate-a-teardown-plan) +4. [Controlled Teardown Execution](#scenario-4--controlled-teardown-execution) +5. [Automated Scheduled Scans (GitHub Actions)](#scenario-5--automated-scheduled-scans-github-actions) +6. [Manual Operations (GitHub Actions)](#scenario-6--manual-operations-github-actions) +7. [CloudWatch Log Inspection](#scenario-7--cloudwatch-log-inspection) +8. [Automated Teardown Validation Workflow](#scenario-8--automated-teardown-validation-workflow) + +--- + +### Bloodhound can be operated in four ways: + +- Local CLI execution +- Slack commands +- Automated GitHub Actions scans +- Manual GitHub Actions operations + +# Local Configuration + +Before running Bloodhound locally, you must configure the environment variables used by the system. + +Bloodhound loads configuration from a `.env` file during local execution. + +## Step 1 — Create the `.env` File + +.env is only used for local development. + +In production, these values are configured in: +AWS Lambda → Environment Variables. + +Copy the example configuration: + +```bash +cp env.example .env +``` + +## Step 2 — Update Required Values + +Open .env and set the required values for your environment. 
+ +At minimum you must configure: + +EXPECTED_AWS_ACCOUNT_ID= + +SLACK_BOT_TOKEN= +SLACK_SIGNING_SECRET= + +SLACK_SCAN_CHANNEL_ID= +SLACK_ALERT_CHANNEL_ID= + +## Step 3 — Confirm AWS Profile + +Verify which AWS profiles are available: + +```bash +aws configure list-profiles +``` + +Example output: + +```bash +default +``` + +## Step 4 — Verify AWS Identity + +Confirm your credentials point to the correct AWS account: +```bash +aws sts get-caller-identity +``` +Example output: + +```json +{ + "Account": "312345678432" +} +``` + +This account ID must match the value set in: +`EXPECTED_AWS_ACCOUNT_ID` + +Safety Defaults + +The default .env configuration runs Bloodhound in safe mode: + +APPLY_CHANGES=false +TEARDOWN_SIMULATE=true + +This means: + +infrastructure will not be deleted +Bloodhound will only scan and generate a teardown plan + +--- + +# Scenario 1 — Run Bloodhound Locally + +Bloodhound can be executed locally using the built-in runner. + +### Verify AWS Profile + +From the repository root, confirm which AWS profiles are available: + +```bash +aws configure list-profiles +``` + +Example output: +`default` + +Command + +Run Bloodhound locally using a valid AWS profile: + +Usage: +```bash +AWS_PROFILE=<profile> python -m tools.run_local +``` + +This command is useful for: + +debugging scans locally +validating AWS credentials +testing pipeline behavior before deploying changes +What Happens + +The runner: + +loads .env +authenticates with AWS using the specified profile +invokes the Bloodhound pipeline + +Pipeline stages: + +``` +scan_resources() # scans AWS resources (EC2, ELBv2, RDS) +compute_budget() # calculates potential cost savings +plan_teardown() # builds a teardown plan (no deletion) +``` + +### Important + +Local execution runs in safe mode by default. 
+ +This command will: + +scan AWS resources +identify unused infrastructure +estimate potential cost savings +generate a teardown plan + +It will NOT delete resources unless destructive mode is explicitly enabled. + +Equivalent Slack Command + +Running this locally performs the same scan as the Slack command: + +/v2_seek + +### Example Output + +Example terminal output: + +```bash +aws configure list-profiles +default + +AWS_PROFILE=default python -m tools.run_local +``` +```json +Bloodhound pipeline started +Event: {} +Bloodhound pipeline completed +Scan summary: +{ + "candidates_total": 61, + "kept_total": 1 +} +Budget summary: +{ + "projected_month_end_spend_usd": 3645.4766142570115, + "dynamic_monthly_allowance_usd": 0.0, + "over_budget_threshold_met": true +} +Teardown summary: +{ + "apply_changes": false, + "simulate": true, + "planned_actions": 61 +} +{ + "budget": { + "dynamic_monthly_allowance_usd": 0.0, + "over_budget_threshold_met": true, + "projected_month_end_spend_usd": 3645.4766142570115 + }, + "ok": true, + "regions": [ + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2" + ], + "scan": { + "candidates_total": 61, + "kept_total": 1 + }, + "teardown": { + "apply_changes": false, + "execution": null, + "planned_actions": 61, + "simulate": true, + "targets_filter": null + } +} +``` + +--- + +# Scenario 2 — Slack Scan Command + +Bloodhound can be triggered interactively using Slack slash commands. + +### Slack Command + +``` +/v2_seek +``` + +### Command Preview in Slack + + + +### What Happens + +1. Slack sends an HTTPS request to the Lambda Function URL +2. Lambda routes the request to the Slack handler +3. 
Bloodhound executes the scan pipeline + +Execution path: + +``` +Slack + ↓ +Lambda Function URL + ↓ +slack_handler + ↓ +execute_pipeline() +``` + +Slack returns multiple messages summarizing: + +- scan results +- whitelisted resources +- budget analysis +- teardown plan (dry run) + +--- + +## Scan Summary + +```md +Bloodhound v2 — Scan Summary +time_et: 8:23 PM ET + +Region: us-east-1 +Counts: candidates 32 | kept 0 +- Candidates: ec2.instance=15, rds.db_instance=1, ec2.elastic_ip=4, ec2.nat_gateway=0, ec2.ebs_volume=0, elbv2.load_balancer=12 +- Kept: ec2.instance=0, rds.db_instance=0, ec2.elastic_ip=0, ec2.nat_gateway=0, ec2.ebs_volume=0, elbv2.load_balancer=0 + +Region: us-east-2 +Counts: candidates 7 | kept 1 +- Candidates: ec2.instance=1, rds.db_instance=0, ec2.elastic_ip=3, ec2.nat_gateway=1, ec2.ebs_volume=0, elbv2.load_balancer=2 +- Kept: ec2.instance=1, rds.db_instance=0, ec2.elastic_ip=0, ec2.nat_gateway=0, ec2.ebs_volume=0, elbv2.load_balancer=0 + +Region: us-west-1 +Counts: candidates 0 | kept 0 + +Region: us-west-2 +Counts: candidates 22 | kept 0 + +Totals +Counts: candidates 61 | kept 1 +``` + +--- + +## Whitelisted Resources + +```md +Bloodhound v2 — Whitelisted Resources (Kept) + +kept_total: 1 + +Region: us-east-2 +- us-east-2 ec2.instance i-00339b9b5023ec69e (Name=cp-fs-curriculum) +``` + +--- + +## Budget Summary + +```md +Bloodhound v2 — Budget Summary + +Cohort +- start: 2025-12 +- total_budget: $3,000.00 over 7 months +- to_date_spend: $6,261.11 +- remaining_budget: $0.00 +- remaining_months: 4 +- monthly_allowance: $0.00 + +This month +- month_to_date: $3,057.50 +- projected_month_end: $3,510.46 +- over_budget_days_required: 2 +- over_budget_threshold_met: true +``` + +--- + +## Teardown Plan (Dry Run) + +```md +Bloodhound v2 — Teardown Plan (DRY RUN) + +mode: DRY RUN — No AWS resources will be deleted + +planned_actions: 61 + +Top Regions Affected +- us-east-1: 32 +- us-west-2: 22 +- us-east-2: 7 + +Services affected +- ec2: 46 +- elbv2: 14 
+- rds: 1 +``` + +Bloodhound operates in **dry-run mode by default**, ensuring no AWS resources are deleted unless destructive mode is explicitly enabled. + +--- + +# Scenario 3 — Generate a Teardown Plan + +This command previews which AWS resources **could be removed** without deleting anything. + +### Slack Command + +`/v2_seek_destroy_plan` + +### What Happens + +Bloodhound executes the full analysis pipeline: + +```md +scan_resources() +compute_budget() +plan_teardown() +``` + +The teardown stage generates a **deletion plan**, but no resources are deleted. + +Bloodhound always runs in **dry-run mode** unless destructive mode is explicitly enabled. + +--- + +### Example Slack Output + +When `/v2_seek_destroy_plan` is executed, Bloodhound posts multiple analysis messages. + +--- + +## Scan Summary + +```md +Bloodhound v2 — Scan Summary +time_et: 8:36 PM ET + +Region: us-east-1 +Counts: candidates 32 | kept 0 +- Candidates: ec2.instance=15, rds.db_instance=1, ec2.elastic_ip=4, ec2.nat_gateway=0, ec2.ebs_volume=0, elbv2.load_balancer=12 +- Kept: ec2.instance=0, rds.db_instance=0, ec2.elastic_ip=0, ec2.nat_gateway=0, ec2.ebs_volume=0, elbv2.load_balancer=0 + +Region: us-east-2 +Counts: candidates 7 | kept 1 +- Candidates: ec2.instance=1, rds.db_instance=0, ec2.elastic_ip=3, ec2.nat_gateway=1, ec2.ebs_volume=0, elbv2.load_balancer=2 +- Kept: ec2.instance=1, rds.db_instance=0, ec2.elastic_ip=0, ec2.nat_gateway=0, ec2.ebs_volume=0, elbv2.load_balancer=0 + +Totals +Counts: candidates 61 | kept 1 +``` + +--- + +## Budget Summary + +```md +Bloodhound v2 — Budget Summary + +Cohort +- start: 2025-12 +- total_budget: $3,000.00 over 7 months +- to_date_spend: $6,261.11 +- remaining_budget: $0.00 +- remaining_months: 4 +- monthly_allowance: $0.00 + +This month +- month_to_date: $3,057.50 +- projected_month_end: $3,510.46 +- over_budget_threshold_met: true +``` + +--- + +## Teardown Plan (Dry Run) + +```md +Bloodhound v2 — Teardown Plan (DRY RUN) + +mode: DRY RUN — No AWS 
resources will be deleted + +planned_actions: 61 + +Top Regions Affected +- us-east-1: 32 +- us-west-2: 22 +- us-east-2: 7 + +Services affected +- ec2: 46 +- elbv2: 14 +- rds: 1 +``` + +Sample actions generated by the teardown planner: + +```md +delete_db_instance +delete_load_balancer +delete_nat_gateway +delete_volume +release_address +terminate_instances +``` + +This preview allows engineers to safely review what infrastructure **would be removed** before enabling destructive mode. + +--- + +# Scenario 4 — Controlled Teardown Execution + +Bloodhound can perform automated infrastructure cleanup, but destructive actions +require multiple explicit confirmations. + +### Slack Command + +`/v2_seek_destroy CONFIRM` + +This command executes the teardown plan generated by Bloodhound. + +However, deletion only occurs if **all safety controls allow it**. + +--- + +### Safety Controls + +Infrastructure deletion only occurs when the following environment configuration is enabled: + +```md +APPLY_CHANGES=true +TEARDOWN_SIMULATE=false +``` + +Additional safeguards are built into the system: + +```md +TEARDOWN_MAX_DELETE_COUNT +whitelisting tags +Terraform deployment guard +Slack confirmation token +``` + +These protections ensure that accidental deletions cannot occur. + +--- + +### Why This Demo Does Not Execute Destructive Mode + +This documentation intentionally **does not run the destructive command**. + +In normal operation engineers should: + +1. run `/v2_seek` to scan resources +2. run `/v2_seek_destroy_plan` to preview cleanup actions +3. review the teardown plan +4. enable destructive mode only after validation + +This ensures infrastructure cleanup remains **safe, auditable, and intentional**. 
+ +--- + +### Safe Example Output + +When destructive mode is **not enabled**, Bloodhound reports that the system +remains in dry-run mode: + + +Bloodhound v2 — Teardown Plan (DRY RUN) + +mode: DRY RUN — No AWS resources will be deleted +simulate: true + +planned_actions: 61 + + +To execute the plan, engineers must explicitly enable destructive mode +through configuration and deployment controls. + +--- + +# Scenario 5 — Automated Scheduled Scans (GitHub Actions) + +Bloodhound can run automatically using GitHub Actions. + +![GitHub Actions Scheduled Scan](images/github_actions_invoke_lambda_scan.png) + +This repository includes a scheduled workflow that invokes the Bloodhound +Lambda function on a fixed schedule. + +Workflow file: + +`.github/workflows/invoke_lambda.yml` + +### Scheduled Execution + +The workflow runs twice per day: + +```md +16:00 UTC → 11:00 AM EST +04:00 UTC → 11:00 PM EST +``` + +GitHub cron jobs only run from the repository **default branch**. + +--- + +### Example Workflow Invocation + +The workflow builds a deterministic Lambda event payload: + +```json +{ + "source": "scheduled" +} +``` + +Lambda is then invoked using the AWS CLI: + +```bash +aws lambda invoke \ + --function-name BloodhoundLambdaV2 \ + --payload file://event.json \ + output.json +``` + +--- + +### What Happens + +```md +GitHub Actions (scheduler) + ↓ +OIDC authentication + ↓ +Assume BloodhoundGitHubInvokeRole + ↓ +Invoke Bloodhound Lambda + ↓ +Execute scheduled scan pipeline + ↓ +Log results to GitHub Actions + CloudWatch +``` + +The Lambda pipeline performs: + +scan_resources() +compute_budget() +plan_teardown() + +--- + +### Example GitHub Actions Output + +Lambda invocation completed successfully +```json +Scan summary: + +{ + "candidates_total": 61, + "kept_total": 1 +} + + +Budget summary: +{ + "projected_month_end_spend_usd": 3645.47, + "dynamic_monthly_allowance_usd": 0.0, + "over_budget_threshold_met": true +} + +Teardown summary: +{ + "apply_changes": false, + 
"simulate": true, + "planned_actions": 61 +} +``` + +The workflow also retrieves recent Lambda logs from CloudWatch to improve +observability during CI runs. + +--- + +# Scenario 6 — Manual Operations (GitHub Actions) + +Bloodhound operators can manually trigger infrastructure operations +directly from the GitHub Actions interface. + +![GitHub Actions Manual Operations](images/github_actions_manual_ops.png) + +Workflow file: + +`.github/workflows/bloodhound_ops.yml` + +This workflow allows engineers to run operational commands +without waiting for the scheduled automation. + +--- + +### Available Operations + +When launching the workflow, the operator can choose a mode: + +```md +scan → run infrastructure scan immediately +status → display Bloodhound system status +validate_scheduler → simulate scheduled EventBridge execution +``` + +These options are selected from the **GitHub Actions UI** when clicking +"Run workflow". + +--- + +### Execution Path + +```md +GitHub Actions UI + ↓ +workflow_dispatch + ↓ +OIDC authentication + ↓ +Assume BloodhoundGitHubInvokeRole + ↓ +Invoke Bloodhound Lambda + ↓ +Execute selected operation +``` + +--- + +### Example Lambda Payload + +Example payload generated for a scan operation: + +```json +{ + "source": "scan" +} +``` + +Example payload for a status check: + +```json +{ + "source": "status" +} +``` + +--- + +### Example GitHub Actions Output + +Bloodhound Operation: scan + +Invoking Bloodhound Lambda + +```json +Scan summary: +{ + "candidates_total": 61, + "kept_total": 1 +} + +Budget summary: +{ + "projected_month_end_spend_usd": 3645.47, + "dynamic_monthly_allowance_usd": 0.0, + "over_budget_threshold_met": true +} + +Teardown summary: +{ + "apply_changes": false, + "simulate": true, + "planned_actions": 61 +} +``` +--- + +### Example Execution — Scheduler Validation + +The `validate_scheduler` mode simulates the automated EventBridge +trigger used by scheduled infrastructure scans. 
+ +Example payload generated by the workflow: + +```json +{"source":"scheduled"} +``` + +Example GitHub Actions output: + +```md +======================================== +Bloodhound Operation: validate_scheduler +======================================== + +===== VALIDATING SCHEDULER PATH ===== +Simulating EventBridge scheduled event +{"source":"scheduled"} + +Invoking Bloodhound Lambda +``` + +Example Lambda response: + +```json +{ + "status": "scheduled_scan_executed", + "result": { + "ok": true, + "regions": [ + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2" + ], + "scan": { + "candidates_total": 58, + "kept_total": 1 + }, + "budget": { + "projected_month_end_spend_usd": 3138.03, + "dynamic_monthly_allowance_usd": 0.0, + "over_budget_threshold_met": true + }, + "teardown": { + "apply_changes": false, + "simulate": true, + "planned_actions": 58 + } + } +} +``` + +### Example Execution — Status Operation + +The `status` mode checks the operational health of the Bloodhound +Lambda service without running a full infrastructure scan. 
+ +Example payload generated by the workflow: + +```json +{ + "source": "status" +} +``` + +Example GitHub Actions output: + +```md +======================================== +Bloodhound Operation: status +======================================== + +Invoking Bloodhound Lambda + +Bloodhound status check completed +Lambda invocation completed successfully +``` + +The workflow confirms: + +- Lambda is reachable +- IAM role assumption is functioning +- invocation permissions are correct +- the Bloodhound runtime is operational + +Example Lambda logs (excerpt): + +```md +[BLOODHOUND][SCHEDULED] + +Event received: {'source': 'scheduled'} + +Scheduled Bloodhound scan triggered +Bloodhound pipeline started +Bloodhound pipeline completed +``` + +### CloudWatch Logs + +During execution, the GitHub Actions workflow also retrieves recent +logs from the Lambda CloudWatch log group: + +`/aws/lambda/BloodhoundLambdaV2` + +This provides deeper visibility into: + +- Lambda execution traces +- pipeline stage logging +- AWS API activity +- runtime performance metrics + +Operators can view the full logs directly in the AWS Console: + +`CloudWatch → Log Groups → /aws/lambda/BloodhoundLambdaV2` + +--- + +# Scenario 7 — CloudWatch Log Inspection + +All Bloodhound executions emit structured logs to CloudWatch. + +Log format: + +`[BLOODHOUND][EVENT_TYPE][request_id=...]` + + +These logs provide visibility into pipeline execution, +including scans, budget analysis, and teardown planning. 
+ +--- + +### Example CloudWatch Logs + +```text +[BLOODHOUND][SCHEDULED][request_id=17774179-8cfc-49ee-a503-c2e9d091a1ee] + +Event received: {'source': 'scheduled'} + +Scheduled Bloodhound scan triggered +Bloodhound pipeline started +Event: {"source": "scheduled"} +Bloodhound pipeline completed +``` + +```json +Scan summary: +{ + "candidates_total": 58, + "kept_total": 1 +} + +Budget summary: +{ + "projected_month_end_spend_usd": 3138.03, + "dynamic_monthly_allowance_usd": 0.0, + "over_budget_threshold_met": true +} + +Teardown summary: +{ + "apply_changes": false, + "simulate": true, + "planned_actions": 58 +} +``` +--- + +### Viewing Logs (AWS CLI) + +Operators can stream logs directly from CloudWatch: + +```bash +aws logs tail /aws/lambda/BloodhoundLambdaV2 \ + --region us-west-2 \ + --follow +``` + +This command continuously streams new Bloodhound +Lambda logs as they are generated. + +--- + +### Viewing Logs (AWS Console) + +Bloodhound Lambda logs can also be inspected directly +from the AWS Console. + +Steps: + +1. Open the AWS Console +2. Select the correct region +`United States (Oregon) — us-west-2` + +3. Navigate to CloudWatch +CloudWatch → Logs → Log Management + +4. Locate the Bloodhound Lambda log group + +Search for: +`BloodhoundLambdaV2` + +5. Select a log stream + +Each Lambda invocation creates a log stream containing: + +- request IDs +- pipeline execution stages +- scan summaries +- budget calculations +- teardown planning results + +### Console Walkthrough + +A visual walkthrough of the CloudWatch navigation process +is provided below. 
+ +📄 View guide: + +[CloudWatch Log Navigation](demo/cloudwatch_log_navigation.pdf) + +The guide includes screenshots showing: + +- selecting the correct AWS region +- opening CloudWatch +- locating the Bloodhound log group +- selecting a log stream +- inspecting Lambda execution logs +--- + +![CloudWatch Logs](images/cloudwatch_logs.png) + +--- + +# Scenario 8 — Automated Teardown Validation Workflow + +Bloodhound includes an automated validation workflow that verifies the entire destructive pipeline in a controlled environment. + +This workflow confirms that Bloodhound can: + +detect infrastructure resources +generate a teardown plan +execute deletion +verify that the resource was removed + +The workflow creates a temporary disposable EC2 instance and verifies that Bloodhound successfully deletes it. + +Validation Script + +The validation workflow is executed using the orchestration script: +`tools/run_validation_workflow.sh` + +This script coordinates the full validation process and logs results. + +Run the Validation Workflow + +From the repository root: +```bash +./tools/run_validation_workflow.sh +``` + +What Happens + +The validation workflow executes the following sequence: + +```text +Lambda smoke test + ↓ +Terraform creates disposable EC2 instance + ↓ +Validation event sent to Bloodhound Lambda + ↓ +Bloodhound teardown pipeline executes + ↓ +Instance deletion verified + ↓ +Validation workflow completes +``` + +This confirms that the actual destructive execution path works safely. + +Example Validation Output + +Example terminal output from the validation workflow: + +`./tools/run_validation_workflow.sh` + +```md +================================================ +Bloodhound Validation Workflow +================================================ +``` + +Step 1: Running Lambda smoke test... + +✓ Lambda function exists +✓ Environment variables validated +✓ CloudWatch log group exists +✓ Lambda Function URL configured + +Smoke test passed. 
+ +Step 2: Starting controlled teardown validation... + +Validation Run ID: 20260326_205446 + +Creating disposable EC2 instance... + +Instance created: +i-01fee506f85a03005 + +Triggering Bloodhound validation teardown... + +Lambda response: +```json +{ + "ok": true, + "scan": { + "candidates_total": 62, + "kept_total": 1 + }, + "teardown": { + "apply_changes": true, + "simulate": false, + "execution": { + "attempted": 1, + "succeeded": 1, + "failed": 0 + } + } +} +``` + +Verifying instance deletion... + +SUCCESS: Instance terminated confirmed. +RESULT: PASS +Bloodhound successfully deleted the resource. + +```md +================================================ +Validation workflow completed successfully. +================================================ +``` + +### Full Validation Log + +The full terminal output from the teardown validation workflow +is available below. + +This log shows the complete lifecycle: + +- Lambda smoke test +- Terraform validation resource creation +- Bloodhound teardown execution +- Instance deletion verification +- Terraform state reconciliation + +📄 View full validation log: + +[Teardown Validation Workflow Log](demo/teardown_validation_workflow.pdf) + +The validation workflow ensures that Bloodhound’s destructive pipeline works correctly before enabling deletion in production environments. + +This test should be executed: + +- after teardown logic changes +- after adding new AWS resource types +- after modifying IAM permissions +- before enabling apply-mode in production +--- + +# Typical Operational Workflow + +Operators typically use Bloodhound in the following order: + +``` +1️⃣ Run scan + /v2_seek + +2️⃣ Review teardown plan + /v2_seek_destroy_plan + +3️⃣ Validate teardown workflow + ./tools/run_validation_workflow.sh + +4️⃣ Execute teardown (if approved) + /v2_seek_destroy CONFIRM +``` + +This ensures infrastructure cleanup remains **safe, auditable, and controlled**.
+ +--- + +# Capability Coverage + +This demo guide covers: + +| Capability | Covered | +|---|---| +| Local execution | ✔ | +| Slack commands | ✔ | +| Teardown planning | ✔ | +| Controlled deletion | ✔ | +| GitHub Actions | ✔ | +| Scheduled runs | ✔ | +| CloudWatch debugging | ✔ | +| Validation workflow | ✔ | + +It demonstrates **every major capability of Bloodhound**. + +--- + +# Reproducing This Guide + +To reproduce this guide: + +1. Run the commands +2. Capture screenshots/output + +### Supporting artifacts are stored in: + +```md + +docs/images/ → screenshots used in the guide +docs/demo/ → full walkthroughs and validation logs (PDF) + +Examples: + +docs/images/github_actions_invoke_lambda_scan.png +docs/images/github_actions_manual_ops.png +docs/images/slack_command_preview.png + +docs/demo/cloudwatch_log_navigation.pdf +docs/demo/teardown_validation_workflow.pdf +``` diff --git a/docs/run_validation.md b/docs/run_validation.md new file mode 100644 index 0000000..6e7cba5 --- /dev/null +++ b/docs/run_validation.md @@ -0,0 +1,359 @@ +# Bloodhound Validation — User Guide + +## Table of Contents + +- [When to Run Validation](#when-to-run-validation) +- [Quick Validation Workflow](#quick-validation-workflow) +- [Step 1 — Run the Validation Workflow](#step-1--run-the-validation-workflow) +- [Step 2 — Automatic Validation Invocation](#step-2--automatic-validation-invocation) +- [Validate Lambda Health Endpoint](#validate-lambda-health-endpoint) +- [Step 3 — Automatic Deletion Verification](#step-3--automatic-deletion-verification) +- [Step 4 — Validation Logs](#step-4--validation-logs) +- [Bloodhound Execution Modes](#bloodhound-execution-modes) +- [Expected Slack Output](#expected-slack-output) +- [If Validation Fails](#if-validation-fails) +- [Safety Reminder](#safety-reminder) +- [Recommended Validation Order](#recommended-validation-order) +- [Related Documentation](#related-documentation) + +This guide explains **when and how to run the Bloodhound validation workflow**.
+ +The validation workflow confirms that the full system is functioning correctly, including: + +* AWS infrastructure +* Lambda execution +* Slack command routing +* teardown logic +* resource deletion + +This process uses **temporary disposable resources** and is safe when run as described. + +--- + +# When to Run Validation + +Run the validation workflow when: + +- deploying Bloodhound for the first time +- modifying Lambda code +- modifying teardown logic +- modifying IAM permissions +- modifying Terraform infrastructure +- upgrading AWS SDK dependencies +- before enabling destructive mode in production + +You **do not need to run validation for every small code change**, but it should be executed before production use. + +--- + +# Quick Validation Workflow + +The fastest way to run the full validation is: + +``` +tools/run_validation_workflow.sh +``` + +This script performs the following checks in order: + +1. Lambda infrastructure smoke test +2. Controlled teardown validation + +If any step fails, the workflow stops immediately. + +--- + +# Step 1 — Run the Validation Workflow + +From the repository root: + +``` +tools/run_validation_workflow.sh +``` + +You will see output similar to: + +``` +Bloodhound Validation Workflow + +Step 1: Running Lambda smoke test +Smoke test passed. + +Step 2: Starting controlled teardown validation +``` + +If the smoke test fails, fix the deployment before continuing. + +--- + +# Step 2 — Automatic Validation Invocation + +During teardown validation the workflow will automatically invoke the +Bloodhound Lambda function in validation mode. + +The validation script performs the following steps: + +1. Creates a temporary validation resource using Terraform +2. Captures the resource ID +3. 
Invokes the Lambda function with a validation payload + +Example validation event: + +```json +{ + "source": "validation", + "mode": "seek_destroy_validation", + "target_ids": ["i-1234567890"] +} +``` + +The Lambda validation handler enables controlled destructive mode +internally and restricts deletion to the provided validation targets. + +Validation events are routed through the Lambda event router +using the event field: + +```json +"source": "validation" +``` + +This allows the full teardown pipeline to execute automatically +without requiring manual Slack commands. + +The validation harness invokes the Lambda function directly, +bypassing the Slack command interface. + +--- + +## Validate Lambda Health Endpoint + +Optional system status verification: + +/v2_status + +This command returns the Bloodhound system status including +execution mode, deletion safety limits, and recent scan results. + +Before testing Slack integrations, confirm the Lambda service is reachable. + + +curl https://YOUR_LAMBDA_URL/health + +Expected response: + +```json +{ + "ok": true, + "service": "BloodhoundLambdaV2", + "status": "healthy" +} +``` + +This check verifies the Lambda deployment without triggering a scan. + +# Step 3 — Automatic Deletion Verification + +After Lambda executes the validation event, the validation script +automatically verifies that the resource was deleted. + +Example output: + +``` +SUCCESS: Instance no longer exists. +RESULT: PASS +``` + +If the resource still exists: + +``` +ERROR: Instance still exists. +RESULT: FAIL +``` + +This indicates that the teardown pipeline did not execute correctly. + +--- + +# Step 4 — Validation Logs + +Every validation run produces a log file. 
+ +Location: + + +`logs/validation/` + +Example log: + +`logs/validation/teardown_validation_20260307_143221.log` + +Each log records: + +* validation run ID +* created resource ID +* test steps executed +* final result (PASS / FAIL) +* Lambda request_id for traceability + +During validation runs, Lambda logs include a structured marker: + +[BLOODHOUND][VALIDATION][request_id=...] + +The request_id corresponds to the AWS Lambda invocation ID +(context.aws_request_id) and allows a single execution to be +traced across CloudWatch logs. + +Only the **3 most recent logs** are kept automatically. + +--- + +# What the Validation Tests + +# Bloodhound Execution Modes + +Bloodhound has three safety modes controlled by environment variables. + +Dry Run Mode (default) + +APPLY_CHANGES=false +TEARDOWN_SIMULATE=true + +Behavior: +- resources are scanned +- teardown plan is generated +- nothing is deleted + + +Simulation Mode + +APPLY_CHANGES=true +TEARDOWN_SIMULATE=true + +Behavior: +- deletion calls are simulated +- AWS DryRun APIs are used +- nothing is deleted + + +Apply Mode (destructive) + +APPLY_CHANGES=true +TEARDOWN_SIMULATE=false + +Behavior: +- Bloodhound executes deletion actions +- non-whitelisted resources may be removed + +The validation workflow confirms the following systems work together: + +```text +| Component | Verified | +| ------------------------------- | -------- | +| Lambda deployment | ✓ | +| validation event routing | ✓ | +| environment configuration | ✓ | +| resource detection | ✓ | +| teardown plan creation | ✓ | +| controlled destructive teardown | ✓ | +| Slack reporting | ✓ | +``` +--- + +# Expected Slack Output + +When `/v2_seek_destroy CONFIRM` runs successfully, Slack will show a message similar to: + + +Bloodhound v2 — Teardown Results + +Deleted resources: + +`ec2.instance=1` + +--- + +# If Validation Fails + +Check the following: + +**Smoke test failure** + +Run: + +`tools/smoke_test_lambda.sh` + +Fix infrastructure problems before 
continuing. + +--- + +**Resource not detected** + +Check: + +* instance region +* instance state +* whitelist tags + +Ensure the instance does **not** have: + +``` +bloodhound:keep=true +``` + +--- + +**Resource not deleted** + +Check: + +``` +APPLY_CHANGES=true +TEARDOWN_SIMULATE=false +``` + +Verify IAM permissions allow deletion. + +--- + +# Safety Reminder + +The teardown validation temporarily enables **real deletion mode**. + +Always restore safe mode after testing: + +``` +APPLY_CHANGES=false +TEARDOWN_SIMULATE=true +``` + +Then run: + +``` +terraform apply +``` + +This returns Bloodhound to **dry-run mode**. + +--- + +# Recommended Validation Order + +Always run validations in this order: + +1. Infrastructure smoke test +2. Validation harness invocation +3. Controlled teardown verification + +This ensures problems are caught early before destructive operations are attempted. + +--- + +# Related Documentation + +Slack validation: + +`docs/slack_and_lambda_validation.md` + +Architecture and configuration: + +`docs/bloodhound_v2_plan.md` diff --git a/docs/safe_operations.md b/docs/safe_operations.md new file mode 100644 index 0000000..abe4f6e --- /dev/null +++ b/docs/safe_operations.md @@ -0,0 +1,304 @@ +# Safe Operations Guide (Bloodhound V2) + +⚠️ Bloodhound must always run in dry-run mode unless explicitly validating teardown logic. 
+ +## Table of Contents + +- [Core Safety Principles](#core-safety-principles) +- [Default Safety Configuration](#default-safety-configuration) +- [Safe Validation Procedure](#safe-validation-procedure) +- [Controlled Deletion Procedure](#controlled-deletion-procedure) +- [Full Cleanup](#full-cleanup-use-extreme-caution) +- [Terraform Safety Guard](#terraform-safety-guard) +- [Lambda Version Rollback](#lambda-version-rollback) +- [Emergency Stop](#emergency-stop) +- [Summary](#summary) + +This document describes the operational safeguards built into Bloodhound V2 and the procedures engineers must follow before enabling destructive actions. + +Bloodhound is capable of identifying and deleting unused cloud infrastructure. Because of this capability, strict safeguards are enforced to prevent accidental resource deletion. + +--- + +# Core Safety Principles + +Bloodhound follows a layered safety model: + +```text +Detection + ↓ +Planning + ↓ +Dry-run validation + ↓ +Execution Path + ├─ Operator confirmation (Slack command) + └─ Validation harness (automated testing) + ↓ +Deletion +``` + +Deletion should **never be enabled without first reviewing the dry-run plan.** + +--- + +# Default Safety Configuration + +The system ships in **safe mode** by default. + +These environment variables enforce non-destructive behavior: + +``` +APPLY_CHANGES=false +TEARDOWN_SIMULATE=true +TEARDOWN_ALLOW_ALL=false +``` + +This configuration ensures: + +* Infrastructure scans run normally +* Teardown plans are generated +* No resources are deleted + +Slack will show a **Teardown Plan (dry-run)** message but no destructive actions will execute. + +--- + +# Safe Validation Procedure + +Before enabling deletion, the following validation process must be completed. 
+ +## Step 1 — Run Scan + +In Slack: + +``` +/v2_seek +``` + +Confirm that the system reports: + +* scan summary +* budget summary +* teardown plan + +--- + +## Step 2 — Review Teardown Plan + +Carefully review the Slack message: + +``` +Bloodhound v2 — Teardown Plan (dry-run) +``` + +Verify: + +* resources are expected candidates +* no production resources appear +* whitelisted resources are excluded + +Example: + +``` +simulate: true +planned_actions: 36 +``` + +--- + +## Step 3 — Confirm Whitelisted Resources + +Resources that should never be deleted must have the tag: + +``` +bloodhound:keep=true +``` + +Bloodhound will list these in Slack under: + +``` +Whitelisted Resources (Kept) +``` + +If a resource should be protected but does not appear here, add the tag before proceeding. + +--- + +# Controlled Deletion Procedure + +Deletion should only occur after the dry-run plan has been reviewed. + +## Enable Deletion Mode + +Update environment configuration: + +``` +APPLY_CHANGES=true +TEARDOWN_SIMULATE=false +``` + +Keep this safeguard enabled unless a full cleanup is intended: + +``` +TEARDOWN_ALLOW_ALL=false +``` + +This forces operators to explicitly specify targets. + +--- + +## Targeted Deletion (Recommended) + +Specify the exact resources to delete: + +``` +TEARDOWN_TARGET_IDS=<resource-ids> +``` + +Example: + +``` +TEARDOWN_TARGET_IDS=i-0123456789abcdef +``` + +Then execute one of the following: + +Operator-triggered deletion (Slack command path): + +`/v2_seek_destroy CONFIRM` + +or automated validation execution: + +validation harness → Lambda validation event invocation + +```json +{ "source": "validation" } +``` + +Validation events are routed through the Lambda event router +and handled by the validation execution path. + +--- + +# Full Cleanup (Use Extreme Caution) + +Only enable full cleanup when the environment is confirmed safe.
+ +Required configuration: + +``` +APPLY_CHANGES=true +TEARDOWN_SIMULATE=false +TEARDOWN_ALLOW_ALL=true +``` + +Then run: + +``` +/v2_seek_destroy CONFIRM +``` + +Bloodhound will execute the teardown plan. + +--- + +# Terraform Safety Guard + +Terraform includes an account safety guard to prevent deploying Bloodhound to the wrong AWS account. + +Variable: + +``` +expected_aws_account_id +``` + +Terraform validates: + +``` +current AWS account ID + == +expected account ID +``` + +If the IDs do not match, the deployment fails. + +This prevents accidental deployments to production or unrelated accounts. + +--- + +# Lambda Version Rollback + +Bloodhound uses Lambda versioning with a `prod` alias. + +Every deployment publishes a new immutable version: + +``` +BloodhoundLambdaV2 + ├ Version 1 + ├ Version 2 + └ Version 3 + ↑ + prod +``` + +If a deployment introduces a problem, the alias can be moved to a previous version. + +Rollback procedure: + +1. Open AWS Console +2. Navigate to Lambda → BloodhoundLambdaV2 +3. Open **Aliases** +4. Edit **prod** +5. Select the previous working version + +Slack integration will continue working because the Function URL always invokes the alias. + +--- + +# Emergency Stop + +If unexpected behavior occurs: + +1. Set: + +``` +APPLY_CHANGES=false +TEARDOWN_SIMULATE=true +``` + +2. Redeploy or update Lambda environment variables. + +This immediately disables destructive actions. + +--- + +# Lambda Execution Logging + +All Bloodhound Lambda executions emit structured log markers: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Examples: + +[BLOODHOUND][SCAN][request_id=...] +[BLOODHOUND][SCHEDULED][request_id=...] +[BLOODHOUND][VALIDATION][request_id=...] + +The request_id corresponds to the AWS Lambda invocation ID +(context.aws_request_id) and allows engineers to trace +individual executions through CloudWatch logs. 
+ +--- + +# Summary + +Bloodhound V2 implements multiple safety layers: + +* dry-run mode enabled by default +* explicit confirmation required +* whitelist protection via resource tags +* Terraform account guard +* Lambda version rollback capability + +Operators must always review teardown plans before enabling deletion. diff --git a/docs/slack_and_lambda_validation.md b/docs/slack_and_lambda_validation.md new file mode 100644 index 0000000..8872665 --- /dev/null +++ b/docs/slack_and_lambda_validation.md @@ -0,0 +1,701 @@ +# Validating Slack Slash Commands and Lambda Execution Logs + +## Table of Contents + +- [Prerequisites](#prerequisites) +- [Part A — Open CloudWatch Logs](#part-a--open-cloudwatch-logs-set-this-up-first) +- [Part B — Trigger the Slash Command in Slack](#part-b--trigger-the-slash-command-in-slack) +- [Part C — Watch Logs Update in CloudWatch](#part-c--watch-logs-update-in-cloudwatch) +- [What You Should Look For in Logs](#what-you-should-look-for-in-logs) +- [If You Do Not See Logs](#if-you-do-not-see-logs) +- [Optional Fast Path](#optional-fast-path-sometimes-easier) +- [Success Criteria](#success-criteria) +- [Validating `/v2_seek_destroy` Safety Controls](#validating-v2_seek_destroy-safety-controls) +- [Verify Lambda Environment Variables](#verify-lambda-environment-variables) +- [Watching Lambda Logs Live](#watching-lambda-logs-live) +- [Common Failure Scenarios](#common-failure-scenarios) + +This document verifies that Slack slash commands are correctly wired to **BloodhoundLambdaV2** +and that Lambda execution can be observed in **CloudWatch Logs**. + +Bloodhound uses an event router inside the Lambda handler to determine +which execution path should run: + +Slack HTTP events → Slack handler +Scheduled events → scheduled handler +Manual invocations (scan/status) → main pipeline (`run()`) + +This validation confirms the Slack execution path is functioning correctly. 
+ +Use this procedure after: +- `terraform apply` succeeds +- Slack slash command Request URLs are set to the Lambda Function URL +- `/v2_seek` is expected to produce Slack output + +Note: +This document validates Slack command routing only. +The automated teardown validation workflow is documented in: + +docs/validate_teardown.md + +--- + +## Prerequisites + +- You have access to the AWS account where Bloodhound V2 is deployed. +- You know the AWS region Bloodhound V2 is deployed in (currently `us-west-2`). +- The Slack app is installed in the target workspace and the slash commands exist: + - `/v2_seek` + - `/v2_seek_destroy` + +--- + +## Part A — Open CloudWatch Logs (Set This Up First) + +1. Log into the **AWS Console** (the same AWS account where Terraform deployed Bloodhound V2). + +2. In the AWS console top-right region selector, set the region to: + + - `us-west-2` + + Important: + If you are in the wrong region, the log group will not appear and you may see “Function not found” type errors. + +3. In the AWS Console search bar, type: + + - `CloudWatch` + + Open **CloudWatch**. + +4. In the left-side menu, navigate to: + + - **Logs** + - **Log Management** + - **Log groups** + +5. In the Log groups search bar, type: + + - `/aws/lambda/BloodhoundLambdaV2` + +6. Click the log group: + + - `/aws/lambda/BloodhoundLambdaV2` + +7. Click **Latest log streams** (or click the most recent log stream listed). + + You should now be on a screen that shows log events. + +8. Leave this page open. You are now “watching logs.” + +--- + +## Part B — Trigger the Slash Command in Slack + +9. Open Slack (the workspace where **Bloodhound V2** is installed). + +10. Navigate to your admin/testing channel (example): + + - `bloodhound-seek-n-destroy-admin` + +11. Run the slash command: + + - `/v2_seek` + +12. 
Confirm Slack immediately responds with a short acknowledgement message like: + + - “BloodHound is on the hunt...please stand by.” + +This message indicates that Slack successfully invoked the Lambda handler and the async worker path has started. + +--- + +## Part C — Watch Logs Update in CloudWatch + +13. Immediately switch back to the CloudWatch log stream page you opened in Part A. + +14. Click the **Refresh** button (or refresh icon). + +15. If you are viewing the **log streams list** (instead of a stream’s log events): + + - refresh the list + - click the newest log stream (the one with the most recent timestamp) + +16. If you are already inside a log stream viewing log events: + + - click refresh until you see new log lines appear + +17. You should see log lines that correspond to the `/v2_seek` invocation. + +Bloodhound logs follow a standardized format: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Example: + +[BLOODHOUND][SLACK][request_id=0f6c9c4e-1234-4e1e-9b7f-6b9d3d3a9c2a] + +The `request_id` value corresponds to the AWS Lambda invocation ID +and allows a single execution to be traced across all CloudWatch logs. + +--- + +## What You Should Look For in Logs + +### Important: Recursion Prevention + +Scheduled execution is isolated from the generic `run()` function to +prevent recursive Lambda invocation. 
+ +Correct behavior: + +- Each invocation logs: + - "Bloodhound pipeline started" + - "Bloodhound pipeline completed" +- Each invocation runs exactly once + +Incorrect behavior (indicates a bug): + +- Repeated "Scheduled Bloodhound scan triggered" +- Multiple identical executions from a single event +- RecursionError or timeout + +If recursion is observed, review Lambda routing and ensure scheduled +events are handled through: + +scheduled_handler → run_scheduled_scan() → execute_pipeline() + +You are confirming these signals: + +- A new log stream appears immediately after `/v2_seek` is run +- The first log line shows the Bloodhound event marker + +Example: + +[BLOODHOUND][SLACK][request_id=...] + +- The same `request_id` appears in all logs for that execution +- Log lines indicate region scanning activity +- Log lines indicate Slack message posting +- No errors occur + +Common errors to watch for: + +- Missing environment variables (Slack tokens, signing secret, etc.) +- Slack authentication errors +- Slack signature verification failures +- AWS permissions errors (AccessDenied) +- Region mismatch (scanning wrong regions) + +--- + +## If You Do Not See Logs + +### 1) Wrong AWS Region + +You must use the same region where Terraform deployed the Lambda. +For Bloodhound V2 this is typically: + +- `us-west-2` + +Fix: +- Switch the AWS Console region to `us-west-2` +- Repeat the steps above + +--- + +### 2) Wrong Log Group + +The log group must be exactly: + +- `/aws/lambda/BloodhoundLambdaV2` + +Fix: +- Re-search log groups in CloudWatch +- Ensure you are clicking the correct group name + +--- + +### 3) No New Log Stream Appears + +If Slack shows no response and CloudWatch shows no activity, Lambda may not be invoked. 
+ +Check: + +- Slack App → Slash Commands → Request URL is correct +- The Request URL matches the Terraform output Lambda Function URL +- The Lambda Function URL is still enabled +- Lambda Function URL permissions still exist +- Your Slack app is installed in the correct workspace + +--- + +## Optional Fast Path (Sometimes Easier) + +Instead of navigating CloudWatch manually: + +1. AWS Console → **Lambda** +2. Click **BloodhoundLambdaV2** +3. Go to the **Monitor** tab +4. Click **View logs in CloudWatch** +5. Continue from Part A step 7 + +--- + +## Success Criteria + +This validation is complete when: + +- `/v2_seek` produces the expected Slack output (scan summary, budget summary, teardown plan) +- `/v2_seek_destroy` enforces its confirmation and allowlist protections +- CloudWatch logs show a corresponding invocation and execution path +- Direct CLI invocation returns a valid pipeline response (`ok: true`) +- No recursive execution or repeated scheduled triggers occur +- No destructive actions occur during validation (dry-run / simulate mode only) + +## Direct Lambda CLI Validation (Recommended) + +In addition to Slack-based validation, you should verify Lambda execution +directly using the AWS CLI. This confirms the core pipeline works +independently of Slack, EventBridge, or GitHub Actions. + +### Step 1 — Invoke Lambda directly + +Run: + +aws lambda invoke \ +--function-name BloodhoundLambdaV2 \ +--region us-west-2 \ +--payload '{"source":"scan"}' \ +--cli-binary-format raw-in-base64-out \ +out.json + +If successful, you will see: + +{ + "StatusCode": 200, + "ExecutedVersion": "$LATEST" +} + +--- + +### Step 2 — Inspect output + +Run: + +cat out.json + +Expected result: + +{ + "ok": true, + ... 
+} + +--- + +### Step 3 — Verify logs + +Check CloudWatch logs and confirm: + +- "Bloodhound pipeline started" +- "Bloodhound pipeline completed" +- Only ONE execution occurs +- No recursion or repeated scheduled triggers + +--- + +### Why this matters + +This test isolates Lambda execution and confirms: + +- routing logic is correct +- recursion issues are resolved +- pipeline executes deterministically + +This should always be performed after major routing or handler changes. + +## Validating `/v2_seek_destroy` Safety Controls + +The `/v2_seek_destroy` command has multiple protection layers to prevent +accidental destructive actions. + +During validation you should confirm these protections are functioning. + +### Step 1 — Run destroy command without confirmation + +In Slack run: + + +/v2_seek_destroy + + +Expected result: + +Slack should return a message similar to: + + +Not allowed. Use /v2_seek_destroy CONFIRM (and ensure you are allowlisted). + + +This confirms the confirmation token protection is working. + +--- + +### Step 2 — Run destroy command with confirmation + +Run: + +/v2_seek_destroy CONFIRM + +The command may still be rejected if allowlists are configured. + +Bloodhound validates the following environment variables: + +- `SLACK_DESTROY_CONFIRM_TOKEN` +- `SLACK_ALLOWED_USER_IDS` +- `SLACK_ALLOWED_CHANNEL_IDS` + +If allowlists are defined and the user/channel is not included, +the command will return the same **Not allowed** response. + +--- + +### Expected behavior during validation + +Even when the command succeeds, destructive actions should **not** +occur because runtime safety flags are enabled. + +Relevant environment variables: + +APPLY_CHANGES=false +TEARDOWN_SIMULATE=true + +The full teardown safety model and configuration reference +is documented in: + +`docs/bloodhound_v2_plan.md` + +This forces Bloodhound to operate in **dry-run mode**, meaning it +will only generate a teardown plan and never call destructive AWS APIs. 
+ +--- + +### What successful validation looks like + +You should observe: + +1. `/v2_seek_destroy` without confirmation → rejected +2. `/v2_seek_destroy CONFIRM` → allowed only if allowlisted +3. Slack responses generated correctly +4. Lambda invocation visible in CloudWatch logs +5. No AWS resources are deleted + +## Verify Lambda Environment Variables + +If Slack commands behave unexpectedly, verify that the Lambda environment +variables were correctly deployed by Terraform. + +NOTE: Ensure the AWS CLI region matches the Terraform deployment region +(us-west-2). Otherwise Lambda may appear "missing". + +Run: + +aws lambda get-function-configuration \ +--region us-west-2 \ +--function-name BloodhoundLambdaV2 \ +--query 'Environment.Variables' + +This command prints the runtime configuration currently applied to the Lambda. + +Confirm that the expected variables appear, such as: + +SLACK_ENABLED +SLACK_SCAN_CHANNEL_ID +SLACK_ALERT_CHANNEL_ID +SLACK_SIGNING_SECRET +APPLY_CHANGES +TEARDOWN_SIMULATE + +If any variables are missing, Terraform may not have applied the latest +configuration. + +To fix: + +terraform plan +terraform apply + +--- + +## Watching Lambda Logs Live + +Instead of refreshing CloudWatch in the console, you can stream Lambda +logs directly in your terminal. This is very useful when debugging +slash command behavior while triggering `/v2_seek` or `/v2_seek_destroy`. + +### Command + +```bash +aws logs tail /aws/lambda/BloodhoundLambdaV2 \ +--region us-west-2 \ +--follow +```` + +### What this does + +This command: + +``` +connects to CloudWatch +↓ +streams new Lambda log events +↓ +prints them in your terminal +``` + +It behaves similarly to: + +``` +tail -f +``` + +for Lambda logs. + +--- + +### Recommended workflow + +Open **two terminals**. + +#### Terminal 1 — watch logs + +Run: + +```bash +aws logs tail /aws/lambda/BloodhoundLambdaV2 \ +--region us-west-2 \ +--follow +``` + +Leave it running. 
+ +#### Terminal 2 — trigger Slack command + +In Slack run: + +``` +/v2_seek +``` + +or + +``` +/v2_seek_destroy CONFIRM +``` + +--- + +### Expected output + +When Lambda runs you should see logs similar to: + +START RequestId: 0f6c9c4e-1234-4e1e-9b7f-6b9d3d3a9c2a + +[BLOODHOUND][SLACK][request_id=0f6c9c4e-1234-4e1e-9b7f-6b9d3d3a9c2a] +Event received: {...} + +Received Slack slash command +command=/v2_seek +Scanning region us-east-1 +Scanning region us-west-2 +Posting Slack summary + +END RequestId: 0f6c9c4e-1234-4e1e-9b7f-6b9d3d3a9c2a +REPORT Duration: 14110 ms + +This confirms the full execution path from Slack → Lambda → AWS scan. + +--- + +### Why this command is useful + +It allows you to immediately see runtime errors such as: + +``` +Slack signature verification failed +Missing environment variable +AccessDenied +Invalid token +``` + +without navigating through the CloudWatch console. + +--- + +### Optional improvements + +Add timestamps: + +```bash +aws logs tail /aws/lambda/BloodhoundLambdaV2 \ +--region us-west-2 \ +--follow \ +--format short +``` + +Filter only errors: + +```bash +aws logs tail /aws/lambda/BloodhoundLambdaV2 \ +--region us-west-2 \ +--follow \ +--filter-pattern "ERROR" +``` + +## Common Failure Scenarios + +This section lists common issues that may occur when validating +Slack commands or Lambda execution. + +--- + +### 1. Slash command returns “Function not found” + +Example error: + +An error occurred (ResourceNotFoundException) +Function not found + +Cause: + +The AWS CLI or console is using the wrong region. + +Bloodhound V2 is deployed in: + +us-west-2 + +Fix: + +Specify the region explicitly when using the CLI: + +aws lambda get-function-configuration +--region us-west-2 +--function-name BloodhoundLambdaV2 + + +Or switch the AWS Console region to `us-west-2`. + +--- + +### 2. 
Slack command produces no response + +Possible causes: + +- Slack Request URL is incorrect +- Lambda Function URL is disabled +- Lambda permissions were removed +- Slack app was not reinstalled after manifest changes + +Fix: + +Verify the Slack command configuration: + + +Slack App → Slash Commands + + +Ensure the Request URL matches the Terraform output: + + +Lambda Function URL + + +--- + +### 3. `/v2_seek_destroy` returns “Not allowed” + +Example response: + +Not allowed. Use /v2_seek_destroy CONFIRM (and ensure you are allowlisted). + +This is expected behavior if safety checks fail. + +Possible causes: + +- Missing confirmation token +- User not in `SLACK_ALLOWED_USER_IDS` +- Channel not in `SLACK_ALLOWED_CHANNEL_IDS` + +Verify the Lambda environment variables: + +SLACK_DESTROY_CONFIRM_TOKEN +SLACK_ALLOWED_USER_IDS +SLACK_ALLOWED_CHANNEL_IDS + +--- + +### 4. Slash command succeeds but no logs appear + +Cause: + +The CloudWatch log group may not exist yet if Lambda has never run. + +Fix: + +Run the command again: + +/v2_seek + +Lambda automatically creates the log group on first execution. + +--- + +### 5. Terraform changes not reflected in Lambda + +Cause: + +Terraform may not have applied the latest configuration. + +Fix: + +Run: + +terraform plan +terraform apply + +Then verify Lambda environment variables: + +aws lambda get-function-configuration +--region us-west-2 +--function-name BloodhoundLambdaV2 +--query 'Environment.Variables' + +--- + +### 6. Unexpected teardown behavior + +If `/v2_seek_destroy` appears to plan more resources than expected: + +Check the whitelist configuration: + +KEEP_TAG_KEY +KEEP_TAG_VALUE + +Resources tagged with these values will be excluded from teardown. + +Example: + +bloodhound:keep=true + +--- + +## When to Escalate + +If validation fails after following the steps above: + +1. Capture the Slack command output +2. Capture the CloudWatch logs +3. Capture the Lambda environment variables +4. 
Capture the Terraform plan output + +These artifacts should be sufficient to diagnose most issues with the +Slack → Lambda → AWS execution pipeline. diff --git a/docs/slack_app_operations.md b/docs/slack_app_operations.md new file mode 100644 index 0000000..cf52d8a --- /dev/null +++ b/docs/slack_app_operations.md @@ -0,0 +1,40 @@ +# Slack App Operations (Bloodhound V2) + +Slack Workspace +Workspace: CodePlatoon + +Admin console: + +https://codeplatoon.slack.com/apps/manage + +Only Slack workspace administrators can install or remove applications. + +Current Slack admins typically include: + +- Francisco A. +- Julius B. +- Mike M. + +## Slack App Ownership + +Slack apps should never be owned by a single engineer. + +To prevent orphaned integrations and loss of access, Slack app +ownership must be shared with multiple workspace administrators. + +Open the Slack developer console: + +https://api.slack.com/apps + +Select: + +Bloodhound-V2 + +Then: + +Settings → Collaborators + +Add: + +Julius Bautista +Francisco Avila \ No newline at end of file diff --git a/docs/troubleshooting_slack_commands.md b/docs/troubleshooting_slack_commands.md new file mode 100644 index 0000000..b7bce68 --- /dev/null +++ b/docs/troubleshooting_slack_commands.md @@ -0,0 +1,493 @@ +# Slack Slash Command Troubleshooting + +## Table of Contents + +- [Common Causes](#common-causes) +- [Step 1 — Verify Lambda Function URL](#step-1--verify-the-lambda-function-url) +- [Step 2 — Verify Slack Slash Command Configuration](#step-2--verify-the-slack-slash-command-configuration) +- [Step 3 — Validate Lambda Endpoint Health](#step-3--validate-lambda-endpoint-health) +- [Step 4 — Reinstall Slack App](#step-4--reinstall-the-slack-app-if-commands-are-missing) +- [Step 5 — Verify Channel Configuration](#step-5--verify-channel-configuration) +- [Step 6 — Perform Direct Command Test](#step-6--perform-a-direct-command-test) +- [Preventing This Issue](#preventing-this-issue) +- [Force Slack to Refresh Slash 
Commands](#step-7--force-slack-to-refresh-slash-commands) +- [Related Documentation](#related-documentation) + +This guide explains how to diagnose and fix situations where the **Bloodhound Slack commands (such as `/v2_seek`, `/v2_seek_destroy_plan`, `/v2_seek_destroy`, or `/v2_status`) stop appearing or stop responding**. + +These issues typically occur after infrastructure updates or configuration changes. + +Note + +Bloodhound v2 introduced new slash commands prefixed with `/v2_`. +Older documentation and logs may reference legacy commands such as +`/seek` or `/seek_destroy`. + +--- + +# Common Causes + +If a Slack slash command disappears or stops working, the most common reasons are: + +1. **Slack lost the configured request URL** +2. **The Lambda Function URL changed** +3. **Terraform redeployed the Lambda** +4. **Slack app permissions were modified** +5. **Slack temporarily disabled the command due to endpoint failures** + +During Terraform deployments you may see messages such as: + +``` +aws_lambda_function.bloodhound_v2: Modifying... +aws_lambda_alias.bloodhound_prod: Modifying... +``` + +If the Lambda endpoint changes during deployment, Slack may still point to the previous URL. + +When Slack cannot reach the configured endpoint, it may **silently hide or disable the slash command**. + +--- + +# Step 1 — Verify the Lambda Function URL + +Run the following command: + +``` +aws lambda get-function-url-config \ + --function-name BloodhoundLambdaV2 \ + --region us-west-2 +``` + +Expected output: + +``` +{ + "FunctionUrl": "https://xxxx.lambda-url.us-west-2.on.aws/", + "FunctionArn": "arn:aws:lambda:us-west-2:ACCOUNT_ID:function:BloodhoundLambdaV2", + "AuthType": "NONE" +} +``` + +Example from a working deployment: + +``` +https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/ +``` + +Copy the **FunctionUrl** value. 
+ +--- + +# Step 2 — Verify the Slack Slash Command Configuration + +Open the Slack developer console: + +``` +https://api.slack.com/apps +``` + +Navigate to: + +``` +Your App +→ Slash Commands +``` + +Select the command. + +Legacy systems used: + +/seek + +Current Bloodhound v2 systems use: + +/v2_seek + +Verify the **Request URL** is set to: + +``` +https://.lambda-url.us-west-2.on.aws/ +``` + +If the URL does not match the Lambda Function URL retrieved in Step 1, update it. + +Click: + +``` +Save +``` + +--- + +# Step 3 — Validate Lambda Endpoint Health + +Before reconnecting Slack, confirm that the Lambda endpoint is reachable. + +Run: + +curl https://YOUR_LAMBDA_URL + +Example: + +curl https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/ + +A healthy response should return JSON similar to: + +{ + "regions": ["us-east-1","us-east-2","us-west-1","us-west-2"], + "scan": { + "candidates_total": 41, + "kept_total": 1 + }, + "ok": true, + "teardown": { + "planned_actions": 41, + "execution": null, + "targets_filter": null, + "apply_changes": false, + "simulate": true + }, + "budget": { + "over_budget_threshold_met": true, + "projected_month_end_spend_usd": 3139.02, + "dynamic_monthly_allowance_usd": 0.0 + } +} + +The key indicator of a healthy endpoint is: + +"ok": true + +If the endpoint returns a valid JSON response and "ok": true, the Lambda function +is reachable and executing correctly. + +This confirms that: + +• the Lambda Function URL is valid +• the Lambda function is running +• AWS permissions are functioning +• the infrastructure scan logic is executing + +If this test succeeds but the Slack command still fails, the issue is likely +related to Slack configuration or Slack command caching. + +## Step 3.1 — Run Lambda Health Check + +Bloodhound provides a lightweight health endpoint that verifies the Lambda +service is reachable without triggering a full scan. 
+ +Run: + +curl https://YOUR_LAMBDA_URL/health + +Example: + +curl https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/health + +Expected response: + +{ + "ok": true, + "service": "BloodhoundLambdaV2", + "status": "healthy" +} + +If this request succeeds, it confirms: + +• the Lambda Function URL is valid +• the Lambda runtime is working +• the function is deployed and reachable + +If the health check succeeds but Slack commands fail, the issue is likely +related to Slack configuration or caching. + + +--- + +# Step 4 — Reinstall the Slack App (If Commands Are Missing) + +Sometimes Slack will stop showing a slash command if the app configuration changed or the endpoint failed repeatedly. Reinstalling the app refreshes the configuration. + +Follow these steps. + +4.1 Open the Slack App Management Page + +From your Slack workspace (like the one shown in the screenshot): + +Open the workspace in Slack. + +In the left sidebar locate the Apps section. + +Click the Bloodhound V2 - Seek & Destroy app (or the name of your Slack integration). + +If the app does not appear in Slack, continue with the next step. + +4.2 Open the Slack Developer Console + +Open the Slack developer dashboard: + +https://api.slack.com/apps + +You will see a list of Slack apps associated with the workspace. + +Select: + +Bloodhound V2 - Seek & Destroy + +4.3 Navigate to Install App + +In the left sidebar of the Slack developer console, click: + +Settings +→ Install App + +This will open the **Installed App Settings** page. + +4.4 Reinstall the App + +On the Installed App Settings page locate the button: + +Reinstall to CodePlatoon + +Click: + +Reinstall to CodePlatoon + +Slack will open an authorization screen. + +Click: + +Allow + +This forces Slack to re-authorize the application and refresh the integration with the workspace.
+ +This process refreshes: + +• slash command registrations +• OAuth permissions +• bot connection to the workspace +• request URL bindings + +Note + +Slack may stop showing slash commands if the configured endpoint fails repeatedly +or if infrastructure changes modify the Lambda Function URL. + +Reinstalling the app forces Slack to resynchronize the command configuration +with the workspace. + +4.5 Verify Slash Commands Were Restored + +Return to Slack and test the command: + +/v2_seek + +or + +/v2_seek_destroy_plan + +or + +/v2_seek_destroy + +or + +/v2_status + +If the reinstall succeeded, Slack should now display the command and the Lambda endpoint should receive the request. + +When Reinstallation Is Needed + +Reinstalling the Slack app is often required after: + +• Lambda endpoint URL changes +• Terraform redeploys the Slack integration +• Slack permissions are updated +• Slack temporarily disables a command after repeated endpoint failures + +Reinstalling forces Slack to re-sync the application configuration with the workspace. + +--- + +# Step 5 — Verify Channel Configuration + +Bloodhound restricts command execution using environment variables. + +Check your Lambda environment variables: + +``` +SLACK_SCAN_CHANNEL_ID +SLACK_ALERT_CHANNEL_ID +``` + +Example: + +``` +SLACK_SCAN_CHANNEL_ID=C0A4YLV0HNY +SLACK_ALERT_CHANNEL_ID=C0A4YLV0HNY +``` + +Ensure the Slack command is being executed in the correct channel. + +--- + +# Step 6 — Perform a Direct Command Test + +You can simulate a Slack command by sending a request to Lambda. + +Example: + +``` +Legacy command test: + +curl -X POST https://YOUR_LAMBDA_URL \ + -d "command=/seek" + +Current Bloodhound v2 test: + +curl -X POST https://YOUR_LAMBDA_URL \ + -d "command=/v2_seek" +``` + +If the Lambda returns JSON output, the endpoint is functioning correctly. 
+ +--- + +# Why This Happens + +When Terraform redeploys Lambda, the following may occur: + +``` +aws_lambda_function.bloodhound_v2: Modifying +aws_lambda_alias.bloodhound_prod: Modifying +``` + +If the **Lambda Function URL changes**, Slack continues pointing to the previous URL. + +Slack then attempts to call the endpoint, fails, and may temporarily hide or disable the command. + +--- + +# Preventing This Issue + +To prevent Slack command failures during deployments: + +• ensure Terraform does not recreate the Lambda Function URL unnecessarily +• keep Slack request URLs synchronized with the deployed Lambda endpoint +• validate the Lambda endpoint using the smoke test script after deployments + +--- + +# Step 7 — Force Slack to Refresh Slash Commands + +Sometimes Slack still has the old command configuration cached locally, even after the endpoint has been fixed. + +Instead of reinstalling the entire Slack app, you can force Slack to refresh the command registration. + +Method 1 — Edit and Re-Save the Slash Command + +Open the Slack developer dashboard: + +https://api.slack.com/apps + +Select the app: + +Bloodhound V2 - Seek & Destroy + +Navigate to: + +Slash Commands + +Select the command. + +Legacy command: + +/seek + +Current command: + +/v2_seek + +In the command configuration page: + +Change the Request URL temporarily. + +Example: + +https://your-lambda-url.lambda-url.us-west-2.on.aws/ + +Change it to something like: + +https://your-lambda-url.lambda-url.us-west-2.on.aws/?refresh=1 + +Click: + +Save + +Change the URL back to the original value. + +Click: + +Save + +This forces Slack to refresh the command configuration and invalidate its cache. + +Method 2 — Restart Slack Client + +After editing the command, restart the Slack client. + +On Mac: + +Cmd + Q + +Then reopen Slack. + +On Windows: + +Ctrl + Q + +This ensures the Slack UI reloads the latest command definitions.
+ +Method 3 — Refresh Slack Command Suggestions + +Inside Slack, type: + +/ + +Slack will reload the slash command list. + +If either command appears in the suggestion list, the refresh worked: + +/seek (legacy) +/v2_seek (current) + +# Related Documentation + +Infrastructure validation: + +``` +docs/run_validation.md +``` + +Teardown validation process: + +``` +docs/validate_teardown.md +``` + +System architecture and configuration: + +``` +docs/bloodhound_v2_plan.md +``` + +--- + +Tip: add a short section to the main README that points engineers to this guide: + +``` +⚠ If Slack commands stop responding after deployment, +see docs/troubleshooting_slack_commands.md +``` + +Many infrastructure repos include a pointer like this so engineers can quickly find fixes. diff --git a/docs/troubleshooting_terraform_lambda.md b/docs/troubleshooting_terraform_lambda.md new file mode 100644 index 0000000..bf45a72 --- /dev/null +++ b/docs/troubleshooting_terraform_lambda.md @@ -0,0 +1,717 @@ +# Terraform + Lambda Troubleshooting + +## Table of Contents + +- [Lambda Runtime Import Errors](#issue-lambda-runtime-import-errors) +- [pip Dependency Resolution Failure During Lambda Packaging](#issue-pip-dependency-resolution-failure-during-lambda-packaging) +- [Cause](#cause) +- [Diagnosis](#diagnosis) +- [Fix](#fix) +- [Additional Verification](#additional-verification) +- [When This Problem Commonly Appears](#when-this-problem-commonly-appears) +- [Best Practice](#best-practice) +- [Terraform Archive Creation Error](#issue-terraform-archive-creation-error) +- [AWS Rejects Lambda Deployment Zip](#issue-aws-rejects-lambda-deployment-zip) +- [.build Directory Issues](#issue-build-directory-behaving-inconsistently) +- [Validation Workflow Reminder](#validation-workflow-reminder) +- [Related Documentation](#related-documentation) + +This guide documents common issues encountered when deploying or validating the **Bloodhound Lambda infrastructure using Terraform**.
+ +These problems typically occur during packaging, deployment, or validation workflows. + +--- + +# Issue: Lambda Runtime Import Errors + +Example error: + +``` +Runtime.ImportModuleError: +No module named 'bloodhound.aws' +``` + +or + +``` +Unable to import module 'handlers.lambda_function' +``` + +These errors occur when the Lambda runtime cannot locate Python modules inside the deployment package. + +--- + +# Cause + +The Lambda package (`.build/bloodhound_lambda_v2.zip`) did not include the full Bloodhound source tree. + +Bloodhound uses Terraform to construct the Lambda package using: + +``` +infra/build.tf +``` + +This process performs the following steps: + +``` +rm -rf .build +mkdir -p .build/lambda_pkg +pip install dependencies +rsync application source +archive zip +deploy Lambda +``` + +If the packaging step fails or does not run, the Lambda runtime may deploy with **missing modules**. + +--- + +# Diagnosis + +Inspect the built package directory. + +Run: + +``` +ls .build/lambda_pkg/bloodhound +``` + +Expected output: + +``` +app.py +aws.py +budget.py +config.py +messages.py +slack.py +handlers/ +scanner/ +teardown/ +``` + +If files such as `aws.py`, `scanner/`, or `teardown/` are missing, the packaging step did not execute correctly. + +--- + +# Root Cause: Terraform Build Step Was Not Triggered + +Bloodhound uses this resource to build the Lambda package: + +``` +terraform_data.build_lambda_pkg +``` + +Terraform will **only rerun this step when its triggers change**. + +If no triggers change, Terraform assumes the package is already up to date and **skips rebuilding the package**. + +This can lead to situations where: + +• the source code changed +• the package directory still contains an older build +• the deployed Lambda contains incomplete modules + +You may see Terraform output like: + +``` +Plan: 0 to add, 0 to change, 1 to destroy +``` + +This indicates the packaging step did not run. 
+ +--- + +# Fix + +If the `.build` directory was deleted or corrupted, Terraform may attempt to deploy an empty Lambda package. + +The correct recovery procedure is to **force Terraform to rebuild the Lambda package**. + +From the `infra` directory run: + +terraform apply -replace=terraform_data.build_lambda_pkg + +This forces the packaging step to run again regardless of trigger hashes. + +During execution Terraform should run the packaging script: + +rm -rf .build +mkdir -p .build/lambda_pkg +pip install -r requirements.txt -t .build/lambda_pkg +rsync application source +create lambda zip +deploy updated Lambda + +Expected Terraform output: + +terraform_data.build_lambda_pkg: Provisioning with 'local-exec' +Prepared package dir: .build/lambda_pkg +Downloading dependencies +Installing slack_sdk +Creating deployment archive + +If the `pip install` step does not appear in the output, the packaging script did not run. + +After the rebuild completes, verify the package contents: + +ls ../.build/lambda_pkg + +Expected output should include the application and dependencies: + +bloodhound/ +handlers/ +slack_sdk/ +boto3/ +requests/ + +You can also inspect the deployed archive: + +unzip -l ../.build/bloodhound_lambda_v2.zip | head + +The archive should contain the application code and dependencies at the root. + +Post-Recovery Verification + +After rebuilding the Lambda package, confirm that the deployment works correctly. 
+ +Step 1 — Smoke Test the Lambda + +After Terraform completes successfully, you should see the Lambda Function URL in the outputs: + +bloodhound_lambda_url = "https://YOUR-URL-ID.lambda-url.us-west-2.on.aws/" + +Run a quick test request: + +curl https://YOUR-URL-ID.lambda-url.us-west-2.on.aws/ + +Example successful response: + +{ + "regions": ["us-east-1","us-east-2","us-west-1","us-west-2"], + "scan": { + "candidates_total": 56, + "kept_total": 1 + }, + "ok": true, + "teardown": { + "planned_actions": 56, + "execution": null, + "targets_filter": null, + "apply_changes": false, + "simulate": true + }, + "budget": { + "over_budget_threshold_met": true, + "projected_month_end_spend_usd": 2753.1269, + "dynamic_monthly_allowance_usd": 0.0 + } +} + +A response like this confirms: + +• Lambda successfully executed +• Application modules loaded correctly +• Dependencies were packaged correctly +• Infrastructure scan logic is functioning + +If the request fails with an import error, inspect the Lambda logs in CloudWatch. + +Step 2 — Run the Validation Workflow + +Once the Lambda smoke test succeeds, run the full validation workflow. + +./tools/run_validation_workflow.sh + +This workflow verifies: + +• Lambda deployment +• infrastructure scanning +• teardown planning logic +• Slack command integrations + +This step ensures the entire Bloodhound pipeline functions end-to-end after deployment. + +Expected Outcome + +If all steps succeed: + +• Terraform deployment succeeds +• Lambda responds correctly +• Validation workflow completes without errors + +At this point the infrastructure deployment can be considered healthy and operational.
+ +--- + +# Additional Verification + +You can inspect the Terraform build state using: + +``` +terraform state show terraform_data.build_lambda_pkg +``` + +Example: + +``` +resource "terraform_data" "build_lambda_pkg" { + triggers_replace = { + handlers_tree_hash + lambda_handler_hash + requirements_hash + source_tree_hash + } +} +``` + +These hashes determine when Terraform rebuilds the Lambda package. + +--- + +## Local Lambda Environment Debugging (Optional) + +If deeper debugging is required, you can run the Lambda runtime +and Lambda build environments locally using Docker. + +This can help diagnose packaging or dependency issues before +deploying with Terraform. + +Simulate the **Lambda runtime environment**: + +docker run -it public.ecr.aws/lambda/python:3.10 bash + +This container mirrors the environment used by AWS Lambda +when executing the deployed function. + +Simulate the **Lambda build environment**: + +docker run -it public.ecr.aws/sam/build-python3.10 bash + +This container matches the environment used to build Lambda +dependencies and install packages compatible with the +Python 3.10 Lambda runtime. + +Using these containers allows engineers to: + +• verify Python imports +• test dependency compatibility +• inspect the Lambda runtime filesystem +• reproduce build issues locally + +--- + +# When This Problem Commonly Appears + +This issue most frequently occurs after: + +• adding new modules to the `bloodhound` package +• modifying handler imports +• switching branches in Git +• manual modifications to `.build/` +• Terraform caching the previous package build + +--- + +# Best Practice + +After modifying the application source tree, run: + +``` +terraform apply -replace=terraform_data.build_lambda_pkg +``` + +This ensures the Lambda package is rebuilt with the latest source. 
+ +--- + +## Issue: Terraform Archive Creation Error + +Example error: + +Error: Archive creation error +error creating archive: archive has not been created as it would be empty + +This occurs during: + +terraform plan + +and originates from the Terraform archive_file data source used to build the Lambda deployment package. + +Cause + +Terraform attempts to archive the Lambda package directory: + +.build/lambda_pkg + +If the directory exists but contains no files, the archive_file provider refuses to create a zip archive. + +This commonly occurs when: + +• .build was manually deleted +• the repository was freshly cloned +• the build step has not executed yet + +Example directory state: + +.build/ + lambda_pkg/ + +Since the directory is empty, Terraform cannot create the archive. + +Diagnosis + +Check the contents of the package directory. + +ls -a .build/lambda_pkg + +If the output is only: + +. +.. + +then the directory is empty. + +Fix + +Create a placeholder file so Terraform can archive the directory. + +touch .build/lambda_pkg/.placeholder + +Verify: + +ls -a .build/lambda_pkg + +Expected output: + +. +.. +.placeholder + +Then rerun Terraform: + +cd infra +terraform plan +Issue: AWS Rejects Lambda Deployment Zip + +Example error: + +InvalidParameterValueException: +Uploaded file must be a non-empty zip +Cause + +Terraform created a zip archive from .build/lambda_pkg, but the directory contained only a placeholder file. + +This happens when the Terraform build step was not triggered. + +Diagnosis + +Check the package contents: + +ls .build/lambda_pkg + +If you see only: + +.placeholder + +then the Lambda package was not built. + +Fix + +Force Terraform to rebuild the Lambda package. 
+ +terraform apply -replace=terraform_data.build_lambda_pkg + +This reruns the packaging step: + +pip install dependencies +rsync application source +build lambda package +create zip +deploy Lambda + +After completion, verify: + +ls .build/lambda_pkg + +Expected contents: + +bloodhound/ +handlers/ +requests/ +boto3/ +... + +# Issue: pip Dependency Resolution Failure During Lambda Packaging + +Example error during `terraform apply`: + +```text +ERROR: Cannot install -r requirements.txt because these package versions have conflicting dependencies. +The conflict is caused by: +botocore 1.34.x depends on urllib3<1.27 +The user requested urllib3==2.0.7 + +ERROR: ResolutionImpossible +``` + + +This error occurs during the Lambda packaging step when Terraform runs: + + +pip install -r requirements.txt -t .build/lambda_pkg + + +The pip dependency resolver is unable to construct a valid dependency tree. + +--- + +# Cause + +A **transitive dependency** was manually pinned in `requirements.txt`. + +Example problematic configuration: + + +boto3==1.34.x +botocore==1.34.x +urllib3==2.0.7 + + +The AWS SDK dependency chain looks like this: + + +boto3 +└── botocore +└── urllib3 (<1.27) + + +Because `urllib3` was forced to version `2.x`, pip could not satisfy +botocore's requirement. + +This caused the dependency resolver to fail before the Lambda package +could be built. + +--- + +# Diagnosis + +If Terraform fails during the packaging step, inspect the output for pip +dependency resolution errors. + +Typical indicators include: + + +ResolutionImpossible + + +or + + +conflicting dependencies + + +To reproduce locally, run: + + +pip install -r requirements.txt + + +If pip fails locally, Terraform will also fail during Lambda packaging. + +--- + +# Fix + +Remove the manually pinned transitive dependency. + +Example corrected configuration: + + +slack-sdk==3.26.1 +python-dotenv==1.0.0 + + +Do **not manually pin** dependencies managed by other libraries. 
+ +Allow pip to resolve the dependency tree automatically. + +--- + +# Best Practice + +Only specify **top-level dependencies** required by the Lambda function. + +Avoid pinning dependencies managed internally by other libraries. + +Examples that should generally **not be pinned**: + + +botocore +urllib3 +s3transfer +jmespath + + +These dependencies are automatically managed by the AWS SDK. + +Additionally, AWS Lambda already provides the following libraries in the runtime: + + +boto3 +botocore + + +Because of this, they usually **do not need to be included in `requirements.txt`.** + +Refer to: + + +docs/lambda_packaging.md + + +for the dependency strategy used by the Bloodhound Lambda deployment. + +--- + +# When This Problem Commonly Appears + +This issue most often occurs when: + +• adding new dependencies to `requirements.txt` +• copying dependency lists from other projects +• manually upgrading libraries without reviewing transitive dependencies +• pinning versions to resolve security scanner warnings + +Always verify dependency compatibility before committing changes to +`requirements.txt`. + +--- + +## Issue: .build Directory Behaving Inconsistently + +Rarely, the .build directory may appear to ignore newly created files. + +Example symptom: + +mkdir .build/lambda_pkg +ls -R .build + +but the directory does not appear. + +Cause + +This can happen if the directory was deleted while the shell still had an open reference to it: + +rm -rf .build +mkdir .build + +The shell may still point to the old directory inode. + +Fix + +Open a fresh terminal session and recreate the directory. 
+ +rm -rf .build +mkdir -p .build/lambda_pkg +touch .build/lambda_pkg/.placeholder + +Verify: + +ls -R .build + +Expected: + +.build/ +lambda_pkg/ + +.build/lambda_pkg/ +.placeholder +Important Note + +Terraform references this directory from the infra folder: + +../.build/lambda_pkg + +The correct path must therefore be: + +Bloodhound/.build/lambda_pkg + +### Best Practice + +If the repository is freshly cloned and Terraform fails during plan +due to a missing build directory, initialize the structure with: + +mkdir -p .build/lambda_pkg +touch .build/lambda_pkg/.placeholder + +This ensures the archive_file provider can evaluate during terraform plan. + +During terraform apply, the packaging script will populate the +directory with the correct contents. + +The placeholder file is only required to allow Terraform to evaluate the +archive step during planning. It is replaced during the build process +when the packaging script constructs the final Lambda package. +Additional Improvement (Recommended) + +To ensure Terraform automatically rebuilds the Lambda package when source code changes, add a source hash trigger to the build resource. + +Example: + +triggers_replace = { + source_hash = sha256(join("", [for f in fileset("${path.module}/..", "**/*.py") : filesha256("${path.module}/../${f}")])) +} + +Because the hash covers file contents (not just the list of file names), the packaging step runs whenever Python source files change. + +Key Takeaway + +Most Lambda packaging failures fall into one of three categories: + +Missing modules in the deployment package + +Terraform skipping the build step + +Terraform attempting to archive an empty directory + +Verifying .build/lambda_pkg contents will quickly identify which condition occurred.
+ +--- + +# Validation Workflow Reminder + +After rebuilding the package, rerun the validation workflow: + +``` +./tools/run_validation_workflow.sh +``` + +This will verify: + +• Lambda deployment +• infrastructure scan +• controlled teardown logic + +--- + +# Related Documentation + +Infrastructure validation process: + +``` +docs/run_validation.md +``` + +Teardown validation workflow: + +``` +docs/validate_teardown.md +``` + +Slack command troubleshooting: + +``` +docs/troubleshooting_slack_commands.md +``` + diff --git a/docs/validate_teardown.md b/docs/validate_teardown.md new file mode 100644 index 0000000..d620113 --- /dev/null +++ b/docs/validate_teardown.md @@ -0,0 +1,459 @@ +# Controlled Teardown Validation + +## Table of Contents + +- [Safety Warning](#safety-warning) +- [Validation Overview](#validation-overview) +- [Infrastructure Smoke Test](#infrastructure-smoke-test) +- [Automation Script](#automation-script) +- [Step 1 — Create Disposable Test Resource](#step-1--create-disposable-test-resource) +- [Step 2 — Capture the Instance ID](#step-2--capture-the-instance-id) +- [Step 3 — Verify Bloodhound Detects the Resource](#step-3--verify-bloodhound-detects-the-resource) +- [Step 4 — Invoke Validation Mode](#step-4--invoke-validation-mode) +- [Step 5 — Execute Teardown](#step-5--execute-teardown-automated) +- [Step 6 — Automatic Deletion Verification](#step-6--automatic-deletion-verification) +- [Step 7 — Restore Safe Mode](#step-7--restore-safe-mode) +- [Step 8 — Cleanup Terraform State](#step-8--cleanup-terraform-state) +- [Expected Lambda Log Flow](#expected-lambda-log-flow) +- [Success Criteria](#success-criteria) +- [When to Run This Test](#when-to-run-this-test) +- [Related Documentation](#related-documentation) + +This document describes how to safely validate the **actual resource deletion path** +in Bloodhound v2. 
+ +Unlike the Slack validation guide, this test confirms that Bloodhound can: + +- detect a resource +- include it in the teardown plan +- execute a deletion +- report results back to Slack + +This validation intentionally deletes a **temporary disposable AWS resource**. + +This procedure should only be performed after the following validations succeed: + +- Infrastructure smoke test (`tools/smoke_test_lambda.sh`) +- Slack command routing (`/v2_seek`) +- Lambda execution confirmed +- CloudWatch logs visible +- `/v2_seek_destroy` confirmation protections working +- Terraform environment variables verified + +See: + +`docs/slack_and_lambda_validation.md` + +--- + +# Safety Warning + +This test temporarily enables **apply-mode**, allowing Bloodhound to execute +real deletion actions. + +The test uses a **small disposable EC2 instance** created specifically for +validation. + +Never run this procedure against production infrastructure without +understanding Bloodhound’s teardown safety controls. + +--- + +# Validation Overview +... +Terraform → create validation resource +Validation script → capture instance ID +Validation script → invoke Lambda validation event +Lambda validation handler → enable controlled destructive mode +Bloodhound pipeline → execute deletion +CLI → verify resource deletion + +--- + +# Infrastructure Smoke Test + +Before running the teardown validation, confirm that the Lambda +deployment is healthy. + +Run: + +tools/smoke_test_lambda.sh + +This script verifies: + +- Lambda function exists +- environment variables are configured +- CloudWatch log group exists +- Lambda Function URL is configured + +If the smoke test fails, fix the infrastructure deployment before +continuing with teardown validation. + +# Automation Script + +The teardown validation workflow can be automated using: + +tools/validate_teardown.sh + +This script performs the following steps automatically: + +1. Create the validation resource using Terraform +2. 
Capture the resource ID +3. Invoke the Bloodhound Lambda function in validation mode +4. Wait for teardown execution +5. Verify that the resource was deleted + +Example validation payload sent to Lambda: + + +```json +{ + "source": "validation", + "mode": "seek_destroy_validation", + "target_ids": ["INSTANCE_ID"] +} +``` + +The Lambda validation handler enables destructive execution internally +and restricts deletion to the specified validation targets. + +No Slack commands are required to execute the validation teardown. + +--- + +# Step 1 — Create Disposable Test Resource + +Create the Terraform test resource file: + +`infra/test_resource.tf` + +Example: + +```hcl +resource "aws_instance" "bloodhound_teardown_test" { + + ami = "ami-0c02fb55956c7d316" # Amazon Linux 2 (AMI IDs are region-specific; confirm this ID for us-west-2) + instance_type = "t3.micro" + + tags = { + Name = "bloodhound-teardown-test" + Environment = "test" + + # IMPORTANT: + # Do NOT include the whitelist tag + # bloodhound:keep=true + } + +} + +output "bloodhound_test_instance_id" { + value = aws_instance.bloodhound_teardown_test.id +} +``` + +Apply the resource: + +`terraform apply` + +Terraform will create a small EC2 instance used only for validation. + +--- + +# Step 2 — Capture the Instance ID + +Retrieve the instance ID generated by Terraform. + +Run: + +`terraform output bloodhound_test_instance_id` + +Example output: + +`i-0abc123def456` + +This ID will be used later to verify deletion. + +--- + +# Step 3 — Verify Bloodhound Detects the Resource + +Run the scan command in Slack: + + +`/v2_seek` + +Expected result: + +Slack scan summary should show at least one EC2 instance.
+ +Example output snippet: + +`ec2.instance=1` + +If the instance does not appear: + +Check: + +* the instance is running +* the instance is in one of the configured regions +* the instance is not tagged with the whitelist tag + +`bloodhound:keep=true` + +--- + +Optional verification: + +Preview the teardown plan before executing destructive mode: + +`/v2_seek_destroy_plan` + +The disposable test instance should appear in the teardown preview. + +# Step 4 — Invoke Validation Mode + +The validation workflow triggers destructive execution using a +dedicated validation event rather than modifying Lambda +environment variables. + +The validation script invokes Lambda with a payload similar to: + +```json +{ + "source": "validation", + "mode": "seek_destroy_validation", + "target_ids": ["INSTANCE_ID"] +} +``` + +The validation handler performs the following safety steps: + +- verifies the invocation originates from the validation harness +- verifies the correct validation mode +- requires explicit target_ids +- enables controlled destructive execution +- restricts deletion to validation targets only + +This approach allows teardown validation without modifying +Lambda environment configuration. + +## Terraform Apply-Mode Safety Guard + +Bloodhound can delete AWS resources when `APPLY_CHANGES=true`. + +Because deletion is dangerous, Terraform includes a **deployment safety guard** +that prevents destructive mode from being enabled accidentally. + +Why this exists: + +It is easy for someone to accidentally enable destructive mode by: + +- committing `APPLY_CHANGES=true` to configuration +- copying values from a test environment +- forgetting to switch back to safe mode after testing +- running Terraform in the wrong environment + +To prevent this, Terraform will **refuse to deploy** if it detects that +destructive mode is enabled. 
+ +Example: + +If the Lambda environment variable contains: + +`APPLY_CHANGES=true` + +Terraform will stop the deployment and show an error like: + +Deployment blocked: APPLY_CHANGES=true requires -var allow_apply_mode=true + +This forces the engineer to **explicitly acknowledge the risk** before enabling +real deletion. + +To intentionally enable apply-mode (for controlled testing only): + +`terraform apply -var allow_apply_mode=true` + +This extra step ensures that destructive mode can never be enabled silently +or by accident. + +In normal operation Bloodhound should run with: + +`APPLY_CHANGES=false` + +which means it will only produce teardown plans and will not delete resources. + +Manual alternative (only if you are not using the validation event above): update the Lambda environment variables: + +```bash +APPLY_CHANGES=true +TEARDOWN_SIMULATE=false +``` + +Redeploy configuration (the safety guard requires the explicit flag): + +`terraform apply -var allow_apply_mode=true` + +This enables real teardown execution. + +--- + +## Step 5 — Execute Teardown (Automated) + +The validation workflow invokes the Bloodhound Lambda function +directly using a validation event. + +No Slack commands are required. + +The validation harness sends a payload similar to: + +{ +"source": "validation", +"mode": "seek_destroy_validation", +"target_ids": ["INSTANCE_ID"] +} + +The Lambda validation handler enables controlled destructive mode +internally and restricts deletion to the specified validation target. + +This allows the teardown pipeline to run automatically without +modifying Lambda environment variables or running Slack commands. + + +Expected Slack output: + +``` +Bloodhound v2 — Teardown Results +``` + +The Slack message should include the EC2 instance created in Step 1. + +--- + +# Step 6 — Automatic Deletion Verification + +The validation script automatically verifies that the instance +was successfully deleted. + +The workflow repeatedly queries the EC2 API until the instance +reaches the `terminated` state. + +Example output: + +Instance state: shutting-down (waiting...) +Instance state: shutting-down (waiting...)
+ +SUCCESS: Instance terminated successfully. +RESULT: PASS + +This confirms that Bloodhound successfully deleted the instance. + +--- + +# Step 7 — Restore Safe Mode + +Immediately restore Bloodhound's safe configuration. + +Set: + +``` +APPLY_CHANGES=false +TEARDOWN_SIMULATE=true +``` + +Redeploy configuration: + +``` +terraform apply +``` + +Bloodhound will return to **dry-run mode**. + +--- + +# Step 8 — Cleanup Terraform State + +If Terraform still tracks the resource, run: + +``` +terraform destroy -target aws_instance.bloodhound_teardown_test +``` + +If Bloodhound already deleted the instance, Terraform will simply refresh state. + +--- + +# Expected Lambda Log Flow + +When teardown executes, CloudWatch logs should show something similar to: + +START RequestId + +[BLOODHOUND][VALIDATION][request_id=...] + +Validation event received +Building teardown plan +Executing deletion +Deleting EC2 instance +Posting Slack results + +END RequestId +REPORT Duration + +Each validation run will include a structured log marker: + +[BLOODHOUND][VALIDATION][request_id=...] + +The `request_id` corresponds to the AWS Lambda invocation ID +(`context.aws_request_id`) and allows engineers to trace a +single execution across all CloudWatch log lines. 
+ +Logs can be streamed live using: + +```bash +aws logs tail /aws/lambda/BloodhoundLambdaV2 \ +--region us-west-2 \ +--follow +``` + +--- + +# Success Criteria + +This validation is successful when: + +* Validation script successfully invokes Lambda +* Bloodhound detects the test EC2 instance +* Lambda executes teardown without safety errors +* Slack reports the deletion +* AWS CLI confirms the instance no longer exists +* Lambda logs show the teardown executor path +* Bloodhound is returned to safe mode + +--- + +# When to Run This Test + +Run this teardown validation: + +* after major teardown logic changes +* after adding new resource types +* after IAM permission changes +* before enabling apply-mode in production + +--- + +# Related Documentation + +Slack command validation: + +``` +docs/validate_slack_lambda.md +``` + +Architecture and configuration reference: + +``` +docs/bloodhound_v2_plan.md +``` + +``` \ No newline at end of file diff --git a/env.example b/env.example new file mode 100644 index 0000000..b987dc7 --- /dev/null +++ b/env.example @@ -0,0 +1,112 @@ +### Bloodhound v2 environment variables (example) +# +# Copy to .env and fill in values: +# cp env.example .env +# +# v2 loads .env automatically for local runs. +# In AWS Lambda, set these in the Lambda "Environment variables" section instead. + +### AWS Safety Guard + +# Expected AWS account ID for this environment. +# Validation scripts will refuse to run if the current AWS +# credentials are pointing to a different account. +# +# Example: +# EXPECTED_AWS_ACCOUNT_ID=123456789012 +# +#Update Your .env + +#Now add the real value. + +#First determine your account ID: + +#aws sts get-caller-identity + +#Example output: + +#"Account": "123456789012" + +EXPECTED_AWS_ACCOUNT_ID=REPLACE_ME + +### Slack + +# Enable/disable Slack posting (useful for local testing) +SLACK_ENABLED=true + +# Slack bot token (xoxb-...) 
+SLACK_BOT_TOKEN=REPLACE_ME + +# Scan summaries channel +SLACK_SCAN_CHANNEL_ID=C0A4YLV0HNY + +# Alerts/teardown channel +SLACK_ALERT_CHANNEL_ID=C0A4YLV0HNY + +### Slack slash commands (Function URL) + +# Required to validate /seek and /seek_destroy requests from Slack +SLACK_SIGNING_SECRET=REPLACE_ME + +# Optional allowlists (comma-separated). If unset, allow all. +SLACK_ALLOWED_USER_IDS= +SLACK_ALLOWED_CHANNEL_IDS= + +# Safety: /seek_destroy requires exact text match to this token +SLACK_DESTROY_CONFIRM_TOKEN=CONFIRM + +### Regions + +# explicit = scan REGIONS list +# discover = auto-discover AWS regions +REGION_MODE=explicit +REGIONS=us-east-1,us-east-2,us-west-1,us-west-2 + +### Whitelist + +# If a resource has this tag, it will be considered "kept" (whitelisted) and excluded from teardown. +KEEP_TAG_KEY=bloodhound:keep +KEEP_TAG_VALUE=true + +# Optional: always keep these IDs/ARNs (comma-separated) +KEEP_RESOURCE_IDS= + +### Teardown + +# If false, Bloodhound only posts a teardown plan (dry-run). +# If true, Bloodhound may delete resources. +# +# IMPORTANT: +# Terraform includes a safety guard that prevents deployments +# with APPLY_CHANGES=true unless the engineer explicitly confirms it. +# +# To intentionally enable destructive mode during deployment: +# +# terraform apply -var allow_apply_mode=true +# +APPLY_CHANGES=false + +# If true, never performs destructive calls. EC2-style deletes are validated using DryRun where supported. +TEARDOWN_SIMULATE=true + +# Optional safety rail: only delete resources in this list (comma-separated IDs/ARNs). +TEARDOWN_TARGET_IDS= + +# If true, apply-mode can delete all non-whitelisted candidates without needing TEARDOWN_TARGET_IDS. +TEARDOWN_ALLOW_ALL=false + +# Maximum number of resources Bloodhound is allowed to delete in a single run. +# If the teardown plan exceeds this number, execution will stop. +TEARDOWN_MAX_DELETE_COUNT=5 + +# For RDS deletions: if false, skip final snapshot (fast/cheap). 
+RDS_FINAL_SNAPSHOT=false + +### Budget (dynamic cohort) + +COHORT_START_YYYY_MM=2025-12 +COHORT_TOTAL_BUDGET_USD=3000 +COHORT_LENGTH_MONTHS=7 +BUDGET_OVER_DAYS=2 + + diff --git a/handlers/__init__.py b/handlers/__init__.py new file mode 100644 index 0000000..3c64b66 --- /dev/null +++ b/handlers/__init__.py @@ -0,0 +1,8 @@ +""" +handlers/ + +Deployment entrypoints. +These modules are referenced by AWS Lambda handler strings. +""" + + diff --git a/handlers/lambda_function.py b/handlers/lambda_function.py new file mode 100644 index 0000000..9e06a2a --- /dev/null +++ b/handlers/lambda_function.py @@ -0,0 +1,123 @@ +"""" +handlers/lambda_function.py + +AWS Lambda entrypoint for Bloodhound v2. + +This handler routes incoming events to explicit execution paths: + +- Slack HTTP events (Function URL) +- Scheduled events (EventBridge or manual triggers) +- Default/manual invocations (scan, status, validation) + +Each event type is routed to a dedicated handler or execution path +to ensure deterministic behavior and prevent recursive execution. + +This file also standardizes CloudWatch logging, including: +- event type tagging (e.g., SCAN, STATUS, SCHEDULED) +- request ID tracing via context.aws_request_id + +Terraform config points at: +- handlers.lambda_function.lambda_handler +""" + +from __future__ import annotations + +from bloodhound.app import run +from bloodhound.slack_commands import handle_slack_command_http, is_slack_http_event + +def log_event(event_type, event, request_id=None): + """ + Standardized CloudWatch log helper. + + Provides consistent log structure across all event types. + Includes optional request_id so each Lambda invocation + can be traced end-to-end in CloudWatch logs. 
+ + Parameters + ---------- + event_type : str + Logical type of the event (e.g., "scan", "status", "scheduled", "slack") + + event : any + Raw event payload received by the Lambda + + request_id : str, optional + AWS Lambda request ID (context.aws_request_id) used for tracing + """ + print("\n" + "=" * 60) + + # Include request_id if available for traceability + if request_id: + print("[BLOODHOUND][" + str(event_type).upper() + "][request_id=" + str(request_id) + "]") + else: + print("[BLOODHOUND][" + str(event_type).upper() + "]") + + print("=" * 60) + print("Event received:", event) + print("=" * 60 + "\n") + + +def lambda_handler(event, context): + """ + AWS Lambda entrypoint for Bloodhound. + + Routes incoming events to the appropriate execution path: + + - Slack HTTP events → handled immediately + - Scheduled events → routed to scheduled handler + - Default events → processed through main pipeline (run) + + This routing ensures deterministic execution and prevents recursion. + + Parameters + ---------- + event : dict + Incoming Lambda event payload + + Examples: + {"source": "scheduled"} + {"detail-type": "Scheduled Event"} + {"source": "scan"} + + context : object + AWS Lambda context (used for request_id tracing) + + Example: + context.aws_request_id -> "abc123-xyz" + + Returns + ------- + dict + Lambda response payload + + Example: + {"ok": True, "scan": {...}, "budget": {...}} + """ + + + # 1) Slack slash commands (HTTP events) + # Responds immediately; may trigger async follow-up work internally. + if isinstance(event, dict) and is_slack_http_event(event): + log_event("slack", event, context.aws_request_id) + return handle_slack_command_http(event) + + # 2) Scheduled events (EventBridge or manual trigger). 
+ if isinstance(event, dict) and (event.get("detail-type") == "Scheduled Event" or event.get("source") == "scheduled"): + from bloodhound.handlers.scheduled_handler import handle_scheduled_event + log_event("scheduled", event, context.aws_request_id) + return handle_scheduled_event(event, context) + + #3) Default invocation (manual CLI / GHA operations such as scan/status). + # Determine event type for logging. + # Most events include a "source" field (scan, status, etc.). + # If missing or event is not a dict, label as "unknown". + if isinstance(event, dict): + event_type = event.get("source", "unknown") + else: + event_type = "unknown" + + log_event(event_type, event, context.aws_request_id) + + return run(event=event, context=context) + + diff --git a/infra/.terraform.lock.hcl b/infra/.terraform.lock.hcl new file mode 100644 index 0000000..6c39e9c --- /dev/null +++ b/infra/.terraform.lock.hcl @@ -0,0 +1,45 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/archive" { + version = "2.7.1" + constraints = "~> 2.0" + hashes = [ + "h1:A7EnRBVm4h9ryO9LwxYnKr4fy7ExPMwD5a1DsY7m1Y0=", + "zh:19881bb356a4a656a865f48aee70c0b8a03c35951b7799b6113883f67f196e8e", + "zh:2fcfbf6318dd514863268b09bbe19bfc958339c636bcbcc3664b45f2b8bf5cc6", + "zh:3323ab9a504ce0a115c28e64d0739369fe85151291a2ce480d51ccbb0c381ac5", + "zh:362674746fb3da3ab9bd4e70c75a3cdd9801a6cf258991102e2c46669cf68e19", + "zh:7140a46d748fdd12212161445c46bbbf30a3f4586c6ac97dd497f0c2565fe949", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:875e6ce78b10f73b1efc849bfcc7af3a28c83a52f878f503bb22776f71d79521", + "zh:b872c6ed24e38428d817ebfb214da69ea7eefc2c38e5a774db2ccd58e54d3a22", + "zh:cd6a44f731c1633ae5d37662af86e7b01ae4c96eb8b04144255824c3f350392d", + "zh:e0600f5e8da12710b0c52d6df0ba147a5486427c1a2cc78f31eea37a47ee1b07", + "zh:f21b2e2563bbb1e44e73557bcd6cdbc1ceb369d471049c40eb56cb84b6317a60", + "zh:f752829eba1cc04a479cf7ae7271526b402e206d5bcf1fcce9f535de5ff9e4e6", + ] +} + +provider "registry.terraform.io/hashicorp/aws" { + version = "6.27.0" + constraints = ">= 5.0.0" + hashes = [ + "h1:emgTfB1LXSFYh9uAwgsRMoMIN5Wz7jNNKq3rqC0EHWk=", + "zh:177a24b806c72e8484b5cabc93b2b38e3d770ae6f745a998b54d6619fd0e8129", + "zh:4ac4a85c14fb868a3306b542e6a56c10bd6c6d5a67bc0c9b8f6a9060cf5f3be7", + "zh:552652185bc85c8ba1da1d65dea47c454728a5c6839c458b6dcd3ce71c19ccfc", + "zh:60284b8172d09aee91eae0856f09855eaf040ce3a58d6933602ae17c53f8ed04", + "zh:6be38d156756ca61fb8e7c752cc5d769cd709686700ac4b230f40a6e95b5dbc9", + "zh:7a409138fae4ef42e3a637e37cb9efedf96459e28a3c764fc4e855e8db9a7485", + "zh:8070cf5224ed1ed3a3e9a59f7c30ff88bf071c7567165275d477c1738a56c064", + "zh:894439ef340a9a79f69cd759e27ad11c7826adeca27be1b1ca82b3c9702fa300", + "zh:89d035eebf08a97c89374ff06040955ddc09f275ecca609d0c9d58d149bef5cf", + "zh:985b1145d724fc1f38369099e4a5087141885740fd6c0b1dbc492171e73c2e49", + 
"zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:a80b47ae8d1475201c86bd94a5dcb9dd4da5e8b73102a90820b68b66b76d50fd", + "zh:d3395be1556210f82199b9166a6b2e677cee9c4b67e96e63f6c3a98325ad7ab0", + "zh:db0b869d09657f6f1e4110b56093c5fcdf9dbdd97c020db1e577b239c0adcbce", + "zh:ffc72e680370ae7c21f9bd3082c6317730df805c6797427839a6b6b7e9a26a01", + ] +} diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..3f202c5 --- /dev/null +++ b/infra/README.md @@ -0,0 +1,657 @@ +# Bloodhound v2 Infrastructure (Terraform) + +## Table of Contents + +- [First-Time Terraform Setup](#first-time-terraform-setup) +- [What Terraform Creates](#what-terraform-creates) +- [Deploy Flow](#deploy-flow) +- [Python Version Requirement for Lambda Packaging](#python-version-requirement-for-lambda-packaging) +- [AWS Region Alignment](#aws-region-alignment) +- [Environment Variables and Secrets](#environment-variables-and-secrets) +- [Lambda Versioning and Alias](#lambda-versioning-and-alias) +- [Performing a Rollback](#performing-a-rollback) +- [Permanent Rollback Using Terraform](#permanent-rollback-using-terraform) +- [Destructive Mode Deployment Guard](#destructive-mode-deployment-guard) + +This directory provisions the AWS infrastructure for running Bloodhound v2 with Slack slash commands. + +We use a **Lambda Function URL** (single endpoint) for `/v2_seek` and `/v2_seek_destroy`. + +## First-Time Terraform Setup (Existing AWS Resources) + +If the IAM role or IAM policy already exist in the AWS account, +Terraform must import them into state before the first `terraform apply`. 
+ +This situation commonly occurs when: + +- Bloodhound resources were created manually +- The project was previously deployed outside Terraform +- The AWS account already contains earlier Bloodhound infrastructure + +To prevent Terraform errors such as: + +```text +EntityAlreadyExists: Role with name bloodhound-v2-role already exists +``` + +this repository includes a helper script that automatically imports +existing resources into Terraform state if they are detected. + +### Run the bootstrap helper + +From the `infra/` directory, initialize Terraform first (`terraform import` +needs an initialized working directory), then run the helper: + +```bash +terraform init +./bootstrap_imports.sh +``` + +The script will: + +- Detect if the IAM role `bloodhound-v2-role` exists +- Detect if the IAM policy `bloodhound-v2-policy` exists +- Import them into Terraform state if necessary + +After running the script, proceed normally: + +```bash +terraform apply +``` + +### When this step is required + +You typically only need to run the bootstrap script: + +- the first time Terraform is introduced into an AWS account +- when existing infrastructure already exists + +Once resources are managed by Terraform, this step is no longer necessary. + +### Why this script exists + +Terraform cannot automatically adopt resources that already exist in AWS. + +The bootstrap script ensures Terraform can safely begin managing +existing infrastructure without requiring engineers to manually run +`terraform import` commands. + +This helps avoid common onboarding errors and keeps infrastructure +management consistent across environments.
+ +## What Terraform creates + +- Lambda function: `BloodhoundLambdaV2` +- Lambda Function URL (public, `authorization_type = NONE`) +- IAM role + policies for: + - CloudWatch logs + - scanning resources (EC2/RDS/ELBv2) + - cost explorer (CE) + - teardown actions (terminate/delete) + - async self-invocation (so slash commands can return immediately) + + ### Execution Model Notes + +Bloodhound uses different execution paths depending on invocation type: + +All events first pass through the Lambda event router, which determines +the correct execution path. + +- Slack commands may use async self-invocation so responses return immediately +- Scheduled scans run synchronously through a dedicated execution path +- Manual operations execute through the main pipeline + +Important: + +Scheduled events do NOT use async self-invocation and must not pass through +the generic `run()` function. + +They are routed through a dedicated scheduled execution handler. + +They follow: + +scheduled_handler → run_scheduled_scan() → execute_pipeline() + +This separation prevents recursive execution loops. + +### Lambda Execution Logging + +Bloodhound Lambda executions emit structured log markers to make +CloudWatch debugging easier. + +Format: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Examples: + +[BLOODHOUND][SLACK][request_id=...] +[BLOODHOUND][SCHEDULED][request_id=...] +[BLOODHOUND][VALIDATION][request_id=...] + +The `request_id` corresponds to the AWS Lambda invocation ID +(`context.aws_request_id`) and allows engineers to trace +individual executions across CloudWatch logs. + +### Deploy flow + +1. Apply Terraform (from `Bloodhound/infra/`): + +```bash +terraform init +terraform apply +``` + +Terraform automatically builds the Lambda deployment package during `terraform apply`. 
+ +Packaging flow: + +```text +terraform apply + ↓ +terraform_data.build_lambda_pkg + ↓ +scripts/build_lambda.sh + ↓ +build mode + ├─ Docker build (default) + │ ↓ + │ Docker (AWS Lambda runtime container) + │ ↓ + │ pip install dependencies + │ + └─ Local build (fallback) + ↓ + python3 + pip + ↓ + pip install dependencies + ↓ +copy application source + ↓ +.build/ + lambda_pkg/ final Lambda package + ↓ +archive_file + ↓ +.build/bloodhound_lambda_v2.zip +``` + +Note: +The current build system installs dependencies directly into +`.build/lambda_pkg` instead of using intermediate dependency or +source directories. + +This keeps the packaging process simple and ensures Terraform +always archives a complete, ready-to-deploy Lambda package. + +The build process is triggered only when Terraform detects changes to: + +- application source code (bloodhound/, handlers/) +- requirements.txt + +This ensures fast incremental builds while avoiding unnecessary dependency installation. + +### ⚠️ Important + +Engineers modifying the Lambda build process should review: + +docs/lambda_packaging.md + +before making changes to avoid breaking Terraform packaging or deployment. + +2. Configure Slack slash commands + +In your Slack App settings, set the Request URL for `/v2_seek` and `/v2_seek_destroy` to the Terraform output: + +- `bloodhound_lambda_url` (retrieve it with `terraform output bloodhound_lambda_url`) + +### Lambda Build System Overview + +Bloodhound uses a single-directory build layout: dependencies and source code are assembled in one package directory. + +```text +.build/ + lambda_pkg/ final deployment package +``` + +The Lambda build script installs Python dependencies and copies +application source code directly into the Lambda package directory.
+ +Contents typically include: + +- Bloodhound application modules (`bloodhound/`) +- Lambda handler entrypoints (`handlers/`) +- Python dependencies installed from `requirements.txt` + +Terraform then archives this directory into the deployment artifact: + +`.build/bloodhound_lambda_v2.zip` + +### Forcing a Lambda Rebuild + +Terraform automatically rebuilds the Lambda package when +application code or `requirements.txt` changes. + +If the packaging logic or build script changes, Terraform +may not detect the modification automatically. + +Engineers can force a rebuild using: + +```bash +terraform apply -replace=terraform_data.build_lambda_pkg +``` +This forces Terraform to rerun the Lambda build pipeline and +recreate the deployment artifact. + +## Python Version Requirement for Lambda Packaging + +The Lambda runtime for Bloodhound v2 is currently: + +`python3.10` + +During deployment, Terraform builds the Lambda package locally using pip before uploading it to AWS. + + +Dependency installation is handled by the build script: + +`scripts/build_lambda.sh` + +By default, the Lambda package is built using Docker to ensure +the dependency environment matches the AWS Lambda runtime. + +Docker builds run inside the official AWS Lambda runtime container: + +public.ecr.aws/lambda/python:3.10 + +If Docker is unavailable, engineers can temporarily use a local +Python environment instead: + +`terraform apply -var="use_docker_build=false"` + +Docker builds use the AWS Lambda runtime container: + +public.ecr.aws/lambda/python:3.10 + +This ensures dependencies are built in an environment that matches AWS Lambda. + +Legacy step was executed by Terraform using: + +`python3 -m pip install -r requirements.txt -t .build/lambda_pkg` + +Because Python dependency resolution can vary between versions, +the Python version used to build the Lambda package should match +the Lambda runtime version. 
+ +This prevents dependency conflicts and ensures the deployed +package behaves the same in AWS as it does during packaging. + +If your system default python3 is a newer version (for example Python 3.12 or Python 3.13), the packaging step may fail with dependency resolution errors during: + +`terraform apply` + +Example error: + +`ERROR: Cannot install ... because these package versions have conflicting dependencies` + +To avoid this issue, ensure that the Lambda package is built using Python 3.10. + +Example: + +`python3.10 -m pip install -r requirements.txt -t .build/lambda_pkg` + +Your local development environment can still use newer Python versions (3.11+), but the Lambda packaging step should use the same version as the configured Lambda runtime. + +Note: +The Lambda runtime version is defined in lambda.tf. If the runtime is upgraded (for example to python3.11 or python3.12), the packaging Python version used in build.tf should be updated to match. + +See `docs/lambda_packaging.md` for full details on: + +- Lambda packaging architecture +- dependency caching strategy +- Docker-based builds +- Terraform build triggers + +### AWS Region Alignment + +All components of the Bloodhound deployment must use the **same AWS region**. + +Terraform deploys the Lambda function to the region defined in the Terraform provider configuration. + +Example: + +provider "aws" { + region = "us-east-1" +} + +Any external systems invoking the Lambda must use the **same region**, including: + +- GitHub Actions workflows +- AWS CLI commands +- Manual testing scripts + +For example, the GitHub workflow must use: + +aws-region: us-east-1 + +and: + +--region us-east-1 + +If the regions do not match, the workflow will fail because AWS will not find the Lambda function. + +### Notes + +- Terraform invokes a build script (`scripts/build_lambda.sh`) to construct the Lambda package. +- The build script requires `python3`, `pip`, `zip`, and `rsync` when using local builds. 
+- If Docker mode is enabled, only Docker is required for dependency installation. +- Putting secrets in `var.lambda_env` stores them in Terraform state. Prefer setting secrets in the Lambda console (or a secrets manager). + +## Environment Variables and Secrets + +Bloodhound uses different configuration sources for **local development** and **AWS runtime**. + +### Local Development + +When running locally, configuration is loaded from the `.env` file: + +.env + +Example usage: + +```bash +python -m tools.run_local +``` + +The .env file is only used for local development and should never be committed to Git. + +AWS Runtime Configuration + +When deployed to AWS Lambda, Bloodhound does not use `.env`. + +Instead, configuration is provided through Lambda environment variables. + +After Terraform creates the Lambda function, configure secrets in the AWS console: + +AWS Console +→ Lambda +→ BloodhoundLambdaV2 +→ Configuration +→ Environment Variables + +Add the following values: + +SLACK_BOT_TOKEN +SLACK_SIGNING_SECRET +SLACK_SCAN_CHANNEL_ID +SLACK_ALERT_CHANNEL_ID + +The application reads these values at runtime using: + +`os.getenv("VARIABLE_NAME")` + +This works both locally (.env) and in AWS (Lambda environment variables). + +Terraform-Managed Variables + +Terraform may manage non-secret configuration variables, such as: + +REGION_MODE +REGIONS +APPLY_CHANGES +TEARDOWN_SIMULATE +TEARDOWN_ALLOW_ALL +COHORT_START_YYYY_MM +COHORT_TOTAL_BUDGET_USD +lambda_alias_version_override + +lambda_alias_version_override allows temporarily pinning the production alias to a specific Lambda version for rollback. + +These values can safely live in: + +terraform.tfvars + +Terraform will inject them into the Lambda environment during deployment. + +Secrets Policy + +Secrets must not be stored in Terraform variables because they would be written into the Terraform state file. 
+ +This includes: + +SLACK_BOT_TOKEN +SLACK_SIGNING_SECRET +SLACK_SCAN_CHANNEL_ID +SLACK_ALERT_CHANNEL_ID + +Instead, secrets should be configured directly in the Lambda environment variables after deployment. + +This prevents secrets from appearing in: + +Git repositories + +Terraform configuration files + +Terraform state + +## Lambda Versioning and Alias + +Bloodhound uses **Lambda version publishing with a production alias** to enable safer deployments and easy rollback. + +Each time Terraform deploys the Lambda function, AWS publishes a **new immutable version** of the function. + +Example structure in AWS: + + +BloodhoundLambdaV2 +├─ $LATEST +├─ Version 1 +├─ Version 2 +└─ Version 3 +↑ +alias: prod + + +Terraform automatically moves the `prod` alias to the newest published version during each deploy. + +### Deployment Behavior + +The deployment process works as follows: + + +terraform plan +↓ +preview infrastructure changes + +terraform apply +↓ +publish new Lambda version +↓ +update prod alias → newest version + + +The `publish = true` setting in `lambda.tf` ensures that every code change results in a new version being created. + +The alias defined in `alias.tf` ensures that the production entry point always points to the most recently deployed version. + +### Why This Is Used + +Using Lambda versioning provides several operational benefits: + +- **Immutable deployments** – each version represents a fixed snapshot of the code +- **Safe rollbacks** – the alias can be repointed to a previous version if a deployment fails +- **Deployment history** – all previous Lambda versions remain available for debugging + +### Rollback Example + +If a deployment introduces an issue, the production alias can be moved back to a previous version. + +Example: + +prod → Version 2 + +This immediately restores the previous working deployment without needing to redeploy code. + +### Slack Integration + +Slack continues to call the same **Lambda Function URL**. 
+ +Internally AWS routes the request to the alias: + +Slack +↓ +Lambda Function URL +↓ +BloodhoundLambdaV2:prod +↓ +Active Lambda version + +Note: this routing only holds when the Function URL is created against the `prod` alias +(a `qualifier` on the Function URL resource). A Function URL without a qualifier invokes `$LATEST`. +Verify `function_url.tf` before relying on alias-based rollback. + +Because of this, deployments and rollbacks do **not require changing the Slack configuration**. + +### Viewing Lambda Versions + +Lambda versions can be viewed in the AWS console. + +Navigate to: + +AWS Console +→ Lambda +→ BloodhoundLambdaV2 +→ Versions + +You will see a list similar to: + +$LATEST +Version 1 +Version 2 +Version 3 + +The production alias will indicate which version is currently active: + +prod → Version 3 + +--- + +### Performing a Rollback + +If a deployment introduces an issue, the production alias can be moved back to a previous version. + +Navigate to: + +AWS Console +→ Lambda +→ BloodhoundLambdaV2 +→ Aliases +→ prod +→ Edit + +Then change the alias target to the previous version. + +Example: + +prod → Version 2 + +This immediately restores the previous working deployment without redeploying code. + +--- + +### Important: Terraform Deploy Behavior + +Terraform automatically moves the `prod` alias to the **latest published version** during each deployment. + +Example deploy: + +terraform apply +↓ +publish Version 4 +↓ +prod → Version 4 + + +Because of this behavior, a manual rollback performed in the AWS console is **temporary**. + +Running `terraform apply` again will move the alias back to the newest deployed version. + +If a rollback must remain active, the underlying issue should be fixed before the next Terraform deployment. + +--- + +### Permanent Rollback Using Terraform + +Because Terraform manages the Lambda alias, a rollback performed in the AWS console is temporary. + +To make a rollback permanent, Terraform provides a **version override variable** that allows the production alias to be pinned to a specific version. + +This avoids editing infrastructure code during incidents.
+ +--- + +### Step 1: Identify the Working Version + +Find the version to roll back to in the AWS console: + +AWS Console +→ Lambda +→ BloodhoundLambdaV2 +→ Versions + +Example: + +$LATEST +Version 1 +Version 2 +Version 3 + +--- + +### Step 2: Apply Rollback Using Terraform + +Use the override variable when running Terraform: + +Syntax: +```bash +terraform apply -var="lambda_alias_version_override=<VERSION>" +``` + +Example: +```bash +terraform apply -var="lambda_alias_version_override=2" +``` + +Result: + +prod → Version 2 + +The production alias stays pinned to that version for as long as the override is supplied. +A `-var` flag only applies to the run it is passed to, so either pass it on every +`terraform apply` or set `lambda_alias_version_override` in `terraform.tfvars` until the issue is fixed. + +### Step 3: Restore Normal Deploy Behavior + +Once the issue is resolved, remove the override: + +```bash +terraform apply -var="lambda_alias_version_override=null" +``` + +If the override was set in `terraform.tfvars`, remove it there as well. Depending on how the variable is typed, +`null` passed via `-var` may be read as the literal string `"null"`; in that case simply run `terraform apply` without the flag. + +After this, Terraform deployments will again move the prod alias to the newest published version automatically. + +## Destructive Mode Deployment Guard + +Bloodhound includes a Terraform safety guard that prevents +deployments when destructive mode is enabled. + +If the Lambda environment variable is set to: + +```text +APPLY_CHANGES=true +``` + +Terraform will refuse to deploy unless the engineer explicitly +acknowledges the action using the override variable. + +This guard prevents accidental deployments where Bloodhound +would be allowed to delete infrastructure. + +Example error: + +```text +Deployment blocked: APPLY_CHANGES=true requires -var allow_apply_mode=true +``` + +To intentionally deploy with destructive mode enabled: + +```bash +terraform apply -var allow_apply_mode=true +``` + +This guard exists to prevent accidental deployments that could +allow Bloodhound to delete infrastructure. + +Normal deployments should run with: + +```text +APPLY_CHANGES=false +``` \ No newline at end of file diff --git a/infra/account_guard.tf b/infra/account_guard.tf new file mode 100644 index 0000000..0df8af1 --- /dev/null +++ b/infra/account_guard.tf @@ -0,0 +1,17 @@ +/* +infra/account_guard.tf + +Safety guard to prevent Terraform from deploying +to the wrong AWS account.
+*/ + +data "aws_caller_identity" "current" {} + +resource "terraform_data" "account_guard" { + lifecycle { + precondition { + condition = data.aws_caller_identity.current.account_id == var.expected_aws_account_id + error_message = "Terraform is connected to the wrong AWS account." + } + } +} \ No newline at end of file diff --git a/infra/alias.tf b/infra/alias.tf new file mode 100644 index 0000000..b8cc2ea --- /dev/null +++ b/infra/alias.tf @@ -0,0 +1,23 @@ +/* +infra/alias.tf + +Defines the Lambda alias used for production traffic. + +Key points: +- The alias provides a stable name ("prod") that points to a specific + published Lambda version. +- Each Terraform deploy publishes a new immutable Lambda version. +- The alias automatically shifts to the latest version during deploys. +- An optional override variable allows pinning the alias to an older + version for rollback without modifying infrastructure code. +*/ + +resource "aws_lambda_alias" "bloodhound_prod" { + name = "prod" + description = "Production alias for Bloodhound Lambda" + function_name = aws_lambda_function.bloodhound_v2.function_name + function_version = coalesce( + var.lambda_alias_version_override, + aws_lambda_function.bloodhound_v2.version + ) +} \ No newline at end of file diff --git a/infra/bootstrap_imports.sh b/infra/bootstrap_imports.sh new file mode 100755 index 0000000..5b524ee --- /dev/null +++ b/infra/bootstrap_imports.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +# ------------------------------------------------------------------------------ +# bootstrap_imports.sh +# +# Purpose +# ------- +# Terraform cannot automatically manage resources that already exist in AWS. +# If IAM roles or policies were previously created manually (or by earlier +# tooling), Terraform will fail during `terraform apply` with: +# +# EntityAlreadyExists +# +# This helper script detects existing Bloodhound IAM resources and imports +# them into Terraform state so Terraform can manage them safely. 
+# +# When to run +# ----------- +# Run this script once before the first `terraform apply` in an account where +# Bloodhound infrastructure may already exist. +# +# Typical workflow: +# +# cd infra +# ./bootstrap_imports.sh +# terraform apply +# +# The script is safe to run multiple times. +# +# Requirements +# ------------ +# - AWS CLI configured +# - Terraform installed +# - Appropriate AWS IAM permissions +# +# ------------------------------------------------------------------------------ + +set -e +set -o pipefail # makes bash fail if any command in a pipeline fails + +# ------------------------------------------------------------------------------ +# Safety Check +# ------------------------------------------------------------------------------ +# This script must be executed from the `infra/` directory because Terraform +# commands rely on the Terraform configuration files in this directory. +# +# If someone runs the script from the repository root (or another location), +# Terraform would fail because it cannot find the infrastructure configuration. +# +# We detect this by verifying that a known Terraform file exists locally. +# ------------------------------------------------------------------------------ + +if [ ! -f "lambda.tf" ]; then + echo "Error: this script must be run from the infra/ directory." + echo "Example:" + echo " cd infra" + echo " ./bootstrap_imports.sh" + exit 1 +fi + +ROLE_NAME="bloodhound-v2-role" +POLICY_NAME="bloodhound-v2-policy" + +echo "Checking existing IAM resources..." + +# Retrieve AWS account ID for constructing policy ARN +ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + +# --------------------------------------------------- +# Check if IAM role exists +# --------------------------------------------------- +ROLE_EXISTS=$(aws iam get-role --role-name $ROLE_NAME 2>/dev/null || true) + +if [ -n "$ROLE_EXISTS" ]; then + echo "Existing IAM role detected. Importing into Terraform state..." 
+ terraform import aws_iam_role.lambda_role $ROLE_NAME || true +fi + +# --------------------------------------------------- +# Check if IAM policy exists +# --------------------------------------------------- +POLICY_ARN="arn:aws:iam::$ACCOUNT_ID:policy/$POLICY_NAME" + +POLICY_EXISTS=$(aws iam get-policy --policy-arn $POLICY_ARN 2>/dev/null || true) + +if [ -n "$POLICY_EXISTS" ]; then + echo "Existing IAM policy detected. Importing into Terraform state..." + terraform import aws_iam_policy.bloodhound_policy $POLICY_ARN || true +fi + +echo "Terraform import check complete." \ No newline at end of file diff --git a/infra/build.tf b/infra/build.tf new file mode 100644 index 0000000..556030e --- /dev/null +++ b/infra/build.tf @@ -0,0 +1,126 @@ +/* +infra/build.tf + +Build pipeline for the Lambda zip (Terraform-managed). + +How it works: +1) terraform_data.build_lambda_pkg prepares ../.build/lambda_pkg by: + - pip installing dependencies into the package directory + - copying our source code into the package directory +2) archive_file.lambda_zip zips that directory into ../.build/bloodhound_lambda_v2.zip + +Notes: + +• This build runs locally on the machine executing `terraform apply`. + +• Required local tools: + - python3 + - pip + - rsync + +• Dependency installation uses: + + python3 -m pip install --upgrade --no-cache-dir -r requirements.txt -t .build/lambda_pkg + + Flags explained: + + --upgrade + Ensures the newest compatible versions of dependencies are installed. + + --no-cache-dir + Prevents pip cache reuse which can cause corrupted or inconsistent + dependency resolution across machines. + + -t .build/lambda_pkg + Installs dependencies directly into the Lambda package directory. + +• Only runtime dependencies listed in `requirements.txt` are packaged. + + Development-only dependencies live in `requirements-dev.txt` + and are used only for local development/testing. 
+*/ + +resource "terraform_data" "build_lambda_pkg" { + + triggers_replace = { + + # ------------------------------------------------------------------- + # Rebuild Lambda when dependencies change + # ------------------------------------------------------------------- + requirements_hash = filesha256("${path.module}/../requirements.txt") + + # ------------------------------------------------------------------- + # Rebuild if main handler changes + # ------------------------------------------------------------------- + lambda_handler_hash = filesha256("${path.module}/../handlers/lambda_function.py") + + # ------------------------------------------------------------------- + # SAFETY TRIGGER + # + # Terraform cannot detect changes to application code when the + # Lambda package is built locally via `local-exec`. + # + # We compute a fingerprint (hash) of all Python runtime files so that any change + # to the Lambda source forces this resource to rebuild the package. + # + # Only runtime directories are included to avoid rebuilds caused by + # unrelated files (.build, .venv, scripts, tests, etc). 
+ # ------------------------------------------------------------------- + python_sources_hash = sha256(join("", concat( + + # Hash all Python files inside the main application package + [ + for f in fileset("${path.module}/../bloodhound", "**/*.py") : + filesha256("${path.module}/../bloodhound/${f}") + ], + + # Hash Lambda handler entrypoints + [ + for f in fileset("${path.module}/../handlers", "**/*.py") : + filesha256("${path.module}/../handlers/${f}") + ] + + ))) + + # ------------------------------------------------------------------- + # Rebuild when the packaging script itself or the build mode changes, + # so a manual `-replace` is not needed after editing the build logic. + # ------------------------------------------------------------------- + build_script_hash = filesha256("${path.module}/../scripts/build_lambda.sh") + use_docker_build = var.use_docker_build + } + # Build the Lambda package by running the external build script + provisioner "local-exec" { + + # -------------------------------------------------------------------- + # Build Lambda package via external build script + # + # The packaging logic has been moved to scripts/build_lambda.sh + # so that: + # + # • Terraform focuses only on infrastructure orchestration + # • build logic becomes easier to maintain + # • Docker-based builds can be supported + # + # The script prepares: + # + # .build/lambda_pkg + # + # Terraform then archives that directory using archive_file. + # -------------------------------------------------------------------- + + # Run from the repository root so the relative script path resolves. + working_dir = "${path.module}/.." + + command = "bash scripts/build_lambda.sh" + + # Passed to the script as an environment variable to select the build mode. + environment = { + USE_DOCKER_BUILD = var.use_docker_build + } + + interpreter = ["/bin/bash", "-lc"] + } +} + +# Zip is created by the archive provider (no manual zip commands). +# depends_on forces the archive to be read only after the package directory is built. +data "archive_file" "lambda_zip" { + type = "zip" + source_dir = local.pkg_dir + output_path = local.zip_path + + depends_on = [terraform_data.build_lambda_pkg] +} + + diff --git a/infra/function_url.tf b/infra/function_url.tf new file mode 100644 index 0000000..8eabf6b --- /dev/null +++ b/infra/function_url.tf @@ -0,0 +1,44 @@ +/* +infra/function_url.tf + +Public HTTPS endpoint for Slack slash commands. + +We use a Lambda Function URL (single endpoint) and validate Slack signatures in code.
+ +AWS change (Oct 2025): +Function URLs now require BOTH permissions: + - lambda:InvokeFunctionUrl + - lambda:InvokeFunction +*/ + +# --------------------------------------------------------- +# Lambda Function URL +# +# NOTE(review): no `qualifier` is set, so this URL targets the unqualified +# function ($LATEST) rather than the `prod` alias defined in alias.tf. +# Confirm whether the alias should be wired in here; doing so changes the URL. +# --------------------------------------------------------- + +resource "aws_lambda_function_url" "bloodhound_url" { + function_name = aws_lambda_function.bloodhound_v2.function_name + authorization_type = "NONE" +} + +# --------------------------------------------------------- +# Allow public HTTP invocation of the Function URL +# (auth type NONE: requests are authenticated in code via Slack signatures) +# --------------------------------------------------------- + +resource "aws_lambda_permission" "function_url_public" { + statement_id = "AllowFunctionUrlPublic" + action = "lambda:InvokeFunctionUrl" + function_name = aws_lambda_function.bloodhound_v2.function_name + principal = "*" + function_url_auth_type = "NONE" +} + +# --------------------------------------------------------- +# Second permission required for Function URLs (lambda:InvokeFunction) +# +# invoked_via_function_url = true adds the lambda:InvokedViaFunctionUrl +# condition, so principal "*" can only reach the function through the +# Function URL and NOT by calling the Lambda Invoke API directly +# (which would bypass the Slack signature check). +# Requires an AWS provider release that supports this argument; +# the lock file pins 6.27.0 — confirm before lowering the pin. +# --------------------------------------------------------- + +resource "aws_lambda_permission" "function_url_invoke" { + statement_id = "AllowFunctionInvoke" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.bloodhound_v2.function_name + principal = "*" + invoked_via_function_url = true +} \ No newline at end of file diff --git a/infra/iam.tf b/infra/iam.tf new file mode 100644 index 0000000..2aefe56 --- /dev/null +++ b/infra/iam.tf @@ -0,0 +1,84 @@ +/* +infra/iam.tf + +IAM role + policies for BloodhoundLambdaV2.
+ +Intent: +- allow scanning (Describe APIs) + Cost Explorer reads +- allow teardown actions (terminate/delete) when enabled by env flags +- allow lambda self-invoke for slash commands (async worker invocation) +*/ + +data "aws_iam_policy_document" "lambda_assume" { + statement { + actions = ["sts:AssumeRole"] + principals { + type = "Service" + identifiers = ["lambda.amazonaws.com"] + } + } +} + +resource "aws_iam_role" "lambda_role" { + name = "${local.name_prefix}-role" + assume_role_policy = data.aws_iam_policy_document.lambda_assume.json + + lifecycle { + prevent_destroy = true + } +} + +resource "aws_iam_role_policy_attachment" "basic_logs" { + role = aws_iam_role.lambda_role.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" +} + +data "aws_iam_policy_document" "bloodhound" { + # Read scanning permissions + statement { + actions = [ + "ec2:Describe*", + "rds:Describe*", + "rds:ListTagsForResource", + "elasticloadbalancing:Describe*", + "ce:GetCostAndUsage", + "ssm:GetParameter" + ] + resources = ["*"] + } + + # Teardown permissions (delete/terminate) + statement { + actions = [ + "ec2:TerminateInstances", + "ec2:DeleteVolume", + "ec2:ReleaseAddress", + "ec2:DeleteNatGateway", + "rds:DeleteDBInstance", + "elasticloadbalancing:DeleteLoadBalancer" + ] + resources = ["*"] + } + + # Allow async self-invoke for slash commands. 
+ statement { + actions = ["lambda:InvokeFunction"] + # Least privilege: only this function (unqualified, plus any version/alias) + # may be self-invoked, instead of every Lambda in the account. + # NOTE(review): assumes the standard "aws" partition — confirm. + resources = [ + "arn:aws:lambda:*:${data.aws_caller_identity.current.account_id}:function:${var.lambda_function_name}", + "arn:aws:lambda:*:${data.aws_caller_identity.current.account_id}:function:${var.lambda_function_name}:*", + ] + } +} + +resource "aws_iam_policy" "bloodhound_policy" { + name = "${local.name_prefix}-policy" + policy = data.aws_iam_policy_document.bloodhound.json + + lifecycle { + prevent_destroy = true + } +} + +resource "aws_iam_role_policy_attachment" "bloodhound_attach" { + role = aws_iam_role.lambda_role.name + policy_arn = aws_iam_policy.bloodhound_policy.arn +} + + diff --git a/infra/lambda.tf b/infra/lambda.tf new file mode 100644 index 0000000..490c40b --- /dev/null +++ b/infra/lambda.tf @@ -0,0 +1,73 @@ +/* +infra/lambda.tf + +The Bloodhound v2 Lambda function resource. + +Key points: +- Code package comes from infra/build.tf (archive_file output) +- Env vars are provided via var.lambda_env (often sourced from terraform.tfvars) +*/ + +/* +Detect whether destructive mode is enabled. + +lambda_env is a map of environment variables passed to the Lambda. +We check if APPLY_CHANGES is set to "true" (compared case-insensitively). + +If the variable does not exist, default to "false". +*/ +locals { + + # lower() so values such as "True" or "TRUE" cannot slip past the guard. + apply_changes_enabled = lower(lookup(var.lambda_env, "APPLY_CHANGES", "false")) == "true" + +} + +/* +Deployment safety guard. + +If APPLY_CHANGES=true AND allow_apply_mode=false, +Terraform will stop the deployment. + +This prevents someone from accidentally deploying Bloodhound +in destructive mode.
+ +Engineers must explicitly acknowledge the action using: + +terraform apply -var allow_apply_mode=true +*/ + +# The guard is enforced as a lifecycle precondition on +# aws_lambda_function.bloodhound_v2 (below). A failed `check` block only +# emits a warning and lets the operation continue, whereas a failed +# precondition aborts plan/apply — the same pattern account_guard.tf uses. + +resource "aws_lambda_function" "bloodhound_v2" { + function_name = var.lambda_function_name + role = aws_iam_role.lambda_role.arn + handler = "handlers.lambda_function.lambda_handler" + runtime = var.lambda_runtime + + # Deployment package produced by infra/build.tf; the hash makes Terraform + # redeploy whenever the zip contents change. + filename = data.archive_file.lambda_zip.output_path + source_code_hash = data.archive_file.lambda_zip.output_base64sha256 + + timeout = var.lambda_timeout_seconds + memory_size = var.lambda_memory_mb + + # Publish a new immutable version on every code change (used by alias.tf). + publish = true + + environment { + variables = var.lambda_env + } + + # prevent_destroy is intentionally disabled. + # Lambdas are stateless and safe to recreate during deploys. + # Enable this only if the function ever manages critical resources. + lifecycle { + prevent_destroy = false + + # Destructive-mode deployment guard: blocks the deploy unless the + # engineer explicitly passes -var allow_apply_mode=true. + precondition { + condition = !(local.apply_changes_enabled && var.allow_apply_mode == false) + error_message = "Deployment blocked: APPLY_CHANGES=true requires -var allow_apply_mode=true" + } + } +} + + diff --git a/infra/locals.tf b/infra/locals.tf new file mode 100644 index 0000000..ac36282 --- /dev/null +++ b/infra/locals.tf @@ -0,0 +1,15 @@ +/* +infra/locals.tf + +Local paths and naming helpers. +These keep the build/archive + Lambda resources consistent and easy to tweak. +*/ + +locals { + build_dir = "${path.module}/../.build" + pkg_dir = "${local.build_dir}/lambda_pkg" + zip_path = "${local.build_dir}/bloodhound_lambda_v2.zip" + name_prefix = var.name_prefix +} + + diff --git a/infra/main.tf b/infra/main.tf new file mode 100644 index 0000000..fc78831 --- /dev/null +++ b/infra/main.tf @@ -0,0 +1,23 @@ +terraform { + /* + infra/main.tf + + Terraform root settings for Bloodhound v2. + This file intentionally contains ONLY provider requirements and version constraints. + Resources are split into dedicated files (iam.tf, lambda.tf, function_url.tf, build.tf, ...).
+ */ + # terraform main file intentionally keeps only provider requirements. + required_version = ">= 1.5.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0.0" + } + archive = { + source = "hashicorp/archive" + version = "~> 2.0" + } + } +} + + diff --git a/infra/outputs.tf b/infra/outputs.tf new file mode 100644 index 0000000..02bb3f2 --- /dev/null +++ b/infra/outputs.tf @@ -0,0 +1,19 @@ +/* +infra/outputs.tf + +Outputs used by operators when configuring Slack commands +and validating deployments. + +To retrieve the Slack command endpoint: + +terraform output bloodhound_lambda_url +*/ + +output "lambda_function_name" { + value = aws_lambda_function.bloodhound_v2.function_name +} + +output "bloodhound_lambda_url" { + description = "Public Lambda Function URL used by Slack slash commands" + value = aws_lambda_function_url.bloodhound_url.function_url +} \ No newline at end of file diff --git a/infra/providers.tf b/infra/providers.tf new file mode 100644 index 0000000..e8171bc --- /dev/null +++ b/infra/providers.tf @@ -0,0 +1,13 @@ +/* +infra/providers.tf + +AWS provider configuration for this module. +We keep provider config separate so main.tf can stay provider-requirements-only. +*/ + +provider "aws" { + region = var.aws_region + profile = var.aws_profile +} + + diff --git a/infra/slack/README.md b/infra/slack/README.md new file mode 100644 index 0000000..81e19be --- /dev/null +++ b/infra/slack/README.md @@ -0,0 +1,245 @@ +# Slack App Manifest — Bloodhound-V2 + +This directory contains the Slack App configuration for Bloodhound-V2. + +## Source of Truth + +The Slack configuration for Bloodhound is managed via a manifest to +ensure reproducibility and prevent configuration drift. + +Slack configuration is defined in: + +infra/slack/bloodhound_v2_manifest.json + +All Slack app settings must be applied from this manifest to ensure reproducibility and prevent configuration drift. 
+ +--- + +## Bloodhound Slack Command Interface + +Bloodhound exposes both legacy and versioned commands. + +Legacy compatibility commands: + +/seek + Run AWS resource scan + +/seek_destroy CONFIRM + Execute destructive teardown + +Preferred V2 commands: + +/v2_seek + Run AWS resource scan + +/v2_seek_destroy_plan + Preview teardown plan + +/v2_seek_destroy CONFIRM + Execute destructive teardown + +/v2_status + Show system status and safety configuration + +## Manifest Structure Overview + +### display_information + +Controls how the app appears in Slack. + +- `name` — App name displayed in Slack. +- `description` — Short summary of purpose. +- `background_color` — Cosmetic only. + +Safe to modify: +- Description +- Background color + +Changing the name should be reviewed. + +--- + +### features.bot_user + +Defines the bot identity. + +- `display_name` — Name used when posting messages. +- `always_online=false` — Bot does not simulate presence. + +No presence spoofing is enabled. + +--- + +### features.slash_commands + +Bloodhound supports both **legacy commands** and **versioned v2 commands**. + +Legacy commands (kept for compatibility): + +- `/seek` — Non-destructive AWS scan. +- `/seek_destroy CONFIRM` — Execute destructive cleanup. + +Preferred v2 command interface: + +- `/v2_seek` — Non-destructive AWS resource scan. +- `/v2_seek_destroy_plan` — Preview the teardown plan without deleting resources. +- `/v2_seek_destroy CONFIRM` — Execute destructive cleanup of non-whitelisted resources. +- `/v2_status` — Show Bloodhound system status and safety configuration. + +Important: + +The `url` field in the manifest is a placeholder during setup. + +After Terraform deploys the Lambda function, the Slack Request URL +for each slash command must be updated to the Lambda Function URL +output by Terraform. + +Example: + +https://<url-id>.lambda-url.<region>.on.aws + +This URL becomes the public HTTPS endpoint used by Slack to +invoke Bloodhound. 
+ +Note: + +The `/seek_destroy` and `/v2_seek_destroy` commands require the confirmation token `CONFIRM`. +Slack sends this token as command text, and Bloodhound validates it before +executing destructive operations. + +--- + +### oauth_config.scopes.bot + +Defines bot permissions. + +Current scopes: + +- chat:write +- commands +- channels:read +- groups:read +- im:read +- mpim:read + +These are intentionally minimal. + +Do not add: +- admin scopes +- user impersonation scopes +- channel history scopes +- additional permissions + +Without review. + +--- + +### settings + +- `socket_mode_enabled=false` + - The app uses HTTPS (Lambda Function URL), not persistent sockets. + +- `token_rotation_enabled=false` + - Token rotation is not currently implemented. + +- `interactivity.is_enabled=false` + - No interactive components (buttons/modals). + +--- + +## Architectural Notes + +Slack commands provide the operational interface for Bloodhound. + +Command flow: + +```text +Slack + ↓ +Lambda Function URL + ↓ +Lambda event router + ↓ +Slack handler (bloodhound.handlers.slack_handler) + ↓ +Slack command parser (bloodhound.slack_commands) + ↓ +Execution pipeline (bloodhound.app) +``` + +Supported operational commands: + +`/v2_seek` + Run AWS resource scan + +`/v2_seek_destroy_plan` + Preview teardown plan + +`/v2_seek_destroy CONFIRM` + Execute destructive teardown + +`/v2_status` + Show system health and configuration + +Slack only triggers execution. + +All scanning and deletion logic lives inside AWS Lambda. + +### Note: + +Slack never performs AWS operations directly. + +Slack only sends HTTPS requests to the Lambda Function URL. +All scanning, planning, and deletion logic runs entirely inside +the Bloodhound Lambda execution environment. 
+ +### Destructive behavior is gated by: + +Destructive behavior is gated by multiple safety controls: + +- confirmation token (`CONFIRM`) sent in the slash command +- environment configuration (APPLY_CHANGES, TEARDOWN_SIMULATE) +- Lambda safety rails that prevent destructive execution unless + explicitly enabled + +Slack never performs deletion directly. + +### Lambda Execution Logging + +Bloodhound Lambda executions emit structured log markers: + +[BLOODHOUND][EVENT_TYPE][request_id=...] + +Examples: + +[BLOODHOUND][SLACK][request_id=...] +[BLOODHOUND][SCAN][request_id=...] +[BLOODHOUND][STATUS][request_id=...] + +The request_id corresponds to the AWS Lambda invocation ID +(context.aws_request_id) and allows engineers to trace a single +execution across CloudWatch logs. + +--- + +## Change Policy + +Allowed changes without review: +- Display text updates +- Slash command descriptions + +Changes requiring review: + +- Adding OAuth scopes +- Enabling socket mode +- Enabling token rotation +- Enabling interactivity +- Adding new slash commands beyond the current interface + +Current supported command set: + +`/v2_seek` +`/v2_seek_destroy_plan` +`/v2_seek_destroy CONFIRM` +`/v2_status` + diff --git a/infra/slack/bloodhound_v2_manifest.json b/infra/slack/bloodhound_v2_manifest.json new file mode 100644 index 0000000..79eb3a4 --- /dev/null +++ b/infra/slack/bloodhound_v2_manifest.json @@ -0,0 +1,73 @@ +{ + "display_information": { + "name": "Bloodhound-V2", + "description": "AWS resource scanning and teardown automation (V2)", + "background_color": "#2E3A59" + }, + "features": { + "bot_user": { + "display_name": "Bloodhound-V2", + "always_online": false + }, + "slash_commands": [ + { + "command": "/seek", + "description": "Scan AWS resources and report results", + "should_escape": false, + "url": "https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/" + }, + { + "command": "/seek_destroy", + "description": "Execute destructive cleanup (requires 
CONFIRM)", + "usage_hint": "CONFIRM", + "should_escape": false, + "url": "https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/" + }, + { + "command": "/v2_seek", + "description": "Run a non-destructive AWS resource scan", + "should_escape": false, + "url": "https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/" + }, + { + "command": "/v2_seek_destroy_plan", + "description": "Preview the teardown plan without deleting resources", + "should_escape": false, + "url": "https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/" + }, + { + "command": "/v2_seek_destroy", + "description": "Execute destructive cleanup of non-whitelisted resources", + "usage_hint": "CONFIRM", + "should_escape": false, + "url": "https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/" + }, + { + "command": "/v2_status", + "description": "Show Bloodhound system status and safety configuration", + "should_escape": false, + "url": "https://kn5244cyenar6pzgexnrq5oh2m0ohyvb.lambda-url.us-west-2.on.aws/" + } + ] + }, + "oauth_config": { + "scopes": { + "bot": [ + "chat:write", + "commands", + "channels:read", + "groups:read", + "im:read", + "mpim:read" + ] + } + }, + "settings": { + "org_deploy_enabled": false, + "socket_mode_enabled": false, + "token_rotation_enabled": false, + "interactivity": { + "is_enabled": false + } + } +} \ No newline at end of file diff --git a/infra/terraform.tfvars.example b/infra/terraform.tfvars.example new file mode 100644 index 0000000..8d3e7b4 --- /dev/null +++ b/infra/terraform.tfvars.example @@ -0,0 +1,59 @@ +# Example Terraform inputs for Bloodhound v2. +# +# Copy to terraform.tfvars (do not commit terraform.tfvars; it's gitignored). +# +# NOTE: Any values you put in lambda_env will end up in Terraform state. +# Avoid putting Slack tokens or secrets here. Prefer setting secrets in the Lambda console +# or via your preferred secret management approach. 
+ +aws_region = "us-west-2" + +# Optional: AWS CLI profile Terraform should use when authenticating. +# If omitted, Terraform will use the default AWS credential chain +# (environment variables, SSO session, or the default profile). +#aws_profile = "" + +name_prefix = "bloodhound-v2" +lambda_function_name = "BloodhoundLambdaV2" +lambda_runtime = "python3.10" + +# Safety guard: Terraform will refuse to deploy if the active AWS +# credentials belong to a different account than this ID. +expected_aws_account_id = "123456789012" + +# Non-secret env vars can live here safely. +lambda_env = { + SLACK_ENABLED = "true" + # Secrets (OK to set here if you accept they will be stored in Terraform state) + SLACK_BOT_TOKEN = "REPLACE_ME" + SLACK_SIGNING_SECRET = "REPLACE_ME" + + SLACK_SCAN_CHANNEL_ID = "C0A4YLV0HNY" + SLACK_ALERT_CHANNEL_ID = "C0A4YLV0HNY" + REGION_MODE = "explicit" + REGIONS = "us-east-1,us-east-2,us-west-1,us-west-2" + KEEP_TAG_KEY = "bloodhound:keep" + KEEP_TAG_VALUE = "true" + COHORT_START_YYYY_MM = "2025-12" + COHORT_TOTAL_BUDGET_USD = "3000" + COHORT_LENGTH_MONTHS = "7" + BUDGET_OVER_DAYS = "2" + + # Slash commands (non-secret) + SLACK_DESTROY_CONFIRM_TOKEN = "CONFIRM" + + # Optional allowlists (comma-separated) + SLACK_ALLOWED_USER_IDS = "" + SLACK_ALLOWED_CHANNEL_IDS = "" + + # Teardown controls + APPLY_CHANGES = "false" + TEARDOWN_SIMULATE = "true" + TEARDOWN_TARGET_IDS = "" + TEARDOWN_ALLOW_ALL = "false" + RDS_FINAL_SNAPSHOT = "false" + + TEARDOWN_MAX_DELETE_COUNT = "5" +} + + diff --git a/infra/test_resource.tf b/infra/test_resource.tf new file mode 100644 index 0000000..0750037 --- /dev/null +++ b/infra/test_resource.tf @@ -0,0 +1,81 @@ +# ------------------------------------------------------------ +# Bloodhound Teardown Validation Resource +# +# This Terraform resource defines a temporary EC2 instance used +# exclusively for validating Bloodhound's teardown pipeline. 
+# ------------------------------------------------------------ + + +# ------------------------------------------------------------ +# Lookup Latest Amazon Linux 2 AMI +# +# Hardcoding AMI IDs eventually breaks because AWS retires +# images over time. Instead Terraform dynamically retrieves +# the newest Amazon Linux 2 image published by AWS. +# ------------------------------------------------------------ +data "aws_ami" "amazon_linux_2" { + + most_recent = true + + owners = ["amazon"] + + filter { + name = "name" + values = ["amzn2-ami-hvm-*-x86_64-gp2"] + } + +} + + +# ------------------------------------------------------------ +# Lookup available subnets +# +# Some AWS accounts do not have a default subnet automatically +# selected for EC2 launches. This data source retrieves a list +# of subnets so Terraform can attach the validation instance. +# ------------------------------------------------------------ +data "aws_subnets" "default" {} + + +# ------------------------------------------------------------ +# Temporary EC2 instance used for teardown validation +# ------------------------------------------------------------ +resource "aws_instance" "bloodhound_teardown_test" { + + # Create only during validation runs + count = var.enable_validation_resources ? 
1 : 0 + + # Instance configuration + ami = data.aws_ami.amazon_linux_2.id + instance_type = "t3.micro" + + # Attach instance to a discovered subnet + subnet_id = data.aws_subnets.default.ids[0] + + # ---------------------------------------------------------- + # Lifecycle rule + # ---------------------------------------------------------- + lifecycle { + create_before_destroy = true + } + + # ---------------------------------------------------------- + # Resource tags + # ---------------------------------------------------------- + tags = { + Name = "bloodhound-teardown-test" + Environment = "validation" + + "bloodhound:test" = "true" + "bloodhound:test_type" = "teardown_validation" + } + +} + + +# ------------------------------------------------------------ +# Terraform Output +# ------------------------------------------------------------ +output "bloodhound_test_instance_id" { + value = try(aws_instance.bloodhound_teardown_test[0].id, null) +} \ No newline at end of file diff --git a/infra/variables.tf b/infra/variables.tf new file mode 100644 index 0000000..5288b09 --- /dev/null +++ b/infra/variables.tf @@ -0,0 +1,150 @@ +/* +infra/variables.tf + +Input variables for the Terraform module. +Keep these minimal; most per-environment config is passed via lambda_env. +*/ + +variable "aws_region" { + type = string + description = "AWS region to deploy the Lambda into." + default = "us-west-2" +} + +variable "aws_profile" { + type = string + description = "Optional AWS CLI profile name for Terraform." + default = null +} + +variable "name_prefix" { + type = string + description = "Prefix for IAM resources." + default = "bloodhound-v2" +} + +variable "lambda_function_name" { + type = string + description = "Name of the Bloodhound v2 Lambda function." + default = "BloodhoundLambdaV2" +} + +variable "lambda_runtime" { + type = string + description = "Lambda runtime." 
+ default = "python3.10" +} + +variable "lambda_timeout_seconds" { + type = number + description = "Lambda timeout in seconds." + default = 120 +} + +variable "lambda_memory_mb" { + type = number + description = "Lambda memory size." + default = 256 +} + +variable "lambda_env" { + type = map(string) + description = "Lambda environment variables (copy from your .env, minus secrets you don't want in TF state)." + default = {} +} + +# ------------------------------------------------------------ +# Optional Docker-based Lambda dependency build. +# +# When enabled, Lambda dependencies are installed inside the +# official AWS Lambda runtime container instead of the +# engineer's local Python environment. +# +# This guarantees compatibility with the Lambda runtime and +# ensures deterministic builds across engineers and CI runs. +# +# Example usage: +# +# terraform apply -var="use_docker_build=true" +# +# Default behavior uses the local Python environment for +# faster builds during development. +# ------------------------------------------------------------ + +variable "use_docker_build" { + type = bool + default = false + description = "Build Lambda dependencies using the AWS Lambda Docker runtime." +} + + +/* +Optional override for the Lambda alias version. + +When null (default), the alias points to the newest published version. + +When set to a specific version number, Terraform will point the +production alias to that version instead. This allows safe rollbacks +without modifying infrastructure code. +*/ +variable "lambda_alias_version_override" { + type = string + default = null + description = "Optional Lambda version to pin the prod alias to for rollback." +} + + +variable "expected_aws_account_id" { + description = "Safety guard: ensure Terraform is running against the correct AWS account." + type = string +} + +/* +Safety override for destructive mode. + +Bloodhound can delete infrastructure when APPLY_CHANGES=true. 
+To prevent accidental deletion, Terraform blocks deployment +unless this variable is explicitly enabled. + +Example failure: + +APPLY_CHANGES=true +allow_apply_mode=false + +Terraform will stop with an error. + +To intentionally enable destructive mode for testing: + +terraform apply -var allow_apply_mode=true + +Default is false so deletion cannot be enabled accidentally. +*/ +variable "allow_apply_mode" { + description = "Explicit override required to deploy Bloodhound with APPLY_CHANGES=true." + type = bool + default = false +} + +# ------------------------------------------------------------ +# Unique ID used to trace validation runs. +# Passed from validation scripts so AWS resources can be +# associated with a specific validation execution. +# ------------------------------------------------------------ + +variable "validation_run_id" { + description = "Unique validation run identifier" + type = string + default = "manual" +} + +# ------------------------------------------------------------ +# Enables temporary infrastructure used for validation tests. +# This should normally be disabled during regular Terraform +# deployments. 
+# ------------------------------------------------------------ + +variable "enable_validation_resources" { + description = "Create temporary validation infrastructure" + type = bool + default = false +} \ No newline at end of file diff --git a/logs/validation_history.log b/logs/validation_history.log new file mode 100644 index 0000000..56a5a99 --- /dev/null +++ b/logs/validation_history.log @@ -0,0 +1,15 @@ +20260309_213031 FAIL +20260312_225332 FAIL +20260314_095824 FAIL +20260314_103717 PASS +20260314_105140 PASS +20260314_105737 FAIL +20260314_110435 PASS +20260314_112239 PASS +20260314_112636 PASS +20260314_171204 PASS +20260314_203853 PASS +20260314_214337 PASS +20260314_220557 PASS +20260314_230130 PASS +20260326_205446 PASS diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..15ee576 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,9 @@ +boto3==1.34.3 +botocore==1.34.3 +jmespath==1.0.1 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +s3transfer==0.9.0 +six==1.16.0 +slack-sdk==3.26.1 +urllib3==2.0.7 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 155ad8c..a51b5b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,2 @@ -boto3==1.34.3 -botocore==1.34.3 -jmespath==1.0.1 -python-dateutil==2.8.2 python-dotenv==1.0.0 -s3transfer==0.9.0 -six==1.16.0 slack-sdk==3.26.1 -urllib3==2.1.0 diff --git a/scripts/bootstrap_github_oidc.sh b/scripts/bootstrap_github_oidc.sh new file mode 100755 index 0000000..80f3181 --- /dev/null +++ b/scripts/bootstrap_github_oidc.sh @@ -0,0 +1,365 @@ +#!/usr/bin/env bash + +# ========================================================= +# Bloodhound GitHub OIDC Bootstrap Script +# ========================================================= +# +# PURPOSE +# +# This script bootstraps the AWS infrastructure required +# for GitHub Actions to securely invoke the Bloodhound +# Lambda function using OpenID Connect (OIDC). 
+# +# Using OIDC eliminates the need to store long-lived AWS +# credentials (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY) +# inside GitHub repository secrets. +# +# Instead, GitHub exchanges a short-lived OIDC token with +# AWS STS to assume an IAM role and obtain temporary +# credentials during workflow execution. +# +# +# RESULTING AUTHENTICATION FLOW +# +# GitHub Actions workflow +# ↓ +# GitHub OIDC token +# ↓ +# AWS STS AssumeRoleWithWebIdentity +# ↓ +# BloodhoundGitHubInvokeRole +# ↓ +# Temporary AWS credentials +# ↓ +# Invoke BloodhoundLambdaV2 +# +# +# WHAT THIS SCRIPT CONFIGURES +# +# 1. Ensures the GitHub OIDC provider exists in the AWS account +# 2. Creates or updates the IAM role used by GitHub Actions +# 3. Configures the trust policy for GitHub OIDC +# 4. Attaches permissions allowing the Lambda to be invoked +# 5. Outputs the role ARN for GitHub workflow configuration +# +# +# IMPORTANT DESIGN CHOICE +# +# The trust policy allows only explicitly listed repositories: +# +# repo:codeplatoon-devops/Bloodhound:ref:refs/heads/main +# repo:mmccla1n/Bloodhound:ref:refs/heads/* +# +# A wildcard owner pattern (repo:*/Bloodhound:*) is deliberately +# NOT used: it would let any GitHub account that owns a repository +# named "Bloodhound" assume the IAM role. +# +# Contributors who need to run CI from a personal fork must have +# their fork added to the "sub" condition of the trust policy +# generated in step 2 of this script. +# +# Without such an entry, AWS rejects OIDC tokens issued +# from that fork because the token's subject claim does +# not match any allowed repository. 
+# +# +# SCRIPT BEHAVIOR +# +# This script is intentionally idempotent: +# +# • Safe to run multiple times +# • Existing infrastructure will not be duplicated +# • IAM trust policy will be updated if the role exists +# +# +# PREREQUISITES +# +# - AWS CLI installed +# - AWS CLI authenticated +# - IAM permissions to create roles and policies +# +# +# VERIFY AWS AUTHENTICATION +# +# aws sts get-caller-identity +# +# +# HOW TO RUN +# +# chmod +x scripts/bootstrap_github_oidc.sh +# ./scripts/bootstrap_github_oidc.sh +# +# +# AFTER RUNNING +# +# Update the GitHub Actions workflow to use: +# +# aws-actions/configure-aws-credentials@v4 +# +# with the role ARN printed by this script. +# +# ========================================================= + + +# --------------------------------------------------------- +# Cleanup temporary IAM policy artifacts +# +# The AWS CLI requires policy documents to be supplied as +# files when creating IAM roles and attaching policies. +# +# This script dynamically generates the following files: +# trust-policy.json +# lambda-policy.json +# +# These files are temporary artifacts and should never be +# committed to the repository. +# +# The EXIT trap ensures they are automatically removed when +# the script finishes, regardless of whether execution +# succeeds, fails, or is interrupted. +# --------------------------------------------------------- +trap "rm -f trust-policy.json lambda-policy.json" EXIT + +set -euo pipefail + +# Disable AWS CLI pager so scripts never hang +export AWS_PAGER="" + + +# --------------------------------------------------------- +# Configuration +# +# These values define the AWS and GitHub environment +# this bootstrap script will configure. +# +# The AWS account ID is automatically detected using STS +# so the script can run safely in any AWS account without +# requiring manual configuration. 
+# +# NOTE: +# We use the canonical repository owner (Code Platoon org) +# instead of a personal fork so the role remains valid +# when CI runs from the upstream repository. +# --------------------------------------------------------- + +# Detect AWS account automatically +ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) + +AWS_REGION="us-west-2" + +# Canonical repository (not personal fork) +GITHUB_ORG="codeplatoon-devops" +GITHUB_REPO="Bloodhound" + +LAMBDA_NAME="BloodhoundLambdaV2" + +ROLE_NAME="BloodhoundGitHubInvokeRole" + + +# GitHub OIDC provider information +OIDC_PROVIDER_URL="https://token.actions.githubusercontent.com" +OIDC_PROVIDER_ARN="arn:aws:iam::$ACCOUNT_ID:oidc-provider/token.actions.githubusercontent.com" + + +echo "=======================================" +echo "Bootstrapping GitHub OIDC for Bloodhound" +echo "Account: $ACCOUNT_ID" +echo "Repository: $GITHUB_ORG/$GITHUB_REPO" +echo "=======================================" + + +# --------------------------------------------------------- +# 1. Check if the GitHub OIDC provider already exists +# --------------------------------------------------------- + +echo "" +echo "Checking for existing GitHub OIDC provider..." + +EXISTING_PROVIDER=$(aws iam list-open-id-connect-providers \ + --query "OpenIDConnectProviderList[?contains(Arn, 'token.actions.githubusercontent.com')].Arn" \ + --output text) + +if [[ -z "$EXISTING_PROVIDER" ]]; then + echo "OIDC provider not found. Creating..." + + aws iam create-open-id-connect-provider \ + --url $OIDC_PROVIDER_URL \ + --client-id-list sts.amazonaws.com \ + --thumbprint-list 6938fd4d98bab03faadb97b34396831e3780aea1 + + echo "OIDC provider created." + +else + echo "OIDC provider already exists." +fi + + +# --------------------------------------------------------- +# 2. 
Generate GitHub OIDC trust policy +# +# GitHub OIDC Trust Policy (Strict Repo Control) +# +# This IAM role allows GitHub Actions workflows to assume +# the role using OIDC (sts:AssumeRoleWithWebIdentity). +# +# SECURITY MODEL +# +# Access is restricted to explicitly listed repositories +# instead of allowing all repositories named "Bloodhound". +# +# Currently allowed repositories: +# +# • codeplatoon-devops/Bloodhound +# • mmccla1n/Bloodhound +# +# Branch restrictions: +# +# • The upstream repository is restricted to the main branch +# • The maintainer fork allows all branches for development +# +# CONTRIBUTOR NOTE +# +# If another contributor wants to run GitHub Actions from +# their fork of the repository, their fork must be added +# to the "sub" condition below using the format: +# +# repo:<github-user>/Bloodhound:ref:refs/heads/* +# +# Example: +# +# repo:janedoe/Bloodhound:ref:refs/heads/* +# +# This keeps the role secure while still allowing approved +# forks to run CI workflows. +# --------------------------------------------------------- + +echo "" +echo "Creating trust policy..." + +cat <<EOF > trust-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Federated": "$OIDC_PROVIDER_ARN" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { + "token.actions.githubusercontent.com:aud": "sts.amazonaws.com" + }, + "StringLike": { + "token.actions.githubusercontent.com:sub": [ + "repo:codeplatoon-devops/Bloodhound:ref:refs/heads/main", + "repo:mmccla1n/Bloodhound:ref:refs/heads/*" + ] + } + } + } + ] +} +EOF + + +# --------------------------------------------------------- +# 3. Create or update IAM role for GitHub Actions +# +# If the role already exists, the trust policy is updated +# to ensure it reflects the current repository policy. 
+# --------------------------------------------------------- + +echo "" +echo "Creating IAM role: $ROLE_NAME" + +if aws iam get-role --role-name "$ROLE_NAME" > /dev/null 2>&1; then + echo "Role already exists. Updating trust policy..." + + aws iam update-assume-role-policy \ + --role-name "$ROLE_NAME" \ + --policy-document file://trust-policy.json + +else + + aws iam create-role \ + --role-name "$ROLE_NAME" \ + --assume-role-policy-document file://trust-policy.json \ + --tags \ + Key=Project,Value=Bloodhound \ + Key=Owner,Value=CodePlatoon-DevOps \ + Key=ManagedBy,Value=GitHubActions \ + Key=Component,Value=CI-Infrastructure + + echo "Role created." + +fi + + +# --------------------------------------------------------- +# 4. Configure IAM permissions for Lambda invocation +# +# The role requires permission to invoke the Bloodhound +# Lambda function and optionally read its CloudWatch logs +# for debugging CI failures. +# --------------------------------------------------------- + +echo "" +echo "Creating Lambda invoke policy..." + +cat <<EOF > lambda-policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "InvokeBloodhoundLambda", + "Effect": "Allow", + "Action": [ + "lambda:InvokeFunction", + "lambda:GetFunction", + "lambda:GetFunctionConfiguration", + "lambda:GetFunctionUrlConfig" + ], + "Resource": "arn:aws:lambda:$AWS_REGION:$ACCOUNT_ID:function:$LAMBDA_NAME" + }, + { + "Sid": "ReadLambdaLogs", + "Effect": "Allow", + "Action": [ + "logs:DescribeLogGroups", + "logs:DescribeLogStreams", + "logs:GetLogEvents" + ], + "Resource": "*" + } + ] +} +EOF + + +aws iam put-role-policy \ + --role-name "$ROLE_NAME" \ + --policy-name BloodhoundInvokePolicy \ + --policy-document file://lambda-policy.json + +echo "Policy attached." + + +# --------------------------------------------------------- +# 5. Output IAM role ARN for GitHub workflow configuration +# +# This ARN must be referenced in the GitHub Actions +# workflow using the configure-aws-credentials action. 
+# --------------------------------------------------------- + +ROLE_ARN="arn:aws:iam::$ACCOUNT_ID:role/$ROLE_NAME" + +echo "" +echo "=======================================" +echo "Bootstrap complete." +echo "" +echo "Use this role in your GitHub workflow:" +echo "" +echo "$ROLE_ARN" +echo "" +echo "=======================================" \ No newline at end of file diff --git a/scripts/build_lambda.sh b/scripts/build_lambda.sh new file mode 100755 index 0000000..bdf86c0 --- /dev/null +++ b/scripts/build_lambda.sh @@ -0,0 +1,193 @@ +#!/usr/bin/env bash +set -euo pipefail + +: <<'DOC' +scripts/build_lambda.sh + +Builds the Bloodhound Lambda deployment package. + +This script prepares the directory: + + .build/lambda_pkg + +Terraform then archives that directory into: + + .build/bloodhound_lambda_v2.zip + +using the archive_file provider. + +--------------------------------------------------------------------- + +Build modes + +Docker build (default) + Installs dependencies using the official AWS Lambda runtime + container to guarantee compatibility with the Lambda environment. + +Local build + Installs dependencies using your local Python environment. + This is faster but may produce incompatible binaries. 
+ +--------------------------------------------------------------------- + +Usage + +Show help + + ./scripts/build_lambda.sh -h + +Docker build (default) + + ./scripts/build_lambda.sh + +Force Docker build + + ./scripts/build_lambda.sh --docker + +Force local build + + ./scripts/build_lambda.sh --local + +--------------------------------------------------------------------- + +Required tools + +Docker mode: + docker + +Local mode: + python3 + pip + rsync +DOC + + +# ------------------------------------------------------------ +# Help helper +# +# Prints the usage text embedded in the DOC heredoc at the top +# of this script, then exits 0. +# +# The sed range must start at the heredoc opener (: <<'DOC'); +# a bare /^DOC$/ only matches the closing delimiter, which +# would print the script body instead of the help text. +# The second sed drops the opener and closing delimiter lines. +# ------------------------------------------------------------ + +show_help() { + sed -n "/^: <<'DOC'\$/,/^DOC\$/p" "$0" | sed '1d;$d' + exit 0 +} + + +# ------------------------------------------------------------ +# Parse arguments +# ------------------------------------------------------------ + +BUILD_MODE="docker" + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + show_help + ;; + --docker) + BUILD_MODE="docker" + shift + ;; + --local) + BUILD_MODE="local" + shift + ;; + *) + echo "Unknown option: $1" + echo "Run with -h for usage." + exit 1 + ;; + esac +done + +echo +echo "---------------------------------------------" +echo "Lambda build mode: $BUILD_MODE" +echo "---------------------------------------------" + + +# ------------------------------------------------------------ +# Step 1 — Prepare clean build directory +# +# Lambda packages must be deterministic. We remove any +# previous build artifacts to prevent stale dependencies +# or leftover files from entering the deployment package. +# ------------------------------------------------------------ + +echo +echo "Preparing Lambda build directories..." 
+ +BUILD_DIR=".build" +PKG_DIR="$BUILD_DIR/lambda_pkg" + +rm -rf "$BUILD_DIR" +mkdir -p "$PKG_DIR" + + +# ------------------------------------------------------------ +# Step 2 — Install runtime dependencies +# +# Dependencies from requirements.txt are installed directly +# into the Lambda package directory so they are included in +# the final deployment package. +# ------------------------------------------------------------ + +echo +echo "Installing dependencies..." + +if [[ "$BUILD_MODE" == "docker" ]]; then + + echo "Using Docker Lambda runtime (Amazon Linux)" + + docker run --rm \ + -v "$PWD":/var/task \ + public.ecr.aws/sam/build-python3.10 \ + pip install --upgrade --no-cache-dir -r requirements.txt -t .build/lambda_pkg + +else + + echo "Using local Python environment" + + python3 -m pip install --upgrade --no-cache-dir -r requirements.txt -t .build/lambda_pkg + +fi + + +# ------------------------------------------------------------ +# Step 3 — Copy application source code +# +# The Lambda package must include the Bloodhound application +# modules and the Lambda handler entrypoint. +# rsync preserves directory structure and includes new modules. +# ------------------------------------------------------------ + +echo +echo "Copying application source..." + +# Engineer note: +# rsync preserves directory structure and ensures new modules +# such as bloodhound/aws.py or scanner/ are included automatically. + +rsync -a --exclude "__pycache__" --exclude "*.pyc" bloodhound/ .build/lambda_pkg/bloodhound/ + +rsync -a --exclude "__pycache__" --exclude "*.pyc" handlers/ .build/lambda_pkg/handlers/ + + +# ------------------------------------------------------------ +# Step 4 — Debug visibility +# +# Display the final Lambda package contents to confirm the +# correct files are included. +# Useful when diagnosing import or packaging issues. 
+# ------------------------------------------------------------ + +echo +echo "Prepared package dir: .build/lambda_pkg" +echo "Lambda package contents:" +ls -R .build/lambda_pkg + +echo +echo "Package size:" +du -sh .build/lambda_pkg + +echo +echo "Lambda package build complete." \ No newline at end of file diff --git a/test_event.json b/test_event.json deleted file mode 100644 index 9e26dfe..0000000 --- a/test_event.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/tools/run_local.py b/tools/run_local.py new file mode 100644 index 0000000..5c2f415 --- /dev/null +++ b/tools/run_local.py @@ -0,0 +1,19 @@ +""" +tools/run_local.py + +Local runner for Bloodhound v2 (uses `.env` + your AWS_PROFILE). +This is the fastest way to validate config + Slack output before deploying to Lambda. +""" + +from __future__ import annotations + +import json + +from bloodhound.app import run + + +if __name__ == "__main__": + result = run(event={}, context=None) + print(json.dumps(result, indent=2, sort_keys=True)) + + diff --git a/tools/run_validation_workflow.sh b/tools/run_validation_workflow.sh new file mode 100755 index 0000000..4226b49 --- /dev/null +++ b/tools/run_validation_workflow.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash + +# ------------------------------------------------------------ +# Bloodhound Validation Workflow Runner +# +# This script orchestrates the full validation workflow. +# +# It runs validation tools in the correct order: +# +# 1. Infrastructure smoke test +# 2. Controlled teardown validation +# +# If the smoke test fails, the workflow stops immediately. 
+# +# Usage: +# +# ./tools/run_validation_workflow.sh +# +# ------------------------------------------------------------ + +set -euo pipefail # exit immediately if any command fails + +# ------------------------------------------------------------ +# Disable AWS CLI pager +# +# AWS CLI v2 automatically sends long output to a pager +# (usually "less"), which pauses scripts and displays "(END)" +# until the user presses 'q'. +# +# Automation scripts should disable this behavior so output +# prints directly to the terminal. +# ------------------------------------------------------------ +export AWS_PAGER="" + +# ------------------------------------------------------------ +# Validation Run Identifier +# ------------------------------------------------------------ + +RUN_ID=$(date +"%Y%m%d_%H%M%S") + +# ------------------------------------------------------------ +# Workflow Logging +# ------------------------------------------------------------ + +LOG_DIR="logs/validation" +mkdir -p "$LOG_DIR" + +LOG_FILE="$LOG_DIR/workflow_validation_${RUN_ID}.log" + +# send all output to terminal AND log +exec > >(tee -a "$LOG_FILE") 2>&1 + + +echo "" +echo "================================================" +echo "Bloodhound Validation Workflow" +echo "================================================" +echo "" + +# ------------------------------------------------------------ +# Step 1 — Run infrastructure smoke test +# ------------------------------------------------------------ + +echo "Step 1: Running Lambda smoke test..." +echo "" + +./tools/smoke_test_lambda.sh + +echo "" +echo "Smoke test passed." +echo "" + +# ------------------------------------------------------------ +# Step 2 — Run controlled teardown validation +# ------------------------------------------------------------ + +echo "Step 2: Starting controlled teardown validation..." 
+echo "" + +echo "Validation Run ID: $RUN_ID" + +./tools/validate_teardown.sh "$RUN_ID" + +echo "" +echo "================================================" +echo "Validation workflow completed successfully." +echo "================================================" +echo "" + +# ------------------------------------------------------------------ +# Workflow Log Retention +# +# Keep only the 3 most recent workflow validation logs. +# Older logs are removed automatically to prevent the +# validation directory from growing indefinitely. +# +# Detailed teardown logs have their own retention policy +# inside validate_teardown.sh. +# ------------------------------------------------------------------ + +echo "Keeping only the 3 most recent workflow logs." + +ls -1t "$LOG_DIR"/workflow_validation_* 2>/dev/null | tail -n +4 | xargs -r rm \ No newline at end of file diff --git a/tools/show_validation_history.sh b/tools/show_validation_history.sh new file mode 100755 index 0000000..48a284b --- /dev/null +++ b/tools/show_validation_history.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# ------------------------------------------------------------ +# Bloodhound Validation History +# +# Displays the full validation history stored in: +# +# logs/validation_history.log +# ------------------------------------------------------------ + +HISTORY_FILE="logs/validation_history.log" + +echo "" +echo "Bloodhound Validation History" +echo "-----------------------------" +echo "" + +if [ ! -f "$HISTORY_FILE" ]; then + echo "No validation history found." 
+ exit 0 +fi + +cat "$HISTORY_FILE" + +echo "" \ No newline at end of file diff --git a/tools/smoke_test_lambda.sh b/tools/smoke_test_lambda.sh new file mode 100755 index 0000000..b30c0c7 --- /dev/null +++ b/tools/smoke_test_lambda.sh @@ -0,0 +1,272 @@ +#!/usr/bin/env bash + +# ------------------------------------------------------------ +# Bloodhound Lambda Smoke Test +# +# Purpose +# ------- +# Quickly verify that the Bloodhound Lambda deployment is healthy. +# +# This script detects common deployment problems such as: +# +# - wrong AWS region +# - Lambda function not deployed +# - missing or incorrect environment variables +# - CloudWatch log group missing +# - Lambda Function URL not configured +# +# It also validates that critical Lambda environment variables +# match the local `.env` configuration. This helps detect cases +# where Terraform changes were not applied. +# +# This script is intended to be run immediately after: +# +# terraform apply +# +# It should complete in a few seconds and detect most +# infrastructure deployment issues. +# ------------------------------------------------------------ + +set -euo pipefail # Exit on error, treat unset variables as errors, and fail on pipeline errors + +MODE="${MODE:-}" + +if [ -z "$MODE" ]; then + if [ "${CI:-}" = "true" ]; then + MODE="ci" + else + MODE="local" + fi +fi + +# ------------------------------------------------------------ +# Disable AWS CLI pager +# +# AWS CLI v2 automatically sends long output to a pager +# (usually "less"), which pauses scripts and displays "(END)" +# until the user presses 'q'. +# +# Automation scripts should disable this behavior so output +# prints directly to the terminal. +# ------------------------------------------------------------ +export AWS_PAGER="" + +# ------------------------------------------------------------ +# Dependency check +# +# The script uses jq to parse JSON returned by AWS CLI. 
+# ------------------------------------------------------------ +command -v jq >/dev/null 2>&1 || { + echo "ERROR: jq is required but not installed." + exit 1 +} + +# ------------------------------------------------------------ +# Configuration +# +# These values should match your Terraform deployment. +# ------------------------------------------------------------ +REGION="us-west-2" +FUNCTION_NAME="BloodhoundLambdaV2" +LOG_GROUP="/aws/lambda/BloodhoundLambdaV2" + +echo "" +echo "--------------------------------------" +echo "Bloodhound Lambda Smoke Test" +echo "--------------------------------------" +echo "" + +# ------------------------------------------------------------ +# Step 1 — Verify Lambda function exists +# +# If this fails, Terraform deployment likely failed. +# ------------------------------------------------------------ +echo "Checking Lambda existence..." + +aws lambda get-function \ + --function-name "$FUNCTION_NAME" \ + --region "$REGION" >/dev/null + +echo "✓ Lambda function exists" + +# ------------------------------------------------------------ +# Step 2 — Retrieve Lambda environment variables +# +# These variables are injected by Terraform during deployment. +# ------------------------------------------------------------ +echo "" +echo "Fetching Lambda environment variables..." + +LAMBDA_ENV=$(aws lambda get-function-configuration \ + --function-name "$FUNCTION_NAME" \ + --region "$REGION" \ + --query 'Environment.Variables' \ + --output json) + +echo "✓ Retrieved Lambda environment variables" + +# ------------------------------------------------------------ +# Step 3 — Validate Lambda environment variables +# +# LOCAL MODE +# Compare deployed Lambda variables against local .env +# +# CI MODE (GitHub Actions) +# Verify required variables exist in Lambda +# +# GitHub automatically sets: +# CI=true +# ------------------------------------------------------------ +echo "" +echo "Validating critical environment variables..." 
+
+# ------------------------------------------------------------
+# Determine execution mode
+#
+# MODE is resolved near the top of this script, where an explicit
+# MODE override supplied by the caller is honored. Only fall back
+# to CI auto-detection here when MODE is still empty, so that the
+# caller's override is not silently discarded.
+# ------------------------------------------------------------
+if [ -z "${MODE:-}" ]; then
+  if [ "${CI:-}" = "true" ]; then
+    MODE="ci"
+  else
+    MODE="local"
+  fi
+fi
+
+# Reject anything other than the two supported modes instead of
+# silently treating it as CI mode in the branches below.
+case "$MODE" in
+  local|ci) ;;
+  *)
+    echo ""
+    echo "ERROR: Unsupported MODE '$MODE' (expected 'local' or 'ci')"
+    echo ""
+    exit 1
+    ;;
+esac
+
+
+# ------------------------------------------------------------
+# LOCAL MODE VALIDATION
+#
+# check_env_match VAR_NAME
+#
+# Compares the value of VAR_NAME loaded from the local .env with
+# the value deployed on the Lambda (read from $LAMBDA_ENV).
+# Exits 1 if the variable is missing locally or the values differ.
+# ------------------------------------------------------------
+check_env_match () {
+
+  local var_name=$1
+  local expected actual
+
+  # With `set -u`, expanding an unset variable indirectly aborts
+  # the script with a bare "unbound variable" error. Detect that
+  # case first and report which key is missing from .env.
+  if [ -z "${!var_name+x}" ]; then
+
+    echo ""
+    echo "ERROR: Variable not defined in local .env"
+    echo "Variable: $var_name"
+    echo ""
+    exit 1
+
+  fi
+
+  expected=${!var_name}
+
+  # jq -r prints the literal string "null" when the key is absent
+  actual=$(echo "$LAMBDA_ENV" | jq -r ".${var_name}")
+
+  if [ "$expected" != "$actual" ]; then
+
+    echo ""
+    echo "ERROR: Environment variable mismatch"
+    echo "Variable: $var_name"
+    echo "Expected: $expected"
+    echo "Actual: $actual"
+    echo ""
+    echo "Terraform deployment may be out of sync."
+    echo "Run: terraform apply"
+    exit 1
+
+  fi
+}
+
+
+# ------------------------------------------------------------
+# CI MODE VALIDATION
+#
+# check_env_exists VAR_NAME
+#
+# Ensures VAR_NAME is present and non-empty in the deployed Lambda
+# environment (read from $LAMBDA_ENV). Exits 1 otherwise.
+# ------------------------------------------------------------
+check_env_exists () {
+
+  local var_name=$1
+  local actual
+
+  actual=$(echo "$LAMBDA_ENV" | jq -r ".${var_name}")
+
+  # "null" is what jq -r emits for a missing key
+  if [ "$actual" = "null" ] || [ -z "$actual" ]; then
+
+    echo ""
+    echo "ERROR: Lambda environment variable missing"
+    echo "Variable: $var_name"
+    echo ""
+    exit 1
+
+  fi
+}
+
+
+# ------------------------------------------------------------
+# Execute validation
+# ------------------------------------------------------------
+if [ "$MODE" = "local" ]; then
+
+  echo "Local mode detected — validating against .env"
+
+  # Fail with an actionable message rather than the shell's
+  # generic "No such file or directory" when .env is absent.
+  if [ ! -f .env ]; then
+    echo ""
+    echo "ERROR: .env not found in the current directory"
+    echo "Local mode compares Lambda variables against .env."
+    echo ""
+    exit 1
+  fi
+
+  source .env
+
+  check_env_match APPLY_CHANGES
+  check_env_match TEARDOWN_SIMULATE
+  check_env_match SLACK_SCAN_CHANNEL_ID
+  check_env_match SLACK_ALERT_CHANNEL_ID
+
+else
+
+  echo "CI mode detected — verifying variables exist in Lambda"
+
+  check_env_exists APPLY_CHANGES
+  check_env_exists TEARDOWN_SIMULATE
+  check_env_exists SLACK_SCAN_CHANNEL_ID
+  check_env_exists SLACK_ALERT_CHANNEL_ID
+
+fi
+
+echo "✓ Environment variable validation passed"
+
+# ------------------------------------------------------------
+# Step 4 — Display deployed Lambda environment variables
+#
+# Local mode prints the full environment variables directly
+# from AWS for developer inspection.
+#
+# CI mode skips printing the variables in GitHub Actions logs.
+# ------------------------------------------------------------
+echo ""
+
+if [ "$MODE" = "ci" ]; then
+
+  echo "CI mode detected — skipping environment variable display"
+
+else
+
+  echo "Deployed Lambda environment variables:"
+
+  aws lambda get-function-configuration \
+    --function-name "$FUNCTION_NAME" \
+    --region "$REGION" \
+    --query 'Environment.Variables'
+
+fi
+
+echo ""
+echo "✓ Environment variables retrieved"
+
+# ------------------------------------------------------------
+# Step 5 — Verify CloudWatch log group exists
+#
+# Lambda automatically creates this on first execution.
+#
+# describe-log-groups is a prefix search: it exits 0 and returns
+# an empty list when nothing matches. The exit status alone
+# therefore proves nothing, so the result is filtered for an
+# exact name match and compared explicitly.
+# ------------------------------------------------------------
+echo ""
+echo "Checking CloudWatch log group..."
+
+# With --output text, a JMESPath result of null is printed as "None"
+FOUND_LOG_GROUP=$(aws logs describe-log-groups \
+  --log-group-name-prefix "$LOG_GROUP" \
+  --region "$REGION" \
+  --query "logGroups[?logGroupName=='${LOG_GROUP}'].logGroupName | [0]" \
+  --output text)
+
+if [ "$FOUND_LOG_GROUP" != "$LOG_GROUP" ]; then
+  echo ""
+  echo "ERROR: CloudWatch log group not found"
+  echo "Log group: $LOG_GROUP"
+  echo ""
+  exit 1
+fi
+
+echo "✓ Log group exists"
+
+# ------------------------------------------------------------
+# Step 6 — Verify Lambda Function URL
+#
+# Slack slash commands depend on this endpoint.
+# ------------------------------------------------------------
+echo ""
+echo "Checking Lambda Function URL..."
+ +aws lambda get-function-url-config \ + --function-name "$FUNCTION_NAME" \ + --region "$REGION" >/dev/null + +echo "✓ Function URL configured" + +echo "" +echo "--------------------------------------" +echo "Smoke test completed successfully" +echo "--------------------------------------" +echo "" \ No newline at end of file diff --git a/tools/test_event.json b/tools/test_event.json new file mode 100644 index 0000000..5cf32ec --- /dev/null +++ b/tools/test_event.json @@ -0,0 +1,3 @@ +{} + + diff --git a/tools/trust-policy.json b/tools/trust-policy.json new file mode 100644 index 0000000..27875d5 --- /dev/null +++ b/tools/trust-policy.json @@ -0,0 +1,14 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "lambda.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} + + diff --git a/tools/validate_teardown.sh b/tools/validate_teardown.sh new file mode 100755 index 0000000..cd84348 --- /dev/null +++ b/tools/validate_teardown.sh @@ -0,0 +1,680 @@ +#!/usr/bin/env bash + +# ------------------------------------------------------------------ +# Bloodhound Controlled Teardown Validation Script +# +# Purpose +# ------- +# This script validates the full Bloodhound teardown pipeline. +# +# It verifies that Bloodhound can: +# 1. Detect an AWS resource during a scan +# 2. Include the resource in the teardown plan +# 3. Execute the deletion successfully +# +# Validation Workflow +# ------------------- +# 1. Terraform creates a temporary EC2 instance +# 2. Engineer runs `/v2_seek` in Slack to confirm detection +# 3. Engineer runs `/v2_seek_destroy CONFIRM` +# 4. Bloodhound deletes the instance +# 5. 
Script verifies the instance no longer exists
+#
+# All validation runs produce:
+#
+# • a detailed run log (logs/validation/)
+# • a history record (logs/validation_history.log)
+#
+# Documentation reference:
+# docs/validate_teardown.md
+# ------------------------------------------------------------------
+
+# ------------------------------------------------------------------
+# Ensure script is executed from repository root
+#
+# This prevents path issues when referencing directories like:
+# logs/
+# infra/
+# tools/
+#
+# Always run validation via:
+#
+# ./tools/run_validation_workflow.sh
+# ------------------------------------------------------------------
+if [ ! -d "tools" ] || [ ! -d "infra" ]; then
+  echo "ERROR: run this script from the repository root."
+  echo "Example:"
+  echo " ./tools/run_validation_workflow.sh"
+  exit 1
+fi
+
+# Stop the script immediately if any command fails
+set -euo pipefail
+
+# Print an explicit failure message (e.g. in CI logs) whenever a
+# command fails, then exit non-zero.
+trap 'echo "Validation script failed"; exit 1' ERR
+
+# ------------------------------------------------------------------
+# Validation Mode Configuration
+#
+# Default behavior:
+# automated validation (CI/CD safe)
+#
+# Optional:
+# --manual enables Slack confirmation steps so engineers can
+# visually verify the scan and teardown plan.
+#
+# Examples:
+#
+# ./tools/run_validation_workflow.sh
+# ./tools/run_validation_workflow.sh --manual
+#
+# NOTE(review): the workflow runner appears to pass only the run ID
+# to this script; confirm that it forwards --manual before relying
+# on the second example.
+# ------------------------------------------------------------------
+
+MANUAL_MODE=false
+
+# Scan every argument instead of reading "$1" directly: with
+# `set -u` active, a bare "$1" aborts the script with "unbound
+# variable" when it is invoked without arguments. Scanning also
+# lets --manual appear in any position.
+for arg in "$@"; do
+  if [ "$arg" = "--manual" ]; then
+    MANUAL_MODE=true
+  fi
+done
+
+# ------------------------------------------------------------------
+# Disable AWS CLI Pager
+#
+# Some AWS CLI commands automatically open output in a pager
+# (usually "less"), which pauses script execution and displays
+# an "(END)" prompt until the user exits manually.
+#
+#
+# Setting AWS_PAGER="" disables the pager so AWS CLI commands
+# print directly to stdout instead of launching an interactive
+# viewer.
+# ------------------------------------------------------------------ +export AWS_PAGER="" + +# ------------------------------------------------------------------ +# Load environment variables +# +# This loads the .env file so validation scripts can access +# configuration values such as EXPECTED_AWS_ACCOUNT_ID. +# ------------------------------------------------------------------ + +if [ -f ".env" ]; then + source .env +fi + +# CI fallback values +APPLY_CHANGES="${APPLY_CHANGES:-false}" +TEARDOWN_SIMULATE="${TEARDOWN_SIMULATE:-true}" +TEARDOWN_MAX_DELETE_COUNT="${TEARDOWN_MAX_DELETE_COUNT:-5}" + +# -------------------------------------------------- +# CI fallback for account validation +# In CI, .env is not available. Use AWS STS to determine +# the current account ID from the authenticated AWS credentials. +# -------------------------------------------------- +EXPECTED_AWS_ACCOUNT_ID="${EXPECTED_AWS_ACCOUNT_ID:-$(aws sts get-caller-identity \ + --query Account \ + --output text)}" + +# ------------------------------------------------------------------ +# AWS Account Safety Guard +# +# Prevents validation tests from running in the wrong AWS account. +# ------------------------------------------------------------------ + +CURRENT_ACCOUNT_ID=$(aws sts get-caller-identity \ + --query Account \ + --output text) + +if [ "$CURRENT_ACCOUNT_ID" != "$EXPECTED_AWS_ACCOUNT_ID" ]; then + echo "" + echo "ERROR: Wrong AWS account detected." + echo "" + echo "Expected: $EXPECTED_AWS_ACCOUNT_ID" + echo "Actual: $CURRENT_ACCOUNT_ID" + echo "" + echo "Refusing to run teardown validation." + echo "" + exit 1 +fi + + +# ------------------------------------------------------------------ +# Configuration Consistency Check +# +# APPLY_CHANGES and TEARDOWN_SIMULATE must not both be true. +# This configuration is contradictory and indicates a misconfigured +# environment. +# +# If APPLY_CHANGES=true → real deletions should occur. +# If TEARDOWN_SIMULATE=true → no deletions should occur. 
+# ------------------------------------------------------------------
+
+if [ "$APPLY_CHANGES" = "true" ] && [ "$TEARDOWN_SIMULATE" = "true" ]; then
+  echo ""
+  echo "ERROR: Invalid teardown configuration."
+  echo ""
+  echo "APPLY_CHANGES=true"
+  echo "TEARDOWN_SIMULATE=true"
+  echo ""
+  echo "Both modes cannot be enabled simultaneously."
+  echo "Fix the configuration in your .env file."
+  echo ""
+  exit 1
+fi
+
+# ------------------------------------------------------------------
+# Validation Run Identifier
+#
+# Each validation run receives a unique ID. This ID is used to:
+# - name the validation log file
+# - track validation activity
+#
+# The ID is the first argument that is not an option flag. When no
+# such argument is given, a timestamp is generated instead.
+# ------------------------------------------------------------------
+
+RUN_ID=""
+
+# Iterate over "$@" rather than reading "$1" directly: under
+# `set -u` a bare "$1" aborts when no arguments are passed, which
+# made the timestamp fallback below unreachable. Skipping "--*"
+# arguments also keeps a flag such as --manual from being used as
+# the run ID (and therefore as part of the log file name).
+for arg in "$@"; do
+  case "$arg" in
+    --*) ;;
+    *)
+      RUN_ID="$arg"
+      break
+      ;;
+  esac
+done
+
+if [ -z "$RUN_ID" ]; then
+  RUN_ID=$(date +"%Y%m%d_%H%M%S")
+fi
+
+
+# ------------------------------------------------------------------
+# Validation Logging Setup
+#
+# logs/validation/ → detailed logs for individual runs
+# logs/validation_history.log → append-only validation history
+# ------------------------------------------------------------------
+LOG_DIR="logs/validation"
+HISTORY_FILE="logs/validation_history.log"
+
+mkdir -p "$LOG_DIR"
+touch "$HISTORY_FILE"
+
+LOG_FILE="$LOG_DIR/teardown_validation_${RUN_ID}.log"
+
+# ------------------------------------------------------------------
+# Logging helper
+#
+# Writes messages to both:
+# • terminal output
+# • validation log file
+#
+# This allows engineers to see progress live while preserving
+# a full execution record for debugging.
+# ------------------------------------------------------------------
+# log MESSAGE
+#
+# Prints MESSAGE to stdout and appends the same line to $LOG_FILE
+# (tee -a), so the log file accumulates across calls.
+log() {
+  echo "$1" | tee -a "$LOG_FILE"
+}
+
+log "Validation Run ID: $RUN_ID"
+
+
+# ------------------------------------------------------------------
+# AWS Environment Configuration
+# ------------------------------------------------------------------
+
+LOG_GROUP="/aws/lambda/BloodhoundLambdaV2"
+REGION="us-west-2"
+
+
+# ------------------------------------------------------------------
+# Optional: Stream Lambda Logs
+#
+# Engineers can open another terminal to watch the Lambda execution
+# in real time while the validation test runs.
+# ------------------------------------------------------------------
+
+echo ""
+echo "Optional: stream Lambda logs during validation."
+echo "Open a second terminal and run:"
+echo ""
+echo "aws logs tail $LOG_GROUP --region $REGION --follow"
+echo ""
+
+
+echo "----------------------------------------"
+echo "Bloodhound Controlled Teardown Validation"
+echo "----------------------------------------"
+
+
+# ------------------------------------------------------------------
+# Step 1 — Create Disposable EC2 Instance
+#
+# Terraform creates the temporary validation resource defined in:
+#
+# infra/test_resource.tf
+#
+# This instance exists only for validation and is tagged for easy
+# identification in the AWS console.
+# ------------------------------------------------------------------
+
+echo ""
+log "Step 1: Creating disposable EC2 instance"
+echo ""
+
+# Initialize Terraform (required for CI environments)
+terraform -chdir=infra init -input=false
+
+# The run ID is passed to Terraform as the validation_run_id variable;
+# enable_validation_resources=true switches the validation resource on.
+terraform -chdir=infra apply \
+  -var "validation_run_id=$RUN_ID" \
+  -var "enable_validation_resources=true" \
+  -auto-approve
+
+
+# ------------------------------------------------------------------
+# Step 2 — Capture Instance ID
+#
+# Terraform outputs the instance ID which is used later to verify
+# the deletion occurred successfully.
+# ------------------------------------------------------------------ + +echo "" +echo "Capturing instance ID from Terraform output..." +echo "" + +INSTANCE_ID=$(terraform -chdir=infra output -raw bloodhound_test_instance_id) + +# ------------------------------------------------------------------ +# Safety Guard — Ensure Terraform returned a valid instance ID +# +# If INSTANCE_ID is empty, something failed during Terraform +# provisioning and the validation workflow must stop immediately. +# ------------------------------------------------------------------ + +if [ -z "$INSTANCE_ID" ]; then + echo "" + echo "ERROR: Terraform did not return a valid instance ID." + echo "Validation cannot continue." + exit 1 +fi + +echo "Instance created:" +echo "$INSTANCE_ID" + + +# ------------------------------------------------------------------ +# Allow time for AWS APIs to propagate the new instance +# ------------------------------------------------------------------ + +echo "" +echo "Waiting for instance to become visible to AWS APIs..." +sleep 10 + + +# ------------------------------------------------------------------ +# Step 3 — Slack Scan Validation +# +# Engineer manually runs the Slack command /v2_seek to confirm the +# instance appears in the scan results. + + +echo "" +echo "----------------------------------------" +log "Scan Confirmation Step" +echo "----------------------------------------" +echo "" + +# ------------------------------------------------------------------ +# Manual verification mode +# +# Engineers can observe the scan results via Slack before +# continuing the validation workflow. +# ------------------------------------------------------------------ + +if [ "$MANUAL_MODE" = true ]; then + + echo "Run this command in Slack:" + echo "" + echo " /v2_seek" + echo "" + echo "Confirm the EC2 instance appears in the scan results." + echo "" + + read -p "Press ENTER once /v2_seek has confirmed detection..." 
+ +else + + echo "Skipping Slack scan confirmation (automated validation mode)." + +fi + + +# ------------------------------------------------------------------ +# Deletion Safety Guard +# +# Prevents the validation workflow from proceeding if the teardown +# candidate set exceeds the configured safety limit. +# +# This protects against: +# - scanning bugs +# - unexpected AWS API responses +# - misconfigured filters +# +# The limit is defined in the .env file: +# TEARDOWN_MAX_DELETE_COUNT +# ------------------------------------------------------------------ + +EXPECTED_DELETE_COUNT=1 + +if [ "$EXPECTED_DELETE_COUNT" -gt "$TEARDOWN_MAX_DELETE_COUNT" ]; then + echo "" + echo "ERROR: Deletion safety limit exceeded." + echo "" + echo "Expected deletions: $EXPECTED_DELETE_COUNT" + echo "Maximum allowed: $TEARDOWN_MAX_DELETE_COUNT" + echo "" + echo "Validation aborted." + echo "" + exit 1 +fi + + +# ------------------------------------------------------------------ +# Step 4 — Execute Teardown +# +# Engineer manually runs the Slack command that triggers deletion. +# ------------------------------------------------------------------ + +echo "" +echo "----------------------------------------" +log "Triggering Bloodhound Validation Teardown" +echo "----------------------------------------" +echo "" + +# ------------------------------------------------------------------ +# Automated validation teardown +# +# Instead of requiring a Slack command, the validation script +# directly invokes the Lambda function with a validation payload. +# +# This ensures CI/CD pipelines can run validation automatically. +# ------------------------------------------------------------------ +# --------------------------------------------------------------- +# --------------------------------------------------------------- +# Build validation payload +# +# The payload is written to /tmp so the repository directory +# is not polluted with temporary files during validation runs. 
+# +# /tmp is safe for this purpose because: +# +# • files are automatically cleared by the OS eventually +# • the file only needs to exist during this script run +# • it avoids shell quoting issues with inline JSON +# --------------------------------------------------------------- + +PAYLOAD_FILE="/tmp/bloodhound_validation_payload.json" + +cat > "$PAYLOAD_FILE" </dev/null || { + echo "" + echo "ERROR: Lambda returned failure response." + echo "" + echo "Full Lambda response:" + cat "$RESULT_FILE" + echo "" + exit 1 +} + +echo "" +echo "Validating Lambda teardown execution metrics..." + +# --------------------------------------------------------------- +# Expected Lambda response structure +# +# The Lambda teardown response contains execution statistics +# describing what actions were attempted and whether any failed. +# +# Example expected response fragment: +# +# { +# "teardown": { +# "execution": { +# "attempted": 1, +# "succeeded": 1, +# "failed": 0 +# } +# } +# } +# +# What this script validates: +# +# failed == 0 → no teardown errors occurred +# succeeded >= 1 → at least one resource was deleted +# +# If either condition is not met, validation fails immediately. +# --------------------------------------------------------------- + + +# Extract execution metrics from the Lambda response JSON +ATTEMPTED=$(jq '.teardown.execution.attempted // 0' "$RESULT_FILE") +SUCCEEDED=$(jq '.teardown.execution.succeeded // 0' "$RESULT_FILE") +FAILED=$(jq '.teardown.execution.failed // 0' "$RESULT_FILE") + +# Ensure values are numeric +if ! [[ "$FAILED" =~ ^[0-9]+$ ]]; then + echo "ERROR: Invalid Lambda response format." 
+ cat "$RESULT_FILE" + exit 1 +fi + + +# Display the extracted metrics for visibility in logs +echo "Execution metrics:" +echo " attempted: $ATTEMPTED" +echo " succeeded: $SUCCEEDED" +echo " failed: $FAILED" + + +# --------------------------------------------------------------- +# Fail immediately if Lambda reported teardown errors +# --------------------------------------------------------------- + +if [ "$FAILED" -ne 0 ]; then + echo "" + echo "ERROR: Lambda reported teardown failures." + exit 1 +fi + + +# --------------------------------------------------------------- +# Ensure at least one resource was successfully deleted +# --------------------------------------------------------------- + +if [ "$SUCCEEDED" -lt 1 ]; then + echo "" + echo "ERROR: Lambda did not execute any teardown actions." + exit 1 +fi + + +echo "Lambda execution metrics validated successfully." + +# ------------------------------------------------------------------ +# Optional Slack verification (manual mode) +# +# Engineers may optionally observe the teardown behavior using +# Slack commands before the script verifies deletion. +# This step is skipped during automated CI validation. +# ------------------------------------------------------------------ + +if [ "$MANUAL_MODE" = true ]; then + + echo "" + echo "Optional verification using Slack:" + echo "" + echo " /v2_seek_destroy CONFIRM" + echo "" + + read -p "Press ENTER once Slack teardown results are confirmed..." + +fi + + + +echo "" +log "Step 6: Verifying instance deletion..." +echo "" + +# --------------------------------------------------------------- +# Adaptive termination verification +# +# Instead of sleeping a fixed amount of time, poll AWS until the +# instance reaches the "terminated" state. +# +# This makes validation faster when AWS responds quickly and +# still safe when termination takes longer. 
+# --------------------------------------------------------------- + +MAX_WAIT=120 # maximum time to wait (seconds) +WAIT_INTERVAL=3 # poll interval +ELAPSED=0 + +while [ $ELAPSED -lt $MAX_WAIT ]; do + + STATE=$(aws ec2 describe-instances \ + --instance-ids "$INSTANCE_ID" \ + --region "$REGION" \ + --query "Reservations[].Instances[].State.Name" \ + --output text 2>/dev/null || echo "terminated") + + if [ "$STATE" = "terminated" ] || [ "$STATE" = "shutting-down" ]; then + + RESULT="PASS" + + echo "" + log "SUCCESS: Instance terminated confirmed." + log "RESULT: $RESULT" + + # record validation result in append-only history file + echo "$RUN_ID $RESULT" >> "$HISTORY_FILE" + + echo "Bloodhound successfully deleted the resource." + + break + fi + + echo "Instance state: $STATE (waiting...)" + + sleep $WAIT_INTERVAL + ELAPSED=$((ELAPSED + WAIT_INTERVAL)) + +done + + +# --------------------------------------------------------------- +# Failure condition +# --------------------------------------------------------------- + +if [ "$STATE" != "terminated" ] && [ "$STATE" != "shutting-down" ]; then + + RESULT="FAIL" + + echo "" + log "ERROR: Instance still exists after waiting." + log "RESULT: $RESULT" + + # record validation result in append-only history file + echo "$RUN_ID $RESULT" >> "$HISTORY_FILE" + + echo "Bloodhound did not delete the resource." + exit 1 + +fi + + +# ------------------------------------------------------------------ +# Step 7 — Restore Safe Terraform Configuration +# +# Ensures the infrastructure returns to the safe configuration. +# ------------------------------------------------------------------ + +terraform -chdir=infra apply \ + -var="enable_validation_resources=false" \ + -auto-approve + + +# ------------------------------------------------------------------ +# Step 8 — Reconcile Terraform State +# +# The validation resource was intentionally deleted by the +# Bloodhound Lambda during the teardown test. 
+# +# Terraform must refresh its state to recognize that the +# resource no longer exists. +# +# Applying with enable_validation_resources=false ensures +# Terraform returns the infrastructure to the safe default +# configuration. +# ------------------------------------------------------------------ + +echo "" +log "Step 8: Reconciling Terraform state" +echo "" + +terraform -chdir=infra apply \ + -var="enable_validation_resources=false" \ + -auto-approve + + +echo "" +echo "----------------------------------------" +echo "Validation Complete" +echo "----------------------------------------" +echo "" +echo "Teardown pipeline verified." +echo "" + + +# ------------------------------------------------------------------ +# Validation Log Retention +# +# Keep only the 3 most recent detailed validation logs. +# Older logs are removed automatically to prevent the +# validation directory from growing indefinitely. +# +# Note: +# validation_history.log still records every validation run. +# ------------------------------------------------------------------ + +log "Keeping only the 3 most recent validation logs." + +ls -1t "$LOG_DIR"/teardown_validation_* 2>/dev/null | tail -n +4 | xargs -r rm \ No newline at end of file diff --git a/trust-policy.json b/trust-policy.json deleted file mode 100644 index 942abb3..0000000 --- a/trust-policy.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": "lambda.amazonaws.com" - }, - "Action": "sts:AssumeRole" - } - ] -} -