From d2f403c51c0a6f64c410bdb1fa091a3dca597a05 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 19 May 2026 08:04:55 +0000 Subject: [PATCH] Build sim image via install-sim.sh, drop GHA buildx cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The multi-stage Dockerfile + inlined pip install was hard to trace when the buildx GHA cache appeared to serve stale layers (cache-from/cache-to scoped only by platform, shared across all branches). Collapse to a single stage that just runs scripts/install-sim.sh — the same path used for bare-metal install — so the image build is the install script, full stop. Also drop the cache-from/cache-to directives from build-sim-image.yml. The image build is short enough without them, and removing the cache eliminates the class of "image rebuilt but still has old behaviour" mystery the multi-stage layout enabled. --- .github/workflows/build-sim-image.yml | 9 +- scripts/sim.Dockerfile | 114 ++++++-------------------- 2 files changed, 25 insertions(+), 98 deletions(-) diff --git a/.github/workflows/build-sim-image.yml b/.github/workflows/build-sim-image.yml index a86727e1..f2851c65 100644 --- a/.github/workflows/build-sim-image.yml +++ b/.github/workflows/build-sim-image.yml @@ -73,9 +73,8 @@ jobs: runs-on: ${{ matrix.runner }} steps: - # ``linux/amd64`` → ``linux-amd64``. Used as a key for the digest - # artifact and the buildx cache scope so different platforms - # don't trample each other's GHA cache. + # ``linux/amd64`` → ``linux-amd64``. Used as the key for the + # digest artifact so different platforms don't collide. - name: Sanitise platform name id: plat run: | @@ -113,8 +112,6 @@ jobs: file: scripts/sim.Dockerfile platforms: ${{ matrix.platform }} push: false - cache-from: type=gha,scope=${{ steps.plat.outputs.key }} - cache-to: type=gha,mode=max,scope=${{ steps.plat.outputs.key }} # main / dispatch: push by digest. 
The image gets uploaded to the # registry under its content-addressable digest; no tag is set @@ -128,8 +125,6 @@ jobs: file: scripts/sim.Dockerfile platforms: ${{ matrix.platform }} outputs: type=image,name=${{ steps.img.outputs.ref }},push-by-digest=true,name-canonical=true,push=true - cache-from: type=gha,scope=${{ steps.plat.outputs.key }} - cache-to: type=gha,mode=max,scope=${{ steps.plat.outputs.key }} - name: Stash digest for the merge job if: github.event_name != 'pull_request' diff --git a/scripts/sim.Dockerfile b/scripts/sim.Dockerfile index 0ac66f7e..55b4828f 100644 --- a/scripts/sim.Dockerfile +++ b/scripts/sim.Dockerfile @@ -1,79 +1,28 @@ -# LLMServingSim simulator image (slim, multi-stage). +# LLMServingSim simulator image. # # Built by .github/workflows/build-sim-image.yml on every push to main # that touches scripts/install-sim.sh, the astra-sim submodule, or this # file. Published to ghcr.io///sim. # -# Stage 1 (builder) carries the C++ toolchain to compile ASTRA-Sim; -# Stage 2 (runtime) carries only the python interpreter, the -# simulator's runtime python deps, the compiled ASTRA-Sim binary, the -# chakra package, the repo source, and the ``.git/`` tree (kept so -# users can ``git log`` / ``git rev-parse`` / ``git submodule status`` -# inside a running container to identify the exact revision they're -# debugging against). .dockerignore at the repo root strips -# perf/results/venv/CMake artefacts from the context so neither stage -# sees them. +# Single stage: install-sim.sh does the full setup (apt deps, python +# deps, ASTRA-Sim build, chakra package), so this Dockerfile just +# bootstraps a base image and hands off. The multi-stage / pip-inline +# version was hard to reason about when the GHA buildx cache served +# stale layers; replacing it with a single ``RUN install-sim.sh`` keeps +# the image build path identical to the bare-metal install path. 
# -# install-sim.sh also pulls workload-generator / bench / power-model -# training deps (transformers, datasets, scikit-learn, xgboost, -# matplotlib). Those are NOT installed here: the sim image is for -# `python -m serving` only, and workload generation lives in the vLLM -# docker image (scripts/docker-vllm.sh). +# The ``.git/`` tree (main repo + submodules) is intentionally KEPT in +# the image so users can ``git log`` / ``git rev-parse`` / ``git +# submodule status`` inside a running container to identify the exact +# revision they're debugging against. ``git gc`` at the end compacts +# the packs so the size impact stays reasonable, and +# ``safe.directory=*`` is set system-wide because the .git/ trees were +# created during the build as root and may now be inspected by a +# different uid under enroot / docker --user. # # Local build: # docker build -f scripts/sim.Dockerfile -t llmservingsim-sim . -# ============================================================================ -# Stage 1: builder — toolchain for ASTRA-Sim native build -# ============================================================================ -FROM ubuntu:24.04 AS builder - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update -qq \ - && apt-get install -y -qq --no-install-recommends \ - git ca-certificates \ - build-essential cmake \ - protobuf-compiler libprotobuf-dev \ - && rm -rf /var/lib/apt/lists/* - -# Same WORKDIR (/workspace) as the runtime stage. ASTRA-Sim's build.sh -# creates legacy-compat symlinks (e.g. AnalyticalAstra/bin/AnalyticalAstra -# → bin/AstraSim_Analytical_Congestion_Unaware) using ABSOLUTE paths -# rooted at ${BUILD_DIR}. If the builder lives at /build but the -# runtime stage mounts the tree at /workspace, those absolute symlinks -# become dangling after the COPY — which broke -# serving/__main__.py's hard-coded legacy binary path on both archs. -# Keep the path identical so the symlinks resolve in both stages. -WORKDIR /workspace -COPY . 
/workspace - -# Init submodules, capture the commit SHA for runtime inspection, -# build the analytical backend, then prune everything we don't need at -# runtime: CMake build trees, downloaded _deps, object files, -# __pycache__. ``.git`` trees are KEPT (main repo + submodules) so the -# image can self-report its version via ``git rev-parse HEAD`` / -# ``git submodule status``. ``git gc`` at the end compacts loose -# objects + repacks so deep local clones don't bloat the image; CI -# checkouts are already shallow. -RUN git config --global --add safe.directory '*' \ - && git submodule update --init --recursive --depth 1 \ - && (git rev-parse HEAD 2>/dev/null || echo unknown) > /workspace/.git-rev \ - && bash astra-sim/build/astra_analytical/build.sh \ - && rm -rf astra-sim/build/astra_analytical/build/CMakeFiles \ - astra-sim/build/astra_analytical/build/_deps \ - astra-sim/build/astra_analytical/build/CMakeCache.txt \ - astra-sim/build/astra_analytical/build/cmake_install.cmake \ - astra-sim/build/astra_analytical/build/Makefile \ - && find astra-sim -name '*.o' -delete \ - && find astra-sim -name '*.a' -delete \ - && find . -type d -name __pycache__ -prune -exec rm -rf {} + \ - && git gc --quiet --prune=now --aggressive \ - && (git submodule foreach --recursive 'git gc --quiet --prune=now --aggressive' || true) - -# ============================================================================ -# Stage 2: runtime — slim, only what's needed to RUN the simulator -# ============================================================================ FROM ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive \ @@ -81,23 +30,15 @@ ENV DEBIAN_FRONTEND=noninteractive \ PIP_NO_CACHE_DIR=1 \ PIP_DISABLE_PIP_VERSION_CHECK=1 -# Runtime apt: python + pip + tls roots + git. No compilers, no -# protobuf C++ runtime — the python protobuf wheel ships its own. 
-# ``git`` is here so ``git rev-parse HEAD`` / ``git log`` / ``git -# submodule status`` work against the bundled .git tree for debugging. -# ``safe.directory=*`` because the .git/ trees were created during the -# builder stage as root and may now be inspected by a different uid -# under enroot / docker --user; without it git refuses with "dubious -# ownership in repository". -RUN apt-get update -qq \ - && apt-get install -y -qq --no-install-recommends \ - python3 python3-pip python-is-python3 \ - ca-certificates git \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ - && git config --system --add safe.directory '*' - WORKDIR /workspace -COPY --from=builder /workspace /workspace +COPY . /workspace + +RUN bash scripts/install-sim.sh \ + && (git rev-parse HEAD 2>/dev/null || echo unknown) > /workspace/.git-rev \ + && git gc --quiet --prune=now --aggressive \ + && (git submodule foreach --recursive 'git gc --quiet --prune=now --aggressive' || true) \ + && git config --system --add safe.directory '*' \ + && (find / -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true) # Sanity-check that the legacy binary symlink resolves. ASTRA-Sim's # build.sh wires AnalyticalAstra/bin/AnalyticalAstra → the canonical @@ -111,13 +52,4 @@ RUN ASTRA_BIN_LEGACY=/workspace/astra-sim/build/astra_analytical/build/Analytica exit 1; \ } -# Simulator runtime python deps. Narrower than scripts/install-sim.sh -# on purpose — see the header comment. -RUN pip3 install --quiet --no-input \ - pyyaml pyinstrument msgspec \ - pandas numpy rich protobuf \ - && pip3 install --quiet --no-input --no-deps \ - /workspace/astra-sim/extern/graph_frontend/chakra \ - && find / -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true - CMD ["/bin/bash"]