Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions .github/workflows/build-sim-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,8 @@ jobs:
runs-on: ${{ matrix.runner }}

steps:
# ``linux/amd64`` → ``linux-amd64``. Used as a key for the digest
# artifact and the buildx cache scope so different platforms
# don't trample each other's GHA cache.
# ``linux/amd64`` → ``linux-amd64``. Used as the key for the
# digest artifact so different platforms don't collide.
- name: Sanitise platform name
id: plat
run: |
Expand Down Expand Up @@ -113,8 +112,6 @@ jobs:
file: scripts/sim.Dockerfile
platforms: ${{ matrix.platform }}
push: false
cache-from: type=gha,scope=${{ steps.plat.outputs.key }}
cache-to: type=gha,mode=max,scope=${{ steps.plat.outputs.key }}

# main / dispatch: push by digest. The image gets uploaded to the
# registry under its content-addressable digest; no tag is set
Expand All @@ -128,8 +125,6 @@ jobs:
file: scripts/sim.Dockerfile
platforms: ${{ matrix.platform }}
outputs: type=image,name=${{ steps.img.outputs.ref }},push-by-digest=true,name-canonical=true,push=true
cache-from: type=gha,scope=${{ steps.plat.outputs.key }}
cache-to: type=gha,mode=max,scope=${{ steps.plat.outputs.key }}

- name: Stash digest for the merge job
if: github.event_name != 'pull_request'
Expand Down
114 changes: 23 additions & 91 deletions scripts/sim.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,103 +1,44 @@
# LLMServingSim simulator image (slim, multi-stage).
# LLMServingSim simulator image.
#
# Built by .github/workflows/build-sim-image.yml on every push to main
# that touches scripts/install-sim.sh, the astra-sim submodule, or this
# file. Published to ghcr.io/<owner>/<repo>/sim.
#
# Stage 1 (builder) carries the C++ toolchain to compile ASTRA-Sim;
# Stage 2 (runtime) carries only the python interpreter, the
# simulator's runtime python deps, the compiled ASTRA-Sim binary, the
# chakra package, the repo source, and the ``.git/`` tree (kept so
# users can ``git log`` / ``git rev-parse`` / ``git submodule status``
# inside a running container to identify the exact revision they're
# debugging against). .dockerignore at the repo root strips
# perf/results/venv/CMake artefacts from the context so neither stage
# sees them.
# Single stage: install-sim.sh does the full setup (apt deps, python
# deps, ASTRA-Sim build, chakra package), so this Dockerfile just
# bootstraps a base image and hands off. The multi-stage / pip-inline
# version was hard to reason about when the GHA buildx cache served
# stale layers; replacing it with a single ``RUN install-sim.sh`` keeps
# the image build path identical to the bare-metal install path.
#
# install-sim.sh also pulls workload-generator / bench / power-model
# training deps (transformers, datasets, scikit-learn, xgboost,
# matplotlib). Those are NOT installed here: the sim image is for
# `python -m serving` only, and workload generation lives in the vLLM
# docker image (scripts/docker-vllm.sh).
# The ``.git/`` tree (main repo + submodules) is intentionally KEPT in
# the image so users can ``git log`` / ``git rev-parse`` / ``git
# submodule status`` inside a running container to identify the exact
# revision they're debugging against. ``git gc`` at the end compacts
# the packs so the size impact stays reasonable, and
# ``safe.directory=*`` is set system-wide because the .git/ trees were
# created during the build as root and may now be inspected by a
# different uid under enroot / docker --user.
#
# Local build:
# docker build -f scripts/sim.Dockerfile -t llmservingsim-sim .

# ============================================================================
# Stage 1: builder — toolchain for ASTRA-Sim native build
# ============================================================================
FROM ubuntu:24.04 AS builder

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update -qq \
&& apt-get install -y -qq --no-install-recommends \
git ca-certificates \
build-essential cmake \
protobuf-compiler libprotobuf-dev \
&& rm -rf /var/lib/apt/lists/*

# Same WORKDIR (/workspace) as the runtime stage. ASTRA-Sim's build.sh
# creates legacy-compat symlinks (e.g. AnalyticalAstra/bin/AnalyticalAstra
# → bin/AstraSim_Analytical_Congestion_Unaware) using ABSOLUTE paths
# rooted at ${BUILD_DIR}. If the builder lives at /build but the
# runtime stage mounts the tree at /workspace, those absolute symlinks
# become dangling after the COPY — which broke
# serving/__main__.py's hard-coded legacy binary path on both archs.
# Keep the path identical so the symlinks resolve in both stages.
WORKDIR /workspace
COPY . /workspace

# Init submodules, capture the commit SHA for runtime inspection,
# build the analytical backend, then prune everything we don't need at
# runtime: CMake build trees, downloaded _deps, object files,
# __pycache__. ``.git`` trees are KEPT (main repo + submodules) so the
# image can self-report its version via ``git rev-parse HEAD`` /
# ``git submodule status``. ``git gc`` at the end compacts loose
# objects + repacks so deep local clones don't bloat the image; CI
# checkouts are already shallow.
RUN git config --global --add safe.directory '*' \
&& git submodule update --init --recursive --depth 1 \
&& (git rev-parse HEAD 2>/dev/null || echo unknown) > /workspace/.git-rev \
&& bash astra-sim/build/astra_analytical/build.sh \
&& rm -rf astra-sim/build/astra_analytical/build/CMakeFiles \
astra-sim/build/astra_analytical/build/_deps \
astra-sim/build/astra_analytical/build/CMakeCache.txt \
astra-sim/build/astra_analytical/build/cmake_install.cmake \
astra-sim/build/astra_analytical/build/Makefile \
&& find astra-sim -name '*.o' -delete \
&& find astra-sim -name '*.a' -delete \
&& find . -type d -name __pycache__ -prune -exec rm -rf {} + \
&& git gc --quiet --prune=now --aggressive \
&& (git submodule foreach --recursive 'git gc --quiet --prune=now --aggressive' || true)

# ============================================================================
# Stage 2: runtime — slim, only what's needed to RUN the simulator
# ============================================================================
FROM ubuntu:24.04

ENV DEBIAN_FRONTEND=noninteractive \
PIP_BREAK_SYSTEM_PACKAGES=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1

# Runtime apt: python + pip + tls roots + git. No compilers, no
# protobuf C++ runtime — the python protobuf wheel ships its own.
# ``git`` is here so ``git rev-parse HEAD`` / ``git log`` / ``git
# submodule status`` work against the bundled .git tree for debugging.
# ``safe.directory=*`` because the .git/ trees were created during the
# builder stage as root and may now be inspected by a different uid
# under enroot / docker --user; without it git refuses with "dubious
# ownership in repository".
RUN apt-get update -qq \
&& apt-get install -y -qq --no-install-recommends \
python3 python3-pip python-is-python3 \
ca-certificates git \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& git config --system --add safe.directory '*'

WORKDIR /workspace
COPY --from=builder /workspace /workspace
COPY . /workspace

# Run the shared installer, record the commit SHA for in-container
# inspection, compact the bundled .git trees, and mark every directory
# as safe for git regardless of the inspecting uid.
#
# Only the two best-effort steps (submodule gc and the __pycache__
# sweep) are allowed to fail. Each is wrapped in its own subshell so
# its ``|| true`` cannot leak out: ``a && b && c || true`` parses as
# ``((a && b) && c) || true``, which would turn a failure of
# install-sim.sh (or any other earlier step) into a successful layer.
RUN bash scripts/install-sim.sh \
    && (git rev-parse HEAD 2>/dev/null || echo unknown) > /workspace/.git-rev \
    && git gc --quiet --prune=now --aggressive \
    && (git submodule foreach --recursive 'git gc --quiet --prune=now --aggressive' || true) \
    && git config --system --add safe.directory '*' \
    && (find / -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true)

# Sanity-check that the legacy binary symlink resolves. ASTRA-Sim's
# build.sh wires AnalyticalAstra/bin/AnalyticalAstra → the canonical
Expand All @@ -111,13 +52,4 @@ RUN ASTRA_BIN_LEGACY=/workspace/astra-sim/build/astra_analytical/build/Analytica
exit 1; \
}

# Simulator runtime python deps. Narrower than scripts/install-sim.sh
# on purpose — see the header comment.
# Install the simulator's python packages, then the chakra package from
# the checked-out tree (``--no-deps``: only the package itself).
#
# The __pycache__ sweep is best-effort, so its ``|| true`` lives inside
# a subshell. Left at the end of the bare ``&&`` chain it would apply to
# the whole list and make a failed ``pip3 install`` look like success.
RUN pip3 install --quiet --no-input \
        pyyaml pyinstrument msgspec \
        pandas numpy rich protobuf \
    && pip3 install --quiet --no-input --no-deps \
        /workspace/astra-sim/extern/graph_frontend/chakra \
    && (find / -type d -name __pycache__ -prune -exec rm -rf {} + 2>/dev/null || true)

CMD ["/bin/bash"]
Loading