Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# MLPerf Storage — runtime environment configuration
# Copy this file to .env and fill in your values.
# The .env file is gitignored and never committed.
#
# All tests/object-store scripts load .env automatically.
# Values already set in the shell take precedence over .env.
#
# Format: KEY=value, one per line — no spaces around '=', no quoting needed
# unless a value itself contains spaces.

# ── S3 / Object Storage ───────────────────────────────────────────────────────
# Endpoint URL for your S3-compatible storage (MinIO, VAST, AWS S3, etc.)
# Include the scheme (http:// or https://) and the port if non-standard.
AWS_ENDPOINT_URL=http://your-s3-endpoint:9000

# Credentials for the endpoint above (standard AWS SDK variable names).
AWS_ACCESS_KEY_ID=your_access_key
AWS_SECRET_ACCESS_KEY=your_secret_key
# Region label sent to the endpoint; us-east-1 is the usual default for
# S3-compatible stores.
AWS_REGION=us-east-1

# ── Bucket / Storage ──────────────────────────────────────────────────────────
# Target bucket for test data.
# NOTE(review): presumably the bucket must already exist — confirm whether the
# test scripts create it when missing.
BUCKET=mlp-test

# Storage library to use: s3dlio (recommended), minio
STORAGE_LIBRARY=s3dlio

# ── Test tuning (optional) ────────────────────────────────────────────────────
# Number of MPI ranks for parallel data generation
NP=8

# Set to 1 to overwrite existing data without prompting
FORCE=0
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ venv/
.venv/
env/
.env
.env.*
!.env.example
env-*
**/.venv
**/.env
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ code or running benchmarks:
| **[docs/README.md](docs/README.md)** | Complete project overview: all four benchmark workloads, document reference, object storage library guides, and quick-link index to every test script |
| **[tests/README.md](tests/README.md)** | Everything needed to run tests: environment setup, unit tests, integration tests, object-store performance scripts, and how pytest is configured |

Additional quick links:

| Document | What it covers |
|----------|----------------|
| **[docs/OBJECT_STORAGE_GUIDE.md](docs/OBJECT_STORAGE_GUIDE.md)** | All settings required to run against S3-compatible storage with `--object` — `.env` setup, env vars, URI schemes, multi-endpoint |
| **[tests/object-store/bench-results-retinanet-20260425.md](tests/object-store/bench-results-retinanet-20260425.md)** | April 25, 2026 benchmark results: RetinaNet write_threads sweep on s3-ultra (loopback) |

The top-level sections below give the official MLCommons parameter reference and
are retained for submission compliance.

Expand Down
65 changes: 0 additions & 65 deletions configs/dlio/workload/datagen_s3dlio_azure.yaml

This file was deleted.

71 changes: 0 additions & 71 deletions configs/dlio/workload/datagen_s3dlio_multiendpoint.yaml

This file was deleted.

57 changes: 0 additions & 57 deletions configs/dlio/workload/datagen_s3dlio_s3.yaml

This file was deleted.

1 change: 0 additions & 1 deletion configs/dlio/workload/hybrid_storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ storage:
use_mpi_endpoint_distribution: true

storage_options:
region: us-east-1

reader:
data_loader: pytorch
Expand Down
75 changes: 75 additions & 0 deletions configs/dlio/workload/llama3_8b_checkpoint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# LLaMA 3 8B — Generic Checkpoint Workload Config
#
# WORKLOAD PARAMETERS ONLY — no runtime/environment configuration here.
# Runtime parameters (endpoint, bucket, storage library) are supplied via
# environment variables, a .env file, or Hydra overrides on the command line.
#
# Model sizing (ZeRO-3, 8 ranks, fp16 model + fp32 optimizer):
#   Total model+optimizer:    15 GB + 90 GB = 105 GB
#   Per-rank write:           105 GB / 8 ranks ≈ 13.1 GB
#   Per-checkpoint total I/O: ~105 GB write + ~105 GB read = ~210 GB
#
# Usage (via run_checkpointing.sh):
#   cd /path/to/mlp-storage
#   LIBRARY=s3dlio BUCKET=my-bucket bash tests/object-store/run_checkpointing.sh
#
# Usage (direct, with Hydra overrides):
#   cd /path/to/mlp-storage
#   source .env && source .venv/bin/activate
#   DLIO_S3_IMPLEMENTATION=mlp \
#   mpirun -n 1 --allow-run-as-root \
#     .venv/bin/dlio_benchmark \
#     workload=llama3_8b_checkpoint \
#     ++workload.storage.storage_root=${BUCKET} \
#     ++workload.storage.storage_library=${LIBRARY} \
#     ++workload.storage.storage_options.endpoint_url=${AWS_ENDPOINT_URL} \
#     "++workload.checkpoint.checkpoint_folder=s3://${BUCKET}/${LIBRARY}/llama3-8b" \
#     --config-dir=/path/to/mlp-storage/configs/dlio
#
# NOTE(review): the sizing math above assumes 8 ranks, but the direct example
# runs `mpirun -n 1` — confirm the intended rank count for representative I/O.

# Model definition — drives checkpoint size computed by DLIO.
model:
  name: llama_8b
  type: transformer
  num_layers: 32
  model_datatype: fp16       # model weights precision
  optimizer_datatype: fp32   # optimizer state precision (master weights + moments)
  parallelism:
    pipeline: 1
    tensor: 1
    zero_stage: 3            # ZeRO-3: model+optimizer state sharded across ranks
  transformer:
    vocab_size: 128256
    hidden_size: 4096
    ffn_hidden_size: 14336
    num_attention_heads: 32
    num_kv_heads: 8          # grouped-query attention

framework: pytorch

# Checkpoint-only workload: no data generation, no training loop.
workflow:
  generate_data: false
  train: false
  checkpoint: true

# ---------------------------------------------------------------------------
# Storage — values here are PLACEHOLDERS only.
# All storage runtime parameters MUST be supplied via Hydra overrides.
# See run_checkpointing.sh or the Usage section above.
# ---------------------------------------------------------------------------
storage:
  storage_type: s3
  storage_root: BUCKET_PLACEHOLDER     # override: ++workload.storage.storage_root=<bucket>
  storage_library: LIBRARY_PLACEHOLDER # override: ++workload.storage.storage_library=s3dlio|minio

  storage_options:
    endpoint_url: ENDPOINT_PLACEHOLDER # override: ++workload.storage.storage_options.endpoint_url=https://...
    # All other storage_options (region, s3_force_path_style, credentials)
    # are supplied at runtime via Hydra overrides in run_checkpointing.sh

# ---------------------------------------------------------------------------
# Checkpoint
# ---------------------------------------------------------------------------
checkpoint:
  checkpoint_folder: s3://BUCKET_PLACEHOLDER/LIBRARY_PLACEHOLDER/llama3-8b # override at runtime
  time_between_checkpoints: 5   # seconds between checkpoint operations
  num_checkpoints_write: 2
  num_checkpoints_read: 2
1 change: 0 additions & 1 deletion configs/dlio/workload/multi_endpoint_mpi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ storage:
storage_options:
# Credentials come from environment variables — NEVER hardcode in YAML.
# Before running: source /path/to/.env (sets AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
region: us-east-1

reader:
data_loader: pytorch
Expand Down
1 change: 0 additions & 1 deletion configs/dlio/workload/multi_endpoint_roundrobin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ storage:
storage_options:
# Credentials come from environment variables — NEVER hardcode in YAML.
# Before running: source /path/to/.env (sets AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
region: us-east-1

reader:
data_loader: pytorch
Expand Down
Loading
Loading