Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# MLPerf Storage — runtime environment configuration
# Copy this file to .env and fill in your values.
# The .env file is gitignored and never committed.
#
# All tests/object-store scripts load .env automatically.
# Values already set in the shell take precedence over .env.
#
# Format: KEY=value, one per line — no spaces around '=', no quoting needed
# unless a value itself contains spaces.

# ── S3 / Object Storage ───────────────────────────────────────────────────────
# Endpoint URL for your S3-compatible storage (MinIO, VAST, AWS S3, etc.)
# Include the scheme (http:// or https://) and the port if non-standard.
AWS_ENDPOINT_URL=http://your-s3-endpoint:9000

# Credentials for the endpoint above (standard AWS SDK variable names).
AWS_ACCESS_KEY_ID=your_access_key
AWS_SECRET_ACCESS_KEY=your_secret_key
# Region label sent to the endpoint; us-east-1 is the usual default for
# S3-compatible stores.
AWS_REGION=us-east-1

# ── Bucket / Storage ──────────────────────────────────────────────────────────
# Target bucket for test data.
# NOTE(review): presumably the bucket must already exist — confirm whether the
# test scripts create it when missing.
BUCKET=mlp-test

# Storage library to use: s3dlio (recommended), minio
STORAGE_LIBRARY=s3dlio

# ── Test tuning (optional) ────────────────────────────────────────────────────
# Number of MPI ranks for parallel data generation
NP=8

# Set to 1 to overwrite existing data without prompting
FORCE=0
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ venv/
.venv/
env/
.env
.env.*
!.env.example
env-*
**/.venv
**/.env
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ code or running benchmarks:
| **[docs/README.md](docs/README.md)** | Complete project overview: all four benchmark workloads, document reference, object storage library guides, and quick-link index to every test script |
| **[tests/README.md](tests/README.md)** | Everything needed to run tests: environment setup, unit tests, integration tests, object-store performance scripts, and how pytest is configured |

Additional quick links:

| Document | What it covers |
|----------|----------------|
| **[docs/OBJECT_STORAGE_GUIDE.md](docs/OBJECT_STORAGE_GUIDE.md)** | All settings required to run against S3-compatible storage with `--object` — `.env` setup, env vars, URI schemes, multi-endpoint |
| **[tests/object-store/bench-results-retinanet-20260425.md](tests/object-store/bench-results-retinanet-20260425.md)** | April 25, 2026 benchmark results: RetinaNet write_threads sweep on s3-ultra (loopback) |

The top-level sections below give the official MLCommons parameter reference and
are retained for submission compliance.

Expand Down
65 changes: 0 additions & 65 deletions configs/dlio/workload/datagen_s3dlio_azure.yaml

This file was deleted.

71 changes: 0 additions & 71 deletions configs/dlio/workload/datagen_s3dlio_multiendpoint.yaml

This file was deleted.

57 changes: 0 additions & 57 deletions configs/dlio/workload/datagen_s3dlio_s3.yaml

This file was deleted.

1 change: 0 additions & 1 deletion configs/dlio/workload/hybrid_storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ storage:
use_mpi_endpoint_distribution: true

storage_options:
region: us-east-1

reader:
data_loader: pytorch
Expand Down
75 changes: 75 additions & 0 deletions configs/dlio/workload/llama3_8b_checkpoint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# LLaMA 3 8B — Generic Checkpoint Workload Config
#
# WORKLOAD PARAMETERS ONLY — no runtime/environment configuration here.
# Runtime parameters (endpoint, bucket, storage library) are supplied via
# environment variables, a .env file, or Hydra overrides on the command line.
#
# Model sizing (ZeRO-3, 8 ranks, fp16 model + fp32 optimizer):
#   Total model+optimizer:    15 GB + 90 GB = 105 GB
#   Per-rank write:           105 GB / 8 ranks ≈ 13.1 GB
#   Per-checkpoint total I/O: ~105 GB write + ~105 GB read = ~210 GB
#
# Usage (via run_checkpointing.sh):
#   cd /path/to/mlp-storage
#   LIBRARY=s3dlio BUCKET=my-bucket bash tests/object-store/run_checkpointing.sh
#
# Usage (direct, with Hydra overrides):
#   cd /path/to/mlp-storage
#   source .env && source .venv/bin/activate
#   DLIO_S3_IMPLEMENTATION=mlp \
#   mpirun -n 1 --allow-run-as-root \
#     .venv/bin/dlio_benchmark \
#     workload=llama3_8b_checkpoint \
#     ++workload.storage.storage_root=${BUCKET} \
#     ++workload.storage.storage_library=${LIBRARY} \
#     ++workload.storage.storage_options.endpoint_url=${AWS_ENDPOINT_URL} \
#     "++workload.checkpoint.checkpoint_folder=s3://${BUCKET}/${LIBRARY}/llama3-8b" \
#     --config-dir=/path/to/mlp-storage/configs/dlio
#
# NOTE(review): the sizing math above assumes 8 ranks, but the direct example
# runs `mpirun -n 1` — confirm the intended rank count for representative I/O.

# Model definition — drives checkpoint size computed by DLIO.
model:
  name: llama_8b
  type: transformer
  num_layers: 32
  model_datatype: fp16       # model weights precision
  optimizer_datatype: fp32   # optimizer state precision (master weights + moments)
  parallelism:
    pipeline: 1
    tensor: 1
    zero_stage: 3            # ZeRO-3: model+optimizer state sharded across ranks
  transformer:
    vocab_size: 128256
    hidden_size: 4096
    ffn_hidden_size: 14336
    num_attention_heads: 32
    num_kv_heads: 8          # grouped-query attention

framework: pytorch

# Checkpoint-only workload: no data generation, no training loop.
workflow:
  generate_data: false
  train: false
  checkpoint: true

# ---------------------------------------------------------------------------
# Storage — values here are PLACEHOLDERS only.
# All storage runtime parameters MUST be supplied via Hydra overrides.
# See run_checkpointing.sh or the Usage section above.
# ---------------------------------------------------------------------------
storage:
  storage_type: s3
  storage_root: BUCKET_PLACEHOLDER     # override: ++workload.storage.storage_root=<bucket>
  storage_library: LIBRARY_PLACEHOLDER # override: ++workload.storage.storage_library=s3dlio|minio

  storage_options:
    endpoint_url: ENDPOINT_PLACEHOLDER # override: ++workload.storage.storage_options.endpoint_url=https://...
    # All other storage_options (region, s3_force_path_style, credentials)
    # are supplied at runtime via Hydra overrides in run_checkpointing.sh

# ---------------------------------------------------------------------------
# Checkpoint
# ---------------------------------------------------------------------------
checkpoint:
  checkpoint_folder: s3://BUCKET_PLACEHOLDER/LIBRARY_PLACEHOLDER/llama3-8b # override at runtime
  time_between_checkpoints: 5   # seconds between checkpoint operations
  num_checkpoints_write: 2
  num_checkpoints_read: 2
1 change: 0 additions & 1 deletion configs/dlio/workload/multi_endpoint_mpi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ storage:
storage_options:
# Credentials come from environment variables — NEVER hardcode in YAML.
# Before running: source /path/to/.env (sets AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
region: us-east-1

reader:
data_loader: pytorch
Expand Down
1 change: 0 additions & 1 deletion configs/dlio/workload/multi_endpoint_roundrobin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ storage:
storage_options:
# Credentials come from environment variables — NEVER hardcode in YAML.
# Before running: source /path/to/.env (sets AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
region: us-east-1

reader:
data_loader: pytorch
Expand Down
Loading
Loading