amd · marikurz-amd · May 18, 2026 · May 18, 2026 · May 19, 2026
diff --git a/MLExamples/TinyTransformer/README.md b/MLExamples/TinyTransformer/README.md
@@ -42,7 +42,7 @@ This workshop follows a progressive optimization methodology with four implement
 | **V3 Triton** | 156,652 | 52.3 | 51.3 | 0.6 | 0.4 | 916.2 | **3.13x** |
 | **V4 Ultra** | 157,169 | 52.1 | 51.1 | 0.6 | 0.4 | 916.5 | **3.14x** |
 
-**See [PERFORMANCE_RESULTS.md](PERFORMANCE_RESULTS.md) for complete analysis**
+Performance figures for the small and medium configurations are summarized in the tables above and in [Key Performance Insights](#key-performance-insights).
 
 ### Profiling Tools Progression
 
@@ -70,15 +70,15 @@ Each version introduces additional profiling capabilities:
 
 ## Quick Start
 
-### 0. Set up environment
-On the training cluster's compute node, the required environment may be set up using the following
-commands:
+### 0. Set up and verify environment
+On the training cluster's compute node, load the modules (adjust names/versions for your site):
 
 ```bash
 module load rocm pytorch openmpi rocprofiler-compute rocprofiler-systems/develop
 ```
 
-### 1. Verify Environment
+Then confirm ROCm, PyTorch, and the GPU(s) are setup correctly:
+
 ```bash
 # Check ROCm installation
 rocminfo
@@ -90,7 +90,7 @@ rocm-smi
 python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA Available: {torch.cuda.is_available()}'); print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"N/A\"}')"
 ```
 
-### 2. Run Version 1 (Baseline) - 5 minutes
+### 1. Run Version 1 (Baseline) - 5 minutes
 ```bash
 cd version1_pytorch_baseline/
 python tiny_llama_v1.py --batch-size 8 --seq-len 128 --num-steps 20
@@ -105,7 +105,7 @@ For a deeper analysis with the PyTorch profiler, and visualizing the output in T
 please follow the workshop exercises in
 [version1_pytorch_baseline/README.md](https://github.com/amd/HPCTrainingExamples/tree/main/MLExamples/TinyTransformer/version1_pytorch_baseline#workshop-exercises).
 
-### 3. Run Version 2 (Fused) - 5 minutes
+### 2. Run Version 2 (Fused) - 5 minutes
 ```bash
 cd version2_pytorch_fused
 python tiny_llama_v2.py --batch-size 8 --seq-len 128 --num-steps 30
@@ -148,7 +148,7 @@ with `rocprof-sys` using the command below:
 ```bash
 rocprof-sys-run --profile --trace -- python tiny_llama_v3.py --batch-size 8 --seq-len 128 --num-steps 30
 ```
-View the trace at [https://ui.perfetto.dev](https://ui.perfetto.dev).
+View the trace with [https://ui.perfetto.dev](https://ui.perfetto.dev).
 
 ### 4. Run Version 4 (Ultra optimized) - 5 minutes
 ```bash
@@ -174,75 +174,33 @@ cd version3_triton/exercises/performance_debugging/
 
 ## Directory Structure
 
+Layout under `MLExamples/TinyTransformer/` in this repository:
+
 ```
-ai-workshop-training/
- README.md                              # This overview
- setup/                                 # Environment and prerequisites
-    environment_setup.md               # Detailed setup instructions
-    environment_setup.sh               # Automated setup script
-    requirements.txt                   # Python dependencies
-    validation_scripts/                # Environment validation
-        test_environment.py            # Comprehensive environment test
-        test_rocm_installation.py      # ROCm stack validation
-        test_profiling_tools.py        # Profiling tools validation
- version1_pytorch_baseline/             # Standard PyTorch implementation
-    README.md                          # Detailed guided instructions
-    tiny_llama_v1.py                   # Enhanced baseline implementation
-    run_pytorch_profiler.py            # PyTorch profiler integration
-    run_deepspeed_flops.py            # DeepSpeed FLOPS profiler
-    run_all_profilers.sh              # Orchestrated profiling script
-    exercises/                         # Hands-on exercises and analysis
-        exercise_1_baseline_analysis.md
-        exercise_2_memory_analysis.md
-        exercise_3_bottleneck_identification.md
- version2_pytorch_fused/                # Fused operations optimization
-    README.md                          # Fusion optimization guide
-    tiny_llama_v2.py                   # Fused implementation
-    run_pytorch_profiler.py            # Enhanced PyTorch profiling
-    run_deepspeed_flops.py            # FLOPS analysis
-    run_rocprofv3.sh                   # rocprofv3 integration
-    run_rocprof_sys.sh                # System profiling
-    run_rocprof_compute.sh             # Kernel-level profiling
-    run_all_profilers.sh              # Complete profiling suite
-    exercises/                         # Advanced profiling exercises
-        exercise_1_fusion_analysis.md
-        exercise_2_flash_attention.md
-        exercise_3_rocm_tools_intro.md
- version3_triton/                       # Triton kernel integration
-    README.md                          # Triton optimization guide
-    tiny_llama_v3.py                   # Triton-enhanced implementation
-    triton_kernels.py                  # Custom Triton kernels
-    run_pytorch_profiler.py            # Framework profiling
-    run_deepspeed_flops.py            # Computational analysis
-    run_rocprofv3.sh                   # Legacy profiling
-    run_rocprof_sys.sh                # System monitoring
-    run_rocprof_compute.sh             # Advanced kernel analysis
-    run_all_profilers.sh              # Complete profiling
-    exercises/                         # Triton development exercises
-        exercise_1_triton_basics.md
-        exercise_2_custom_kernels.md
-        exercise_3_performance_tuning.md
- version4_pytorch_sdpa/                 # Ultra-fused implementation
-    README.md                          # Ultra-optimization guide
-    tiny_llama_v4.py                   # Ultra-fused implementation
-    triton_ultra_kernels.py            # Ultra-fused kernels
-    [profiling scripts]                # Complete profiling suite
-    exercises/                         # Advanced optimization
-        exercise_1_ultra_fusion.md
-        exercise_2_register_optimization.md
-        exercise_3_production_deployment.md
- analysis_tools/                        # Performance analysis utilities
-    compare_versions.py                # Cross-version performance comparison
-    roofline_analysis.py               # Roofline model implementation
-    performance_dashboard.py           # Interactive performance dashboard
-    regression_tester.py               # Automated regression testing
-    report_generator.py                # Comprehensive report generation
- slides/                                # Presentation materials
-     luka_presentation_materials/        # AI workshop slides
-         workshop_overview.pptx
-         profiling_methodology.pptx
-         optimization_techniques.pptx
-         results_analysis.pptx
+TinyTransformer/
+├── README.md
+├── TINY_LLAMA_ARCHITECTURE.md
+├── TECHNICAL_APPENDICES.md
+├── version1_pytorch_baseline/
+│   ├── tiny_llama_v1.py
+│   ├── run_pytorch_profiler.py, run_deepspeed_flops.py, run_all_profilers.sh
+│   ├── run_*.sh, launch_performance_study.sh
+│   └── exercises/
+│       ├── exercise_1_baseline_analysis.md
+│       ├── exercise_2_memory_analysis.md
+│       └── exercise_3_bottleneck_identification.md
+├── version2_pytorch_fused/
+│   ├── tiny_llama_v2.py
+│   ├── run_*.py, run_*.sh, launch_performance_study.sh
+│   └── exercises/
+├── version3_triton/
+│   ├── tiny_llama_v3.py, run_triton_profiling.py, run_rocprof_triton.sh
+│   ├── launch_performance_study.sh
+│   └── exercises/  (including performance_debugging/)
+└── version4_pytorch_sdpa/
+    ├── tiny_llama_v4.py, run_ultra_profiling.py, launch_performance_study.sh
+    └── exercises/
+        └── exercise1_ultra_fusion.md
 ```
 
 ## Workshop Execution Timeline
@@ -345,10 +303,10 @@ Developed for the CASTIEL AI Workshop (October 16, 2024) by HPC/AI performance e
 
 ## License
 
-MIT License - See LICENSE file for details
+MIT License — see the repository [`LICENSE.md`](../../LICENSE.md) at the git root of **HPCTrainingExamples**.
 
 ---
 
-**Ready to start profiling? Begin with the [Environment Setup Guide](setup/environment_setup.md)**
+**Ready to start profiling?** Begin with [Quick Start](#quick-start) (environment modules and first runs) above.
 
 
diff --git a/...yTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md b/...yTransformer/version1_pytorch_baseline/PYTORCH_BASELINE_WORKSHOP_WALKTHROUGH.md
@@ -807,7 +807,7 @@ Understanding the available options:
 ```bash
 --enable-pytorch-profiler   # Enable PyTorch profiler
 --profile-dir ./profiles    # Directory for profile output
---profile-memory            # Include memory profiling
+--enable-memory-profiling   # CUDA alloc records in profiler trace (TensorBoard memory views)
 --profile-operators         # Detailed operator profiling
 --profile-steps 5           # Number of steps to profile
 ```
@@ -934,7 +934,7 @@ python3 tiny_llama_v1.py \
     --num-steps 20 \
     --enable-pytorch-profiler \
     --profile-dir ./pytorch_profiles \
-    --profile-memory
+    --enable-memory-profiling
 ```
 
 **With DeepSpeed FLOPS Profiler:**
@@ -1472,7 +1472,7 @@ python3 tiny_llama_v1.py \
     --seq-len 128 \
     --num-steps 15 \
     --enable-pytorch-profiler \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-dir ./memory_analysis_bs4
 
 # Batch size 8
@@ -1481,7 +1481,7 @@ python3 tiny_llama_v1.py \
     --seq-len 128 \
     --num-steps 15 \
     --enable-pytorch-profiler \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-dir ./memory_analysis_bs8
 
 # Batch size 16
@@ -1490,7 +1490,7 @@ python3 tiny_llama_v1.py \
     --seq-len 128 \
     --num-steps 15 \
     --enable-pytorch-profiler \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-dir ./memory_analysis_bs16
 ```
 
@@ -1618,31 +1618,31 @@ python3 tiny_llama_v1.py \
     --batch-size 8 \
     --seq-len 64 \
     --num-steps 10 \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-dir ./memory_seq64
 
 # Sequence length 128 (baseline)
 python3 tiny_llama_v1.py \
     --batch-size 8 \
     --seq-len 128 \
     --num-steps 10 \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-dir ./memory_seq128
 
 # Sequence length 256
 python3 tiny_llama_v1.py \
     --batch-size 8 \
     --seq-len 256 \
     --num-steps 10 \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-dir ./memory_seq256
 
 # Sequence length 512 (might OOM - use smaller batch if needed)
 python3 tiny_llama_v1.py \
     --batch-size 4 \
     --seq-len 512 \
     --num-steps 5 \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-dir ./memory_seq512
 ```
 
@@ -1694,7 +1694,7 @@ python3 tiny_llama_v1.py \
     --seq-len 128 \
     --num-steps 10 \
     --enable-pytorch-profiler \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-operators \
     --profile-dir ./memory_hotspots
 ```

diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md b/MLExamples/TinyTransformer/version1_pytorch_baseline/README.md
@@ -398,7 +398,7 @@ python tiny_llama_v1.py \
     --batch-size 8 \
     --seq-len 128 \
     --enable-pytorch-profiler \
-    --profile-memory \
+    --enable-memory-profiling \
     --profile-dir ./memory_analysis
 
 # Generate memory timeline visualization

diff --git a/...Transformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md b/...Transformer/version1_pytorch_baseline/exercises/exercise_1_baseline_analysis.md
@@ -8,7 +8,7 @@ Establish baseline performance metrics for Tiny LLaMA V1 and understand the prof
 
 ### Prerequisites
 
-- Completed environment setup from `../setup/`
+- Environment ready for PyTorch + ROCm (see workshop `README.md` [Quick Start](../../README.md#quick-start))
 - Verified environment with validation scripts
 
 ### Duration

diff --git a/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py b/MLExamples/TinyTransformer/version1_pytorch_baseline/tiny_llama_v1.py
@@ -22,7 +22,7 @@
     python tiny_llama_v1.py --enable-pytorch-profiler --profile-dir ./profiles
 
     # With memory profiling
-    python tiny_llama_v1.py --enable-pytorch-profiler --profile-memory
+    python tiny_llama_v1.py --enable-pytorch-profiler --enable-memory-profiling
 
     # Complete profiling suite
     python tiny_llama_v1.py --enable-all-profiling --profile-dir ./complete_analysis
@@ -117,6 +117,7 @@ def reset(self):
         self.metrics = {
             'training_speed': [],
             'memory_usage': [],
+            'gpu_peak_memory_mb': [],
             'gpu_utilization': [],
             'loss_values': [],
             'batch_times': [],
@@ -141,8 +142,14 @@ def end_timing(self) -> float:
         self.start_time = None
         return elapsed
 
-    def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str, float]):
-        """Record metrics for a training batch."""
+    def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str, float],
+                             gpu_peak_memory_mb: Optional[float] = None):
+        """Record metrics for a training batch.
+
+        gpu_peak_memory_mb: per-step peak device memory (bytes->MB) from
+        torch.cuda.max_memory_allocated() after reset_peak_memory_stats() at
+        step start; captures transient activations during backward.
+        """
         self.total_samples += batch_size
         self.metrics['loss_values'].append(loss)
         self.metrics['batch_times'].append(timings.get('total', 0))
@@ -154,6 +161,8 @@ def record_batch_metrics(self, batch_size: int, loss: float, timings: Dict[str,
         if torch.cuda.is_available():
             memory_mb = torch.cuda.memory_allocated() / (1024**2)
             self.metrics['memory_usage'].append(memory_mb)
+            if gpu_peak_memory_mb is not None:
+                self.metrics['gpu_peak_memory_mb'].append(gpu_peak_memory_mb)
 
         # Training speed (samples per second)
         if timings.get('total', 0) > 0:
@@ -175,7 +184,13 @@ def get_summary(self) -> Dict[str, Any]:
             'avg_optimizer_time': np.mean(self.metrics['optimizer_times']),
         }
 
-        if self.metrics['memory_usage']:
+        if self.metrics['gpu_peak_memory_mb']:
+            summary.update({
+                'peak_memory_mb': max(self.metrics['gpu_peak_memory_mb']),
+                'avg_peak_memory_mb': np.mean(self.metrics['gpu_peak_memory_mb']),
+                'avg_memory_mb': np.mean(self.metrics['memory_usage']) if self.metrics['memory_usage'] else 0.0,
+            })
+        elif self.metrics['memory_usage']:
             summary.update({
                 'peak_memory_mb': max(self.metrics['memory_usage']),
                 'avg_memory_mb': np.mean(self.metrics['memory_usage'])
@@ -647,6 +662,9 @@ def train_tiny_llama(
     print("=" * 70)
 
     for step in range(num_steps):
+        if torch.cuda.is_available():
+            torch.cuda.reset_peak_memory_stats()
+
         # Start batch timing
         batch_timings = {}
         monitor.start_timing()
@@ -692,8 +710,14 @@ def train_tiny_llama(
         # Total batch time
         batch_timings['total'] = sum(batch_timings.values())
 
+        peak_mb: Optional[float] = None
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+            peak_mb = torch.cuda.max_memory_allocated() / (1024**2)
+
         # Record metrics
-        monitor.record_batch_metrics(batch_size, loss.item(), batch_timings)
+        monitor.record_batch_metrics(batch_size, loss.item(), batch_timings,
+                                     gpu_peak_memory_mb=peak_mb)
 
         # PyTorch profiler step
         if pytorch_profiler:
@@ -702,12 +726,13 @@ def train_tiny_llama(
         # Progress logging
         if step % 10 == 0:
             speed = batch_size / batch_timings['total'] if batch_timings['total'] > 0 else 0
-            memory_mb = torch.cuda.memory_allocated() / (1024**2) if torch.cuda.is_available() else 0
+            live_mb = torch.cuda.memory_allocated() / (1024**2) if torch.cuda.is_available() else 0
+            peak_log = f"{peak_mb:6.1f}" if peak_mb is not None else "  n/a"
 
             print(f"Step {step:3d}/{num_steps} | "
                   f"Loss: {loss.item():.4f} | "
                   f"Speed: {speed:5.1f} samples/sec | "
-                  f"Memory: {memory_mb:6.1f} MB | "
+                  f"Peak: {peak_log} MB | Live: {live_mb:6.1f} MB | "
                   f"Time: {batch_timings['total']*1000:5.1f}ms")
 
     print("=" * 70)
@@ -743,7 +768,9 @@ def train_tiny_llama(
     print(f"   Final loss: {summary.get('avg_loss', 0):.4f}")
 
     if 'peak_memory_mb' in summary:
-        print(f"   Peak memory usage: {summary['peak_memory_mb']:.1f} MB")
+        print(f"   Peak device memory (high-water per step): {summary['peak_memory_mb']:.1f} MB")
+        if 'avg_peak_memory_mb' in summary:
+            print(f"   Avg peak per step: {summary['avg_peak_memory_mb']:.1f} MB")
 
     # Save performance data
     if profiler_config.profile_dir:
@@ -771,6 +798,7 @@ def train_tiny_llama(
         }
 
         profile_path = Path(profiler_config.profile_dir) / "performance_summary.json"
+        profile_path.parent.mkdir(parents=True, exist_ok=True)
         with open(profile_path, 'w') as f:
             json.dump(profile_data, f, indent=2)