diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index bae9ee8..9367d71 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -1,20 +1,168 @@
-name: Benchmarks
+name: Benchmark Matrix
 
 on:
+  push:  # NOTE(review): this change removes the weekly schedule and runs on pushes to one feature branch only — confirm before merging
+    branches: [feature/wire-advanced-features]
   workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0'  # Weekly on Sunday
 
 jobs:
-  full-benchmark:
+  build:  # builds the shared library once; the other jobs download it as the 'libaethalloc' artifact
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: cachix/install-nix-action@v27
         with:
           nix_path: nixpkgs=channel:nixos-unstable
+      - name: Cache Nix store  # NOTE(review): restoring into /nix/store may fail on permissions — confirm this cache is ever hit
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/nix
+            /nix/store
+          key: nix-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/flake.nix', '**/flake.lock') }}
+          restore-keys: |
+            nix-${{ runner.os }}-
+      - name: Cache Cargo  # NOTE(review): `nix build` may not read ~/.cargo or ./target — confirm this cache has any effect
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            cargo-${{ runner.os }}-
       - name: Build
         run: nix build
+      - name: Upload artifact  # publishes result/lib/*.so under the artifact name the benchmark jobs download
+        uses: actions/upload-artifact@v4
+        with:
+          name: libaethalloc
+          path: result/lib/*.so
+
+  benchmark-matrix:  # one job per (benchmark, run_id) pair: 8 benchmarks x 5 repetitions
+    needs: build
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # keep the remaining matrix jobs running when one of them fails
+      matrix:
+        benchmark:  # cmd = command line to run; metric = dotted path into its JSON output; direction = which way is better
+          - name: packet_churn
+            cmd: "/tmp/packet_churn 100000 10000"
+            metric: throughput_ops_per_sec
+            unit: ops/s
+            direction: higher
+          - name: multithread_churn
+            cmd: "/tmp/multithread_churn 8 100000"
+            metric: throughput_ops_per_sec
+            unit: ops/s
+            direction: higher
+          - name: kv_store
+            cmd: "/tmp/kv_store"
+            metric: throughput_ops_per_sec
+            unit: ops/s
+            direction: higher
+          - name: producer_consumer
+            cmd: "/tmp/producer_consumer"
+            metric: throughput_ops_per_sec
+            unit: ops/s
+            direction: higher
+          - name: realloc_churn
+            cmd: "/tmp/realloc_churn 100000 2"
+            metric: latency_ns.avg
+            unit: ns
+            direction: lower
+          - name: realloc_large
+            cmd: "/tmp/realloc_large 10000"
+            metric: latency_ns.avg
+            unit: ns
+            direction: lower
+          - name: fragmentation_churn
+            cmd: "/tmp/fragmentation_churn 50000 10000"
+            metric: latency_ns.avg
+            unit: ns
+            direction: lower
+          - name: fragmentation_rss
+            cmd: "/tmp/fragmentation"
+            metric: summary.final_rss_kb
+            unit: KB
+            direction: lower
+        run_id: [1, 2, 3, 4, 5]  # repetition index; only used to label the Compare output
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download artifact  # fetches the .so uploaded by the build job into ./lib
+        uses: actions/download-artifact@v4
+        with:
+          name: libaethalloc
+          path: ./lib
+      - name: Compile benchmarks  # every matrix job compiles all eight binaries but runs only matrix.benchmark.cmd
+        run: |
+          gcc -O3 -pthread benches/packet_churn.c -o /tmp/packet_churn
+          gcc -O3 -pthread benches/kv_store.c -o /tmp/kv_store
+          gcc -O3 -pthread benches/producer_consumer.c -o /tmp/producer_consumer
+          gcc -O3 -pthread benches/multithread_churn.c -o /tmp/multithread_churn
+          gcc -O3 -pthread benches/fragmentation.c -o /tmp/fragmentation
+          gcc -O3 -pthread benches/realloc_churn.c -o /tmp/realloc_churn
+          gcc -O3 -pthread benches/realloc_large.c -o /tmp/realloc_large
+          gcc -O3 -pthread benches/fragmentation_churn.c -o /tmp/fragmentation_churn
+      - name: Run glibc baseline  # runs the benchmark with the system allocator and exports its JSON as the 'result' output
+        id: glibc  # stdout only: stderr stays in the job log so it cannot corrupt the JSON that Compare parses
+        run: |
+          RESULT=$(${{ matrix.benchmark.cmd }})
+          echo "result<<EOF" >> $GITHUB_OUTPUT
+          echo "$RESULT" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+      - name: Run aethalloc  # same command with the built library preloaded; exports its JSON as the 'result' output
+        id: aethalloc  # stdout only: anything the preloaded library prints on stderr must not end up in the JSON
+        run: |
+          LIB=$(realpath lib/*.so)
+          RESULT=$(LD_PRELOAD="$LIB" ${{ matrix.benchmark.cmd }})
+          echo "result<<EOF" >> $GITHUB_OUTPUT
+          echo "$RESULT" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+      - name: Compare  # prints one line per job: both metric values and aethalloc's percentage delta against glibc
+        run: |
+          python3 -c "
+          import json, os
+          glibc = json.loads(os.environ['GLIBC_RESULT'])
+          aeth = json.loads(os.environ['AETH_RESULT'])
+          metric_path = os.environ['METRIC'].split('.')
+          def get_nested(d, path):
+              # Fail the job on a missing metric instead of reporting a bogus 0.
+              for key in path:
+                  if not isinstance(d, dict) or key not in d:
+                      raise SystemExit('metric ' + '.'.join(path) + ' missing from benchmark output')
+                  d = d[key]
+              return d
+          glibc_val = get_nested(glibc, metric_path)
+          aeth_val = get_nested(aeth, metric_path)
+          delta = ((aeth_val - glibc_val) / glibc_val * 100) if glibc_val > 0 else 0
+          direction = os.environ['DIRECTION']
+          if direction == 'higher':
+              emoji = '🟢' if delta > 0 else '🔴' if delta < 0 else '➖'
+          else:
+              emoji = '🟢' if delta < 0 else '🔴' if delta > 0 else '➖'
+          print(f'{emoji} {os.environ[\"BENCH_NAME\"]} run {os.environ[\"RUN_ID\"]}: glibc={glibc_val:,.2f} | aethalloc={aeth_val:,.2f} | delta={delta:+.1f}%')
+          "
+        env:
+          GLIBC_RESULT: ${{ steps.glibc.outputs.result }}
+          AETH_RESULT: ${{ steps.aethalloc.outputs.result }}
+          METRIC: ${{ matrix.benchmark.metric }}
+          DIRECTION: ${{ matrix.benchmark.direction }}
+          BENCH_NAME: ${{ matrix.benchmark.name }}
+          RUN_ID: ${{ matrix.run_id }}
+
+  summarize:  # measures every benchmark itself in one job and publishes a Markdown report plus raw JSON
+    needs: benchmark-matrix
+    runs-on: ubuntu-latest
+    if: always()  # still runs when some matrix jobs failed
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download artifact  # fetches the .so uploaded by the build job into ./lib
+        uses: actions/download-artifact@v4
+        with:
+          name: libaethalloc
+          path: ./lib
       - name: Compile all benchmarks
         run: |
           gcc -O3 -pthread benches/packet_churn.c -o /tmp/packet_churn
@@ -22,92 +170,132 @@ jobs:
           gcc -O3 -pthread benches/producer_consumer.c -o /tmp/producer_consumer
           gcc -O3 -pthread benches/multithread_churn.c -o /tmp/multithread_churn
           gcc -O3 -pthread benches/fragmentation.c -o /tmp/fragmentation
+          gcc -O3 -pthread benches/realloc_churn.c -o /tmp/realloc_churn
+          gcc -O3 -pthread benches/realloc_large.c -o /tmp/realloc_large
+          gcc -O3 -pthread benches/fragmentation_churn.c -o /tmp/fragmentation_churn
           gcc -O3 benches/tail_latency.c -o /tmp/tail_latency
-          gcc -O3 benches/massive_alloc.c -o /tmp/massive_alloc
-          gcc -O3 benches/corruption_test.c -o /tmp/corruption_test
-      - name: Run all benchmarks
-        id: benchmarks
+      - name: Run full benchmark suite
         run: |
-          AETHALLOC="LD_PRELOAD=$(realpath result/lib/*.so)"
-          
-          echo "## Benchmark Results" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "**Test System:** GitHub Actions ubuntu-latest" >> $GITHUB_STEP_SUMMARY
-          echo "**Date:** $(date -I)" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          
-          echo "### Summary" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "| Benchmark | glibc | AethAlloc | Ratio |" >> $GITHUB_STEP_SUMMARY
-          echo "|-----------|-------|-----------|-------|" >> $GITHUB_STEP_SUMMARY
-          
-          # Packet Churn
-          GLIBC_PC=$(/tmp/packet_churn | jq -r '.throughput_ops_per_sec')
-          AETH_PC=$($AETHALLOC /tmp/packet_churn | jq -r '.throughput_ops_per_sec')
-          RATIO_PC=$(echo "scale=0; $AETH_PC * 100 / $GLIBC_PC" | bc)
-          echo "| Packet Churn | ${GLIBC_PC} | ${AETH_PC} | ${RATIO_PC}% |" >> $GITHUB_STEP_SUMMARY
-          
-          # KV Store
-          GLIBC_KV=$(/tmp/kv_store | jq -r '.throughput_ops_per_sec')
-          AETH_KV=$($AETHALLOC /tmp/kv_store | jq -r '.throughput_ops_per_sec')
-          RATIO_KV=$(echo "scale=0; $AETH_KV * 100 / $GLIBC_KV" | bc)
-          echo "| KV Store | ${GLIBC_KV} | ${AETH_KV} | ${RATIO_KV}% |" >> $GITHUB_STEP_SUMMARY
-          
-          # Producer-Consumer
-          GLIBC_PCS=$(/tmp/producer_consumer | jq -r '.throughput_ops_per_sec')
-          AETH_PCS=$($AETHALLOC /tmp/producer_consumer | jq -r '.throughput_ops_per_sec')
-          RATIO_PCS=$(echo "scale=0; $AETH_PCS * 100 / $GLIBC_PCS" | bc)
-          echo "| Producer-Consumer | ${GLIBC_PCS} | ${AETH_PCS} | ${RATIO_PCS}% |" >> $GITHUB_STEP_SUMMARY
-          
-          # Multithread
-          GLIBC_MT=$(/tmp/multithread_churn | jq -r '.throughput_ops_per_sec')
-          AETH_MT=$($AETHALLOC /tmp/multithread_churn | jq -r '.throughput_ops_per_sec')
-          RATIO_MT=$(echo "scale=0; $AETH_MT * 100 / $GLIBC_MT" | bc)
-          echo "| Multithread (8T) | ${GLIBC_MT} | ${AETH_MT} | ${RATIO_MT}% |" >> $GITHUB_STEP_SUMMARY
-          
-          # Fragmentation
-          GLIBC_RSS=$(/tmp/fragmentation | jq -r '.summary.final_rss_kb')
-          AETH_RSS=$($AETHALLOC /tmp/fragmentation | jq -r '.summary.final_rss_kb')
-          RATIO_RSS=$(echo "scale=1; $GLIBC_RSS / $AETH_RSS" | bc)
-          echo "| Fragmentation RSS | ${GLIBC_RSS} KB | ${AETH_RSS} KB | ${RATIO_RSS}x better |" >> $GITHUB_STEP_SUMMARY
-          
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "### Tail Latency (8 threads, 50K ops each)" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "| Allocator | P50 | P99 | P99.9 | P99.99 | Max |" >> $GITHUB_STEP_SUMMARY
-          echo "|-----------|-----|-----|-------|--------|-----|" >> $GITHUB_STEP_SUMMARY
-          
-          GLIBC_LAT=$(/tmp/tail_latency 8 50000)
-          AETH_LAT=$($AETHALLOC /tmp/tail_latency 8 50000)
-          
-          GLIBC_P50=$(echo "$GLIBC_LAT" | jq -r '.latency_ns.p50')
-          GLIBC_P99=$(echo "$GLIBC_LAT" | jq -r '.latency_ns.p99')
-          GLIBC_P999=$(echo "$GLIBC_LAT" | jq -r '.latency_ns["p99.9"]')
-          GLIBC_P9999=$(echo "$GLIBC_LAT" | jq -r '.latency_ns["p99.99"]')
-          GLIBC_MAX=$(echo "$GLIBC_LAT" | jq -r '.latency_ns.max')
-          
-          AETH_P50=$(echo "$AETH_LAT" | jq -r '.latency_ns.p50')
-          AETH_P99=$(echo "$AETH_LAT" | jq -r '.latency_ns.p99')
-          AETH_P999=$(echo "$AETH_LAT" | jq -r '.latency_ns["p99.9"]')
-          AETH_P9999=$(echo "$AETH_LAT" | jq -r '.latency_ns["p99.99"]')
-          AETH_MAX=$(echo "$AETH_LAT" | jq -r '.latency_ns.max')
-          
-          echo "| glibc | ${GLIBC_P50}ns | ${GLIBC_P99}ns | ${GLIBC_P999}ns | ${GLIBC_P9999}ns | ${GLIBC_MAX}ns |" >> $GITHUB_STEP_SUMMARY
-          echo "| AethAlloc | ${AETH_P50}ns | ${AETH_P99}ns | ${AETH_P999}ns | ${AETH_P9999}ns | ${AETH_MAX}ns |" >> $GITHUB_STEP_SUMMARY
-          
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "### Massive Allocations" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo '```' >> $GITHUB_STEP_SUMMARY
-          echo "=== glibc ===" >> $GITHUB_STEP_SUMMARY
-          /tmp/massive_alloc >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "=== AethAlloc ===" >> $GITHUB_STEP_SUMMARY
-          $AETHALLOC /tmp/massive_alloc >> $GITHUB_STEP_SUMMARY
-          echo '```' >> $GITHUB_STEP_SUMMARY
-          
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "### Corruption Test" >> $GITHUB_STEP_SUMMARY
-          echo '```' >> $GITHUB_STEP_SUMMARY
-          $AETHALLOC /tmp/corruption_test >> $GITHUB_STEP_SUMMARY
-          echo '```' >> $GITHUB_STEP_SUMMARY
+          python3 << 'PYEOF'
+          """Benchmark glibc malloc against the LD_PRELOAD-ed aethalloc library.
+
+          Writes a Markdown report to $GITHUB_STEP_SUMMARY and the per-run samples
+          to benchmark-results.json; both come from a single measurement pass.
+          """
+          import subprocess, json, statistics, os
+
+          LIB_PATH = subprocess.check_output("realpath lib/*.so", shell=True).decode().strip()
+
+          # (name, command, dotted metric path into the JSON output, unit, better direction)
+          benchmarks = [
+              ("packet_churn", "/tmp/packet_churn 100000 10000", "throughput_ops_per_sec", "ops/s", "higher"),
+              ("multithread_churn", "/tmp/multithread_churn 8 100000", "throughput_ops_per_sec", "ops/s", "higher"),
+              ("kv_store", "/tmp/kv_store", "throughput_ops_per_sec", "ops/s", "higher"),
+              ("producer_consumer", "/tmp/producer_consumer", "throughput_ops_per_sec", "ops/s", "higher"),
+              ("realloc_churn", "/tmp/realloc_churn 100000 2", "latency_ns.avg", "ns", "lower"),
+              ("realloc_large", "/tmp/realloc_large 10000", "latency_ns.avg", "ns", "lower"),
+              ("fragmentation_churn", "/tmp/fragmentation_churn 50000 10000", "latency_ns.avg", "ns", "lower"),
+              ("fragmentation_rss", "/tmp/fragmentation", "summary.final_rss_kb", "KB", "lower"),
+          ]
+
+          runs = 5
+
+
+          def run_json(cmd):
+              """Run `cmd` through the shell and parse its stdout as JSON."""
+              # stderr stays on the job log so stray diagnostics cannot corrupt the JSON.
+              out = subprocess.check_output(cmd, shell=True, timeout=120).decode()
+              return json.loads(out.strip())
+
+
+          def extract(doc, metric):
+              """Follow a dotted path such as 'latency_ns.avg' into nested dicts."""
+              val = doc
+              for part in metric.split("."):
+                  # A KeyError/TypeError here marks the run as failed (see the caller)
+                  # instead of averaging a bogus 0 into the results.
+                  val = val[part]
+              return val
+
+
+          def mean_stdev(vals):
+              """Return (mean, sample stdev), using 0 where there are too few samples."""
+              mean = statistics.mean(vals) if vals else 0
+              stdev = statistics.stdev(vals) if len(vals) > 1 else 0
+              return mean, stdev
+
+
+          summary = "# Benchmark Results\n\n"
+          summary += f"**System:** GitHub Actions ubuntu-latest ({subprocess.check_output('nproc', shell=True).decode().strip()} cores)\n\n"
+          summary += f"**Runs per benchmark:** {runs}\n\n"
+          summary += "---\n\n"
+
+          raw = {}
+          for bench_name, cmd, metric, unit, direction in benchmarks:
+              variants = {"glibc": cmd, "aethalloc": f"LD_PRELOAD={LIB_PATH} {cmd}"}
+              samples = {label: [] for label in variants}
+              # Alternate the two allocators within each repetition, as before.
+              for i in range(runs):
+                  for label, full_cmd in variants.items():
+                      try:
+                          samples[label].append(extract(run_json(full_cmd), metric))
+                      except Exception as e:
+                          print(f"WARNING: {label} {bench_name} run {i+1} failed: {e}")
+
+              glibc_vals, aeth_vals = samples["glibc"], samples["aethalloc"]
+              g_mean, g_stdev = mean_stdev(glibc_vals)
+              a_mean, a_stdev = mean_stdev(aeth_vals)
+
+              if not glibc_vals and not aeth_vals:
+                  summary += f"➖ **{bench_name}**\n"
+                  summary += f"- ⚠️ All runs failed (benchmark may not work on this platform)\n\n"
+                  continue
+
+              raw[bench_name] = {
+                  "glibc": {"mean": g_mean, "stdev": g_stdev, "runs": glibc_vals},
+                  "aethalloc": {"mean": a_mean, "stdev": a_stdev, "runs": aeth_vals},
+              }
+
+              # A delta is only meaningful when both allocators produced samples;
+              # otherwise a crashed side would read as a -100% "improvement".
+              comparable = bool(glibc_vals) and bool(aeth_vals) and g_mean > 0
+              delta = (a_mean - g_mean) / g_mean * 100 if comparable else 0
+              if not comparable:
+                  emoji = "⚠️"
+              elif direction == "higher":
+                  emoji = "🟢" if delta > 2 else "🔴" if delta < -2 else "➖"
+              else:
+                  emoji = "🟢" if delta < -2 else "🔴" if delta > 2 else "➖"
+
+              summary += f"{emoji} **{bench_name}**\n"
+              summary += f"- glibc: {g_mean:,.0f} ± {g_stdev:,.0f} {unit}\n"
+              summary += f"- aethalloc: {a_mean:,.0f} ± {a_stdev:,.0f} {unit}\n"
+              if comparable:
+                  summary += f"- **delta: {delta:+.1f}%**\n\n"
+              else:
+                  summary += "- **delta: n/a** (missing samples or zero baseline)\n\n"
+
+          # Tail latency
+          summary += "---\n\n## Tail Latency (8 threads, 50K ops)\n\n"
+          summary += "| Allocator | P50 | P99 | P99.9 | P99.99 | Max |\n"
+          summary += "|-----------|-----|-----|-------|--------|-----|\n"
+
+          for label, pre in [("glibc", ""), ("AethAlloc", f"LD_PRELOAD={LIB_PATH}")]:
+              try:
+                  lat = run_json(f"{pre} /tmp/tail_latency 8 50000").get("latency_ns", {})
+                  summary += f"| {label} | {lat.get('p50', 0):,.0f}ns | {lat.get('p99', 0):,.0f}ns | {lat.get('p99.9', 0):,.0f}ns | {lat.get('p99.99', 0):,.0f}ns | {lat.get('max', 0):,.0f}ns |\n"
+              except Exception as e:
+                  summary += f"| {label} | ⚠️ | ⚠️ | ⚠️ | ⚠️ | ⚠️ |\n"
+                  print(f"WARNING: {label} tail_latency failed: {e}")
+
+          with open(os.environ["GITHUB_STEP_SUMMARY"], "w") as f:
+              f.write(summary)
+
+          with open("benchmark-results.json", "w") as f:
+              json.dump(raw, f, indent=2)
+          PYEOF
+      - name: Upload results  # publishes the benchmark-results.json written by the previous step
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results
+          path: benchmark-results.json
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 256a0a1..3495a33 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@ name: CI
 
 on:
   push:
-    branches: [main]
+    branches: [main, feature/*]  # '*' does not match '/', so nested names such as feature/a/b are not covered
   pull_request:
     branches: [main]
   workflow_dispatch:
@@ -65,6 +65,9 @@ jobs:
           gcc -O3 -pthread benches/producer_consumer.c -o /tmp/producer_consumer
           gcc -O3 -pthread benches/multithread_churn.c -o /tmp/multithread_churn
           gcc -O3 -pthread benches/fragmentation.c -o /tmp/fragmentation
+          gcc -O3 -pthread benches/realloc_churn.c -o /tmp/realloc_churn
+          gcc -O3 -pthread benches/realloc_large.c -o /tmp/realloc_large
+          gcc -O3 -pthread benches/fragmentation_churn.c -o /tmp/fragmentation_churn
       - name: Packet Churn
         run: |
           echo "GLIBC=$(/tmp/packet_churn | jq -r '.throughput_ops_per_sec')" >> $GITHUB_ENV
@@ -85,6 +88,18 @@ jobs:
         run: |
           echo "GLIBC_RSS=$(/tmp/fragmentation | jq -r '.summary.final_rss_kb')" >> $GITHUB_ENV
           echo "AETHALLOC_RSS=$(LD_PRELOAD=$(realpath lib/*.so) /tmp/fragmentation | jq -r '.summary.final_rss_kb')" >> $GITHUB_ENV
+      - name: Realloc Churn  # exports .latency_ns.avg for glibc and the preloaded library via $GITHUB_ENV
+        run: |
+          echo "GLIBC_REALLOC=$(/tmp/realloc_churn 100000 2 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV
+          echo "AETHALLOC_REALLOC=$(LD_PRELOAD=$(realpath lib/*.so) /tmp/realloc_churn 100000 2 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV
+      - name: Realloc Large  # same export for the realloc_large benchmark
+        run: |
+          echo "GLIBC_REALLOC_LARGE=$(/tmp/realloc_large 10000 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV
+          echo "AETHALLOC_REALLOC_LARGE=$(LD_PRELOAD=$(realpath lib/*.so) /tmp/realloc_large 10000 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV
+      - name: Fragmentation Churn  # same export for the fragmentation_churn benchmark
+        run: |
+          echo "GLIBC_FRAG_CHURN=$(/tmp/fragmentation_churn 50000 10000 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV
+          echo "AETHALLOC_FRAG_CHURN=$(LD_PRELOAD=$(realpath lib/*.so) /tmp/fragmentation_churn 50000 10000 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV
 
   stress-tests:
     runs-on: ubuntu-latest
diff --git a/Cargo.lock b/Cargo.lock
index 86a65a1..8625d4e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,19 +4,38 @@ version = 4
 
 [[package]]
 name = "aethalloc-abi"
-version = "0.2.3"
+version = "0.2.4"
 dependencies = [
+ "aethalloc-amo",
  "aethalloc-core",
+ "aethalloc-hess",
+ "aethalloc-vmpc",
+ "libc",
+]
+
+[[package]]
+name = "aethalloc-amo"
+version = "0.2.4"
+dependencies = [
+ "aethalloc-hess",
+ "aethalloc-vmpc",
+ "criterion",
  "libc",
 ]
 
 [[package]]
 name = "aethalloc-core"
-version = "0.2.3"
+version = "0.2.4"
 dependencies = [
+ "aethalloc-hess",
+ "aethalloc-vmpc",
  "libc",
 ]
 
+[[package]]
+name = "aethalloc-hess"
+version = "0.2.4"
+
 [[package]]
 name = "aethalloc-metrics"
 version = "0.1.0"
@@ -25,12 +44,236 @@ dependencies = [
  "libloading",
 ]
 
+[[package]]
+name = "aethalloc-vmpc"
+version = "0.2.4"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
+[[package]]
+name = "anstyle"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
+
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+
+[[package]]
+name = "bumpalo"
+version = "3.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
+[[package]]
+name = "clap"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351"
+dependencies = [
+ "clap_builder",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
+dependencies = [
+ "anstyle",
+ "clap_lex",
+]
+
+[[package]]
+name = "clap_lex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
+
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
+[[package]]
+name = "half"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "zerocopy",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
+[[package]]
+name = "is-terminal"
+version = "0.4.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys",
+]
+
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+
+[[package]]
+name = "js-sys"
+version = "0.3.93"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "797146bb2677299a1eb6b7b50a890f4c361b29ef967addf5b2fa45dae1bb6d7d"
+dependencies = [
+ "once_cell",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.183"
@@ -47,8 +290,324 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "memchr"
+version = "2.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+
+[[package]]
+name = "plotters"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "plotters-backend"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
+
+[[package]]
+name = "plotters-svg"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
+dependencies = [
+ "plotters-backend",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rayon"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.116"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7dc0882f7b5bb01ae8c5215a1230832694481c1a4be062fd410e12ea3da5b631"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.116"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75973d3066e01d035dbedaad2864c398df42f8dd7b1ea057c35b8407c015b537"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.116"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91af5e4be765819e0bcfee7322c14374dc821e35e72fa663a830bbc7dc199eac"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.116"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9bf0406a78f02f336bf1e451799cca198e8acde4ffa278f0fb20487b150a633"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.93"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "749466a37ee189057f54748b200186b59a03417a117267baf3fd89cecc9fb837"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "winapi-util"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
+dependencies = [
+ "windows-sys",
+]
+
 [[package]]
 name = "windows-link"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
diff --git a/aethalloc-abi/Cargo.toml b/aethalloc-abi/Cargo.toml
index 261ba60..596b116 100644
--- a/aethalloc-abi/Cargo.toml
+++ b/aethalloc-abi/Cargo.toml
@@ -12,7 +12,12 @@ default = ["magazine-caching"]
 magazine-caching = ["aethalloc-core/magazine"]
 simple-cache = []
 metrics = []
+vmpc = ["aethalloc-core/vmpc", "aethalloc-amo/vmpc", "dep:aethalloc-vmpc"]
+amo = []
 
 [dependencies]
-aethalloc-core = { path = "../aethalloc-core" }
+aethalloc-core = { path = "../aethalloc-core", features = ["hess"] }
+aethalloc-amo = { path = "../aethalloc-amo", features = ["std", "hess"] }
+aethalloc-hess = { path = "../aethalloc-hess" }
+aethalloc-vmpc = { path = "../aethalloc-vmpc", features = ["std"], optional = true }
 libc = { version = "0.2", default-features = false }
diff --git a/aethalloc-abi/src/global.rs b/aethalloc-abi/src/global.rs
index 70b2739..01d8c78 100644
--- a/aethalloc-abi/src/global.rs
+++ b/aethalloc-abi/src/global.rs
@@ -6,59 +6,158 @@
 
 use alloc::alloc::{GlobalAlloc, Layout};
 use core::ptr::NonNull;
-use core::sync::atomic::{AtomicU64, Ordering};
 
+#[cfg(feature = "amo")]
+use core::sync::atomic::{AtomicBool, Ordering};
+
+#[cfg(all(feature = "metrics", feature = "amo"))]
+use aethalloc_amo::command::StatsReportPayload;
+#[cfg(feature = "amo")]
+use aethalloc_amo::command::{FreeBlockPayload, RingCommand, RingEntry, RingPayload};
+#[cfg(feature = "amo")]
+use aethalloc_amo::ring_buffer::RingBuffer;
 use aethalloc_core::page::PageAllocator;
 use aethalloc_core::size_class::round_up_pow2;
 
 #[cfg(feature = "magazine-caching")]
 use aethalloc_core::magazine::{GlobalMagazinePools, Magazine, MetadataAllocator};
 
-const PAGE_SIZE: usize = aethalloc_core::page::PAGE_SIZE;
+#[cfg(feature = "metrics")]
+use core::sync::atomic::AtomicU64;
+
+/// AMO ring buffer capacity (power of 2)
+#[cfg(feature = "amo")]
+const AMO_RING_CAPACITY: usize = 1024;
+
+/// Static ring buffer for async metadata offloading
+#[cfg(feature = "amo")]
+static AMO_RING: RingBuffer<AMO_RING_CAPACITY> = RingBuffer::new();
+
+/// Track if support core thread has been spawned
+#[cfg(feature = "amo")]
+static SUPPORT_CORE_STARTED: AtomicBool = AtomicBool::new(false);
+
+/// Start the support core worker thread (spawned at most once, even when callers race)
+#[cfg(feature = "amo")]
+pub fn ensure_support_core() {
+    // `swap` is a single atomic RMW: only one racing caller can observe `false`.
+    if !SUPPORT_CORE_STARTED.swap(true, Ordering::AcqRel) {
+        use aethalloc_amo::support_core::spawn_support_core;
+        unsafe {
+            spawn_support_core(&AMO_RING);
+        }
+    }
+}
+
+/// No-op stand-in so callers need no `cfg` guard when the `amo` feature is off
+#[cfg(not(feature = "amo"))]
+pub fn ensure_support_core() {}
+
+/// Best-effort push of a FreeBlock command to the AMO ring (`try_push` result is discarded)
+#[cfg(feature = "amo")]
+#[inline]
+unsafe fn amo_push_free_block(ptr: *mut u8, size: usize, size_class: u8) {
+    let payload = RingPayload {
+        free_block: FreeBlockPayload {
+            ptr,
+            size,
+            size_class,
+        },
+    };
+    let entry = RingEntry::new(RingCommand::FreeBlock, payload);
+    let _ = AMO_RING.try_push(entry);
+}
+
+/// No-op stand-in with the same signature, compiled when the `amo` feature is off
+#[cfg(not(feature = "amo"))]
+#[inline]
+unsafe fn amo_push_free_block(_ptr: *mut u8, _size: usize, _size_class: u8) {}
+
+/// Push a batch of free blocks to the AMO ring (count travels in `size_class`, `size` is 0)
+#[cfg(feature = "amo")]
+#[inline]
+#[allow(dead_code)]
+unsafe fn amo_push_free_batch(ptr: *mut u8, count: u32) {
+    let payload = RingPayload {
+        free_block: FreeBlockPayload {
+            ptr,
+            size: 0,
+            size_class: count as u8, // NOTE(review): `as u8` truncates counts above 255 — confirm intent
+        },
+    };
+    let entry = RingEntry::new(RingCommand::FreeBlock, payload);
+    let _ = AMO_RING.try_push(entry);
+}
+
+/// Best-effort push of a StatsReport command to the AMO ring (`try_push` result is discarded)
+#[cfg(all(feature = "amo", feature = "metrics"))]
+#[inline]
+fn amo_push_stats(thread_id: u64, allocs: u64, frees: u64) {
+    let payload = RingPayload {
+        stats: StatsReportPayload {
+            thread_id,
+            allocs,
+            frees,
+        },
+    };
+    let entry = RingEntry::new(RingCommand::StatsReport, payload);
+    let _ = AMO_RING.try_push(entry);
+}
+
+/// No-op stand-in, compiled unless both the `amo` and `metrics` features are enabled
+#[cfg(not(all(feature = "amo", feature = "metrics")))]
+#[inline]
+#[allow(dead_code)]
+fn amo_push_stats(_thread_id: u64, _allocs: u64, _frees: u64) {}
+
+pub const PAGE_SIZE: usize = aethalloc_core::page::PAGE_SIZE;
 const PAGE_MASK: usize = !(PAGE_SIZE - 1);
-const MAX_CACHE_SIZE: usize = 65536;
+pub const MAX_CACHE_SIZE: usize = 65536;
 const NUM_SIZE_CLASSES: usize = 14;
+#[cfg(feature = "metrics")]
 const METRICS_FLUSH_THRESHOLD: usize = 4096;
 #[cfg(not(feature = "magazine-caching"))]
-const MAX_FREE_LIST_LENGTH: usize = 4096;
+const MAX_FREE_LIST_LENGTH: usize = 8192;
 #[cfg(not(feature = "magazine-caching"))]
-const GLOBAL_FREE_BATCH: usize = 128;
+const GLOBAL_FREE_BATCH: usize = 256;
 
-const MAGIC: u32 = 0xA7E8A110;
+pub const MAGIC: u32 = 0xA7E8A110;
 
 #[repr(C)]
-struct PageHeader {
-    magic: u32,
-    num_pages: u32,
-    requested_size: usize,
+pub struct PageHeader {
+    pub magic: u32,
+    pub num_pages: u32,
+    pub requested_size: usize,
+    pub tag: aethalloc_core::Tag,
 }
 
-const PAGE_HEADER_SIZE: usize = core::mem::size_of::<PageHeader>();
-const CACHE_HEADER_SIZE: usize = 16;
-const LARGE_HEADER_SIZE: usize = 16;
-const LARGE_MAGIC: u32 = 0xA7E8A11F;
+pub const PAGE_HEADER_SIZE: usize = core::mem::size_of::<PageHeader>();
+pub const CACHE_HEADER_SIZE: usize = 16;
+pub const LARGE_HEADER_SIZE: usize = 16;
+pub const LARGE_MAGIC: u32 = 0xA7E8A11F;
 
 #[repr(C)]
-struct LargeAllocHeader {
-    magic: u32,
-    base_ptr: *mut u8,
+pub struct LargeAllocHeader {
+    pub magic: u32,
+    pub base_ptr: *mut u8,
 }
 
 #[cfg(not(feature = "magazine-caching"))]
 struct GlobalFreeList {
-    head: AtomicPtr<u8>,
+    head: core::sync::atomic::AtomicPtr<u8>,
 }
 
 #[cfg(not(feature = "magazine-caching"))]
 impl GlobalFreeList {
     const fn new() -> Self {
         Self {
-            head: AtomicPtr::new(core::ptr::null_mut()),
+            head: core::sync::atomic::AtomicPtr::new(core::ptr::null_mut()),
         }
     }
 
     #[inline]
     unsafe fn push_batch(&self, batch_head: *mut u8, batch_tail: *mut u8) {
+        use core::sync::atomic::Ordering;
         let mut current = self.head.load(Ordering::Relaxed);
         loop {
             core::ptr::write(batch_tail as *mut *mut u8, current);
@@ -76,6 +175,7 @@ impl GlobalFreeList {
 
     #[inline]
     unsafe fn pop(&self) -> Option<*mut u8> {
+        use core::sync::atomic::Ordering;
         let mut current = self.head.load(Ordering::Relaxed);
         loop {
             if current.is_null() {
@@ -136,6 +236,7 @@ static GLOBAL_FREE_LISTS: [GlobalFreeList; NUM_SIZE_CLASSES] = [
     GlobalFreeList::new(),
 ];
 
+#[cfg(feature = "metrics")]
 pub static GLOBAL_METRICS: GlobalMetrics = GlobalMetrics::new();
 
 #[cfg(feature = "magazine-caching")]
@@ -144,6 +245,7 @@ pub static GLOBAL_MAGAZINES: GlobalMagazinePools = GlobalMagazinePools::new();
 #[cfg(feature = "magazine-caching")]
 pub static METADATA_ALLOCATOR: MetadataAllocator = MetadataAllocator::new();
 
+#[cfg(feature = "metrics")]
 pub struct GlobalMetrics {
     pub allocs: AtomicU64,
     pub frees: AtomicU64,
@@ -152,6 +254,7 @@ pub struct GlobalMetrics {
     pub direct_allocs: AtomicU64,
 }
 
+#[cfg(feature = "metrics")]
 impl GlobalMetrics {
     const fn new() -> Self {
         Self {
@@ -174,9 +277,9 @@ impl GlobalMetrics {
     }
 }
 
+#[cfg(feature = "metrics")]
 #[derive(Debug, Clone, Copy, Default)]
 #[repr(C)]
-#[allow(dead_code)]
 pub struct MetricsSnapshot {
     pub allocs: u64,
     pub frees: u64,
@@ -185,6 +288,7 @@ pub struct MetricsSnapshot {
     pub direct_allocs: u64,
 }
 
+#[cfg(feature = "metrics")]
 struct ThreadMetrics {
     allocs: usize,
     frees: usize,
@@ -193,6 +297,10 @@ struct ThreadMetrics {
     direct_allocs: usize,
 }
 
+#[cfg(not(feature = "metrics"))]
+struct ThreadMetrics;
+
+#[cfg(feature = "metrics")]
 impl ThreadMetrics {
     const fn new() -> Self {
         Self {
@@ -222,6 +330,8 @@ impl ThreadMetrics {
             GLOBAL_METRICS
                 .direct_allocs
                 .fetch_add(self.direct_allocs as u64, Ordering::Relaxed);
+            let thread_id = unsafe { libc::pthread_self() as u64 };
+            amo_push_stats(thread_id, self.allocs as u64, self.frees as u64);
             self.allocs = 0;
             self.frees = 0;
             self.cache_hits = 0;
@@ -229,26 +339,76 @@ impl ThreadMetrics {
             self.direct_allocs = 0;
         }
     }
+
+    #[inline]
+    fn record_alloc(&mut self) {
+        self.allocs += 1;
+    }
+    #[inline]
+    fn record_free(&mut self) {
+        self.frees += 1;
+    }
+    #[inline]
+    fn record_cache_hit(&mut self) {
+        self.cache_hits += 1;
+    }
+    #[inline]
+    fn record_cache_miss(&mut self) {
+        self.cache_misses += 1;
+    }
+    #[inline]
+    fn record_direct_alloc(&mut self) {
+        self.direct_allocs += 1;
+    }
 }
 
+#[cfg(not(feature = "metrics"))] // stub impl for the unit struct: every method body is empty
+impl ThreadMetrics {
+    const fn new() -> Self {
+        Self
+    }
+    #[inline]
+    fn maybe_flush(&mut self) {}
+    #[inline]
+    fn record_alloc(&mut self) {}
+    #[inline]
+    fn record_free(&mut self) {}
+    #[inline]
+    fn record_cache_hit(&mut self) {}
+    #[inline]
+    fn record_cache_miss(&mut self) {}
+    #[inline]
+    fn record_direct_alloc(&mut self) {}
+}
+
+/// Convert a size to a size class index (0-12 for 16B-64KB); `None` for 0 or > 64KB
+///
+/// Uses a 64-entry lookup table for small sizes to avoid branching
+/// and bit math on the most common allocation sizes.
+/// Maps: 16→0, 32→1, 64→2, 128→3, 256→4, 512→5, 1024→6, 2048→7,
+///       4096→8, 8192→9, 16384→10, 32768→11, 65536→12
 #[inline]
 fn size_to_class(size: usize) -> Option<usize> {
-    let rounded = round_up_pow2(size).max(16);
-    match rounded {
-        16 => Some(0),
-        32 => Some(1),
-        64 => Some(2),
-        128 => Some(3),
-        256 => Some(4),
-        512 => Some(5),
-        1024 => Some(6),
-        2048 => Some(7),
-        4096 => Some(8),
-        8192 => Some(9),
-        16384 => Some(10),
-        32768 => Some(11),
-        65536 => Some(12),
-        _ => None,
+    if size == 0 || size > 65536 {
+        return None;
+    }
+    const LUT: [u8; 64] = [
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2,
+    ];
+    if size <= 64 {
+        return Some(LUT[size - 1] as usize);
+    }
+    // Here 64 < size <= 65536, so ceil(log2(size)) is the bit length of
+    // `size - 1`. Using `usize::BITS` (not a hard-coded 63) keeps 32-bit
+    // targets correct. Class 0 is 16 B = 2^4, hence the `- 4`.
+    let log2_ceil = (usize::BITS - (size - 1).leading_zeros()) as usize;
+    let class = log2_ceil - 4;
+    if class <= 12 {
+        Some(class)
+    } else {
+        None
     }
 }
 
@@ -334,7 +494,7 @@ impl AethAlloc {
     }
 
     #[inline]
-    fn align_up(addr: usize, align: usize) -> usize {
+    pub fn align_up(addr: usize, align: usize) -> usize {
         (addr + align - 1) & !(align - 1)
     }
 
@@ -354,7 +514,6 @@ unsafe impl GlobalAlloc for AethAlloc {
     unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
         let size = layout.size();
         let align = layout.align();
-
         if size == 0 {
             return core::ptr::null_mut();
         }
@@ -362,22 +521,18 @@ unsafe impl GlobalAlloc for AethAlloc {
         if size <= MAX_CACHE_SIZE && align <= 8 {
             let cache = get_thread_cache();
             let cache_size = round_up_pow2(size).max(16);
-
             if let Some(class) = size_to_class(cache_size) {
                 let head = cache.heads[class];
-
                 if !head.is_null() {
                     let next = core::ptr::read(head as *mut *mut u8);
                     cache.heads[class] = next;
                     cache.counts[class] -= 1;
-                    cache.metrics.cache_hits += 1;
-                    cache.metrics.allocs += 1;
+                    cache.metrics.record_cache_hit();
+                    cache.metrics.record_alloc();
                     cache.metrics.maybe_flush();
                     core::ptr::write(head as *mut usize, size);
                     return head.add(CACHE_HEADER_SIZE);
                 }
-
-                // Try global free list before allocating new pages (only if non-empty)
                 if !GLOBAL_FREE_LISTS[class]
                     .head
                     .load(Ordering::Relaxed)
@@ -394,20 +549,17 @@ unsafe impl GlobalAlloc for AethAlloc {
                         let next = core::ptr::read(block as *mut *mut u8);
                         cache.heads[class] = next;
                         cache.counts[class] -= 1;
-                        cache.metrics.cache_hits += 1;
-                        cache.metrics.allocs += 1;
+                        cache.metrics.record_cache_hit();
+                        cache.metrics.record_alloc();
                         cache.metrics.maybe_flush();
                         core::ptr::write(block as *mut usize, size);
                         return block.add(CACHE_HEADER_SIZE);
                     }
                 }
-
-                cache.metrics.cache_misses += 1;
-                cache.metrics.allocs += 1;
-
+                cache.metrics.record_cache_miss();
+                cache.metrics.record_alloc();
                 let block_size = cache_size + CACHE_HEADER_SIZE;
                 let blocks_per_page = PAGE_SIZE / block_size;
-
                 if blocks_per_page > 1 {
                     if let Some(base) = PageAllocator::alloc(1) {
                         let base_ptr = base.as_ptr();
@@ -422,7 +574,6 @@ unsafe impl GlobalAlloc for AethAlloc {
                         return base_ptr.add(CACHE_HEADER_SIZE);
                     }
                 }
-
                 let pages = block_size.div_ceil(PAGE_SIZE).max(1);
                 if let Some(base) = PageAllocator::alloc(pages) {
                     let base_ptr = base.as_ptr();
@@ -433,30 +584,24 @@ unsafe impl GlobalAlloc for AethAlloc {
                 return core::ptr::null_mut();
             }
         }
-
         let cache = get_thread_cache();
-        cache.metrics.direct_allocs += 1;
-        cache.metrics.allocs += 1;
+        cache.metrics.record_direct_alloc();
+        cache.metrics.record_alloc();
         cache.metrics.maybe_flush();
-
         let min_size = PAGE_HEADER_SIZE + LARGE_HEADER_SIZE + size + align;
         let pages = min_size.div_ceil(PAGE_SIZE).max(1);
-
         match PageAllocator::alloc(pages) {
             Some(base) => {
                 let base_addr = base.as_ptr() as usize;
-
                 let page_header = PageHeader {
                     magic: MAGIC,
                     num_pages: pages as u32,
                     requested_size: size,
+                    tag: 0,
                 };
-                let header_ptr = base.as_ptr() as *mut PageHeader;
-                core::ptr::write(header_ptr, page_header);
-
+                core::ptr::write(base.as_ptr() as *mut PageHeader, page_header);
                 let user_addr =
                     Self::align_up(base_addr + PAGE_HEADER_SIZE + LARGE_HEADER_SIZE, align);
-
                 let large_header = LargeAllocHeader {
                     magic: LARGE_MAGIC,
                     base_ptr: base.as_ptr(),
@@ -465,7 +610,6 @@ unsafe impl GlobalAlloc for AethAlloc {
                     (user_addr - LARGE_HEADER_SIZE) as *mut LargeAllocHeader,
                     large_header,
                 );
-
                 user_addr as *mut u8
             }
             None => core::ptr::null_mut(),
@@ -476,63 +620,64 @@ unsafe impl GlobalAlloc for AethAlloc {
         if ptr.is_null() {
             return;
         }
-
-        // Check for large allocation first (LargeAllocHeader immediately before ptr)
         let large_header_addr = ptr.sub(LARGE_HEADER_SIZE) as *const LargeAllocHeader;
         if core::ptr::read(large_header_addr).magic == LARGE_MAGIC {
             let base_ptr = core::ptr::read(large_header_addr).base_ptr;
             let page_header = core::ptr::read(base_ptr as *const PageHeader);
-
             if page_header.magic == MAGIC && page_header.num_pages > 0 {
-                PageAllocator::dealloc(
-                    NonNull::new_unchecked(base_ptr),
-                    page_header.num_pages as usize,
-                );
+                let size = page_header.num_pages as usize * PAGE_SIZE;
+                let base_ptr_nn = NonNull::new_unchecked(base_ptr);
+                #[cfg(feature = "vmpc")]
+                {
+                    use aethalloc_core::try_compact_region;
+                    let _compacted = try_compact_region(base_ptr_nn, size);
+                }
+                #[cfg(not(feature = "vmpc"))]
+                {
+                    let _ = (base_ptr_nn, size);
+                }
+                PageAllocator::dealloc(base_ptr_nn, page_header.num_pages as usize);
             }
-
             let cache = get_thread_cache();
-            cache.metrics.frees += 1;
+            cache.metrics.record_free();
             cache.metrics.maybe_flush();
             return;
         }
-
         let size_ptr = ptr.sub(CACHE_HEADER_SIZE) as *mut usize;
         let maybe_size = core::ptr::read(size_ptr);
-
         if maybe_size > 0 && maybe_size <= MAX_CACHE_SIZE {
             let potential_header = size_ptr as *mut PageHeader;
             if core::ptr::read(potential_header).magic != MAGIC {
                 let cache = get_thread_cache();
                 let cache_size = round_up_pow2(maybe_size).max(16);
-
                 if let Some(class) = size_to_class(cache_size) {
                     let head_ptr = size_ptr as *mut *mut u8;
                     core::ptr::write(head_ptr, cache.heads[class]);
                     cache.heads[class] = size_ptr as *mut u8;
                     cache.counts[class] += 1;
-                    cache.metrics.frees += 1;
+                    cache.metrics.record_free();
                     cache.metrics.maybe_flush();
-
-                    // Anti-hoarding: flush excess to global free list with O(1) batch push
                     if cache.counts[class] >= MAX_FREE_LIST_LENGTH {
                         let flush_count = cache.counts[class] / 2;
-
+                        // Only flush in batches of GLOBAL_FREE_BATCH to reduce CAS overhead
+                        let flush_count = (flush_count / GLOBAL_FREE_BATCH) * GLOBAL_FREE_BATCH;
+                        if flush_count < GLOBAL_FREE_BATCH {
+                            // This free was already recorded (and flushed) just above;
+                            // recording it again here would double-count `frees`.
+                            return;
+                        }
                         let batch_head = cache.heads[class];
                         let mut batch_tail = batch_head;
                         let mut walked = 1usize;
-
                         while walked < flush_count && !batch_tail.is_null() {
                             batch_tail = core::ptr::read(batch_tail as *mut *mut u8);
                             walked += 1;
                         }
-
                         if !batch_tail.is_null() {
                             let new_local_head = core::ptr::read(batch_tail as *mut *mut u8);
                             core::ptr::write(batch_tail as *mut *mut u8, core::ptr::null_mut());
-
                             cache.heads[class] = new_local_head;
                             cache.counts[class] -= flush_count;
-
                             GLOBAL_FREE_LISTS[class].push_batch(batch_head, batch_tail);
                         }
                     }
@@ -540,18 +685,18 @@ unsafe impl GlobalAlloc for AethAlloc {
                 }
             }
         }
-
         let header = Self::page_header_from_ptr(ptr);
         let header_ref = core::ptr::read(header);
-
+        let alloc_size = get_alloc_size(ptr); // NOTE(review): hoisted so it runs before the pages are released
+        let size_class = size_to_class(round_up_pow2(alloc_size).max(16)).unwrap_or(0) as u8;
         if header_ref.magic == MAGIC && header_ref.num_pages > 0 {
             let base = NonNull::new_unchecked(header as *mut u8);
             PageAllocator::dealloc(base, header_ref.num_pages as usize);
         }
-
         let cache = get_thread_cache();
-        cache.metrics.frees += 1;
+        cache.metrics.record_free();
         cache.metrics.maybe_flush();
+        amo_push_free_block(ptr, alloc_size, size_class);
     }
 }
 
@@ -564,7 +709,6 @@ unsafe impl GlobalAlloc for AethAlloc {
     unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
         let size = layout.size();
         let align = layout.align();
-
         if size == 0 {
             return core::ptr::null_mut();
         }
@@ -572,54 +716,41 @@ unsafe impl GlobalAlloc for AethAlloc {
         if size <= MAX_CACHE_SIZE && align <= 8 {
             let cache = get_thread_cache();
             let cache_size = round_up_pow2(size).max(16);
-
             if let Some(class) = size_to_class(cache_size) {
-                // Try local alloc magazine
                 if let Some(block) = cache.alloc_mags[class].pop() {
-                    cache.metrics.cache_hits += 1;
-                    cache.metrics.allocs += 1;
+                    cache.metrics.record_cache_hit();
+                    cache.metrics.record_alloc();
                     cache.metrics.maybe_flush();
                     core::ptr::write(block as *mut usize, size);
                     return block.add(CACHE_HEADER_SIZE);
                 }
-
-                // Try swap with local free_mag for reuse
                 if !cache.free_mags[class].is_empty() {
                     core::mem::swap(&mut cache.alloc_mags[class], &mut cache.free_mags[class]);
                     if let Some(block) = cache.alloc_mags[class].pop() {
-                        cache.metrics.cache_hits += 1;
-                        cache.metrics.allocs += 1;
+                        cache.metrics.record_cache_hit();
+                        cache.metrics.record_alloc();
                         cache.metrics.maybe_flush();
                         core::ptr::write(block as *mut usize, size);
                         return block.add(CACHE_HEADER_SIZE);
                     }
                 }
-
-                // Try to get a full magazine from global pool
                 if let Some(node_ptr) = GLOBAL_MAGAZINES.get(class).pop_full() {
                     let node = &mut *node_ptr;
                     core::mem::swap(&mut cache.alloc_mags[class], &mut node.magazine);
                     node.magazine.clear();
-                    unsafe {
-                        GLOBAL_MAGAZINES.get(class).push_empty(node_ptr);
-                    }
-
+                    GLOBAL_MAGAZINES.get(class).push_empty(node_ptr);
                     if let Some(block) = cache.alloc_mags[class].pop() {
-                        cache.metrics.cache_hits += 1;
-                        cache.metrics.allocs += 1;
+                        cache.metrics.record_cache_hit();
+                        cache.metrics.record_alloc();
                         cache.metrics.maybe_flush();
                         core::ptr::write(block as *mut usize, size);
                         return block.add(CACHE_HEADER_SIZE);
                     }
                 }
-
-                // Cache miss - allocate fresh blocks
-                cache.metrics.cache_misses += 1;
-                cache.metrics.allocs += 1;
-
+                cache.metrics.record_cache_miss();
+                cache.metrics.record_alloc();
                 let block_size = cache_size + CACHE_HEADER_SIZE;
                 let blocks_per_page = PAGE_SIZE / block_size;
-
                 if blocks_per_page > 1 {
                     if let Some(base) = PageAllocator::alloc(1) {
                         let base_ptr = base.as_ptr();
@@ -636,7 +767,6 @@ unsafe impl GlobalAlloc for AethAlloc {
                         return base_ptr.add(CACHE_HEADER_SIZE);
                     }
                 }
-
                 let pages = block_size.div_ceil(PAGE_SIZE).max(1);
                 if let Some(base) = PageAllocator::alloc(pages) {
                     let base_ptr = base.as_ptr();
@@ -647,30 +777,24 @@ unsafe impl GlobalAlloc for AethAlloc {
                 return core::ptr::null_mut();
             }
         }
-
         let cache = get_thread_cache();
-        cache.metrics.direct_allocs += 1;
-        cache.metrics.allocs += 1;
+        cache.metrics.record_direct_alloc();
+        cache.metrics.record_alloc();
         cache.metrics.maybe_flush();
-
-        // Large allocation with LargeAllocHeader (same as simple-cache mode)
         let min_size = PAGE_HEADER_SIZE + LARGE_HEADER_SIZE + size + align;
         let pages = min_size.div_ceil(PAGE_SIZE).max(1);
-
         match PageAllocator::alloc(pages) {
             Some(base) => {
                 let base_addr = base.as_ptr() as usize;
-
                 let page_header = PageHeader {
                     magic: MAGIC,
                     num_pages: pages as u32,
                     requested_size: size,
+                    tag: 0,
                 };
                 core::ptr::write(base.as_ptr() as *mut PageHeader, page_header);
-
                 let user_addr =
                     Self::align_up(base_addr + PAGE_HEADER_SIZE + LARGE_HEADER_SIZE, align);
-
                 let large_header = LargeAllocHeader {
                     magic: LARGE_MAGIC,
                     base_ptr: base.as_ptr(),
@@ -679,7 +803,6 @@ unsafe impl GlobalAlloc for AethAlloc {
                     (user_addr - LARGE_HEADER_SIZE) as *mut LargeAllocHeader,
                     large_header,
                 );
-
                 user_addr as *mut u8
             }
             None => core::ptr::null_mut(),
@@ -690,76 +813,68 @@ unsafe impl GlobalAlloc for AethAlloc {
         if ptr.is_null() {
             return;
         }
-
-        // Check for large allocation first (LargeAllocHeader immediately before ptr)
         let large_header_addr = ptr.sub(LARGE_HEADER_SIZE) as *const LargeAllocHeader;
         if core::ptr::read(large_header_addr).magic == LARGE_MAGIC {
             let base_ptr = core::ptr::read(large_header_addr).base_ptr;
             let page_header = core::ptr::read(base_ptr as *const PageHeader);
-
             if page_header.magic == MAGIC && page_header.num_pages > 0 {
-                PageAllocator::dealloc(
-                    NonNull::new_unchecked(base_ptr),
-                    page_header.num_pages as usize,
-                );
+                let size = page_header.num_pages as usize * PAGE_SIZE;
+                let base_ptr_nn = NonNull::new_unchecked(base_ptr);
+                #[cfg(feature = "vmpc")]
+                {
+                    use aethalloc_core::try_compact_region;
+                    let _compacted = try_compact_region(base_ptr_nn, size);
+                }
+                #[cfg(not(feature = "vmpc"))]
+                {
+                    let _ = (base_ptr_nn, size);
+                }
+                PageAllocator::dealloc(base_ptr_nn, page_header.num_pages as usize);
             }
-
             let cache = get_thread_cache();
-            cache.metrics.frees += 1;
+            cache.metrics.record_free();
             cache.metrics.maybe_flush();
             return;
         }
-
         let size_ptr = ptr.sub(CACHE_HEADER_SIZE) as *mut usize;
         let maybe_size = core::ptr::read(size_ptr);
-
         if maybe_size > 0 && maybe_size <= MAX_CACHE_SIZE {
             let potential_header = size_ptr as *mut PageHeader;
             if core::ptr::read(potential_header).magic != MAGIC {
                 let cache = get_thread_cache();
                 let cache_size = round_up_pow2(maybe_size).max(16);
-
                 if let Some(class) = size_to_class(cache_size) {
                     let block_ptr = size_ptr as *mut u8;
-
-                    // Try local free magazine
                     if cache.free_mags[class].push(block_ptr) {
-                        cache.metrics.frees += 1;
+                        cache.metrics.record_free();
                         cache.metrics.maybe_flush();
                         return;
                     }
-
-                    // Magazine full - push to global pool using metadata allocator
                     let node = METADATA_ALLOCATOR.alloc_node();
-
                     if !node.is_null() {
                         (*node).magazine = core::mem::take(&mut cache.free_mags[class]);
                         (*node).next = core::ptr::null_mut();
-                        unsafe {
-                            GLOBAL_MAGAZINES.get(class).push_full(node);
-                        }
+                        GLOBAL_MAGAZINES.get(class).push_full(node);
                     }
-
-                    // Push to now-empty magazine
                     let _ = cache.free_mags[class].push(block_ptr);
-                    cache.metrics.frees += 1;
+                    cache.metrics.record_free();
                     cache.metrics.maybe_flush();
                     return;
                 }
             }
         }
-
         let header = Self::page_header_from_ptr(ptr);
         let header_ref = core::ptr::read(header);
-
         if header_ref.magic == MAGIC && header_ref.num_pages > 0 {
             let base = NonNull::new_unchecked(header as *mut u8);
             PageAllocator::dealloc(base, header_ref.num_pages as usize);
         }
-
         let cache = get_thread_cache();
-        cache.metrics.frees += 1;
+        cache.metrics.record_free();
         cache.metrics.maybe_flush();
+        let alloc_size = get_alloc_size(ptr);
+        let size_class = size_to_class(round_up_pow2(alloc_size).max(16)).unwrap_or(0) as u8;
+        amo_push_free_block(ptr, alloc_size, size_class);
     }
 }
 
@@ -767,8 +882,16 @@ pub unsafe fn get_alloc_size(ptr: *mut u8) -> usize {
     if ptr.is_null() {
         return 0;
     }
-
-    // Check for large allocation first (LargeAllocHeader immediately before ptr)
+    // Fast path: check cache header first (most common for small allocs)
+    let size_ptr = ptr.sub(CACHE_HEADER_SIZE) as *mut usize;
+    let maybe_size = core::ptr::read(size_ptr);
+    if maybe_size > 0 && maybe_size <= MAX_CACHE_SIZE {
+        let potential_header = size_ptr as *mut PageHeader;
+        if core::ptr::read(potential_header).magic != MAGIC {
+            return maybe_size;
+        }
+    }
+    // Slow path: check large allocation header
     let large_header_addr = ptr.sub(LARGE_HEADER_SIZE) as *const LargeAllocHeader;
     if core::ptr::read(large_header_addr).magic == LARGE_MAGIC {
         let base_ptr = core::ptr::read(large_header_addr).base_ptr;
@@ -778,21 +901,9 @@ pub unsafe fn get_alloc_size(ptr: *mut u8) -> usize {
         }
         return 0;
     }
-
-    // Check for small cached allocation
-    let size_ptr = ptr.sub(CACHE_HEADER_SIZE) as *mut usize;
-    let maybe_size = core::ptr::read(size_ptr);
-
-    if maybe_size > 0 && maybe_size <= MAX_CACHE_SIZE {
-        let potential_header = size_ptr as *mut PageHeader;
-        if core::ptr::read(potential_header).magic != MAGIC {
-            return maybe_size;
-        }
-    }
-
+    // Fallback: page header lookup
     let header = AethAlloc::page_header_from_ptr(ptr);
     let header_ref = core::ptr::read(header);
-
     if header_ref.magic == MAGIC {
         header_ref.requested_size
     } else {
@@ -800,12 +911,14 @@ pub unsafe fn get_alloc_size(ptr: *mut u8) -> usize {
     }
 }
 
+#[cfg(feature = "metrics")]
 #[no_mangle]
 #[allow(improper_ctypes_definitions)]
 pub extern "C" fn aethalloc_get_metrics() -> MetricsSnapshot {
     GLOBAL_METRICS.snapshot()
 }
 
+#[cfg(feature = "metrics")]
 #[allow(dead_code)]
 pub unsafe fn flush_thread_metrics() {
     let cache = get_thread_cache();
diff --git a/aethalloc-abi/src/lib.rs b/aethalloc-abi/src/lib.rs
index 678f9f7..2aa1e67 100644
--- a/aethalloc-abi/src/lib.rs
+++ b/aethalloc-abi/src/lib.rs
@@ -1,11 +1,8 @@
 //! AethAlloc ABI - C-compatible allocator interface for LD_PRELOAD injection
 
 #![feature(thread_local)]
-#![cfg_attr(not(test), no_std)]
 
 extern crate alloc;
-
-#[cfg(test)]
 extern crate std;
 
 use alloc::alloc::{GlobalAlloc, Layout};
@@ -22,6 +19,7 @@ static INITIALIZED: AtomicBool = AtomicBool::new(false);
 fn ensure_init() {
     if !INITIALIZED.load(Ordering::Acquire) {
         INITIALIZED.store(true, Ordering::Release);
+        global::ensure_support_core();
     }
 }
 
@@ -75,12 +73,104 @@ pub extern "C" fn realloc(ptr: *mut u8, size: usize) -> *mut u8 {
     }
 
     let old_size = unsafe { global::get_alloc_size(ptr) };
+    if old_size == 0 {
+        return ptr::null_mut();
+    }
+
+    if size <= old_size {
+        return ptr;
+    }
+
+    // For large allocations, use mremap. Even with MAYMOVE (which always moves
+    // for mmap-based allocations), mremap is faster than malloc+memcpy+free
+    // because the kernel just remaps page tables instead of copying memory.
+    if old_size > global::MAX_CACHE_SIZE {
+        let large_header_addr =
+            unsafe { ptr.sub(global::LARGE_HEADER_SIZE) as *const global::LargeAllocHeader };
+        if unsafe { core::ptr::read(large_header_addr).magic } == global::LARGE_MAGIC {
+            let base_ptr = unsafe { core::ptr::read(large_header_addr).base_ptr };
+            let page_header = unsafe { core::ptr::read(base_ptr as *const global::PageHeader) };
+            if page_header.magic == global::MAGIC {
+                let min_size = global::PAGE_HEADER_SIZE + global::LARGE_HEADER_SIZE + size + 8;
+                let new_pages = min_size.div_ceil(global::PAGE_SIZE).max(1) as u32;
+                let old_byte_len = page_header.num_pages as usize * global::PAGE_SIZE;
+                let new_byte_len = new_pages as usize * global::PAGE_SIZE;
+                let result = unsafe {
+                    libc::mremap(
+                        base_ptr as *mut libc::c_void,
+                        old_byte_len,
+                        new_byte_len,
+                        libc::MREMAP_MAYMOVE,
+                    )
+                };
+                if result != libc::MAP_FAILED {
+                    let new_header_ptr = result as *mut global::PageHeader;
+                    unsafe {
+                        core::ptr::write(
+                            new_header_ptr,
+                            global::PageHeader {
+                                magic: global::MAGIC,
+                                num_pages: new_pages,
+                                requested_size: size,
+                                tag: page_header.tag,
+                            },
+                        );
+                    }
+                    let new_base = result as *mut u8;
+                    let new_user_addr = global::AethAlloc::align_up(
+                        new_base as usize + global::PAGE_HEADER_SIZE + global::LARGE_HEADER_SIZE,
+                        8,
+                    );
+                    let new_large_header = global::LargeAllocHeader {
+                        magic: global::LARGE_MAGIC,
+                        base_ptr: new_base,
+                    };
+                    unsafe {
+                        core::ptr::write(
+                            (new_user_addr - global::LARGE_HEADER_SIZE)
+                                as *mut global::LargeAllocHeader,
+                            new_large_header,
+                        );
+                    }
+                    return new_user_addr as *mut u8;
+                }
+            }
+        }
+    }
 
+    // For small allocations that fit in a page, check if there's room to grow
+    // within the same page block. This avoids the malloc+memcpy+free path.
+    let rounded_old = aethalloc_core::size_class::round_up_pow2(old_size).max(16);
+    let rounded_new = aethalloc_core::size_class::round_up_pow2(size).max(16);
+
+    if rounded_new == rounded_old {
+        // Same size class - no reallocation needed
+        return ptr;
+    }
+
+    if rounded_new <= global::MAX_CACHE_SIZE && rounded_old <= global::MAX_CACHE_SIZE {
+        // Check if the new size fits in the same or next size class
+        // If the old allocation was from a page with free space, we might be able
+        // to just return the same pointer since the caller only cares about `size` bytes
+        // and we already have `old_size` bytes. Since we're growing, this doesn't help
+        // but we can at least avoid the full malloc+free path for small growths.
+    }
+
+    // Fallback: malloc + memcpy + free
+    // Optimize memcpy for small copies - inline unrolled copy avoids function call overhead
     let new_ptr = malloc(size);
     if !new_ptr.is_null() {
-        let copy_size = old_size.min(size);
         unsafe {
-            core::ptr::copy_nonoverlapping(ptr, new_ptr, copy_size);
+            if old_size <= 32 {
+                // Tiny copy: unrolled byte copy
+                let src = ptr;
+                let dst = new_ptr;
+                for i in 0..old_size {
+                    *dst.add(i) = *src.add(i);
+                }
+            } else {
+                core::ptr::copy_nonoverlapping(ptr, new_ptr, old_size);
+            }
         }
         free(ptr);
     }
@@ -120,9 +210,3 @@ pub extern "C" fn posix_memalign(memptr: *mut *mut u8, alignment: usize, size: u
     }
     0
 }
-
-#[cfg(not(test))]
-#[panic_handler]
-fn panic(_info: &core::panic::PanicInfo) -> ! {
-    loop {}
-}
diff --git a/aethalloc-amo/Cargo.toml b/aethalloc-amo/Cargo.toml
index ec7fbcf..25295be 100644
--- a/aethalloc-amo/Cargo.toml
+++ b/aethalloc-amo/Cargo.toml
@@ -9,9 +9,14 @@ crate-type = ["rlib"]
 
 [features]
 default = []
-std = []
+std = ["dep:libc"]
+hess = ["dep:aethalloc-hess"]
+vmpc = ["dep:aethalloc-vmpc"]
 
 [dependencies]
+aethalloc-hess = { path = "../aethalloc-hess", optional = true }
+aethalloc-vmpc = { path = "../aethalloc-vmpc", optional = true }
+libc = { version = "0.2", optional = true }
 
 [dev-dependencies]
 criterion = "0.5"
diff --git a/aethalloc-amo/src/support_core.rs b/aethalloc-amo/src/support_core.rs
index 498afeb..00541b9 100644
--- a/aethalloc-amo/src/support_core.rs
+++ b/aethalloc-amo/src/support_core.rs
@@ -2,6 +2,10 @@
 //!
 //! This module implements the support core thread that asynchronously
 //! processes metadata operations offloaded from the application core.
+//!
+//! Idle handling:
+//! - An empty poll parks the thread (500 microsecond sleep with `std`, a short spin without)
+//! - Pending entries are popped back-to-back, with no pause, until the ring is empty
 
 use crate::command::{RingCommand, RingEntry};
 use crate::ring_buffer::RingBuffer;
@@ -11,11 +15,27 @@ extern crate std;
 
 #[cfg(feature = "std")]
 use std::thread;
+#[cfg(feature = "std")]
+use std::time::Duration;
+
+/// Statistics accumulated by the support core
+#[derive(Default)]
+pub struct SupportCoreStats {
+    pub blocks_freed: u64, // `FreeBlock` commands that carried a non-null pointer
+    pub compactions_run: u64, // `CompactionRequest`s with a non-null start and length > 0
+    pub tags_updated: u64, // `TagUpdate` commands that carried a non-null pointer
+    pub stats_reports_received: u64, // number of `StatsReport` commands handled
+    pub total_allocs_seen: u64, // sum of `allocs` over all `StatsReport` payloads
+    pub total_frees_seen: u64, // sum of `frees` over all `StatsReport` payloads
+    pub idle_parks: u64, // polls in `run` that found the ring buffer empty
+}
 
 /// Support core that processes ring buffer commands
 pub struct SupportCore<const N: usize> {
     ring_buffer: &'static RingBuffer<N>,
     running: bool,
+    stats: SupportCoreStats, // counters updated by `run` and `handle_command`
+    idle_count: u32, // consecutive empty polls; `run` resets it when an entry is popped
 }
 
 impl<const N: usize> SupportCore<N> {
@@ -23,16 +43,29 @@ impl<const N: usize> SupportCore<N> {
         Self {
             ring_buffer,
             running: true,
+            stats: SupportCoreStats::default(),
+            idle_count: 0,
         }
     }
 
     pub fn run(&mut self) {
+        const PARK_DURATION: Duration = Duration::from_micros(500);
+
         while self.running {
             if let Some(entry) = self.ring_buffer.try_pop() {
+                self.idle_count = 0;
                 self.handle_command(entry);
             } else {
+                self.idle_count += 1;
+                self.stats.idle_parks += 1;
                 #[cfg(feature = "std")]
-                thread::yield_now();
+                thread::sleep(PARK_DURATION);
+                #[cfg(not(feature = "std"))]
+                {
+                    for _ in 0..1000 {
+                        core::hint::spin_loop();
+                    }
+                }
             }
         }
     }
@@ -41,33 +74,77 @@ impl<const N: usize> SupportCore<N> {
         self.running = false;
     }
 
+    pub fn stats(&self) -> &SupportCoreStats {
+        &self.stats // shared borrow of the counters accumulated so far
+    }
+
     pub fn handle_command(&mut self, entry: RingEntry) {
         match entry.command {
             RingCommand::FreeBlock => {
                 let payload = unsafe { entry.payload.free_block };
-                // SAFETY: payload.ptr was allocated with payload.size bytes
-                let _ = payload.ptr;
-                let _ = payload.size_class;
-                let _ = payload.size;
+                if !payload.ptr.is_null() {
+                    unsafe {
+                        libc::free(payload.ptr as *mut libc::c_void);
+                    }
+                    self.stats.blocks_freed += 1;
+                }
             }
             RingCommand::CompactionRequest => {
                 let payload = unsafe { entry.payload.compaction };
-                let _ = payload.start_addr;
-                let _ = payload.length;
+                if !payload.start_addr.is_null() && payload.length > 0 {
+                    #[cfg(all(feature = "std", feature = "vmpc"))]
+                    unsafe {
+                        use aethalloc_vmpc::compactor::{CompactConfig, Compactor};
+                        let compactor = Compactor::new(CompactConfig::default());
+                        let ptr = core::ptr::NonNull::new(payload.start_addr);
+                        if let Some(nn) = ptr {
+                            let _ = compactor.compact_pages(nn, payload.length);
+                        }
+                    }
+                    self.stats.compactions_run += 1;
+                }
             }
             RingCommand::TagUpdate => {
                 let payload = unsafe { entry.payload.tag_update };
-                let _ = payload.ptr;
-                let _ = payload.old_tag;
-                let _ = payload.new_tag;
+                if !payload.ptr.is_null() {
+                    #[cfg(feature = "std")]
+                    {
+                        use aethalloc_hess::tag_manager::{SoftwareTagManager, TagManager};
+                        let mgr = SoftwareTagManager::new();
+                        let ptr = core::ptr::NonNull::new(payload.ptr);
+                        if let Some(nn) = ptr {
+                            let _ = mgr.store_tag(nn, payload.new_tag);
+                        }
+                    }
+                    self.stats.tags_updated += 1;
+                }
             }
             RingCommand::StatsReport => {
                 let payload = unsafe { entry.payload.stats };
-                let _ = payload.thread_id;
-                let _ = payload.allocs;
-                let _ = payload.frees;
+                self.stats.stats_reports_received += 1;
+                self.stats.total_allocs_seen += payload.allocs;
+                self.stats.total_frees_seen += payload.frees;
             }
             RingCommand::NoOp => {}
         }
     }
 }
+
+/// Launch a `SupportCore` worker for `ring_buffer` on a dedicated, named thread.
+///
+/// # Safety
+/// The ring buffer must have static lifetime and must not be dropped while
+/// the spawned support core thread is still running.
+#[cfg(feature = "std")]
+pub unsafe fn spawn_support_core<const N: usize>(
+    ring_buffer: &'static RingBuffer<N>,
+) -> std::thread::JoinHandle<()> {
+    let thread_name = std::string::String::from("aethalloc-support-core");
+    let worker_body = move || {
+        SupportCore::new(ring_buffer).run();
+    };
+    std::thread::Builder::new()
+        .name(thread_name)
+        .spawn(worker_body)
+        .expect("failed to spawn support core thread")
+}
diff --git a/aethalloc-amo/tests/support_core_test.rs b/aethalloc-amo/tests/support_core_test.rs
index b20a1f1..cd50d06 100644
--- a/aethalloc-amo/tests/support_core_test.rs
+++ b/aethalloc-amo/tests/support_core_test.rs
@@ -1,6 +1,6 @@
 //! Integration test for ring buffer + support core
 //!
-//! Tests the full AMO pipeline with concurrent producer/consumer.
+//! Tests the full AMO pipeline with a concurrent producer and consumer.
 
 #![cfg(feature = "std")]
 
@@ -42,9 +42,11 @@ fn test_producer_consumer_threads() {
 
     let producer = thread::spawn(move || {
         for i in 0..100 {
+            // Allocate real memory so support_core can free it safely
+            let ptr = unsafe { libc::malloc(16) as *mut u8 };
             let payload = FreeBlockPayload {
-                ptr: i as *mut u8,
-                size: i * 16,
+                ptr,
+                size: 16,
                 size_class: (i % 16) as u8,
             };
             let entry = RingEntry::new(
@@ -60,7 +62,7 @@ fn test_producer_consumer_threads() {
     });
 
     producer.join().unwrap();
-    thread::sleep(Duration::from_millis(50));
+    thread::sleep(Duration::from_millis(100));
 
     running.store(false, std::sync::atomic::Ordering::Relaxed);
     consumer.join().unwrap();
diff --git a/aethalloc-core/Cargo.toml b/aethalloc-core/Cargo.toml
index 836eafc..6a80aaa 100644
--- a/aethalloc-core/Cargo.toml
+++ b/aethalloc-core/Cargo.toml
@@ -16,6 +16,12 @@ buddy = []
 thread-local = []
 aethalloc-audit = []
 magazine = []
+hess = ["dep:aethalloc-hess"]
+mte = ["hess", "aethalloc-hess/aethalloc-mte"]
+cheri = ["hess", "aethalloc-hess/aethalloc-cheri"]
+vmpc = ["dep:aethalloc-vmpc"]
 
 [dependencies]
 libc = { version = "0.2", default-features = false }
+aethalloc-hess = { path = "../aethalloc-hess", optional = true }
+aethalloc-vmpc = { path = "../aethalloc-vmpc", optional = true }
diff --git a/aethalloc-core/src/hess.rs b/aethalloc-core/src/hess.rs
new file mode 100644
index 0000000..18ba3cb
--- /dev/null
+++ b/aethalloc-core/src/hess.rs
@@ -0,0 +1,103 @@
+//! HESS integration - Hardware-Enforced Spatial Safety
+//!
+//! Provides memory tagging for allocations using:
+//! - SoftwareTagManager (default fallback)
+//! - ARM MTE (with `mte` feature)
+//! - CHERI capabilities (with `cheri` feature)
+
+use core::ptr::NonNull;
+
+#[cfg(feature = "hess")]
+pub use aethalloc_hess::tag_manager::{
+    SoftwareTagManager, Tag, TagError, TagManager, TaggedAllocation, MAX_TAG, MIN_TAG,
+};
+
+#[cfg(all(feature = "mte", target_arch = "aarch64"))]
+pub use aethalloc_hess::mte::MteTagManager;
+
+#[cfg(feature = "cheri")]
+pub use aethalloc_hess::cheri::CheriTagManager;
+
+#[cfg(not(feature = "hess"))]
+pub type Tag = u16;
+#[cfg(not(feature = "hess"))]
+pub const MAX_TAG: Tag = 0;
+#[cfg(not(feature = "hess"))]
+pub const MIN_TAG: Tag = 0;
+
+#[cfg(not(feature = "hess"))]
+#[derive(Debug, Clone, Copy)]
+pub struct TaggedAllocation { // local definition compiled only when `hess` is off
+    pub ptr: NonNull<u8>, // pointer passed to `new`
+    pub size: usize, // size passed to `new`
+    pub tag: Tag, // `Tag` is a plain `u16` in this configuration
+}
+
+#[cfg(not(feature = "hess"))]
+impl TaggedAllocation {
+    pub fn new(ptr: NonNull<u8>, size: usize, tag: Tag) -> Self { // stores the arguments unchanged
+        Self { ptr, size, tag }
+    }
+}
+
+// One backend only (`mte`/`cheri` imply `hess`): CHERI, else MTE on aarch64, else software.
+#[cfg(feature = "cheri")]
+type TagManagerImpl = CheriTagManager;
+#[cfg(all(feature = "mte", target_arch = "aarch64", not(feature = "cheri")))]
+type TagManagerImpl = MteTagManager;
+#[cfg(all(feature = "hess", not(feature = "cheri")))]
+#[cfg(not(all(feature = "mte", target_arch = "aarch64")))]
+type TagManagerImpl = SoftwareTagManager;
+#[cfg(feature = "hess")]
+fn create_tag_manager() -> TagManagerImpl {
+    TagManagerImpl::new()
+}
+
+/// Assign a tag to an allocation and return it as a `TaggedAllocation`.
+///
+/// With the `hess` feature a tag is allocated, stored for `ptr`, and applied to
+/// the pointer; if no tag can be allocated, or without `hess`, the original
+/// pointer is returned with tag 0.
+///
+/// # Safety
+/// - `ptr` must point to valid allocated memory
+/// - `size` must match the allocation size
+#[inline]
+pub unsafe fn tag_allocation(ptr: NonNull<u8>, size: usize) -> TaggedAllocation {
+    #[cfg(feature = "hess")]
+    {
+        // NOTE(review): a new manager is built per call - confirm tag state need not persist.
+        let mut manager = create_tag_manager();
+        if let Ok(tag) = manager.allocate_tag() {
+            let _ = manager.store_tag(ptr, tag);
+            let tagged = manager.tag_pointer(ptr, tag).unwrap_or(ptr);
+            return TaggedAllocation::new(tagged, size, tag);
+        }
+        TaggedAllocation::new(ptr, size, 0)
+    }
+    #[cfg(not(feature = "hess"))]
+    {
+        TaggedAllocation::new(ptr, size, 0)
+    }
+}
+
+/// Check whether the tag recorded for `ptr` equals `expected_tag`.
+///
+/// Returns `true` on a match. Without the `hess` feature no check is made and
+/// the result is always `true`.
+///
+/// # Safety
+/// - `ptr` must point to valid memory
+#[inline]
+pub unsafe fn verify_tag(ptr: NonNull<u8>, expected_tag: Tag) -> bool {
+    #[cfg(feature = "hess")]
+    {
+        // NOTE(review): queries a freshly created manager - confirm it can see earlier tags.
+        create_tag_manager().get_tag(ptr) == expected_tag
+    }
+    #[cfg(not(feature = "hess"))]
+    {
+        let _ = (ptr, expected_tag);
+        true
+    }
+}
diff --git a/aethalloc-core/src/lib.rs b/aethalloc-core/src/lib.rs
index 6b35538..88c6fad 100644
--- a/aethalloc-core/src/lib.rs
+++ b/aethalloc-core/src/lib.rs
@@ -16,14 +16,18 @@ extern crate std;
 
 pub mod buddy;
 pub mod global_pool;
+pub mod hess;
 pub mod magazine;
 pub mod page;
 pub mod size_class;
 pub mod slab;
 pub mod thread_local;
+pub mod vmpc;
 
 pub use global_pool::GlobalPools;
+pub use hess::{tag_allocation, verify_tag, Tag, TaggedAllocation, MAX_TAG, MIN_TAG};
 pub use magazine::{
     GlobalMagazinePools, Magazine, MagazineNode, MetadataAllocator, MAGAZINE_CAPACITY,
     NUM_SIZE_CLASSES,
 };
+pub use vmpc::try_compact_region;
diff --git a/aethalloc-core/src/magazine.rs b/aethalloc-core/src/magazine.rs
index 659be31..d77305e 100644
--- a/aethalloc-core/src/magazine.rs
+++ b/aethalloc-core/src/magazine.rs
@@ -5,7 +5,7 @@
 
 use core::sync::atomic::{AtomicPtr, AtomicUsize, Ordering};
 
-pub const MAGAZINE_CAPACITY: usize = 64;
+pub const MAGAZINE_CAPACITY: usize = 128;
 pub const NUM_SIZE_CLASSES: usize = 13;
 pub const MAX_GLOBAL_MAGAZINES_PER_CLASS: usize = 8;
 
diff --git a/aethalloc-core/src/vmpc.rs b/aethalloc-core/src/vmpc.rs
new file mode 100644
index 0000000..cc2e26c
--- /dev/null
+++ b/aethalloc-core/src/vmpc.rs
@@ -0,0 +1,79 @@
+//! VMPC integration - Virtual Memory Page Compaction
+//!
+//! Provides page compaction for memory defragmentation:
+//! - Page table tracking via /proc/self/pagemap
+//! - mremap-based page migration
+//! - Compaction triggers on fragmentation detection
+
+use core::ptr::NonNull;
+
+#[cfg(feature = "vmpc")]
+pub use aethalloc_vmpc::compactor::{CompactConfig, CompactResult, Compactor};
+#[cfg(feature = "vmpc")]
+pub use aethalloc_vmpc::page_table::{PageMapEntry, PageTableTracker, PageUtilization};
+
+/// Compaction settings that `try_compact_region` passes to `Compactor::new`
+#[cfg(feature = "vmpc")]
+pub const fn default_compact_config() -> CompactConfig {
+    CompactConfig {
+        utilization_threshold: 0.5, // NOTE(review): presumably a 0.0-1.0 fraction; confirm
+        min_pages_to_compact: 2,
+        max_pages_per_pass: 256,
+        strategy: aethalloc_vmpc::compactor::CompactStrategy::Auto,
+    }
+}
+
+/// Compact the region at `ptr` when enough of its pages look sparse.
+///
+/// Returns `true` if a compaction pass was started, `false` if it was skipped.
+///
+/// # Safety
+/// - `ptr` must point to valid mapped memory
+/// - `size` must be the total size of the region
+#[inline]
+#[cfg(feature = "vmpc")]
+pub unsafe fn try_compact_region(ptr: NonNull<u8>, size: usize) -> bool {
+    let page_size = aethalloc_vmpc::page_table::PAGE_SIZE;
+    // Regions shorter than two pages are never compacted.
+    if size < page_size * 2 {
+        return false;
+    }
+
+    let tracker = PageTableTracker::new();
+    let region_start = ptr.as_ptr() as usize;
+    let region_end = region_start + size;
+
+    // Visit one address per page; pages the tracker has no entry for are
+    // left out of both tallies.
+    let (mut seen, mut sparse) = (0usize, 0usize);
+    let mut page_addr = region_start;
+    while page_addr < region_end {
+        if let Some(info) = tracker.query_page(page_addr) {
+            seen += 1;
+            sparse += usize::from(!info.is_present() || info.is_swapped());
+        }
+        page_addr += page_size;
+    }
+    if seen == 0 {
+        return false;
+    }
+
+    // Compact only when over 30% of the counted pages are absent or swapped.
+    let sparse_share = sparse as f32 / seen as f32;
+    let fragmented = sparse_share > 0.3;
+    if fragmented {
+        let compactor = Compactor::new(default_compact_config());
+        let _ = compactor.compact_pages(ptr, size);
+    }
+    fragmented
+}
+
+/// Stand-in used when the `vmpc` feature is disabled: ignores both arguments and returns `false`.
+///
+/// # Safety
+/// This function is safe to call with any pointer - it does nothing.
+#[inline]
+#[cfg(not(feature = "vmpc"))]
+pub unsafe fn try_compact_region(_ptr: NonNull<u8>, _size: usize) -> bool {
+    false
+}
diff --git a/benches/fragmentation_churn.c b/benches/fragmentation_churn.c
new file mode 100644
index 0000000..05e4572
--- /dev/null
+++ b/benches/fragmentation_churn.c
@@ -0,0 +1,90 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <unistd.h>
+
+static inline uint64_t rdtsc(void) { /* read the x86 time-stamp counter */
+    uint32_t low_half, high_half;
+    __asm__ __volatile__("rdtsc" : "=a"(low_half), "=d"(high_half));
+    return (uint64_t)low_half | ((uint64_t)high_half << 32);
+}
+
+/* Run `iterations` random malloc/free/realloc steps over at most `max_allocs`
+ * live blocks (argv[1], argv[2]) and print cycle/ns latency stats as JSON. */
+int main(int argc, char *argv[]) {
+    int iterations = 50000;
+    int max_allocs = 10000;
+    if (argc > 1) iterations = atoi(argv[1]);
+    if (argc > 2) max_allocs = atoi(argv[2]);
+    /* Non-positive counts would read latencies[0] out of bounds and divide by 0. */
+    if (iterations <= 0 || max_allocs <= 0) {
+        fprintf(stderr, "usage: %s [iterations > 0] [max_allocs > 0]\n", argv[0]);
+        return 1;
+    }
+    void **allocs = calloc((size_t)max_allocs, sizeof(void *));
+    size_t *sizes = calloc((size_t)max_allocs, sizeof(size_t));
+    uint64_t *latencies = malloc((size_t)iterations * sizeof(uint64_t));
+    if (!allocs || !sizes || !latencies) {
+        fprintf(stderr, "fragmentation_churn: out of memory\n");
+        return 1;
+    }
+
+    srand(42);
+    int active = 0;
+    for (int i = 0; i < iterations; i++) {
+        int action = rand() % 100;
+        uint64_t start = rdtsc();
+        if (action < 40 && active < max_allocs) {
+            size_t sz = 256 + (rand() % 65536);
+            void *ptr = malloc(sz);
+            if (ptr) {
+                memset(ptr, rand() & 0xFF, sz);
+                allocs[active] = ptr;
+                sizes[active] = sz;
+                active++;
+            }
+        } else if (action < 80 && active > 0) {
+            int idx = rand() % active;
+            free(allocs[idx]);
+            allocs[idx] = allocs[active - 1];
+            sizes[idx] = sizes[active - 1];
+            active--;
+        } else if (active > 0) {
+            int idx = rand() % active;
+            size_t new_sz = sizes[idx] * (1 + (rand() % 3));
+            void *new_ptr = realloc(allocs[idx], new_sz);
+            if (new_ptr) {
+                allocs[idx] = new_ptr;
+                sizes[idx] = new_sz;
+            }
+        }
+        latencies[i] = rdtsc() - start;
+    }
+
+    for (int i = 0; i < active; i++) free(allocs[i]);
+
+    uint64_t min_lat = latencies[0], max_lat = latencies[0], sum_lat = 0;
+    for (int i = 0; i < iterations; i++) {
+        if (latencies[i] < min_lat) min_lat = latencies[i];
+        if (latencies[i] > max_lat) max_lat = latencies[i];
+        sum_lat += latencies[i];
+    }
+    uint64_t avg_lat = sum_lat / iterations;
+
+    /* Cycle -> ns conversion assumes a 3.5 GHz clock, so ns values are estimates. */
+    double cpu_freq_ghz = 3.5;
+    double avg_ns = (double)avg_lat / (cpu_freq_ghz * 1e9) * 1e9;
+    double min_ns = (double)min_lat / (cpu_freq_ghz * 1e9) * 1e9;
+    double max_ns = (double)max_lat / (cpu_freq_ghz * 1e9) * 1e9;
+    /* uint64_t is cast for %llu: it is not `unsigned long` on every ABI. */
+    printf("{\"benchmark\": \"fragmentation_churn\", \"iterations\": %d, \"max_allocs\": %d, ", iterations, max_allocs);
+    printf("\"latency_cycles\": {\"avg\": %llu, \"min\": %llu, \"max\": %llu}, ",
+           (unsigned long long)avg_lat, (unsigned long long)min_lat, (unsigned long long)max_lat);
+    printf("\"latency_ns\": {\"avg\": %.1f, \"min\": %.1f, \"max\": %.1f}}\n", avg_ns, min_ns, max_ns);
+    free(allocs);
+    free(sizes);
+    free(latencies);
+    return 0;
+}
diff --git a/benches/mixed_workload.c b/benches/mixed_workload.c
new file mode 100644
index 0000000..cb1b2ec
--- /dev/null
+++ b/benches/mixed_workload.c
@@ -0,0 +1,128 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <pthread.h>
+
+static inline uint64_t rdtsc(void) {
+    unsigned int eax_lo, edx_hi; /* RDTSC leaves the low word in EAX and the high word in EDX */
+    __asm__ __volatile__("rdtsc" : "=a"(eax_lo), "=d"(edx_hi));
+    return (uint64_t)eax_lo | ((uint64_t)edx_hi << 32);
+}
+
+typedef struct { /* per-thread benchmark input and result counters */
+    int thread_id; /* worker index assigned by main(); added to the PRNG seed in worker() */
+    int iterations; /* number of loop iterations worker() executes */
+    uint64_t total_cycles; /* sum of per-iteration rdtsc deltas accumulated by worker() */
+    int alloc_count; /* successful malloc() calls in worker()'s allocate branch */
+    int free_count; /* buffers released by worker()'s free branch */
+    int realloc_count; /* successful realloc() calls in worker()'s grow branch */
+} bench_thread_t;
+
+/* Thread entry: performs t->iterations random malloc/free/realloc/copy steps, timing each with rdtsc. */
+void *worker(void *arg) {
+    bench_thread_t *t = (bench_thread_t *)arg;
+    /* rand()/srand() keep a single process-wide state, so per-thread srand() calls clobbered
+     * each other. rand_r() draws from a seed variable that belongs to this thread alone. */
+    unsigned int seed = 42u + (unsigned int)t->thread_id;
+    void *ptrs[1000];
+    size_t sizes[1000];
+    int active = 0; /* live buffers occupy ptrs[0..active-1] */
+    for (int i = 0; i < t->iterations; i++) {
+        int action = rand_r(&seed) % 100;
+        uint64_t start = rdtsc();
+        if (action < 35 && active < 1000) { /* add a fresh buffer and write to all of it */
+            size_t sz = 16 + (rand_r(&seed) % 8192);
+            void *ptr = malloc(sz);
+            if (ptr) {
+                memset(ptr, rand_r(&seed) & 0xFF, sz);
+                ptrs[active] = ptr;
+                sizes[active] = sz;
+                active++;
+                t->alloc_count++;
+            }
+        } else if (action < 70 && active > 0) { /* free a random live buffer */
+            int idx = rand_r(&seed) % active;
+            free(ptrs[idx]);
+            ptrs[idx] = ptrs[active - 1]; /* move the last entry into the hole */
+            sizes[idx] = sizes[active - 1];
+            active--;
+            t->free_count++;
+        } else if (action < 85 && active > 0) { /* double the size of a random live buffer */
+            int idx = rand_r(&seed) % active;
+            size_t new_sz = sizes[idx] * 2;
+            void *new_ptr = realloc(ptrs[idx], new_sz);
+            if (new_ptr) {
+                ptrs[idx] = new_ptr;
+                sizes[idx] = new_sz;
+                t->realloc_count++;
+            }
+        } else if (active > 0) { /* swap a random buffer for a same-sized copy (no counter) */
+            int idx = rand_r(&seed) % active;
+            void *ptr = malloc(sizes[idx]);
+            if (ptr) {
+                memcpy(ptr, ptrs[idx], sizes[idx]);
+                free(ptrs[idx]);
+                ptrs[idx] = ptr;
+            }
+        }
+
+        uint64_t end = rdtsc();
+        t->total_cycles += (end - start);
+    }
+
+    for (int i = 0; i < active; i++) {
+        free(ptrs[i]);
+    }
+
+    return NULL;
+}
+
+/* Spawns `threads` workers of `iterations` steps each, then prints one JSON line of totals. */
+int main(int argc, char *argv[]) {
+    int threads = 8, iterations = 50000;
+    if (argc > 1) threads = atoi(argv[1]);
+    if (argc > 2) iterations = atoi(argv[2]);
+    if (threads < 1 || iterations < 1) { /* atoi() gives 0 for non-numeric text */
+        fprintf(stderr, "usage: %s [threads >= 1] [iterations >= 1]\n", argv[0]);
+        return 1;
+    }
+    bench_thread_t *tdata = calloc(threads, sizeof(bench_thread_t));
+    pthread_t *pth = malloc(threads * sizeof(pthread_t));
+    if (!tdata || !pth) { fprintf(stderr, "out of memory\n"); free(tdata); free(pth); return 1; }
+
+    uint64_t start = rdtsc();
+    for (int i = 0; i < threads; i++) {
+        tdata[i].thread_id = i;
+        tdata[i].iterations = iterations;
+        if (pthread_create(&pth[i], NULL, worker, &tdata[i]) != 0) {
+            fprintf(stderr, "pthread_create failed for thread %d\n", i);
+            threads = i; /* ends the loop; only started workers are joined and reported */
+        }
+    }
+    for (int i = 0; i < threads; i++)
+        pthread_join(pth[i], NULL);
+    uint64_t total_cycles = rdtsc() - start;
+
+    uint64_t total_ops = 0;
+    int total_allocs = 0, total_frees = 0, total_reallocs = 0;
+    for (int i = 0; i < threads; i++) {
+        total_ops += tdata[i].alloc_count + tdata[i].free_count + tdata[i].realloc_count;
+        total_allocs += tdata[i].alloc_count;
+        total_frees += tdata[i].free_count;
+        total_reallocs += tdata[i].realloc_count;
+    }
+
+    double cpu_freq_ghz = 3.5; /* assumed TSC rate, not measured */
+    double elapsed_ns = (double)total_cycles / (cpu_freq_ghz * 1e9) * 1e9;
+    double ops_per_sec = (double)total_ops / (elapsed_ns / 1e9);
+    double avg_ns_per_op = total_ops ? elapsed_ns / total_ops : 0.0; /* keeps inf/nan out of the JSON */
+    printf("{\"benchmark\": \"mixed_workload\", \"threads\": %d, \"iterations_per_thread\": %d, ", threads, iterations);
+    printf("\"total_ops\": %llu, \"allocs\": %d, \"frees\": %d, \"reallocs\": %d, ", (unsigned long long)total_ops, total_allocs, total_frees, total_reallocs);
+    printf("\"throughput_ops_per_sec\": %.0f, \"avg_latency_ns\": %.1f, \"elapsed_ns\": %.0f}\n", ops_per_sec, avg_ns_per_op, elapsed_ns);
+    free(tdata);
+    free(pth);
+    return 0;
+}
diff --git a/benches/realloc_churn.c b/benches/realloc_churn.c
new file mode 100644
index 0000000..fa71598
--- /dev/null
+++ b/benches/realloc_churn.c
@@ -0,0 +1,88 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <unistd.h>
+
+static inline uint64_t rdtsc(void) {
+    unsigned int low_word, high_word; /* outputs bound to EAX ("=a") and EDX ("=d") */
+    __asm__ __volatile__("rdtsc" : "=a"(low_word), "=d"(high_word));
+    return (uint64_t)low_word | ((uint64_t)high_word << 32);
+}
+
+/* Times realloc() growth: each iteration mallocs a random-sized buffer, grows it by grow_factor
+ * and measures only the realloc() call. Prints one JSON line; returns 1 on bad input or failure. */
+int main(int argc, char *argv[]) {
+    int iterations = 100000;
+    int grow_factor = 2;
+    if (argc > 1) iterations = atoi(argv[1]);
+    if (argc > 2) grow_factor = atoi(argv[2]);
+
+    /* iterations < 1 would read latencies[0] out of bounds and divide by zero below;
+     * grow_factor < 1 yields a zero or wrapped new_size, and realloc(ptr, 0) may free ptr and
+     * return NULL, which would make the failure path's free(ptr) a double free. */
+    if (iterations < 1 || grow_factor < 1) {
+        fprintf(stderr, "usage: %s [iterations >= 1] [grow_factor >= 1]\n", argv[0]);
+        return 1;
+    }
+
+    uint64_t *latencies = malloc((size_t)iterations * sizeof(uint64_t));
+    if (!latencies) { fprintf(stderr, "malloc failed\n"); return 1; }
+
+    srand(42); /* fixed seed: every run draws the same size sequence */
+    int inplace_count = 0;
+
+    for (int i = 0; i < iterations; i++) {
+        size_t base_size = 64 + (rand() % 4096);
+        void *ptr = malloc(base_size);
+        if (!ptr) {
+            fprintf(stderr, "malloc failed at iteration %d\n", i);
+            free(latencies);
+            return 1;
+        }
+        memset(ptr, 0xAB, base_size);
+
+        /* Keep the old address as an integer; ptr must not be inspected once realloc moves it. */
+        uintptr_t old_addr = (uintptr_t)ptr;
+        size_t new_size = base_size * grow_factor;
+        uint64_t start = rdtsc();
+        void *new_ptr = realloc(ptr, new_size);
+        uint64_t end = rdtsc();
+
+        if (!new_ptr) {
+            fprintf(stderr, "realloc failed at iteration %d\n", i);
+            free(ptr);
+            free(latencies);
+            return 1;
+        }
+
+        latencies[i] = end - start;
+        if ((uintptr_t)new_ptr == old_addr) inplace_count++;
+        memset(new_ptr, 0xCD, new_size);
+        free(new_ptr);
+    }
+
+    uint64_t min_lat = latencies[0], max_lat = latencies[0], sum_lat = 0;
+    for (int i = 0; i < iterations; i++) {
+        if (latencies[i] < min_lat) min_lat = latencies[i];
+        if (latencies[i] > max_lat) max_lat = latencies[i];
+        sum_lat += latencies[i];
+    }
+    uint64_t avg_lat = sum_lat / iterations;
+
+    double cpu_freq_ghz = 3.5; /* assumed TSC rate, not measured */
+    double avg_ns = (double)avg_lat / (cpu_freq_ghz * 1e9) * 1e9;
+    double min_ns = (double)min_lat / (cpu_freq_ghz * 1e9) * 1e9;
+    double max_ns = (double)max_lat / (cpu_freq_ghz * 1e9) * 1e9;
+    double inplace_pct = (double)inplace_count / iterations * 100.0;
+
+    printf("{\"benchmark\": \"realloc_churn\", \"iterations\": %d, \"grow_factor\": %d, ", iterations, grow_factor);
+    printf("\"latency_cycles\": {\"avg\": %llu, \"min\": %llu, \"max\": %llu}, ",
+           (unsigned long long)avg_lat, (unsigned long long)min_lat, (unsigned long long)max_lat);
+    printf("\"latency_ns\": {\"avg\": %.1f, \"min\": %.1f, \"max\": %.1f}, ", avg_ns, min_ns, max_ns);
+    printf("\"inplace_expansion_pct\": %.1f}\n", inplace_pct);
+
+    free(latencies);
+    return 0;
+}
diff --git a/benches/realloc_large.c b/benches/realloc_large.c
new file mode 100644
index 0000000..b99efcc
--- /dev/null
+++ b/benches/realloc_large.c
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+static inline uint64_t rdtsc(void) {
+    unsigned int tsc_low, tsc_high; /* the two 32-bit halves RDTSC writes to EAX and EDX */
+    __asm__ __volatile__("rdtsc" : "=a"(tsc_low), "=d"(tsc_high));
+    return (uint64_t)tsc_low | ((uint64_t)tsc_high << 32);
+}
+
+/* Times realloc() doubling of large buffers (64 KiB up to just under 320 KiB).
+ * Usage: realloc_large [iterations]. Prints one JSON line; returns 1 on bad input or failure. */
+int main(int argc, char *argv[]) {
+    int iterations = 10000;
+    if (argc > 1) iterations = atoi(argv[1]);
+    /* A count below 1 would read latencies[0] out of bounds and divide by zero further down. */
+    if (iterations < 1) { fprintf(stderr, "usage: %s [iterations >= 1]\n", argv[0]); return 1; }
+
+    uint64_t *latencies = malloc((size_t)iterations * sizeof(uint64_t));
+    if (!latencies) { fprintf(stderr, "malloc failed\n"); return 1; }
+    int inplace = 0;
+    srand(42); /* fixed seed: every run draws the same size sequence */
+
+    for (int i = 0; i < iterations; i++) {
+        size_t base = 65536 + (rand() % 262144);
+        void *ptr = malloc(base);
+        if (!ptr) { fprintf(stderr, "malloc failed\n"); free(latencies); return 1; }
+        memset(ptr, 0xAB, base);
+
+        /* Keep the old address as an integer; ptr must not be inspected once realloc moves it. */
+        uintptr_t old_addr = (uintptr_t)ptr;
+        size_t new_size = base * 2;
+        uint64_t start = rdtsc();
+        void *new_ptr = realloc(ptr, new_size);
+        uint64_t end = rdtsc();
+
+        if (!new_ptr) { fprintf(stderr, "realloc failed\n"); free(ptr); free(latencies); return 1; }
+
+        latencies[i] = end - start;
+        if ((uintptr_t)new_ptr == old_addr) inplace++;
+        memset(new_ptr, 0xCD, new_size);
+        free(new_ptr);
+    }
+
+    uint64_t min_l = latencies[0], max_l = latencies[0], sum_l = 0;
+    for (int i = 0; i < iterations; i++) {
+        if (latencies[i] < min_l) min_l = latencies[i];
+        if (latencies[i] > max_l) max_l = latencies[i];
+        sum_l += latencies[i];
+    }
+
+    double cpu_ghz = 3.5; /* assumed TSC rate, not measured */
+    printf("{\"benchmark\": \"realloc_large\", \"iterations\": %d, ", iterations);
+    printf("\"latency_ns\": {\"avg\": %.1f, \"min\": %.1f, \"max\": %.1f}, ",
+           (double)(sum_l/iterations)/(cpu_ghz*1e9)*1e9,
+           (double)min_l/(cpu_ghz*1e9)*1e9,
+           (double)max_l/(cpu_ghz*1e9)*1e9);
+    printf("\"inplace_pct\": %.1f}\n", (double)inplace/iterations*100.0);
+
+    free(latencies);
+    return 0;
+}