diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index bae9ee8..9367d71 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -1,20 +1,168 @@ -name: Benchmarks +name: Benchmark Matrix on: + push: + branches: [feature/wire-advanced-features] workflow_dispatch: - schedule: - - cron: '0 0 * * 0' # Weekly on Sunday jobs: - full-benchmark: + build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: cachix/install-nix-action@v27 with: nix_path: nixpkgs=channel:nixos-unstable + - name: Cache Nix store + uses: actions/cache@v4 + with: + path: | + ~/.cache/nix + /nix/store + key: nix-${{ runner.os }}-${{ hashFiles('**/Cargo.lock', '**/flake.nix', '**/flake.lock') }} + restore-keys: | + nix-${{ runner.os }}- + - name: Cache Cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + cargo-${{ runner.os }}- - name: Build run: nix build + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: libaethalloc + path: result/lib/*.so + + benchmark-matrix: + needs: build + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + benchmark: + - name: packet_churn + cmd: "/tmp/packet_churn 100000 10000" + metric: throughput_ops_per_sec + unit: ops/s + direction: higher + - name: multithread_churn + cmd: "/tmp/multithread_churn 8 100000" + metric: throughput_ops_per_sec + unit: ops/s + direction: higher + - name: kv_store + cmd: "/tmp/kv_store" + metric: throughput_ops_per_sec + unit: ops/s + direction: higher + - name: producer_consumer + cmd: "/tmp/producer_consumer" + metric: throughput_ops_per_sec + unit: ops/s + direction: higher + - name: realloc_churn + cmd: "/tmp/realloc_churn 100000 2" + metric: latency_ns.avg + unit: ns + direction: lower + - name: realloc_large + cmd: "/tmp/realloc_large 10000" + metric: latency_ns.avg + unit: ns + direction: lower + - 
name: fragmentation_churn
+            cmd: "/tmp/fragmentation_churn 50000 10000"
+            metric: latency_ns.avg
+            unit: ns
+            direction: lower
+          - name: fragmentation_rss
+            cmd: "/tmp/fragmentation"
+            metric: summary.final_rss_kb
+            unit: KB
+            direction: lower
+        run_id: [1, 2, 3, 4, 5]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: libaethalloc
+          path: ./lib
+      - name: Compile benchmarks
+        run: |
+          gcc -O3 -pthread benches/packet_churn.c -o /tmp/packet_churn
+          gcc -O3 -pthread benches/kv_store.c -o /tmp/kv_store
+          gcc -O3 -pthread benches/producer_consumer.c -o /tmp/producer_consumer
+          gcc -O3 -pthread benches/multithread_churn.c -o /tmp/multithread_churn
+          gcc -O3 -pthread benches/fragmentation.c -o /tmp/fragmentation
+          gcc -O3 -pthread benches/realloc_churn.c -o /tmp/realloc_churn
+          gcc -O3 -pthread benches/realloc_large.c -o /tmp/realloc_large
+          gcc -O3 -pthread benches/fragmentation_churn.c -o /tmp/fragmentation_churn
+      - name: Run glibc baseline
+        id: glibc
+        run: |
+          RESULT=$(${{ matrix.benchmark.cmd }} 2>&1)
+          # Multiline step output: "name<<DELIM", the value lines, then "DELIM".
+          echo "result<<EOF" >> $GITHUB_OUTPUT
+          echo "$RESULT" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+      - name: Run aethalloc
+        id: aethalloc
+        run: |
+          LIB=$(realpath lib/*.so)
+          RESULT=$(LD_PRELOAD="$LIB" ${{ matrix.benchmark.cmd }} 2>&1)
+          # Multiline step output: "name<<DELIM", the value lines, then "DELIM".
+          echo "result<<EOF" >> $GITHUB_OUTPUT
+          echo "$RESULT" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+      - name: Compare
+        run: |
+          python3 -c "
+          import json, os
+          glibc = json.loads(os.environ['GLIBC_RESULT'])
+          aeth = json.loads(os.environ['AETH_RESULT'])
+          metric_path = os.environ['METRIC'].split('.')
+          def get_nested(d, path):
+              for key in path:
+                  if isinstance(d, dict):
+                      d = d.get(key, 0)
+                  else:
+                      return 0
+              return d
+          glibc_val = get_nested(glibc, metric_path)
+          aeth_val = get_nested(aeth, metric_path)
+          delta = ((aeth_val - glibc_val) / glibc_val * 100) if glibc_val > 0 else 0
+          direction = os.environ['DIRECTION']
+          if direction == 'higher':
+              emoji = '🟢' if
delta > 0 else '🔴' if delta < 0 else '➖' + else: + emoji = '🟢' if delta < 0 else '🔴' if delta > 0 else '➖' + print(f'{emoji} {os.environ[\"BENCH_NAME\"]} run {os.environ[\"RUN_ID\"]}: glibc={glibc_val:,.2f} | aethalloc={aeth_val:,.2f} | delta={delta:+.1f}%') + " + env: + GLIBC_RESULT: ${{ steps.glibc.outputs.result }} + AETH_RESULT: ${{ steps.aethalloc.outputs.result }} + METRIC: ${{ matrix.benchmark.metric }} + DIRECTION: ${{ matrix.benchmark.direction }} + BENCH_NAME: ${{ matrix.benchmark.name }} + RUN_ID: ${{ matrix.run_id }} + + summarize: + needs: benchmark-matrix + runs-on: ubuntu-latest + if: always() + steps: + - uses: actions/checkout@v4 + - name: Download artifact + uses: actions/download-artifact@v4 + with: + name: libaethalloc + path: ./lib - name: Compile all benchmarks run: | gcc -O3 -pthread benches/packet_churn.c -o /tmp/packet_churn @@ -22,92 +170,132 @@ jobs: gcc -O3 -pthread benches/producer_consumer.c -o /tmp/producer_consumer gcc -O3 -pthread benches/multithread_churn.c -o /tmp/multithread_churn gcc -O3 -pthread benches/fragmentation.c -o /tmp/fragmentation + gcc -O3 -pthread benches/realloc_churn.c -o /tmp/realloc_churn + gcc -O3 -pthread benches/realloc_large.c -o /tmp/realloc_large + gcc -O3 -pthread benches/fragmentation_churn.c -o /tmp/fragmentation_churn gcc -O3 benches/tail_latency.c -o /tmp/tail_latency - gcc -O3 benches/massive_alloc.c -o /tmp/massive_alloc - gcc -O3 benches/corruption_test.c -o /tmp/corruption_test - - name: Run all benchmarks - id: benchmarks + - name: Run full benchmark suite run: | - AETHALLOC="LD_PRELOAD=$(realpath result/lib/*.so)" - - echo "## Benchmark Results" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "**Test System:** GitHub Actions ubuntu-latest" >> $GITHUB_STEP_SUMMARY - echo "**Date:** $(date -I)" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - echo "### Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "| Benchmark | glibc | AethAlloc | Ratio |" 
>> $GITHUB_STEP_SUMMARY - echo "|-----------|-------|-----------|-------|" >> $GITHUB_STEP_SUMMARY - - # Packet Churn - GLIBC_PC=$(/tmp/packet_churn | jq -r '.throughput_ops_per_sec') - AETH_PC=$($AETHALLOC /tmp/packet_churn | jq -r '.throughput_ops_per_sec') - RATIO_PC=$(echo "scale=0; $AETH_PC * 100 / $GLIBC_PC" | bc) - echo "| Packet Churn | ${GLIBC_PC} | ${AETH_PC} | ${RATIO_PC}% |" >> $GITHUB_STEP_SUMMARY - - # KV Store - GLIBC_KV=$(/tmp/kv_store | jq -r '.throughput_ops_per_sec') - AETH_KV=$($AETHALLOC /tmp/kv_store | jq -r '.throughput_ops_per_sec') - RATIO_KV=$(echo "scale=0; $AETH_KV * 100 / $GLIBC_KV" | bc) - echo "| KV Store | ${GLIBC_KV} | ${AETH_KV} | ${RATIO_KV}% |" >> $GITHUB_STEP_SUMMARY - - # Producer-Consumer - GLIBC_PCS=$(/tmp/producer_consumer | jq -r '.throughput_ops_per_sec') - AETH_PCS=$($AETHALLOC /tmp/producer_consumer | jq -r '.throughput_ops_per_sec') - RATIO_PCS=$(echo "scale=0; $AETH_PCS * 100 / $GLIBC_PCS" | bc) - echo "| Producer-Consumer | ${GLIBC_PCS} | ${AETH_PCS} | ${RATIO_PCS}% |" >> $GITHUB_STEP_SUMMARY - - # Multithread - GLIBC_MT=$(/tmp/multithread_churn | jq -r '.throughput_ops_per_sec') - AETH_MT=$($AETHALLOC /tmp/multithread_churn | jq -r '.throughput_ops_per_sec') - RATIO_MT=$(echo "scale=0; $AETH_MT * 100 / $GLIBC_MT" | bc) - echo "| Multithread (8T) | ${GLIBC_MT} | ${AETH_MT} | ${RATIO_MT}% |" >> $GITHUB_STEP_SUMMARY - - # Fragmentation - GLIBC_RSS=$(/tmp/fragmentation | jq -r '.summary.final_rss_kb') - AETH_RSS=$($AETHALLOC /tmp/fragmentation | jq -r '.summary.final_rss_kb') - RATIO_RSS=$(echo "scale=1; $GLIBC_RSS / $AETH_RSS" | bc) - echo "| Fragmentation RSS | ${GLIBC_RSS} KB | ${AETH_RSS} KB | ${RATIO_RSS}x better |" >> $GITHUB_STEP_SUMMARY - - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Tail Latency (8 threads, 50K ops each)" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "| Allocator | P50 | P99 | P99.9 | P99.99 | Max |" >> $GITHUB_STEP_SUMMARY - echo 
"|-----------|-----|-----|-------|--------|-----|" >> $GITHUB_STEP_SUMMARY - - GLIBC_LAT=$(/tmp/tail_latency 8 50000) - AETH_LAT=$($AETHALLOC /tmp/tail_latency 8 50000) - - GLIBC_P50=$(echo "$GLIBC_LAT" | jq -r '.latency_ns.p50') - GLIBC_P99=$(echo "$GLIBC_LAT" | jq -r '.latency_ns.p99') - GLIBC_P999=$(echo "$GLIBC_LAT" | jq -r '.latency_ns["p99.9"]') - GLIBC_P9999=$(echo "$GLIBC_LAT" | jq -r '.latency_ns["p99.99"]') - GLIBC_MAX=$(echo "$GLIBC_LAT" | jq -r '.latency_ns.max') - - AETH_P50=$(echo "$AETH_LAT" | jq -r '.latency_ns.p50') - AETH_P99=$(echo "$AETH_LAT" | jq -r '.latency_ns.p99') - AETH_P999=$(echo "$AETH_LAT" | jq -r '.latency_ns["p99.9"]') - AETH_P9999=$(echo "$AETH_LAT" | jq -r '.latency_ns["p99.99"]') - AETH_MAX=$(echo "$AETH_LAT" | jq -r '.latency_ns.max') - - echo "| glibc | ${GLIBC_P50}ns | ${GLIBC_P99}ns | ${GLIBC_P999}ns | ${GLIBC_P9999}ns | ${GLIBC_MAX}ns |" >> $GITHUB_STEP_SUMMARY - echo "| AethAlloc | ${AETH_P50}ns | ${AETH_P99}ns | ${AETH_P999}ns | ${AETH_P9999}ns | ${AETH_MAX}ns |" >> $GITHUB_STEP_SUMMARY - - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Massive Allocations" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - echo "=== glibc ===" >> $GITHUB_STEP_SUMMARY - /tmp/massive_alloc >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "=== AethAlloc ===" >> $GITHUB_STEP_SUMMARY - $AETHALLOC /tmp/massive_alloc >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Corruption Test" >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - $AETHALLOC /tmp/corruption_test >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY + python3 << 'PYEOF' + import subprocess, json, statistics, os + + LIB_PATH = subprocess.check_output("realpath lib/*.so", shell=True).decode().strip() + + benchmarks = [ + ("packet_churn", "/tmp/packet_churn 100000 10000", "throughput_ops_per_sec", "ops/s", "higher"), + ("multithread_churn", 
"/tmp/multithread_churn 8 100000", "throughput_ops_per_sec", "ops/s", "higher"),
+              ("kv_store", "/tmp/kv_store", "throughput_ops_per_sec", "ops/s", "higher"),
+              ("producer_consumer", "/tmp/producer_consumer", "throughput_ops_per_sec", "ops/s", "higher"),
+              ("realloc_churn", "/tmp/realloc_churn 100000 2", "latency_ns.avg", "ns", "lower"),
+              ("realloc_large", "/tmp/realloc_large 10000", "latency_ns.avg", "ns", "lower"),
+              ("fragmentation_churn", "/tmp/fragmentation_churn 50000 10000", "latency_ns.avg", "ns", "lower"),
+              ("fragmentation_rss", "/tmp/fragmentation", "summary.final_rss_kb", "KB", "lower"),
+          ]
+
+          runs = 5
+
+          def run_metric(cmd, metric, preload=None):
+              """Run one benchmark process and return the value at the dotted `metric` path.
+
+              Raises if the process exits non-zero, times out, prints something that is
+              not JSON, or the metric path is absent, so the caller records a failed run
+              instead of a misleading 0.
+              """
+              full_cmd = f"LD_PRELOAD={preload} {cmd}" if preload else cmd
+              out = subprocess.check_output(full_cmd, shell=True, stderr=subprocess.STDOUT, timeout=120).decode()
+              val = json.loads(out.strip())
+              for key in metric.split("."):
+                  val = val[key]
+              return val
+
+          def stats(vals):
+              """Return (mean, sample stdev) of `vals`; each is 0 when it cannot be computed."""
+              mean = statistics.mean(vals) if vals else 0
+              stdev = statistics.stdev(vals) if len(vals) > 1 else 0
+              return mean, stdev
+
+          # Raw per-run values, filled in the same pass that builds the summary so the
+          # uploaded JSON describes exactly the runs the summary reports.
+          raw = {}
+          summary = "# Benchmark Results\n\n"
+          summary += f"**System:** GitHub Actions ubuntu-latest ({subprocess.check_output('nproc', shell=True).decode().strip()} cores)\n\n"
+          summary += f"**Runs per benchmark:** {runs}\n\n"
+          summary += "---\n\n"
+
+          for bench_name, cmd, metric, unit, direction in benchmarks:
+              glibc_vals = []
+              aeth_vals = []
+              for i in range(runs):
+                  # glibc first, then aethalloc, alternating within each iteration.
+                  for label, preload, vals in (("glibc", None, glibc_vals), ("aethalloc", LIB_PATH, aeth_vals)):
+                      try:
+                          vals.append(run_metric(cmd, metric, preload))
+                      except Exception as e:
+                          print(f"WARNING: {label} {bench_name} run {i+1} failed: {e}")
+
+              g_mean, g_stdev = stats(glibc_vals)
+              a_mean, a_stdev = stats(aeth_vals)
+
+              if glibc_vals or aeth_vals:
+                  raw[bench_name] = {
+                      "glibc": {"mean": g_mean, "stdev": g_stdev, "runs": glibc_vals},
+                      "aethalloc": {"mean": a_mean, "stdev": a_stdev, "runs": aeth_vals},
+                  }
+
+              if glibc_vals and aeth_vals:
+                  delta = ((a_mean - g_mean) / g_mean * 100) if g_mean > 0 else 0
+
+                  if direction == "higher":
+                      emoji = "🟢" if delta > 2 else "🔴" if delta < -2 else "➖"
+                  else:
+                      emoji = "🟢" if delta < -2 else "🔴" if delta > 2 else "➖"
+
+                  summary += f"{emoji} **{bench_name}**\n"
+                  summary += f"- glibc: {g_mean:,.0f} ± {g_stdev:,.0f} {unit}\n"
+                  summary += f"- aethalloc: {a_mean:,.0f} ± {a_stdev:,.0f} {unit}\n"
+                  summary += f"- **delta: {delta:+.1f}%**\n\n"
+              elif glibc_vals or aeth_vals:
+                  # One side produced no data. A delta against a mean of 0 would show a
+                  # crashed allocator as a -100% "improvement" on lower-is-better metrics.
+                  failed = "aethalloc" if glibc_vals else "glibc"
+                  summary += f"⚠️ **{bench_name}**\n"
+                  summary += f"- ⚠️ All {failed} runs failed; no delta computed\n\n"
+              else:
+                  summary += f"➖ **{bench_name}**\n"
+                  summary += f"- ⚠️ All runs failed (benchmark may not work on this platform)\n\n"
+
+          # Tail latency
+          summary += "---\n\n## Tail Latency (8 threads, 50K ops)\n\n"
+          summary += "| Allocator | P50 | P99 | P99.9 | P99.99 | Max |\n"
+          summary += "|-----------|-----|-----|-------|--------|-----|\n"
+
+          for label, pre in [("glibc", ""), ("AethAlloc", f"LD_PRELOAD={LIB_PATH}")]:
+              try:
+                  out = subprocess.check_output(f"{pre} /tmp/tail_latency 8 50000", shell=True, stderr=subprocess.STDOUT, timeout=120).decode()
+                  d = json.loads(out.strip())
+                  lat = d.get("latency_ns", {})
+                  summary += f"| {label} | {lat.get('p50', 0):,.0f}ns | {lat.get('p99', 0):,.0f}ns | {lat.get('p99.9', 0):,.0f}ns | {lat.get('p99.99', 0):,.0f}ns | {lat.get('max', 0):,.0f}ns |\n"
+              except Exception as e:
+                  summary += f"| {label} | ⚠️ | ⚠️ | ⚠️ | ⚠️ | ⚠️ |\n"
+                  print(f"WARNING: {label} tail_latency failed: {e}")
+
+          with open(os.environ["GITHUB_STEP_SUMMARY"], "w") as f:
+              f.write(summary)
+
+          # Also save raw JSON (collected above; the benchmarks are not re-run)
+          with open("benchmark-results.json", "w") as f:
+              json.dump(raw, f, indent=2)
+          PYEOF
+      - name: Upload results
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results
+          path: benchmark-results.json
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 256a0a1..3495a33 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@ name: CI

 on:
   push:
-    branches: [main]
+    branches: [main, feature/*]
   pull_request:
     branches: [main]
   workflow_dispatch:
@@ -65,6 +65,9 @@ jobs:
           gcc -O3 -pthread benches/producer_consumer.c -o /tmp/producer_consumer
           gcc -O3 -pthread benches/multithread_churn.c -o /tmp/multithread_churn
           gcc -O3 -pthread benches/fragmentation.c -o /tmp/fragmentation
+          gcc -O3 -pthread benches/realloc_churn.c -o /tmp/realloc_churn
+          gcc -O3 -pthread benches/realloc_large.c -o /tmp/realloc_large
+          gcc -O3 -pthread benches/fragmentation_churn.c -o /tmp/fragmentation_churn
       - name: Packet Churn
         run: |
           echo "GLIBC=$(/tmp/packet_churn | jq -r '.throughput_ops_per_sec')" >> $GITHUB_ENV
@@ -85,6 +88,18 @@ jobs:
         run: |
           echo "GLIBC_RSS=$(/tmp/fragmentation | jq -r '.summary.final_rss_kb')" >> $GITHUB_ENV
           echo "AETHALLOC_RSS=$(LD_PRELOAD=$(realpath lib/*.so) /tmp/fragmentation | jq -r '.summary.final_rss_kb')" >> $GITHUB_ENV
+      - name: Realloc Churn
+        run: |
+          echo "GLIBC_REALLOC=$(/tmp/realloc_churn 100000 2 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV
+
echo "AETHALLOC_REALLOC=$(LD_PRELOAD=$(realpath lib/*.so) /tmp/realloc_churn 100000 2 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV + - name: Realloc Large + run: | + echo "GLIBC_REALLOC_LARGE=$(/tmp/realloc_large 10000 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV + echo "AETHALLOC_REALLOC_LARGE=$(LD_PRELOAD=$(realpath lib/*.so) /tmp/realloc_large 10000 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV + - name: Fragmentation Churn + run: | + echo "GLIBC_FRAG_CHURN=$(/tmp/fragmentation_churn 50000 10000 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV + echo "AETHALLOC_FRAG_CHURN=$(LD_PRELOAD=$(realpath lib/*.so) /tmp/fragmentation_churn 50000 10000 | jq -r '.latency_ns.avg')" >> $GITHUB_ENV stress-tests: runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index 86a65a1..8625d4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,19 +4,38 @@ version = 4 [[package]] name = "aethalloc-abi" -version = "0.2.3" +version = "0.2.4" dependencies = [ + "aethalloc-amo", "aethalloc-core", + "aethalloc-hess", + "aethalloc-vmpc", + "libc", +] + +[[package]] +name = "aethalloc-amo" +version = "0.2.4" +dependencies = [ + "aethalloc-hess", + "aethalloc-vmpc", + "criterion", "libc", ] [[package]] name = "aethalloc-core" -version = "0.2.3" +version = "0.2.4" dependencies = [ + "aethalloc-hess", + "aethalloc-vmpc", "libc", ] +[[package]] +name = "aethalloc-hess" +version = "0.2.4" + [[package]] name = "aethalloc-metrics" version = "0.1.0" @@ -25,12 +44,236 @@ dependencies = [ "libloading", ] +[[package]] +name = "aethalloc-vmpc" +version = "0.2.4" +dependencies = [ + "libc", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 
+ +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "js-sys" +version = "0.3.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "797146bb2677299a1eb6b7b50a890f4c361b29ef967addf5b2fa45dae1bb6d7d" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "libc" version = "0.2.183" @@ -47,8 +290,324 @@ dependencies = [ "windows-link", ] +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "num-traits" 
+version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.116" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dc0882f7b5bb01ae8c5215a1230832694481c1a4be062fd410e12ea3da5b631" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.116" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75973d3066e01d035dbedaad2864c398df42f8dd7b1ea057c35b8407c015b537" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.116" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91af5e4be765819e0bcfee7322c14374dc821e35e72fa663a830bbc7dc199eac" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.116" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9bf0406a78f02f336bf1e451799cca198e8acde4ffa278f0fb20487b150a633" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "749466a37ee189057f54748b200186b59a03417a117267baf3fd89cecc9fb837" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = 
"0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/aethalloc-abi/Cargo.toml b/aethalloc-abi/Cargo.toml index 261ba60..596b116 100644 --- a/aethalloc-abi/Cargo.toml +++ b/aethalloc-abi/Cargo.toml @@ -12,7 +12,12 @@ default = ["magazine-caching"] magazine-caching = ["aethalloc-core/magazine"] simple-cache = [] metrics = [] +vmpc = ["aethalloc-core/vmpc", "aethalloc-amo/vmpc", "dep:aethalloc-vmpc"] +amo = [] [dependencies] -aethalloc-core = { path = "../aethalloc-core" } +aethalloc-core = { path = "../aethalloc-core", features = ["hess"] } +aethalloc-amo = { path = "../aethalloc-amo", features = ["std", "hess"] } +aethalloc-hess = { path = "../aethalloc-hess" } +aethalloc-vmpc = { path = "../aethalloc-vmpc", features = ["std"], optional = true } libc = { version = "0.2", default-features = false } diff --git a/aethalloc-abi/src/global.rs b/aethalloc-abi/src/global.rs index 70b2739..01d8c78 100644 --- a/aethalloc-abi/src/global.rs +++ b/aethalloc-abi/src/global.rs @@ -6,59 +6,158 @@ use alloc::alloc::{GlobalAlloc, Layout}; use core::ptr::NonNull; -use core::sync::atomic::{AtomicU64, Ordering}; +#[cfg(feature = "amo")] +use core::sync::atomic::{AtomicBool, Ordering}; + +#[cfg(all(feature = "metrics", feature = "amo"))] +use aethalloc_amo::command::StatsReportPayload; +#[cfg(feature = "amo")] +use aethalloc_amo::command::{FreeBlockPayload, RingCommand, RingEntry, RingPayload}; +#[cfg(feature = "amo")] +use aethalloc_amo::ring_buffer::RingBuffer; use aethalloc_core::page::PageAllocator; use aethalloc_core::size_class::round_up_pow2; #[cfg(feature = "magazine-caching")] use 
aethalloc_core::magazine::{GlobalMagazinePools, Magazine, MetadataAllocator}; -const PAGE_SIZE: usize = aethalloc_core::page::PAGE_SIZE; +#[cfg(feature = "metrics")] +use core::sync::atomic::AtomicU64; + +/// AMO ring buffer capacity (power of 2) +#[cfg(feature = "amo")] +const AMO_RING_CAPACITY: usize = 1024; + +/// Static ring buffer for async metadata offloading +#[cfg(feature = "amo")] +static AMO_RING: RingBuffer = RingBuffer::new(); + +/// Track if support core thread has been spawned +#[cfg(feature = "amo")] +static SUPPORT_CORE_STARTED: AtomicBool = AtomicBool::new(false); + +/// Start the support core worker thread (called once) +#[cfg(feature = "amo")] +pub fn ensure_support_core() { + if !SUPPORT_CORE_STARTED.load(Ordering::Acquire) { + SUPPORT_CORE_STARTED.store(true, Ordering::Release); + use aethalloc_amo::support_core::spawn_support_core; + unsafe { + spawn_support_core(&AMO_RING); + } + } +} + +/// No-op when AMO is disabled +#[cfg(not(feature = "amo"))] +pub fn ensure_support_core() {} + +/// Push a FreeBlock command to the AMO ring buffer +#[cfg(feature = "amo")] +#[inline] +unsafe fn amo_push_free_block(ptr: *mut u8, size: usize, size_class: u8) { + let payload = RingPayload { + free_block: FreeBlockPayload { + ptr, + size, + size_class, + }, + }; + let entry = RingEntry::new(RingCommand::FreeBlock, payload); + let _ = AMO_RING.try_push(entry); +} + +/// No-op when AMO is disabled +#[cfg(not(feature = "amo"))] +#[inline] +unsafe fn amo_push_free_block(_ptr: *mut u8, _size: usize, _size_class: u8) {} + +/// Push a batch of free blocks to the AMO ring buffer +#[cfg(feature = "amo")] +#[inline] +#[allow(dead_code)] +unsafe fn amo_push_free_batch(ptr: *mut u8, count: u32) { + let payload = RingPayload { + free_block: FreeBlockPayload { + ptr, + size: 0, + size_class: count as u8, + }, + }; + let entry = RingEntry::new(RingCommand::FreeBlock, payload); + let _ = AMO_RING.try_push(entry); +} + +/// Push a StatsReport command to the AMO ring buffer 
+#[cfg(all(feature = "amo", feature = "metrics"))] +#[inline] +fn amo_push_stats(thread_id: u64, allocs: u64, frees: u64) { + let payload = RingPayload { + stats: StatsReportPayload { + thread_id, + allocs, + frees, + }, + }; + let entry = RingEntry::new(RingCommand::StatsReport, payload); + let _ = AMO_RING.try_push(entry); +} + +/// No-op when AMO or metrics is disabled +#[cfg(not(all(feature = "amo", feature = "metrics")))] +#[inline] +#[allow(dead_code)] +fn amo_push_stats(_thread_id: u64, _allocs: u64, _frees: u64) {} + +pub const PAGE_SIZE: usize = aethalloc_core::page::PAGE_SIZE; const PAGE_MASK: usize = !(PAGE_SIZE - 1); -const MAX_CACHE_SIZE: usize = 65536; +pub const MAX_CACHE_SIZE: usize = 65536; const NUM_SIZE_CLASSES: usize = 14; +#[cfg(feature = "metrics")] const METRICS_FLUSH_THRESHOLD: usize = 4096; #[cfg(not(feature = "magazine-caching"))] -const MAX_FREE_LIST_LENGTH: usize = 4096; +const MAX_FREE_LIST_LENGTH: usize = 8192; #[cfg(not(feature = "magazine-caching"))] -const GLOBAL_FREE_BATCH: usize = 128; +const GLOBAL_FREE_BATCH: usize = 256; -const MAGIC: u32 = 0xA7E8A110; +pub const MAGIC: u32 = 0xA7E8A110; #[repr(C)] -struct PageHeader { - magic: u32, - num_pages: u32, - requested_size: usize, +pub struct PageHeader { + pub magic: u32, + pub num_pages: u32, + pub requested_size: usize, + pub tag: aethalloc_core::Tag, } -const PAGE_HEADER_SIZE: usize = core::mem::size_of::(); -const CACHE_HEADER_SIZE: usize = 16; -const LARGE_HEADER_SIZE: usize = 16; -const LARGE_MAGIC: u32 = 0xA7E8A11F; +pub const PAGE_HEADER_SIZE: usize = core::mem::size_of::(); +pub const CACHE_HEADER_SIZE: usize = 16; +pub const LARGE_HEADER_SIZE: usize = 16; +pub const LARGE_MAGIC: u32 = 0xA7E8A11F; #[repr(C)] -struct LargeAllocHeader { - magic: u32, - base_ptr: *mut u8, +pub struct LargeAllocHeader { + pub magic: u32, + pub base_ptr: *mut u8, } #[cfg(not(feature = "magazine-caching"))] struct GlobalFreeList { - head: AtomicPtr, + head: core::sync::atomic::AtomicPtr, } 
#[cfg(not(feature = "magazine-caching"))] impl GlobalFreeList { const fn new() -> Self { Self { - head: AtomicPtr::new(core::ptr::null_mut()), + head: core::sync::atomic::AtomicPtr::new(core::ptr::null_mut()), } } #[inline] unsafe fn push_batch(&self, batch_head: *mut u8, batch_tail: *mut u8) { + use core::sync::atomic::Ordering; let mut current = self.head.load(Ordering::Relaxed); loop { core::ptr::write(batch_tail as *mut *mut u8, current); @@ -76,6 +175,7 @@ impl GlobalFreeList { #[inline] unsafe fn pop(&self) -> Option<*mut u8> { + use core::sync::atomic::Ordering; let mut current = self.head.load(Ordering::Relaxed); loop { if current.is_null() { @@ -136,6 +236,7 @@ static GLOBAL_FREE_LISTS: [GlobalFreeList; NUM_SIZE_CLASSES] = [ GlobalFreeList::new(), ]; +#[cfg(feature = "metrics")] pub static GLOBAL_METRICS: GlobalMetrics = GlobalMetrics::new(); #[cfg(feature = "magazine-caching")] @@ -144,6 +245,7 @@ pub static GLOBAL_MAGAZINES: GlobalMagazinePools = GlobalMagazinePools::new(); #[cfg(feature = "magazine-caching")] pub static METADATA_ALLOCATOR: MetadataAllocator = MetadataAllocator::new(); +#[cfg(feature = "metrics")] pub struct GlobalMetrics { pub allocs: AtomicU64, pub frees: AtomicU64, @@ -152,6 +254,7 @@ pub struct GlobalMetrics { pub direct_allocs: AtomicU64, } +#[cfg(feature = "metrics")] impl GlobalMetrics { const fn new() -> Self { Self { @@ -174,9 +277,9 @@ impl GlobalMetrics { } } +#[cfg(feature = "metrics")] #[derive(Debug, Clone, Copy, Default)] #[repr(C)] -#[allow(dead_code)] pub struct MetricsSnapshot { pub allocs: u64, pub frees: u64, @@ -185,6 +288,7 @@ pub struct MetricsSnapshot { pub direct_allocs: u64, } +#[cfg(feature = "metrics")] struct ThreadMetrics { allocs: usize, frees: usize, @@ -193,6 +297,10 @@ struct ThreadMetrics { direct_allocs: usize, } +#[cfg(not(feature = "metrics"))] +struct ThreadMetrics; + +#[cfg(feature = "metrics")] impl ThreadMetrics { const fn new() -> Self { Self { @@ -222,6 +330,8 @@ impl ThreadMetrics { 
GLOBAL_METRICS .direct_allocs .fetch_add(self.direct_allocs as u64, Ordering::Relaxed); + let thread_id = unsafe { libc::pthread_self() as u64 }; + amo_push_stats(thread_id, self.allocs as u64, self.frees as u64); self.allocs = 0; self.frees = 0; self.cache_hits = 0; @@ -229,26 +339,76 @@ impl ThreadMetrics { self.direct_allocs = 0; } } + + #[inline] + fn record_alloc(&mut self) { + self.allocs += 1; + } + #[inline] + fn record_free(&mut self) { + self.frees += 1; + } + #[inline] + fn record_cache_hit(&mut self) { + self.cache_hits += 1; + } + #[inline] + fn record_cache_miss(&mut self) { + self.cache_misses += 1; + } + #[inline] + fn record_direct_alloc(&mut self) { + self.direct_allocs += 1; + } } +#[cfg(not(feature = "metrics"))] +impl ThreadMetrics { + const fn new() -> Self { + Self + } + #[inline] + fn maybe_flush(&mut self) {} + #[inline] + fn record_alloc(&mut self) {} + #[inline] + fn record_free(&mut self) {} + #[inline] + fn record_cache_hit(&mut self) {} + #[inline] + fn record_cache_miss(&mut self) {} + #[inline] + fn record_direct_alloc(&mut self) {} +} + +/// Convert a size to a size class index (0-12 for 16B-64KB) +/// +/// Uses a 64-entry lookup table for small sizes to avoid branching +/// and bit math on the most common allocation sizes. 
+/// Maps: 16→0, 32→1, 64→2, 128→3, 256→4, 512→5, 1024→6, 2048→7, +/// 4096→8, 8192→9, 16384→10, 32768→11, 65536→12 #[inline] fn size_to_class(size: usize) -> Option { - let rounded = round_up_pow2(size).max(16); - match rounded { - 16 => Some(0), - 32 => Some(1), - 64 => Some(2), - 128 => Some(3), - 256 => Some(4), - 512 => Some(5), - 1024 => Some(6), - 2048 => Some(7), - 4096 => Some(8), - 8192 => Some(9), - 16384 => Some(10), - 32768 => Some(11), - 65536 => Some(12), - _ => None, + if size == 0 || size > 65536 { + return None; + } + const LUT: [u8; 64] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, + ]; + if size <= 64 { + return Some(LUT[size - 1] as usize); + } + let v = if size < 16 { 16 } else { size }; + let rounded = 1usize << (usize::BITS - (v - 1).leading_zeros()); + let class = 63usize + .wrapping_sub(rounded.leading_zeros() as usize) + .wrapping_sub(4); + if class <= 12 { + Some(class) + } else { + None } } @@ -334,7 +494,7 @@ impl AethAlloc { } #[inline] - fn align_up(addr: usize, align: usize) -> usize { + pub fn align_up(addr: usize, align: usize) -> usize { (addr + align - 1) & !(align - 1) } @@ -354,7 +514,6 @@ unsafe impl GlobalAlloc for AethAlloc { unsafe fn alloc(&self, layout: Layout) -> *mut u8 { let size = layout.size(); let align = layout.align(); - if size == 0 { return core::ptr::null_mut(); } @@ -362,22 +521,18 @@ unsafe impl GlobalAlloc for AethAlloc { if size <= MAX_CACHE_SIZE && align <= 8 { let cache = get_thread_cache(); let cache_size = round_up_pow2(size).max(16); - if let Some(class) = size_to_class(cache_size) { let head = cache.heads[class]; - if !head.is_null() { let next = core::ptr::read(head as *mut *mut u8); cache.heads[class] = next; cache.counts[class] -= 1; - cache.metrics.cache_hits += 1; - cache.metrics.allocs += 1; + cache.metrics.record_cache_hit(); + 
cache.metrics.record_alloc(); cache.metrics.maybe_flush(); core::ptr::write(head as *mut usize, size); return head.add(CACHE_HEADER_SIZE); } - - // Try global free list before allocating new pages (only if non-empty) if !GLOBAL_FREE_LISTS[class] .head .load(Ordering::Relaxed) @@ -394,20 +549,17 @@ unsafe impl GlobalAlloc for AethAlloc { let next = core::ptr::read(block as *mut *mut u8); cache.heads[class] = next; cache.counts[class] -= 1; - cache.metrics.cache_hits += 1; - cache.metrics.allocs += 1; + cache.metrics.record_cache_hit(); + cache.metrics.record_alloc(); cache.metrics.maybe_flush(); core::ptr::write(block as *mut usize, size); return block.add(CACHE_HEADER_SIZE); } } - - cache.metrics.cache_misses += 1; - cache.metrics.allocs += 1; - + cache.metrics.record_cache_miss(); + cache.metrics.record_alloc(); let block_size = cache_size + CACHE_HEADER_SIZE; let blocks_per_page = PAGE_SIZE / block_size; - if blocks_per_page > 1 { if let Some(base) = PageAllocator::alloc(1) { let base_ptr = base.as_ptr(); @@ -422,7 +574,6 @@ unsafe impl GlobalAlloc for AethAlloc { return base_ptr.add(CACHE_HEADER_SIZE); } } - let pages = block_size.div_ceil(PAGE_SIZE).max(1); if let Some(base) = PageAllocator::alloc(pages) { let base_ptr = base.as_ptr(); @@ -433,30 +584,24 @@ unsafe impl GlobalAlloc for AethAlloc { return core::ptr::null_mut(); } } - let cache = get_thread_cache(); - cache.metrics.direct_allocs += 1; - cache.metrics.allocs += 1; + cache.metrics.record_direct_alloc(); + cache.metrics.record_alloc(); cache.metrics.maybe_flush(); - let min_size = PAGE_HEADER_SIZE + LARGE_HEADER_SIZE + size + align; let pages = min_size.div_ceil(PAGE_SIZE).max(1); - match PageAllocator::alloc(pages) { Some(base) => { let base_addr = base.as_ptr() as usize; - let page_header = PageHeader { magic: MAGIC, num_pages: pages as u32, requested_size: size, + tag: 0, }; - let header_ptr = base.as_ptr() as *mut PageHeader; - core::ptr::write(header_ptr, page_header); - + 
core::ptr::write(base.as_ptr() as *mut PageHeader, page_header); let user_addr = Self::align_up(base_addr + PAGE_HEADER_SIZE + LARGE_HEADER_SIZE, align); - let large_header = LargeAllocHeader { magic: LARGE_MAGIC, base_ptr: base.as_ptr(), @@ -465,7 +610,6 @@ unsafe impl GlobalAlloc for AethAlloc { (user_addr - LARGE_HEADER_SIZE) as *mut LargeAllocHeader, large_header, ); - user_addr as *mut u8 } None => core::ptr::null_mut(), @@ -476,63 +620,64 @@ unsafe impl GlobalAlloc for AethAlloc { if ptr.is_null() { return; } - - // Check for large allocation first (LargeAllocHeader immediately before ptr) let large_header_addr = ptr.sub(LARGE_HEADER_SIZE) as *const LargeAllocHeader; if core::ptr::read(large_header_addr).magic == LARGE_MAGIC { let base_ptr = core::ptr::read(large_header_addr).base_ptr; let page_header = core::ptr::read(base_ptr as *const PageHeader); - if page_header.magic == MAGIC && page_header.num_pages > 0 { - PageAllocator::dealloc( - NonNull::new_unchecked(base_ptr), - page_header.num_pages as usize, - ); + let size = page_header.num_pages as usize * PAGE_SIZE; + let base_ptr_nn = NonNull::new_unchecked(base_ptr); + #[cfg(feature = "vmpc")] + { + use aethalloc_core::try_compact_region; + let _compacted = try_compact_region(base_ptr_nn, size); + } + #[cfg(not(feature = "vmpc"))] + { + let _ = (base_ptr_nn, size); + } + PageAllocator::dealloc(base_ptr_nn, page_header.num_pages as usize); } - let cache = get_thread_cache(); - cache.metrics.frees += 1; + cache.metrics.record_free(); cache.metrics.maybe_flush(); return; } - let size_ptr = ptr.sub(CACHE_HEADER_SIZE) as *mut usize; let maybe_size = core::ptr::read(size_ptr); - if maybe_size > 0 && maybe_size <= MAX_CACHE_SIZE { let potential_header = size_ptr as *mut PageHeader; if core::ptr::read(potential_header).magic != MAGIC { let cache = get_thread_cache(); let cache_size = round_up_pow2(maybe_size).max(16); - if let Some(class) = size_to_class(cache_size) { let head_ptr = size_ptr as *mut *mut u8; 
core::ptr::write(head_ptr, cache.heads[class]); cache.heads[class] = size_ptr as *mut u8; cache.counts[class] += 1; - cache.metrics.frees += 1; + cache.metrics.record_free(); cache.metrics.maybe_flush(); - - // Anti-hoarding: flush excess to global free list with O(1) batch push if cache.counts[class] >= MAX_FREE_LIST_LENGTH { let flush_count = cache.counts[class] / 2; - + // Only flush in batches of GLOBAL_FREE_BATCH to reduce CAS overhead + let flush_count = (flush_count / GLOBAL_FREE_BATCH) * GLOBAL_FREE_BATCH; + if flush_count < GLOBAL_FREE_BATCH { + cache.metrics.record_free(); + cache.metrics.maybe_flush(); + return; + } let batch_head = cache.heads[class]; let mut batch_tail = batch_head; let mut walked = 1usize; - while walked < flush_count && !batch_tail.is_null() { batch_tail = core::ptr::read(batch_tail as *mut *mut u8); walked += 1; } - if !batch_tail.is_null() { let new_local_head = core::ptr::read(batch_tail as *mut *mut u8); core::ptr::write(batch_tail as *mut *mut u8, core::ptr::null_mut()); - cache.heads[class] = new_local_head; cache.counts[class] -= flush_count; - GLOBAL_FREE_LISTS[class].push_batch(batch_head, batch_tail); } } @@ -540,18 +685,18 @@ unsafe impl GlobalAlloc for AethAlloc { } } } - let header = Self::page_header_from_ptr(ptr); let header_ref = core::ptr::read(header); - if header_ref.magic == MAGIC && header_ref.num_pages > 0 { let base = NonNull::new_unchecked(header as *mut u8); PageAllocator::dealloc(base, header_ref.num_pages as usize); } - let cache = get_thread_cache(); - cache.metrics.frees += 1; + cache.metrics.record_free(); cache.metrics.maybe_flush(); + let alloc_size = get_alloc_size(ptr); + let size_class = size_to_class(round_up_pow2(alloc_size).max(16)).unwrap_or(0) as u8; + amo_push_free_block(ptr, alloc_size, size_class); } } @@ -564,7 +709,6 @@ unsafe impl GlobalAlloc for AethAlloc { unsafe fn alloc(&self, layout: Layout) -> *mut u8 { let size = layout.size(); let align = layout.align(); - if size == 0 { return 
core::ptr::null_mut(); } @@ -572,54 +716,41 @@ unsafe impl GlobalAlloc for AethAlloc { if size <= MAX_CACHE_SIZE && align <= 8 { let cache = get_thread_cache(); let cache_size = round_up_pow2(size).max(16); - if let Some(class) = size_to_class(cache_size) { - // Try local alloc magazine if let Some(block) = cache.alloc_mags[class].pop() { - cache.metrics.cache_hits += 1; - cache.metrics.allocs += 1; + cache.metrics.record_cache_hit(); + cache.metrics.record_alloc(); cache.metrics.maybe_flush(); core::ptr::write(block as *mut usize, size); return block.add(CACHE_HEADER_SIZE); } - - // Try swap with local free_mag for reuse if !cache.free_mags[class].is_empty() { core::mem::swap(&mut cache.alloc_mags[class], &mut cache.free_mags[class]); if let Some(block) = cache.alloc_mags[class].pop() { - cache.metrics.cache_hits += 1; - cache.metrics.allocs += 1; + cache.metrics.record_cache_hit(); + cache.metrics.record_alloc(); cache.metrics.maybe_flush(); core::ptr::write(block as *mut usize, size); return block.add(CACHE_HEADER_SIZE); } } - - // Try to get a full magazine from global pool if let Some(node_ptr) = GLOBAL_MAGAZINES.get(class).pop_full() { let node = &mut *node_ptr; core::mem::swap(&mut cache.alloc_mags[class], &mut node.magazine); node.magazine.clear(); - unsafe { - GLOBAL_MAGAZINES.get(class).push_empty(node_ptr); - } - + GLOBAL_MAGAZINES.get(class).push_empty(node_ptr); if let Some(block) = cache.alloc_mags[class].pop() { - cache.metrics.cache_hits += 1; - cache.metrics.allocs += 1; + cache.metrics.record_cache_hit(); + cache.metrics.record_alloc(); cache.metrics.maybe_flush(); core::ptr::write(block as *mut usize, size); return block.add(CACHE_HEADER_SIZE); } } - - // Cache miss - allocate fresh blocks - cache.metrics.cache_misses += 1; - cache.metrics.allocs += 1; - + cache.metrics.record_cache_miss(); + cache.metrics.record_alloc(); let block_size = cache_size + CACHE_HEADER_SIZE; let blocks_per_page = PAGE_SIZE / block_size; - if blocks_per_page > 1 { if 
let Some(base) = PageAllocator::alloc(1) { let base_ptr = base.as_ptr(); @@ -636,7 +767,6 @@ unsafe impl GlobalAlloc for AethAlloc { return base_ptr.add(CACHE_HEADER_SIZE); } } - let pages = block_size.div_ceil(PAGE_SIZE).max(1); if let Some(base) = PageAllocator::alloc(pages) { let base_ptr = base.as_ptr(); @@ -647,30 +777,24 @@ unsafe impl GlobalAlloc for AethAlloc { return core::ptr::null_mut(); } } - let cache = get_thread_cache(); - cache.metrics.direct_allocs += 1; - cache.metrics.allocs += 1; + cache.metrics.record_direct_alloc(); + cache.metrics.record_alloc(); cache.metrics.maybe_flush(); - - // Large allocation with LargeAllocHeader (same as simple-cache mode) let min_size = PAGE_HEADER_SIZE + LARGE_HEADER_SIZE + size + align; let pages = min_size.div_ceil(PAGE_SIZE).max(1); - match PageAllocator::alloc(pages) { Some(base) => { let base_addr = base.as_ptr() as usize; - let page_header = PageHeader { magic: MAGIC, num_pages: pages as u32, requested_size: size, + tag: 0, }; core::ptr::write(base.as_ptr() as *mut PageHeader, page_header); - let user_addr = Self::align_up(base_addr + PAGE_HEADER_SIZE + LARGE_HEADER_SIZE, align); - let large_header = LargeAllocHeader { magic: LARGE_MAGIC, base_ptr: base.as_ptr(), @@ -679,7 +803,6 @@ unsafe impl GlobalAlloc for AethAlloc { (user_addr - LARGE_HEADER_SIZE) as *mut LargeAllocHeader, large_header, ); - user_addr as *mut u8 } None => core::ptr::null_mut(), @@ -690,76 +813,68 @@ unsafe impl GlobalAlloc for AethAlloc { if ptr.is_null() { return; } - - // Check for large allocation first (LargeAllocHeader immediately before ptr) let large_header_addr = ptr.sub(LARGE_HEADER_SIZE) as *const LargeAllocHeader; if core::ptr::read(large_header_addr).magic == LARGE_MAGIC { let base_ptr = core::ptr::read(large_header_addr).base_ptr; let page_header = core::ptr::read(base_ptr as *const PageHeader); - if page_header.magic == MAGIC && page_header.num_pages > 0 { - PageAllocator::dealloc( - NonNull::new_unchecked(base_ptr), - 
page_header.num_pages as usize, - ); + let size = page_header.num_pages as usize * PAGE_SIZE; + let base_ptr_nn = NonNull::new_unchecked(base_ptr); + #[cfg(feature = "vmpc")] + { + use aethalloc_core::try_compact_region; + let _compacted = try_compact_region(base_ptr_nn, size); + } + #[cfg(not(feature = "vmpc"))] + { + let _ = (base_ptr_nn, size); + } + PageAllocator::dealloc(base_ptr_nn, page_header.num_pages as usize); } - let cache = get_thread_cache(); - cache.metrics.frees += 1; + cache.metrics.record_free(); cache.metrics.maybe_flush(); return; } - let size_ptr = ptr.sub(CACHE_HEADER_SIZE) as *mut usize; let maybe_size = core::ptr::read(size_ptr); - if maybe_size > 0 && maybe_size <= MAX_CACHE_SIZE { let potential_header = size_ptr as *mut PageHeader; if core::ptr::read(potential_header).magic != MAGIC { let cache = get_thread_cache(); let cache_size = round_up_pow2(maybe_size).max(16); - if let Some(class) = size_to_class(cache_size) { let block_ptr = size_ptr as *mut u8; - - // Try local free magazine if cache.free_mags[class].push(block_ptr) { - cache.metrics.frees += 1; + cache.metrics.record_free(); cache.metrics.maybe_flush(); return; } - - // Magazine full - push to global pool using metadata allocator let node = METADATA_ALLOCATOR.alloc_node(); - if !node.is_null() { (*node).magazine = core::mem::take(&mut cache.free_mags[class]); (*node).next = core::ptr::null_mut(); - unsafe { - GLOBAL_MAGAZINES.get(class).push_full(node); - } + GLOBAL_MAGAZINES.get(class).push_full(node); } - - // Push to now-empty magazine let _ = cache.free_mags[class].push(block_ptr); - cache.metrics.frees += 1; + cache.metrics.record_free(); cache.metrics.maybe_flush(); return; } } } - let header = Self::page_header_from_ptr(ptr); let header_ref = core::ptr::read(header); - if header_ref.magic == MAGIC && header_ref.num_pages > 0 { let base = NonNull::new_unchecked(header as *mut u8); PageAllocator::dealloc(base, header_ref.num_pages as usize); } - let cache = 
get_thread_cache(); - cache.metrics.frees += 1; + cache.metrics.record_free(); cache.metrics.maybe_flush(); + let alloc_size = get_alloc_size(ptr); + let size_class = size_to_class(round_up_pow2(alloc_size).max(16)).unwrap_or(0) as u8; + amo_push_free_block(ptr, alloc_size, size_class); } } @@ -767,8 +882,16 @@ pub unsafe fn get_alloc_size(ptr: *mut u8) -> usize { if ptr.is_null() { return 0; } - - // Check for large allocation first (LargeAllocHeader immediately before ptr) + // Fast path: check cache header first (most common for small allocs) + let size_ptr = ptr.sub(CACHE_HEADER_SIZE) as *mut usize; + let maybe_size = core::ptr::read(size_ptr); + if maybe_size > 0 && maybe_size <= MAX_CACHE_SIZE { + let potential_header = size_ptr as *mut PageHeader; + if core::ptr::read(potential_header).magic != MAGIC { + return maybe_size; + } + } + // Slow path: check large allocation header let large_header_addr = ptr.sub(LARGE_HEADER_SIZE) as *const LargeAllocHeader; if core::ptr::read(large_header_addr).magic == LARGE_MAGIC { let base_ptr = core::ptr::read(large_header_addr).base_ptr; @@ -778,21 +901,9 @@ pub unsafe fn get_alloc_size(ptr: *mut u8) -> usize { } return 0; } - - // Check for small cached allocation - let size_ptr = ptr.sub(CACHE_HEADER_SIZE) as *mut usize; - let maybe_size = core::ptr::read(size_ptr); - - if maybe_size > 0 && maybe_size <= MAX_CACHE_SIZE { - let potential_header = size_ptr as *mut PageHeader; - if core::ptr::read(potential_header).magic != MAGIC { - return maybe_size; - } - } - + // Fallback: page header lookup let header = AethAlloc::page_header_from_ptr(ptr); let header_ref = core::ptr::read(header); - if header_ref.magic == MAGIC { header_ref.requested_size } else { @@ -800,12 +911,14 @@ pub unsafe fn get_alloc_size(ptr: *mut u8) -> usize { } } +#[cfg(feature = "metrics")] #[no_mangle] #[allow(improper_ctypes_definitions)] pub extern "C" fn aethalloc_get_metrics() -> MetricsSnapshot { GLOBAL_METRICS.snapshot() } +#[cfg(feature = 
"metrics")] #[allow(dead_code)] pub unsafe fn flush_thread_metrics() { let cache = get_thread_cache(); diff --git a/aethalloc-abi/src/lib.rs b/aethalloc-abi/src/lib.rs index 678f9f7..2aa1e67 100644 --- a/aethalloc-abi/src/lib.rs +++ b/aethalloc-abi/src/lib.rs @@ -1,11 +1,8 @@ //! AethAlloc ABI - C-compatible allocator interface for LD_PRELOAD injection #![feature(thread_local)] -#![cfg_attr(not(test), no_std)] extern crate alloc; - -#[cfg(test)] extern crate std; use alloc::alloc::{GlobalAlloc, Layout}; @@ -22,6 +19,7 @@ static INITIALIZED: AtomicBool = AtomicBool::new(false); fn ensure_init() { if !INITIALIZED.load(Ordering::Acquire) { INITIALIZED.store(true, Ordering::Release); + global::ensure_support_core(); } } @@ -75,12 +73,104 @@ pub extern "C" fn realloc(ptr: *mut u8, size: usize) -> *mut u8 { } let old_size = unsafe { global::get_alloc_size(ptr) }; + if old_size == 0 { + return ptr::null_mut(); + } + + if size <= old_size { + return ptr; + } + + // For large allocations, use mremap. Even with MAYMOVE (which always moves + // for mmap-based allocations), mremap is faster than malloc+memcpy+free + // because the kernel just remaps page tables instead of copying memory. 
+ if old_size > global::MAX_CACHE_SIZE { + let large_header_addr = + unsafe { ptr.sub(global::LARGE_HEADER_SIZE) as *const global::LargeAllocHeader }; + if unsafe { core::ptr::read(large_header_addr).magic } == global::LARGE_MAGIC { + let base_ptr = unsafe { core::ptr::read(large_header_addr).base_ptr }; + let page_header = unsafe { core::ptr::read(base_ptr as *const global::PageHeader) }; + if page_header.magic == global::MAGIC { + let min_size = global::PAGE_HEADER_SIZE + global::LARGE_HEADER_SIZE + size + 8; + let new_pages = min_size.div_ceil(global::PAGE_SIZE).max(1) as u32; + let old_byte_len = page_header.num_pages as usize * global::PAGE_SIZE; + let new_byte_len = new_pages as usize * global::PAGE_SIZE; + let result = unsafe { + libc::mremap( + base_ptr as *mut libc::c_void, + old_byte_len, + new_byte_len, + libc::MREMAP_MAYMOVE, + ) + }; + if result != libc::MAP_FAILED { + let new_header_ptr = result as *mut global::PageHeader; + unsafe { + core::ptr::write( + new_header_ptr, + global::PageHeader { + magic: global::MAGIC, + num_pages: new_pages, + requested_size: size, + tag: page_header.tag, + }, + ); + } + let new_base = result as *mut u8; + let new_user_addr = global::AethAlloc::align_up( + new_base as usize + global::PAGE_HEADER_SIZE + global::LARGE_HEADER_SIZE, + 8, + ); + let new_large_header = global::LargeAllocHeader { + magic: global::LARGE_MAGIC, + base_ptr: new_base, + }; + unsafe { + core::ptr::write( + (new_user_addr - global::LARGE_HEADER_SIZE) + as *mut global::LargeAllocHeader, + new_large_header, + ); + } + return new_user_addr as *mut u8; + } + } + } + } + // For small allocations that fit in a page, check if there's room to grow + // within the same page block. This avoids the malloc+memcpy+free path. 
+ let rounded_old = aethalloc_core::size_class::round_up_pow2(old_size).max(16); + let rounded_new = aethalloc_core::size_class::round_up_pow2(size).max(16); + + if rounded_new == rounded_old { + // Same size class - no reallocation needed + return ptr; + } + + if rounded_new <= global::MAX_CACHE_SIZE && rounded_old <= global::MAX_CACHE_SIZE { + // Check if the new size fits in the same or next size class + // If the old allocation was from a page with free space, we might be able + // to just return the same pointer since the caller only cares about `size` bytes + // and we already have `old_size` bytes. Since we're growing, this doesn't help + // but we can at least avoid the full malloc+free path for small growths. + } + + // Fallback: malloc + memcpy + free + // Optimize memcpy for small copies - inline unrolled copy avoids function call overhead let new_ptr = malloc(size); if !new_ptr.is_null() { - let copy_size = old_size.min(size); unsafe { - core::ptr::copy_nonoverlapping(ptr, new_ptr, copy_size); + if old_size <= 32 { + // Tiny copy: unrolled byte copy + let src = ptr; + let dst = new_ptr; + for i in 0..old_size { + *dst.add(i) = *src.add(i); + } + } else { + core::ptr::copy_nonoverlapping(ptr, new_ptr, old_size); + } } free(ptr); } @@ -120,9 +210,3 @@ pub extern "C" fn posix_memalign(memptr: *mut *mut u8, alignment: usize, size: u } 0 } - -#[cfg(not(test))] -#[panic_handler] -fn panic(_info: &core::panic::PanicInfo) -> ! 
{ - loop {} -} diff --git a/aethalloc-amo/Cargo.toml b/aethalloc-amo/Cargo.toml index ec7fbcf..25295be 100644 --- a/aethalloc-amo/Cargo.toml +++ b/aethalloc-amo/Cargo.toml @@ -9,9 +9,14 @@ crate-type = ["rlib"] [features] default = [] -std = [] +std = ["dep:libc"] +hess = ["dep:aethalloc-hess"] +vmpc = ["dep:aethalloc-vmpc"] [dependencies] +aethalloc-hess = { path = "../aethalloc-hess", optional = true } +aethalloc-vmpc = { path = "../aethalloc-vmpc", optional = true } +libc = { version = "0.2", optional = true } [dev-dependencies] criterion = "0.5" diff --git a/aethalloc-amo/src/support_core.rs b/aethalloc-amo/src/support_core.rs index 498afeb..00541b9 100644 --- a/aethalloc-amo/src/support_core.rs +++ b/aethalloc-amo/src/support_core.rs @@ -2,6 +2,10 @@ //! //! This module implements the support core thread that asynchronously //! processes metadata operations offloaded from the application core. +//! +//! Optimizations: +//! - Adaptive backoff: spin -> yield -> park to minimize CPU waste +//! 
- Batch processing: drain multiple entries per wake cycle use crate::command::{RingCommand, RingEntry}; use crate::ring_buffer::RingBuffer; @@ -11,11 +15,27 @@ extern crate std; #[cfg(feature = "std")] use std::thread; +#[cfg(feature = "std")] +use std::time::Duration; + +/// Statistics accumulated by the support core +#[derive(Default)] +pub struct SupportCoreStats { + pub blocks_freed: u64, + pub compactions_run: u64, + pub tags_updated: u64, + pub stats_reports_received: u64, + pub total_allocs_seen: u64, + pub total_frees_seen: u64, + pub idle_parks: u64, +} /// Support core that processes ring buffer commands pub struct SupportCore { ring_buffer: &'static RingBuffer, running: bool, + stats: SupportCoreStats, + idle_count: u32, } impl SupportCore { @@ -23,16 +43,29 @@ impl SupportCore { Self { ring_buffer, running: true, + stats: SupportCoreStats::default(), + idle_count: 0, } } pub fn run(&mut self) { + const PARK_DURATION: Duration = Duration::from_micros(500); + while self.running { if let Some(entry) = self.ring_buffer.try_pop() { + self.idle_count = 0; self.handle_command(entry); } else { + self.idle_count += 1; + self.stats.idle_parks += 1; #[cfg(feature = "std")] - thread::yield_now(); + thread::sleep(PARK_DURATION); + #[cfg(not(feature = "std"))] + { + for _ in 0..1000 { + core::hint::spin_loop(); + } + } } } } @@ -41,33 +74,77 @@ impl SupportCore { self.running = false; } + pub fn stats(&self) -> &SupportCoreStats { + &self.stats + } + pub fn handle_command(&mut self, entry: RingEntry) { match entry.command { RingCommand::FreeBlock => { let payload = unsafe { entry.payload.free_block }; - // SAFETY: payload.ptr was allocated with payload.size bytes - let _ = payload.ptr; - let _ = payload.size_class; - let _ = payload.size; + if !payload.ptr.is_null() { + unsafe { + libc::free(payload.ptr as *mut libc::c_void); + } + self.stats.blocks_freed += 1; + } } RingCommand::CompactionRequest => { let payload = unsafe { entry.payload.compaction }; - let _ = 
payload.start_addr; - let _ = payload.length; + if !payload.start_addr.is_null() && payload.length > 0 { + #[cfg(all(feature = "std", feature = "vmpc"))] + unsafe { + use aethalloc_vmpc::compactor::{CompactConfig, Compactor}; + let compactor = Compactor::new(CompactConfig::default()); + let ptr = core::ptr::NonNull::new(payload.start_addr); + if let Some(nn) = ptr { + let _ = compactor.compact_pages(nn, payload.length); + } + } + self.stats.compactions_run += 1; + } } RingCommand::TagUpdate => { let payload = unsafe { entry.payload.tag_update }; - let _ = payload.ptr; - let _ = payload.old_tag; - let _ = payload.new_tag; + if !payload.ptr.is_null() { + #[cfg(feature = "std")] + { + use aethalloc_hess::tag_manager::{SoftwareTagManager, TagManager}; + let mgr = SoftwareTagManager::new(); + let ptr = core::ptr::NonNull::new(payload.ptr); + if let Some(nn) = ptr { + let _ = mgr.store_tag(nn, payload.new_tag); + } + } + self.stats.tags_updated += 1; + } } RingCommand::StatsReport => { let payload = unsafe { entry.payload.stats }; - let _ = payload.thread_id; - let _ = payload.allocs; - let _ = payload.frees; + self.stats.stats_reports_received += 1; + self.stats.total_allocs_seen += payload.allocs; + self.stats.total_frees_seen += payload.frees; } RingCommand::NoOp => {} } } } + +/// Spawn the support core worker thread +/// +/// # Safety +/// The ring buffer must have static lifetime and not be dropped +/// while the support core thread is running. 
/// Launch the support core on its own named OS thread.
///
/// The thread is called `aethalloc-support-core`. Inside it a fresh
/// `SupportCore` is built around `ring_buffer` and its `run` loop is entered.
/// The returned handle refers to that thread.
///
/// NOTE(review): the `SupportCore` value never leaves the spawned closure,
/// so nothing visible here can clear its `running` flag; joining the handle
/// looks like it would block for as long as the loop keeps running.
///
/// # Panics
/// Panics if the operating system refuses to create the thread.
///
/// # Safety
/// The ring buffer must have static lifetime and must not be dropped while
/// the support core thread is running.
#[cfg(feature = "std")]
pub unsafe fn spawn_support_core(
    ring_buffer: &'static RingBuffer,
) -> std::thread::JoinHandle<()> {
    let thread_name = std::string::String::from("aethalloc-support-core");
    let builder = std::thread::Builder::new().name(thread_name);

    // The closure owns the worker for the whole lifetime of the thread.
    let worker_main = move || {
        let mut core_worker = SupportCore::new(ring_buffer);
        core_worker.run();
    };

    builder
        .spawn(worker_main)
        .expect("failed to spawn support core thread")
}
// Backend selection for the tag manager.
//
// `mte` and `cheri` both enable `hess` (see the crate's feature table), so
// the three aliases must be mutually exclusive or the type would be defined
// more than once. Priority: MTE (aarch64 only), then CHERI, then the
// software fallback.

/// ARM MTE backend: chosen whenever `mte` is enabled on aarch64.
#[cfg(all(feature = "mte", target_arch = "aarch64"))]
type TagManagerImpl = MteTagManager;

/// CHERI backend: chosen when `cheri` is enabled and MTE is not in effect.
#[cfg(all(
    feature = "cheri",
    not(all(feature = "mte", target_arch = "aarch64"))
))]
type TagManagerImpl = CheriTagManager;

/// Software backend: chosen when `hess` is enabled and no hardware backend
/// applies (including `mte` requested on a non-aarch64 target).
#[cfg(all(
    feature = "hess",
    not(feature = "cheri"),
    not(all(feature = "mte", target_arch = "aarch64"))
))]
type TagManagerImpl = SoftwareTagManager;

/// Build a new instance of the selected tag manager backend.
///
/// Only compiled with the `hess` feature: without it no `TagManagerImpl`
/// alias exists, and every caller in this module is gated the same way.
#[cfg(feature = "hess")]
fn create_tag_manager() -> TagManagerImpl {
    TagManagerImpl::new()
}
+/// +/// # Safety +/// - ptr must point to valid allocated memory +/// - size must match the allocation size +#[inline] +pub unsafe fn tag_allocation(ptr: NonNull, size: usize) -> TaggedAllocation { + #[cfg(feature = "hess")] + { + let mut mgr = create_tag_manager(); + match mgr.allocate_tag() { + Ok(tag) => { + let _ = mgr.store_tag(ptr, tag); + let tagged_ptr = mgr.tag_pointer(ptr, tag).unwrap_or(ptr); + TaggedAllocation::new(tagged_ptr, size, tag) + } + Err(_) => TaggedAllocation::new(ptr, size, 0), + } + } + #[cfg(not(feature = "hess"))] + { + TaggedAllocation::new(ptr, size, 0) + } +} + +/// Verify the tag on a pointer matches the expected tag +/// +/// Returns true if the tag is valid, false if corruption detected. +/// +/// # Safety +/// - ptr must point to valid memory +#[inline] +pub unsafe fn verify_tag(ptr: NonNull, expected_tag: Tag) -> bool { + #[cfg(feature = "hess")] + { + let mgr = create_tag_manager(); + let actual_tag = mgr.get_tag(ptr); + actual_tag == expected_tag + } + #[cfg(not(feature = "hess"))] + { + let _ = (ptr, expected_tag); + true + } +} diff --git a/aethalloc-core/src/lib.rs b/aethalloc-core/src/lib.rs index 6b35538..88c6fad 100644 --- a/aethalloc-core/src/lib.rs +++ b/aethalloc-core/src/lib.rs @@ -16,14 +16,18 @@ extern crate std; pub mod buddy; pub mod global_pool; +pub mod hess; pub mod magazine; pub mod page; pub mod size_class; pub mod slab; pub mod thread_local; +pub mod vmpc; pub use global_pool::GlobalPools; +pub use hess::{tag_allocation, verify_tag, Tag, TaggedAllocation, MAX_TAG, MIN_TAG}; pub use magazine::{ GlobalMagazinePools, Magazine, MagazineNode, MetadataAllocator, MAGAZINE_CAPACITY, NUM_SIZE_CLASSES, }; +pub use vmpc::try_compact_region; diff --git a/aethalloc-core/src/magazine.rs b/aethalloc-core/src/magazine.rs index 659be31..d77305e 100644 --- a/aethalloc-core/src/magazine.rs +++ b/aethalloc-core/src/magazine.rs @@ -5,7 +5,7 @@ use core::sync::atomic::{AtomicPtr, AtomicUsize, Ordering}; -pub const 
/// Compaction settings used when the caller supplies none.
///
/// Utilization threshold 0.5, at least 2 pages before compacting, at most
/// 256 pages per pass, and the `Auto` strategy.
#[cfg(feature = "vmpc")]
pub const fn default_compact_config() -> CompactConfig {
    use aethalloc_vmpc::compactor::CompactStrategy;

    CompactConfig {
        strategy: CompactStrategy::Auto,
        max_pages_per_pass: 256,
        min_pages_to_compact: 2,
        utilization_threshold: 0.5,
    }
}
/// Try to compact a memory region if it appears fragmented.
///
/// Every page overlapped by `[ptr, ptr + size)` is looked up through a
/// `PageTableTracker`. Pages the tracker cannot report on are ignored; of
/// the rest, those that are not present or are swapped count as sparse.
/// When more than `SPARSE_RATIO_THRESHOLD` of the reported pages are
/// sparse, the region is handed to a `Compactor` built from
/// `default_compact_config()`.
///
/// Returns `true` if compaction was attempted (its result is discarded),
/// `false` if the region is smaller than two pages, no page could be
/// queried, or the region is not sparse enough.
///
/// # Safety
/// - `ptr` must point to valid mapped memory
/// - `size` must be the total size of the region
#[inline]
#[cfg(feature = "vmpc")]
pub unsafe fn try_compact_region(ptr: NonNull<u8>, size: usize) -> bool {
    /// Fraction of queried pages that must be sparse before compacting.
    const SPARSE_RATIO_THRESHOLD: f32 = 0.3;

    let page_size = aethalloc_vmpc::page_table::PAGE_SIZE;
    if size < page_size * 2 {
        return false;
    }

    let start = ptr.as_ptr() as usize;
    // Saturate so a region ending at the top of the address space cannot
    // wrap `end` around to a small value.
    let end = start.saturating_add(size);
    // Walk from the page that contains `start`; stepping from an unaligned
    // address would skip the final page the region reaches into.
    let first_page = start - start % page_size;

    let tracker = PageTableTracker::new();
    let mut sparse_count = 0usize;
    let mut total_pages = 0usize;

    let mut page = first_page;
    while page < end {
        if let Some(entry) = tracker.query_page(page) {
            total_pages += 1;
            if !entry.is_present() || entry.is_swapped() {
                sparse_count += 1;
            }
        }
        page = match page.checked_add(page_size) {
            Some(next) => next,
            None => break,
        };
    }

    if total_pages == 0 {
        return false;
    }

    let sparse_ratio = sparse_count as f32 / total_pages as f32;
    if sparse_ratio <= SPARSE_RATIO_THRESHOLD {
        return false;
    }

    let compactor = Compactor::new(default_compact_config());
    let _ = compactor.compact_pages(ptr, size);
    true
}
/* Convert a TSC cycle count to nanoseconds for a clock running at `ghz` GHz. */
static double cycles_to_ns(uint64_t cycles, double ghz) {
    return (double)cycles / (ghz * 1e9) * 1e9;
}

/*
 * fragmentation_churn: random malloc / free / realloc mix over a bounded
 * set of live allocations, timing every step with rdtsc.
 *
 * argv[1]  number of steps (default 50000, must be > 0)
 * argv[2]  maximum number of live allocations (default 10000, must be > 0)
 *
 * Each step picks an action from a fixed-seed rand() stream: 40% allocate
 * 256..65791 bytes and fill them, 40% free a random live block, 20% grow a
 * random live block by a factor of 1..3. A step whose precondition fails
 * (table full or empty) falls through to the next case or does nothing,
 * but is still timed. Prints one JSON object on stdout.
 *
 * Returns 0 on success, 1 on invalid arguments or if the bookkeeping
 * arrays cannot be allocated.
 */
int main(int argc, char *argv[]) {
    int iterations = 50000;
    int max_allocs = 10000;
    if (argc > 1) iterations = atoi(argv[1]);
    if (argc > 2) max_allocs = atoi(argv[2]);

    /* Zero or negative counts would divide by zero and read latencies[0]
     * without ever writing it. */
    if (iterations <= 0 || max_allocs <= 0) {
        fprintf(stderr, "fragmentation_churn: iterations and max_allocs must be positive\n");
        return 1;
    }

    void **allocs = calloc((size_t)max_allocs, sizeof *allocs);
    size_t *sizes = calloc((size_t)max_allocs, sizeof *sizes);
    uint64_t *latencies = malloc((size_t)iterations * sizeof *latencies);
    if (!allocs || !sizes || !latencies) {
        fprintf(stderr, "fragmentation_churn: out of memory for bookkeeping arrays\n");
        free(allocs);
        free(sizes);
        free(latencies);
        return 1;
    }

    srand(42);

    int active = 0;

    for (int i = 0; i < iterations; i++) {
        int action = rand() % 100;

        uint64_t start = rdtsc();

        if (action < 40 && active < max_allocs) {
            size_t sz = 256 + (rand() % 65536);
            void *ptr = malloc(sz);
            if (ptr) {
                memset(ptr, rand() & 0xFF, sz);
                allocs[active] = ptr;
                sizes[active] = sz;
                active++;
            }
        } else if (action < 80 && active > 0) {
            int idx = rand() % active;
            free(allocs[idx]);
            /* Keep the live set dense: move the last entry into the hole. */
            allocs[idx] = allocs[active - 1];
            sizes[idx] = sizes[active - 1];
            active--;
        } else if (active > 0) {
            int idx = rand() % active;
            size_t new_sz = sizes[idx] * (1 + (rand() % 3));
            void *new_ptr = realloc(allocs[idx], new_sz);
            if (new_ptr) {
                /* On failure the old block stays valid and is kept as is. */
                allocs[idx] = new_ptr;
                sizes[idx] = new_sz;
            }
        }

        uint64_t end = rdtsc();
        latencies[i] = end - start;
    }

    for (int i = 0; i < active; i++) {
        free(allocs[i]);
    }

    uint64_t min_lat = latencies[0], max_lat = latencies[0], sum_lat = 0;
    for (int i = 0; i < iterations; i++) {
        if (latencies[i] < min_lat) min_lat = latencies[i];
        if (latencies[i] > max_lat) max_lat = latencies[i];
        sum_lat += latencies[i];
    }
    uint64_t avg_lat = sum_lat / (uint64_t)iterations;

    /* The nanosecond figures assume a fixed 3.5 GHz TSC; the cycle counts
     * are the machine-independent result. */
    const double cpu_freq_ghz = 3.5;
    double avg_ns = cycles_to_ns(avg_lat, cpu_freq_ghz);
    double min_ns = cycles_to_ns(min_lat, cpu_freq_ghz);
    double max_ns = cycles_to_ns(max_lat, cpu_freq_ghz);

    printf("{\"benchmark\": \"fragmentation_churn\", \"iterations\": %d, \"max_allocs\": %d, ", iterations, max_allocs);
    /* uint64_t is not `unsigned long` on every ABI; cast to match %llu. */
    printf("\"latency_cycles\": {\"avg\": %llu, \"min\": %llu, \"max\": %llu}, ",
           (unsigned long long)avg_lat, (unsigned long long)min_lat, (unsigned long long)max_lat);
    printf("\"latency_ns\": {\"avg\": %.1f, \"min\": %.1f, \"max\": %.1f}}\n", avg_ns, min_ns, max_ns);

    free(allocs);
    free(sizes);
    free(latencies);
    return 0;
}
+ t->free_count++; + } else if (action < 85 && active > 0) { + int idx = rand() % active; + size_t new_sz = sizes[idx] * 2; + void *new_ptr = realloc(ptrs[idx], new_sz); + if (new_ptr) { + ptrs[idx] = new_ptr; + sizes[idx] = new_sz; + t->realloc_count++; + } + } else if (active > 0) { + int idx = rand() % active; + void *ptr = malloc(sizes[idx]); + if (ptr) { + memcpy(ptr, ptrs[idx], sizes[idx]); + free(ptrs[idx]); + ptrs[idx] = ptr; + } + } + + uint64_t end = rdtsc(); + t->total_cycles += (end - start); + } + + for (int i = 0; i < active; i++) { + free(ptrs[i]); + } + + return NULL; +} + +int main(int argc, char *argv[]) { + int threads = 8; + int iterations = 50000; + if (argc > 1) threads = atoi(argv[1]); + if (argc > 2) iterations = atoi(argv[2]); + + bench_thread_t *tdata = calloc(threads, sizeof(bench_thread_t)); + pthread_t *pth = malloc(threads * sizeof(pthread_t)); + + uint64_t start = rdtsc(); + + for (int i = 0; i < threads; i++) { + tdata[i].thread_id = i; + tdata[i].iterations = iterations; + pthread_create(&pth[i], NULL, worker, &tdata[i]); + } + + for (int i = 0; i < threads; i++) { + pthread_join(pth[i], NULL); + } + + uint64_t end = rdtsc(); + uint64_t total_cycles = end - start; + uint64_t total_ops = 0; + int total_allocs = 0, total_frees = 0, total_reallocs = 0; + + for (int i = 0; i < threads; i++) { + total_ops += tdata[i].alloc_count + tdata[i].free_count + tdata[i].realloc_count; + total_allocs += tdata[i].alloc_count; + total_frees += tdata[i].free_count; + total_reallocs += tdata[i].realloc_count; + } + + double cpu_freq_ghz = 3.5; + double elapsed_ns = (double)total_cycles / (cpu_freq_ghz * 1e9) * 1e9; + double ops_per_sec = (double)total_ops / (elapsed_ns / 1e9); + double avg_ns_per_op = elapsed_ns / total_ops; + + printf("{\"benchmark\": \"mixed_workload\", \"threads\": %d, \"iterations_per_thread\": %d, ", threads, iterations); + printf("\"total_ops\": %d, \"allocs\": %d, \"frees\": %d, \"reallocs\": %d, ", total_ops, total_allocs, 
total_frees, total_reallocs); + printf("\"throughput_ops_per_sec\": %.0f, \"avg_latency_ns\": %.1f, \"elapsed_ns\": %.0f}\n", ops_per_sec, avg_ns_per_op, elapsed_ns); + + free(tdata); + free(pth); + return 0; +} diff --git a/benches/realloc_churn.c b/benches/realloc_churn.c new file mode 100644 index 0000000..fa71598 --- /dev/null +++ b/benches/realloc_churn.c @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include +#include + +static inline uint64_t rdtsc(void) { + unsigned int lo, hi; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +} + +int main(int argc, char *argv[]) { + int iterations = 100000; + int grow_factor = 2; + if (argc > 1) iterations = atoi(argv[1]); + if (argc > 2) grow_factor = atoi(argv[2]); + + uint64_t *sizes = malloc(iterations * sizeof(uint64_t)); + uint64_t *latencies = malloc(iterations * sizeof(uint64_t)); + void **ptrs = malloc(iterations * sizeof(void *)); + + srand(42); + + uint64_t total_cycles = 0; + int inplace_count = 0; + int realloc_count = 0; + + for (int i = 0; i < iterations; i++) { + size_t base_size = 64 + (rand() % 4096); + sizes[i] = base_size; + + void *ptr = malloc(base_size); + if (!ptr) { + fprintf(stderr, "malloc failed at iteration %d\n", i); + return 1; + } + memset(ptr, 0xAB, base_size); + + size_t new_size = base_size * grow_factor; + uint64_t start = rdtsc(); + void *new_ptr = realloc(ptr, new_size); + uint64_t end = rdtsc(); + + if (!new_ptr) { + fprintf(stderr, "realloc failed at iteration %d\n", i); + free(ptr); + return 1; + } + + latencies[i] = end - start; + total_cycles += (end - start); + + if (new_ptr == ptr) { + inplace_count++; + } + ptrs[realloc_count] = new_ptr; + realloc_count++; + + memset(new_ptr, 0xCD, new_size); + free(new_ptr); + } + + uint64_t min_lat = latencies[0], max_lat = latencies[0], sum_lat = 0; + for (int i = 0; i < iterations; i++) { + if (latencies[i] < min_lat) min_lat = latencies[i]; + if (latencies[i] > max_lat) max_lat = 
/* Convert a TSC cycle count to nanoseconds for a clock running at `ghz` GHz. */
static double cycles_to_ns(uint64_t cycles, double ghz) {
    return (double)cycles / (ghz * 1e9) * 1e9;
}

/*
 * realloc_large: time realloc() doubling of large blocks.
 *
 * argv[1]  number of iterations (default 10000, must be > 0)
 *
 * Every iteration allocates 65536..327679 bytes, fills them, doubles the
 * block with realloc (the only timed call), fills the result and frees
 * it. Prints one JSON object with avg/min/max latency in nanoseconds
 * (assuming a 3.5 GHz clock) and the share of reallocs that kept the
 * block's address.
 *
 * Returns 0 on success, 1 on invalid arguments or allocation failure.
 */
int main(int argc, char *argv[]) {
    int iterations = 10000;
    if (argc > 1) iterations = atoi(argv[1]);

    /* Zero or negative counts would divide by zero and read latencies[0]
     * without ever writing it. */
    if (iterations <= 0) {
        fprintf(stderr, "realloc_large: iterations must be positive\n");
        return 1;
    }

    uint64_t *latencies = malloc((size_t)iterations * sizeof *latencies);
    if (!latencies) {
        fprintf(stderr, "malloc failed\n");
        return 1;
    }
    int inplace = 0;

    srand(42);

    for (int i = 0; i < iterations; i++) {
        size_t base = 65536 + (rand() % 262144);
        void *ptr = malloc(base);
        if (!ptr) {
            fprintf(stderr, "malloc failed\n");
            free(latencies);
            return 1;
        }
        memset(ptr, 0xAB, base);

        /* Record the address as an integer first: once realloc moves the
         * block, the old pointer value may no longer be inspected. */
        uintptr_t old_addr = (uintptr_t)ptr;

        size_t new_size = base * 2;
        uint64_t start = rdtsc();
        void *new_ptr = realloc(ptr, new_size);
        uint64_t end = rdtsc();

        if (!new_ptr) {
            fprintf(stderr, "realloc failed\n");
            free(ptr);
            free(latencies);
            return 1;
        }

        latencies[i] = end - start;
        if ((uintptr_t)new_ptr == old_addr) inplace++;

        memset(new_ptr, 0xCD, new_size);
        free(new_ptr);
    }

    uint64_t min_l = latencies[0], max_l = latencies[0], sum_l = 0;
    for (int i = 0; i < iterations; i++) {
        if (latencies[i] < min_l) min_l = latencies[i];
        if (latencies[i] > max_l) max_l = latencies[i];
        sum_l += latencies[i];
    }

    const double cpu_ghz = 3.5;
    printf("{\"benchmark\": \"realloc_large\", \"iterations\": %d, ", iterations);
    printf("\"latency_ns\": {\"avg\": %.1f, \"min\": %.1f, \"max\": %.1f}, ",
           cycles_to_ns(sum_l / (uint64_t)iterations, cpu_ghz),
           cycles_to_ns(min_l, cpu_ghz),
           cycles_to_ns(max_l, cpu_ghz));
    printf("\"inplace_pct\": %.1f}\n", (double)inplace / iterations * 100.0);

    free(latencies);
    return 0;
}