From f819af8acbbcc08d18cf7de37fb85f84b4581a07 Mon Sep 17 00:00:00 2001 From: Josh Ventura Date: Thu, 14 May 2026 11:30:08 -0400 Subject: [PATCH 1/8] fix(docker): make quickstart image build on arm64 and pick up overlay-ports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two issues blocked `./scripts/quickstart.sh up` on macOS arm64 (colima / orbstack) and presumably arm64 Linux too: 1. `vcpkg-configuration.json` lists `overlay-ports/` as an overlay ports source (added in #78 for the cityhash port), but the Dockerfile never copied that directory into the build context. The first vcpkg invocation aborted with "Overlay path /build/pg_stat_ch/./overlay-ports must be an existing directory." 2. The cmake step hardcoded `VCPKG_TARGET_TRIPLET=x64-linux-pic`, which fails compiler detection inside an arm64 container (vcpkg tries to cross-target x64 from an aarch64 host without a cross toolchain present). Switch to `uname -m` detection so the same Dockerfile builds against the host architecture on both x64 CI runners and arm64 dev boxes. No BuildKit dependency — using `uname -m` instead of `$TARGETARCH` keeps it working on legacy docker builds too. Co-Authored-By: Claude Opus 4.7 (1M context) --- docker/postgres-ext.Dockerfile | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docker/postgres-ext.Dockerfile b/docker/postgres-ext.Dockerfile index 6dac61e..2112f5c 100644 --- a/docker/postgres-ext.Dockerfile +++ b/docker/postgres-ext.Dockerfile @@ -34,6 +34,7 @@ WORKDIR /build/pg_stat_ch # Copy dependency manifests first for layer caching COPY vcpkg.json vcpkg-configuration.json ./ COPY triplets/ triplets/ +COPY overlay-ports/ overlay-ports/ COPY CMakeLists.txt ./ COPY cmake/ cmake/ COPY include/ include/ @@ -41,11 +42,15 @@ COPY src/ src/ COPY sql/ sql/ COPY pg_stat_ch.control ./ -RUN cmake -B build -G Ninja \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_TOOLCHAIN_FILE=$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake \ - -DVCPKG_TARGET_TRIPLET=x64-linux-pic \ - -DVCPKG_OVERLAY_TRIPLETS=/build/pg_stat_ch/triplets \ +# Pick the vcpkg triplet that matches the host architecture so the same +# Dockerfile builds on x64 CI runners and on arm64 dev boxes (Apple Silicon +# under colima/orbstack/etc.). +RUN TRIPLET="$([ "$(uname -m)" = "aarch64" ] && echo arm64-linux-pic || echo x64-linux-pic)" \ + && cmake -B build -G Ninja \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_TOOLCHAIN_FILE=$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake \ + -DVCPKG_TARGET_TRIPLET="$TRIPLET" \ + -DVCPKG_OVERLAY_TRIPLETS=/build/pg_stat_ch/triplets \ && cmake --build build --parallel $(nproc) FROM postgres:18-bookworm From ea66cdee16559fb774b53620ef4820aeafb09476 Mon Sep 17 00:00:00 2001 From: Josh Ventura Date: Thu, 14 May 2026 11:30:25 -0400 Subject: [PATCH 2/8] test: local validation script + skill for parent_query_id Adds `scripts/quickstart-validate-parent-query-id.sh`, which drives the quickstart Docker stack end-to-end against the same three invariants the TAP test in `t/028_parent_query_id.pl` asserts: 1. Top-level queries report parent_query_id = 0. 2. Nested SPI queries report parent_query_id = outer's query_id. 3. Error events captured inside a plpgsql EXCEPTION block carry a non-zero query_id (the running statement) and parent_query_id equal to the outer caller's query_id, distinct from each other. The advantage over the TAP harness is iteration speed: once the quickstart image is built (one-time vcpkg compile), the script runs in a few seconds without needing a TAP-enabled local Postgres or a fresh cmake build. Also adds `.claude/skills/quickstart-validate/SKILL.md` describing the general pattern. Future PRs touching query-telemetry semantics can drop a `scripts/quickstart-validate-.sh` alongside this one without re-deriving how to drive the stack. Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/skills/quickstart-validate/SKILL.md | 61 +++++++ .../quickstart-validate-parent-query-id.sh | 168 ++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 .claude/skills/quickstart-validate/SKILL.md create mode 100755 scripts/quickstart-validate-parent-query-id.sh diff --git a/.claude/skills/quickstart-validate/SKILL.md b/.claude/skills/quickstart-validate/SKILL.md new file mode 100644 index 0000000..2174edd --- /dev/null +++ b/.claude/skills/quickstart-validate/SKILL.md @@ -0,0 +1,61 @@ +--- +name: quickstart-validate +description: Validate pg_stat_ch behavior end-to-end against the local quickstart Docker stack (Postgres + ClickHouse). Use when a change affects emitted event content (parent_query_id, cmd_type, err_sqlstate, buffer usage, etc.) and you want to confirm it round-trips through to ClickHouse before pushing — i.e., the kind of verification the TAP test gives you but without needing a TAP-enabled local Postgres build. +--- + +# Quickstart-based PR validation + +Use the quickstart stack (`docker/quickstart/docker-compose.yml`) plus a per-PR validation script to confirm a change reaches ClickHouse with the expected event content. + +## Steps + +1. **Bring up the quickstart stack** (idempotent): + ```bash + ./scripts/quickstart.sh up + ``` + First run takes ~10–15 min because the postgres container builds `pg_stat_ch` from scratch via vcpkg + cmake. Cached subsequent runs are seconds. + +2. **Pick the validation script for the topic under test.** The convention is `scripts/quickstart-validate-.sh`: + - `scripts/quickstart-validate-parent-query-id.sh` — exercises parent_query_id linkage and the off-by-one in CaptureLogEvent. + - …add more as new PRs need them. + + If no script matches the change, write one (see "Adding a new script" below). + +3. **Run it**: + ```bash + ./scripts/quickstart-validate-.sh + ``` + Exits 0 on full pass; per-assertion PASS/FAIL printed to stderr. Output ends with a `Result: N passed, M failed` summary. + +4. **On failure, inspect raw events**: + ```bash + ./scripts/quickstart.sh ch -q "SELECT * FROM pg_stat_ch.events_raw ORDER BY ts_start DESC LIMIT 20 FORMAT Pretty" + ./scripts/quickstart.sh pg # open psql for ad-hoc reproduction + ``` + +5. **Tear down when done** (optional — leaving it up speeds up next iteration): + ```bash + ./scripts/quickstart.sh down + ``` + +## When to use this skill vs alternatives + +- **Quickstart-validate**: cheap, fast iteration on event-content semantics. Best for "did my change produce the right rows in ClickHouse?" +- **`./scripts/run-tests.sh 18 regress`**: PG regression suite. Best for SQL-level functionality (does `pg_stat_ch_stats()` return the right shape, does the extension load, etc.). Doesn't exercise the ClickHouse side. +- **`./scripts/run-tests.sh ../postgres/install_tap tap`**: TAP harness, which the CI uses. Most thorough but requires a TAP-enabled local Postgres and a fresh local build. Use before opening the PR, or when iterating on `t/*.pl` files directly. + +## Adding a new script + +Use `scripts/quickstart-validate-parent-query-id.sh` as a template. Each script should: + +1. `ensure_stack_up`: idempotently bring up the quickstart compose if it isn't running. +2. **Set up fixtures** with `pg_exec </dev/null | tr -d '[:space:]' +} + +ensure_stack_up() { + if ! docker compose -f "$COMPOSE_FILE" ps --status running 2>/dev/null | grep -q postgres; then + echo "Quickstart stack not running. Bringing it up (first run takes ~10-15 min)..." + ./scripts/quickstart.sh up + fi +} + +expect() { + local name="$1" expected="$2" actual="$3" comparator="${4:-=}" + local ok=0 + case "$comparator" in + '=') [[ "$actual" == "$expected" ]] && ok=1 ;; + '>=') [[ "$actual" -ge "$expected" ]] && ok=1 ;; + esac + if [[ "$ok" -eq 1 ]]; then + printf ' \033[32mPASS\033[0m %s\n' "$name" + PASSED=$((PASSED + 1)) + else + printf ' \033[31mFAIL\033[0m %s (expected %s %s, got %s)\n' \ + "$name" "$comparator" "$expected" "$actual" + FAILED=$((FAILED + 1)) + fi +} + +ensure_stack_up + +echo "Setting up fixtures..." +pg_exec <<'SQL' >/dev/null +CREATE EXTENSION IF NOT EXISTS pg_stat_ch; + +DROP TABLE IF EXISTS pqid_top_marker CASCADE; +DROP TABLE IF EXISTS pqid_inner_marker CASCADE; +DROP TABLE IF EXISTS pqid_err_inner CASCADE; +DROP FUNCTION IF EXISTS pqid_outer_caller(); +DROP FUNCTION IF EXISTS pqid_err_outer(); + +CREATE TABLE pqid_top_marker(x int); +CREATE TABLE pqid_inner_marker(x int); +INSERT INTO pqid_inner_marker VALUES (1); +CREATE TABLE pqid_err_inner(x int); +INSERT INTO pqid_err_inner VALUES (0); + +CREATE FUNCTION pqid_outer_caller() RETURNS int +LANGUAGE plpgsql AS $$ +DECLARE v int; +BEGIN + SELECT x INTO v FROM pqid_inner_marker; + RETURN v; +END$$; + +CREATE FUNCTION pqid_err_outer() RETURNS int +LANGUAGE plpgsql AS $$ +DECLARE v int; +BEGIN + BEGIN + SELECT 1 / x INTO v FROM pqid_err_inner; + EXCEPTION WHEN division_by_zero THEN + NULL; + END; + RETURN 1; +END$$; +SQL + +echo "Clearing events_raw and pg_stat_ch stats..." +ch_query "TRUNCATE TABLE pg_stat_ch.events_raw" >/dev/null +pg_exec -c "SELECT pg_stat_ch_reset();" >/dev/null + +echo "Driving test queries..." +pg_exec <<'SQL' >/dev/null +SELECT * FROM pqid_top_marker; +SELECT count(*) FROM pqid_top_marker; +SELECT pqid_outer_caller(); +SELECT pqid_err_outer(); +SELECT pg_stat_ch_flush(); +SQL + +echo "Waiting for events to land..." +for _ in $(seq 1 30); do + count=$(ch_query "SELECT count() FROM pg_stat_ch.events_raw") + if [[ "${count:-0}" -ge 5 ]]; then break; fi + sleep 1 +done + +echo +echo "Test 1: top-level queries report parent_query_id = 0" +expect "top-level rows have parent_query_id = 0" 0 \ + "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE query LIKE '%pqid_top_marker%' AND parent_query_id != 0")" + +echo +echo "Test 2: nested SPI parent_query_id links to outer query_id" +expect "inner row joins to outer via parent_query_id" 1 \ + "$(ch_query " + SELECT count() FROM pg_stat_ch.events_raw inner_q + JOIN pg_stat_ch.events_raw outer_q + ON inner_q.parent_query_id = outer_q.query_id + WHERE inner_q.query LIKE '%pqid_inner_marker%' + AND outer_q.query LIKE '%pqid_outer_caller%' + ")" '>=' +expect "nested SPI query is not reported as top-level" 0 \ + "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE query LIKE '%pqid_inner_marker%' AND parent_query_id = 0")" +expect "outer call still reports parent_query_id = 0" 0 \ + "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE query LIKE '%pqid_outer_caller%' AND parent_query_id != 0")" + +echo +echo "Test 3: error inside nested SPI" +expect "div-by-zero log event landed" 1 \ + "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE cmd_type = 'UNKNOWN' AND err_sqlstate = '22012'")" '>=' +expect "log event parent_query_id = outer caller's query_id" 1 \ + "$(ch_query " + SELECT count() FROM pg_stat_ch.events_raw err_q + JOIN pg_stat_ch.events_raw outer_q + ON err_q.parent_query_id = outer_q.query_id + WHERE err_q.cmd_type = 'UNKNOWN' + AND err_q.err_sqlstate = '22012' + AND outer_q.query LIKE '%pqid_err_outer%' + ")" '>=' +expect "log event query_id is the running statement (non-zero)" 1 \ + "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE cmd_type = 'UNKNOWN' AND err_sqlstate = '22012' AND query_id != 0")" '>=' +expect "log event query_id != parent_query_id (no self-parent)" 0 \ + "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE cmd_type = 'UNKNOWN' AND err_sqlstate = '22012' AND query_id = parent_query_id AND query_id != 0")" + +echo +echo "Cleaning up fixtures..." +pg_exec <<'SQL' >/dev/null +DROP FUNCTION IF EXISTS pqid_outer_caller(); +DROP FUNCTION IF EXISTS pqid_err_outer(); +DROP TABLE IF EXISTS pqid_top_marker; +DROP TABLE IF EXISTS pqid_inner_marker; +DROP TABLE IF EXISTS pqid_err_inner; +SQL + +echo +echo "===================================================" +printf "Result: \033[32m%d passed\033[0m, \033[31m%d failed\033[0m\n" "$PASSED" "$FAILED" +echo "===================================================" +[[ "$FAILED" -eq 0 ]] From c1a0598b0a8d7d71d1ef84666c752dbbabe8f294 Mon Sep 17 00:00:00 2001 From: Josh Ventura Date: Thu, 14 May 2026 12:58:45 -0400 Subject: [PATCH 3/8] fix(docker): copy all sql/ files instead of stale --0.1.sql reference The extension's SQL file is now pg_stat_ch--0.3.sql plus the 0.1->0.3 migration, but the Dockerfile still hardcoded the long-gone pg_stat_ch--0.1.sql path. COPY the whole sql/ directory so version bumps don't break the build again. Co-Authored-By: Claude Opus 4.7 (1M context) --- docker/postgres-ext.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/postgres-ext.Dockerfile b/docker/postgres-ext.Dockerfile index 2112f5c..65b2c10 100644 --- a/docker/postgres-ext.Dockerfile +++ b/docker/postgres-ext.Dockerfile @@ -61,7 +61,7 @@ RUN apt-get update && apt-get install -y \ && rm -rf /var/lib/apt/lists/* COPY --from=builder /build/pg_stat_ch/build/pg_stat_ch.so /usr/lib/postgresql/18/lib/ -COPY --from=builder /build/pg_stat_ch/sql/pg_stat_ch--0.1.sql /usr/share/postgresql/18/extension/ +COPY --from=builder /build/pg_stat_ch/sql/ /usr/share/postgresql/18/extension/ COPY --from=builder /build/pg_stat_ch/pg_stat_ch.control /usr/share/postgresql/18/extension/ HEALTHCHECK --interval=10s --timeout=5s --start-period=30s --retries=3 \ From fdaabb89d146b1f434a684590146948759b07d5e Mon Sep 17 00:00:00 2001 From: Josh Ventura Date: Thu, 14 May 2026 12:59:19 -0400 Subject: [PATCH 4/8] test: switch validation to OTel/Arrow path with debug_arrow_dump_dir The native ClickHouse export path triggers a checksum-mismatch on ClickHouse 26.1 server (clickhouse-cpps LZ4 block format is incompatible with the newer servers checksum logic). Thats a separate, pre-existing bug well chase down on its own. For now, validate via the OTel/Arrow path, which: - uses ZSTD via Arrow IPC, so the LZ4/checksum issue doesnt apply, AND - mirrors the export pathway we actually use in production. Mechanically: - docker/quickstart/docker-compose.otel.yml runs the postgres container with use_otel=on, otel_arrow_passthrough=on, debug_arrow_dump_dir set to a host-mounted directory. No real OTel collector is needed: the gRPC send fails (endpoint points at nothing), but MaybeDumpArrowBatch fires before the send, so IPC files still land. Same trick as t/026_arrow_dump.pl. - scripts/quickstart-validate-parent-query-id.sh now drives queries against this stack, then parses the dumped Arrow IPC files with "uv run --with pyarrow" and asserts on parent_query_id, query_id, and err_sqlstate. - .claude/skills/quickstart-validate/SKILL.md documents the new requirements (uv) and the rationale for the path choice. Requires uv on the host (brew install uv or pip install uv). Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/skills/quickstart-validate/SKILL.md | 75 +++++--- docker/quickstart/arrow-dump/.gitignore | 2 + docker/quickstart/docker-compose.otel.yml | 44 +++++ .../quickstart-validate-parent-query-id.sh | 170 +++++++++++++----- 4 files changed, 219 insertions(+), 72 deletions(-) create mode 100644 docker/quickstart/arrow-dump/.gitignore create mode 100644 docker/quickstart/docker-compose.otel.yml diff --git a/.claude/skills/quickstart-validate/SKILL.md b/.claude/skills/quickstart-validate/SKILL.md index 2174edd..e168fda 100644 --- a/.claude/skills/quickstart-validate/SKILL.md +++ b/.claude/skills/quickstart-validate/SKILL.md @@ -1,61 +1,88 @@ --- name: quickstart-validate -description: Validate pg_stat_ch behavior end-to-end against the local quickstart Docker stack (Postgres + ClickHouse). Use when a change affects emitted event content (parent_query_id, cmd_type, err_sqlstate, buffer usage, etc.) and you want to confirm it round-trips through to ClickHouse before pushing — i.e., the kind of verification the TAP test gives you but without needing a TAP-enabled local Postgres build. +description: Validate pg_stat_ch behavior end-to-end against the local quickstart Docker stack via the OTel/Arrow export path (with debug_arrow_dump_dir capturing Arrow IPC batches). Use when a change affects emitted event content (parent_query_id, cmd_type, err_sqlstate, buffer usage, etc.) and you want to confirm it round-trips through the production export pathway before pushing — i.e., the kind of verification the TAP test gives you but without needing a TAP-enabled local Postgres build. --- -# Quickstart-based PR validation +# Quickstart-based PR validation (OTel/Arrow path) -Use the quickstart stack (`docker/quickstart/docker-compose.yml`) plus a per-PR validation script to confirm a change reaches ClickHouse with the expected event content. +Drive pg_stat_ch through its production export path (OTel + Arrow IPC) against the local quickstart Postgres container, capture Arrow batches via `debug_arrow_dump_dir`, and assert on the contents with pyarrow. ## Steps -1. **Bring up the quickstart stack** (idempotent): +1. **Install `uv`** (one-time, for inline pyarrow): ```bash - ./scripts/quickstart.sh up + brew install uv # macOS + pip install uv # otherwise ``` - First run takes ~10–15 min because the postgres container builds `pg_stat_ch` from scratch via vcpkg + cmake. Cached subsequent runs are seconds. -2. **Pick the validation script for the topic under test.** The convention is `scripts/quickstart-validate-.sh`: +2. **Bring up the OTel/Arrow quickstart**: + ```bash + ./scripts/quickstart.sh up # builds image once, ~10-15 min cold + docker compose -f docker/quickstart/docker-compose.otel.yml up -d --wait # arrow-dump variant + ``` + The `docker-compose.otel.yml` postgres container is configured with + `use_otel=on`, `otel_arrow_passthrough=on`, and + `debug_arrow_dump_dir=/var/lib/postgresql/arrow-dump` (host-mounted to + `docker/quickstart/arrow-dump/`). The `otel_endpoint` points at a + non-existent collector — the gRPC send fails, but + `MaybeDumpArrowBatch()` writes the IPC file *before* the send, so we + still get fully-formed Arrow batches on the host. + +3. **Pick the validation script for the topic under test.** Convention: - `scripts/quickstart-validate-parent-query-id.sh` — exercises parent_query_id linkage and the off-by-one in CaptureLogEvent. - …add more as new PRs need them. - If no script matches the change, write one (see "Adding a new script" below). + If none matches, write one (see "Adding a new script" below). -3. **Run it**: +4. **Run it**: ```bash - ./scripts/quickstart-validate-.sh + ./scripts/quickstart-validate-parent-query-id.sh ``` - Exits 0 on full pass; per-assertion PASS/FAIL printed to stderr. Output ends with a `Result: N passed, M failed` summary. + Auto-creates fixtures, runs queries, flushes, parses the Arrow IPC dumps with pyarrow, prints per-assertion PASS/FAIL, exits non-zero on any failure. -4. **On failure, inspect raw events**: +5. **On failure, inspect the dumps directly**: ```bash - ./scripts/quickstart.sh ch -q "SELECT * FROM pg_stat_ch.events_raw ORDER BY ts_start DESC LIMIT 20 FORMAT Pretty" - ./scripts/quickstart.sh pg # open psql for ad-hoc reproduction + ls docker/quickstart/arrow-dump/*.ipc + uv run --with pyarrow python -c " + import pyarrow.ipc, sys + with open(sys.argv[1], 'rb') as f: + t = pyarrow.ipc.open_stream(f).read_all() + print(t.to_pandas().to_string()) + " docker/quickstart/arrow-dump/.ipc ``` -5. **Tear down when done** (optional — leaving it up speeds up next iteration): +6. **Tear down when done** (optional): ```bash - ./scripts/quickstart.sh down + docker compose -f docker/quickstart/docker-compose.otel.yml down ``` +## Why the OTel/Arrow path and not the native ClickHouse path + +`docker/quickstart/docker-compose.yml` (the ClickHouse-native variant) +currently can't be used end-to-end: clickhouse-cpp's LZ4-compressed +block format trips a checksum-mismatch on ClickHouse 26.1 server +(separate pre-existing bug, not pg_stat_ch's fault). The OTel/Arrow +path uses ZSTD via Arrow IPC and is unaffected, plus it mirrors the +export pathway actually used in production. + ## When to use this skill vs alternatives -- **Quickstart-validate**: cheap, fast iteration on event-content semantics. Best for "did my change produce the right rows in ClickHouse?" -- **`./scripts/run-tests.sh 18 regress`**: PG regression suite. Best for SQL-level functionality (does `pg_stat_ch_stats()` return the right shape, does the extension load, etc.). Doesn't exercise the ClickHouse side. -- **`./scripts/run-tests.sh ../postgres/install_tap tap`**: TAP harness, which the CI uses. Most thorough but requires a TAP-enabled local Postgres and a fresh local build. Use before opening the PR, or when iterating on `t/*.pl` files directly. +- **Quickstart-validate** (this skill): cheap, fast iteration on event-content semantics via the production export path. Best for "did my change produce the right rows downstream?" +- **`./scripts/run-tests.sh 18 regress`**: PG regression suite. Best for SQL-level functionality (does `pg_stat_ch_stats()` return the right shape, does the extension load, etc.). Doesn't exercise the export side. +- **`./scripts/run-tests.sh ../postgres/install_tap tap`**: TAP harness, which CI uses. Most thorough but requires a TAP-enabled local Postgres and a fresh local build. ## Adding a new script Use `scripts/quickstart-validate-parent-query-id.sh` as a template. Each script should: -1. `ensure_stack_up`: idempotently bring up the quickstart compose if it isn't running. +1. `ensure_stack_up`: idempotently bring up the OTel/Arrow quickstart compose if it isn't running. 2. **Set up fixtures** with `pg_exec </dev/null 2>&1; then + echo "error: uv not found in PATH. Install with 'brew install uv' or 'pip install uv'." >&2 + exit 1 +fi + pg_exec() { docker compose -f "$COMPOSE_FILE" exec -T postgres \ psql -U postgres -d postgres -v ON_ERROR_STOP=1 "$@" } -ch_query() { - docker compose -f "$COMPOSE_FILE" exec -T clickhouse \ - clickhouse-client -q "$1" 2>/dev/null | tr -d '[:space:]' -} - ensure_stack_up() { if ! docker compose -f "$COMPOSE_FILE" ps --status running 2>/dev/null | grep -q postgres; then - echo "Quickstart stack not running. Bringing it up (first run takes ~10-15 min)..." - ./scripts/quickstart.sh up + echo "OTel/Arrow quickstart stack not running. Bringing it up..." + # Reuse the image built by ./scripts/quickstart.sh up + if ! docker image inspect quickstart-postgres:latest >/dev/null 2>&1; then + echo " Building image first (one-time, ~10-15 min cold)..." + docker compose -f docker/quickstart/docker-compose.yml build postgres + fi + docker compose -f "$COMPOSE_FILE" up -d --wait fi } @@ -56,9 +68,12 @@ expect() { ensure_stack_up -echo "Setting up fixtures..." +echo "Resetting state and setting up fixtures..." +rm -f "$DUMP_DIR"/*.ipc 2>/dev/null || true + pg_exec <<'SQL' >/dev/null CREATE EXTENSION IF NOT EXISTS pg_stat_ch; +SELECT pg_stat_ch_reset(); DROP TABLE IF EXISTS pqid_top_marker CASCADE; DROP TABLE IF EXISTS pqid_inner_marker CASCADE; @@ -93,8 +108,8 @@ BEGIN END$$; SQL -echo "Clearing events_raw and pg_stat_ch stats..." -ch_query "TRUNCATE TABLE pg_stat_ch.events_raw" >/dev/null +# Clear dumps generated by setup, then drive the test queries. +rm -f "$DUMP_DIR"/*.ipc 2>/dev/null || true pg_exec -c "SELECT pg_stat_ch_reset();" >/dev/null echo "Driving test queries..." @@ -106,50 +121,109 @@ SELECT pqid_err_outer(); SELECT pg_stat_ch_flush(); SQL -echo "Waiting for events to land..." +# Wait for IPC files to land. +echo "Waiting for Arrow IPC dumps..." for _ in $(seq 1 30); do - count=$(ch_query "SELECT count() FROM pg_stat_ch.events_raw") - if [[ "${count:-0}" -ge 5 ]]; then break; fi + if compgen -G "$DUMP_DIR"/*.ipc >/dev/null; then break; fi sleep 1 done +# Parse the dumped Arrow IPC batches and run assertions. We emit one +# `KEY=VALUE` line per assertion result from the inline pyarrow script so +# the shell can pick them up without re-implementing Arrow parsing. +results=$(uv run --quiet --with pyarrow --python 3.12 - "$DUMP_DIR" <<'PY' +import glob, os, sys +import pyarrow as pa +import pyarrow.ipc as ipc + +dump_dir = sys.argv[1] +rows = [] +for path in sorted(glob.glob(os.path.join(dump_dir, "*.ipc"))): + with open(path, "rb") as f: + reader = ipc.open_stream(f) + table = reader.read_all() + for batch_dict in table.to_pylist(): + rows.append(batch_dict) + +def has(text, predicate=None): + out = [] + for r in rows: + q = r.get("query_text") or "" + if text in q and (predicate is None or predicate(r)): + out.append(r) + return out + +def by_query_id(qid): + return [r for r in rows if r.get("query_id") == qid] + +# Test 1: top-level marker queries report parent_query_id == 0 +top_rows = has("pqid_top_marker") +top_with_nonzero_parent = [r for r in top_rows if r.get("parent_query_id", 0) != 0] +print(f"top_rows={len(top_rows)}") +print(f"top_nonzero_parent={len(top_with_nonzero_parent)}") + +# Test 2: nested SPI inner.parent_query_id matches outer.query_id +outer_rows = has("pqid_outer_caller") +inner_rows = has("pqid_inner_marker") +linked = 0 +for inner in inner_rows: + p = inner.get("parent_query_id") + if p and any(o.get("query_id") == p for o in outer_rows): + linked += 1 +inner_orphans = [r for r in inner_rows if r.get("parent_query_id", 0) == 0] +outer_self_parent = [r for r in outer_rows if r.get("parent_query_id", 0) != 0] +print(f"outer_rows={len(outer_rows)}") +print(f"inner_rows={len(inner_rows)}") +print(f"inner_linked_to_outer={linked}") +print(f"inner_with_zero_parent={len(inner_orphans)}") +print(f"outer_with_nonzero_parent={len(outer_self_parent)}") + +# Test 3: error event(s) for div-by-zero +err_rows = [r for r in rows if r.get("err_sqlstate") == "22012"] +err_outer = has("pqid_err_outer") +err_linked = 0 +err_qid_nonzero = 0 +err_self_parent = 0 +for er in err_rows: + qid = er.get("query_id", 0) + pqid = er.get("parent_query_id", 0) + if qid != 0: + err_qid_nonzero += 1 + if pqid and any(o.get("query_id") == pqid for o in err_outer): + err_linked += 1 + if qid != 0 and qid == pqid: + err_self_parent += 1 +print(f"err_rows={len(err_rows)}") +print(f"err_linked_to_outer={err_linked}") +print(f"err_qid_nonzero={err_qid_nonzero}") +print(f"err_self_parent={err_self_parent}") +PY +) + +echo "$results" echo + +# Pull individual values out. +get() { echo "$results" | awk -F= -v k="$1" '$1==k {print $2}'; } + echo "Test 1: top-level queries report parent_query_id = 0" -expect "top-level rows have parent_query_id = 0" 0 \ - "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE query LIKE '%pqid_top_marker%' AND parent_query_id != 0")" +expect "top-level rows landed" 1 "$(get top_rows)" '>=' +expect "top-level parent_query_id = 0" 0 "$(get top_nonzero_parent)" echo echo "Test 2: nested SPI parent_query_id links to outer query_id" -expect "inner row joins to outer via parent_query_id" 1 \ - "$(ch_query " - SELECT count() FROM pg_stat_ch.events_raw inner_q - JOIN pg_stat_ch.events_raw outer_q - ON inner_q.parent_query_id = outer_q.query_id - WHERE inner_q.query LIKE '%pqid_inner_marker%' - AND outer_q.query LIKE '%pqid_outer_caller%' - ")" '>=' -expect "nested SPI query is not reported as top-level" 0 \ - "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE query LIKE '%pqid_inner_marker%' AND parent_query_id = 0")" -expect "outer call still reports parent_query_id = 0" 0 \ - "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE query LIKE '%pqid_outer_caller%' AND parent_query_id != 0")" +expect "outer rows landed" 1 "$(get outer_rows)" '>=' +expect "inner rows landed" 1 "$(get inner_rows)" '>=' +expect "inner row joins outer via parent_query_id" 1 "$(get inner_linked_to_outer)" '>=' +expect "nested SPI is not reported as top-level" 0 "$(get inner_with_zero_parent)" +expect "outer call still reports parent_query_id = 0" 0 "$(get outer_with_nonzero_parent)" echo echo "Test 3: error inside nested SPI" -expect "div-by-zero log event landed" 1 \ - "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE cmd_type = 'UNKNOWN' AND err_sqlstate = '22012'")" '>=' -expect "log event parent_query_id = outer caller's query_id" 1 \ - "$(ch_query " - SELECT count() FROM pg_stat_ch.events_raw err_q - JOIN pg_stat_ch.events_raw outer_q - ON err_q.parent_query_id = outer_q.query_id - WHERE err_q.cmd_type = 'UNKNOWN' - AND err_q.err_sqlstate = '22012' - AND outer_q.query LIKE '%pqid_err_outer%' - ")" '>=' -expect "log event query_id is the running statement (non-zero)" 1 \ - "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE cmd_type = 'UNKNOWN' AND err_sqlstate = '22012' AND query_id != 0")" '>=' -expect "log event query_id != parent_query_id (no self-parent)" 0 \ - "$(ch_query "SELECT count() FROM pg_stat_ch.events_raw WHERE cmd_type = 'UNKNOWN' AND err_sqlstate = '22012' AND query_id = parent_query_id AND query_id != 0")" +expect "div-by-zero log event landed" 1 "$(get err_rows)" '>=' +expect "log event parent_query_id = outer's query_id" 1 "$(get err_linked_to_outer)" '>=' +expect "log event query_id is the running statement" 1 "$(get err_qid_nonzero)" '>=' +expect "log event query_id != parent_query_id" 0 "$(get err_self_parent)" echo echo "Cleaning up fixtures..." From d2f0bf7601ee2bc245a07320631b442ad0539f6f Mon Sep 17 00:00:00 2001 From: Josh Ventura Date: Thu, 14 May 2026 16:56:23 -0400 Subject: [PATCH 5/8] fix(ci): pin vcpkg checkout to tag 2026.04.27 across all entry points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit setup-vcpkg/action.yml was doing a plain `git clone microsoft/vcpkg` with no checkout, so every CI run landed on whatever master HEAD happened to be that day. That silently shifts port-version ABI hashes between runs, invalidating the binary archive cache that the action set up two steps later. Result: every CI run paid the cold vcpkg-compile cost. Align all four pinning points on tag 2026.04.27 (commit 56bb2411): - third_party/vcpkg submodule - vcpkg-configuration.json baseline - docker/postgres-ext.Dockerfile ARG VCPKG_COMMIT - .github/actions/setup-vcpkg/action.yml (the actual fix — adds the `git checkout` that was missing, plus a `git fetch` for the case where the runner already has a cached vcpkg clone from a prior run) We were previously at 2026.03.18-398 (mid-release); the tagged commit is 144 commits forward of that, so this is a small forward move to a curated port-set, not a rewind. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/actions/setup-vcpkg/action.yml | 10 ++++++++++ docker/postgres-ext.Dockerfile | 3 ++- third_party/vcpkg | 2 +- vcpkg-configuration.json | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/actions/setup-vcpkg/action.yml b/.github/actions/setup-vcpkg/action.yml index 6dd43dd..4953895 100644 --- a/.github/actions/setup-vcpkg/action.yml +++ b/.github/actions/setup-vcpkg/action.yml @@ -12,11 +12,21 @@ runs: steps: - name: Install vcpkg shell: bash + env: + # Pin to the same commit as the vcpkg submodule and the Dockerfile + # (vcpkg tag 2026.04.27). Without this checkout the action lands on + # master HEAD, which drifts port-version ABI hashes and silently + # invalidates the binary archive cache between CI runs. + VCPKG_COMMIT: 56bb2411609227288b70117ead2c47585ba07713 run: | if [ ! -d "$HOME/vcpkg" ]; then # Full clone required — Arrow depends on Boost, which needs vcpkg git history git clone https://github.com/microsoft/vcpkg.git "$HOME/vcpkg" + git -C "$HOME/vcpkg" checkout "$VCPKG_COMMIT" "$HOME/vcpkg/bootstrap-vcpkg.sh" -disableMetrics + else + git -C "$HOME/vcpkg" fetch --quiet origin "$VCPKG_COMMIT" || true + git -C "$HOME/vcpkg" checkout "$VCPKG_COMMIT" fi echo "VCPKG_ROOT=$HOME/vcpkg" >> "$GITHUB_ENV" echo "$HOME/vcpkg" >> "$GITHUB_PATH" diff --git a/docker/postgres-ext.Dockerfile b/docker/postgres-ext.Dockerfile index 65b2c10..c1c3a7a 100644 --- a/docker/postgres-ext.Dockerfile +++ b/docker/postgres-ext.Dockerfile @@ -22,7 +22,8 @@ RUN apt-get update && apt-get install -y curl ca-certificates gnupg \ # Install vcpkg # Pin to the same commit as the vcpkg submodule for reproducible builds -ARG VCPKG_COMMIT=12159785447291b4069c82a3fe9c2770a393ac7f +# (vcpkg tag 2026.04.27). +ARG VCPKG_COMMIT=56bb2411609227288b70117ead2c47585ba07713 RUN git clone https://github.com/microsoft/vcpkg.git /opt/vcpkg \ && git -C /opt/vcpkg checkout "$VCPKG_COMMIT" \ && /opt/vcpkg/bootstrap-vcpkg.sh -disableMetrics diff --git a/third_party/vcpkg b/third_party/vcpkg index 1215978..56bb241 160000 --- a/third_party/vcpkg +++ b/third_party/vcpkg @@ -1 +1 @@ -Subproject commit 12159785447291b4069c82a3fe9c2770a393ac7f +Subproject commit 56bb2411609227288b70117ead2c47585ba07713 diff --git a/vcpkg-configuration.json b/vcpkg-configuration.json index 2103a7c..fc9dc4f 100644 --- a/vcpkg-configuration.json +++ b/vcpkg-configuration.json @@ -1,7 +1,7 @@ { "default-registry": { "kind": "builtin", - "baseline": "12159785447291b4069c82a3fe9c2770a393ac7f" + "baseline": "56bb2411609227288b70117ead2c47585ba07713" }, "overlay-triplets": ["./triplets"], "overlay-ports": ["./overlay-ports"] From 7af4504df88592dd979e601686fcfa136ffb0acb Mon Sep 17 00:00:00 2001 From: Josh Ventura Date: Fri, 15 May 2026 11:19:36 -0400 Subject: [PATCH 6/8] fix(quickstart-validate): compare parent_query_id as a string, not int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arrow dict-encodes query_id and parent_query_id as decimal *strings* ("0" for top-level / no parent). The inline pyarrow validator was comparing the resulting Python strings against int 0, which is always non-equal — making every top-level row look like it had a non-zero parent. Add an is_zero() helper for the "is this column zero?" checks. Mirrors the same fix in t/029 over on the parent-query-id-surgical branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../quickstart-validate-parent-query-id.sh | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/scripts/quickstart-validate-parent-query-id.sh b/scripts/quickstart-validate-parent-query-id.sh index a0481a9..54d0725 100755 --- a/scripts/quickstart-validate-parent-query-id.sh +++ b/scripts/quickstart-validate-parent-query-id.sh @@ -156,9 +156,14 @@ def has(text, predicate=None): def by_query_id(qid): return [r for r in rows if r.get("query_id") == qid] +# Arrow dict-encodes query_id and parent_query_id as decimal *strings*; "0" +# means top-level / no parent. Compare against strings, not ints. +def is_zero(v): + return v is None or v == "" or v == "0" + # Test 1: top-level marker queries report parent_query_id == 0 top_rows = has("pqid_top_marker") -top_with_nonzero_parent = [r for r in top_rows if r.get("parent_query_id", 0) != 0] +top_with_nonzero_parent = [r for r in top_rows if not is_zero(r.get("parent_query_id"))] print(f"top_rows={len(top_rows)}") print(f"top_nonzero_parent={len(top_with_nonzero_parent)}") @@ -168,10 +173,10 @@ inner_rows = has("pqid_inner_marker") linked = 0 for inner in inner_rows: p = inner.get("parent_query_id") - if p and any(o.get("query_id") == p for o in outer_rows): + if not is_zero(p) and any(o.get("query_id") == p for o in outer_rows): linked += 1 -inner_orphans = [r for r in inner_rows if r.get("parent_query_id", 0) == 0] -outer_self_parent = [r for r in outer_rows if r.get("parent_query_id", 0) != 0] +inner_orphans = [r for r in inner_rows if is_zero(r.get("parent_query_id"))] +outer_self_parent = [r for r in outer_rows if not is_zero(r.get("parent_query_id"))] print(f"outer_rows={len(outer_rows)}") print(f"inner_rows={len(inner_rows)}") print(f"inner_linked_to_outer={linked}") @@ -185,13 +190,13 @@ err_linked = 0 err_qid_nonzero = 0 err_self_parent = 0 for er in err_rows: - qid = er.get("query_id", 0) - pqid = er.get("parent_query_id", 0) - if qid != 0: + qid = er.get("query_id") + pqid = er.get("parent_query_id") + if not is_zero(qid): err_qid_nonzero += 1 - if pqid and any(o.get("query_id") == pqid for o in err_outer): + if not is_zero(pqid) and any(o.get("query_id") == pqid for o in err_outer): err_linked += 1 - if qid != 0 and qid == pqid: + if not is_zero(qid) and qid == pqid: err_self_parent += 1 print(f"err_rows={len(err_rows)}") print(f"err_linked_to_outer={err_linked}") From 06e2a69de9cf56345fdbadfff5e50bc66acd32cd Mon Sep 17 00:00:00 2001 From: Josh Ventura Date: Fri, 15 May 2026 11:25:53 -0400 Subject: [PATCH 7/8] fix(quickstart-validate): filter outer/inner to SELECT for nested-SPI check The Arrow dumps include the setup CREATE TABLE / INSERT / DROP utility statements that mention pqid_outer_caller / pqid_inner_marker. Those rows have no parent_query_id link, so the "no orphan inner rows" check was failing against them rather than against the actual nested SELECT. Filter to db_operation == "SELECT" so we only assert on the rows the test is actually about. Mirrors the same fix applied in t/029 on the parent-query-id-surgical branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/quickstart-validate-parent-query-id.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/quickstart-validate-parent-query-id.sh b/scripts/quickstart-validate-parent-query-id.sh index 54d0725..ff8581c 100755 --- a/scripts/quickstart-validate-parent-query-id.sh +++ b/scripts/quickstart-validate-parent-query-id.sh @@ -167,9 +167,11 @@ top_with_nonzero_parent = [r for r in top_rows if not is_zero(r.get("parent_quer print(f"top_rows={len(top_rows)}") print(f"top_nonzero_parent={len(top_with_nonzero_parent)}") -# Test 2: nested SPI inner.parent_query_id matches outer.query_id -outer_rows = has("pqid_outer_caller") -inner_rows = has("pqid_inner_marker") +# Test 2: nested SPI inner.parent_query_id matches outer.query_id. Filter +# to SELECT so the setup CREATE TABLE / INSERT / DROP utility statements +# (which mention the table) don't pollute the "no orphans" check. +outer_rows = [r for r in has("pqid_outer_caller") if r.get("db_operation") == "SELECT"] +inner_rows = [r for r in has("pqid_inner_marker") if r.get("db_operation") == "SELECT"] linked = 0 for inner in inner_rows: p = inner.get("parent_query_id") From 2992779fec9f1c32ed24a6fb86910b8df3144ebc Mon Sep 17 00:00:00 2001 From: Josh Ventura Date: Fri, 15 May 2026 14:14:26 -0400 Subject: [PATCH 8/8] fix(quickstart-validate): rewrite Test 3 to use RAISE WARNING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old Test 3 drove a caught division_by_zero through a plpgsql EXCEPTION block, expecting an err_sqlstate=22012 event to land in the Arrow dump. No such event ever arrives: emit_log_hook does not fire from errfinish for ERROR-level events (errfinish PG_RE_THROWs without calling EmitErrorReport). For caught-in-EXCEPTION errors, emit_log_hook never fires at all. For uncaught errors it fires from PostgresMain's top-level catch, after all PschQueryFrames have been popped during unwinding — query_id then comes out as 0. Switch to RAISE WARNING fired from inside a nested SPI call. WARNING goes through EmitErrorReport directly, so our hook fires while the inner SPI's frame is still on the stack — the only state where CaptureLogEvent's slot choice is observable. Mirrors the same fix applied to t/028 and t/029 on the parent-query-id-surgical branch. The new assertions distinguish "inner SPI queryid" from "outer caller queryid" so an off-by-one regression that attributes the warning to the wrong slot would be caught. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../quickstart-validate-parent-query-id.sh | 97 +++++++++++-------- 1 file changed, 56 insertions(+), 41 deletions(-) diff --git a/scripts/quickstart-validate-parent-query-id.sh b/scripts/quickstart-validate-parent-query-id.sh index ff8581c..e6aec91 100755 --- a/scripts/quickstart-validate-parent-query-id.sh +++ b/scripts/quickstart-validate-parent-query-id.sh @@ -8,9 +8,17 @@ # # 1. Top-level queries report parent_query_id = 0. # 2. Nested SPI queries report parent_query_id = outer's query_id. -# 3. Error events captured inside a plpgsql EXCEPTION block carry both -# a non-zero query_id (the running statement) and parent_query_id -# equal to the outer caller's query_id, distinct from each other. +# 3. Log events captured by emit_log_hook while a nested SPI query is +# on the stack carry query_id of the running (inner) statement and +# parent_query_id of the outer caller (catches CaptureLogEvent's +# off-by-one). We use RAISE WARNING for this rather than a caught +# ERROR: emit_log_hook does not fire from errfinish for ERROR-level +# events (errfinish PG_RE_THROWs before EmitErrorReport runs; +# emit_log_hook only fires later from PostgresMain's top-level +# catch, after all frames have been popped — or never, for errors +# caught inside a plpgsql EXCEPTION block). WARNING goes through +# EmitErrorReport directly so the frame stack is intact when our +# hook runs. # # Why not the native ClickHouse path? clickhouse-cpp's LZ4-compressed # block format currently triggers a checksum-mismatch on ClickHouse 26.1 @@ -77,15 +85,16 @@ SELECT pg_stat_ch_reset(); DROP TABLE IF EXISTS pqid_top_marker CASCADE; DROP TABLE IF EXISTS pqid_inner_marker CASCADE; -DROP TABLE IF EXISTS pqid_err_inner CASCADE; +DROP TABLE IF EXISTS pqid_warn_tbl CASCADE; DROP FUNCTION IF EXISTS pqid_outer_caller(); -DROP FUNCTION IF EXISTS pqid_err_outer(); +DROP FUNCTION IF EXISTS pqid_warn_outer(); +DROP FUNCTION IF EXISTS pqid_emit_warn(int); CREATE TABLE pqid_top_marker(x int); CREATE TABLE pqid_inner_marker(x int); INSERT INTO pqid_inner_marker VALUES (1); -CREATE TABLE pqid_err_inner(x int); -INSERT INTO pqid_err_inner VALUES (0); +CREATE TABLE pqid_warn_tbl(x int); +INSERT INTO pqid_warn_tbl VALUES (1); CREATE FUNCTION pqid_outer_caller() RETURNS int LANGUAGE plpgsql AS $$ @@ -95,16 +104,19 @@ BEGIN RETURN v; END$$; -CREATE FUNCTION pqid_err_outer() RETURNS int +CREATE FUNCTION pqid_emit_warn(x int) RETURNS int +LANGUAGE plpgsql AS $$ +BEGIN + RAISE WARNING 'pqid_warn_marker' USING ERRCODE = '01001'; + RETURN x; +END$$; + +CREATE FUNCTION pqid_warn_outer() RETURNS int LANGUAGE plpgsql AS $$ DECLARE v int; BEGIN - BEGIN - SELECT 1 / x INTO v FROM pqid_err_inner; - EXCEPTION WHEN division_by_zero THEN - NULL; - END; - RETURN 1; + SELECT pqid_emit_warn(x) INTO v FROM pqid_warn_tbl; + RETURN v; END$$; SQL @@ -117,7 +129,7 @@ pg_exec <<'SQL' >/dev/null SELECT * FROM pqid_top_marker; SELECT count(*) FROM pqid_top_marker; SELECT pqid_outer_caller(); -SELECT pqid_err_outer(); +SELECT pqid_warn_outer(); SELECT pg_stat_ch_flush(); SQL @@ -185,25 +197,26 @@ print(f"inner_linked_to_outer={linked}") print(f"inner_with_zero_parent={len(inner_orphans)}") print(f"outer_with_nonzero_parent={len(outer_self_parent)}") -# Test 3: error event(s) for div-by-zero -err_rows = [r for r in rows if r.get("err_sqlstate") == "22012"] -err_outer = has("pqid_err_outer") -err_linked = 0 -err_qid_nonzero = 0 -err_self_parent = 0 -for er in err_rows: - qid = er.get("query_id") - pqid = er.get("parent_query_id") - if not is_zero(qid): - err_qid_nonzero += 1 - if not is_zero(pqid) and any(o.get("query_id") == pqid for o in err_outer): - err_linked += 1 - if not is_zero(qid) and qid == pqid: - err_self_parent += 1 -print(f"err_rows={len(err_rows)}") -print(f"err_linked_to_outer={err_linked}") -print(f"err_qid_nonzero={err_qid_nonzero}") -print(f"err_self_parent={err_self_parent}") +# Test 3: log event captured inside nested SPI (via RAISE WARNING). +# The event has no query_text (CaptureLogEvent leaves it empty), so we +# identify it by err_sqlstate '01001' (our RAISE WARNING's custom code). +warn_outer = [r for r in has("pqid_warn_outer") if r.get("db_operation") == "SELECT"] +warn_inner = [r for r in has("pqid_emit_warn") if r.get("db_operation") == "SELECT"] +warn_outer_qids = {r.get("query_id") for r in warn_outer if not is_zero(r.get("query_id"))} +warn_inner_qids = {r.get("query_id") for r in warn_inner if not is_zero(r.get("query_id"))} +warn_rows = [r for r in rows if r.get("err_sqlstate") == "01001"] +warn_linked_to_outer = sum(1 for r in warn_rows + if r.get("parent_query_id") in warn_outer_qids) +warn_qid_is_inner = sum(1 for r in warn_rows if r.get("query_id") in warn_inner_qids) +warn_qid_is_outer = sum(1 for r in warn_rows if r.get("query_id") in warn_outer_qids) +warn_self_parent = sum(1 for r in warn_rows + if not is_zero(r.get("query_id")) + and r.get("query_id") == r.get("parent_query_id")) +print(f"warn_rows={len(warn_rows)}") +print(f"warn_linked_to_outer={warn_linked_to_outer}") +print(f"warn_qid_is_inner={warn_qid_is_inner}") +print(f"warn_qid_is_outer={warn_qid_is_outer}") +print(f"warn_self_parent={warn_self_parent}") PY ) @@ -226,20 +239,22 @@ expect "nested SPI is not reported as top-level" 0 "$(get inner_with_zero_par expect "outer call still reports parent_query_id = 0" 0 "$(get outer_with_nonzero_parent)" echo -echo "Test 3: error inside nested SPI" -expect "div-by-zero log event landed" 1 "$(get err_rows)" '>=' -expect "log event parent_query_id = outer's query_id" 1 "$(get err_linked_to_outer)" '>=' -expect "log event query_id is the running statement" 1 "$(get err_qid_nonzero)" '>=' -expect "log event query_id != parent_query_id" 0 "$(get err_self_parent)" +echo "Test 3: log event inside nested SPI" +expect "RAISE WARNING log event landed" 1 "$(get warn_rows)" '>=' +expect "log event parent_query_id = outer's query_id" 1 "$(get warn_linked_to_outer)" '>=' +expect "log event query_id = inner SPI (the running)" 1 "$(get warn_qid_is_inner)" '>=' +expect "log event query_id is NOT outer (no off-by-one)" 0 "$(get warn_qid_is_outer)" +expect "log event query_id != parent_query_id" 0 "$(get warn_self_parent)" echo echo "Cleaning up fixtures..." pg_exec <<'SQL' >/dev/null DROP FUNCTION IF EXISTS pqid_outer_caller(); -DROP FUNCTION IF EXISTS pqid_err_outer(); +DROP FUNCTION IF EXISTS pqid_warn_outer(); +DROP FUNCTION IF EXISTS pqid_emit_warn(int); DROP TABLE IF EXISTS pqid_top_marker; DROP TABLE IF EXISTS pqid_inner_marker; -DROP TABLE IF EXISTS pqid_err_inner; +DROP TABLE IF EXISTS pqid_warn_tbl; SQL echo