From f4d9846854155ebc0b2d2bbbb1dfdb02c172c6be Mon Sep 17 00:00:00 2001 From: UtkarshBhardwaj007 Date: Thu, 4 Jun 2026 10:05:59 +0100 Subject: [PATCH] fix: run the memory watchdog for every command; plug the connect() adapter leak Three playground processes grew past 40 GB each and froze the machine. Two causes, both fixed: 1. connect() leaked its session-probe TerminalAdapter on the existing-session and probe-failure paths, and nobody owned the QR-path adapter after the init TUI exited. The leaked statement-store WebSocket + subscriptions keep the event loop alive and are the machinery that can enter the documented polkadot-api microtask-flood state. 2. The 4 GB memory watchdog only ran for deploy/mod/contract. The worker-thread watchdog is the only guard that survives event-loop starvation (signal handlers, hardExit timers, and index.ts's final process.exit all stop firing), so it now defaults ON for every command, with explicit opt-out preserved. New regression coverage: src/utils/auth.connect.test.ts pins the adapter lifecycle contract (destroy on existing/throw paths, ownership transfer on the QR path); cli-runtime.test.ts pins the watchdog default. --- .changeset/heavy-pugs-guard.md | 5 ++ CLAUDE.md | 4 +- src/cli-runtime.test.ts | 18 +++++ src/cli-runtime.ts | 17 ++++- src/commands/init/index.ts | 10 +++ src/utils/auth.connect.test.ts | 121 +++++++++++++++++++++++++++++++++ src/utils/auth.ts | 33 +++++++-- 7 files changed, 197 insertions(+), 11 deletions(-) create mode 100644 .changeset/heavy-pugs-guard.md create mode 100644 src/utils/auth.connect.test.ts diff --git a/.changeset/heavy-pugs-guard.md b/.changeset/heavy-pugs-guard.md new file mode 100644 index 0000000..1f1b03d --- /dev/null +++ b/.changeset/heavy-pugs-guard.md @@ -0,0 +1,5 @@ +--- +"playground-cli": patch +--- + +Fix runaway memory growth in lingering `playground init` processes. The 4 GB memory watchdog now runs for every command by default (previously only `deploy`, `mod`, and `contract`), so a process whose event loop gets starved by a leaked subscription is killed with an actionable message instead of growing to tens of GB and freezing the machine. Also plugs the session-probe adapter leak in `playground init`: the login adapter's WebSocket is now released on the already-logged-in path, on probe failure, and after the QR login completes. diff --git a/CLAUDE.md b/CLAUDE.md index ca9bc97..a610775 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -77,7 +77,7 @@ These aren't self-evident from reading the code and have bitten us before. Treat - **Telemetry bootstrap** (`src/bootstrap.ts`) is the FIRST import in `src/index.ts`. It sets `BULLETIN_DEPLOY_USE_AMBIENT_SENTRY=1` and `BULLETIN_DEPLOY_HOST_APP=playground-cli` before `bulletin-deploy` evaluates, then maps `DOT_TELEMETRY`/internal-context detection to `BULLETIN_DEPLOY_TELEMETRY`. Don't leave `BULLETIN_DEPLOY_TELEMETRY` unset while setting the host app: `bulletin-deploy` treats `playground-cli` as an internal host, which would enable deploy telemetry for external users. - **Throttle TUI info updates.** bulletin-deploy logs per-chunk, builds stream thousands of lines/sec. `setState`-per-event floods React's reconciler with backpressure (can balloon past 20 GB and freeze the OS). `RunningStage` coalesces "latest info" updates to ≤10/sec via a ref + timer and caps line length at 160 chars. Don't hook raw per-line streams directly into Ink state. - **`DeployLogParser.feed()` MUST NOT emit an event per log line.** It's called for every console line bulletin-deploy prints. We emit only for phase-banner matches and `[N/M]` chunk progress; everything else returns `null`. A catch-all `info` emit allocates ~200 bytes × thousands of lines and was a measurable contributor to chunk-upload memory pressure. -- **`startMemoryWatchdog()` runs for both `playground deploy` and `playground mod`.** Mod's tarball download is a streaming pipe through `node:zlib` + `tar.extract()`; a stuck IPFS gateway or malformed tarball can leak buffers. Any new top-level command doing meaningful I/O should also call `startMemoryWatchdog()` and register `stopWatchdog` via `onProcessShutdown()`. +- **The memory watchdog runs for EVERY command by default** (`runCliCommand`'s `watchdog` option defaults to true). It is the only guard that survives event-loop starvation: when a leaked polkadot-api subscription enters the microtask-flood state, signal handlers, `hardExit` timers, and `src/index.ts`'s final `process.exit()` all stop firing — the process looks finished but lingers invisibly and grows unbounded. We shipped exactly that in June 2026: `playground init` ran watchdog-less, and three zombie `playground` processes reached 40+ GB each, swapping the laptop to death. Do NOT opt a command out to save the worker thread — it costs one 1 Hz `memoryUsage()` sample. The related session-probe rule: every `createAdapter()` call site must destroy the adapter on EVERY path that doesn't transfer ownership to the caller (`connect()`'s existing-session and probe-throw paths leaked it; `src/utils/auth.connect.test.ts` pins the contract). - **`QueryResult` from `@parity/product-sdk-contracts@0.5+` is a discriminated union.** Narrow on `.success` before reading `.value`. On the failure branch `.value` is the runtime's dispatch-error payload (`unknown`). On the success branch `gasRequired` is non-optional. We apply this in `src/utils/contractManifest.ts::resolveLiveContractAddresses`, `src/commands/mod/AppBrowser.tsx`, and `src/commands/mod/SetupScreen.tsx`. ## Repo conventions @@ -94,7 +94,7 @@ These aren't self-evident from reading the code and have bitten us before. Treat - DSN: `src/telemetry-config.ts::PLAYGROUND_SENTRY_DSN`. Region: EU (`https://de.sentry.io`). Attribute prefix: `cli.`. Spec: `sentry-instrumentation-spec.md` at the repo root (untracked). - Org slug: `paritytech`. API token: macOS keychain service `sentry-api-token`. -- **Helpers — don't reimplement.** `src/telemetry.ts` exports `withCommandTelemetry`, `withRootSpan`, `withSpan` (3-arg `(op, name, fn)` and 4-arg `(op, name, attrs, fn)` overloads), `captureWarning`, `captureException`, `errorMessage`, `sanitizedErrorMessage`. `src/utils/deploy/phase.ts` exports `withDeployPhase`. `src/cli-runtime.ts` exports `runCliCommand` — every command's `.action()` body should be one `runCliCommand(name, options, async () => { ... })` call. Today: `init` runs without `hardExit`/`watchdog`; `build`, `update`, `logout` run with `hardExit` only; `deploy` and `mod` run with both. +- **Helpers — don't reimplement.** `src/telemetry.ts` exports `withCommandTelemetry`, `withRootSpan`, `withSpan` (3-arg `(op, name, fn)` and 4-arg `(op, name, attrs, fn)` overloads), `captureWarning`, `captureException`, `errorMessage`, `sanitizedErrorMessage`. `src/utils/deploy/phase.ts` exports `withDeployPhase`. `src/cli-runtime.ts` exports `runCliCommand` — every command's `.action()` body should be one `runCliCommand(name, options, async () => { ... })` call. The memory watchdog is ON by default for every command (see the Runtime / memory invariant below — do not opt out); `hardExit` defaults to true and is currently disabled only for `init` (its event loop drains naturally after `destroyConnection()` + the QR login-adapter destroy). - **Dashboards** are JSON snapshots under `sentry/dashboards/.json`: `2143100` (Health, prod filter `!cli.tag:e2e-*`), `2216067` (Failures), `2216096` (E2E Health, inverse filter `cli.tag:e2e-*`). - **Workflow:** run `./sentry/backup-dashboards.sh` BEFORE any change. Use `./sentry/patch-dashboard.py ` for surgical edits or `./sentry/create-dashboard.py ` for new dashboards. PUT replaces the whole widget list — backup first. Don't include a `projects` field in POST payloads. - **E2E tagging:** every spawn from `e2e/cli/helpers/dot.ts` injects `DOT_TAG=e2e-local` (fallback), `DOT_TELEMETRY=1`, and `DEPLOY_TAG=e2e-cli-local` (derived from `DOT_TAG` with an `e2e-cli-` prefix). `tools/e2e-local.sh` overrides `DOT_TAG` to `e2e-local-{smoke|pr|nightly}`; CI sets `DOT_TAG=e2e-ci-{pr|nightly|dispatch}`. The `e2e-cli-` prefix on `DEPLOY_TAG` distinguishes our E2E traffic from bulletin-deploy's own. Production health widgets filter cleanly via `!cli.tag:e2e-*`. diff --git a/src/cli-runtime.test.ts b/src/cli-runtime.test.ts index 22779cd..cecb4d7 100644 --- a/src/cli-runtime.test.ts +++ b/src/cli-runtime.test.ts @@ -61,6 +61,24 @@ describe("runCliCommand", () => { expect(stopWatchdog).toHaveBeenCalledTimes(1); }); + it("starts the memory watchdog BY DEFAULT when the option is omitted", async () => { + // Regression guard for the June 2026 zombie incident: `playground + // init` ran without the watchdog, a leaked subscription starved the + // event loop (so signal handlers, hardExit timers, and index.ts's + // final process.exit all stopped firing), and three invisible + // processes grew past 40 GB each. The worker-thread watchdog is the + // only guard that survives that state, so every command gets it + // unless it explicitly opts out. + await runCliCommand("init", { hardExit: false }, async () => {}); + expect(startMemoryWatchdog).toHaveBeenCalledTimes(1); + expect(stopWatchdog).toHaveBeenCalledTimes(1); + }); + + it("does not start the watchdog when explicitly opted out", async () => { + await runCliCommand("build", { watchdog: false, hardExit: false }, async () => {}); + expect(startMemoryWatchdog).not.toHaveBeenCalled(); + }); + it("propagates errors but still schedules hard exit with non-zero code", async () => { process.exitCode = 1; await expect( diff --git a/src/cli-runtime.ts b/src/cli-runtime.ts index 2f6411c..4897e32 100644 --- a/src/cli-runtime.ts +++ b/src/cli-runtime.ts @@ -18,7 +18,20 @@ import type { CliCommandName } from "./telemetry-config.js"; import { onProcessShutdown, scheduleHardExit, startMemoryWatchdog } from "./utils/process-guard.js"; export interface RunCliCommandOptions { - /** Start the memory watchdog and stop it on exit. Defaults to false. */ + /** + * Start the memory watchdog and stop it on exit. Defaults to true. + * + * ON for every command by default because the watchdog's worker thread + * is the ONLY guard that survives event-loop starvation: when a leaked + * polkadot-api subscription enters the microtask-flood state (see + * `process-guard.ts`), signal handlers, `hardExit` timers, and the + * final `process.exit()` in `src/index.ts` all stop firing — the + * process looks finished but sits invisible, growing tens of GB until + * the OS swaps itself to death. We shipped exactly that: `playground + * init` ran watchdog-less and three zombies reached 40+ GB each + * (June 2026). Cost is one worker + a 1 Hz `memoryUsage()` sample — + * don't opt commands out to save it. + */ watchdog?: boolean; /** Schedule a hard-exit safety net after the action returns. Defaults to true. */ hardExit?: boolean; @@ -39,7 +52,7 @@ export async function runCliCommand( options: RunCliCommandOptions, action: () => Promise, ): Promise { - const watchdog = options.watchdog ?? false; + const watchdog = options.watchdog ?? true; const hardExit = options.hardExit ?? true; let stopWatchdog: (() => void) | undefined; diff --git a/src/commands/init/index.ts b/src/commands/init/index.ts index 6c0e35d..208ab4f 100644 --- a/src/commands/init/index.ts +++ b/src/commands/init/index.ts @@ -73,6 +73,16 @@ export const initCommand = new Command("init") // loop has to drain naturally — leaving the WS open means // `dot init` hangs after "setup complete". destroyConnection(); + // QR-path login handle: `connect()` transferred adapter + // ownership to us (it's the transport `waitForLogin` signs + // in over). Once the TUI has exited nothing uses it — + // AccountSetup / UsernamePrompt open their own handles via + // `getSessionSigner()` — so release it here, or its + // statement-store WebSocket keeps the event loop (and the + // process) alive indefinitely. Fire-and-forget + `.catch()` + // for the same post-destroy-artifact reasons as + // `SessionHandle.destroy()` (see auth.ts). + login?.adapter.destroy().catch(() => {}); } console.log(); diff --git a/src/utils/auth.connect.test.ts b/src/utils/auth.connect.test.ts new file mode 100644 index 0000000..7295bf7 --- /dev/null +++ b/src/utils/auth.connect.test.ts @@ -0,0 +1,121 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// SPDX-License-Identifier: Apache-2.0 + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * Regression coverage for the `connect()` adapter lifecycle. + * + * `connect()` creates a TerminalAdapter to probe for an existing session. + * On the existing-session path it returns plain address data — the adapter + * is not part of the result, so `connect()` itself is the only place that + * can release it. Forgetting the destroy leaks a live statement-store + * WebSocket + subscriptions for the rest of the process lifetime; that + * leaked machinery is exactly the kind that can enter the polkadot-api + * microtask-flood state (see `process-guard.ts`) and grow a zombie process + * to tens of GB. `getSessionSigner()` and `findSession()` already destroy + * their probe adapters on early-return paths — this pins `connect()` to the + * same contract. + * + * Lives in its own file (not `auth.test.ts`) because it module-mocks + * `@parity/product-sdk-terminal`, and the sibling suite exercises the real + * exports. + */ + +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const { createTerminalAdapterMock, waitForSessionsMock } = vi.hoisted(() => ({ + createTerminalAdapterMock: vi.fn(), + waitForSessionsMock: vi.fn(), +})); + +vi.mock("@parity/product-sdk-terminal", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + createTerminalAdapter: createTerminalAdapterMock, + waitForSessions: waitForSessionsMock, + }; +}); + +import { connect } from "./auth.js"; + +// A valid ristretto255 point — same frozen vector as `auth.test.ts`'s +// deriveSessionAddresses block. `connect()` derives display addresses from +// the session root, and arbitrary 32-byte buffers won't decode. +const TEST_ROOT_BYTES = Uint8Array.from([ + 0x9a, 0x76, 0x3d, 0x8d, 0x7d, 0xb9, 0x5e, 0xbd, 0xeb, 0x8f, 0xe2, 0x60, 0xb8, 0x90, 0xf3, 0x5a, + 0x25, 0x3d, 0xb8, 0x27, 0x74, 0xf6, 0x34, 0x46, 0x6c, 0xed, 0x38, 0x7a, 0xa1, 0x4e, 0xfd, 0x29, +]); + +function fakeAdapter() { + return { + destroy: vi.fn().mockResolvedValue(undefined), + sso: { + authenticate: vi.fn(() => new Promise(() => {})), + pairingStatus: { subscribe: vi.fn(() => () => {}) }, + }, + }; +} + +describe("connect() adapter lifecycle", () => { + beforeEach(() => { + createTerminalAdapterMock.mockReset(); + waitForSessionsMock.mockReset(); + }); + + it("destroys the probe adapter on the existing-session path", async () => { + const adapter = fakeAdapter(); + createTerminalAdapterMock.mockReturnValue(adapter); + waitForSessionsMock.mockResolvedValue([{ rootAccountId: TEST_ROOT_BYTES }]); + + const result = await connect(); + + expect(result.kind).toBe("existing"); + // The adapter is not part of the "existing" result, so connect() + // must release it itself — a leak here keeps the statement-store + // WebSocket (and the event loop) alive for the whole process. + expect(adapter.destroy).toHaveBeenCalledTimes(1); + }); + + it("destroys the adapter when the session probe throws", async () => { + const adapter = fakeAdapter(); + createTerminalAdapterMock.mockReturnValue(adapter); + waitForSessionsMock.mockRejectedValue(new Error("statement store unreachable")); + + await expect(connect()).rejects.toThrow("statement store unreachable"); + expect(adapter.destroy).toHaveBeenCalledTimes(1); + }); + + it("keeps the adapter alive on the QR path and hands it to the caller", async () => { + const adapter = fakeAdapter(); + createTerminalAdapterMock.mockReturnValue(adapter); + waitForSessionsMock.mockResolvedValue([]); + adapter.sso.pairingStatus.subscribe.mockImplementation( + (cb: (status: { step: string; payload: string }) => void) => { + cb({ step: "pairing", payload: "pairing-payload" }); + return () => {}; + }, + ); + + const result = await connect(); + + expect(result.kind).toBe("qr"); + // QR path: the login flow still needs the adapter (authenticate() + // is in flight) — ownership transfers to the caller via LoginHandle. + expect(adapter.destroy).not.toHaveBeenCalled(); + if (result.kind === "qr") { + expect(result.login.adapter).toBe(adapter); + } + }); +}); diff --git a/src/utils/auth.ts b/src/utils/auth.ts index 8bab22d..26c620e 100644 --- a/src/utils/auth.ts +++ b/src/utils/auth.ts @@ -208,13 +208,32 @@ export interface LoginHandle { export async function connect(): Promise { const adapter = createAdapter(); - const sessions = await loadSessions(adapter); - if (sessions.length > 0) { - const addresses = deriveSessionAddresses(sessions[0]); - // `address` is kept for back-compat with callers that only need the - // product-account SS58 (signer flows). UI consumers should read the - // richer `addresses` field instead. - return { kind: "existing", address: addresses.productAddress, addresses }; + let sessions: UserSession[]; + try { + sessions = await loadSessions(adapter); + if (sessions.length > 0) { + const addresses = deriveSessionAddresses(sessions[0]); + // The "existing" result carries plain address data only — the + // adapter is not part of it, so this is the last place that can + // release it. Leaking it keeps a statement-store WebSocket + + // subscriptions alive for the rest of the process: the event + // loop never drains, and the leaked subscription machinery is + // the kind that can enter the polkadot-api microtask-flood + // state (see process-guard.ts) and grow the process unbounded. + // Same fire-and-forget + `.catch()` rationale as + // `getSessionSigner()`'s no-session path below. + adapter.destroy().catch(() => {}); + // `address` is kept for back-compat with callers that only need + // the product-account SS58 (signer flows). UI consumers should + // read the richer `addresses` field instead. + return { kind: "existing", address: addresses.productAddress, addresses }; + } + } catch (err) { + // Probe failed (statement store unreachable, stale session, …) — + // release the WebSocket before propagating, mirroring the QR-wait + // catch below. + adapter.destroy().catch(() => {}); + throw err; } // Start authenticate — this triggers the pairing flow and QR emission