From 94bd6ac0911888c242748feb58ff34650d7623be Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Sat, 2 May 2026 00:56:23 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20context=20health=20diagnostics=20?= =?UTF-8?q?=E2=80=94=20C=5Fnorm,=20R=5Fcompression,=20time-gap=20segmentat?= =?UTF-8?q?ion,=20recall=20recency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two computable diagnostic signals for context management quality: - temporalCnorm(): normalized variance of relative-existence weights over message timestamps. Measures attention imbalance [0,1] — 0 is uniform distribution, 1 is dominated by distant past. Logged per-distillation under LORE_DEBUG=1. - compressionRatio(): k/√N ratio where k=distilled tokens, N=source tokens. Values < 1.0 signal aggressive/likely-lossy compression. Logged per-distillation under LORE_DEBUG=1. Enhance detectSegments() to prefer splitting at the largest inter-message time gap (≥3x median) when oversized, respecting natural conversation boundaries instead of arbitrary count-based chunking. Falls back to count-based splitting when timestamps are uniform. Add recency-biased RRF list for temporal recall results. Same candidates re-ranked by created_at (newest first), fused alongside BM25 via existing RRF — messages that are both semantically relevant AND recent get a natural score boost. Inspired by D7x7z49/llm-context-idea research notes on temporal clustering and compression boundaries. 582 tests pass, 0 fail. Build clean across all 3 packages. --- packages/core/src/distillation.ts | 144 ++++++++++-- packages/core/src/recall.ts | 18 ++ packages/core/src/temporal.ts | 39 ++++ packages/core/test/context-health.test.ts | 256 ++++++++++++++++++++++ packages/core/test/distillation.test.ts | 104 +++++++++ 5 files changed, 542 insertions(+), 19 deletions(-) create mode 100644 packages/core/test/context-health.test.ts diff --git a/packages/core/src/distillation.ts b/packages/core/src/distillation.ts index a07610d..8485397 100644 --- a/packages/core/src/distillation.ts +++ b/packages/core/src/distillation.ts @@ -19,32 +19,125 @@ export { workerSessionIDs }; type TemporalMessage = temporal.TemporalMessage; -// Segment detection: group related messages together -function detectSegments( +/** + * Compression health ratio: k / √N. + * + * k = distilled token count, N = source token count. + * Values < 1.0 signal likely lossy compression (below the square-root + * boundary). Values > 1.0 signal relatively faithful compression. + * + * Based on the "LLM Context Square Root Theory" heuristic from + * D7x7z49/llm-context-idea. The specific threshold is unvalidated — + * use as a diagnostic signal, not a hard gate. + */ +export function compressionRatio( + distilledTokens: number, + sourceTokens: number, +): number { + if (sourceTokens <= 0) return 0; + return distilledTokens / Math.sqrt(sourceTokens); +} + +/** + * Segment detection: group related messages into distillation-sized chunks. + * + * When the message count exceeds `maxSegment`, prefers splitting at the + * largest inter-message time gap (if it's ≥ 3× the median gap) to respect + * natural conversation boundaries. Falls back to count-based splitting at + * `maxSegment` when timestamps are uniform. + * + * Trailing segments with < 3 messages are merged into the previous segment + * to avoid tiny distillation inputs with too little context. + * + * Exported for testing; `run()` is the production caller. 
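+ *
+ * Worked example (mirrors the unit tests): given 20 messages, the first 10
+ * at 1s intervals, then a 1-hour gap before the last 10, with maxSegment = 15
+ * the count boundary would fall at 15, but the hour-long gap at index 10
+ * dwarfs the ~1s median gap, so the split lands at index 10 and yields two
+ * segments of 10.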
+ */ +export function detectSegments( messages: TemporalMessage[], maxSegment: number, ): TemporalMessage[][] { if (messages.length <= maxSegment) return [messages]; - const segments: TemporalMessage[][] = []; - let current: TemporalMessage[] = []; - - for (const msg of messages) { - current.push(msg); - // Split on segment size limit - if (current.length >= maxSegment) { - segments.push(current); - current = []; - } + return splitSegments(messages, maxSegment); +} + +/** Minimum segment size — segments smaller than this get merged. */ +const MIN_SEGMENT = 3; + +/** + * Multiplier for the median gap threshold: a time gap must be at least + * this many times the median gap to be used as a split point. + */ +const GAP_THRESHOLD_MULTIPLIER = 3; + +function splitSegments( + messages: TemporalMessage[], + maxSegment: number, +): TemporalMessage[][] { + if (messages.length <= maxSegment) return [messages]; + + // Find the split point: prefer the largest time gap if it's significant + const splitIdx = findSplitIndex(messages, maxSegment); + + const left = messages.slice(0, splitIdx); + const right = messages.slice(splitIdx); + + // Recurse on both halves + const result = splitSegments(left, maxSegment); + + if (right.length < MIN_SEGMENT) { + // Merge tiny trailing segment into the last segment + result[result.length - 1].push(...right); + } else { + result.push(...splitSegments(right, maxSegment)); + } + + return result; +} + +/** + * Choose where to split an oversized message array. + * + * If there's a time gap ≥ 3× the median gap AND it falls within a range + * that would produce segments of at least MIN_SEGMENT size, use it. + * Otherwise fall back to the count-based boundary at `maxSegment`. + */ +function findSplitIndex( + messages: TemporalMessage[], + maxSegment: number, +): number { + // Compute consecutive time gaps + const gaps: Array<{ index: number; gap: number }> = []; + for (let i = 1; i < messages.length; i++) { + gaps.push({ + index: i, + gap: messages[i].created_at - messages[i - 1].created_at, + }); } - if (current.length > 0) { - // Merge small trailing segment with previous if too small - if (current.length < 3 && segments.length > 0) { - segments[segments.length - 1].push(...current); - } else { - segments.push(current); + + if (gaps.length === 0) return maxSegment; + + // Find median gap + const sortedGaps = gaps.map((g) => g.gap).sort((a, b) => a - b); + const medianGap = sortedGaps[Math.floor(sortedGaps.length / 2)]; + + // Find the largest gap that would produce viable segments (≥ MIN_SEGMENT on each side) + let bestGap = { index: -1, gap: 0 }; + for (const g of gaps) { + if ( + g.gap > bestGap.gap && + g.index >= MIN_SEGMENT && + messages.length - g.index >= MIN_SEGMENT + ) { + bestGap = g; } } - return segments; + + // Use the time gap if it's significantly larger than median + if (bestGap.index > 0 && bestGap.gap >= medianGap * GAP_THRESHOLD_MULTIPLIER) { + return bestGap.index; + } + + // Fall back to count-based splitting + return maxSegment; } function formatTime(ms: number): string { @@ -527,6 +620,19 @@ async function distillSegment(input: { }); temporal.markDistilled(input.messages.map((m) => m.id)); + // Diagnostic: log compression health and temporal clustering metrics. + // R_compression (k/√N): < 1.0 signals likely lossy distillation. + // C_norm: 0 = uniform timestamps, 1 = dominated by distant past. 
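+ // Worked example (same arithmetic as the test suite): a 15_000-token
+ // segment distilled to 300 tokens gives R = 300 / √15000 ≈ 2.45, well
+ // above the 1.0 lossy boundary. Note that distilledTokens below is a
+ // rough chars-per-token estimate (observations.length / 3), not a real
+ // tokenizer count.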
+ const distilledTokens = Math.ceil(result.observations.length / 3); + const sourceTokens = input.messages.reduce((sum, m) => sum + m.tokens, 0); + const rComp = compressionRatio(distilledTokens, sourceTokens); + const cNorm = temporal.temporalCnorm(input.messages.map((m) => m.created_at)); + log.info( + `distill segment: ${input.messages.length} msgs, ` + + `${sourceTokens}→${distilledTokens} tokens, ` + + `R=${rComp.toFixed(2)}, C_norm=${cNorm.toFixed(3)}`, + ); + // Fire-and-forget: embed the distillation for vector search if (embedding.isAvailable()) { embedding.embedDistillation(distillId, result.observations); diff --git a/packages/core/src/recall.ts b/packages/core/src/recall.ts index ea0007b..887fb1b 100644 --- a/packages/core/src/recall.ts +++ b/packages/core/src/recall.ts @@ -322,6 +322,24 @@ export async function runRecall(input: RecallInput): Promise { key: (r) => `t:${r.item.id}`, }, ); + + // Recency-biased list for temporal results: same candidates re-ranked + // by created_at (newest first). RRF naturally boosts messages that + // appear in both the BM25 and recency lists — i.e. results that are + // both semantically relevant AND recent. Uses the same `t:` key prefix + // so RRF merges rather than duplicates. + if (temporalResults.length > 0) { + const recencySorted = [...temporalResults].sort( + (a, b) => b.created_at - a.created_at, + ); + allRrfLists.push({ + items: recencySorted.map((item) => ({ + source: "temporal" as const, + item, + })), + key: (r) => `t:${r.item.id}`, + }); + } } // Vector search on the original query (not expansions — avoid redundant embeds). diff --git a/packages/core/src/temporal.ts b/packages/core/src/temporal.ts index bc86c36..6d0c30d 100644 --- a/packages/core/src/temporal.ts +++ b/packages/core/src/temporal.ts @@ -280,6 +280,45 @@ export function searchScored(input: { } } +/** + * Normalized variance of relative-existence weights over message timestamps. + * + * Measures temporal attention imbalance: 0 means timestamps are evenly + * distributed (uniform attention), 1 means a single distant timestamp + * dominates (attention stuck in the past). Useful as a lightweight + * signal for distillation segmentation, recall time-biasing, and + * idle-resume awareness. + * + * Only meaningful for n ≥ 2. Returns 0 for 0 or 1 timestamps. + * + * Based on the "Temporal Clustering via Relative Existence" heuristic + * from D7x7z49/llm-context-idea. + */ +export function temporalCnorm( + timestamps: number[], + now: number = Date.now(), +): number { + const n = timestamps.length; + if (n < 2) return 0; + + // Existence durations: how long each piece has existed + const durations = timestamps.map((t) => now - t); + const totalDuration = durations.reduce((a, b) => a + b, 0); + if (totalDuration <= 0) return 0; + + // Relative existence weights (positive, sum to 1) + const weights = durations.map((d) => d / totalDuration); + + // Normalized variance: Var(w) / Var_max + // Var(w) = (1/n) * Σ(w_i - 1/n)² + // Var_max = (n-1) / n² (when one weight = 1, rest = 0) + const uniform = 1 / n; + const variance = + weights.reduce((sum, w) => sum + (w - uniform) ** 2, 0) / n; + const maxVariance = (n - 1) / (n * n); + return maxVariance === 0 ? 
0 : variance / maxVariance; +} + export function count(projectPath: string, sessionID?: string): number { const pid = ensureProject(projectPath); const query = sessionID diff --git a/packages/core/test/context-health.test.ts b/packages/core/test/context-health.test.ts new file mode 100644 index 0000000..e862da6 --- /dev/null +++ b/packages/core/test/context-health.test.ts @@ -0,0 +1,256 @@ +import { describe, test, expect } from "bun:test"; +import { temporalCnorm } from "../src/temporal"; +import { compressionRatio } from "../src/distillation"; +import { reciprocalRankFusion } from "../src/search"; + +// ─── temporalCnorm ────────────────────────────────────────────────────────── + +describe("temporalCnorm", () => { + test("returns 0 for empty array", () => { + expect(temporalCnorm([], 1000)).toBe(0); + }); + + test("returns 0 for single timestamp", () => { + expect(temporalCnorm([500], 1000)).toBe(0); + }); + + test("returns 0 for two equal timestamps", () => { + // Both have the same existence duration → uniform weights → variance = 0 + expect(temporalCnorm([500, 500], 1000)).toBe(0); + }); + + test("returns 0 when all timestamps equal now", () => { + // All durations = 0 → totalDuration = 0 → early return 0 + const now = 5000; + expect(temporalCnorm([now, now, now], now)).toBe(0); + }); + + test("returns ≈0 for equally-spaced timestamps", () => { + // 10 timestamps at regular 100ms intervals. With a small n like 10, + // equally-spaced doesn't yield exactly 0 variance (it's proportional + // to how far each point is from the mean), but the C_norm should be + // very low compared to pathological cases. + const base = 1000; + const timestamps = Array.from({ length: 10 }, (_, i) => base + i * 100); + const now = base + 10 * 100; + const result = temporalCnorm(timestamps, now); + // For equally-spaced, C_norm is deterministic and low but not zero. 
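+ // Hand-computed for this layout: durations are 1000 - 100*i, weights
+ // (1000 - 100*i)/5500, Var(w) ≈ 0.00273, Var_max = 9/100 = 0.09, so
+ // C_norm ≈ 0.03.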
+ expect(result).toBeGreaterThanOrEqual(0); + expect(result).toBeLessThan(0.15); + }); + + test("approaches 1 for one ancient + many recent timestamps", () => { + // One timestamp from long ago, rest very recent → old one dominates + const now = 1_000_000; + const timestamps = [ + 0, // ancient — duration = 1_000_000 + 999_990, + 999_991, + 999_992, + 999_993, + 999_994, + 999_995, + 999_996, + 999_997, + 999_998, + ]; + const result = temporalCnorm(timestamps, now); + expect(result).toBeGreaterThan(0.8); + expect(result).toBeLessThanOrEqual(1); + }); + + test("result is always in [0, 1]", () => { + // Property test: random timestamps should always produce [0, 1] + for (let trial = 0; trial < 100; trial++) { + const n = 2 + Math.floor(Math.random() * 20); + const now = Date.now(); + const timestamps = Array.from( + { length: n }, + () => now - Math.floor(Math.random() * 10_000_000), + ); + const result = temporalCnorm(timestamps, now); + expect(result).toBeGreaterThanOrEqual(0); + expect(result).toBeLessThanOrEqual(1.0001); // tiny float tolerance + } + }); + + test("two timestamps: one old, one recent → high C_norm", () => { + // With exactly 2 items, extreme asymmetry should produce high C_norm + const now = 10_000; + const result = temporalCnorm([0, 9_999], now); + // Old: duration = 10_000, Recent: duration = 1 + // Weights: [10000/10001, 1/10001] ≈ [0.9999, 0.0001] + // This is highly skewed + expect(result).toBeGreaterThan(0.9); + }); + + test("defaults to Date.now() when now is omitted", () => { + const past = Date.now() - 60_000; + const result = temporalCnorm([past, past + 100]); + // Should not throw and should return a valid number + expect(typeof result).toBe("number"); + expect(result).toBeGreaterThanOrEqual(0); + }); + + test("spike detection: C_norm is higher for clustered timestamps than uniform", () => { + // Two distinct clusters separated by a large gap produce higher + // C_norm than the same number of evenly-spaced timestamps. 
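+ // Hand-computed expectation: the clustered layout comes out near
+ // C_norm ≈ 0.09, the uniform layout near 0.03. The test asserts only
+ // the ordering, not these exact values.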
+ const now = 20_000; + + // Clustered: 5 messages at t≈0, 5 messages at t≈19_000 (big gap) + const clustered = [ + ...Array.from({ length: 5 }, (_, i) => i * 10), + ...Array.from({ length: 5 }, (_, i) => 19_000 + i * 10), + ]; + const cClustered = temporalCnorm(clustered, now); + + // Uniform: 10 messages evenly spaced + const uniform = Array.from({ length: 10 }, (_, i) => i * 2000); + const cUniform = temporalCnorm(uniform, now); + + // Clustered layout should produce higher C_norm (more imbalance) + expect(cClustered).toBeGreaterThan(cUniform); + }); +}); + +// ─── compressionRatio ─────────────────────────────────────────────────────── + +describe("compressionRatio", () => { + test("returns 0 for zero source tokens", () => { + expect(compressionRatio(100, 0)).toBe(0); + }); + + test("returns 0 for negative source tokens", () => { + expect(compressionRatio(100, -10)).toBe(0); + }); + + test("returns k/√N for valid inputs", () => { + // k=10, N=100 → 10/√100 = 10/10 = 1.0 + expect(compressionRatio(10, 100)).toBe(1.0); + }); + + test("returns < 1.0 for aggressively compressed output", () => { + // k=5, N=100 → 5/10 = 0.5 + expect(compressionRatio(5, 100)).toBe(0.5); + }); + + test("returns > 1.0 for faithful/verbose compression", () => { + // k=20, N=100 → 20/10 = 2.0 + expect(compressionRatio(20, 100)).toBe(2.0); + }); + + test("handles large token counts", () => { + // k=1000, N=1_000_000 → 1000/1000 = 1.0 + expect(compressionRatio(1000, 1_000_000)).toBe(1.0); + }); + + test("returns exactly 1.0 when k = √N", () => { + // N = 10000, k = 100 → 100/√10000 = 100/100 = 1.0 + expect(compressionRatio(100, 10_000)).toBe(1.0); + }); + + test("realistic distillation: 30 messages → summary", () => { + // 30 messages averaging 500 tokens each = 15000 source tokens + // Distilled to 300 tokens of observations + // R = 300 / √15000 = 300 / 122.47 ≈ 2.45 + const r = compressionRatio(300, 15_000); + expect(r).toBeGreaterThan(2); + expect(r).toBeLessThan(3); + }); +}); + +// ─── Recall recency biasing via RRF ───────────────────────────────────────── + +describe("recency-biased RRF fusion", () => { + // Simulate the pattern used in recall.ts: BM25-sorted list + recency-sorted + // list of the same items, fused via RRF. Items appearing in both lists get + // a higher score than items in just one. 
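+ // Assumed scoring, inferred from the rank arithmetic in the comments
+ // below: RRF score(d) = Σ over lists of 1/(60 + rank(d)), 0-based ranks.
+ // Worked: old-relevant = 1/60 + 1/62 ≈ 0.0327957 vs mid-mid = 2/61 ≈
+ // 0.0327869, so the one-axis extremes edge out the middle by ~9e-6.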
+ type Item = { id: string; bm25Rank: number; created_at: number }; + + function makeItems(): Item[] { + return [ + { id: "old-relevant", bm25Rank: 0, created_at: 1000 }, // best BM25, oldest + { id: "mid-mid", bm25Rank: 1, created_at: 5000 }, // mid BM25, mid recency + { id: "new-weak", bm25Rank: 2, created_at: 9000 }, // worst BM25, newest + ]; + } + + test("BM25-only: ordered purely by BM25 rank", () => { + const items = makeItems(); + const bm25List = items; // already sorted by bm25Rank + + const fused = reciprocalRankFusion([ + { items: bm25List, key: (i) => i.id }, + ]); + + expect(fused.map((r) => r.item.id)).toEqual([ + "old-relevant", + "mid-mid", + "new-weak", + ]); + }); + + test("BM25 + recency: items appearing in both lists get boosted", () => { + const items = makeItems(); + const bm25List = items; // [old-relevant, mid-mid, new-weak] + const recencyList = [...items].sort((a, b) => b.created_at - a.created_at); + // recencyList = [new-weak, mid-mid, old-relevant] + + const fused = reciprocalRankFusion([ + { items: bm25List, key: (i) => i.id }, + { items: recencyList, key: (i) => i.id }, + ]); + + // With k=60: + // old-relevant: rank 0 in BM25 (1/60) + rank 2 in recency (1/62) + // new-weak: rank 2 in BM25 (1/62) + rank 0 in recency (1/60) + // → old-relevant and new-weak have equal scores (symmetric) + // mid-mid: rank 1 in both (2 * 1/61) — very close but slightly less + // + // The key property: ALL items get boosted vs BM25-only, and items + // that are extreme on one axis but weak on the other (old-relevant, + // new-weak) score the same — the symmetry is correct. + expect(fused).toHaveLength(3); + + // old-relevant and new-weak should have equal scores (symmetric ranks) + const oldRelevant = fused.find((r) => r.item.id === "old-relevant")!; + const newWeak = fused.find((r) => r.item.id === "new-weak")!; + expect(oldRelevant.score).toBeCloseTo(newWeak.score, 10); + }); + + test("recency list does not duplicate items in output", () => { + const items = makeItems(); + const bm25List = items; + const recencyList = [...items].sort((a, b) => b.created_at - a.created_at); + + const fused = reciprocalRankFusion([ + { items: bm25List, key: (i) => i.id }, + { items: recencyList, key: (i) => i.id }, + ]); + + // Should have exactly 3 unique items, not 6 + expect(fused).toHaveLength(3); + const ids = fused.map((r) => r.item.id); + expect(new Set(ids).size).toBe(3); + }); + + test("scores are strictly higher with recency list than without", () => { + const items = makeItems(); + const bm25List = items; + const recencyList = [...items].sort((a, b) => b.created_at - a.created_at); + + const withoutRecency = reciprocalRankFusion([ + { items: bm25List, key: (i) => i.id }, + ]); + const withRecency = reciprocalRankFusion([ + { items: bm25List, key: (i) => i.id }, + { items: recencyList, key: (i) => i.id }, + ]); + + // Every item should have a higher score with the recency list + for (const item of withRecency) { + const without = withoutRecency.find((r) => r.item.id === item.item.id); + expect(item.score).toBeGreaterThan(without!.score); + } + }); +}); diff --git a/packages/core/test/distillation.test.ts b/packages/core/test/distillation.test.ts index 17fc166..c595630 100644 --- a/packages/core/test/distillation.test.ts +++ b/packages/core/test/distillation.test.ts @@ -5,6 +5,7 @@ import { loadForSession, latestMetaObservations, metaDistill, + detectSegments, } from "../src/distillation"; import * as temporal from "../src/temporal"; import { CHUNK_TERMINATOR, partsToText } from 
"../src/temporal"; @@ -823,3 +824,106 @@ describe("metaDistill — anchored second round", () => { expect(metaRows[1]!.observations).toBe("updated"); }); }); + +// ─── detectSegments (time-gap-aware splitting) ────────────────────────────── + +describe("detectSegments", () => { + function msgs(n: number, timestamps?: number[]): temporal.TemporalMessage[] { + return Array.from({ length: n }, (_, i) => + msg("user", `message ${i}`, { + id: `seg-msg-${i}`, + created_at: timestamps ? timestamps[i] : T + i * 1000, + }), + ); + } + + test("returns single segment when under maxSegment", () => { + const result = detectSegments(msgs(10), 30); + expect(result).toHaveLength(1); + expect(result[0]).toHaveLength(10); + }); + + test("count-based split with uniform timestamps", () => { + // 40 messages at 1-second intervals → no significant time gap + // Should split at maxSegment=30 boundary + const result = detectSegments(msgs(40), 30); + expect(result).toHaveLength(2); + expect(result[0]).toHaveLength(30); + expect(result[1]).toHaveLength(10); + }); + + test("time-gap split when a large gap exists", () => { + // 20 messages: first 10 at 1s intervals, then a 1-hour gap, then 10 more + const timestamps = [ + ...Array.from({ length: 10 }, (_, i) => T + i * 1000), + ...Array.from({ length: 10 }, (_, i) => T + 3_600_000 + i * 1000), + ]; + // maxSegment=15, so count-based would split at 15 + // but the time gap at index 10 is 3,599,000ms vs median ~1000ms → should split at 10 + const result = detectSegments(msgs(20, timestamps), 15); + expect(result).toHaveLength(2); + expect(result[0]).toHaveLength(10); + expect(result[1]).toHaveLength(10); + }); + + test("merges tiny trailing segment into previous", () => { + // 32 messages with uniform timestamps, maxSegment=30 + // First split at 30, leaving 2 → merged into first segment + const result = detectSegments(msgs(32), 30); + expect(result).toHaveLength(1); + expect(result[0]).toHaveLength(32); + }); + + test("does not merge trailing segment with ≥ 3 messages", () => { + // 33 messages with uniform timestamps, maxSegment=30 + // First split at 30, leaving 3 → NOT merged (≥ MIN_SEGMENT) + const result = detectSegments(msgs(33), 30); + expect(result).toHaveLength(2); + expect(result[0]).toHaveLength(30); + expect(result[1]).toHaveLength(3); + }); + + test("multiple time-gap splits in large message set", () => { + // 46 messages in 3 bursts of ~15, separated by 1-hour gaps + // With maxSegment=29, the right half (31 msgs) exceeds maxSegment + // and triggers a second split at the next time gap + const timestamps = [ + ...Array.from({ length: 15 }, (_, i) => T + i * 1000), + ...Array.from({ length: 15 }, (_, i) => T + 3_600_000 + i * 1000), + ...Array.from({ length: 16 }, (_, i) => T + 7_200_000 + i * 1000), + ]; + const result = detectSegments(msgs(46, timestamps), 29); + expect(result).toHaveLength(3); + expect(result[0]).toHaveLength(15); + expect(result[1]).toHaveLength(15); + expect(result[2]).toHaveLength(16); + }); + + test("preserves original message references", () => { + const messages = msgs(20); + const result = detectSegments(messages, 10); + const flat = result.flat(); + expect(flat).toHaveLength(20); + // Check all original messages are present + for (const m of messages) { + expect(flat.find((f) => f.id === m.id)).toBeDefined(); + } + }); + + test("ignores time gap if it would create segment < MIN_SEGMENT", () => { + // 20 messages: first 2 at t=0, then a huge gap, then 18 more + // The gap at index 2 creates a left segment of only 2 (< 
MIN_SEGMENT=3) + // so it should NOT split there → falls back to count-based + const timestamps = [ + T, + T + 1000, + T + 10_000_000, + ...Array.from({ length: 17 }, (_, i) => T + 10_000_000 + (i + 1) * 1000), + ]; + const result = detectSegments(msgs(20, timestamps), 15); + // Should use count-based split at 15, not time-gap at 2 + expect(result).toHaveLength(2); + expect(result[0]).toHaveLength(15); + expect(result[1]).toHaveLength(5); + }); +});