|
| 1 | +"""Phase 2: gather all context the LLM needs to review the PR. |
| 2 | +
|
| 3 | +Functions in this module are pure (no network, no subprocess) where |
| 4 | +possible, so unit tests are simple. The `gather` entry point at the |
| 5 | +bottom orchestrates everything and is the only place that shells out. |
| 6 | +""" |
| 7 | +from __future__ import annotations |
| 8 | + |
| 9 | +import re |
| 10 | +import sys |
| 11 | +from typing import Any, Dict, Iterable, Set, Union |
| 12 | + |
| 13 | +# Single source of truth for the keys we expect on a thread dict lives |
| 14 | +# next to the producer (`reconcile.REVIEW_THREAD_REQUIRED_KEYS` / |
| 15 | +# `reconcile.ReviewThread`). Imported lazily inside the function that |
| 16 | +# uses it so this module stays cheap to import — `reconcile` pulls in |
| 17 | +# the markers module + GraphQL plumbing this module otherwise doesn't |
| 18 | +# need. |
| 19 | + |
| 20 | + |
| 21 | +_DIFF_FILE_HEADER = re.compile(r"^\+\+\+ b/(.+)$") |
| 22 | +_HUNK_HEADER = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@") |
| 23 | + |
| 24 | + |
| 25 | +def parse_diff_positions(diff_text: str) -> Dict[str, Set[int]]: |
| 26 | + """Return {file_path: set of RIGHT-side line numbers that were added |
| 27 | + or modified}. |
| 28 | +
|
| 29 | + Walks the unified diff line by line. A line starting with `+` (but |
| 30 | + not `+++` which is the file header) is a RIGHT-side addition; track |
| 31 | + its line number. Context lines (` `) advance the right-side line |
| 32 | + counter without adding to the set. Deleted lines (`-`, not `---`) |
| 33 | + don't advance the right-side counter. |
| 34 | + """ |
| 35 | + positions: Dict[str, Set[int]] = {} |
| 36 | + current_file: str | None = None |
| 37 | + right_line: int | None = None |
| 38 | + |
| 39 | + for line in diff_text.splitlines(): |
| 40 | + # File header — start of a new file's diff |
| 41 | + m = _DIFF_FILE_HEADER.match(line) |
| 42 | + if m: |
| 43 | + current_file = m.group(1) |
| 44 | + positions.setdefault(current_file, set()) |
| 45 | + right_line = None |
| 46 | + continue |
| 47 | + |
| 48 | + # Hunk header — initializes the right-side line counter |
| 49 | + m = _HUNK_HEADER.match(line) |
| 50 | + if m: |
| 51 | + right_line = int(m.group(1)) |
| 52 | + continue |
| 53 | + |
| 54 | + if current_file is None or right_line is None: |
| 55 | + continue |
| 56 | + |
| 57 | + # `+++` was matched above as a file header; skip the `--- a/...` |
| 58 | + # variant which appears just before `+++ b/...`. |
| 59 | + if line.startswith("---"): |
| 60 | + continue |
| 61 | + |
| 62 | + # Unified-diff "no newline at end of file" meta marker (literal: |
| 63 | + # backslash + space + text). NOT a real RIGHT-side line — must |
| 64 | + # not advance the right-line counter, otherwise every position |
| 65 | + # reported AFTER such a hunk is off by one. |
| 66 | + if line.startswith("\\ "): |
| 67 | + continue |
| 68 | + |
| 69 | + if line.startswith("+"): |
| 70 | + positions[current_file].add(right_line) |
| 71 | + right_line += 1 |
| 72 | + elif line.startswith("-"): |
| 73 | + # Deleted from the LEFT side; right-side line counter does |
| 74 | + # not advance. |
| 75 | + pass |
| 76 | + else: |
| 77 | + # Context line ` ` (or any other unhandled non-meta line). |
| 78 | + right_line += 1 |
| 79 | + |
| 80 | + return positions |
| 81 | + |
| 82 | +from pathlib import Path |
| 83 | + |
| 84 | + |
| 85 | +def aggregate_repo_rules(changed_files: list[str], repo_root: Path) -> str: |
| 86 | + """Concatenate this repo's convention references so the reviewer can cite |
| 87 | + exact rule lines. For databricks-sql-python the canonical source is |
| 88 | + CONTRIBUTING.md (coding style: PEP 8 with a 100-char line limit; DCO |
| 89 | + sign-off). Missing files are silently skipped. |
| 90 | +
|
| 91 | + `changed_files` is unused (kept for signature stability with the followup / |
| 92 | + run_review callers) — this repo has no per-path convention files to select. |
| 93 | + The connector's README.md is usage docs, not review rules; the model can |
| 94 | + read it via read_paths if a finding needs it, so it is not inlined here. |
| 95 | + """ |
| 96 | + sections: list[str] = [] |
| 97 | + for name in ("CONTRIBUTING.md",): |
| 98 | + path = repo_root / name |
| 99 | + if not path.is_file(): |
| 100 | + continue |
| 101 | + sections.append(f"=== {name} ===\n{path.read_text()}") |
| 102 | + return "\n\n".join(sections) |
| 103 | + |
| 104 | +_SEVERITY_NAMES = ("CRITICAL", "HIGH", "MEDIUM", "LOW", "NIT") |
| 105 | + |
| 106 | + |
| 107 | +def _extract_severity_tag(body: str) -> str: |
| 108 | + """Return the severity tag (e.g. "HIGH") parsed from a posted finding |
| 109 | + body, or "?" when the body doesn't start with a recognizable badge. |
| 110 | +
|
| 111 | + The posted body shape is `{badge} — {text}` where `{badge}` is one of |
| 112 | + `severity.BADGE`'s values like `🔴 Critical`. We tolerate optional |
| 113 | + leading whitespace and the HTML marker comment that some posts put |
| 114 | + on the first line. "?" lets the LLM still see the row in the |
| 115 | + open-threads list (so dedup still fires) without inventing a |
| 116 | + severity — the system prompt's escalation rule degrades to "skip" |
| 117 | + when the existing severity is unknown, which is the safe default. |
| 118 | + """ |
| 119 | + if not body: |
| 120 | + return "?" |
| 121 | + # Drop HTML marker comments before scanning so a leading |
| 122 | + # `<!-- pr-review-bot:v1 ... -->` doesn't hide the badge. |
| 123 | + cleaned = re.sub(r"<!--.*?-->", "", body, flags=re.S).lstrip() |
| 124 | + # Match `{emoji}{ws}{NAME}` at the start (the dash/em-dash that |
| 125 | + # follows is not required — a body that's only the badge still |
| 126 | + # tells us the severity). |
| 127 | + m = re.match( |
| 128 | + r"[\U0001F534\U0001F7E0\U0001F7E1\U0001F535⚪]\s*" |
| 129 | + r"(?P<name>Critical|High|Medium|Low|Nit)\b", |
| 130 | + cleaned, flags=re.I, |
| 131 | + ) |
| 132 | + if not m: |
| 133 | + return "?" |
| 134 | + return m.group("name").upper() |
| 135 | + |
| 136 | + |
| 137 | +def _summarize_thread_root(body: str, max_chars: int = 180) -> str: |
| 138 | + """Compress a review-bot finding root to a one-line essence. |
| 139 | +
|
| 140 | + Strategy: |
| 141 | + 1. Strip the leading severity badge (🔴/🟠/🟡/🔵 + "— Severity —"). |
| 142 | + 2. Strip any HTML marker comments (`<!-- pr-review-bot:v1 ... -->`). |
| 143 | + 3. Take the first ~max_chars of the first non-empty line. |
| 144 | +
|
| 145 | + The result is meant to be unique-enough for the LLM to recognize a |
| 146 | + semantic duplicate without us shipping the full multi-paragraph |
| 147 | + finding body (which would balloon the prompt and add irrelevant |
| 148 | + detail like suggestion blocks). |
| 149 | +
|
| 150 | + Note: severity is parsed separately by `_extract_severity_tag` and |
| 151 | + surfaced in the row prefix so the LLM can apply the escalation |
| 152 | + override the system prompt asks for. We don't include it in the |
| 153 | + summary line itself — the prefix already shows it. |
| 154 | + """ |
| 155 | + if not body: |
| 156 | + return "" |
| 157 | + # Strip HTML marker comments |
| 158 | + body = re.sub(r"<!--.*?-->", "", body, flags=re.S).strip() |
| 159 | + # Strip leading severity emoji + the "— Severity — " preamble |
| 160 | + body = re.sub(r"^[\U0001F534\U0001F7E0\U0001F7E1\U0001F535⚪]\s*", "", body) |
| 161 | + body = re.sub(r"^(?:Critical|High|Medium|Low|Nit)\s*[—-]\s*", "", body, flags=re.I) |
| 162 | + # Take first non-empty line |
| 163 | + first_line = next( |
| 164 | + (ln.strip() for ln in body.splitlines() if ln.strip()), |
| 165 | + "", |
| 166 | + ) |
| 167 | + if len(first_line) > max_chars: |
| 168 | + first_line = first_line[:max_chars].rstrip() + "…" |
| 169 | + return first_line |
| 170 | + |
| 171 | + |
| 172 | +def format_open_threads( |
| 173 | + threads: list[Dict[str, Any]], |
| 174 | + *, |
| 175 | + bot_login: Union[str, Iterable[str]], |
| 176 | + max_threads: int = 50, |
| 177 | +) -> str: |
| 178 | + """Compress open review-bot threads to a one-line-each block for the prompt. |
| 179 | +
|
| 180 | + Each entry in `threads` is expected to match `reconcile.ReviewThread` |
| 181 | + (the TypedDict declared on the producer side). The contract carries |
| 182 | + these keys: `root_author`, `root_comment_id`, `root_path`, |
| 183 | + `root_line`, `root_body`, `root_created_at`. We reach into them |
| 184 | + directly below; if `reconcile`'s GraphQL projection drops or |
| 185 | + renames one of these without also updating `ReviewThread`, the |
| 186 | + boundary assertion below logs `::warning::` and skips the entry |
| 187 | + rather than silently falling through to "(no open review-bot |
| 188 | + threads)" or rendering `#None`. |
| 189 | +
|
| 190 | + Filters to roots authored by `bot_login` (skips human-authored |
| 191 | + threads — those aren't review-bot findings the dedup gate should |
| 192 | + consider). Caps to the `max_threads` most recent threads to bound |
| 193 | + prompt cost on long-running PRs; if more exist, the bot has bigger |
| 194 | + problems than the dedup heuristic can fix. |
| 195 | +
|
| 196 | + `bot_login` is matched case-insensitively against the thread root |
| 197 | + author's GitHub login. Accepts either a single login string or an |
| 198 | + iterable of logins so the caller can pass `post_review.BOT_LOGINS` |
| 199 | + to recognize both the current `peco-review-bot[bot]` identity AND |
| 200 | + the legacy `github-actions[bot]` identity that pre-app-migration |
| 201 | + PRs used. Without dual-login support, open legacy threads on |
| 202 | + migration-era PRs would be absent from the dedup prompt and the |
| 203 | + LLM would re-emit duplicates for those still-open findings. |
| 204 | +
|
| 205 | + Returns "(no open review-bot threads)" when nothing matches — |
| 206 | + template substitution shouldn't leave an empty section. |
| 207 | + """ |
| 208 | + if not threads: |
| 209 | + return "(no open review-bot threads)" |
| 210 | + |
| 211 | + if isinstance(bot_login, str): |
| 212 | + bot_logins_lc = frozenset({bot_login.lower()}) |
| 213 | + else: |
| 214 | + bot_logins_lc = frozenset(bl.lower() for bl in bot_login) |
| 215 | + # Imported here (not at module top) so we don't drag the reconcile |
| 216 | + # module's GraphQL/markers plumbing into anything that just wants |
| 217 | + # `parse_diff_positions` or `aggregate_repo_rules`. |
| 218 | + from .reconcile import REVIEW_THREAD_REQUIRED_KEYS |
| 219 | + |
| 220 | + rows: list[tuple[str, str]] = [] # (created_at, formatted_line) |
| 221 | + for t in threads: |
| 222 | + # Boundary assertion: the producer (`reconcile.fetch_open_review_threads`) |
| 223 | + # is documented to return dicts matching the `ReviewThread` TypedDict. |
| 224 | + # If a refactor over there drops or renames one of the keys we read |
| 225 | + # below without updating `ReviewThread`, the type-checker will catch |
| 226 | + # it — but a runtime payload that bypasses the type-check (malformed |
| 227 | + # GraphQL response, monkeypatched test fixture, future caller passing |
| 228 | + # a different shape) would silently degrade: a missing `root_author` |
| 229 | + # would skip the entry, a missing `root_comment_id` would render |
| 230 | + # `#None`. Surface the contract drift as a `::warning::` instead. |
| 231 | + missing = REVIEW_THREAD_REQUIRED_KEYS - t.keys() |
| 232 | + if missing: |
| 233 | + print( |
| 234 | + f"::warning::format_open_threads: thread missing required " |
| 235 | + f"keys {sorted(missing)} — skipping. " |
| 236 | + f"This signals a contract drift between " |
| 237 | + f"reconcile.ReviewThread and the actual payload; check " |
| 238 | + f"`fetch_open_review_threads` for a renamed/dropped field.", |
| 239 | + file=sys.stderr, |
| 240 | + ) |
| 241 | + continue |
| 242 | + if (t.get("root_author") or "").lower() not in bot_logins_lc: |
| 243 | + continue |
| 244 | + thread_id = t.get("root_comment_id") |
| 245 | + path = t.get("root_path") or "(no file)" |
| 246 | + line = t.get("root_line") |
| 247 | + line_str = str(line) if line is not None else "?" |
| 248 | + body = t.get("root_body") or "" |
| 249 | + # Surface the existing thread's severity in the row prefix — |
| 250 | + # `[#id SEVERITY]` matches the example in `prompts.SYSTEM_PROMPT` |
| 251 | + # under "De-duplication against existing review threads", and |
| 252 | + # gives the LLM the signal it needs to apply the prompt's |
| 253 | + # escalation override ("emit a new finding when the candidate's |
| 254 | + # severity is HIGHER than the existing thread's"). Without the |
| 255 | + # tag, the model has no reliable way to compare severities and |
| 256 | + # the override silently degrades to never-fires. |
| 257 | + severity_tag = _extract_severity_tag(body) |
| 258 | + essence = _summarize_thread_root(body) |
| 259 | + rows.append(( |
| 260 | + t.get("root_created_at") or "", |
| 261 | + f"[#{thread_id} {severity_tag}] {path}:{line_str} — {essence}", |
| 262 | + )) |
| 263 | + |
| 264 | + if not rows: |
| 265 | + return "(no open review-bot threads)" |
| 266 | + |
| 267 | + # Sort newest-first so when we truncate it's the OLDEST threads that |
| 268 | + # get dropped. A finding that re-surfaces is more likely to match |
| 269 | + # against a recent thread than an ancient one. |
| 270 | + rows.sort(key=lambda r: r[0], reverse=True) |
| 271 | + if len(rows) > max_threads: |
| 272 | + kept = rows[:max_threads] |
| 273 | + dropped = len(rows) - max_threads |
| 274 | + lines = [line for _, line in kept] |
| 275 | + lines.append( |
| 276 | + f"\n[… {dropped} older open thread(s) omitted to cap prompt cost. " |
| 277 | + f"If a candidate finding matches one of these, the dedup gate will " |
| 278 | + f"miss it and you'll emit a duplicate — accept that risk and " |
| 279 | + f"don't speculate.]" |
| 280 | + ) |
| 281 | + return "\n".join(lines) |
| 282 | + |
| 283 | + return "\n".join(line for _, line in rows) |
| 284 | + |
| 285 | + |
| 286 | +def list_driver_source( |
| 287 | + *, |
| 288 | + driver_root: Path, |
| 289 | + source_subpath: str, |
| 290 | +) -> str: |
| 291 | + """Return a flat directory listing of source_subpath (paths + sizes). |
| 292 | +
|
| 293 | + No file content — content is fetched on-demand via the agent's |
| 294 | + read_paths / grep tools. |
| 295 | + """ |
| 296 | + src_root = driver_root / source_subpath.rstrip("/") |
| 297 | + listing_lines: list[str] = [] |
| 298 | + for path in sorted(src_root.rglob("*")): |
| 299 | + if path.is_file(): |
| 300 | + rel = path.relative_to(driver_root) |
| 301 | + listing_lines.append(f"{rel} ({path.stat().st_size} bytes)") |
| 302 | + return "\n".join(listing_lines) |
0 commit comments