Skip to content

Commit 46ff1e7

Browse files
feat(bots): reviewer-bot code, adapted for databricks-sql-python
Second of the stacked reviewer-bot migration. Vendors scripts/reviewer_bot/ (the review loop, finalize_review tool, v2 posting, dedup/reconcile, severity, validate_findings, observer) and adapts the repo-specific surface for this Python connector: - prompts.py: retarget to databricks-sql-python; cite CONTRIBUTING.md (PEP 8 / 100-char lines, DCO) + README.md; drop the csharp driver-source section, specs/*.yaml alignment rules, and CLAUDE.md/.claude landmarks (none here). Trim the user-prompt template to PR/diff/open-threads/repo-conventions. - gather_context.aggregate_repo_rules: read CONTRIBUTING.md (this repo's conventions doc) instead of CLAUDE.md/.claude/specs/per-driver files. - Exploration (read_paths/grep) roots at the PR's own checkout — no driver clone (that path is csharp-conditional and never fires here). Adds .github/workflows/reviewer-bot-unit-tests.yml (protected runner + setup-poetry + pytest; no secrets — SDK import is guarded). 361 reviewer + shared tests pass. The csharp-conditional dead paths (run_review driver-clone, list_driver_source, the fake_repo/fake_driver_tree fixtures) are left in place for a follow-up cleanup PR — they never execute on this repo. Co-authored-by: Isaac Signed-off-by: Eric Wang <e.wang@databricks.com>
1 parent 6b8778d commit 46ff1e7

28 files changed

Lines changed: 11046 additions & 0 deletions
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Unit tests for the migrated PR-review bot (scripts/reviewer_bot + scripts/shared).
2+
#
3+
# Pure-logic tests — no secrets, no model calls, no Claude Agent SDK install
4+
# (the SDK import is guarded, so the tests run with just pytest, which the
5+
# connector's poetry dev-deps already provide). Runs only when the bot code
6+
# changes, so it doesn't add load to every connector PR.
7+
name: Reviewer Bot Unit Tests
8+
9+
on:
10+
pull_request:
11+
paths:
12+
- 'scripts/shared/**'
13+
- 'scripts/reviewer_bot/**'
14+
- '.github/workflows/reviewer-bot-unit-tests.yml'
15+
16+
permissions:
17+
contents: read
18+
19+
jobs:
20+
unit-tests:
21+
name: Reviewer Bot Unit Tests
22+
runs-on:
23+
group: databricks-protected-runner-group
24+
labels: linux-ubuntu-latest
25+
timeout-minutes: 15
26+
steps:
27+
- name: Check out repository
28+
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
29+
- name: Setup Poetry
30+
uses: ./.github/actions/setup-poetry
31+
with:
32+
python-version: "3.11"
33+
- name: Run reviewer-bot + shared unit tests
34+
run: poetry run python -m pytest scripts/reviewer_bot/tests scripts/shared/tests

scripts/reviewer_bot/__init__.py

Whitespace-only changes.

scripts/reviewer_bot/agent.py

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.

scripts/reviewer_bot/followup.py

Lines changed: 1253 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
"""Phase 2: gather all context the LLM needs to review the PR.
2+
3+
Functions in this module are pure (no network, no subprocess) where
4+
possible, so unit tests are simple. The `gather` entry point at the
5+
bottom orchestrates everything and is the only place that shells out.
6+
"""
7+
from __future__ import annotations
8+
9+
import re
10+
import sys
11+
from typing import Any, Dict, Iterable, Set, Union
12+
13+
# Single source of truth for the keys we expect on a thread dict lives
14+
# next to the producer (`reconcile.REVIEW_THREAD_REQUIRED_KEYS` /
15+
# `reconcile.ReviewThread`). Imported lazily inside the function that
16+
# uses it so this module stays cheap to import — `reconcile` pulls in
17+
# the markers module + GraphQL plumbing this module otherwise doesn't
18+
# need.
19+
20+
21+
_DIFF_FILE_HEADER = re.compile(r"^\+\+\+ b/(.+)$")
22+
_HUNK_HEADER = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@")
23+
24+
25+
def parse_diff_positions(diff_text: str) -> Dict[str, Set[int]]:
26+
"""Return {file_path: set of RIGHT-side line numbers that were added
27+
or modified}.
28+
29+
Walks the unified diff line by line. A line starting with `+` (but
30+
not `+++` which is the file header) is a RIGHT-side addition; track
31+
its line number. Context lines (` `) advance the right-side line
32+
counter without adding to the set. Deleted lines (`-`, not `---`)
33+
don't advance the right-side counter.
34+
"""
35+
positions: Dict[str, Set[int]] = {}
36+
current_file: str | None = None
37+
right_line: int | None = None
38+
39+
for line in diff_text.splitlines():
40+
# File header — start of a new file's diff
41+
m = _DIFF_FILE_HEADER.match(line)
42+
if m:
43+
current_file = m.group(1)
44+
positions.setdefault(current_file, set())
45+
right_line = None
46+
continue
47+
48+
# Hunk header — initializes the right-side line counter
49+
m = _HUNK_HEADER.match(line)
50+
if m:
51+
right_line = int(m.group(1))
52+
continue
53+
54+
if current_file is None or right_line is None:
55+
continue
56+
57+
# `+++` was matched above as a file header; skip the `--- a/...`
58+
# variant which appears just before `+++ b/...`.
59+
if line.startswith("---"):
60+
continue
61+
62+
# Unified-diff "no newline at end of file" meta marker (literal:
63+
# backslash + space + text). NOT a real RIGHT-side line — must
64+
# not advance the right-line counter, otherwise every position
65+
# reported AFTER such a hunk is off by one.
66+
if line.startswith("\\ "):
67+
continue
68+
69+
if line.startswith("+"):
70+
positions[current_file].add(right_line)
71+
right_line += 1
72+
elif line.startswith("-"):
73+
# Deleted from the LEFT side; right-side line counter does
74+
# not advance.
75+
pass
76+
else:
77+
# Context line ` ` (or any other unhandled non-meta line).
78+
right_line += 1
79+
80+
return positions
81+
82+
from pathlib import Path
83+
84+
85+
def aggregate_repo_rules(changed_files: list[str], repo_root: Path) -> str:
86+
"""Concatenate this repo's convention references so the reviewer can cite
87+
exact rule lines. For databricks-sql-python the canonical source is
88+
CONTRIBUTING.md (coding style: PEP 8 with a 100-char line limit; DCO
89+
sign-off). Missing files are silently skipped.
90+
91+
`changed_files` is unused (kept for signature stability with the followup /
92+
run_review callers) — this repo has no per-path convention files to select.
93+
The connector's README.md is usage docs, not review rules; the model can
94+
read it via read_paths if a finding needs it, so it is not inlined here.
95+
"""
96+
sections: list[str] = []
97+
for name in ("CONTRIBUTING.md",):
98+
path = repo_root / name
99+
if not path.is_file():
100+
continue
101+
sections.append(f"=== {name} ===\n{path.read_text()}")
102+
return "\n\n".join(sections)
103+
104+
_SEVERITY_NAMES = ("CRITICAL", "HIGH", "MEDIUM", "LOW", "NIT")
105+
106+
107+
def _extract_severity_tag(body: str) -> str:
108+
"""Return the severity tag (e.g. "HIGH") parsed from a posted finding
109+
body, or "?" when the body doesn't start with a recognizable badge.
110+
111+
The posted body shape is `{badge} — {text}` where `{badge}` is one of
112+
`severity.BADGE`'s values like `🔴 Critical`. We tolerate optional
113+
leading whitespace and the HTML marker comment that some posts put
114+
on the first line. "?" lets the LLM still see the row in the
115+
open-threads list (so dedup still fires) without inventing a
116+
severity — the system prompt's escalation rule degrades to "skip"
117+
when the existing severity is unknown, which is the safe default.
118+
"""
119+
if not body:
120+
return "?"
121+
# Drop HTML marker comments before scanning so a leading
122+
# `<!-- pr-review-bot:v1 ... -->` doesn't hide the badge.
123+
cleaned = re.sub(r"<!--.*?-->", "", body, flags=re.S).lstrip()
124+
# Match `{emoji}{ws}{NAME}` at the start (the dash/em-dash that
125+
# follows is not required — a body that's only the badge still
126+
# tells us the severity).
127+
m = re.match(
128+
r"[\U0001F534\U0001F7E0\U0001F7E1\U0001F535⚪]\s*"
129+
r"(?P<name>Critical|High|Medium|Low|Nit)\b",
130+
cleaned, flags=re.I,
131+
)
132+
if not m:
133+
return "?"
134+
return m.group("name").upper()
135+
136+
137+
def _summarize_thread_root(body: str, max_chars: int = 180) -> str:
138+
"""Compress a review-bot finding root to a one-line essence.
139+
140+
Strategy:
141+
1. Strip the leading severity badge (🔴/🟠/🟡/🔵 + "— Severity —").
142+
2. Strip any HTML marker comments (`<!-- pr-review-bot:v1 ... -->`).
143+
3. Take the first ~max_chars of the first non-empty line.
144+
145+
The result is meant to be unique-enough for the LLM to recognize a
146+
semantic duplicate without us shipping the full multi-paragraph
147+
finding body (which would balloon the prompt and add irrelevant
148+
detail like suggestion blocks).
149+
150+
Note: severity is parsed separately by `_extract_severity_tag` and
151+
surfaced in the row prefix so the LLM can apply the escalation
152+
override the system prompt asks for. We don't include it in the
153+
summary line itself — the prefix already shows it.
154+
"""
155+
if not body:
156+
return ""
157+
# Strip HTML marker comments
158+
body = re.sub(r"<!--.*?-->", "", body, flags=re.S).strip()
159+
# Strip leading severity emoji + the "— Severity — " preamble
160+
body = re.sub(r"^[\U0001F534\U0001F7E0\U0001F7E1\U0001F535⚪]\s*", "", body)
161+
body = re.sub(r"^(?:Critical|High|Medium|Low|Nit)\s*[—-]\s*", "", body, flags=re.I)
162+
# Take first non-empty line
163+
first_line = next(
164+
(ln.strip() for ln in body.splitlines() if ln.strip()),
165+
"",
166+
)
167+
if len(first_line) > max_chars:
168+
first_line = first_line[:max_chars].rstrip() + "…"
169+
return first_line
170+
171+
172+
def format_open_threads(
173+
threads: list[Dict[str, Any]],
174+
*,
175+
bot_login: Union[str, Iterable[str]],
176+
max_threads: int = 50,
177+
) -> str:
178+
"""Compress open review-bot threads to a one-line-each block for the prompt.
179+
180+
Each entry in `threads` is expected to match `reconcile.ReviewThread`
181+
(the TypedDict declared on the producer side). The contract carries
182+
these keys: `root_author`, `root_comment_id`, `root_path`,
183+
`root_line`, `root_body`, `root_created_at`. We reach into them
184+
directly below; if `reconcile`'s GraphQL projection drops or
185+
renames one of these without also updating `ReviewThread`, the
186+
boundary assertion below logs `::warning::` and skips the entry
187+
rather than silently falling through to "(no open review-bot
188+
threads)" or rendering `#None`.
189+
190+
Filters to roots authored by `bot_login` (skips human-authored
191+
threads — those aren't review-bot findings the dedup gate should
192+
consider). Caps to the `max_threads` most recent threads to bound
193+
prompt cost on long-running PRs; if more exist, the bot has bigger
194+
problems than the dedup heuristic can fix.
195+
196+
`bot_login` is matched case-insensitively against the thread root
197+
author's GitHub login. Accepts either a single login string or an
198+
iterable of logins so the caller can pass `post_review.BOT_LOGINS`
199+
to recognize both the current `peco-review-bot[bot]` identity AND
200+
the legacy `github-actions[bot]` identity that pre-app-migration
201+
PRs used. Without dual-login support, open legacy threads on
202+
migration-era PRs would be absent from the dedup prompt and the
203+
LLM would re-emit duplicates for those still-open findings.
204+
205+
Returns "(no open review-bot threads)" when nothing matches —
206+
template substitution shouldn't leave an empty section.
207+
"""
208+
if not threads:
209+
return "(no open review-bot threads)"
210+
211+
if isinstance(bot_login, str):
212+
bot_logins_lc = frozenset({bot_login.lower()})
213+
else:
214+
bot_logins_lc = frozenset(bl.lower() for bl in bot_login)
215+
# Imported here (not at module top) so we don't drag the reconcile
216+
# module's GraphQL/markers plumbing into anything that just wants
217+
# `parse_diff_positions` or `aggregate_repo_rules`.
218+
from .reconcile import REVIEW_THREAD_REQUIRED_KEYS
219+
220+
rows: list[tuple[str, str]] = [] # (created_at, formatted_line)
221+
for t in threads:
222+
# Boundary assertion: the producer (`reconcile.fetch_open_review_threads`)
223+
# is documented to return dicts matching the `ReviewThread` TypedDict.
224+
# If a refactor over there drops or renames one of the keys we read
225+
# below without updating `ReviewThread`, the type-checker will catch
226+
# it — but a runtime payload that bypasses the type-check (malformed
227+
# GraphQL response, monkeypatched test fixture, future caller passing
228+
# a different shape) would silently degrade: a missing `root_author`
229+
# would skip the entry, a missing `root_comment_id` would render
230+
# `#None`. Surface the contract drift as a `::warning::` instead.
231+
missing = REVIEW_THREAD_REQUIRED_KEYS - t.keys()
232+
if missing:
233+
print(
234+
f"::warning::format_open_threads: thread missing required "
235+
f"keys {sorted(missing)} — skipping. "
236+
f"This signals a contract drift between "
237+
f"reconcile.ReviewThread and the actual payload; check "
238+
f"`fetch_open_review_threads` for a renamed/dropped field.",
239+
file=sys.stderr,
240+
)
241+
continue
242+
if (t.get("root_author") or "").lower() not in bot_logins_lc:
243+
continue
244+
thread_id = t.get("root_comment_id")
245+
path = t.get("root_path") or "(no file)"
246+
line = t.get("root_line")
247+
line_str = str(line) if line is not None else "?"
248+
body = t.get("root_body") or ""
249+
# Surface the existing thread's severity in the row prefix —
250+
# `[#id SEVERITY]` matches the example in `prompts.SYSTEM_PROMPT`
251+
# under "De-duplication against existing review threads", and
252+
# gives the LLM the signal it needs to apply the prompt's
253+
# escalation override ("emit a new finding when the candidate's
254+
# severity is HIGHER than the existing thread's"). Without the
255+
# tag, the model has no reliable way to compare severities and
256+
# the override silently degrades to never-fires.
257+
severity_tag = _extract_severity_tag(body)
258+
essence = _summarize_thread_root(body)
259+
rows.append((
260+
t.get("root_created_at") or "",
261+
f"[#{thread_id} {severity_tag}] {path}:{line_str}{essence}",
262+
))
263+
264+
if not rows:
265+
return "(no open review-bot threads)"
266+
267+
# Sort newest-first so when we truncate it's the OLDEST threads that
268+
# get dropped. A finding that re-surfaces is more likely to match
269+
# against a recent thread than an ancient one.
270+
rows.sort(key=lambda r: r[0], reverse=True)
271+
if len(rows) > max_threads:
272+
kept = rows[:max_threads]
273+
dropped = len(rows) - max_threads
274+
lines = [line for _, line in kept]
275+
lines.append(
276+
f"\n[… {dropped} older open thread(s) omitted to cap prompt cost. "
277+
f"If a candidate finding matches one of these, the dedup gate will "
278+
f"miss it and you'll emit a duplicate — accept that risk and "
279+
f"don't speculate.]"
280+
)
281+
return "\n".join(lines)
282+
283+
return "\n".join(line for _, line in rows)
284+
285+
286+
def list_driver_source(
287+
*,
288+
driver_root: Path,
289+
source_subpath: str,
290+
) -> str:
291+
"""Return a flat directory listing of source_subpath (paths + sizes).
292+
293+
No file content — content is fetched on-demand via the agent's
294+
read_paths / grep tools.
295+
"""
296+
src_root = driver_root / source_subpath.rstrip("/")
297+
listing_lines: list[str] = []
298+
for path in sorted(src_root.rglob("*")):
299+
if path.is_file():
300+
rel = path.relative_to(driver_root)
301+
listing_lines.append(f"{rel} ({path.stat().st_size} bytes)")
302+
return "\n".join(listing_lines)

0 commit comments

Comments
 (0)