From bc229603a558b1e86f977b1bb6e66e1880443fd5 Mon Sep 17 00:00:00 2001
From: igerber
Date: Mon, 23 Mar 2026 17:30:39 -0400
Subject: [PATCH 1/5] Migrate AI local review from Chat Completions to
 Responses API

Enable gpt-5.4-pro and future reasoning models by switching the OpenAI
endpoint from /v1/chat/completions to /v1/responses. Add
_is_reasoning_model() helper, REASONING_MAX_TOKENS, --timeout CLI flag,
gpt-5.4-pro pricing, conditional temperature/max-tokens, reasoning token
reporting, and mocked unit tests for payload construction.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .claude/commands/ai-review-local.md |  21 +++-
 .claude/scripts/openai_review.py    |  75 ++++++++++----
 tests/test_openai_review.py         | 148 ++++++++++++++++++++++++++++
 3 files changed, 223 insertions(+), 21 deletions(-)

diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md
index ad05587..1cb2f22 100644
--- a/.claude/commands/ai-review-local.md
+++ b/.claude/commands/ai-review-local.md
@@ -1,11 +1,11 @@
 ---
 description: Run AI code review locally using OpenAI API before opening a PR
-argument-hint: "[--context minimal|standard|deep] [--include-files <patterns>] [--token-budget <n>] [--force-fresh] [--full-registry] [--model <model>] [--dry-run]"
+argument-hint: "[--context minimal|standard|deep] [--include-files <patterns>] [--token-budget <n>] [--force-fresh] [--full-registry] [--model <model>] [--timeout <seconds>] [--dry-run]"
 ---
 
 # Local AI Code Review
 
-Run a structured code review using the OpenAI Chat Completions API. Reviews changes
+Run a structured code review using the OpenAI Responses API. Reviews changes
 against the same methodology criteria used by the CI reviewer, but adapted for local
 pre-PR use. Designed for iterative review/revision cycles before submitting a PR.
 
@@ -23,8 +23,15 @@ pre-PR use. Designed for iterative review/revision cycles before submitting a PR
 - `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists
 - `--full-registry`: Include the entire REGISTRY.md instead of selective sections
 - `--model <model>`: Override the OpenAI model (default: `gpt-5.4`)
+- `--timeout <seconds>`: HTTP request timeout (default: 300). Use 900 for reasoning models.
 - `--dry-run`: Print the compiled prompt without calling the API
+
+**Reasoning models** (`gpt-5.4-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15
+minutes. For deep reviews with reasoning models, combine `--token-budget` with `--model`:
+```
+/ai-review-local --model gpt-5.4-pro --token-budget 500000 --context deep
+```
 
 ## Constraints
 
 This skill does not modify source code files. It may:
@@ -320,12 +327,19 @@
 python3 .claude/scripts/openai_review.py \
   [--token-budget "$token_budget"] \
   [--full-registry] \
   [--model <model>] \
+  [--timeout <seconds>] \
   [--dry-run]
 ```
 
 Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are
 generated in Step 4 and is NOT passed to the script.
 
+**Reasoning model handling:** If the model contains `-pro` or starts with `o1`/`o3`/`o4`
+(e.g., `gpt-5.4-pro`, `o3`, `o4-mini`):
+- Pass `--timeout 900` to the script (unless the user explicitly specified `--timeout`)
+- Run the Bash command with `run_in_background: true` (bypasses the 600s Bash tool timeout cap)
+- After the background command completes, continue to Step 6
 
 If `--dry-run`: display the prompt output and stop. Report the estimated token
 count, cost estimate, and model that would be used.
@@ -451,6 +465,9 @@ runs `--force-fresh` or when a rebase invalidates the tracked commit.
# Use a different model with full registry /ai-review-local --model gpt-4.1 --full-registry +# Deep review with reasoning model (may take 10-15 minutes) +/ai-review-local --model gpt-5.4-pro --token-budget 500000 --context deep + # Limit token budget for faster/cheaper reviews /ai-review-local --token-budget 100000 ``` diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index b305b2c..7d36292 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Local AI code review using OpenAI Chat Completions API. +"""Local AI code review using OpenAI Responses API. Compiles a review prompt from the project's review criteria, methodology registry, and code diffs, then sends it to the OpenAI API for structured feedback. @@ -854,6 +854,7 @@ def apply_token_budget( # MAINTENANCE: Update when OpenAI changes pricing. PRICING = { "gpt-5.4": (2.50, 15.00), + "gpt-5.4-pro": (15.00, 60.00), "gpt-4.1": (2.00, 8.00), "gpt-4.1-mini": (0.40, 1.60), "o3": (2.00, 8.00), @@ -1093,10 +1094,16 @@ def compile_prompt( # OpenAI API call # --------------------------------------------------------------------------- -ENDPOINT = "https://api.openai.com/v1/chat/completions" +ENDPOINT = "https://api.openai.com/v1/responses" DEFAULT_MODEL = "gpt-5.4" DEFAULT_TIMEOUT = 300 # seconds DEFAULT_MAX_TOKENS = 16384 +REASONING_MAX_TOKENS = 32768 + + +def _is_reasoning_model(model: str) -> bool: + """Return True for models that use internal chain-of-thought reasoning.""" + return model.startswith(("o1", "o3", "o4")) or "-pro" in model def estimate_tokens(text: str) -> int: @@ -1105,19 +1112,26 @@ def estimate_tokens(text: str) -> int: def call_openai( - prompt: str, model: str, api_key: str + prompt: str, + model: str, + api_key: str, + timeout: int = DEFAULT_TIMEOUT, ) -> "tuple[str, dict]": - """Call the OpenAI Chat Completions API. + """Call the OpenAI Responses API. Returns (content, usage) where usage is the API response's usage dict - containing prompt_tokens and completion_tokens. + containing input_tokens and output_tokens. """ - payload = { + reasoning = _is_reasoning_model(model) + max_tokens = REASONING_MAX_TOKENS if reasoning else DEFAULT_MAX_TOKENS + + payload: dict = { "model": model, - "messages": [{"role": "user", "content": prompt}], - "temperature": 0, - "max_completion_tokens": DEFAULT_MAX_TOKENS, + "input": prompt, + "max_output_tokens": max_tokens, } + if not reasoning: + payload["temperature"] = 0 data = json.dumps(payload).encode("utf-8") req = urllib.request.Request( @@ -1131,7 +1145,7 @@ def call_openai( ) try: - with urllib.request.urlopen(req, timeout=DEFAULT_TIMEOUT) as resp: + with urllib.request.urlopen(req, timeout=timeout) as resp: result = json.loads(resp.read().decode("utf-8")) except urllib.error.HTTPError as e: body = "" @@ -1165,7 +1179,7 @@ def call_openai( sys.exit(1) except TimeoutError: print( - f"Error: Request timed out (>{DEFAULT_TIMEOUT}s). " + f"Error: Request timed out (>{timeout}s). 
" "Try a smaller diff or disable --full-registry.", file=sys.stderr, ) @@ -1174,12 +1188,18 @@ def call_openai( print(f"Error: Network error — {e.reason}", file=sys.stderr) sys.exit(1) - choices = result.get("choices", []) - if not choices: - print("Error: Empty response from OpenAI API.", file=sys.stderr) + status = result.get("status") + if status != "completed": + detail = result.get("incomplete_details") or result.get("error") or "" + print( + f"Error: OpenAI response status is '{status}' (expected 'completed').", + file=sys.stderr, + ) + if detail: + print(f"Detail: {detail}", file=sys.stderr) sys.exit(1) - content = choices[0].get("message", {}).get("content", "") + content = result.get("output_text", "") if not content.strip(): print("Error: Empty review content from OpenAI API.", file=sys.stderr) sys.exit(1) @@ -1282,6 +1302,12 @@ def main() -> None: help=f"Max estimated input tokens before dropping context " f"(default: {DEFAULT_TOKEN_BUDGET:,})", ) + parser.add_argument( + "--timeout", + type=int, + default=DEFAULT_TIMEOUT, + help=f"HTTP request timeout in seconds (default: {DEFAULT_TIMEOUT})", + ) parser.add_argument( "--delta-diff", default=None, @@ -1531,7 +1557,8 @@ def main() -> None: ) # Cost estimate - cost_str = estimate_cost(est_tokens, DEFAULT_MAX_TOKENS, args.model) + max_out = REASONING_MAX_TOKENS if _is_reasoning_model(args.model) else DEFAULT_MAX_TOKENS + cost_str = estimate_cost(est_tokens, max_out, args.model) # Dry-run: print prompt and exit if args.dry_run: @@ -1559,7 +1586,9 @@ def main() -> None: if delta_diff_text: print("Mode: Delta-diff (changes since last review)", file=sys.stderr) - review_content, usage = call_openai(prompt, args.model, api_key) + review_content, usage = call_openai( + prompt, args.model, api_key, timeout=args.timeout + ) # Write review output os.makedirs(os.path.dirname(args.output), exist_ok=True) @@ -1603,8 +1632,8 @@ def main() -> None: ) # Print completion summary with actual usage - actual_input = usage.get("prompt_tokens", 0) - actual_output = usage.get("completion_tokens", 0) + actual_input = usage.get("input_tokens", 0) + actual_output = usage.get("output_tokens", 0) actual_cost = estimate_cost(actual_input, actual_output, args.model) print(f"\nAI Review complete.", file=sys.stderr) @@ -1615,6 +1644,14 @@ def main() -> None: f"{actual_output:,} output", file=sys.stderr, ) + reasoning_tokens = usage.get("output_tokens_details", {}).get( + "reasoning_tokens", 0 + ) + if reasoning_tokens: + print( + f" (includes {reasoning_tokens:,} reasoning tokens)", + file=sys.stderr, + ) if actual_cost: print(f"Actual cost: {actual_cost}", file=sys.stderr) else: diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index 9767517..feed0b7 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1517,3 +1517,151 @@ def test_rejects_traversal(self, review_mod, repo_root): candidate = os.path.realpath(candidate) repo_root_real = os.path.realpath(repo_root) assert not candidate.startswith(repo_root_real + os.sep) + + +# --------------------------------------------------------------------------- +# Responses API migration +# --------------------------------------------------------------------------- + + +class TestIsReasoningModel: + def test_o3_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("o3") is True + + def test_o3_mini_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("o3-mini") is True + + def test_o3_snapshot_is_reasoning(self, review_mod): + assert 
review_mod._is_reasoning_model("o3-mini-2025-01-31") is True + + def test_o1_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("o1") is True + + def test_o4_mini_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("o4-mini") is True + + def test_pro_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("gpt-5.4-pro") is True + + def test_pro_snapshot_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("gpt-5.4-pro-2026-03-05") is True + + def test_gpt54_is_not_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("gpt-5.4") is False + + def test_gpt41_is_not_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("gpt-4.1") is False + + def test_gpt41_mini_is_not_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("gpt-4.1-mini") is False + + +class TestProModelPricing: + def test_pro_gets_own_pricing(self, review_mod): + """gpt-5.4-pro should not fall back to gpt-5.4 pricing.""" + pro_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.4-pro") + base_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.4") + assert pro_cost is not None + assert base_cost is not None + assert pro_cost != base_cost + + def test_pro_snapshot_matches_pro(self, review_mod): + """gpt-5.4-pro-2026-03-05 should match gpt-5.4-pro via prefix.""" + snapshot = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.4-pro-2026-03-05") + base = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.4-pro") + assert snapshot == base + + +class TestResponsesAPIConstants: + def test_endpoint_is_responses(self, review_mod): + assert "responses" in review_mod.ENDPOINT + assert "chat/completions" not in review_mod.ENDPOINT + + def test_reasoning_max_tokens_larger(self, review_mod): + assert review_mod.REASONING_MAX_TOKENS > review_mod.DEFAULT_MAX_TOKENS + + +class TestCallOpenAIPayload: + """Test call_openai() payload construction and response parsing via mocked urllib.""" + + @pytest.fixture() + def mock_urlopen(self, monkeypatch, review_mod): + """Patch urllib.request.urlopen to capture requests and return canned responses.""" + import io + import urllib.request + + captured = {} + + class FakeResponse: + def __init__(self, data): + self._data = json.dumps(data).encode("utf-8") + + def read(self): + return self._data + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + + def fake_urlopen(req, timeout=None): + captured["request"] = req + captured["timeout"] = timeout + captured["payload"] = json.loads(req.data.decode("utf-8")) + return FakeResponse(captured.get("response_data", { + "status": "completed", + "output_text": "Review content here.", + "usage": {"input_tokens": 100, "output_tokens": 50}, + })) + + monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) + return captured + + def test_standard_model_payload(self, review_mod, mock_urlopen): + """Standard model sends input, max_output_tokens, and temperature=0.""" + content, usage = review_mod.call_openai("test prompt", "gpt-5.4", "fake-key") + payload = mock_urlopen["payload"] + assert payload["input"] == "test prompt" + assert payload["max_output_tokens"] == review_mod.DEFAULT_MAX_TOKENS + assert payload["temperature"] == 0 + assert "messages" not in payload + assert "max_completion_tokens" not in payload + assert content == "Review content here." 
+ assert usage["input_tokens"] == 100 + + def test_reasoning_model_payload(self, review_mod, mock_urlopen): + """Reasoning model omits temperature and uses REASONING_MAX_TOKENS.""" + content, _ = review_mod.call_openai("test prompt", "gpt-5.4-pro", "fake-key") + payload = mock_urlopen["payload"] + assert payload["max_output_tokens"] == review_mod.REASONING_MAX_TOKENS + assert "temperature" not in payload + assert content == "Review content here." + + def test_request_url_is_responses_endpoint(self, review_mod, mock_urlopen): + review_mod.call_openai("test", "gpt-5.4", "fake-key") + assert mock_urlopen["request"].full_url == review_mod.ENDPOINT + + def test_timeout_passed_through(self, review_mod, mock_urlopen): + review_mod.call_openai("test", "gpt-5.4", "fake-key", timeout=900) + assert mock_urlopen["timeout"] == 900 + + def test_incomplete_status_exits(self, review_mod, mock_urlopen): + """Non-completed status should cause sys.exit.""" + mock_urlopen["response_data"] = { + "status": "failed", + "output_text": "", + "usage": {}, + } + with pytest.raises(SystemExit): + review_mod.call_openai("test", "gpt-5.4", "fake-key") + + def test_empty_output_text_exits(self, review_mod, mock_urlopen): + """Empty output_text should cause sys.exit.""" + mock_urlopen["response_data"] = { + "status": "completed", + "output_text": " ", + "usage": {}, + } + with pytest.raises(SystemExit): + review_mod.call_openai("test", "gpt-5.4", "fake-key") From a2e8883de8ffe8e17a07c9e10b3b0a40268dd14d Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 23 Mar 2026 17:34:23 -0400 Subject: [PATCH 2/5] Fix Responses API output parsing for raw HTTP responses The output_text convenience field is null in raw HTTP responses (only populated by the Python SDK). Extract text from output[].content[] items instead. Update mock test responses to match real API format. Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/scripts/openai_review.py | 11 ++++++++++- tests/test_openai_review.py | 26 +++++++++++++++++--------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index 7d36292..eb802fd 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -1199,7 +1199,16 @@ def call_openai( print(f"Detail: {detail}", file=sys.stderr) sys.exit(1) - content = result.get("output_text", "") + # Extract text from output items (output_text is null in raw HTTP responses; + # the convenience property only exists in the Python SDK). 
+ content = result.get("output_text") or "" + if not content: + for item in result.get("output", []): + if item.get("type") == "message": + for block in item.get("content", []): + if block.get("type") == "output_text": + content += block.get("text", "") + if not content.strip(): print("Error: Empty review content from OpenAI API.", file=sys.stderr) sys.exit(1) diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index feed0b7..d0455c2 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1605,15 +1605,21 @@ def __enter__(self): def __exit__(self, *args): pass + _DEFAULT_RESPONSE = { + "status": "completed", + "output_text": None, + "output": [{ + "type": "message", + "content": [{"type": "output_text", "text": "Review content here."}], + }], + "usage": {"input_tokens": 100, "output_tokens": 50}, + } + def fake_urlopen(req, timeout=None): captured["request"] = req captured["timeout"] = timeout captured["payload"] = json.loads(req.data.decode("utf-8")) - return FakeResponse(captured.get("response_data", { - "status": "completed", - "output_text": "Review content here.", - "usage": {"input_tokens": 100, "output_tokens": 50}, - })) + return FakeResponse(captured.get("response_data", _DEFAULT_RESPONSE)) monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) return captured @@ -1650,17 +1656,19 @@ def test_incomplete_status_exits(self, review_mod, mock_urlopen): """Non-completed status should cause sys.exit.""" mock_urlopen["response_data"] = { "status": "failed", - "output_text": "", + "output_text": None, + "output": [], "usage": {}, } with pytest.raises(SystemExit): review_mod.call_openai("test", "gpt-5.4", "fake-key") - def test_empty_output_text_exits(self, review_mod, mock_urlopen): - """Empty output_text should cause sys.exit.""" + def test_empty_output_exits(self, review_mod, mock_urlopen): + """Empty output items should cause sys.exit.""" mock_urlopen["response_data"] = { "status": "completed", - "output_text": " ", + "output_text": None, + "output": [], "usage": {}, } with pytest.raises(SystemExit): From cf45f2a1bb8e6e4f55e8f70171580afd8abef503 Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 23 Mar 2026 17:38:20 -0400 Subject: [PATCH 3/5] Address AI review findings: content-first parsing, response helper, tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1: Relax status check — extract content first via _extract_response_text(), only fail when no usable content is found (not on status mismatch alone). P2: Extract response parsing into dedicated helper with unit tests covering multiple payload shapes (missing status, null status, SDK output_text, multiple output blocks). Add reasoning-model timeout hint to stderr. Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/scripts/openai_review.py | 57 +++++++++++++------- tests/test_openai_review.py | 92 ++++++++++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 23 deletions(-) diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index eb802fd..e0a63db 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -1111,6 +1111,25 @@ def estimate_tokens(text: str) -> int: return len(text) // 4 +def _extract_response_text(result: dict) -> str: + """Extract review text from a Responses API JSON payload. + + Tries the top-level ``output_text`` convenience field first (populated by + the Python SDK but typically null in raw HTTP responses), then walks + ``output[].content[]`` items. 
Returns an empty string when no text is + found so the caller can decide how to handle it. + """ + text = result.get("output_text") or "" + if text: + return text + for item in result.get("output", []): + if item.get("type") == "message": + for block in item.get("content", []): + if block.get("type") == "output_text": + text += block.get("text", "") + return text + + def call_openai( prompt: str, model: str, @@ -1188,31 +1207,23 @@ def call_openai( print(f"Error: Network error — {e.reason}", file=sys.stderr) sys.exit(1) - status = result.get("status") - if status != "completed": + content = _extract_response_text(result) + + if not content.strip(): + # No usable content — report the best diagnostic we have. + status = result.get("status", "") detail = result.get("incomplete_details") or result.get("error") or "" - print( - f"Error: OpenAI response status is '{status}' (expected 'completed').", - file=sys.stderr, - ) + if status not in ("completed", ""): + print( + f"Error: OpenAI response status is '{status}' with no review content.", + file=sys.stderr, + ) + else: + print("Error: Empty review content from OpenAI API.", file=sys.stderr) if detail: print(f"Detail: {detail}", file=sys.stderr) sys.exit(1) - # Extract text from output items (output_text is null in raw HTTP responses; - # the convenience property only exists in the Python SDK). - content = result.get("output_text") or "" - if not content: - for item in result.get("output", []): - if item.get("type") == "message": - for block in item.get("content", []): - if block.get("type") == "output_text": - content += block.get("text", "") - - if not content.strip(): - print("Error: Empty review content from OpenAI API.", file=sys.stderr) - sys.exit(1) - usage = result.get("usage", {}) return (content, usage) @@ -1585,6 +1596,12 @@ def main() -> None: sys.exit(0) # Call OpenAI API + if _is_reasoning_model(args.model) and args.timeout == DEFAULT_TIMEOUT: + print( + f"Note: {args.model} is a reasoning model. Consider --timeout 900 " + "for large reviews.", + file=sys.stderr, + ) print(f"Sending review to {args.model}...", file=sys.stderr) print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr) if cost_str: diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index d0455c2..2d22e5b 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1572,6 +1572,37 @@ def test_pro_snapshot_matches_pro(self, review_mod): assert snapshot == base +class TestExtractResponseText: + def test_prefers_output_text_field(self, review_mod): + result = {"output_text": "Direct text.", "output": []} + assert review_mod._extract_response_text(result) == "Direct text." + + def test_walks_output_items_when_output_text_null(self, review_mod): + result = { + "output_text": None, + "output": [{"type": "message", "content": [ + {"type": "output_text", "text": "Walked text."}, + ]}], + } + assert review_mod._extract_response_text(result) == "Walked text." 
+ + def test_concatenates_multiple_blocks(self, review_mod): + result = { + "output_text": None, + "output": [{"type": "message", "content": [ + {"type": "output_text", "text": "A"}, + {"type": "output_text", "text": "B"}, + ]}], + } + assert review_mod._extract_response_text(result) == "AB" + + def test_empty_when_no_output(self, review_mod): + assert review_mod._extract_response_text({"output_text": None, "output": []}) == "" + + def test_empty_when_missing_keys(self, review_mod): + assert review_mod._extract_response_text({}) == "" + + class TestResponsesAPIConstants: def test_endpoint_is_responses(self, review_mod): assert "responses" in review_mod.ENDPOINT @@ -1652,8 +1683,63 @@ def test_timeout_passed_through(self, review_mod, mock_urlopen): review_mod.call_openai("test", "gpt-5.4", "fake-key", timeout=900) assert mock_urlopen["timeout"] == 900 - def test_incomplete_status_exits(self, review_mod, mock_urlopen): - """Non-completed status should cause sys.exit.""" + def test_missing_status_with_valid_output_succeeds(self, review_mod, mock_urlopen): + """Valid content should be accepted even when status field is absent.""" + mock_urlopen["response_data"] = { + "output_text": None, + "output": [{ + "type": "message", + "content": [{"type": "output_text", "text": "Good review."}], + }], + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + content, _ = review_mod.call_openai("test", "gpt-5.4", "fake-key") + assert content == "Good review." + + def test_status_none_with_valid_output_succeeds(self, review_mod, mock_urlopen): + """status=None should not prevent content extraction.""" + mock_urlopen["response_data"] = { + "status": None, + "output_text": None, + "output": [{ + "type": "message", + "content": [{"type": "output_text", "text": "Good review."}], + }], + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + content, _ = review_mod.call_openai("test", "gpt-5.4", "fake-key") + assert content == "Good review." + + def test_output_text_convenience_field_used(self, review_mod, mock_urlopen): + """When output_text is populated (SDK-style), use it directly.""" + mock_urlopen["response_data"] = { + "status": "completed", + "output_text": "SDK-provided text.", + "output": [], + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + content, _ = review_mod.call_openai("test", "gpt-5.4", "fake-key") + assert content == "SDK-provided text." + + def test_multiple_output_text_blocks_concatenated(self, review_mod, mock_urlopen): + """Multiple output_text blocks should be concatenated in order.""" + mock_urlopen["response_data"] = { + "status": "completed", + "output_text": None, + "output": [{ + "type": "message", + "content": [ + {"type": "output_text", "text": "Part 1. "}, + {"type": "output_text", "text": "Part 2."}, + ], + }], + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + content, _ = review_mod.call_openai("test", "gpt-5.4", "fake-key") + assert content == "Part 1. Part 2." 
+ + def test_failed_status_no_content_exits(self, review_mod, mock_urlopen): + """Failed status with no usable content should exit.""" mock_urlopen["response_data"] = { "status": "failed", "output_text": None, @@ -1664,7 +1750,7 @@ def test_incomplete_status_exits(self, review_mod, mock_urlopen): review_mod.call_openai("test", "gpt-5.4", "fake-key") def test_empty_output_exits(self, review_mod, mock_urlopen): - """Empty output items should cause sys.exit.""" + """Empty output items with completed status should exit.""" mock_urlopen["response_data"] = { "status": "completed", "output_text": None, From a8324048722f9be393a65cdd9c567207fc955f43 Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 23 Mar 2026 17:49:07 -0400 Subject: [PATCH 4/5] Add regression test for non-completed status with valid content Pin the content-first parsing behavior: status='incomplete' with usable output should return content, not exit. Addresses P2 from gpt-5.4-pro review. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_openai_review.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index 2d22e5b..0efa42c 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1710,6 +1710,20 @@ def test_status_none_with_valid_output_succeeds(self, review_mod, mock_urlopen): content, _ = review_mod.call_openai("test", "gpt-5.4", "fake-key") assert content == "Good review." + def test_incomplete_status_with_valid_content_succeeds(self, review_mod, mock_urlopen): + """Non-completed status should still return content when output is usable.""" + mock_urlopen["response_data"] = { + "status": "incomplete", + "output_text": None, + "output": [{ + "type": "message", + "content": [{"type": "output_text", "text": "Partial but usable."}], + }], + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + content, _ = review_mod.call_openai("test", "gpt-5.4", "fake-key") + assert content == "Partial but usable." + def test_output_text_convenience_field_used(self, review_mod, mock_urlopen): """When output_text is populated (SDK-style), use it directly.""" mock_urlopen["response_data"] = { From 5af76f194818b0308614061ccd1f112a9989b0bf Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 23 Mar 2026 18:03:03 -0400 Subject: [PATCH 5/5] Address CI review: incomplete status handling, pricing, help text P1: Treat status='incomplete' as a hard error even when content exists, since truncated reviews may silently suppress findings. Print incomplete_details and suggest remediation (reduce diff, lower context). P2: Fix gpt-5.4-pro pricing from (15,60) to (30,180) per official rates. P3: Fix argparse description to say "Responses API" not "Chat Completions". Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/scripts/openai_review.py | 22 ++++++++++++++++++++-- tests/test_openai_review.py | 28 +++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index e0a63db..ac8487f 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -854,7 +854,7 @@ def apply_token_budget( # MAINTENANCE: Update when OpenAI changes pricing. 
PRICING = { "gpt-5.4": (2.50, 15.00), - "gpt-5.4-pro": (15.00, 60.00), + "gpt-5.4-pro": (30.00, 180.00), "gpt-4.1": (2.00, 8.00), "gpt-4.1-mini": (0.40, 1.60), "o3": (2.00, 8.00), @@ -1209,6 +1209,24 @@ def call_openai( content = _extract_response_text(result) + # Treat truncated responses as errors — partial reviews may suppress findings. + status = result.get("status") + if content.strip() and status == "incomplete": + detail = result.get("incomplete_details") or "" + print( + "Error: Review was truncated (status='incomplete'). " + "Output may be missing findings.", + file=sys.stderr, + ) + if detail: + print(f"Detail: {detail}", file=sys.stderr) + print( + "Try reducing diff size, disabling --full-registry, or " + "lowering --context to 'minimal'.", + file=sys.stderr, + ) + sys.exit(1) + if not content.strip(): # No usable content — report the best diagnostic we have. status = result.get("status", "") @@ -1244,7 +1262,7 @@ def _read_file(path: str, label: str) -> str: def main() -> None: parser = argparse.ArgumentParser( - description="Run local AI code review via OpenAI Chat Completions API." + description="Run local AI code review via OpenAI Responses API." ) parser.add_argument( "--review-criteria", diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index 0efa42c..9e8e9d9 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1710,19 +1710,37 @@ def test_status_none_with_valid_output_succeeds(self, review_mod, mock_urlopen): content, _ = review_mod.call_openai("test", "gpt-5.4", "fake-key") assert content == "Good review." - def test_incomplete_status_with_valid_content_succeeds(self, review_mod, mock_urlopen): - """Non-completed status should still return content when output is usable.""" + def test_incomplete_status_with_content_exits(self, review_mod, mock_urlopen): + """Truncated response (status=incomplete) should exit even if content exists.""" mock_urlopen["response_data"] = { "status": "incomplete", "output_text": None, "output": [{ "type": "message", - "content": [{"type": "output_text", "text": "Partial but usable."}], + "content": [{"type": "output_text", "text": "Partial review."}], }], "usage": {"input_tokens": 10, "output_tokens": 5}, } - content, _ = review_mod.call_openai("test", "gpt-5.4", "fake-key") - assert content == "Partial but usable." + with pytest.raises(SystemExit): + review_mod.call_openai("test", "gpt-5.4", "fake-key") + + def test_incomplete_status_surfaces_details(self, review_mod, mock_urlopen, capsys): + """Incomplete response should print incomplete_details to stderr.""" + mock_urlopen["response_data"] = { + "status": "incomplete", + "incomplete_details": {"reason": "max_output_tokens"}, + "output_text": None, + "output": [{ + "type": "message", + "content": [{"type": "output_text", "text": "Partial."}], + }], + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + with pytest.raises(SystemExit): + review_mod.call_openai("test", "gpt-5.4", "fake-key") + captured = capsys.readouterr() + assert "truncated" in captured.err.lower() + assert "max_output_tokens" in captured.err def test_output_text_convenience_field_used(self, review_mod, mock_urlopen): """When output_text is populated (SDK-style), use it directly."""