Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
365 changes: 365 additions & 0 deletions tests/test_agentic_e2e_fix_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2148,3 +2148,368 @@ def test_verify_independently_fails_when_no_runner_available(self, mock_pytest,

assert passed is False
assert "no test runner available" in output.lower()


class TestCrossCycleFailureMemory:
    """Tests for issue #661: workflow retries deterministically failed steps across cycles.

    The orchestrator clears step_outputs at cycle boundaries (lines 696, 920, 924),
    erasing all knowledge of which steps failed and why. This causes identical retries
    of steps that failed deterministically (e.g., provider timeouts), wasting cost and time.
    """

    def _make_side_effect(self, failure_steps=None, cycle_failure_steps=None):
        """Helper to create a side_effect function for run_agentic_task.

        Args:
            failure_steps: Dict of step_num -> failure message (fails in ALL cycles)
            cycle_failure_steps: Dict of (cycle, step_num) -> failure message (fails in specific cycle)

        Returns:
            A callable suitable for ``Mock.side_effect`` that mimics
            run_agentic_task's (success, output, cost, model) 4-tuple contract.
        """
        failure_steps = failure_steps or {}
        cycle_failure_steps = cycle_failure_steps or {}

        def side_effect(*args, **kwargs):
            label = kwargs.get('label', '')

            # Parse cycle and step from a label like "cycle1_step2".
            cycle_num = None
            step_num = None
            for part in label.split('_'):
                if part.startswith('cycle'):
                    cycle_num = int(part[5:])
                elif part.startswith('step'):
                    step_num = int(part[4:])

            # Cycle-specific failures take precedence over global ones.
            if cycle_num and step_num and (cycle_num, step_num) in cycle_failure_steps:
                msg = cycle_failure_steps[(cycle_num, step_num)]
                return (False, msg, 0.1, "gpt-4")

            # Global failures: the step fails in every cycle.
            if step_num and step_num in failure_steps:
                msg = failure_steps[step_num]
                return (False, msg, 0.1, "gpt-4")

            # Step 9 default: CONTINUE_CYCLE to trigger the next cycle.
            if step_num == 9:
                return (True, "Tests still failing. CONTINUE_CYCLE", 0.1, "gpt-4")

            return (True, f"Output for {label}", 0.1, "gpt-4")

        return side_effect

    def test_failure_history_preserved_across_cycles(self, e2e_fix_mock_dependencies, e2e_fix_default_args):
        """Bug reproduction: step_outputs is cleared at cycle boundary, erasing failure history.

        Issue #661: When Step 2 fails with a provider timeout in Cycle 1, the orchestrator
        should retain this failure information for Cycle 2. Currently, step_outputs = {} at
        line 920 erases everything.

        We verify through the run_agentic_task calls that Cycle 2's prompts carry
        failure context from Cycle 1.
        """
        mock_run, _, _ = e2e_fix_mock_dependencies
        e2e_fix_default_args["max_cycles"] = 2

        # Step 2 fails with a provider timeout in every cycle.
        mock_run.side_effect = self._make_side_effect(
            failure_steps={2: "All agent providers failed: anthropic: Timeout expired"}
        )

        run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args)

        # In Cycle 2, step prompts should contain failure context from Cycle 1.
        cycle2_calls = [c for c in mock_run.call_args_list if 'cycle2' in c.kwargs.get('label', '')]
        assert len(cycle2_calls) > 0, "Cycle 2 should have executed steps"

        # The key assertion: In Cycle 2, the instruction (prompt) passed to run_agentic_task
        # should contain information about Cycle 1's Step 2 failure.
        # On buggy code: step_outputs is cleared, so no failure context appears.
        cycle2_step1_call = [c for c in cycle2_calls if 'step1' in c.kwargs.get('label', '')][0]
        cycle2_instruction = cycle2_step1_call.kwargs.get('instruction', '')

        # After the fix, failure history should be injected into prompts.
        # On buggy code, there's no mechanism for this, so this assertion fails.
        # We check that SOME failure context from prior cycles is available.
        # The fix should add a failure_history or prior_step_failures field to the context.
        assert "Timeout expired" in cycle2_instruction or "failure" in cycle2_instruction.lower() or "prior" in cycle2_instruction.lower(), (
            "Cycle 2 prompts should contain failure context from Cycle 1. "
            "Currently step_outputs is cleared at the cycle boundary (line 920), "
            "so Cycle 2 has no knowledge of Cycle 1's Step 2 provider timeout failure."
        )

    def test_failure_context_injected_into_prompt(self, e2e_fix_mock_dependencies, e2e_fix_default_args):
        """Prompt context channel: prior-cycle failure data should appear in step prompts.

        Issue #661: The context dict (lines 737-764) never includes prior-cycle failure
        data. Even if failures were preserved in state, the LLM wouldn't see them because
        no failure_history/prior_failures key is added to the context.

        This test captures the formatted prompt passed to run_agentic_task in Cycle 2
        and verifies it mentions the Cycle 1 failure.
        """
        mock_run, _, _ = e2e_fix_mock_dependencies
        e2e_fix_default_args["max_cycles"] = 2

        # Step 2 fails with timeout only in Cycle 1, succeeds in Cycle 2.
        mock_run.side_effect = self._make_side_effect(
            cycle_failure_steps={(1, 2): "All agent providers failed: anthropic: Timeout expired"}
        )

        run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args)

        # Get Cycle 2 calls.
        cycle2_calls = [c for c in mock_run.call_args_list if 'cycle2' in c.kwargs.get('label', '')]
        assert len(cycle2_calls) > 0, "Cycle 2 should have executed"

        # Check ANY Cycle 2 step prompt for prior failure context.
        any_has_failure_context = False
        for call_obj in cycle2_calls:
            instruction = call_obj.kwargs.get('instruction', '')
            if 'timeout' in instruction.lower() or 'failed' in instruction.lower() or 'prior' in instruction.lower():
                any_has_failure_context = True
                break

        assert any_has_failure_context, (
            "No Cycle 2 step prompt contains failure context from Cycle 1. "
            "The context dict (lines 737-764) has no key for prior-cycle failures, "
            "so the LLM retries Step 2 blindly with no awareness of the prior timeout."
        )

    def test_persisted_state_retains_failure_history(self, e2e_fix_mock_dependencies, e2e_fix_default_args):
        """Persisted state channel: saved state at cycle boundary should retain failure history.

        Issue #661: Line 924 sets state_data['step_outputs'] = {}, erasing failure
        data from the persisted state. The save_workflow_state call at line 928-930
        then persists an empty step_outputs.
        """
        mock_run, _, _ = e2e_fix_mock_dependencies
        e2e_fix_default_args["max_cycles"] = 2

        # Step 2 fails with timeout in Cycle 1.
        mock_run.side_effect = self._make_side_effect(
            cycle_failure_steps={(1, 2): "All agent providers failed: anthropic: Timeout expired"}
        )

        # We need to capture what save_workflow_state receives at the cycle boundary.
        # The fixture already mocks save_workflow_state; re-patch to capture call args.
        with patch("pdd.agentic_e2e_fix_orchestrator.save_workflow_state") as mock_save:
            mock_save.return_value = None
            run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args)

        # Find the save call at the cycle boundary (the one that prepares for Cycle 2).
        # This is the save_workflow_state call AFTER all Cycle 1 steps complete,
        # where state_data["current_cycle"] is incremented.
        cycle_boundary_saves = []
        for call_obj in mock_save.call_args_list:
            args, kwargs = call_obj
            # save_workflow_state(cwd, issue_number, workflow_name, state_data, ...)
            # state_data is the 4th positional arg (index 3).
            if len(args) >= 4:
                state_data = args[3]
                if isinstance(state_data, dict) and state_data.get("current_cycle", 0) == 2:
                    cycle_boundary_saves.append(state_data)

        assert len(cycle_boundary_saves) > 0, (
            "Expected save_workflow_state to be called with current_cycle=2 at the cycle boundary"
        )

        # The state saved for Cycle 2 should contain failure history from Cycle 1.
        boundary_state = cycle_boundary_saves[0]

        # Check that failure information is preserved (not cleared to {}).
        # The bug: line 924 sets state_data["step_outputs"] = {}, so this will be empty.
        # After fix: either step_outputs retains failure entries, or a separate
        # failure_history field is present.
        has_failure_data = False
        step_outputs = boundary_state.get("step_outputs", {})
        failure_history = boundary_state.get("failure_history", {})

        # Check if step_outputs has any FAILED entries from the prior cycle.
        for val in step_outputs.values():
            if isinstance(val, str) and "FAILED:" in val:
                has_failure_data = True
                break

        # Or check if a dedicated failure_history field exists.
        if failure_history:
            has_failure_data = True

        assert has_failure_data, (
            f"Persisted state at cycle boundary has no failure data. "
            f"step_outputs={step_outputs}, failure_history={failure_history}. "
            f"Line 924 sets state_data['step_outputs'] = {{}}, erasing Cycle 1's "
            f"Step 2 timeout failure from the saved state."
        )

    def test_resume_from_completed_cycle_preserves_failure_history(self, e2e_fix_mock_dependencies, e2e_fix_default_args):
        """Resume path channel: resuming from a completed cycle should preserve failure history.

        Issue #661: Line 696 sets step_outputs = {} when resuming from a completed cycle
        (last_completed_step >= 9), erasing any failure data from the saved state.
        """
        mock_run, _, _ = e2e_fix_mock_dependencies
        e2e_fix_default_args["max_cycles"] = 2
        e2e_fix_default_args["resume"] = True

        # Simulate resuming from a saved state where Cycle 1 completed (all 9 steps done)
        # and Step 2 had failed with a timeout.
        saved_state = {
            "workflow": "e2e_fix",
            "issue_url": e2e_fix_default_args["issue_url"],
            "issue_number": 1,
            "current_cycle": 1,
            "last_completed_step": 9,  # All steps completed
            "step_outputs": {
                "1": "All 15 unit tests pass.",
                "2": "FAILED: All agent providers failed: anthropic: Timeout expired",
                "3": "Root cause identified.",
                "4": "Fix plan created.",
                "5": "Dev units: module_a, module_b",
                "6": "Applied fix to module_a.",
                "7": "Applied fix to module_b.",
                "8": "Ran dev unit tests.",
                "9": "Tests still failing. CONTINUE_CYCLE"
            },
            "dev_unit_states": {},
            "total_cost": 1.0,
            "model_used": "gpt-4",
            "changed_files": [],
            "last_saved_at": "2024-01-01T00:00:00",
        }

        with patch("pdd.agentic_e2e_fix_orchestrator.load_workflow_state") as mock_load_state, \
                patch("pdd.agentic_e2e_fix_orchestrator.save_workflow_state") as mock_save_state:
            mock_load_state.return_value = (saved_state, None)
            mock_save_state.return_value = None

            # Step 2 fails again in Cycle 2 (but we want to verify the prompt includes
            # Cycle 1's failure context).
            mock_run.side_effect = self._make_side_effect(
                failure_steps={2: "All agent providers failed: anthropic: Timeout expired"}
            )

            run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args)

        # After resume, the orchestrator should have started Cycle 2 (since Cycle 1
        # was complete). Check that Cycle 2's prompts include failure context.
        cycle2_calls = [c for c in mock_run.call_args_list if 'cycle2' in c.kwargs.get('label', '')]
        assert len(cycle2_calls) > 0, "Cycle 2 should execute after resuming from completed Cycle 1"

        # Verify failure context from Cycle 1 is available in Cycle 2 prompts.
        cycle2_step2_calls = [c for c in cycle2_calls if 'step2' in c.kwargs.get('label', '')]
        if cycle2_step2_calls:
            instruction = cycle2_step2_calls[0].kwargs.get('instruction', '')
            assert 'timeout' in instruction.lower() or 'failed' in instruction.lower() or 'prior' in instruction.lower(), (
                "After resuming from a completed cycle with a failed Step 2, "
                "Cycle 2's Step 2 prompt should contain failure context. "
                "Line 696 sets step_outputs = {}, erasing this history."
            )

    def test_deterministic_failure_not_retried_identically(self, e2e_fix_mock_dependencies, e2e_fix_default_args):
        """Step 2 should not be called with identical inputs when it failed identically before.

        Issue #661: The orchestrator calls run_agentic_task for the same step with the
        exact same context in every cycle, even when the step failed deterministically.
        After the fix, either the step is skipped, or the prompt includes failure context
        so the LLM can adapt its strategy.
        """
        mock_run, _, _ = e2e_fix_mock_dependencies
        e2e_fix_default_args["max_cycles"] = 2

        # Step 2 fails with provider timeout in both cycles.
        mock_run.side_effect = self._make_side_effect(
            failure_steps={2: "All agent providers failed: anthropic: Timeout expired"}
        )

        run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args)

        # Get Step 2 calls from both cycles.
        step2_calls = [c for c in mock_run.call_args_list if 'step2' in c.kwargs.get('label', '')]

        if len(step2_calls) >= 2:
            cycle1_instruction = step2_calls[0].kwargs.get('instruction', '')
            cycle2_instruction = step2_calls[1].kwargs.get('instruction', '')

            # The instructions should NOT be identical — Cycle 2 should have additional
            # failure context that Cycle 1 didn't have.
            assert cycle1_instruction != cycle2_instruction, (
                "Step 2 was called with identical instructions in Cycle 1 and Cycle 2. "
                "After Step 2 failed with a provider timeout in Cycle 1, Cycle 2 should "
                "either skip the step or include failure context in the prompt so the LLM "
                "can adapt. Currently step_outputs is cleared (line 920) and no failure "
                "history is injected into the context (lines 737-764)."
            )
        elif len(step2_calls) == 1:
            # If only 1 call, the fix might be skipping Step 2 in Cycle 2 — that's valid.
            pass
        else:
            pytest.fail("Step 2 should have been called at least once")

    def test_happy_path_multi_cycle_no_failures_unaffected(self, e2e_fix_mock_dependencies, e2e_fix_default_args):
        """Regression: normal multi-cycle behavior should not be broken by failure tracking.

        When all steps succeed and Step 9 returns CONTINUE_CYCLE, both cycles should
        run all 9 steps normally. This ensures the failure-tracking fix doesn't break
        the happy path.
        """
        mock_run, _, _ = e2e_fix_mock_dependencies
        e2e_fix_default_args["max_cycles"] = 2

        # All steps succeed, Step 9 returns CONTINUE_CYCLE.
        mock_run.side_effect = self._make_side_effect()  # No failures

        run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args)

        # All 18 steps should execute (9 per cycle x 2 cycles).
        assert mock_run.call_count == 18, (
            f"Expected 18 step calls (9 per cycle x 2 cycles) but got {mock_run.call_count}. "
            f"The failure-tracking changes should not affect normal multi-cycle behavior."
        )

    def test_mixed_outcomes_partial_failures_across_cycles(self, e2e_fix_mock_dependencies, e2e_fix_default_args):
        """Edge case: system should distinguish steps that succeeded vs. failed across cycles.

        Cycle 1: Step 2 fails, all other steps succeed.
        Cycle 2: All steps should have access to the fact that Step 2 failed in Cycle 1.
        """
        mock_run, _, _ = e2e_fix_mock_dependencies
        e2e_fix_default_args["max_cycles"] = 2

        # Step 2 fails only in Cycle 1.
        mock_run.side_effect = self._make_side_effect(
            cycle_failure_steps={(1, 2): "All agent providers failed: anthropic: Timeout expired"}
        )

        run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args)

        # Get Cycle 2 calls.
        cycle2_calls = [c for c in mock_run.call_args_list if 'cycle2' in c.kwargs.get('label', '')]
        assert len(cycle2_calls) > 0, "Cycle 2 should execute"

        # In Cycle 2, at least one step's prompt should mention the prior cycle's Step 2 failure.
        has_prior_failure_context = False
        for call_obj in cycle2_calls:
            instruction = call_obj.kwargs.get('instruction', '')
            if 'timeout' in instruction.lower() or 'prior' in instruction.lower() or 'failure_history' in instruction.lower():
                has_prior_failure_context = True
                break

        assert has_prior_failure_context, (
            "Cycle 2 prompts should contain context about Cycle 1's Step 2 failure. "
            "The system should distinguish successful steps from failed ones across cycles, "
            "but step_outputs is cleared at the cycle boundary (line 920)."
        )
Loading
Loading