Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,17 @@ leave `in_progress=True` so the next cycle retries automatically. Other
exceptions clear `in_progress` and log the error. Closed issues are pruned
from state at the start of each tick.

**Availability pre-check**: each tick begins with
`check_claude_availability()` (`claude.py`), which returns a `ClaudeStatus`
of `AVAILABLE`, `USAGE_LIMIT`, or `AUTH_FAILED`. A usage limit skips the cycle
silently (transient — retried next tick). An auth failure (expired/replaced
credentials) sends a one-shot high-priority ntfy alert ("Clayde: Claude CLI
auth failed") so the operator can re-authenticate and restart. The alert
fires once per failure streak — tracked by the top-level
`claude_auth_failure_notified` flag in `state.json`
(`get_claude_auth_notified()` / `set_claude_auth_notified()`) — and re-arms
when Claude becomes reachable again.

---

## Safety & Content Filtering
Expand Down
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ Clayde's loop is event-driven and stateless by design:
7. **Pure PR approvals** (no comments) update `last_seen_at` without invoking Claude.
8. **Closed issues** are pruned from state automatically.

Before any issues are processed, each tick runs a Claude availability
pre-check. A **usage/rate limit** simply skips the cycle (it is transient and
retried next tick). An **authentication failure** — credentials expired or
replaced — instead sends a one-shot high-priority ntfy alert (title *"Clayde:
Claude CLI auth failed"*) so the operator can re-authenticate and restart the
container. The alert fires once per failure streak (tracked by
`claude_auth_failure_notified` in `state.json`) and re-arms once Claude is
reachable again.

---

## Safety & Content Filtering
Expand All @@ -60,6 +69,7 @@ Whitelisted users are configured via `CLAYDE_WHITELISTED_USERS` in `data/config.
- **PR creation by Claude**: Claude writes the PR description and a recommended reading order for larger diffs
- **PR review handling**: Reads and addresses reviewer feedback automatically
- **Rate-limit resilience**: Detects Claude usage limits and automatically retries
- **Auth-failure alerting**: Distinguishes Claude authentication failures from usage limits and sends a one-shot high-priority ntfy alert so the operator can re-authenticate
- **Crash recovery**: `in_progress` flag ensures interrupted runs are retried next cycle
- **Safety filtering**: Whitelist-based content filtering prevents acting on unauthorized content
- **Observability**: OpenTelemetry tracing with JSONL file export
Expand Down Expand Up @@ -179,7 +189,7 @@ In any repository the bot has access to, assign issues to the bot account. Clayd
| `CLAYDE_PEBBLE_PORT` | Internal HTTP port (default `8080`) |
| `CLAYDE_PEBBLE_TIMEOUT` | Per-request CLI timeout seconds (default `300`) |
| `CLAYDE_PEBBLE_QUEUE_MAX` | Max queued jobs before 503 (default `100`) |
| `CLAYDE_NTFY_TOPIC` | ntfy.sh topic for Pebble outcome notifications |
| `CLAYDE_NTFY_TOPIC` | ntfy.sh topic for Pebble outcome notifications and Claude auth-failure alerts |
| `CLAYDE_NTFY_BASE_URL` | ntfy base URL (override for self-host) |
| `CLAYDE_NTFY_TIMEOUT_S` | ntfy POST timeout seconds (default `10`) |
| `CLAYDE_KB_PATH` | In-container KB path; Pebble per-request cwd (default `/home/clayde/knowledge_base`) |
Expand Down
61 changes: 45 additions & 16 deletions src/clayde/claude.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Claude invocation via the Anthropic API or the Claude Code CLI."""

import dataclasses
import enum
import json
import logging
import os
Expand Down Expand Up @@ -52,6 +53,20 @@
]


class ClaudeStatus(enum.Enum):
    """Result reported by a Claude availability pre-check.

    The pre-check separates two failure modes that call for different
    reactions: a usage/rate limit is transient (skip this cycle and try
    again later), while an authentication failure needs an operator to
    re-authenticate and restart. Anything else, whether a successful probe
    or an error the check does not recognise, is reported as ``AVAILABLE``
    so that the check fails open.
    """

    # Probe succeeded, or failed in an unrecognised way (fail-open).
    AVAILABLE = "available"
    # A usage/rate limit is in effect; recoverable without intervention.
    USAGE_LIMIT = "usage_limit"
    # Authentication was rejected; operator intervention is required.
    AUTH_FAILED = "auth_failed"


@dataclasses.dataclass
class InvocationResult:
"""Result of a Claude invocation, including output text and cost."""
Expand Down Expand Up @@ -130,7 +145,7 @@ def invoke(
) -> InvocationResult: ...

@abstractmethod
def is_available(self) -> bool: ...
def check_availability(self) -> ClaudeStatus: ...


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -428,7 +443,7 @@ def invoke(
input_tokens=total_input, output_tokens=total_output,
)

def is_available(self) -> bool:
def check_availability(self) -> ClaudeStatus:
tracer = get_tracer()
with tracer.start_as_current_span("clayde.claude_available_check") as span:
try:
Expand All @@ -439,21 +454,22 @@ def is_available(self) -> bool:
messages=[{"role": "user", "content": "respond with: OK"}],
)
span.set_attribute("claude.available", True)
return True
return ClaudeStatus.AVAILABLE
except anthropic.RateLimitError as e:
log.warning("Claude availability check: rate limit hit — %s", e)
span.set_attribute("claude.available", False)
return False
return ClaudeStatus.USAGE_LIMIT
except anthropic.AuthenticationError as exc:
log.error("Claude availability check: authentication failed — %s", exc)
span.set_attribute("claude.available", False)
span.set_attribute("claude.auth_failed", True)
span.set_attribute("claude.check_error", str(exc))
return False
return ClaudeStatus.AUTH_FAILED
except Exception as exc:
log.warning("Claude availability pre-check raised %s — assuming available", exc)
span.set_attribute("claude.available", True)
span.set_attribute("claude.check_error", str(exc))
return True
return ClaudeStatus.AVAILABLE


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -659,7 +675,7 @@ def invoke(
input_tokens=0, output_tokens=0,
)

def is_available(self) -> bool:
def check_availability(self) -> ClaudeStatus:
tracer = get_tracer()
with tracer.start_as_current_span("clayde.claude_available_check") as span:
cli_bin = _resolve_cli_bin()
Expand All @@ -683,18 +699,19 @@ def is_available(self) -> bool:
error_text += " " + (result.stdout or "")
if _is_limit_error(error_text):
span.set_attribute("claude.available", False)
return False
return ClaudeStatus.USAGE_LIMIT
if is_error and _is_auth_error(error_text):
log.warning("Claude CLI authentication failed — marking unavailable")
span.set_attribute("claude.available", False)
return False
span.set_attribute("claude.auth_failed", True)
return ClaudeStatus.AUTH_FAILED
span.set_attribute("claude.available", True)
return True
return ClaudeStatus.AVAILABLE
except Exception as exc:
log.warning("Claude CLI availability pre-check raised %s — assuming available", exc)
span.set_attribute("claude.available", True)
span.set_attribute("claude.check_error", str(exc))
return True
return ClaudeStatus.AVAILABLE


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -750,11 +767,23 @@ def invoke_claude(
)


def check_claude_availability() -> ClaudeStatus:
    """Probe the active Claude backend and report its availability status.

    Delegates to the backend's ``check_availability()``, which performs a
    minimal invocation. The result tells callers which failure mode, if any,
    is in effect so they can respond appropriately:

    * ``ClaudeStatus.USAGE_LIMIT``: transient; skip the cycle and retry.
    * ``ClaudeStatus.AUTH_FAILED``: needs operator intervention
      (re-authenticate and restart).
    * ``ClaudeStatus.AVAILABLE``: the probe succeeded, or it failed with an
      unrecognised error (fail-open, so a spurious pre-check error does not
      suppress real work).
    """
    backend = _get_backend()
    return backend.check_availability()


def is_claude_available() -> bool:
    """Return True if Claude is available (no usage limit or auth failure).

    Thin boolean wrapper over :func:`check_claude_availability` for callers
    that don't need to distinguish the failure modes.

    Returns:
        ``True`` when the pre-check reports ``ClaudeStatus.AVAILABLE``, which
        also covers unrecognised pre-check errors (fail-open). ``False`` when
        it reports ``ClaudeStatus.USAGE_LIMIT`` or ``ClaudeStatus.AUTH_FAILED``.
    """
    # Enum members are singletons, so an identity comparison is sufficient.
    return check_claude_availability() is ClaudeStatus.AVAILABLE
58 changes: 55 additions & 3 deletions src/clayde/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,15 @@
from github import Github
from github.Issue import Issue

from clayde.claude import InvocationTimeoutError, UsageLimitError, is_claude_available
from clayde.claude import (
ClaudeStatus,
InvocationTimeoutError,
UsageLimitError,
check_claude_availability,
)
from clayde.config import get_github_client, get_settings, setup_logging
from clayde.webhook import JobQueue, create_app, worker_loop
from clayde.webhook.notify import send_ntfy_sync
from clayde.github import (
fetch_issue,
fetch_issue_comments,
Expand All @@ -44,7 +50,14 @@
parse_pr_url,
)
from clayde.safety import filter_pr_reviews, get_new_visible_comments, has_visible_content
from clayde.state import get_issue_state, load_state, save_state, update_issue_state
from clayde.state import (
get_claude_auth_notified,
get_issue_state,
load_state,
save_state,
set_claude_auth_notified,
update_issue_state,
)
from clayde.tasks import work, wrap_up, pr_work
from clayde.telemetry import get_tracer, init_tracer

Expand Down Expand Up @@ -306,6 +319,32 @@ def _prune_closed_issues(g: Github, issues_state: dict) -> None:
save_state(state)


def _notify_claude_auth_failure(settings) -> None:
    """Alert the operator, at most once per streak, that Claude auth failed.

    The persisted "notified" latch decides whether an alert goes out. When it
    is already set, the failure is only logged. Otherwise an ntfy message is
    sent using the ntfy base URL, topic and timeout from ``settings``, and the
    latch is then set. ``main()`` clears the latch once Claude is available
    again, so an ongoing failure alerts only on its first cycle while a
    failure that follows a recovery alerts again.
    """
    already_alerted = get_claude_auth_notified()
    if not already_alerted:
        log.error("Claude authentication failed — sending operator alert")
        alert_body = (
            "Claude authentication failed — Clayde is skipping all work. "
            "Re-authenticate (claude auth login) and restart the container."
        )
        send_ntfy_sync(
            title="Clayde: Claude CLI auth failed",
            body=alert_body,
            success=False,
            base_url=settings.ntfy_base_url,
            topic=settings.ntfy_topic,
            timeout_s=settings.ntfy_timeout_s,
        )
        # Arm the latch only after the send call has returned.
        set_claude_auth_notified(True)
    else:
        log.warning("Claude authentication still failing — already alerted, skipping ntfy")


def _configure_global_git_identity(settings) -> None:
git_name = settings.effective_git_name
git_email = settings.git_email
Expand All @@ -332,12 +371,25 @@ def main():
tracer = get_tracer()

with tracer.start_as_current_span("clayde.tick") as tick_span:
if not is_claude_available():
status = check_claude_availability()
if status is ClaudeStatus.AUTH_FAILED:
tick_span.set_attribute("claude.available", False)
tick_span.set_attribute("claude.auth_failed", True)
_notify_claude_auth_failure(settings)
provider.force_flush()
return
if status is ClaudeStatus.USAGE_LIMIT:
log.warning("Claude usage limit hit — skipping all work this cycle")
tick_span.set_attribute("claude.available", False)
provider.force_flush()
return

# Claude is available — clear any prior auth-failure alert latch so a
# future failure alerts again.
if get_claude_auth_notified():
log.info("Claude authentication recovered — clearing auth-failure alert latch")
set_claude_auth_notified(False)

tick_span.set_attribute("claude.available", True)
g = get_github_client()
assigned = get_assigned_issues(g)
Expand Down
17 changes: 17 additions & 0 deletions src/clayde/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@

# Location of the JSON state file inside DATA_DIR; load_state() reads it
# when it exists.
_STATE_FILE = DATA_DIR / "state.json"

# Top-level (non-issue) state key tracking whether the operator has been
# alerted about the current Claude auth-failure streak, so the alert fires
# once per streak rather than every cycle. Read and written through
# get_claude_auth_notified() / set_claude_auth_notified().
_CLAUDE_AUTH_NOTIFIED_KEY = "claude_auth_failure_notified"


def load_state() -> dict:
if _STATE_FILE.exists():
Expand All @@ -29,3 +34,15 @@ def update_issue_state(issue_url: str, updates: dict) -> None:
entry = state["issues"].setdefault(issue_url, {})
entry.update(updates)
save_state(state)


def get_claude_auth_notified() -> bool:
    """Report whether the current auth-failure streak has already been alerted.

    Loads the state and reads the top-level latch stored under
    ``_CLAUDE_AUTH_NOTIFIED_KEY``. A missing key counts as "not yet alerted".
    """
    state = load_state()
    latch = state.get(_CLAUDE_AUTH_NOTIFIED_KEY, False)
    return bool(latch)


def set_claude_auth_notified(value: bool) -> None:
    """Store the alert latch for the current Claude auth-failure streak.

    Performs a load-modify-save round trip on the state: the top-level entry
    under ``_CLAUDE_AUTH_NOTIFIED_KEY`` is overwritten with ``value`` and the
    whole state is saved back.
    """
    snapshot = load_state()
    snapshot[_CLAUDE_AUTH_NOTIFIED_KEY] = value
    save_state(snapshot)
Loading
Loading