Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions src/kernel_ci_cloud_labs/pull_labs_poller.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,80 @@ def _http_put_json(
return json.loads(resp_body) if resp_body else None


def _validate_api_token(
api_base_uri: str, api_token: Optional[str], runtime_name: str
) -> None:
"""Startup preflight: confirm api_token authenticates and can edit nodes.

Calls GET /whoami once and logs the outcome. Never fatal -- a transient
API error must not stop the poller from starting, and node updates have
their own per-call error handling. It surfaces, at startup, the two
failure modes that otherwise only show up as a 401 on every job:
* the token does not authenticate at all (no token / invalid / expired);
* the token authenticates but the user cannot edit job nodes for this
runtime (kernelci-api _user_can_edit_node).
"""
if not api_token:
logger.warning(
"No kernelci-api token set (KERNELCI_API_TOKEN / UNIFIED_TOKEN / "
"config kernelci.api_token) -- node claim/finish updates will "
"fail with HTTP 401"
)
return

url = f"{api_base_uri.rstrip('/')}/whoami"
try:
whoami = _http_get_json(url, token=api_token) or {}
except urllib.error.HTTPError as e:
if e.code in (401, 403):
logger.error(
"kernelci-api token rejected by %s (HTTP %s) -- the token is "
"invalid, expired, or not a kernelci-api token; node updates "
"will fail",
url, e.code,
)
else:
logger.warning(
"Could not validate kernelci-api token via %s: HTTP %s",
url, e.code,
)
return
except (urllib.error.URLError, json.JSONDecodeError) as e:
logger.warning(
"Could not reach %s to validate the kernelci-api token (%s) -- "
"continuing; node updates will be retried per job",
url, e,
)
return

username = whoami.get("username") or whoami.get("email") or "<unknown>"
is_superuser = bool(whoami.get("is_superuser"))
groups = {
g.get("name")
for g in whoami.get("groups", [])
if isinstance(g, dict) and g.get("name")
}
# Groups that let a user edit a job node it does not own
# (kernelci-api _user_can_edit_node).
editor_groups = {
"node:edit:any",
f"runtime:{runtime_name}:node-editor",
f"runtime:{runtime_name}:node-admin",
}
logger.info(
"kernelci-api token OK: user=%s superuser=%s groups=%s",
username, is_superuser, sorted(groups) or [],
)
if not is_superuser and not (groups & editor_groups):
logger.warning(
"kernelci-api user %s cannot edit job nodes for runtime '%s': "
"not a superuser and in none of %s -- node claim/finish updates "
"will fail with HTTP 401 unless the user owns the nodes. Add the "
"user to group 'runtime:%s:node-editor'.",
username, runtime_name, sorted(editor_groups), runtime_name,
)


# ---------------------------------------------------------------------------
# Cursor persistence — generic filesystem backend by default.
# A deployment can swap in a custom CursorStore (e.g. backed by S3) by
Expand Down Expand Up @@ -392,6 +466,10 @@ def __init__(
if job_executor is None:
_validate_default_executor_deps()

# Startup preflight: surface a bad/under-privileged kernelci-api token
# now, rather than as a 401 on every job's claim/finish update.
_validate_api_token(self.api_base_uri, self.api_token, self.runtime_name)

# -- Credential resolution -------------------------------------------

@staticmethod
Expand Down
68 changes: 66 additions & 2 deletions tests/test_pull_labs_poller.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""Unit tests for pull_labs_poller (no network, no AWS)."""

import json
import logging
import os
import tempfile
import urllib.error
Expand All @@ -28,9 +29,10 @@
_GET = "kernel_ci_cloud_labs.pull_labs_poller._http_get_json"
_PUT = "kernel_ci_cloud_labs.pull_labs_poller._http_put_json"

# Capture the real validator at import time so a specific test can restore it
# after the autouse fixture has stubbed it out.
# Capture the real validators at import time so a specific test can call them
# after the autouse fixtures have stubbed them out.
_REAL_VALIDATE_DEFAULT_EXECUTOR_DEPS = poller_mod._validate_default_executor_deps
_REAL_VALIDATE_API_TOKEN = poller_mod._validate_api_token


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -60,6 +62,16 @@ def _skip_default_executor_deps_check(monkeypatch):
monkeypatch.setattr(poller_mod, "_validate_default_executor_deps", lambda: None)


@pytest.fixture(autouse=True)
def _skip_api_token_check(monkeypatch):
"""Bypass the startup /whoami token preflight (no network in unit tests).

Dedicated tests call the real _validate_api_token via the captured
reference with _http_get_json patched.
"""
monkeypatch.setattr(poller_mod, "_validate_api_token", lambda *a, **k: None)


def _minimal_kc(**overrides):
base = {
"api_base_uri": "https://api.example/latest",
Expand Down Expand Up @@ -554,3 +566,55 @@ def _fail_if_called():
# Custom executor — validator must be skipped, no SystemExit.
PullLabsPoller(_minimal_kc(), job_executor=lambda cfg: ([], None))
assert called["validator"] is False


# ---------------------------------------------------------------------------
# Startup /whoami token preflight
# ---------------------------------------------------------------------------


class TestValidateApiToken:
"""_validate_api_token() -- never fatal, logs token validity and groups."""

URI = "https://api.example/latest"
RUNTIME = "pull-labs-aws-ec2"

def test_no_token_warns(self, caplog):
with caplog.at_level(logging.WARNING):
_REAL_VALIDATE_API_TOKEN(self.URI, None, self.RUNTIME)
assert "No kernelci-api token" in caplog.text

def test_401_logs_error(self, caplog):
err = urllib.error.HTTPError(self.URI, 401, "Unauthorized", {}, None)
with patch(_GET, side_effect=err), caplog.at_level(logging.ERROR):
_REAL_VALIDATE_API_TOKEN(self.URI, "bad-token", self.RUNTIME)
assert "rejected" in caplog.text

def test_network_error_is_not_fatal(self):
# A transient API error must not raise -- it cannot block startup.
with patch(_GET, side_effect=urllib.error.URLError("boom")):
_REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME)

def test_valid_token_with_editor_group(self, caplog):
whoami = {
"username": "pullbot",
"groups": [{"name": "runtime:pull-labs-aws-ec2:node-editor"}],
}
with patch(_GET, return_value=whoami), caplog.at_level(logging.INFO):
_REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME)
assert "token OK" in caplog.text
assert "cannot edit" not in caplog.text

def test_superuser_token_ok(self, caplog):
whoami = {"username": "root", "is_superuser": True, "groups": []}
with patch(_GET, return_value=whoami), caplog.at_level(logging.INFO):
_REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME)
assert "cannot edit" not in caplog.text

def test_valid_token_without_editor_group_warns(self, caplog):
whoami = {"username": "pullbot", "groups": [{"name": "some-other-group"}]}
with patch(_GET, return_value=whoami), caplog.at_level(logging.WARNING):
_REAL_VALIDATE_API_TOKEN(self.URI, "t", self.RUNTIME)
assert "cannot edit job nodes" in caplog.text
# The required group is named in the hint.
assert "runtime:pull-labs-aws-ec2:node-editor" in caplog.text
Loading