From 6ff0c656d30ccefc32d0cca9b2d967b433ce2356 Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 19:47:15 +0200 Subject: [PATCH 01/16] config: add Health.StorageProbeInterval --- internal/config/config.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/internal/config/config.go b/internal/config/config.go index 067981d..aec0b8f 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -102,6 +102,9 @@ type Config struct { // Gradle configures Gradle HttpBuildCache behavior. Gradle GradleConfig `json:"gradle" yaml:"gradle"` + + // Health configures the /health endpoint behavior. + Health HealthConfig `json:"health" yaml:"health"` } // CooldownConfig configures version cooldown periods. @@ -182,6 +185,14 @@ type GradleBuildCacheConfig struct { SweepInterval string `json:"sweep_interval" yaml:"sweep_interval"` } +// HealthConfig configures the /health endpoint. +type HealthConfig struct { + // StorageProbeInterval is the minimum time between storage backend probes. + // Uses Go duration syntax (e.g. "30s", "1m"). Default: "30s". + // Set to "0" to probe on every /health request (useful for low-traffic deployments). + StorageProbeInterval string `json:"storage_probe_interval" yaml:"storage_probe_interval"` +} + // DatabaseConfig configures the cache database. type DatabaseConfig struct { // Driver is the database driver: "sqlite" or "postgres". @@ -343,6 +354,7 @@ func Load(path string) (*Config, error) { // - PROXY_DATABASE_PATH // - PROXY_LOG_LEVEL // - PROXY_LOG_FORMAT +// - PROXY_HEALTH_STORAGE_PROBE_INTERVAL func (c *Config) LoadFromEnv() { if v := os.Getenv("PROXY_LISTEN"); v != "" { c.Listen = v @@ -410,6 +422,9 @@ func (c *Config) LoadFromEnv() { if v := os.Getenv("PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL"); v != "" { c.Gradle.BuildCache.SweepInterval = v } + if v := os.Getenv("PROXY_HEALTH_STORAGE_PROBE_INTERVAL"); v != "" { + c.Health.StorageProbeInterval = v + } } // Validate checks the configuration for errors. From 77e766d6d17ba579e4b78911781d02e8ecc1c9ab Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 19:50:13 +0200 Subject: [PATCH 02/16] metrics: add proxy_health_probe_failures_total counter --- internal/metrics/metrics.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 18ebe88..f23a5a9 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -128,6 +128,14 @@ var ( }, []string{"ecosystem"}, ) + + HealthProbeFailures = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "proxy_health_probe_failures_total", + Help: "Total number of storage health probe failures, by step (write|size|read|verify|delete).", + }, + []string{"step"}, + ) ) func init() { @@ -147,6 +155,7 @@ func init() { StorageErrors, ActiveRequests, IntegrityFailures, + HealthProbeFailures, ) } @@ -192,6 +201,12 @@ func RecordIntegrityFailure(ecosystem string) { IntegrityFailures.WithLabelValues(ecosystem).Inc() } +// RecordHealthProbeFailure increments the health probe failure counter. +// step is one of: "write", "size", "read", "verify", "delete". +func RecordHealthProbeFailure(step string) { + HealthProbeFailures.WithLabelValues(step).Inc() +} + // RecordStorageError increments storage error counter. func RecordStorageError(operation string) { StorageErrors.WithLabelValues(operation).Inc() From ba4aa76c1e7786a1aa8e122b2977264aff4d8f7d Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 19:52:04 +0200 Subject: [PATCH 03/16] server: add storageProbe with happy-path test --- internal/server/health.go | 100 +++++++++++++++++++++++ internal/server/health_test.go | 145 +++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 internal/server/health.go create mode 100644 internal/server/health_test.go diff --git a/internal/server/health.go b/internal/server/health.go new file mode 100644 index 0000000..08ce8f1 --- /dev/null +++ b/internal/server/health.go @@ -0,0 +1,100 @@ +// Package server implements the proxy HTTP server. +package server + +import ( + "bytes" + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "io" + "strconv" + "time" + + "github.com/git-pkgs/proxy/internal/storage" +) + +const ( + probePathPrefix = ".healthcheck/" + probeMarker = "proxy-healthcheck:" + probeSuffixBytes = 8 + defaultProbeTTL = 30 * time.Second + defaultProbeTimeout = 10 * time.Second +) + +// HealthResponse is the JSON payload returned by /health. +type HealthResponse struct { + Status string `json:"status"` + Checks map[string]HealthCheck `json:"checks"` +} + +// HealthCheck reports the status of a single subsystem check. +type HealthCheck struct { + Status string `json:"status"` + Error string `json:"error,omitempty"` + Step string `json:"step,omitempty"` +} + +// probeError tags a storage probe failure with the step that failed. +type probeError struct { + step string + err error +} + +func (e *probeError) Error() string { return e.step + ": " + e.err.Error() } +func (e *probeError) Unwrap() error { return e.err } + +// storageProbe runs a write → size-check → read → verify → delete round-trip +// against the storage backend. Returns nil on success or a *probeError on failure. +func storageProbe(ctx context.Context, s storage.Storage) error { + suffix, err := randomSuffix() + if err != nil { + return &probeError{step: "write", err: fmt.Errorf("generating random suffix: %w", err)} + } + path := probePathPrefix + strconv.FormatInt(time.Now().UnixNano(), 10) + "-" + suffix + payload := []byte(probeMarker + suffix) + + // 1. Store + size, _, err := s.Store(ctx, path, bytes.NewReader(payload)) + if err != nil { + return &probeError{step: "write", err: err} + } + // 2. Size check + if size != int64(len(payload)) { + return &probeError{step: "size", err: fmt.Errorf("wrote %d bytes, expected %d", size, len(payload))} + } + // 3. Open + rc, err := s.Open(ctx, path) + if err != nil { + return &probeError{step: "read", err: err} + } + defer func() { + if cerr := rc.Close(); cerr != nil { + // Logged at the caller level; not fatal. + _ = cerr + } + }() + // 4. Read all (classify mid-stream errors as read, not verify) + data, err := io.ReadAll(rc) + if err != nil { + return &probeError{step: "read", err: err} + } + // 5. Verify + if !bytes.Equal(data, payload) { + return &probeError{step: "verify", err: fmt.Errorf("content mismatch")} + } + // 6. Delete + if err := s.Delete(ctx, path); err != nil { + return &probeError{step: "delete", err: err} + } + return nil +} + +// randomSuffix returns 8 cryptographically random bytes hex-encoded. +func randomSuffix() (string, error) { + b := make([]byte, probeSuffixBytes) + if _, err := rand.Read(b); err != nil { + return "", err + } + return hex.EncodeToString(b), nil +} diff --git a/internal/server/health_test.go b/internal/server/health_test.go new file mode 100644 index 0000000..27571ba --- /dev/null +++ b/internal/server/health_test.go @@ -0,0 +1,145 @@ +package server + +import ( + "context" + "io" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/git-pkgs/proxy/internal/storage" +) + +// fakeStorage is a minimal storage.Storage for probe tests with per-step failure injection. +type fakeStorage struct { + mu sync.Mutex + + storeCalls atomic.Int64 + openCalls atomic.Int64 + closeCalls atomic.Int64 + deleteCalls atomic.Int64 + + paths []string + payloads [][]byte + + // Failure injection. + storeErr error + openErr error + readErr error // returned by the io.ReadCloser.Read after partial bytes + deleteErr error + + // Misbehavior knobs. + sizeDelta int64 // added to the reported size from Store + readOverride []byte // if non-nil, Open returns a reader yielding these bytes instead of stored content + + stored map[string][]byte +} + +func newFakeStorage() *fakeStorage { return &fakeStorage{stored: map[string][]byte{}} } + +func (f *fakeStorage) Store(ctx context.Context, path string, r io.Reader) (int64, string, error) { + f.storeCalls.Add(1) + if f.storeErr != nil { + return 0, "", f.storeErr + } + data, err := io.ReadAll(r) + if err != nil { + return 0, "", err + } + f.mu.Lock() + f.stored[path] = data + f.paths = append(f.paths, path) + f.payloads = append(f.payloads, data) + f.mu.Unlock() + return int64(len(data)) + f.sizeDelta, "fakehash", nil +} + +type fakeReadCloser struct { + data []byte + pos int + readErr error + closed *atomic.Int64 +} + +func (rc *fakeReadCloser) Read(p []byte) (int, error) { + if rc.pos >= len(rc.data) { + if rc.readErr != nil { + return 0, rc.readErr + } + return 0, io.EOF + } + n := copy(p, rc.data[rc.pos:]) + rc.pos += n + if rc.pos >= len(rc.data) && rc.readErr != nil { + return n, rc.readErr + } + return n, nil +} + +func (rc *fakeReadCloser) Close() error { rc.closed.Add(1); return nil } + +func (f *fakeStorage) Open(ctx context.Context, path string) (io.ReadCloser, error) { + f.openCalls.Add(1) + if f.openErr != nil { + return nil, f.openErr + } + f.mu.Lock() + data := f.stored[path] + f.mu.Unlock() + if f.readOverride != nil { + data = f.readOverride + } + return &fakeReadCloser{data: data, readErr: f.readErr, closed: &f.closeCalls}, nil +} + +func (f *fakeStorage) Exists(ctx context.Context, path string) (bool, error) { + f.mu.Lock() + defer f.mu.Unlock() + _, ok := f.stored[path] + return ok, nil +} + +func (f *fakeStorage) Delete(ctx context.Context, path string) error { + f.deleteCalls.Add(1) + if f.deleteErr != nil { + return f.deleteErr + } + f.mu.Lock() + delete(f.stored, path) + f.mu.Unlock() + return nil +} + +func (f *fakeStorage) Size(ctx context.Context, path string) (int64, error) { return 0, nil } +func (f *fakeStorage) SignedURL(ctx context.Context, path string, expiry time.Duration) (string, error) { + return "", storage.ErrSignedURLUnsupported +} +func (f *fakeStorage) UsedSpace(ctx context.Context) (int64, error) { return 0, nil } +func (f *fakeStorage) URL() string { return "fake://" } +func (f *fakeStorage) Close() error { return nil } + +// --- Tests follow. First test: happy path --- + +func TestStorageProbe_HappyPath(t *testing.T) { + fs := newFakeStorage() + if err := storageProbe(context.Background(), fs); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := fs.storeCalls.Load(); got != 1 { + t.Errorf("Store calls = %d, want 1", got) + } + if got := fs.openCalls.Load(); got != 1 { + t.Errorf("Open calls = %d, want 1", got) + } + if got := fs.closeCalls.Load(); got != 1 { + t.Errorf("Close calls = %d, want 1", got) + } + if got := fs.deleteCalls.Load(); got != 1 { + t.Errorf("Delete calls = %d, want 1", got) + } + if len(fs.paths) != 1 || !strings.HasPrefix(fs.paths[0], ".healthcheck/") { + t.Errorf("unexpected probe path: %v", fs.paths) + } +} From 928c53d7007fa4ca48b85dba90504d757330716b Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 19:54:55 +0200 Subject: [PATCH 04/16] server: add storageProbe failure-mode tests --- internal/server/health_test.go | 98 ++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/internal/server/health_test.go b/internal/server/health_test.go index 27571ba..79abb2e 100644 --- a/internal/server/health_test.go +++ b/internal/server/health_test.go @@ -2,6 +2,7 @@ package server import ( "context" + "errors" "io" "strings" "sync" @@ -143,3 +144,100 @@ func TestStorageProbe_HappyPath(t *testing.T) { t.Errorf("unexpected probe path: %v", fs.paths) } } + +func TestStorageProbe_WriteFails(t *testing.T) { + fs := newFakeStorage() + fs.storeErr = errors.New("disk full") + err := storageProbe(context.Background(), fs) + var pe *probeError + if !errors.As(err, &pe) { + t.Fatalf("expected *probeError, got %T: %v", err, err) + } + if pe.step != "write" { + t.Errorf("step = %q, want write", pe.step) + } + if fs.openCalls.Load() != 0 { + t.Errorf("Open should not be called after write failure") + } +} + +func TestStorageProbe_SizeMismatch(t *testing.T) { + fs := newFakeStorage() + fs.sizeDelta = -1 // Report 1 byte fewer than actually written + err := storageProbe(context.Background(), fs) + var pe *probeError + if !errors.As(err, &pe) || pe.step != "size" { + t.Fatalf("step = %v, want size; err = %v", pe, err) + } + if fs.openCalls.Load() != 0 { + t.Errorf("Open should not be called after size mismatch") + } +} + +func TestStorageProbe_OpenFails(t *testing.T) { + fs := newFakeStorage() + fs.openErr = errors.New("access denied") + err := storageProbe(context.Background(), fs) + var pe *probeError + if !errors.As(err, &pe) || pe.step != "read" { + t.Fatalf("step = %v, want read; err = %v", pe, err) + } +} + +func TestStorageProbe_ReadMidStreamFails(t *testing.T) { + fs := newFakeStorage() + fs.readErr = errors.New("connection reset") + err := storageProbe(context.Background(), fs) + var pe *probeError + if !errors.As(err, &pe) || pe.step != "read" { + t.Fatalf("step = %v, want read (NOT verify); err = %v", pe, err) + } +} + +func TestStorageProbe_ContentMismatch(t *testing.T) { + fs := newFakeStorage() + fs.readOverride = []byte("wrong content") + err := storageProbe(context.Background(), fs) + var pe *probeError + if !errors.As(err, &pe) || pe.step != "verify" { + t.Fatalf("step = %v, want verify; err = %v", pe, err) + } +} + +func TestStorageProbe_DeleteFails(t *testing.T) { + fs := newFakeStorage() + fs.deleteErr = errors.New("permission denied") + err := storageProbe(context.Background(), fs) + var pe *probeError + if !errors.As(err, &pe) || pe.step != "delete" { + t.Fatalf("step = %v, want delete; err = %v", pe, err) + } +} + +func TestStorageProbe_ReaderClosedOnReadFailure(t *testing.T) { + fs := newFakeStorage() + fs.readErr = errors.New("read error") + _ = storageProbe(context.Background(), fs) + if got := fs.closeCalls.Load(); got != fs.openCalls.Load() { + t.Errorf("closeCalls = %d, openCalls = %d (should match)", got, fs.openCalls.Load()) + } +} + +func TestStorageProbe_PathUniqueness(t *testing.T) { + fs := newFakeStorage() + for i := 0; i < 100; i++ { + if err := storageProbe(context.Background(), fs); err != nil { + t.Fatalf("probe %d: %v", i, err) + } + } + seen := make(map[string]bool) + for _, p := range fs.paths { + if !strings.HasPrefix(p, ".healthcheck/") { + t.Errorf("path missing prefix: %q", p) + } + if seen[p] { + t.Errorf("duplicate path: %q", p) + } + seen[p] = true + } +} From d7572c8cf676075d581b79c856eddd0b9b4faa23 Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 20:03:08 +0200 Subject: [PATCH 05/16] server: add healthCache with TTL, single-flight, transition logging --- internal/server/health.go | 81 +++++++++++++++ internal/server/health_test.go | 185 +++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) diff --git a/internal/server/health.go b/internal/server/health.go index 08ce8f1..8ad0bec 100644 --- a/internal/server/health.go +++ b/internal/server/health.go @@ -6,11 +6,15 @@ import ( "context" "crypto/rand" "encoding/hex" + "errors" "fmt" "io" + "log/slog" "strconv" + "sync" "time" + "github.com/git-pkgs/proxy/internal/metrics" "github.com/git-pkgs/proxy/internal/storage" ) @@ -98,3 +102,80 @@ func randomSuffix() (string, error) { } return hex.EncodeToString(b), nil } + +// healthCache memoizes the result of storageProbe for a configurable TTL. +// It is safe for concurrent use. +type healthCache struct { + storage storage.Storage + interval time.Duration + probeTimeout time.Duration + logger *slog.Logger + + mu sync.Mutex + lastAt time.Time + lastErr error +} + +// newHealthCache builds a cache, parsing the interval from a duration string. +// Empty interval string defaults to 30s. "0" or "0s" disables caching. +func newHealthCache(s storage.Storage, intervalStr string, logger *slog.Logger) (*healthCache, error) { + interval := defaultProbeTTL + if intervalStr != "" { + d, err := time.ParseDuration(intervalStr) + if err != nil { + return nil, fmt.Errorf("parsing storage_probe_interval %q: %w", intervalStr, err) + } + interval = d + } + return &healthCache{ + storage: s, + interval: interval, + probeTimeout: defaultProbeTimeout, + logger: logger, + }, nil +} + +// Check returns the cached probe result if still fresh, otherwise runs a fresh probe. +// The callerCtx parameter is accepted for symmetry with handler signatures but is +// intentionally NOT passed to the probe — the probe runs under a context derived +// from context.Background() with a fixed timeout so that caller cancellation +// (e.g. client disconnect) cannot poison the cache with context.Canceled. +func (c *healthCache) Check(callerCtx context.Context) error { + _ = callerCtx // see comment above + c.mu.Lock() + defer c.mu.Unlock() + + // Cache hit + if c.interval > 0 && !c.lastAt.IsZero() && time.Since(c.lastAt) < c.interval { + return c.lastErr + } + + // Fresh probe under a detached context + probeCtx, cancel := context.WithTimeout(context.Background(), c.probeTimeout) + defer cancel() + err := storageProbe(probeCtx, c.storage) + + // Transition logging and metric increment happen only on the fresh-probe path. + c.logTransition(c.lastErr, err) + if err != nil { + var pe *probeError + if errors.As(err, &pe) { + metrics.RecordHealthProbeFailure(pe.step) + } else { + metrics.RecordHealthProbeFailure("unknown") + } + } + + c.lastErr = err + c.lastAt = time.Now() + return err +} + +func (c *healthCache) logTransition(prev, curr error) { + switch { + case prev != nil && curr == nil: + c.logger.Info("storage probe recovered") + case prev == nil && curr != nil: + c.logger.Error("storage probe failed", "error", curr.Error()) + } +} diff --git a/internal/server/health_test.go b/internal/server/health_test.go index 79abb2e..d32face 100644 --- a/internal/server/health_test.go +++ b/internal/server/health_test.go @@ -1,16 +1,20 @@ package server import ( + "bytes" "context" "errors" "io" + "log/slog" "strings" "sync" "sync/atomic" "testing" "time" + "github.com/git-pkgs/proxy/internal/metrics" "github.com/git-pkgs/proxy/internal/storage" + "github.com/prometheus/client_golang/prometheus/testutil" ) // fakeStorage is a minimal storage.Storage for probe tests with per-step failure injection. @@ -35,6 +39,9 @@ type fakeStorage struct { sizeDelta int64 // added to the reported size from Store readOverride []byte // if non-nil, Open returns a reader yielding these bytes instead of stored content + // storeBlock, if non-nil, causes Store to block until the channel is closed or ctx is done. + storeBlock chan struct{} + stored map[string][]byte } @@ -45,6 +52,13 @@ func (f *fakeStorage) Store(ctx context.Context, path string, r io.Reader) (int6 if f.storeErr != nil { return 0, "", f.storeErr } + if f.storeBlock != nil { + select { + case <-f.storeBlock: + case <-ctx.Done(): + return 0, "", ctx.Err() + } + } data, err := io.ReadAll(r) if err != nil { return 0, "", err @@ -241,3 +255,174 @@ func TestStorageProbe_PathUniqueness(t *testing.T) { seen[p] = true } } + +// helper: a healthCache wired to a fakeStorage and a discard logger. +func newTestCache(fs *fakeStorage, interval time.Duration) *healthCache { + return &healthCache{ + storage: fs, + interval: interval, + probeTimeout: 5 * time.Second, + logger: discardLogger(), + } +} + +func discardLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +func TestHealthCache_CacheHit(t *testing.T) { + fs := newFakeStorage() + c := newTestCache(fs, 30*time.Second) + if err := c.Check(context.Background()); err != nil { + t.Fatalf("first check: %v", err) + } + if err := c.Check(context.Background()); err != nil { + t.Fatalf("second check: %v", err) + } + if got := fs.storeCalls.Load(); got != 1 { + t.Errorf("storeCalls = %d, want 1 (second call should be cached)", got) + } +} + +func TestHealthCache_MissAfterTTL(t *testing.T) { + fs := newFakeStorage() + c := newTestCache(fs, 10*time.Millisecond) + _ = c.Check(context.Background()) + time.Sleep(20 * time.Millisecond) + _ = c.Check(context.Background()) + if got := fs.storeCalls.Load(); got != 2 { + t.Errorf("storeCalls = %d, want 2", got) + } +} + +func TestHealthCache_Disabled(t *testing.T) { + fs := newFakeStorage() + c := newTestCache(fs, 0) // interval = 0 means probe every call + _ = c.Check(context.Background()) + _ = c.Check(context.Background()) + if got := fs.storeCalls.Load(); got != 2 { + t.Errorf("storeCalls = %d, want 2", got) + } +} + +func TestHealthCache_LastAtNotAdvancedOnHit(t *testing.T) { + fs := newFakeStorage() + c := newTestCache(fs, 30*time.Second) + for i := 0; i < 100; i++ { + _ = c.Check(context.Background()) + } + if got := fs.storeCalls.Load(); got != 1 { + t.Errorf("storeCalls = %d, want 1 across 100 hits", got) + } +} + +func TestHealthCache_ConcurrentSingleFlight(t *testing.T) { + fs := newFakeStorage() + c := newTestCache(fs, 30*time.Second) + var wg sync.WaitGroup + for i := 0; i < 20; i++ { + wg.Add(1) + go func() { defer wg.Done(); _ = c.Check(context.Background()) }() + } + wg.Wait() + if got := fs.storeCalls.Load(); got != 1 { + t.Errorf("storeCalls = %d, want 1 with 20 concurrent callers", got) + } +} + +func TestHealthCache_CallerCancellationNotPoisoning(t *testing.T) { + fs := newFakeStorage() + c := newTestCache(fs, 30*time.Second) + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Already cancelled before the call + if err := c.Check(ctx); err != nil { + t.Fatalf("Check with cancelled caller ctx should still succeed: %v", err) + } +} + +func TestHealthCache_FailureCounterIncrement(t *testing.T) { + fs := newFakeStorage() + fs.storeErr = errors.New("boom") + c := newTestCache(fs, 30*time.Second) + + before := testutil.ToFloat64(metrics.HealthProbeFailures.WithLabelValues("write")) + + // First call: fresh probe → counter +1 + _ = c.Check(context.Background()) + afterFirst := testutil.ToFloat64(metrics.HealthProbeFailures.WithLabelValues("write")) + if afterFirst-before != 1 { + t.Errorf("counter delta after first call = %v, want 1", afterFirst-before) + } + + // Second call: cache hit → counter NOT re-incremented + _ = c.Check(context.Background()) + afterSecond := testutil.ToFloat64(metrics.HealthProbeFailures.WithLabelValues("write")) + if afterSecond != afterFirst { + t.Errorf("counter changed on cache hit: %v → %v", afterFirst, afterSecond) + } +} + +func TestHealthCache_ProbeTimeout(t *testing.T) { + fs := newFakeStorage() + fs.storeBlock = make(chan struct{}) // Store will block until channel is closed (or never) + t.Cleanup(func() { close(fs.storeBlock) }) + + c := &healthCache{ + storage: fs, + interval: 30 * time.Second, + probeTimeout: 50 * time.Millisecond, + logger: discardLogger(), + } + start := time.Now() + err := c.Check(context.Background()) + elapsed := time.Since(start) + + if err == nil { + t.Fatal("expected timeout error, got nil") + } + if elapsed > 500*time.Millisecond { + t.Errorf("probe took %v, expected ~50ms (timeout not respected)", elapsed) + } +} + +func TestHealthCache_TransitionLogging(t *testing.T) { + fs := newFakeStorage() + var buf bytes.Buffer + logger := slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelInfo})) + c := &healthCache{ + storage: fs, + interval: 0, // probe every call + probeTimeout: 5 * time.Second, + logger: logger, + } + + // Steady ok state — should not log + _ = c.Check(context.Background()) + _ = c.Check(context.Background()) + if got := strings.Count(buf.String(), "storage probe"); got != 0 { + t.Errorf("steady-state logs = %d, want 0; output: %s", got, buf.String()) + } + + // ok → err transition: exactly one Error log + buf.Reset() + fs.storeErr = errors.New("boom") + _ = c.Check(context.Background()) + if !strings.Contains(buf.String(), "storage probe failed") { + t.Errorf("missing failure log on transition; output: %s", buf.String()) + } + + // err steady state — should not log again + buf.Reset() + _ = c.Check(context.Background()) + if buf.Len() != 0 { + t.Errorf("steady-err logs = %q, want empty", buf.String()) + } + + // err → ok transition: exactly one Info log + buf.Reset() + fs.storeErr = nil + _ = c.Check(context.Background()) + if !strings.Contains(buf.String(), "storage probe recovered") { + t.Errorf("missing recovery log on transition; output: %s", buf.String()) + } +} From 228b5aa946c6f818ba4ca1ecc8a48ef6e4f37834 Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 20:05:19 +0200 Subject: [PATCH 06/16] server: wire storage probe into /health --- internal/server/server.go | 59 ++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index a0983e5..8b25317 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -40,6 +40,7 @@ import ( "context" "database/sql" "encoding/json" + "errors" "fmt" "log/slog" "net/http" @@ -79,7 +80,8 @@ type Server struct { logger *slog.Logger http *http.Server templates *Templates - cancel context.CancelFunc + cancel context.CancelFunc + healthCache *healthCache } // New creates a new Server with the given configuration. @@ -125,12 +127,20 @@ func New(cfg *config.Config, logger *slog.Logger) (*Server, error) { return nil, fmt.Errorf("verifying storage connectivity: %w", err) } + hc, err := newHealthCache(store, cfg.Health.StorageProbeInterval, logger) + if err != nil { + _ = store.Close() + _ = db.Close() + return nil, fmt.Errorf("initializing health cache: %w", err) + } + return &Server{ - cfg: cfg, - db: db, - storage: store, - logger: logger, - templates: &Templates{}, + cfg: cfg, + db: db, + storage: store, + logger: logger, + templates: &Templates{}, + healthCache: hc, }, nil } @@ -799,23 +809,46 @@ func (s *Server) showComparePage(w http.ResponseWriter, ecosystem, name, version } } -// handleHealth responds with a simple health check. +// handleHealth responds with a structured JSON health report. +// // @Summary Health check // @Tags meta -// @Produce plain -// @Success 200 {string} string -// @Failure 503 {string} string +// @Produce json +// @Success 200 {object} HealthResponse +// @Failure 503 {object} HealthResponse // @Router /health [get] func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) { - // Check database connectivity + w.Header().Set("Content-Type", "application/json") + + resp := HealthResponse{Status: "ok", Checks: map[string]HealthCheck{}} + + // Database check (short-circuit; do not waste a storage probe call when DB is down). if _, err := s.db.SchemaVersion(); err != nil { + resp.Status = "error" + resp.Checks["database"] = HealthCheck{Status: "error", Error: err.Error()} + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(resp) + return + } + resp.Checks["database"] = HealthCheck{Status: "ok"} + + // Storage probe (via cache). + if err := s.healthCache.Check(r.Context()); err != nil { + resp.Status = "error" + sc := HealthCheck{Status: "error", Error: err.Error()} + var pe *probeError + if errors.As(err, &pe) { + sc.Step = pe.step + } + resp.Checks["storage"] = sc w.WriteHeader(http.StatusServiceUnavailable) - _, _ = fmt.Fprint(w, "database error") + _ = json.NewEncoder(w).Encode(resp) return } + resp.Checks["storage"] = HealthCheck{Status: "ok"} w.WriteHeader(http.StatusOK) - _, _ = fmt.Fprint(w, "ok") + _ = json.NewEncoder(w).Encode(resp) } // StatsResponse contains cache statistics. From b80dcd3fb33ac925a053df42b6cae21b19367518 Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 20:17:30 +0200 Subject: [PATCH 07/16] server: update TestHealthEndpoint for JSON; wire healthCache into newTestServer Also fix Windows file-locking issue in storageProbe: close the reader explicitly before Delete so the file handle is released prior to os.Remove. --- internal/server/health.go | 20 ++++++----- internal/server/server_test.go | 66 +++++++++++++++++++++++++++++----- 2 files changed, 68 insertions(+), 18 deletions(-) diff --git a/internal/server/health.go b/internal/server/health.go index 8ad0bec..314a569 100644 --- a/internal/server/health.go +++ b/internal/server/health.go @@ -72,16 +72,18 @@ func storageProbe(ctx context.Context, s storage.Storage) error { if err != nil { return &probeError{step: "read", err: err} } - defer func() { - if cerr := rc.Close(); cerr != nil { - // Logged at the caller level; not fatal. - _ = cerr - } - }() // 4. Read all (classify mid-stream errors as read, not verify) - data, err := io.ReadAll(rc) - if err != nil { - return &probeError{step: "read", err: err} + data, readErr := io.ReadAll(rc) + // Close the reader explicitly before Delete; on some platforms (Windows) an + // open file handle prevents deletion. The defer below is a safety net for + // early returns on read errors. + closeErr := rc.Close() + if readErr != nil { + return &probeError{step: "read", err: readErr} + } + if closeErr != nil { + // Non-fatal on the happy path; log at the caller level. + _ = closeErr } // 5. Verify if !bytes.Equal(data, payload) { diff --git a/internal/server/server_test.go b/internal/server/server_test.go index 574b6ba..31527e1 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -81,13 +81,21 @@ func newTestServer(t *testing.T) *testServer { r.Mount("/pypi", http.StripPrefix("/pypi", pypiHandler.Routes())) r.Mount("/gradle", http.StripPrefix("/gradle", gradleHandler.Routes())) + hc, err := newHealthCache(store, "30s", logger) + if err != nil { + _ = db.Close() + _ = os.RemoveAll(tempDir) + t.Fatalf("failed to create health cache: %v", err) + } + // Create a minimal server struct for the handlers s := &Server{ - cfg: cfg, - db: db, - storage: store, - logger: logger, - templates: &Templates{}, + cfg: cfg, + db: db, + storage: store, + logger: logger, + templates: &Templates{}, + healthCache: hc, } r.Get("/health", s.handleHealth) @@ -179,12 +187,52 @@ func TestHealthEndpoint(t *testing.T) { ts.handler.ServeHTTP(w, req) if w.Code != http.StatusOK { - t.Errorf("expected status 200, got %d", w.Code) + t.Fatalf("status = %d, want 200; body: %s", w.Code, w.Body.String()) + } + if got := w.Header().Get("Content-Type"); got != "application/json" { + t.Errorf("Content-Type = %q, want application/json", got) } + var resp HealthResponse + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatalf("decoding response: %v", err) + } + if resp.Status != "ok" { + t.Errorf("status = %q, want ok", resp.Status) + } + if resp.Checks["database"].Status != "ok" { + t.Errorf("database check = %+v, want ok", resp.Checks["database"]) + } + if resp.Checks["storage"].Status != "ok" { + t.Errorf("storage check = %+v, want ok", resp.Checks["storage"]) + } +} - body := w.Body.String() - if body != "ok" { - t.Errorf("expected body 'ok', got %q", body) +func TestHealthEndpoint_DBFailureShortCircuits(t *testing.T) { + ts := newTestServer(t) + defer ts.close() + + // Force DB failure by closing the connection. + _ = ts.db.Close() + + req := httptest.NewRequest("GET", "/health", nil) + w := httptest.NewRecorder() + ts.handler.ServeHTTP(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Fatalf("status = %d, want 503; body: %s", w.Code, w.Body.String()) + } + var resp HealthResponse + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatalf("decoding: %v", err) + } + if resp.Status != "error" { + t.Errorf("status = %q, want error", resp.Status) + } + if resp.Checks["database"].Status != "error" { + t.Errorf("database check = %+v, want error", resp.Checks["database"]) + } + if _, present := resp.Checks["storage"]; present { + t.Errorf("storage key should be absent on DB short-circuit, got %+v", resp.Checks["storage"]) } } From d0e52b32a3d753de89f8cba60c6630ff2cf7820a Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 20:18:56 +0200 Subject: [PATCH 08/16] server: clean up stale comment in storageProbe --- internal/server/health.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/internal/server/health.go b/internal/server/health.go index 314a569..d875122 100644 --- a/internal/server/health.go +++ b/internal/server/health.go @@ -72,11 +72,10 @@ func storageProbe(ctx context.Context, s storage.Storage) error { if err != nil { return &probeError{step: "read", err: err} } - // 4. Read all (classify mid-stream errors as read, not verify) + // 4. Read all (classify mid-stream errors as read, not verify). + // Close explicitly (not deferred) so the file handle is released before + // Delete — on Windows, an open handle prevents deletion. data, readErr := io.ReadAll(rc) - // Close the reader explicitly before Delete; on some platforms (Windows) an - // open file handle prevents deletion. The defer below is a safety net for - // early returns on read errors. closeErr := rc.Close() if readErr != nil { return &probeError{step: "read", err: readErr} From ca0803f760bd792cd81416628b37617100ebcf16 Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 20:41:18 +0200 Subject: [PATCH 09/16] docs: document storage health probe and new metric --- README.md | 21 ++++++++++++++++++++- cmd/proxy/main.go | 2 ++ config.example.yaml | 9 +++++++++ docs/architecture.md | 2 +- 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 22beaaf..35c8816 100644 --- a/README.md +++ b/README.md @@ -577,7 +577,7 @@ Recently cached: | Endpoint | Description | |----------|-------------| | `GET /` | Dashboard (web UI) | -| `GET /health` | Health check (returns "ok" if healthy) | +| `GET /health` | Health check (JSON; HTTP 200 healthy, 503 unhealthy) | | `GET /stats` | Cache statistics (JSON) | | `GET /metrics` | Prometheus metrics | | `GET /npm/*` | npm registry protocol | @@ -815,9 +815,28 @@ The proxy exposes Prometheus metrics at `GET /metrics`. All metric names are pre | `proxy_storage_operation_duration_seconds` | histogram | `operation` | Storage read/write latency | | `proxy_storage_errors_total` | counter | `operation` | Storage read/write failures | | `proxy_active_requests` | gauge | | In-flight requests | +| `proxy_health_probe_failures_total` | Counter | `step` | Storage health probe failures by failing step (`write`, `size`, `read`, `verify`, `delete`). | Cache size and artifact count are refreshed every 60 seconds. The remaining metrics update on each request. +### Health Check + +`/health` returns a structured JSON report of subsystem health. HTTP 200 if all checks pass; 503 if any fail. + +```json +{ + "status": "ok", + "checks": { + "database": {"status": "ok"}, + "storage": {"status": "ok"} + } +} +``` + +Failing checks include an `"error"` field. Storage failures also include a `"step"` field identifying which probe step failed (`write`, `size`, `read`, `verify`, `delete`). + +Storage probe results are cached for `health.storage_probe_interval` (default 30s) to bound the cost of probing remote backends. + Scrape config for Prometheus: ```yaml diff --git a/cmd/proxy/main.go b/cmd/proxy/main.go index 946d12a..4c9ca56 100644 --- a/cmd/proxy/main.go +++ b/cmd/proxy/main.go @@ -77,6 +77,7 @@ // PROXY_GRADLE_BUILD_CACHE_MAX_AGE - Gradle cache max age eviction // PROXY_GRADLE_BUILD_CACHE_MAX_SIZE - Gradle cache max total size // PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL - Gradle cache eviction sweep interval +// PROXY_HEALTH_STORAGE_PROBE_INTERVAL - Storage health probe cache interval (default "30s") // // Example: // @@ -203,6 +204,7 @@ func runServe() { fmt.Fprintf(os.Stderr, " PROXY_GRADLE_BUILD_CACHE_MAX_AGE Gradle cache max age eviction\n") fmt.Fprintf(os.Stderr, " PROXY_GRADLE_BUILD_CACHE_MAX_SIZE Gradle cache max total size\n") fmt.Fprintf(os.Stderr, " PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL Gradle cache eviction sweep interval\n") + fmt.Fprintf(os.Stderr, " PROXY_HEALTH_STORAGE_PROBE_INTERVAL Storage health probe cache interval\n") } _ = fs.Parse(os.Args[1:]) diff --git a/config.example.yaml b/config.example.yaml index 4505849..853f47a 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -128,6 +128,15 @@ gradle: # How often eviction runs when max_age or max_size is set sweep_interval: "10m" +# Health endpoint configuration. +health: + # Minimum time between storage backend probes. + # The /health endpoint runs a write/read/verify/delete round-trip + # against the configured storage backend and caches the result for + # this interval. Set to "0" to probe on every request. + # Default: "30s". + storage_probe_interval: "30s" + # Version cooldown configuration # Hides package versions published too recently, giving the community time # to spot malicious releases before they're pulled into projects. diff --git a/docs/architecture.md b/docs/architecture.md index 81c41cf..b0ca6d8 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -277,7 +277,7 @@ HTTP server setup, web UI, and API handlers. - Web UI: dashboard, package browser, source browser, version comparison - Templates are embedded in the binary via `//go:embed` - Enrichment API for package metadata, vulnerability scanning, and outdated detection -- Health, stats, and Prometheus metrics endpoints +- Health, stats, and Prometheus metrics endpoints. `/health` runs an active write → read → verify → delete probe against the storage backend and returns a structured JSON response (`HealthResponse`) with `"ok"` / `"error"` status per subsystem. Probe results are cached (default 30 s, configurable via `health.storage_probe_interval`) to avoid overwhelming remote backends. ### `internal/metrics` From c9f1231226252bf0852d3c3050b8873d8537488a Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 20:58:56 +0200 Subject: [PATCH 10/16] docs: regenerate Swagger for /health JSON response --- docs/swagger/docs.go | 34 +++++++++++++++++++++++++++++++--- docs/swagger/swagger.json | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/docs/swagger/docs.go b/docs/swagger/docs.go index 34aedbf..23ff54a 100644 --- a/docs/swagger/docs.go +++ b/docs/swagger/docs.go @@ -399,7 +399,7 @@ const docTemplate = `{ "/health": { "get": { "produces": [ - "text/plain" + "application/json" ], "tags": [ "meta" @@ -409,13 +409,13 @@ const docTemplate = `{ "200": { "description": "OK", "schema": { - "type": "string" + "$ref": "#/definitions/server.HealthResponse" } }, "503": { "description": "Service Unavailable", "schema": { - "type": "string" + "$ref": "#/definitions/server.HealthResponse" } } } @@ -515,6 +515,34 @@ const docTemplate = `{ } } }, + "server.HealthCheck": { + "type": "object", + "properties": { + "error": { + "type": "string" + }, + "status": { + "type": "string" + }, + "step": { + "type": "string" + } + } + }, + "server.HealthResponse": { + "type": "object", + "properties": { + "checks": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/server.HealthCheck" + } + }, + "status": { + "type": "string" + } + } + }, "server.OutdatedPackage": { "type": "object", "properties": { diff --git a/docs/swagger/swagger.json b/docs/swagger/swagger.json index b775e63..c2b4dfc 100644 --- a/docs/swagger/swagger.json +++ b/docs/swagger/swagger.json @@ -392,7 +392,7 @@ "/health": { "get": { "produces": [ - "text/plain" + "application/json" ], "tags": [ "meta" @@ -402,13 +402,13 @@ "200": { "description": "OK", "schema": { - "type": "string" + "$ref": "#/definitions/server.HealthResponse" } }, "503": { "description": "Service Unavailable", "schema": { - "type": "string" + "$ref": "#/definitions/server.HealthResponse" } } } @@ -508,6 +508,34 @@ } } }, + "server.HealthCheck": { + "type": "object", + "properties": { + "error": { + "type": "string" + }, + "status": { + "type": "string" + }, + "step": { + "type": "string" + } + } + }, + "server.HealthResponse": { + "type": "object", + "properties": { + "checks": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/server.HealthCheck" + } + }, + "status": { + "type": "string" + } + } + }, "server.OutdatedPackage": { "type": "object", "properties": { From f39a3e31f5d327d992e85f10c49f796b82adf26f Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Tue, 12 May 2026 21:31:52 +0200 Subject: [PATCH 11/16] server: simplify rc.Close error handling in storageProbe --- internal/server/health.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/internal/server/health.go b/internal/server/health.go index d875122..4424558 100644 --- a/internal/server/health.go +++ b/internal/server/health.go @@ -76,14 +76,10 @@ func storageProbe(ctx context.Context, s storage.Storage) error { // Close explicitly (not deferred) so the file handle is released before // Delete — on Windows, an open handle prevents deletion. data, readErr := io.ReadAll(rc) - closeErr := rc.Close() + _ = rc.Close() if readErr != nil { return &probeError{step: "read", err: readErr} } - if closeErr != nil { - // Non-fatal on the happy path; log at the caller level. - _ = closeErr - } // 5. Verify if !bytes.Equal(data, payload) { return &probeError{step: "verify", err: fmt.Errorf("content mismatch")} From 4ac6049790408de2ad22e2f9a0c70bcf80d28ff9 Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Sat, 16 May 2026 00:11:18 +0200 Subject: [PATCH 12/16] server: defer probe cleanup so size/open/read/verify failures don't leak objects Previously, storageProbe only called Delete on the success path. Any failure between Store and the final Delete (size mismatch, Open error, mid-stream read failure, content mismatch) left the probe object orphaned in the storage backend. With caching disabled and Kubernetes-rate probing, the leak could accumulate noticeably on backends like S3. Use a named return + defer to attempt Delete after every successful Store. The earlier-step failure remains the primary error; Delete failure only surfaces as step="delete" when nothing else went wrong. Add a table-driven test that asserts cleanup runs for each non-delete failure path. Reported by Copilot on #119. --- internal/server/health.go | 34 ++++++++++++++++++++-------------- internal/server/health_test.go | 30 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/internal/server/health.go b/internal/server/health.go index 4424558..bd34058 100644 --- a/internal/server/health.go +++ b/internal/server/health.go @@ -50,27 +50,36 @@ func (e *probeError) Unwrap() error { return e.err } // storageProbe runs a write → size-check → read → verify → delete round-trip // against the storage backend. Returns nil on success or a *probeError on failure. -func storageProbe(ctx context.Context, s storage.Storage) error { - suffix, err := randomSuffix() - if err != nil { - return &probeError{step: "write", err: fmt.Errorf("generating random suffix: %w", err)} +func storageProbe(ctx context.Context, s storage.Storage) (err error) { + suffix, suffixErr := randomSuffix() + if suffixErr != nil { + return &probeError{step: "write", err: fmt.Errorf("generating random suffix: %w", suffixErr)} } path := probePathPrefix + strconv.FormatInt(time.Now().UnixNano(), 10) + "-" + suffix payload := []byte(probeMarker + suffix) // 1. Store - size, _, err := s.Store(ctx, path, bytes.NewReader(payload)) - if err != nil { - return &probeError{step: "write", err: err} + size, _, storeErr := s.Store(ctx, path, bytes.NewReader(payload)) + if storeErr != nil { + return &probeError{step: "write", err: storeErr} } + // After Store succeeds, always attempt to delete on the way out so probe + // objects don't accumulate when a later step (size/open/read/verify) fails. + // Delete is reported as the primary error only if no earlier failure + // already set one. + defer func() { + if delErr := s.Delete(ctx, path); delErr != nil && err == nil { + err = &probeError{step: "delete", err: delErr} + } + }() // 2. Size check if size != int64(len(payload)) { return &probeError{step: "size", err: fmt.Errorf("wrote %d bytes, expected %d", size, len(payload))} } // 3. Open - rc, err := s.Open(ctx, path) - if err != nil { - return &probeError{step: "read", err: err} + rc, openErr := s.Open(ctx, path) + if openErr != nil { + return &probeError{step: "read", err: openErr} } // 4. Read all (classify mid-stream errors as read, not verify). // Close explicitly (not deferred) so the file handle is released before @@ -84,10 +93,7 @@ func storageProbe(ctx context.Context, s storage.Storage) error { if !bytes.Equal(data, payload) { return &probeError{step: "verify", err: fmt.Errorf("content mismatch")} } - // 6. Delete - if err := s.Delete(ctx, path); err != nil { - return &probeError{step: "delete", err: err} - } + // 6. Delete is handled via the deferred cleanup above. return nil } diff --git a/internal/server/health_test.go b/internal/server/health_test.go index d32face..f224c4e 100644 --- a/internal/server/health_test.go +++ b/internal/server/health_test.go @@ -228,6 +228,36 @@ func TestStorageProbe_DeleteFails(t *testing.T) { } } +// TestStorageProbe_CleanupOnNonDeleteFailure asserts that the probe object is +// deleted even when a step after Store (size/open/read/verify) fails, so +// probe artifacts don't accumulate in the storage backend. +func TestStorageProbe_CleanupOnNonDeleteFailure(t *testing.T) { + cases := []struct { + name string + inject func(*fakeStorage) + wantErr string + }{ + {"size mismatch", func(fs *fakeStorage) { fs.sizeDelta = -1 }, "size"}, + {"open fails", func(fs *fakeStorage) { fs.openErr = errors.New("open boom") }, "read"}, + {"read mid-stream", func(fs *fakeStorage) { fs.readErr = errors.New("mid-stream boom") }, "read"}, + {"content mismatch", func(fs *fakeStorage) { fs.readOverride = []byte("wrong") }, "verify"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + fs := newFakeStorage() + tc.inject(fs) + err := storageProbe(context.Background(), fs) + var pe *probeError + if !errors.As(err, &pe) || pe.step != tc.wantErr { + t.Fatalf("step = %v, want %q; err = %v", pe, tc.wantErr, err) + } + if got := fs.deleteCalls.Load(); got != 1 { + t.Errorf("deleteCalls = %d, want 1 (cleanup should run on non-delete failures)", got) + } + }) + } +} + func TestStorageProbe_ReaderClosedOnReadFailure(t *testing.T) { fs := newFakeStorage() fs.readErr = errors.New("read error") From 4c990549cf807616cfcd308bdb4f4c99095c13fb Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Sat, 16 May 2026 00:14:29 +0200 Subject: [PATCH 13/16] config: validate health.storage_probe_interval in Config.Validate The new duration field was only validated at use time in newHealthCache. The existing codebase already validates other duration fields (MetadataTTL, DirectServeTTL, Gradle.MaxAge, Gradle.SweepInterval) in Config.Validate() so misconfiguration fails fast at startup with a config-key-specific error. Match that pattern. The parse-at-use code in newHealthCache stays as a safety net, mirroring the MetadataTTL precedent. Reported by Copilot on #119. --- internal/config/config.go | 7 +++++++ internal/config/config_test.go | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/internal/config/config.go b/internal/config/config.go index aec0b8f..97e9080 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -496,6 +496,13 @@ func (c *Config) Validate() error { } } + // Validate health probe interval if specified + if c.Health.StorageProbeInterval != "" && c.Health.StorageProbeInterval != "0" { + if _, err := time.ParseDuration(c.Health.StorageProbeInterval); err != nil { + return fmt.Errorf("invalid health.storage_probe_interval %q: %w", c.Health.StorageProbeInterval, err) + } + } + if err := c.Gradle.BuildCache.Validate(); err != nil { return err } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 26a0fc6..175a6c1 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -432,6 +432,29 @@ func TestValidateMetadataTTL(t *testing.T) { } } +func TestValidateHealthStorageProbeInterval(t *testing.T) { + cfg := Default() + cfg.Health.StorageProbeInterval = "not-a-duration" + if err := cfg.Validate(); err == nil { + t.Error("expected validation error for invalid health.storage_probe_interval") + } + + cfg.Health.StorageProbeInterval = "30s" + if err := cfg.Validate(); err != nil { + t.Errorf("unexpected error for valid health.storage_probe_interval: %v", err) + } + + cfg.Health.StorageProbeInterval = "0" + if err := cfg.Validate(); err != nil { + t.Errorf("unexpected error for zero health.storage_probe_interval: %v", err) + } + + cfg.Health.StorageProbeInterval = "" + if err := cfg.Validate(); err != nil { + t.Errorf("unexpected error for empty health.storage_probe_interval: %v", err) + } +} + func TestLoadMetadataTTLFromEnv(t *testing.T) { cfg := Default() t.Setenv("PROXY_METADATA_TTL", "10m") From f296af40e03e3fbe5ed380ba79080cca51f4bd9c Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Sat, 16 May 2026 00:15:49 +0200 Subject: [PATCH 14/16] docs: lowercase "counter" in metrics table for consistency Other rows in the table use lowercase type names (counter/gauge/histogram). Match that style. Reported by Copilot on #119. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 35c8816..a646379 100644 --- a/README.md +++ b/README.md @@ -815,7 +815,7 @@ The proxy exposes Prometheus metrics at `GET /metrics`. All metric names are pre | `proxy_storage_operation_duration_seconds` | histogram | `operation` | Storage read/write latency | | `proxy_storage_errors_total` | counter | `operation` | Storage read/write failures | | `proxy_active_requests` | gauge | | In-flight requests | -| `proxy_health_probe_failures_total` | Counter | `step` | Storage health probe failures by failing step (`write`, `size`, `read`, `verify`, `delete`). | +| `proxy_health_probe_failures_total` | counter | `step` | Storage health probe failures by failing step (`write`, `size`, `read`, `verify`, `delete`). | Cache size and artifact count are refreshed every 60 seconds. The remaining metrics update on each request. From 1b3cbfa749b946dea1612cbd28fba2374b861da1 Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Sat, 16 May 2026 00:16:57 +0200 Subject: [PATCH 15/16] docs: include size-check step in /health probe description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe is write → size-check → read → verify → delete; the architecture note was missing the size-check step. Reported by Copilot on #119. --- docs/architecture.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/architecture.md b/docs/architecture.md index b0ca6d8..d71c4ef 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -277,7 +277,7 @@ HTTP server setup, web UI, and API handlers. - Web UI: dashboard, package browser, source browser, version comparison - Templates are embedded in the binary via `//go:embed` - Enrichment API for package metadata, vulnerability scanning, and outdated detection -- Health, stats, and Prometheus metrics endpoints. `/health` runs an active write → read → verify → delete probe against the storage backend and returns a structured JSON response (`HealthResponse`) with `"ok"` / `"error"` status per subsystem. Probe results are cached (default 30 s, configurable via `health.storage_probe_interval`) to avoid overwhelming remote backends. +- Health, stats, and Prometheus metrics endpoints. `/health` runs an active write → size-check → read → verify → delete probe against the storage backend and returns a structured JSON response (`HealthResponse`) with `"ok"` / `"error"` status per subsystem. Probe results are cached (default 30 s, configurable via `health.storage_probe_interval`) to avoid overwhelming remote backends. ### `internal/metrics` From c89cb2fdf151caf447842ae96dc36882bd8ff005 Mon Sep 17 00:00:00 2001 From: Lars Wallenborn Date: Sat, 16 May 2026 00:23:08 +0200 Subject: [PATCH 16/16] server: address andrew's review on #119 - Drop unused callerCtx parameter from healthCache.Check (Check is now parameter-less; the comment-only "accepted for symmetry" justification wasn't carrying its weight). - Emit "storage": {"status": "skipped"} on DB short-circuit instead of omitting the key, so monitors expecting a fixed key set keep working. - Reject negative storage_probe_interval at config validation time (previously parsed and silently behaved like "0"). - Extract HealthConfig.Validate to keep Config.Validate under the gocognit threshold and match the existing GradleBuildCacheConfig pattern. - README Health Check section: note that /health is intended as a readiness probe rather than a liveness probe (Check holds a mutex for up to the 10s probe timeout). - cmd/proxy/main.go godoc: column-align the new env var with the surrounding Gradle entries. Reported by andrew on #119. --- README.md | 4 ++-- cmd/proxy/main.go | 2 +- internal/config/config.go | 23 +++++++++++++++---- internal/config/config_test.go | 5 ++++ internal/server/health.go | 10 ++++---- internal/server/health_test.go | 42 +++++++++++++--------------------- internal/server/server.go | 5 +++- internal/server/server_test.go | 7 ++++-- 8 files changed, 55 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index a646379..a5e2e23 100644 --- a/README.md +++ b/README.md @@ -833,9 +833,9 @@ Cache size and artifact count are refreshed every 60 seconds. The remaining metr } ``` -Failing checks include an `"error"` field. Storage failures also include a `"step"` field identifying which probe step failed (`write`, `size`, `read`, `verify`, `delete`). +Failing checks include an `"error"` field. Storage failures also include a `"step"` field identifying which probe step failed (`write`, `size`, `read`, `verify`, `delete`). When the database check fails, the storage entry reports `{"status": "skipped"}` so the response always carries the same key set. -Storage probe results are cached for `health.storage_probe_interval` (default 30s) to bound the cost of probing remote backends. +Storage probe results are cached for `health.storage_probe_interval` (default 30s) to bound the cost of probing remote backends. A probe holds an internal mutex for up to 10 seconds (the hardcoded per-probe timeout), so `/health` is intended as a Kubernetes **readiness** probe rather than a liveness probe — a slow S3 round-trip should pull the pod from rotation, not restart it. Scrape config for Prometheus: diff --git a/cmd/proxy/main.go b/cmd/proxy/main.go index 4c9ca56..8b28535 100644 --- a/cmd/proxy/main.go +++ b/cmd/proxy/main.go @@ -77,7 +77,7 @@ // PROXY_GRADLE_BUILD_CACHE_MAX_AGE - Gradle cache max age eviction // PROXY_GRADLE_BUILD_CACHE_MAX_SIZE - Gradle cache max total size // PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL - Gradle cache eviction sweep interval -// PROXY_HEALTH_STORAGE_PROBE_INTERVAL - Storage health probe cache interval (default "30s") +// PROXY_HEALTH_STORAGE_PROBE_INTERVAL - Storage health probe cache interval (default "30s") // // Example: // diff --git a/internal/config/config.go b/internal/config/config.go index 97e9080..8727884 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -496,11 +496,8 @@ func (c *Config) Validate() error { } } - // Validate health probe interval if specified - if c.Health.StorageProbeInterval != "" && c.Health.StorageProbeInterval != "0" { - if _, err := time.ParseDuration(c.Health.StorageProbeInterval); err != nil { - return fmt.Errorf("invalid health.storage_probe_interval %q: %w", c.Health.StorageProbeInterval, err) - } + if err := c.Health.Validate(); err != nil { + return err } if err := c.Gradle.BuildCache.Validate(); err != nil { @@ -510,6 +507,22 @@ func (c *Config) Validate() error { return nil } +// Validate checks the /health configuration. An unset interval is allowed +// (the cache uses its default); explicit values must parse and be non-negative. +func (h *HealthConfig) Validate() error { + if h.StorageProbeInterval == "" || h.StorageProbeInterval == "0" { + return nil + } + d, err := time.ParseDuration(h.StorageProbeInterval) + if err != nil { + return fmt.Errorf("invalid health.storage_probe_interval %q: %w", h.StorageProbeInterval, err) + } + if d < 0 { + return fmt.Errorf("invalid health.storage_probe_interval %q: must be non-negative", h.StorageProbeInterval) + } + return nil +} + // Validate checks Gradle build cache settings, applying the default upload // size if unset. func (g *GradleBuildCacheConfig) Validate() error { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 175a6c1..22694c0 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -453,6 +453,11 @@ func TestValidateHealthStorageProbeInterval(t *testing.T) { if err := cfg.Validate(); err != nil { t.Errorf("unexpected error for empty health.storage_probe_interval: %v", err) } + + cfg.Health.StorageProbeInterval = "-5s" + if err := cfg.Validate(); err == nil { + t.Error("expected validation error for negative health.storage_probe_interval") + } } func TestLoadMetadataTTLFromEnv(t *testing.T) { diff --git a/internal/server/health.go b/internal/server/health.go index bd34058..f4e4847 100644 --- a/internal/server/health.go +++ b/internal/server/health.go @@ -139,12 +139,10 @@ func newHealthCache(s storage.Storage, intervalStr string, logger *slog.Logger) } // Check returns the cached probe result if still fresh, otherwise runs a fresh probe. -// The callerCtx parameter is accepted for symmetry with handler signatures but is -// intentionally NOT passed to the probe — the probe runs under a context derived -// from context.Background() with a fixed timeout so that caller cancellation -// (e.g. client disconnect) cannot poison the cache with context.Canceled. -func (c *healthCache) Check(callerCtx context.Context) error { - _ = callerCtx // see comment above +// The probe runs under a context derived from context.Background() with a fixed +// timeout so that caller cancellation (e.g. client disconnect) cannot poison the +// cache with context.Canceled. +func (c *healthCache) Check() error { c.mu.Lock() defer c.mu.Unlock() diff --git a/internal/server/health_test.go b/internal/server/health_test.go index f224c4e..c0f70c9 100644 --- a/internal/server/health_test.go +++ b/internal/server/health_test.go @@ -303,10 +303,10 @@ func discardLogger() *slog.Logger { func TestHealthCache_CacheHit(t *testing.T) { fs := newFakeStorage() c := newTestCache(fs, 30*time.Second) - if err := c.Check(context.Background()); err != nil { + if err := c.Check(); err != nil { t.Fatalf("first check: %v", err) } - if err := c.Check(context.Background()); err != nil { + if err := c.Check(); err != nil { t.Fatalf("second check: %v", err) } if got := fs.storeCalls.Load(); got != 1 { @@ -317,9 +317,9 @@ func TestHealthCache_CacheHit(t *testing.T) { func TestHealthCache_MissAfterTTL(t *testing.T) { fs := newFakeStorage() c := newTestCache(fs, 10*time.Millisecond) - _ = c.Check(context.Background()) + _ = c.Check() time.Sleep(20 * time.Millisecond) - _ = c.Check(context.Background()) + _ = c.Check() if got := fs.storeCalls.Load(); got != 2 { t.Errorf("storeCalls = %d, want 2", got) } @@ -328,8 +328,8 @@ func TestHealthCache_MissAfterTTL(t *testing.T) { func TestHealthCache_Disabled(t *testing.T) { fs := newFakeStorage() c := newTestCache(fs, 0) // interval = 0 means probe every call - _ = c.Check(context.Background()) - _ = c.Check(context.Background()) + _ = c.Check() + _ = c.Check() if got := fs.storeCalls.Load(); got != 2 { t.Errorf("storeCalls = %d, want 2", got) } @@ -339,7 +339,7 @@ func TestHealthCache_LastAtNotAdvancedOnHit(t *testing.T) { fs := newFakeStorage() c := newTestCache(fs, 30*time.Second) for i := 0; i < 100; i++ { - _ = c.Check(context.Background()) + _ = c.Check() } if got := fs.storeCalls.Load(); got != 1 { t.Errorf("storeCalls = %d, want 1 across 100 hits", got) @@ -352,7 +352,7 @@ func TestHealthCache_ConcurrentSingleFlight(t *testing.T) { var wg sync.WaitGroup for i := 0; i < 20; i++ { wg.Add(1) - go func() { defer wg.Done(); _ = c.Check(context.Background()) }() + go func() { defer wg.Done(); _ = c.Check() }() } wg.Wait() if got := fs.storeCalls.Load(); got != 1 { @@ -360,16 +360,6 @@ func TestHealthCache_ConcurrentSingleFlight(t *testing.T) { } } -func TestHealthCache_CallerCancellationNotPoisoning(t *testing.T) { - fs := newFakeStorage() - c := newTestCache(fs, 30*time.Second) - ctx, cancel := context.WithCancel(context.Background()) - cancel() // Already cancelled before the call - if err := c.Check(ctx); err != nil { - t.Fatalf("Check with cancelled caller ctx should still succeed: %v", err) - } -} - func TestHealthCache_FailureCounterIncrement(t *testing.T) { fs := newFakeStorage() fs.storeErr = errors.New("boom") @@ -378,14 +368,14 @@ func TestHealthCache_FailureCounterIncrement(t *testing.T) { before := testutil.ToFloat64(metrics.HealthProbeFailures.WithLabelValues("write")) // First call: fresh probe → counter +1 - _ = c.Check(context.Background()) + _ = c.Check() afterFirst := testutil.ToFloat64(metrics.HealthProbeFailures.WithLabelValues("write")) if afterFirst-before != 1 { t.Errorf("counter delta after first call = %v, want 1", afterFirst-before) } // Second call: cache hit → counter NOT re-incremented - _ = c.Check(context.Background()) + _ = c.Check() afterSecond := testutil.ToFloat64(metrics.HealthProbeFailures.WithLabelValues("write")) if afterSecond != afterFirst { t.Errorf("counter changed on cache hit: %v → %v", afterFirst, afterSecond) @@ -404,7 +394,7 @@ func TestHealthCache_ProbeTimeout(t *testing.T) { logger: discardLogger(), } start := time.Now() - err := c.Check(context.Background()) + err := c.Check() elapsed := time.Since(start) if err == nil { @@ -427,8 +417,8 @@ func TestHealthCache_TransitionLogging(t *testing.T) { } // Steady ok state — should not log - _ = c.Check(context.Background()) - _ = c.Check(context.Background()) + _ = c.Check() + _ = c.Check() if got := strings.Count(buf.String(), "storage probe"); got != 0 { t.Errorf("steady-state logs = %d, want 0; output: %s", got, buf.String()) } @@ -436,14 +426,14 @@ func TestHealthCache_TransitionLogging(t *testing.T) { // ok → err transition: exactly one Error log buf.Reset() fs.storeErr = errors.New("boom") - _ = c.Check(context.Background()) + _ = c.Check() if !strings.Contains(buf.String(), "storage probe failed") { t.Errorf("missing failure log on transition; output: %s", buf.String()) } // err steady state — should not log again buf.Reset() - _ = c.Check(context.Background()) + _ = c.Check() if buf.Len() != 0 { t.Errorf("steady-err logs = %q, want empty", buf.String()) } @@ -451,7 +441,7 @@ func TestHealthCache_TransitionLogging(t *testing.T) { // err → ok transition: exactly one Info log buf.Reset() fs.storeErr = nil - _ = c.Check(context.Background()) + _ = c.Check() if !strings.Contains(buf.String(), "storage probe recovered") { t.Errorf("missing recovery log on transition; output: %s", buf.String()) } diff --git a/internal/server/server.go b/internal/server/server.go index 8b25317..54df362 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -823,9 +823,12 @@ func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) { resp := HealthResponse{Status: "ok", Checks: map[string]HealthCheck{}} // Database check (short-circuit; do not waste a storage probe call when DB is down). + // On DB failure the storage entry reports "skipped" rather than being omitted so + // the response always carries the same key set for monitors that expect it. if _, err := s.db.SchemaVersion(); err != nil { resp.Status = "error" resp.Checks["database"] = HealthCheck{Status: "error", Error: err.Error()} + resp.Checks["storage"] = HealthCheck{Status: "skipped"} w.WriteHeader(http.StatusServiceUnavailable) _ = json.NewEncoder(w).Encode(resp) return @@ -833,7 +836,7 @@ func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) { resp.Checks["database"] = HealthCheck{Status: "ok"} // Storage probe (via cache). - if err := s.healthCache.Check(r.Context()); err != nil { + if err := s.healthCache.Check(); err != nil { resp.Status = "error" sc := HealthCheck{Status: "error", Error: err.Error()} var pe *probeError diff --git a/internal/server/server_test.go b/internal/server/server_test.go index 31527e1..e2dc1c2 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -231,8 +231,11 @@ func TestHealthEndpoint_DBFailureShortCircuits(t *testing.T) { if resp.Checks["database"].Status != "error" { t.Errorf("database check = %+v, want error", resp.Checks["database"]) } - if _, present := resp.Checks["storage"]; present { - t.Errorf("storage key should be absent on DB short-circuit, got %+v", resp.Checks["storage"]) + storage, present := resp.Checks["storage"] + if !present { + t.Error("storage key should be present (with status=skipped) on DB short-circuit") + } else if storage.Status != "skipped" { + t.Errorf("storage check = %+v, want status=skipped", storage) } }