diff --git a/.github/renovate.json b/.github/renovate.json index 65083e8fa..a49aba388 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -22,6 +22,21 @@ "gomodTidy", "gomodUpdateImportPaths" ], + "regexManagers": [ + { + "fileMatch": ["^postgres/Dockerfile$"], + "matchStrings": ["FROM (?<depName>[^:\\n]+):(?<currentValue>[^@\\n]+)@sha256:(?<currentDigest>[a-f0-9]+)"], + "datasourceTemplate": "docker" + }, + { + "fileMatch": ["^postgres/Dockerfile$"], + "matchStrings": ["ENV PG_VERSION (?<currentValue>[\\d]+\\.[\\d]+)-[^\\n]+"], + "depNameTemplate": "postgres", + "datasourceTemplate": "docker", + "versioningTemplate": "semver-coerced", + "autoReplaceStringTemplate": "ENV PG_VERSION {{{newValue}}}-1.pgdg13+1" + } + ], "packageRules": [ { "matchPackageNames": [ @@ -29,6 +44,17 @@ ], "allowedVersions": "1.26.x" }, + { + "matchPackageNames": [ + "postgres" + ], + "matchFileNames": [ + "postgres/Dockerfile" + ], + "allowedVersions": "17.x", + "automerge": true, + "groupName": "postgres Dockerfile" + }, { "matchPackageNames": [ "/^github\\.com\\/sapcc\\/.*/" diff --git a/cmd/main.go b/cmd/main.go index fc3c9b1b8..86e85855b 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -7,10 +7,14 @@ import ( "context" "crypto/tls" "flag" + "log/slog" "net/http" + + uberzap "go.uber.org/zap" "os" "path/filepath" "slices" + "strings" "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) @@ -143,12 +147,39 @@ func main() { flag.BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") opts := zap.Options{ - Development: true, + Development: false, } opts.BindFlags(flag.CommandLine) flag.Parse() - ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + ctrl.SetLogger(zap.New( + zap.UseFlagOptions(&opts), + zap.RawZapOpts(uberzap.WrapCore(monitoring.WrapCoreWithLogMetrics)), + )) + + // Configure slog (used across internal packages) with JSON output and + // level control via the LOG_LEVEL environment variable. 
+ // Supported values: debug, info (default), warn, error. + slogLevel := new(slog.LevelVar) + slogLevel.Set(slog.LevelInfo) + if lvl := os.Getenv("LOG_LEVEL"); lvl != "" { + switch strings.ToLower(lvl) { + case "debug": + slogLevel.Set(slog.LevelDebug) + case "info": + slogLevel.Set(slog.LevelInfo) + case "warn", "warning": + slogLevel.Set(slog.LevelWarn) + case "error": + slogLevel.Set(slog.LevelError) + } + } + slog.SetDefault(slog.New(monitoring.NewMetricsSlogHandler( + slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: slogLevel, + }), + ))) + slog.Info("slog configured", "level", slogLevel.Level().String()) // Log the main configuration setupLog.Info("loaded main configuration", @@ -301,6 +332,7 @@ func main() { // This is useful to distinguish metrics from different deployments. metricsConfig := conf.GetConfigOrDie[monitoring.Config]() metrics.Registry = monitoring.WrapRegistry(metrics.Registry, metricsConfig) + metrics.Registry.MustRegister(monitoring.LogMessagesTotal) // TODO: Remove me after scheduling pipeline steps don't require DB connections anymore. metrics.Registry.MustRegister(&db.Monitor) diff --git a/cortex.secrets.example.yaml b/cortex.secrets.example.yaml index daab61cc9..56d09c8bc 100644 --- a/cortex.secrets.example.yaml +++ b/cortex.secrets.example.yaml @@ -1,8 +1,9 @@ # Copyright SAP SE # SPDX-License-Identifier: Apache-2.0 -# Override config values that contain sensitive information or -# are specific to your environment. These values can be used in the Tiltfile. +# Override config values for local development. This includes secrets, +# environment-specific settings, and logging configuration. +# These values can be used in the Tiltfile. # SSO certificate to use. sharedSSOCert: &sharedSSOCert @@ -20,6 +21,23 @@ sharedSSOCert: &sharedSSOCert # If true, the certificate is not verified. selfSigned: "false" +# Logging configuration for local development. +# Set logLevel to "debug" for verbose output from both zap and slog loggers. 
+# Set zapDevel to true for human-readable console logs instead of JSON. +# These apply per sub-chart, e.g. for cortex-nova: +# +# cortex-scheduling-controllers: +# controllerManager: +# container: +# logLevel: "debug" +# zapDevel: true +# +# cortex-knowledge-controllers: +# controllerManager: +# container: +# logLevel: "debug" +# zapDevel: true + # Enable kvm pipelines and scheduling support. kvm: enabled: true diff --git a/go.mod b/go.mod index 2aa5950e6..83ec1bcfc 100644 --- a/go.mod +++ b/go.mod @@ -96,15 +96,15 @@ require ( github.com/ziutek/mymysql v1.5.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect - go.opentelemetry.io/otel v1.40.0 // indirect + go.opentelemetry.io/otel v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect - go.opentelemetry.io/otel/metric v1.40.0 // indirect - go.opentelemetry.io/otel/sdk v1.40.0 // indirect - go.opentelemetry.io/otel/trace v1.40.0 // indirect + go.opentelemetry.io/otel/metric v1.43.0 // indirect + go.opentelemetry.io/otel/sdk v1.43.0 // indirect + go.opentelemetry.io/otel/trace v1.43.0 // indirect go.opentelemetry.io/proto/otlp v1.8.0 // indirect go.uber.org/multierr v1.11.0 // indirect - go.uber.org/zap v1.27.1 // indirect + go.uber.org/zap v1.27.1 go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect diff --git a/go.sum b/go.sum index a76b5a380..46b5a742a 100644 --- a/go.sum +++ b/go.sum @@ -241,20 +241,20 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= 
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= -go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= -go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= -go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= -go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= -go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= -go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= -go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw= -go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= -go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= -go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod 
h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= go.opentelemetry.io/proto/otlp v1.8.0 h1:fRAZQDcAFHySxpJ1TwlA1cJ4tvcrw7nXl9xWWC8N5CE= go.opentelemetry.io/proto/otlp v1.8.0/go.mod h1:tIeYOeNBU4cvmPqpaji1P+KbB4Oloai8wN4rWzRrFF0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= diff --git a/helm/bundles/cortex-cinder/Chart.yaml b/helm/bundles/cortex-cinder/Chart.yaml index 7dc1e0b45..16b78ff9f 100644 --- a/helm/bundles/cortex-cinder/Chart.yaml +++ b/helm/bundles/cortex-cinder/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.13 + version: 0.5.14 # from: file://../../library/cortex - name: cortex diff --git a/helm/bundles/cortex-manila/Chart.yaml b/helm/bundles/cortex-manila/Chart.yaml index 516b71070..f3b846b51 100644 --- a/helm/bundles/cortex-manila/Chart.yaml +++ b/helm/bundles/cortex-manila/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.13 + version: 0.5.14 # from: file://../../library/cortex - name: cortex diff --git a/helm/bundles/cortex-nova/Chart.yaml b/helm/bundles/cortex-nova/Chart.yaml index 7976d0b5c..2fc6574b1 100644 --- a/helm/bundles/cortex-nova/Chart.yaml +++ b/helm/bundles/cortex-nova/Chart.yaml @@ -11,7 +11,7 @@ dependencies: # from: file://../../library/cortex-postgres - name: cortex-postgres repository: oci://ghcr.io/cobaltcore-dev/cortex/charts - version: 0.5.13 + version: 0.5.14 # from: 
file://../../library/cortex - name: cortex diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 41bf29794..f7b4f180e 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -731,4 +731,4 @@ groups: The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. This may indicate issues with the webhook logic, connectivity problems, or external factors causing failures. Check the webhook server logs for error - details and investigate the affected resources. \ No newline at end of file + details and investigate the affected resources. diff --git a/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml b/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml index 6b3d9fcbc..82d7ce2dd 100644 --- a/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/knowledges_kvm.yaml @@ -23,6 +23,7 @@ metadata: name: kvm-libvirt-domain-cpu-steal-pct spec: schedulingDomain: nova + recency: "60s" extractor: name: kvm_libvirt_domain_cpu_steal_pct_extractor description: | diff --git a/helm/library/cortex-postgres/Chart.yaml b/helm/library/cortex-postgres/Chart.yaml index 39710ebad..5c5acf169 100644 --- a/helm/library/cortex-postgres/Chart.yaml +++ b/helm/library/cortex-postgres/Chart.yaml @@ -5,5 +5,5 @@ apiVersion: v2 name: cortex-postgres description: Postgres setup for Cortex. type: application -version: 0.5.13 +version: 0.5.14 appVersion: "sha-6db36b81" diff --git a/helm/library/cortex/templates/_helpers.tpl b/helm/library/cortex/templates/_helpers.tpl index 782e14eef..3afa8c2f7 100644 --- a/helm/library/cortex/templates/_helpers.tpl +++ b/helm/library/cortex/templates/_helpers.tpl @@ -41,6 +41,22 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{ $hasMutating }}}}{{- end }} +{{/* +chart.argsContainPrefix checks if any string in args starts with prefix. 
+Usage: include "chart.argsContainPrefix" (dict "prefix" "--zap-log-level" "args" .Values.controllerManager.container.args) +Returns "true" or "false". +*/}} +{{- define "chart.argsContainPrefix" -}} +{{- $prefix := .prefix -}} +{{- $result := dict "found" "false" -}} +{{- range .args -}} + {{- if hasPrefix $prefix . -}} + {{- $_ := set $result "found" "true" -}} + {{- end -}} +{{- end -}} +{{- get $result "found" -}} +{{- end -}} + {{- define "chart.hasValidatingWebhooks" -}} {{- $hasValidating := false }} {{- range . }} diff --git a/helm/library/cortex/templates/manager/manager.yaml b/helm/library/cortex/templates/manager/manager.yaml index 73672164f..f3dd0c91f 100644 --- a/helm/library/cortex/templates/manager/manager.yaml +++ b/helm/library/cortex/templates/manager/manager.yaml @@ -35,6 +35,12 @@ spec: {{- range .Values.controllerManager.container.args }} - {{ . }} {{- end }} + {{- if and .Values.controllerManager.container.logLevel (ne (include "chart.argsContainPrefix" (dict "prefix" "--zap-log-level" "args" .Values.controllerManager.container.args)) "true") }} + - "--zap-log-level={{ .Values.controllerManager.container.logLevel }}" + {{- end }} + {{- if and .Values.controllerManager.container.zapDevel (ne (include "chart.argsContainPrefix" (dict "prefix" "--zap-devel" "args" .Values.controllerManager.container.args)) "true") }} + - "--zap-devel" + {{- end }} {{- if and .Values.webhook.enable .Values.certmanager.enable }} - "--webhook-cert-path=/tmp/k8s-webhook-server/serving-certs" {{- end }} @@ -56,13 +62,17 @@ spec: {{- if .Values.controllerManager.container.image.pullPolicy }} imagePullPolicy: {{ .Values.controllerManager.container.image.pullPolicy }} {{- end }} - {{- if .Values.controllerManager.container.env }} env: + {{- if and .Values.controllerManager.container.logLevel (not (and .Values.controllerManager.container.env (hasKey .Values.controllerManager.container.env "LOG_LEVEL"))) }} + - name: LOG_LEVEL + value: {{ 
.Values.controllerManager.container.logLevel | quote }} + {{- end }} + {{- if .Values.controllerManager.container.env }} {{- range $key, $value := .Values.controllerManager.container.env }} - name: {{ $key }} value: {{ $value }} {{- end }} - {{- end }} + {{- end }} livenessProbe: {{- toYaml .Values.controllerManager.container.livenessProbe | nindent 12 }} readinessProbe: @@ -140,4 +150,4 @@ data: {{- $mergedSecrets = mergeOverwrite .Values.secrets $mergedSecrets }} {{- end }} {{ toJson $mergedSecrets | b64enc }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/helm/library/cortex/values.yaml b/helm/library/cortex/values.yaml index ad3c7ba8b..f6b94c9ef 100644 --- a/helm/library/cortex/values.yaml +++ b/helm/library/cortex/values.yaml @@ -13,6 +13,14 @@ controllerManager: - "--metrics-bind-address=:2112" - "--health-probe-bind-address=:8081" - "--metrics-secure=false" + # Log level for both zap (controller-runtime) and slog (internal packages). + # Supported: debug, info (default), warn, error. + logLevel: "info" + # Enable zap development mode (human-readable console logs, development stack traces). + # This only changes output format and stack trace behavior, not the log level. + # The effective log level is controlled by logLevel above (default: "info"). + # Set to true for local development (e.g. Tilt), keep false for production. + zapDevel: false resources: limits: cpu: 500m diff --git a/internal/knowledge/extractor/controller.go b/internal/knowledge/extractor/controller.go index f1a641c22..35982675f 100644 --- a/internal/knowledge/extractor/controller.go +++ b/internal/knowledge/extractor/controller.go @@ -53,9 +53,10 @@ func (r *KnowledgeReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // Sanity checks. 
lastExtracted := knowledge.Status.LastExtracted.Time recency := knowledge.Spec.Recency.Duration - if lastExtracted.Add(recency).After(time.Now()) && knowledge.Status.RawLength != 0 { - log.Info("skipping knowledge extraction, not yet time", "name", knowledge.Name) - return ctrl.Result{RequeueAfter: time.Until(lastExtracted.Add(recency))}, nil + if lastExtracted.Add(recency).After(time.Now()) { + waitFor := time.Until(lastExtracted.Add(recency)) + log.Info("skipping knowledge extraction, not yet time", "name", knowledge.Name, "waitFor", waitFor) + return ctrl.Result{RequeueAfter: waitFor}, nil } extractor, ok := supportedExtractors[knowledge.Spec.Extractor.Name] diff --git a/internal/scheduling/lib/filter_weigher_pipeline.go b/internal/scheduling/lib/filter_weigher_pipeline.go index 6330912e6..ee769433d 100644 --- a/internal/scheduling/lib/filter_weigher_pipeline.go +++ b/internal/scheduling/lib/filter_weigher_pipeline.go @@ -62,7 +62,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest]( unknownFilters := []string{} for _, filterConfig := range confedFilters { slog.Info("scheduler: configuring filter", "name", filterConfig.Name) - slog.Info("supported:", "filters", maps.Keys(supportedFilters)) + slog.Info("supported:", "filters", slices.Sorted(maps.Keys(supportedFilters))) makeFilter, ok := supportedFilters[filterConfig.Name] if !ok { slog.Error("scheduler: unsupported filter", "name", filterConfig.Name) @@ -73,7 +73,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest]( filter = validateFilter(filter) filter = monitorFilter(filter, filterConfig.Name, pipelineMonitor) if err := filter.Init(ctx, client, filterConfig); err != nil { - slog.Error("scheduler: failed to initialize filter", "name", filterConfig.Name, "error", err) + slog.Warn("scheduler: failed to initialize filter", "name", filterConfig.Name, "error", err) filterErrors[filterConfig.Name] = errors.New("failed to initialize filter: " + err.Error()) 
continue } @@ -90,7 +90,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest]( unknownWeighers := []string{} for _, weigherConfig := range confedWeighers { slog.Info("scheduler: configuring weigher", "name", weigherConfig.Name) - slog.Info("supported:", "weighers", maps.Keys(supportedWeighers)) + slog.Info("supported:", "weighers", slices.Sorted(maps.Keys(supportedWeighers))) makeWeigher, ok := supportedWeighers[weigherConfig.Name] if !ok { slog.Error("scheduler: unsupported weigher", "name", weigherConfig.Name) @@ -102,7 +102,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest]( weigher = validateWeigher(weigher) weigher = monitorWeigher(weigher, weigherConfig.Name, pipelineMonitor) if err := weigher.Init(ctx, client, weigherConfig); err != nil { - slog.Error("scheduler: failed to initialize weigher", "name", weigherConfig.Name, "error", err) + slog.Warn("scheduler: failed to initialize weigher", "name", weigherConfig.Name, "error", err) weigherErrors[weigherConfig.Name] = errors.New("failed to initialize weigher: " + err.Error()) continue } diff --git a/internal/scheduling/nova/external_scheduler_api.go b/internal/scheduling/nova/external_scheduler_api.go index 8179edb12..16fa304f7 100644 --- a/internal/scheduling/nova/external_scheduler_api.go +++ b/internal/scheduling/nova/external_scheduler_api.go @@ -93,12 +93,12 @@ func (httpAPI *httpAPI) canRunScheduler(requestData api.ExternalSchedulerRequest func (httpAPI *httpAPI) inferPipelineName(requestData api.ExternalSchedulerRequest) (string, error) { hvType, err := requestData.GetHypervisorType() if err != nil { - slog.Info("failed to determine hypervisor type, cannot infer pipeline name", "error", err) + slog.Warn("failed to determine hypervisor type, cannot infer pipeline name", "error", err) return "", errors.New("failed to determine hypervisor type from request data") } flavorType, err := requestData.GetFlavorType() if err != nil { - slog.Info("failed to 
determine flavor type, cannot infer pipeline name", "error", err) + slog.Warn("failed to determine flavor type, cannot infer pipeline name", "error", err) return "", errors.New("failed to determine flavor type from request data") } switch hvType { diff --git a/internal/scheduling/nova/hypervisor_overcommit_controller.go b/internal/scheduling/nova/hypervisor_overcommit_controller.go index 83dc0a4db..72e4507fc 100644 --- a/internal/scheduling/nova/hypervisor_overcommit_controller.go +++ b/internal/scheduling/nova/hypervisor_overcommit_controller.go @@ -110,7 +110,7 @@ type HypervisorOvercommitController struct { // - https://ahmet.im/blog/controller-pitfalls/#reconcile-method-shape func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx) - log.Info("Reconciling resource") + log.V(1).Info("Reconciling resource") obj := new(hv1.Hypervisor) if err := c.Get(ctx, req.NamespacedName, obj); err != nil { @@ -130,7 +130,7 @@ func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl // non-overlapping resources from previous mappings. desiredOvercommit := make(map[hv1.ResourceName]float64) for _, mapping := range c.config.OvercommitMappings { - log.Info("Processing overcommit mapping", + log.V(1).Info("Processing overcommit mapping", "mapping", mapping, "hypervisorTraits", obj.Status.Traits) var applyMapping bool @@ -142,21 +142,21 @@ func (c *HypervisorOvercommitController) Reconcile(ctx context.Context, req ctrl applyMapping = !slices.Contains(obj.Status.Traits, *mapping.HasntTrait) default: // This should never happen due to validation, but we check it just in case. 
- log.Info("Skipping overcommit mapping with no trait specified", + log.V(1).Info("Skipping overcommit mapping with no trait specified", "overcommit", mapping.Overcommit) continue } if !applyMapping { continue } - log.Info("Applying overcommit mapping on hypervisor", + log.V(1).Info("Applying overcommit mapping on hypervisor", "overcommit", mapping.Overcommit) maps.Copy(desiredOvercommit, mapping.Overcommit) } - log.Info("Desired overcommit ratios based on traits", + log.V(1).Info("Desired overcommit ratios based on traits", "desiredOvercommit", desiredOvercommit) if maps.Equal(desiredOvercommit, obj.Spec.Overcommit) { - log.Info("Overcommit ratios are up to date, no update needed") + log.V(1).Info("Overcommit ratios are up to date, no update needed") return ctrl.Result{}, nil } diff --git a/internal/scheduling/nova/plugins/filters/filter_capabilities.go b/internal/scheduling/nova/plugins/filters/filter_capabilities.go index 8cf6afac9..0aec4b22d 100644 --- a/internal/scheduling/nova/plugins/filters/filter_capabilities.go +++ b/internal/scheduling/nova/plugins/filters/filter_capabilities.go @@ -93,11 +93,12 @@ func (s *FilterCapabilitiesStep) Run(traceLog *slog.Logger, request api.External hvCaps := make(map[string]map[string]string) for _, hv := range hvs.Items { - var err error - if hvCaps[hv.Name], err = hvToNovaCapabilities(hv); err != nil { - traceLog.Error("failed to get nova capabilities from hypervisor", "host", hv.Name, "error", err) - return nil, err + caps, err := hvToNovaCapabilities(hv) + if err != nil { + traceLog.Warn("skipping hypervisor with unknown capabilities", "host", hv.Name, "error", err) + continue } + hvCaps[hv.Name] = caps } traceLog.Info("looking for capabilities", "capabilities", hvCaps) diff --git a/internal/scheduling/nova/plugins/filters/filter_external_customer.go b/internal/scheduling/nova/plugins/filters/filter_external_customer.go index 39f11f7af..827712a84 100644 --- 
a/internal/scheduling/nova/plugins/filters/filter_external_customer.go +++ b/internal/scheduling/nova/plugins/filters/filter_external_customer.go @@ -37,7 +37,7 @@ func (s *FilterExternalCustomerStep) Run(traceLog *slog.Logger, request api.Exte result := s.IncludeAllHostsFromRequest(request) domainName, err := request.Spec.Data.GetSchedulerHintStr("domain_name") if err != nil { - traceLog.Error("failed to get domain_name scheduler hint, skipping filter", "error", err) + traceLog.Warn("failed to get domain_name scheduler hint, skipping filter", "error", err) return result, nil } if slices.Contains(s.Options.CustomerIgnoredDomainNames, domainName) { diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index d26c7c940..e6956609a 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -263,7 +263,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa } freeCPU, ok := free["cpu"] if !ok || freeCPU.Value() < 0 { - traceLog.Error( + traceLog.Warn( "host with invalid CPU capacity", "host", host, "freeCPU", freeCPU.String(), ) diff --git a/pkg/monitoring/log_metrics.go b/pkg/monitoring/log_metrics.go new file mode 100644 index 000000000..9fd4912be --- /dev/null +++ b/pkg/monitoring/log_metrics.go @@ -0,0 +1,124 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package monitoring + +import ( + "context" + "log/slog" + "path" + "runtime" + "sync" + + "github.com/prometheus/client_golang/prometheus" + "go.uber.org/zap/zapcore" +) + +// pcFileCache caches the resolved file path for each program counter. The set +// of distinct PCs is bounded by the number of log call sites in the binary, so +// this map grows to a fixed size and all subsequent lookups are lock-free reads. 
+var pcFileCache sync.Map // uintptr -> string + +// LogMessagesTotal counts warn and error log messages emitted by both the slog +// and zap loggers. Labels: "level" (warn|error), "file" (relative source path). +var LogMessagesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "cortex", + Name: "log_messages_total", + Help: "Total number of log messages emitted at warn or error level.", +}, []string{"level", "file"}) + +// shortFilePath returns "parent_dir/filename.go" from any absolute or +// module-relative path. This is independent of the build environment (no +// -trimpath needed) and keeps Prometheus label cardinality manageable. +func shortFilePath(file string) string { + dir, base := path.Split(file) + parent := path.Base(dir) + if parent == "." || parent == "/" { + return base + } + return parent + "/" + base +} + +// --- slog handler wrapper --- + +// MetricsSlogHandler wraps an slog.Handler and increments LogMessagesTotal for +// every warn or error log record. +type MetricsSlogHandler struct { + next slog.Handler +} + +// NewMetricsSlogHandler returns a new handler that counts warn/error logs and +// delegates all calls to next. 
+func NewMetricsSlogHandler(next slog.Handler) *MetricsSlogHandler { + return &MetricsSlogHandler{next: next} +} + +func (h *MetricsSlogHandler) Enabled(ctx context.Context, level slog.Level) bool { + if h.next == nil { + return false + } + return h.next.Enabled(ctx, level) +} + +func (h *MetricsSlogHandler) Handle(ctx context.Context, r slog.Record) error { + if r.Level >= slog.LevelWarn { + level := "warn" + if r.Level >= slog.LevelError { + level = "error" + } + file := "unknown" + if r.PC != 0 { + if cached, ok := pcFileCache.Load(r.PC); ok { + file = cached.(string) + } else { + frames := runtime.CallersFrames([]uintptr{r.PC}) + f, _ := frames.Next() + if f.File != "" { + file = shortFilePath(f.File) + } + pcFileCache.Store(r.PC, file) + } + } + LogMessagesTotal.WithLabelValues(level, file).Inc() + } + if h.next == nil { + return nil + } + return h.next.Handle(ctx, r) +} + +func (h *MetricsSlogHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + if h.next == nil { + return &MetricsSlogHandler{} + } + return &MetricsSlogHandler{next: h.next.WithAttrs(attrs)} +} + +func (h *MetricsSlogHandler) WithGroup(name string) slog.Handler { + if h.next == nil { + return &MetricsSlogHandler{} + } + return &MetricsSlogHandler{next: h.next.WithGroup(name)} +} + +// --- zap core wrapper --- + +// WrapCoreWithLogMetrics returns a zapcore.Core that hooks into every write to +// increment LogMessagesTotal for warn and error entries. It uses +// zapcore.RegisterHooks so no manual Check/Write plumbing is needed. 
+func WrapCoreWithLogMetrics(core zapcore.Core) zapcore.Core { + return zapcore.RegisterHooks(core, func(e zapcore.Entry) error { + if e.Level >= zapcore.WarnLevel { + level := "warn" + if e.Level >= zapcore.ErrorLevel { + level = "error" + } + file := "unknown" + if e.Caller.Defined { + file = shortFilePath(e.Caller.File) + } + LogMessagesTotal.WithLabelValues(level, file).Inc() + } + return nil + }) +} diff --git a/pkg/monitoring/log_metrics_test.go b/pkg/monitoring/log_metrics_test.go new file mode 100644 index 000000000..12d3dd282 --- /dev/null +++ b/pkg/monitoring/log_metrics_test.go @@ -0,0 +1,382 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package monitoring + +import ( + "bytes" + "log/slog" + "strings" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "go.uber.org/zap/zapcore" +) + +func TestShortFilePath(t *testing.T) { + tests := []struct { + input, want string + }{ + { + input: "github.com/cobaltcore-dev/cortex/internal/scheduling/nova/external_scheduler_api.go", + want: "nova/external_scheduler_api.go", + }, + { + input: "/workspace/internal/knowledge/extractor/plugins/compute/vrops_hostsystem_contention_long_term.go", + want: "compute/vrops_hostsystem_contention_long_term.go", + }, + { + input: "/some/absolute/path/file.go", + want: "path/file.go", + }, + { + input: "file.go", + want: "file.go", + }, + { + input: "", + want: "", + }, + } + for _, tt := range tests { + if got := shortFilePath(tt.input); got != tt.want { + t.Errorf("shortFilePath(%q) = %q, want %q", tt.input, got, tt.want) + } + } +} + +// newTestCounter creates a fresh CounterVec with the same schema as +// LogMessagesTotal, useful for isolated test assertions. 
+func newTestCounter() *prometheus.CounterVec { + return prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "cortex", + Name: "log_messages_total", + Help: "Total number of log messages emitted at warn or error level.", + }, []string{"level", "file"}) +} + +// gatherCounts collects the counter from a fresh registry and returns +// a map of level -> file -> count. +func gatherCounts(t *testing.T, counter *prometheus.CounterVec) map[string]map[string]float64 { + t.Helper() + reg := prometheus.NewRegistry() + reg.MustRegister(counter) + families, err := reg.Gather() + if err != nil { + t.Fatalf("gather failed: %v", err) + } + counts := make(map[string]map[string]float64) + for _, fam := range families { + if fam.GetName() != "cortex_log_messages_total" { + continue + } + for _, m := range fam.GetMetric() { + labels := make(map[string]string) + for _, lp := range m.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + level := labels["level"] + file := labels["file"] + if counts[level] == nil { + counts[level] = make(map[string]float64) + } + counts[level][file] += m.GetCounter().GetValue() + } + } + return counts +} + +// sumLevel sums all file counts for a given level label. 
+func sumLevel(counts map[string]map[string]float64, level string) float64 { + total := 0.0 + for _, v := range counts[level] { + total += v + } + return total +} + +func TestMetricsSlogHandler_Counts(t *testing.T) { + tests := []struct { + name string + emit func(logger *slog.Logger) + wantWarn float64 + wantError float64 + wantNoDebug bool + wantNoInfo bool + }{ + { + name: "one warn one error", + emit: func(l *slog.Logger) { + l.Warn("w") + l.Error("e") + }, + wantWarn: 1, + wantError: 1, + wantNoDebug: true, + wantNoInfo: true, + }, + { + name: "multiple warns", + emit: func(l *slog.Logger) { + l.Warn("w1") + l.Warn("w2") + l.Warn("w3") + }, + wantWarn: 3, + wantError: 0, + wantNoDebug: true, + wantNoInfo: true, + }, + { + name: "multiple errors", + emit: func(l *slog.Logger) { + l.Error("e1") + l.Error("e2") + }, + wantWarn: 0, + wantError: 2, + wantNoDebug: true, + wantNoInfo: true, + }, + { + name: "debug and info are not counted", + emit: func(l *slog.Logger) { + l.Debug("d") + l.Info("i") + }, + wantWarn: 0, + wantError: 0, + wantNoDebug: true, + wantNoInfo: true, + }, + { + name: "mixed levels", + emit: func(l *slog.Logger) { + l.Debug("d") + l.Info("i") + l.Warn("w1") + l.Warn("w2") + l.Error("e1") + l.Error("e2") + l.Error("e3") + }, + wantWarn: 2, + wantError: 3, + wantNoDebug: true, + wantNoInfo: true, + }, + { + name: "no logs emitted", + emit: func(l *slog.Logger) {}, + wantWarn: 0, + wantError: 0, + wantNoDebug: true, + wantNoInfo: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + orig := LogMessagesTotal + LogMessagesTotal = newTestCounter() + defer func() { LogMessagesTotal = orig }() + + var buf bytes.Buffer + inner := slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}) + logger := slog.New(NewMetricsSlogHandler(inner)) + + tt.emit(logger) + + counts := gatherCounts(t, LogMessagesTotal) + + if got := sumLevel(counts, "warn"); got != tt.wantWarn { + t.Errorf("warn count: got %v, want %v", 
got, tt.wantWarn) + } + if got := sumLevel(counts, "error"); got != tt.wantError { + t.Errorf("error count: got %v, want %v", got, tt.wantError) + } + if tt.wantNoDebug && len(counts["debug"]) > 0 { + t.Errorf("expected no debug counts, got %v", counts["debug"]) + } + if tt.wantNoInfo && len(counts["info"]) > 0 { + t.Errorf("expected no info counts, got %v", counts["info"]) + } + }) + } +} + +func TestMetricsSlogHandler_DelegatesAllLevels(t *testing.T) { + orig := LogMessagesTotal + LogMessagesTotal = newTestCounter() + defer func() { LogMessagesTotal = orig }() + + var buf bytes.Buffer + inner := slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}) + logger := slog.New(NewMetricsSlogHandler(inner)) + + logger.Debug("d") + logger.Info("i") + logger.Warn("w") + logger.Error("e") + + output := buf.String() + for _, msg := range []string{"msg=d", "msg=i", "msg=w", "msg=e"} { + if !strings.Contains(output, msg) { + t.Errorf("expected inner handler to receive %q", msg) + } + } +} + +func TestMetricsSlogHandler_WithAttrsAndGroup(t *testing.T) { + orig := LogMessagesTotal + LogMessagesTotal = newTestCounter() + defer func() { LogMessagesTotal = orig }() + + var buf bytes.Buffer + inner := slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}) + handler := NewMetricsSlogHandler(inner) + + derived := handler.WithAttrs([]slog.Attr{slog.String("key", "val")}) + grouped := derived.WithGroup("grp") + + if _, ok := derived.(*MetricsSlogHandler); !ok { + t.Fatalf("WithAttrs should return *MetricsSlogHandler, got %T", derived) + } + if _, ok := grouped.(*MetricsSlogHandler); !ok { + t.Fatalf("WithGroup should return *MetricsSlogHandler, got %T", grouped) + } +} + +func TestWrapCoreWithLogMetrics(t *testing.T) { + type entry struct { + level zapcore.Level + caller bool + } + tests := []struct { + name string + entries []entry + wantWarn float64 + wantError float64 + wantUnknown float64 // warn entries with no caller → file="unknown" + }{ + { + 
name: "one warn one error with caller", + entries: []entry{ + {zapcore.WarnLevel, true}, + {zapcore.ErrorLevel, true}, + }, + wantWarn: 1, + wantError: 1, + }, + { + name: "multiple warns", + entries: []entry{ + {zapcore.WarnLevel, true}, + {zapcore.WarnLevel, true}, + {zapcore.WarnLevel, true}, + }, + wantWarn: 3, + wantError: 0, + }, + { + name: "multiple errors", + entries: []entry{ + {zapcore.ErrorLevel, true}, + {zapcore.ErrorLevel, true}, + }, + wantWarn: 0, + wantError: 2, + }, + { + name: "debug and info are not counted", + entries: []entry{ + {zapcore.DebugLevel, true}, + {zapcore.InfoLevel, true}, + }, + wantWarn: 0, + wantError: 0, + }, + { + name: "warn without caller uses unknown file", + entries: []entry{ + {zapcore.WarnLevel, false}, + {zapcore.WarnLevel, false}, + }, + wantWarn: 2, + wantError: 0, + wantUnknown: 2, + }, + { + name: "mixed levels and callers", + entries: []entry{ + {zapcore.DebugLevel, true}, + {zapcore.InfoLevel, true}, + {zapcore.WarnLevel, true}, + {zapcore.WarnLevel, false}, + {zapcore.ErrorLevel, true}, + {zapcore.ErrorLevel, true}, + }, + wantWarn: 2, + wantError: 2, + wantUnknown: 1, + }, + { + name: "dpanic and above count as error", + entries: []entry{{zapcore.DPanicLevel, true}, {zapcore.PanicLevel, true}}, + wantWarn: 0, + wantError: 2, + }, + } + + enc := zapcore.NewJSONEncoder(zapcore.EncoderConfig{ + MessageKey: "msg", + LevelKey: "level", + TimeKey: "ts", + EncodeTime: zapcore.ISO8601TimeEncoder, + EncodeLevel: zapcore.LowercaseLevelEncoder, + }) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + orig := LogMessagesTotal + LogMessagesTotal = newTestCounter() + defer func() { LogMessagesTotal = orig }() + + sink := zapcore.AddSync(&bytes.Buffer{}) + inner := zapcore.NewCore(enc, sink, zapcore.DebugLevel) + wrapped := WrapCoreWithLogMetrics(inner) + + for i, e := range tt.entries { + ent := zapcore.Entry{Level: e.level, Message: "msg", Time: time.Now()} + if e.caller { + ent.Caller = 
zapcore.EntryCaller{ + Defined: true, + File: "github.com/cobaltcore-dev/cortex/internal/test/fake.go", + Line: i, + } + } + if ce := wrapped.Check(ent, nil); ce != nil { + ce.Write() + } + } + + counts := gatherCounts(t, LogMessagesTotal) + + if got := sumLevel(counts, "warn"); got != tt.wantWarn { + t.Errorf("warn count: got %v, want %v", got, tt.wantWarn) + } + if got := sumLevel(counts, "error"); got != tt.wantError { + t.Errorf("error count: got %v, want %v", got, tt.wantError) + } + if tt.wantUnknown > 0 { + if got := counts["warn"]["unknown"]; got != tt.wantUnknown { + t.Errorf("warn[unknown] count: got %v, want %v", got, tt.wantUnknown) + } + } + if len(counts["debug"]) > 0 || len(counts["info"]) > 0 { + t.Errorf("debug/info should not be counted, got debug=%v info=%v", counts["debug"], counts["info"]) + } + }) + } +} diff --git a/postgres/Dockerfile b/postgres/Dockerfile index 09c049295..a2b526268 100644 --- a/postgres/Dockerfile +++ b/postgres/Dockerfile @@ -1,5 +1,5 @@ -# Last updated: 17 Mar 2026 -FROM debian:trixie-slim@sha256:26f98ccd92fd0a44d6928ce8ff8f4921b4d2f535bfa07555ee5d18f61429cf0c +# Last updated: 9 Apr 2026 +FROM debian:trixie-slim@sha256:4ffb3a1511099754cddc70eb1b12e50ffdb67619aa0ab6c13fcd800a78ef7c7a # explicitly set user/group IDs RUN set -eux; \ @@ -68,7 +68,7 @@ RUN set -ex; \ ENV PG_MAJOR 17 ENV PATH $PATH:/usr/lib/postgresql/$PG_MAJOR/bin -ENV PG_VERSION 17.8-1.pgdg13+1 +ENV PG_VERSION 17.9-1.pgdg13+1 RUN set -ex; \ \