Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/renovate.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,39 @@
"gomodTidy",
"gomodUpdateImportPaths"
],
"regexManagers": [
{
"fileMatch": ["^postgres/Dockerfile$"],
"matchStrings": ["FROM (?<depName>[^:\\n]+):(?<currentValue>[^@\\n]+)@sha256:(?<currentDigest>[a-f0-9]+)"],
"datasourceTemplate": "docker"
},
{
"fileMatch": ["^postgres/Dockerfile$"],
"matchStrings": ["ENV PG_VERSION (?<currentValue>[\\d]+\\.[\\d]+)-[^\\n]+"],
"depNameTemplate": "postgres",
"datasourceTemplate": "docker",
"versioningTemplate": "semver-coerced",
"autoReplaceStringTemplate": "ENV PG_VERSION {{{newValue}}}-1.pgdg13+1"
}
],
"packageRules": [
{
"matchPackageNames": [
"golang"
],
"allowedVersions": "1.26.x"
},
{
"matchPackageNames": [
"postgres"
],
"matchFileNames": [
"postgres/Dockerfile"
],
"allowedVersions": "17.x",
"automerge": true,
"groupName": "postgres Dockerfile"
},
{
"matchPackageNames": [
"/^github\\.com\\/sapcc\\/.*/"
Expand Down
36 changes: 34 additions & 2 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@ import (
"context"
"crypto/tls"
"flag"
"log/slog"
"net/http"

uberzap "go.uber.org/zap"
"os"
"path/filepath"
"slices"
"strings"
"time"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
Expand Down Expand Up @@ -143,12 +147,39 @@ func main() {
flag.BoolVar(&enableHTTP2, "enable-http2", false,
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
opts := zap.Options{
Development: true,
Development: false,
}
opts.BindFlags(flag.CommandLine)
flag.Parse()

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
ctrl.SetLogger(zap.New(
zap.UseFlagOptions(&opts),
zap.RawZapOpts(uberzap.WrapCore(monitoring.WrapCoreWithLogMetrics)),
))

// Configure slog (used across internal packages) with JSON output and
// level control via the LOG_LEVEL environment variable.
// Supported values (case-insensitive): debug, info (default), warn (or
// warning), error. An unset or unrecognized value keeps the info default.
slogLevel := new(slog.LevelVar)
slogLevel.Set(slog.LevelInfo)
if lvl := os.Getenv("LOG_LEVEL"); lvl != "" {
switch strings.ToLower(lvl) {
case "debug":
slogLevel.Set(slog.LevelDebug)
case "info":
slogLevel.Set(slog.LevelInfo)
case "warn", "warning":
slogLevel.Set(slog.LevelWarn)
case "error":
slogLevel.Set(slog.LevelError)
}
}
slog.SetDefault(slog.New(monitoring.NewMetricsSlogHandler(
slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
Level: slogLevel,
}),
)))
slog.Info("slog configured", "level", slogLevel.Level().String())

// Log the main configuration
setupLog.Info("loaded main configuration",
Expand Down Expand Up @@ -301,6 +332,7 @@ func main() {
// This is useful to distinguish metrics from different deployments.
metricsConfig := conf.GetConfigOrDie[monitoring.Config]()
metrics.Registry = monitoring.WrapRegistry(metrics.Registry, metricsConfig)
metrics.Registry.MustRegister(monitoring.LogMessagesTotal)

// TODO: Remove me after scheduling pipeline steps don't require DB connections anymore.
metrics.Registry.MustRegister(&db.Monitor)
Expand Down
22 changes: 20 additions & 2 deletions cortex.secrets.example.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright SAP SE
# SPDX-License-Identifier: Apache-2.0

# Override config values that contain sensitive information or
# are specific to your environment. These values can be used in the Tiltfile.
# Override config values for local development. This includes secrets,
# environment-specific settings, and logging configuration.
# These values can be used in the Tiltfile.

# SSO certificate to use.
sharedSSOCert: &sharedSSOCert
Expand All @@ -20,6 +21,23 @@ sharedSSOCert: &sharedSSOCert
# If true, the certificate is not verified.
selfSigned: "false"

# Logging configuration for local development.
# Set logLevel to "debug" for verbose output from both zap and slog loggers.
# Set zapDevel to true for human-readable console logs instead of JSON.
# These apply per sub-chart, e.g. for cortex-nova:
#
# cortex-scheduling-controllers:
# controllerManager:
# container:
# logLevel: "debug"
# zapDevel: true
#
# cortex-knowledge-controllers:
# controllerManager:
# container:
# logLevel: "debug"
# zapDevel: true

# Enable kvm pipelines and scheduling support.
kvm:
enabled: true
Expand Down
10 changes: 5 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,15 @@ require (
github.com/ziutek/mymysql v1.5.4 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
go.opentelemetry.io/otel v1.40.0 // indirect
go.opentelemetry.io/otel v1.43.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect
go.opentelemetry.io/otel/metric v1.40.0 // indirect
go.opentelemetry.io/otel/sdk v1.40.0 // indirect
go.opentelemetry.io/otel/trace v1.40.0 // indirect
go.opentelemetry.io/otel/metric v1.43.0 // indirect
go.opentelemetry.io/otel/sdk v1.43.0 // indirect
go.opentelemetry.io/otel/trace v1.43.0 // indirect
go.opentelemetry.io/proto/otlp v1.8.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.1 // indirect
go.uber.org/zap v1.27.1
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect
Expand Down
20 changes: 10 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -241,20 +241,20 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg=
go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms=
go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g=
go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk=
go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g=
go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc=
go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8=
go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE=
go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw=
go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg=
go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw=
go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA=
go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
go.opentelemetry.io/proto/otlp v1.8.0 h1:fRAZQDcAFHySxpJ1TwlA1cJ4tvcrw7nXl9xWWC8N5CE=
go.opentelemetry.io/proto/otlp v1.8.0/go.mod h1:tIeYOeNBU4cvmPqpaji1P+KbB4Oloai8wN4rWzRrFF0=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
Expand Down
2 changes: 1 addition & 1 deletion helm/bundles/cortex-cinder/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
version: 0.5.13
version: 0.5.14

# from: file://../../library/cortex
- name: cortex
Expand Down
2 changes: 1 addition & 1 deletion helm/bundles/cortex-manila/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
version: 0.5.13
version: 0.5.14

# from: file://../../library/cortex
- name: cortex
Expand Down
2 changes: 1 addition & 1 deletion helm/bundles/cortex-nova/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies:
# from: file://../../library/cortex-postgres
- name: cortex-postgres
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
version: 0.5.13
version: 0.5.14

# from: file://../../library/cortex
- name: cortex
Expand Down
2 changes: 1 addition & 1 deletion helm/bundles/cortex-nova/alerts/nova.alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -731,4 +731,4 @@ groups:
The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes.
This may indicate issues with the webhook logic, connectivity problems, or
external factors causing failures. Check the webhook server logs for error
details and investigate the affected resources.
details and investigate the affected resources.
1 change: 1 addition & 0 deletions helm/bundles/cortex-nova/templates/knowledges_kvm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ metadata:
name: kvm-libvirt-domain-cpu-steal-pct
spec:
schedulingDomain: nova
recency: "60s"
extractor:
name: kvm_libvirt_domain_cpu_steal_pct_extractor
description: |
Expand Down
2 changes: 1 addition & 1 deletion helm/library/cortex-postgres/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ apiVersion: v2
name: cortex-postgres
description: Postgres setup for Cortex.
type: application
version: 0.5.13
version: 0.5.14
appVersion: "sha-6db36b81"
16 changes: 16 additions & 0 deletions helm/library/cortex/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,22 @@ app.kubernetes.io/instance: {{ .Release.Name }}
{{ $hasMutating }}}}{{- end }}


{{/*
chart.argsContainPrefix reports whether at least one entry of "args" begins
with "prefix". It renders the string "true" on a match and "false" otherwise,
so callers compare the included output against "true".
Example:
  include "chart.argsContainPrefix" (dict "prefix" "--zap-log-level" "args" .Values.controllerManager.container.args)
*/}}
{{- define "chart.argsContainPrefix" -}}
{{- $wanted := .prefix -}}
{{- $matched := "false" -}}
{{- range $arg := .args -}}
{{- if hasPrefix $wanted $arg -}}
{{- $matched = "true" -}}
{{- end -}}
{{- end -}}
{{- $matched -}}
{{- end -}}

{{- define "chart.hasValidatingWebhooks" -}}
{{- $hasValidating := false }}
{{- range . }}
Expand Down
16 changes: 13 additions & 3 deletions helm/library/cortex/templates/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ spec:
{{- range .Values.controllerManager.container.args }}
- {{ . }}
{{- end }}
{{- if and .Values.controllerManager.container.logLevel (ne (include "chart.argsContainPrefix" (dict "prefix" "--zap-log-level" "args" .Values.controllerManager.container.args)) "true") }}
- "--zap-log-level={{ .Values.controllerManager.container.logLevel }}"
{{- end }}
{{- if and .Values.controllerManager.container.zapDevel (ne (include "chart.argsContainPrefix" (dict "prefix" "--zap-devel" "args" .Values.controllerManager.container.args)) "true") }}
- "--zap-devel"
{{- end }}
{{- if and .Values.webhook.enable .Values.certmanager.enable }}
- "--webhook-cert-path=/tmp/k8s-webhook-server/serving-certs"
{{- end }}
Expand All @@ -56,13 +62,17 @@ spec:
{{- if .Values.controllerManager.container.image.pullPolicy }}
imagePullPolicy: {{ .Values.controllerManager.container.image.pullPolicy }}
{{- end }}
{{- if .Values.controllerManager.container.env }}
env:
{{- if and .Values.controllerManager.container.logLevel (not (and .Values.controllerManager.container.env (hasKey .Values.controllerManager.container.env "LOG_LEVEL"))) }}
- name: LOG_LEVEL
value: {{ .Values.controllerManager.container.logLevel | quote }}
{{- end }}
{{- if .Values.controllerManager.container.env }}
{{- range $key, $value := .Values.controllerManager.container.env }}
- name: {{ $key }}
value: {{ $value }}
{{- end }}
{{- end }}
{{- end }}
livenessProbe:
{{- toYaml .Values.controllerManager.container.livenessProbe | nindent 12 }}
readinessProbe:
Expand Down Expand Up @@ -140,4 +150,4 @@ data:
{{- $mergedSecrets = mergeOverwrite .Values.secrets $mergedSecrets }}
{{- end }}
{{ toJson $mergedSecrets | b64enc }}
{{- end }}
{{- end }}
8 changes: 8 additions & 0 deletions helm/library/cortex/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ controllerManager:
- "--metrics-bind-address=:2112"
- "--health-probe-bind-address=:8081"
- "--metrics-secure=false"
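# Log level for both zap (controller-runtime) and slog (internal packages).
# Supported: debug, info (default), error.
# NOTE: "warn" is understood by slog (LOG_LEVEL), but this value is also passed
# as --zap-log-level, whose named levels are debug, info, error and panic.
# Verify that your controller-runtime version accepts "warn" before using it.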
logLevel: "info"
# Enable zap development mode (human-readable console logs, development stack traces).
# This only changes output format and stack trace behavior, not the log level.
# The effective log level is controlled by logLevel above (default: "info").
# Set to true for local development (e.g. Tilt), keep false for production.
zapDevel: false
resources:
limits:
cpu: 500m
Expand Down
7 changes: 4 additions & 3 deletions internal/knowledge/extractor/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ func (r *KnowledgeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
// Sanity checks.
lastExtracted := knowledge.Status.LastExtracted.Time
recency := knowledge.Spec.Recency.Duration
if lastExtracted.Add(recency).After(time.Now()) && knowledge.Status.RawLength != 0 {
log.Info("skipping knowledge extraction, not yet time", "name", knowledge.Name)
return ctrl.Result{RequeueAfter: time.Until(lastExtracted.Add(recency))}, nil
if lastExtracted.Add(recency).After(time.Now()) {
waitFor := time.Until(lastExtracted.Add(recency))
log.Info("skipping knowledge extraction, not yet time", "name", knowledge.Name, "waitFor", waitFor)
return ctrl.Result{RequeueAfter: waitFor}, nil
}

extractor, ok := supportedExtractors[knowledge.Spec.Extractor.Name]
Expand Down
8 changes: 4 additions & 4 deletions internal/scheduling/lib/filter_weigher_pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest](
unknownFilters := []string{}
for _, filterConfig := range confedFilters {
slog.Info("scheduler: configuring filter", "name", filterConfig.Name)
slog.Info("supported:", "filters", maps.Keys(supportedFilters))
slog.Info("supported:", "filters", slices.Sorted(maps.Keys(supportedFilters)))
makeFilter, ok := supportedFilters[filterConfig.Name]
if !ok {
slog.Error("scheduler: unsupported filter", "name", filterConfig.Name)
Expand All @@ -73,7 +73,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest](
filter = validateFilter(filter)
filter = monitorFilter(filter, filterConfig.Name, pipelineMonitor)
if err := filter.Init(ctx, client, filterConfig); err != nil {
slog.Error("scheduler: failed to initialize filter", "name", filterConfig.Name, "error", err)
slog.Warn("scheduler: failed to initialize filter", "name", filterConfig.Name, "error", err)
filterErrors[filterConfig.Name] = errors.New("failed to initialize filter: " + err.Error())
continue
}
Expand All @@ -90,7 +90,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest](
unknownWeighers := []string{}
for _, weigherConfig := range confedWeighers {
slog.Info("scheduler: configuring weigher", "name", weigherConfig.Name)
slog.Info("supported:", "weighers", maps.Keys(supportedWeighers))
slog.Info("supported:", "weighers", slices.Sorted(maps.Keys(supportedWeighers)))
makeWeigher, ok := supportedWeighers[weigherConfig.Name]
if !ok {
slog.Error("scheduler: unsupported weigher", "name", weigherConfig.Name)
Expand All @@ -102,7 +102,7 @@ func InitNewFilterWeigherPipeline[RequestType FilterWeigherPipelineRequest](
weigher = validateWeigher(weigher)
weigher = monitorWeigher(weigher, weigherConfig.Name, pipelineMonitor)
if err := weigher.Init(ctx, client, weigherConfig); err != nil {
slog.Error("scheduler: failed to initialize weigher", "name", weigherConfig.Name, "error", err)
slog.Warn("scheduler: failed to initialize weigher", "name", weigherConfig.Name, "error", err)
weigherErrors[weigherConfig.Name] = errors.New("failed to initialize weigher: " + err.Error())
continue
}
Expand Down
4 changes: 2 additions & 2 deletions internal/scheduling/nova/external_scheduler_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,12 @@ func (httpAPI *httpAPI) canRunScheduler(requestData api.ExternalSchedulerRequest
func (httpAPI *httpAPI) inferPipelineName(requestData api.ExternalSchedulerRequest) (string, error) {
hvType, err := requestData.GetHypervisorType()
if err != nil {
slog.Info("failed to determine hypervisor type, cannot infer pipeline name", "error", err)
slog.Warn("failed to determine hypervisor type, cannot infer pipeline name", "error", err)
return "", errors.New("failed to determine hypervisor type from request data")
}
flavorType, err := requestData.GetFlavorType()
if err != nil {
slog.Info("failed to determine flavor type, cannot infer pipeline name", "error", err)
slog.Warn("failed to determine flavor type, cannot infer pipeline name", "error", err)
return "", errors.New("failed to determine flavor type from request data")
}
switch hvType {
Expand Down
Loading