From a3ec8d4853e3af0fba4bf8ed63f2239f0a789f2d Mon Sep 17 00:00:00 2001 From: Amit Marcus Date: Tue, 19 May 2026 23:06:57 +0300 Subject: [PATCH 1/4] feat: add SIGUSR1/SIGUSR2 reinit signals SIGUSR1 - soft reinit: disables existing bindings, reloads hook configs, re-enables kubernetes watchers and schedules. onStartup hooks do not re-run. SIGUSR2 - full reset: same as SIGUSR1 but also re-runs onStartup hooks, emulating a process restart without interrupting in-flight executions. On any --config failure during reinit the process exits with code 1. Co-Authored-By: Claude Sonnet 4.6 --- cmd/shell-operator/start.go | 16 +++++++ pkg/shell-operator/operator.go | 87 ++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/cmd/shell-operator/start.go b/cmd/shell-operator/start.go index bd168ede..1517b312 100644 --- a/cmd/shell-operator/start.go +++ b/cmd/shell-operator/start.go @@ -4,7 +4,9 @@ import ( "context" "fmt" "os" + "os/signal" "strings" + "syscall" "github.com/deckhouse/deckhouse/pkg/log" "github.com/spf13/cobra" @@ -42,6 +44,20 @@ func start(logger *log.Logger, cfg *app.Config) func(cmd *cobra.Command, args [] operator.Start() + // Listen for SIGUSR1 (soft reinit) and SIGUSR2 (full reset) in the background. + go func() { + ch := make(chan os.Signal, 1) + signal.Notify(ch, syscall.SIGUSR1, syscall.SIGUSR2) + for sig := range ch { + switch sig { + case syscall.SIGUSR1: + operator.Reinit(ctx, false) + case syscall.SIGUSR2: + operator.Reinit(ctx, true) + } + } + }() + // Block until OS signal. utils_signal.WaitForProcessInterruption(func() { operator.Shutdown() diff --git a/pkg/shell-operator/operator.go b/pkg/shell-operator/operator.go index a028bf91..cb810a02 100644 --- a/pkg/shell-operator/operator.go +++ b/pkg/shell-operator/operator.go @@ -18,6 +18,7 @@ import ( "context" "fmt" "log/slog" + "os" "time" "github.com/deckhouse/deckhouse/pkg/log" @@ -825,6 +826,92 @@ func (op *ShellOperator) initAndStartHookQueues() { } } +// Reinit reloads all hooks without stopping the operator. +// +// It disables existing kubernetes and schedule bindings, re-runs hook discovery +// and --config, then re-queues binding-enable tasks into the already-running main +// queue. In-flight hook executions are not interrupted — they finish naturally. +// +// If fullReset is true, onStartup hooks are also re-queued (SIGUSR2 behaviour). +// If fullReset is false, only bindings are re-enabled (SIGUSR1 behaviour). +// +// On any error the process exits with code 1, matching startup failure behaviour. +func (op *ShellOperator) Reinit(ctx context.Context, fullReset bool) { + op.logger.Info("reinit begin", slog.Bool("fullReset", fullReset)) + + // Disable all existing bindings before rebuilding the hook index. + for _, hookName := range op.HookManager.GetHookNames() { + h := op.HookManager.GetHook(hookName) + if h.HookController != nil { + h.HookController.DisableKubernetesBindings() + h.HookController.DisableScheduleBindings() + } + } + + if err := op.initHookManager(); err != nil { + op.logger.Error("reinit: hook manager init failed, exiting", log.Err(err)) + os.Exit(1) + } + + // Ensure queues exist for any new hooks introduced by the reload. + op.initAndStartHookQueues() + + mainQueue := op.TaskQueues.GetMain() + logEntry := op.logger.With(pkg.LogKeyOperatorComponent, "reinit") + + if fullReset { + onStartupHooks, err := op.HookManager.GetHooksInOrder(types.OnStartup) + if err != nil { + op.logger.Error("reinit: get onStartup hooks failed, exiting", log.Err(err)) + os.Exit(1) + } + for _, hookName := range onStartupHooks { + bc := bindingcontext.BindingContext{Binding: string(types.OnStartup)} + bc.Metadata.BindingType = types.OnStartup + newTask := task.NewTask(task_metadata.HookRun). + WithMetadata(task_metadata.HookMetadata{ + HookName: hookName, + BindingType: types.OnStartup, + BindingContext: []bindingcontext.BindingContext{bc}, + }). + WithCompactionID(hookName). + WithQueuedAt(time.Now()) + mainQueue.AddLast(newTask) + logEntry.Info("queue onStartup task", slog.String(pkg.LogKeyHook, hookName)) + } + } + + for _, hookName := range op.HookManager.GetHookNames() { + h := op.HookManager.GetHook(hookName) + + if h.GetConfig().HasBinding(types.OnKubernetesEvent) { + newTask := task.NewTask(task_metadata.EnableKubernetesBindings). + WithMetadata(task_metadata.HookMetadata{ + HookName: hookName, + Binding: string(task_metadata.EnableKubernetesBindings), + }). + WithCompactionID(hookName). + WithQueuedAt(time.Now()) + mainQueue.AddLast(newTask) + logEntry.Info("queue EnableKubernetesBindings task", slog.String(pkg.LogKeyHook, hookName)) + } + + if h.GetConfig().HasBinding(types.Schedule) { + newTask := task.NewTask(task_metadata.EnableScheduleBindings). + WithMetadata(task_metadata.HookMetadata{ + HookName: hookName, + Binding: string(task_metadata.EnableScheduleBindings), + }). + WithCompactionID(hookName). + WithQueuedAt(time.Now()) + mainQueue.AddLast(newTask) + logEntry.Info("queue EnableScheduleBindings task", slog.String(pkg.LogKeyHook, hookName)) + } + } + + op.logger.Info("reinit complete", slog.Bool("fullReset", fullReset)) +} + // Shutdown pause kubernetes events handling and stop queues. Wait for queues to stop. func (op *ShellOperator) Shutdown() { op.logger.Info("shutdown begin", slog.String(pkg.LogKeyPhase, "shutdown")) From 196df3fc6a0865f8605d195b693bdfc83eb24c87 Mon Sep 17 00:00:00 2001 From: Amit Marcus Date: Tue, 19 May 2026 23:11:34 +0300 Subject: [PATCH 2/4] test: add unit tests for Reinit soft and full reset Verifies that SIGUSR1 (fullReset=false) does not re-enqueue onStartup tasks and that SIGUSR2 (fullReset=true) does, while both enqueue binding-enable tasks. Co-Authored-By: Claude Sonnet 4.6 --- pkg/shell-operator/operator_test.go | 75 +++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/pkg/shell-operator/operator_test.go b/pkg/shell-operator/operator_test.go index e4e05949..30d1ed71 100644 --- a/pkg/shell-operator/operator_test.go +++ b/pkg/shell-operator/operator_test.go @@ -7,6 +7,7 @@ import ( "github.com/deckhouse/deckhouse/pkg/log" . "github.com/onsi/gomega" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/flant/shell-operator/pkg/app" . "github.com/flant/shell-operator/pkg/hook/task_metadata" @@ -82,3 +83,77 @@ func Test_Operator_startup_tasks(t *testing.T) { i++ }) } + +// newTestOperator builds a minimal ShellOperator from the startup_tasks testdata hooks, +// ready for Reinit testing. The main queue is bootstrapped but not started. +func newTestOperator(t *testing.T) *ShellOperator { + t.Helper() + hooksDir, err := utils.RequireExistingDirectory("testdata/startup_tasks/hooks") + require.NoError(t, err) + + op := NewShellOperator(context.Background(), nil, nil, WithLogger(log.NewNop())) + op.SetupEventManagers() + op.setupHookManagers(app.NewConfig(), hooksDir, t.TempDir()) + require.NoError(t, op.initHookManager()) + op.bootstrapMainQueue(op.TaskQueues) + return op +} + +// collectTasks returns all tasks currently in the main queue as a slice. +func collectTasks(op *ShellOperator) []task.Task { + var tasks []task.Task + op.TaskQueues.GetMain().IterateSnapshot(func(tsk task.Task) { + tasks = append(tasks, tsk) + }) + return tasks +} + +func Test_Reinit_softReinit_doesNotRequeueOnStartup(t *testing.T) { + op := newTestOperator(t) + + initialCount := len(collectTasks(op)) + + op.Reinit(context.Background(), false) + + all := collectTasks(op) + reinitTasks := all[initialCount:] + + for _, tsk := range reinitTasks { + hm, ok := HookMetadataAccessor(tsk) + require.True(t, ok) + assert.NotEqual(t, htypes.OnStartup, hm.BindingType, "soft reinit must not enqueue onStartup tasks") + } + + // Binding-enable tasks must be present for hooks that have those bindings. + types := make(map[task.TaskType]bool) + for _, tsk := range reinitTasks { + types[tsk.GetType()] = true + } + assert.True(t, types[EnableKubernetesBindings], "EnableKubernetesBindings tasks must be enqueued") + assert.True(t, types[EnableScheduleBindings], "EnableScheduleBindings tasks must be enqueued") +} + +func Test_Reinit_fullReset_requeuesOnStartup(t *testing.T) { + op := newTestOperator(t) + + initialCount := len(collectTasks(op)) + + op.Reinit(context.Background(), true) + + all := collectTasks(op) + reinitTasks := all[initialCount:] + + taskTypes := make(map[task.TaskType]bool) + hasOnStartup := false + for _, tsk := range reinitTasks { + taskTypes[tsk.GetType()] = true + hm, ok := HookMetadataAccessor(tsk) + require.True(t, ok) + if hm.BindingType == htypes.OnStartup { + hasOnStartup = true + } + } + assert.True(t, hasOnStartup, "full reset must re-enqueue onStartup tasks") + assert.True(t, taskTypes[EnableKubernetesBindings], "EnableKubernetesBindings tasks must be enqueued") + assert.True(t, taskTypes[EnableScheduleBindings], "EnableScheduleBindings tasks must be enqueued") +} From 4b237e12cded73df0a65c9e5b9b3b944457f4b43 Mon Sep 17 00:00:00 2001 From: Amit Marcus Date: Tue, 19 May 2026 23:16:18 +0300 Subject: [PATCH 3/4] test: assert hook controllers are replaced on full reset Captures controller pointers before Reinit and verifies they are new instances after, proving the disable+rebuild cycle ran. Co-Authored-By: Claude Sonnet 4.6 --- pkg/shell-operator/operator_test.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pkg/shell-operator/operator_test.go b/pkg/shell-operator/operator_test.go index 30d1ed71..8dfe089f 100644 --- a/pkg/shell-operator/operator_test.go +++ b/pkg/shell-operator/operator_test.go @@ -10,6 +10,7 @@ import ( "github.com/stretchr/testify/require" "github.com/flant/shell-operator/pkg/app" + "github.com/flant/shell-operator/pkg/hook/controller" . "github.com/flant/shell-operator/pkg/hook/task_metadata" htypes "github.com/flant/shell-operator/pkg/hook/types" "github.com/flant/shell-operator/pkg/metric" @@ -136,6 +137,12 @@ func Test_Reinit_softReinit_doesNotRequeueOnStartup(t *testing.T) { func Test_Reinit_fullReset_requeuesOnStartup(t *testing.T) { op := newTestOperator(t) + // Capture old controller pointers before reinit to verify they are replaced. + oldControllers := map[string]*controller.HookController{} + for _, name := range op.HookManager.GetHookNames() { + oldControllers[name] = op.HookManager.GetHook(name).HookController + } + initialCount := len(collectTasks(op)) op.Reinit(context.Background(), true) @@ -156,4 +163,10 @@ func Test_Reinit_fullReset_requeuesOnStartup(t *testing.T) { assert.True(t, hasOnStartup, "full reset must re-enqueue onStartup tasks") assert.True(t, taskTypes[EnableKubernetesBindings], "EnableKubernetesBindings tasks must be enqueued") assert.True(t, taskTypes[EnableScheduleBindings], "EnableScheduleBindings tasks must be enqueued") + + // Verify that old hook controllers were replaced, proving the disable+rebuild cycle ran. + for _, name := range op.HookManager.GetHookNames() { + newCtrl := op.HookManager.GetHook(name).HookController + assert.NotSame(t, oldControllers[name], newCtrl, "hook %s: controller must be a new instance after full reset", name) + } } From 87dbf7d06c2e536a148c7cc80049e27b89c94529 Mon Sep 17 00:00:00 2001 From: Amit Marcus Date: Tue, 19 May 2026 23:18:06 +0300 Subject: [PATCH 4/4] refactor: simplify Reinit test names Co-Authored-By: Claude Sonnet 4.6 --- pkg/shell-operator/operator_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/shell-operator/operator_test.go b/pkg/shell-operator/operator_test.go index 8dfe089f..db9ea81a 100644 --- a/pkg/shell-operator/operator_test.go +++ b/pkg/shell-operator/operator_test.go @@ -109,7 +109,7 @@ func collectTasks(op *ShellOperator) []task.Task { return tasks } -func Test_Reinit_softReinit_doesNotRequeueOnStartup(t *testing.T) { +func Test_Reinit_softReinit(t *testing.T) { op := newTestOperator(t) initialCount := len(collectTasks(op)) @@ -134,7 +134,7 @@ func Test_Reinit_softReinit_doesNotRequeueOnStartup(t *testing.T) { assert.True(t, types[EnableScheduleBindings], "EnableScheduleBindings tasks must be enqueued") } -func Test_Reinit_fullReset_requeuesOnStartup(t *testing.T) { +func Test_Reinit_fullReset(t *testing.T) { op := newTestOperator(t) // Capture old controller pointers before reinit to verify they are replaced.