From 68cdf08bf5431d81472412e00326d0437b38dd7b Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Wed, 12 Nov 2025 19:29:33 +0400 Subject: [PATCH 01/16] cni-switcher init Signed-off-by: Denis Tarabrin --- .gitignore | 2 +- cmd/commands/cni.go | 158 ++++ cmd/commands/kubectl.go | 7 +- cmd/d8/root.go | 1 + .../cni/api/v1alpha1/cni_migration_types.go | 75 ++ .../api/v1alpha1/cni_node_migration_types.go | 63 ++ internal/cni/api/v1alpha1/register.go | 50 ++ .../cni/api/v1alpha1/zz_generated.deepcopy.go | 218 +++++ internal/cni/cleanup.go | 330 +++++++ internal/cni/common.go | 141 +++ internal/cni/prepare.go | 586 +++++++++++++ internal/cni/resources.go | 537 ++++++++++++ internal/cni/rollback.go | 28 + internal/cni/switch.go | 807 ++++++++++++++++++ 14 files changed, 2998 insertions(+), 5 deletions(-) create mode 100644 cmd/commands/cni.go create mode 100644 internal/cni/api/v1alpha1/cni_migration_types.go create mode 100644 internal/cni/api/v1alpha1/cni_node_migration_types.go create mode 100644 internal/cni/api/v1alpha1/register.go create mode 100644 internal/cni/api/v1alpha1/zz_generated.deepcopy.go create mode 100644 internal/cni/cleanup.go create mode 100644 internal/cni/common.go create mode 100644 internal/cni/prepare.go create mode 100644 internal/cni/resources.go create mode 100644 internal/cni/rollback.go create mode 100644 internal/cni/switch.go diff --git a/.gitignore b/.gitignore index 30a43a92..8680e16b 100644 --- a/.gitignore +++ b/.gitignore @@ -37,4 +37,4 @@ dist/ build/ # Entrypoint for the application -!/cmd/d8 \ No newline at end of file +!/cmd/d8 diff --git a/cmd/commands/cni.go b/cmd/commands/cni.go new file mode 100644 index 00000000..229a6b86 --- /dev/null +++ b/cmd/commands/cni.go @@ -0,0 +1,158 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package commands + +import ( + "fmt" + "log" + "strings" + "time" + + "github.com/deckhouse/deckhouse-cli/internal/cni" + "github.com/go-logr/logr" + "github.com/spf13/cobra" + "k8s.io/kubectl/pkg/util/templates" + ctrllog "sigs.k8s.io/controller-runtime/pkg/log" +) + +var ( + cniSwitchLong = templates.LongDesc(` +A group of commands to switch the CNI (Container Network Interface) provider in the Deckhouse cluster. + +This process is divided into several steps: + + - 'd8 cni-switch prepare' - STEP 1. Prepares the cluster for CNI migration. + - 'd8 cni-switch switch' - STEP 2. Performs the actual CNI switch. + - 'd8 cni-switch cleanup' - STEP 3. Cleans up resources if the switch is aborted. 
+ - 'd8 cni-switch rollback' - (Optional) Rollback CNI if the switch is aborted.`) + + cniPrepareExample = templates.Examples(` + # Prepare to switch to Cilium CNI + d8 cni-switch prepare --to-cni cilium`) + + cniSwitchExample = templates.Examples(` + # Perform the CNI switch after the prepare step is complete + d8 cni-switch switch`) + + cniCleanupExample = templates.Examples(` + # Cleanup resources created by the 'prepare' command + d8 cni-switch cleanup`) + + cniRollbackExample = templates.Examples(` + # Rollback changes, restore previous CNI, cleanup resources created by the 'prepare' command + d8 cni-switch rollback`) + + supportedCNIs = []string{"cilium", "flannel", "simple-bridge"} +) + +func NewCniSwitchCommand() *cobra.Command { + log.SetFlags(0) + ctrllog.SetLogger(logr.Discard()) + + cmd := &cobra.Command{ + Use: "cni-switch", + Short: "A group of commands to switch CNI in the cluster", + Long: cniSwitchLong, + } + cmd.PersistentFlags().Duration("timeout", 1*time.Hour, "The timeout for the entire operation (e.g., 40m, 1h)") + cmd.AddCommand(NewCmdCniPrepare()) + cmd.AddCommand(NewCmdCniSwitch()) + cmd.AddCommand(NewCmdCniCleanup()) + cmd.AddCommand(NewCmdCniRollback()) + return cmd +} + +func NewCmdCniPrepare() *cobra.Command { + cmd := &cobra.Command{ + Use: "prepare", + Short: "Prepares the cluster for CNI switching", + Example: cniPrepareExample, + PreRunE: func(cmd *cobra.Command, args []string) error { + targetCNI, _ := cmd.Flags().GetString("to-cni") + for _, supported := range supportedCNIs { + if strings.ToLower(targetCNI) == supported { + return nil + } + } + return fmt.Errorf( + "invalid --to-cni value %q. Supported values are: %s", + targetCNI, + strings.Join(supportedCNIs, ", "), + ) + }, + + Run: func(cmd *cobra.Command, args []string) { + targetCNI, _ := cmd.Flags().GetString("to-cni") + timeout, _ := cmd.Flags().GetDuration("timeout") + if err := cni.RunPrepare(targetCNI, timeout); err != nil { + log.Fatalf("❌ Error running prepare command: %v", err) + } + }, + } + cmd.Flags().String("to-cni", "", fmt.Sprintf( + "Target CNI provider to switch to. Supported values: %s", + strings.Join(supportedCNIs, ", "), + )) + _ = cmd.MarkFlagRequired("to-cni") + + return cmd +} + +func NewCmdCniSwitch() *cobra.Command { + cmd := &cobra.Command{ + Use: "switch", + Short: "Performs the CNI switching", + Example: cniSwitchExample, + Run: func(cmd *cobra.Command, args []string) { + timeout, _ := cmd.Flags().GetDuration("timeout") + if err := cni.RunSwitch(timeout); err != nil { + log.Fatalf("❌ Error running switch command: %v", err) + } + }, + } + return cmd +} + +func NewCmdCniCleanup() *cobra.Command { + cmd := &cobra.Command{ + Use: "cleanup", + Short: "Cleans up resources created during CNI switching", + Example: cniCleanupExample, + Run: func(cmd *cobra.Command, args []string) { + timeout, _ := cmd.Flags().GetDuration("timeout") + if err := cni.RunCleanup(timeout); err != nil { + log.Fatalf("❌ Error running cleanup command: %v", err) + } + }, + } + return cmd +} + +func NewCmdCniRollback() *cobra.Command { // TDEN It needs to be done! 
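+	// TODO(TDEN): full rollback support is still to be implemented; for now the
+	// command reads the shared --timeout flag and delegates to cni.RunRollback.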
+ cmd := &cobra.Command{ + Use: "rollback", + Short: "Rollback all changes and restore previous CNI", + Example: cniCleanupExample, + Run: func(cmd *cobra.Command, args []string) { + timeout, _ := cmd.Flags().GetDuration("timeout") + if err := cni.RunRollback(timeout); err != nil { + log.Fatalf("❌ Error running rollback command: %v", err) + } + }, + } + return cmd +} diff --git a/cmd/commands/kubectl.go b/cmd/commands/kubectl.go index d029fdc0..5a65cfc7 100644 --- a/cmd/commands/kubectl.go +++ b/cmd/commands/kubectl.go @@ -25,6 +25,7 @@ import ( "regexp" "time" + "github.com/deckhouse/deckhouse-cli/internal/cni" "github.com/spf13/cobra" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/cli-runtime/pkg/genericclioptions" @@ -35,9 +36,7 @@ import ( ) const ( - cmNamespace = "d8-system" - cmName = "debug-container" - cmImageKey = "image" + cmImageKey = "debug-container-image" ) var d8CommandRegex = regexp.MustCompile("([\"'`])d8 (\\w+)") @@ -112,7 +111,7 @@ func getDebugImage(cmd *cobra.Command) (string, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - configMap, err := kubeCl.CoreV1().ConfigMaps(cmNamespace).Get(ctx, cmName, v1.GetOptions{}) + configMap, err := kubeCl.CoreV1().ConfigMaps(cni.CMDataNameSpace).Get(ctx, cni.CMDataName, v1.GetOptions{}) if err != nil { return "", ErrGenericImageFetch } diff --git a/cmd/d8/root.go b/cmd/d8/root.go index 46ccce7d..946c7936 100644 --- a/cmd/d8/root.go +++ b/cmd/d8/root.go @@ -98,6 +98,7 @@ func (r *RootCommand) registerCommands() { r.cmd.AddCommand(commands.NewKubectlCommand()) r.cmd.AddCommand(commands.NewLoginCommand()) r.cmd.AddCommand(commands.NewStrongholdCommand()) + r.cmd.AddCommand(commands.NewCniSwitchCommand()) r.cmd.AddCommand(commands.NewHelpJSONCommand(r.cmd)) r.cmd.AddCommand(plugins.NewPluginsCommand(r.logger.Named("plugins-command"))) diff --git a/internal/cni/api/v1alpha1/cni_migration_types.go b/internal/cni/api/v1alpha1/cni_migration_types.go new file mode 100644 index 00000000..5cf9a76d --- /dev/null +++ b/internal/cni/api/v1alpha1/cni_migration_types.go @@ -0,0 +1,75 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +genclient +// +genclient:nonNamespaced +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +k8s:openapi-gen=true + +// CNIMigration is the schema for the CNIMigration API. +// It is a cluster-level resource that serves as the "single source of truth" +// for the entire migration process. It defines the goal (targetCNI) +// and tracks the overall progress across all nodes. +type CNIMigration struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Spec defines the desired state of CNIMigration. + Spec CNIMigrationSpec `json:"spec,omitempty"` + // Status defines the observed state of CNIMigration. 
+ Status CNIMigrationStatus `json:"status,omitempty"` +} + +// CNIMigrationSpec defines the desired state of CNIMigration. +// +k8s:deepcopy-gen=true +type CNIMigrationSpec struct { + // TargetCNI is the CNI to switch to. + // Set by the d8 cli utility when starting Phase 1. + TargetCNI string `json:"targetCNI"` + // Phase is the phase controlled by the d8 cli to command the agents. + // Possible values: Prepare, Migrate, Cleanup, Abort. + Phase string `json:"phase"` +} + +// CNIMigrationStatus defines the observed state of CNIMigration. +// +k8s:deepcopy-gen=true +type CNIMigrationStatus struct { + // CurrentCNI is the detected CNI from which the switch is being made. + CurrentCNI string `json:"currentCNI,omitempty"` + // NodesTotal is the total number of nodes involved in the migration. + NodesTotal int `json:"nodesTotal,omitempty"` + // NodesSucceeded is the number of nodes that have successfully completed the current phase. + NodesSucceeded int `json:"nodesSucceeded,omitempty"` + // NodesFailed is the number of nodes where an error occurred. + NodesFailed int `json:"nodesFailed,omitempty"` + // Conditions reflect the state of the migration as a whole. + // The d8 cli aggregates statuses from all CNINodeMigrations here. + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// CNIMigrationList contains a list of CNIMigration. +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +type CNIMigrationList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []CNIMigration `json:"items"` +} diff --git a/internal/cni/api/v1alpha1/cni_node_migration_types.go b/internal/cni/api/v1alpha1/cni_node_migration_types.go new file mode 100644 index 00000000..b8536266 --- /dev/null +++ b/internal/cni/api/v1alpha1/cni_node_migration_types.go @@ -0,0 +1,63 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +genclient +// +genclient:nonNamespaced +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +k8s:openapi-gen=true + +// CNINodeMigration is the schema for the CNINodeMigration API. +// This resource is created for each node in the cluster. The Helper +// agent running on the node updates this resource to report its local progress. +// The d8 cli reads these resources to display detailed status. +type CNINodeMigration struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Spec can be empty, as all configuration is taken from the parent CNIMigration resource. + Spec CNINodeMigrationSpec `json:"spec,omitempty"` + // Status defines the observed state of CNINodeMigration. + Status CNINodeMigrationStatus `json:"status,omitempty"` +} + +// CNINodeMigrationSpec defines the desired state of CNINodeMigration. +// +k8s:deepcopy-gen=true +type CNINodeMigrationSpec struct { + // The spec can be empty, as all configuration is taken from the parent CNIMigration resource. 
+} + +// CNINodeMigrationStatus defines the observed state of CNINodeMigration. +// +k8s:deepcopy-gen=true +type CNINodeMigrationStatus struct { + // Phase is the phase of this particular node. + Phase string `json:"phase,omitempty"` + // Conditions are the detailed conditions reflecting the steps performed on the node. + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// CNINodeMigrationList contains a list of CNINodeMigration. +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +type CNINodeMigrationList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []CNINodeMigration `json:"items"` +} diff --git a/internal/cni/api/v1alpha1/register.go b/internal/cni/api/v1alpha1/register.go new file mode 100644 index 00000000..958d272f --- /dev/null +++ b/internal/cni/api/v1alpha1/register.go @@ -0,0 +1,50 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +const ( + APIGroup = "network.deckhouse.io" + APIVersion = "v1alpha1" +) + +// SchemeGroupVersion is group version used to register these objects +var ( + SchemeGroupVersion = schema.GroupVersion{ + Group: APIGroup, + Version: APIVersion, + } + SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes) + AddToScheme = SchemeBuilder.AddToScheme +) + +// Adds the list of known types to Scheme. +func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &CNIMigration{}, + &CNIMigrationList{}, + &CNINodeMigration{}, + &CNINodeMigrationList{}, + ) + metav1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/internal/cni/api/v1alpha1/zz_generated.deepcopy.go b/internal/cni/api/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 00000000..495df5c2 --- /dev/null +++ b/internal/cni/api/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,218 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *CNIMigration) DeepCopyInto(out *CNIMigration) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNIMigration. +func (in *CNIMigration) DeepCopy() *CNIMigration { + if in == nil { + return nil + } + out := new(CNIMigration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CNIMigration) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNIMigrationList) DeepCopyInto(out *CNIMigrationList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]CNIMigration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNIMigrationList. +func (in *CNIMigrationList) DeepCopy() *CNIMigrationList { + if in == nil { + return nil + } + out := new(CNIMigrationList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CNIMigrationList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNIMigrationSpec) DeepCopyInto(out *CNIMigrationSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNIMigrationSpec. +func (in *CNIMigrationSpec) DeepCopy() *CNIMigrationSpec { + if in == nil { + return nil + } + out := new(CNIMigrationSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNIMigrationStatus) DeepCopyInto(out *CNIMigrationStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNIMigrationStatus. +func (in *CNIMigrationStatus) DeepCopy() *CNIMigrationStatus { + if in == nil { + return nil + } + out := new(CNIMigrationStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNINodeMigration) DeepCopyInto(out *CNINodeMigration) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNINodeMigration. +func (in *CNINodeMigration) DeepCopy() *CNINodeMigration { + if in == nil { + return nil + } + out := new(CNINodeMigration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *CNINodeMigration) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNINodeMigrationList) DeepCopyInto(out *CNINodeMigrationList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]CNINodeMigration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNINodeMigrationList. +func (in *CNINodeMigrationList) DeepCopy() *CNINodeMigrationList { + if in == nil { + return nil + } + out := new(CNINodeMigrationList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *CNINodeMigrationList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNINodeMigrationSpec) DeepCopyInto(out *CNINodeMigrationSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNINodeMigrationSpec. +func (in *CNINodeMigrationSpec) DeepCopy() *CNINodeMigrationSpec { + if in == nil { + return nil + } + out := new(CNINodeMigrationSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CNINodeMigrationStatus) DeepCopyInto(out *CNINodeMigrationStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CNINodeMigrationStatus. +func (in *CNINodeMigrationStatus) DeepCopy() *CNINodeMigrationStatus { + if in == nil { + return nil + } + out := new(CNINodeMigrationStatus) + in.DeepCopyInto(out) + return out +} diff --git a/internal/cni/cleanup.go b/internal/cni/cleanup.go new file mode 100644 index 00000000..53b15fdc --- /dev/null +++ b/internal/cni/cleanup.go @@ -0,0 +1,330 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cni + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// RunCleanup executes the logic for the 'cni-switch cleanup' command. +// It performs a robust cleanup by deleting a known list of resources +// and waiting for them to be fully terminated. +func RunCleanup(timeout time.Duration) error { + startTime := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + fmt.Printf("🚀 Starting CNI switch cleanup (global timeout: %s)\n", timeout) + + // 1. Create Kubernetes client + safeClient, err := saferequest.NewSafeClient() + if err != nil { + return fmt.Errorf("creating safe client: %w", err) + } + + rtClient, err := safeClient.NewRTClient(v1alpha1.AddToScheme) + if err != nil { + return fmt.Errorf("creating runtime client: %w", err) + } + fmt.Printf("✅ Kubernetes client created\n") + + // 2. Delete cluster-scoped resources first, with waiting + fmt.Println("\nDeleting cluster-scoped resources...") + webhookConfig := &admissionregistrationv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{Name: webhookConfigName}, + } + if err = deleteAndWait(ctx, rtClient, webhookConfig); err != nil { + return err + } + + // 3. Stop active controllers first to prevent reconciliation loops + fmt.Printf(" Stopping active controllers in '%s'...\n", cniSwitchNamespace) + helperDs := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Name: switchHelperDaemonSetName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, helperDs); err != nil { + return err + } + webhookDep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: webhookDeploymentName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, webhookDep); err != nil { + return err + } + fmt.Println("✅ Active controllers stopped") + + // 4. Delete all CNINodeMigration resources + fmt.Println("\nDeleting all CNINodeMigration resources...") + nodeMigrations := &v1alpha1.CNINodeMigrationList{} + if err = rtClient.List(ctx, nodeMigrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { + return fmt.Errorf("listing CNINodeMigrations: %w", err) + } + for _, nm := range nodeMigrations.Items { + // Remove finalizers to ensure deletion even if controller is down + if err = removeFinalizers(ctx, rtClient, &nm); err != nil { + fmt.Printf("⚠️ Warning: failed to remove finalizers from %s: %v\n", nm.Name, err) + } + if err = deleteAndWait(ctx, rtClient, &nm); err != nil { + return err + } + } + fmt.Println("✅ All CNINodeMigration resources deleted") + + // 5. 
Delete all CNIMigration resources + fmt.Println("\nDeleting all CNIMigration resources...") + migrations := &v1alpha1.CNIMigrationList{} + if err = rtClient.List(ctx, migrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { + return fmt.Errorf("listing CNIMigrations: %w", err) + } + for _, m := range migrations.Items { + // Remove finalizers to ensure deletion even if controller is down + if err = removeFinalizers(ctx, rtClient, &m); err != nil { + fmt.Printf("⚠️ Warning: failed to remove finalizers from %s: %v\n", m.Name, err) + } + if err = deleteAndWait(ctx, rtClient, &m); err != nil { + return err + } + } + fmt.Println("✅ All CNIMigration resources deleted") + + // 6. Remove annotations from all pods + if err = removePodAnnotations(ctx, rtClient); err != nil { + // Non-fatal, print a warning + fmt.Printf("⚠️ Warning: failed to remove all pod annotations: %v\n", err) + } + + // 7. Delete RBAC resources + fmt.Println("\nDeleting RBAC resources...") + clusterRoleHelper := &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{Name: switchHelperClusterRoleName}, + } + if err = deleteAndWait(ctx, rtClient, clusterRoleHelper); err != nil { + return err + } + clusterRoleWebhook := &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{Name: webhookClusterRoleName}, + } + if err = deleteAndWait(ctx, rtClient, clusterRoleWebhook); err != nil { + return err + } + clusterRoleBindingHelper := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: switchHelperClusterRoleBindingName}, + } + if err = deleteAndWait(ctx, rtClient, clusterRoleBindingHelper); err != nil { + return err + } + clusterRoleBindingWebhook := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: webhookClusterRoleBindingName}, + } + if err = deleteAndWait(ctx, rtClient, clusterRoleBindingWebhook); err != nil { + return err + } + fmt.Println("✅ Cluster-scoped RBAC resources deleted") + + // 8. Delete remaining namespaced resources + fmt.Printf("\nDeleting remaining namespaced resources in '%s'...\n", cniSwitchNamespace) + webhookService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: webhookServiceName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, webhookService); err != nil { + return err + } + webhookSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: webhookSecretName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, webhookSecret); err != nil { + return err + } + helperSA := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{Name: switchHelperServiceAccountName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, helperSA); err != nil { + return err + } + webhookSA := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{Name: webhookServiceAccountName, Namespace: cniSwitchNamespace}, + } + if err = deleteAndWait(ctx, rtClient, webhookSA); err != nil { + return err + } + fmt.Println("✅ Remaining namespaced resources deleted") + + // 9. Delete the namespace + fmt.Printf("\nDeleting namespace '%s'...\n", cniSwitchNamespace) + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: cniSwitchNamespace}} + if err = deleteAndWait(ctx, rtClient, ns); err != nil { + return err + } + fmt.Println("✅ Namespace successfully deleted") + + fmt.Printf("\n🎉 CNI switch cleanup successfully completed (total time: %s)\n", + time.Since(startTime).Round(time.Second), + ) + return nil +} + +// deleteAndWait deletes a resource and waits for it to be fully terminated. 
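+// Deletion is idempotent: a NotFound response is treated as success, and the wait
+// polls every two seconds until the object is gone or the context deadline expires.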
+func deleteAndWait(ctx context.Context, rtClient client.Client, obj client.Object) error { + if err := deleteResource(ctx, rtClient, obj); err != nil { + // This handles the "already deleted" case, so we don't need to check again. + return err + } + return waitForResourceDeletion(ctx, rtClient, obj) +} + +// deleteResource just initiates deletion for a Kubernetes resource. +func deleteResource(ctx context.Context, rtClient client.Client, obj client.Object) error { + kind := getKind(obj) + name := obj.GetName() + ns := obj.GetNamespace() + + fmt.Printf("- Deleting %s '%s%s'... ", kind, name, func() string { + if ns != "" { + return fmt.Sprintf("' in namespace '%s", ns) + } + return "" + }()) + + err := rtClient.Delete(ctx, obj, client.PropagationPolicy(metav1.DeletePropagationBackground)) + if err != nil { + if errors.IsNotFound(err) { + fmt.Println("already deleted.") + return nil + } + return fmt.Errorf("failed to delete %s '%s': %w", kind, name, err) + } + + fmt.Println("deleted.") + return nil +} + +// waitForResourceDeletion polls until a resource is confirmed to be gone. +func waitForResourceDeletion(ctx context.Context, rtClient client.Client, obj client.Object) error { + key := client.ObjectKey{Name: obj.GetName(), Namespace: obj.GetNamespace()} + kind := getKind(obj) + + // Check if the resource is already gone before starting to wait. + if err := rtClient.Get(ctx, key, obj); err != nil { + if errors.IsNotFound(err) { + // It was already gone, no need to wait. + return nil + } + return fmt.Errorf("getting %s '%s': %w", kind, key.Name, err) + } + + fmt.Printf(" Waiting for %s '%s' to terminate... ", kind, key.Name) + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + fmt.Println("timeout.") + return fmt.Errorf("timed out waiting for %s '%s' to be deleted", kind, key.Name) + case <-ticker.C: + err := rtClient.Get(ctx, key, obj) + if err != nil { + if errors.IsNotFound(err) { + fmt.Println("terminated.") + return nil // Success! + } + if strings.Contains(err.Error(), "no matches for kind") { + fmt.Println("Kind not known, assuming terminated.") + return nil + } + fmt.Printf("error: %v\n", err) + return fmt.Errorf("getting %s '%s': %w", kind, key.Name, err) + } + } + } +} + +func removePodAnnotations(ctx context.Context, rtClient client.Client) error { + fmt.Println("\nRemoving CNI switch annotations from all pods...") + + podList := &corev1.PodList{} + if err := rtClient.List(ctx, podList); err != nil { + return fmt.Errorf("listing all pods: %w", err) + } + + podsPatched := 0 + for _, pod := range podList.Items { + if pod.Spec.HostNetwork { + continue + } + + if _, ok := pod.Annotations[EffectiveCNIAnnotation]; ok { + // Use a merge patch to remove just the one annotation + patch := []byte(fmt.Sprintf(`{"metadata":{"annotations":{"%s":null}}}`, EffectiveCNIAnnotation)) + err := rtClient.Patch(ctx, &pod, client.RawPatch(client.Merge.Type(), patch)) + if err != nil { + if errors.IsNotFound(err) { + // No need to print warning, just continue + continue + } + fmt.Printf("\n⚠️ Warning: failed to patch pod %s/%s: %v", pod.Namespace, pod.Name, err) + continue + } + podsPatched++ + fmt.Printf("\r Patched %d pods...", podsPatched) + } + } + + if podsPatched > 0 { + fmt.Printf("\n✅ Removed annotations from %d pods.\n", podsPatched) + } else { + fmt.Print("✅ No pods with CNI switch annotations were found.\n") + } + + return nil +} + +// getKind extracts a user-friendly kind from a runtime object. 
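+// If the GroupVersionKind is not populated (typical for typed objects constructed
+// in code), it falls back to the object's Go type name.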
+func getKind(obj client.Object) string { + kind := obj.GetObjectKind().GroupVersionKind().Kind + if kind == "" { + t := fmt.Sprintf("%T", obj) + parts := strings.Split(t, ".") + if len(parts) > 0 { + return parts[len(parts)-1] + } + } + return kind +} + +// removeFinalizers patches the object to remove all finalizers. +func removeFinalizers(ctx context.Context, rtClient client.Client, obj client.Object) error { + if len(obj.GetFinalizers()) == 0 { + return nil + } + + patch := []byte(`{"metadata":{"finalizers":null}}`) + return rtClient.Patch(ctx, obj, client.RawPatch(client.Merge.Type(), patch)) +} diff --git a/internal/cni/common.go b/internal/cni/common.go new file mode 100644 index 00000000..0fc646f3 --- /dev/null +++ b/internal/cni/common.go @@ -0,0 +1,141 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "bufio" + "context" + "fmt" + "os" + "strings" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + // Image ConfigMap from Deckhouse + CMDataName = "d8-cli-data" + CMDataNameSpace = "d8-system" + cmDataHelperImage = "cni-switch-helper-image" + + // Namespace where the helper and webhook are deployed + cniSwitchNamespace = "cni-switch" + + // Service Account Names + switchHelperServiceAccountName = "cni-switch-helper-sa" + webhookServiceAccountName = "cni-switch-webhook-sa" + + // DaemonSet Names + switchHelperDaemonSetName = "cni-switch-helper" + + // Cluster Role and Binding Names + switchHelperClusterRoleName = "d8:cni-switch-helper" + webhookClusterRoleName = "d8:cni-switch-webhook" + switchHelperClusterRoleBindingName = "d8:cni-switch-helper" + webhookClusterRoleBindingName = "d8:cni-switch-webhook" + + // Webhook Resources + webhookDeploymentName = "cni-switch-webhook" + webhookServiceName = "cni-switch-webhook-service" + webhookSecretName = "cni-switch-webhook-tls" + webhookConfigName = "cni-switch-pod-annotator" + webhookConfigurationName = "annotator.cni-switch.deckhouse.io" + webhookPort = 9443 + + // Annotations + EffectiveCNIAnnotation = "effective-cni.network.deckhouse.io" +) + +var ( + CNIModuleConfigs = []string{"cni-cilium", "cni-flannel", "cni-simple-bridge"} + + moduleConfigGVK = schema.GroupVersionKind{ + Group: "deckhouse.io", + Version: "v1alpha1", + Kind: "ModuleConfig", + } +) + +// AskForConfirmation displays a warning and prompts the user for confirmation. +func AskForConfirmation(commandName string) (bool, error) { + reader := bufio.NewReader(os.Stdin) + + fmt.Println("--------------------------------------------------------------------------------") + fmt.Println("⚠️ IMPORTANT: PLEASE READ CAREFULLY") + fmt.Println("--------------------------------------------------------------------------------") + fmt.Println() + fmt.Printf("You are about to run the '%s' step of the CNI switch process. Please ensure that:\n\n", commandName) + fmt.Println("1. 
External cluster management systems (CI/CD, GitOps like ArgoCD, Flux)") + fmt.Println(" are temporarily disabled. They might interfere with the CNI switch process") + fmt.Println(" by reverting changes made by this tool.") + fmt.Println() + fmt.Println("2. You have sufficient administrative privileges for this cluster to perform") + fmt.Println(" the required actions (modifying ModuleConfigs, deleting pods, etc.).") + fmt.Println() + fmt.Println("3. The utility does not configure CNI modules in the cluster; it only enables/disables") + fmt.Println(" them via ModuleConfig during operation. The user must independently prepare the") + fmt.Println(" ModuleConfig configuration for the target CNI.") + fmt.Println() + fmt.Println("Once the process starts, no active intervention is required from you.") + fmt.Println() + fmt.Print("Do you want to continue? (y/n): ") + + for { + response, err := reader.ReadString('\n') + if err != nil { + return false, err + } + + response = strings.ToLower(strings.TrimSpace(response)) + + switch response { + case "y", "yes": + fmt.Println() + return true, nil + case "n", "no": + fmt.Println() + return false, nil + default: + fmt.Print("Invalid input. Please enter 'y/yes' or 'n/no'): ") + } + } +} + +// FindActiveMigration searches for an existing CNIMigration resource. +// It returns an error if more than one migration is found. +func FindActiveMigration(ctx context.Context, rtClient client.Client) (*v1alpha1.CNIMigration, error) { + migrationList := &v1alpha1.CNIMigrationList{} + if err := rtClient.List(ctx, migrationList); err != nil { + return nil, fmt.Errorf("listing CNIMigration objects: %w", err) + } + + if len(migrationList.Items) == 0 { + return nil, nil // No migration found + } + + if len(migrationList.Items) > 1 { + return nil, fmt.Errorf( + "found %d CNI migration objects, which is an inconsistent state. "+ + "Please run 'd8 cni-switch cleanup' to resolve this", + len(migrationList.Items), + ) + } + + return &migrationList.Items[0], nil +} diff --git a/internal/cni/prepare.go b/internal/cni/prepare.go new file mode 100644 index 00000000..b23d28eb --- /dev/null +++ b/internal/cni/prepare.go @@ -0,0 +1,586 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "bytes" + "context" + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "fmt" + "math/big" + "strings" + "time" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// RunPrepare executes the logic for the 'cni-switch prepare' command. +func RunPrepare(targetCNI string, timeout time.Duration) error { + // 0. 
Ask for user confirmation + confirmed, err := AskForConfirmation("prepare") + if err != nil { + return fmt.Errorf("asking for confirmation: %w", err) + } + if !confirmed { + fmt.Println("Operation cancelled by user.") + return nil + } + + startTime := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + fmt.Printf("🚀 Starting CNI switch preparation for target '%s' (global timeout: %s)\n", + targetCNI, timeout) + + // 1. Create a Kubernetes client + safeClient, err := saferequest.NewSafeClient() + if err != nil { + return fmt.Errorf("creating safe client: %w", err) + } + + rtClient, err := safeClient.NewRTClient(v1alpha1.AddToScheme) + if err != nil { + return fmt.Errorf("creating runtime client: %w", err) + } + fmt.Printf("✅ Kubernetes client created (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 2. Find an existing migration or create a new one + activeMigration, err := getOrCreateMigrationForPrepare(ctx, rtClient, targetCNI) + if err != nil { + return err + } + if activeMigration == nil { + // This means preparation is already complete, and the user has been notified. + return nil + } + fmt.Printf( + "✅ Working with CNIMigration '%s' (total elapsed: %s)\n\n", + activeMigration.Name, + time.Since(startTime).Round(time.Millisecond), + ) + + // 3. Detect current CNI and update migration status + if activeMigration.Status.CurrentCNI == "" { + var currentCNI string + currentCNI, err = detectCurrentCNI(rtClient) + if err != nil { + return fmt.Errorf("detecting current CNI: %w", err) + } + fmt.Printf("Detected current CNI: '%s'\n", currentCNI) + + if currentCNI == targetCNI { + return fmt.Errorf("target CNI '%s' is the same as the current CNI. Nothing to do", targetCNI) + } + + activeMigration.Status.CurrentCNI = currentCNI + err = rtClient.Status().Update(ctx, activeMigration) + if err != nil { + return fmt.Errorf("updating migration status with current CNI: %w", err) + } + fmt.Printf( + "✅ Added current CNI to migration status (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond), + ) + } + + // 4. Create the dedicated namespace + fmt.Printf("Creating dedicated namespace '%s'...\n", cniSwitchNamespace) + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: cniSwitchNamespace}} + if err = rtClient.Create(ctx, ns); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating namespace %s: %w", cniSwitchNamespace, err) + } + fmt.Printf("✅ Namespace created (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 5. Get the helper image name from the configmap + cm := &corev1.ConfigMap{} + if err = rtClient.Get(ctx, client.ObjectKey{Name: CMDataName, Namespace: CMDataNameSpace}, cm); err != nil { + return fmt.Errorf("getting %s configmap: %w", CMDataName, err) + } + imageName, ok := cm.Data[cmDataHelperImage] + if !ok || imageName == "" { + return fmt.Errorf("%s not found or empty in %s configmap", cmDataHelperImage, CMDataName) + } + + // 6. 
Apply RBAC + fmt.Println("Applying RBAC...") + // Helper RBAC + helperSA := getSwitchHelperServiceAccount(cniSwitchNamespace) + if err = rtClient.Create(ctx, helperSA); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating helper service account: %w", err) + } + fmt.Printf("- Helper's ServiceAccount '%s' created\n", helperSA.Name) + + helperRole := getSwitchHelperClusterRole() + if err = rtClient.Create(ctx, helperRole); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating cluster role: %w", err) + } + fmt.Printf("- Helper's ClusterRole '%s' created\n", helperRole.Name) + + helperBinding := getSwitchHelperClusterRoleBinding(cniSwitchNamespace) + if err = rtClient.Create(ctx, helperBinding); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating cluster role binding: %w", err) + } + fmt.Printf("- Helper's ClusterRoleBinding '%s' created\n", helperBinding.Name) + + // Webhook RBAC + webhookSA := getWebhookServiceAccount(cniSwitchNamespace) + if err = rtClient.Create(ctx, webhookSA); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook service account: %w", err) + } + fmt.Printf("- Webhook's ServiceAccount '%s' created\n", webhookSA.Name) + + webhookRole := getWebhookClusterRole() + if err = rtClient.Create(ctx, webhookRole); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook cluster role: %w", err) + } + fmt.Printf("- Webhook's ClusterRole '%s' created\n", webhookRole.Name) + + webhookBinding := getWebhookClusterRoleBinding(cniSwitchNamespace) + if err = rtClient.Create(ctx, webhookBinding); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook cluster role binding: %w", err) + } + fmt.Printf("- Webhook's ClusterRoleBinding '%s' created\n", webhookBinding.Name) + fmt.Printf("✅ RBAC applied (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 7. 
Create and wait for the mutating webhook + fmt.Println("Deploying Mutating Webhook for annotating new pods...") + // Generate certificates + caCert, serverCert, serverKey, err := generateWebhookCertificates(cniSwitchNamespace) + if err != nil { + return fmt.Errorf("generating webhook certificates: %w", err) + } + fmt.Printf("- TLS certificate generated\n") + + // Create TLS secret + tlsSecret := getWebhookTLSSecret(cniSwitchNamespace, serverCert, serverKey) + if err = rtClient.Create(ctx, tlsSecret); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook tls secret: %w", err) + } + fmt.Printf("- Secret with TLS certificate '%s' created\n", tlsSecret.Name) + + // Create Deployment + webhookDeployment := getWebhookDeployment(cniSwitchNamespace, imageName, webhookServiceAccountName) + if err = rtClient.Create(ctx, webhookDeployment); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook deployment: %w", err) + } + + // Wait for Deployment to be ready + if err = waitForDeploymentReady(ctx, rtClient, webhookDeployment); err != nil { + return fmt.Errorf("waiting for webhook deployment ready: %w", err) + } + fmt.Printf("- Webhook Deployment '%s' created\n", webhookDeployment.Name) + + // Create Service + webhookService := getWebhookService(cniSwitchNamespace) + if err = rtClient.Create(ctx, webhookService); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating webhook service: %w", err) + } + fmt.Printf("- Webhook Service '%s' created\n", webhookService.Name) + + // Create MutatingWebhookConfiguration + webhookConfig := getMutatingWebhookConfiguration(cniSwitchNamespace, caCert) + if err = rtClient.Create(ctx, webhookConfig); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("creating mutating webhook configuration: %w", err) + } + fmt.Printf("✅ Mutating Webhook '%s' is active (total elapsed: %s)\n\n", + webhookConfig.Name, time.Since(startTime).Round(time.Millisecond)) + + // 8. Create and wait for the Helper daemonset + dsKey := client.ObjectKey{Name: switchHelperDaemonSetName, Namespace: cniSwitchNamespace} + ds := &appsv1.DaemonSet{} + if err = rtClient.Get(ctx, dsKey, ds); err != nil { + if !errors.IsNotFound(err) { + return fmt.Errorf("getting helper daemonset: %w", err) + } + fmt.Printf("Creating helper DaemonSet '%s'...\n", switchHelperDaemonSetName) + dsToCreate := getSwitchHelperDaemonSet(cniSwitchNamespace, imageName) + if err = rtClient.Create(ctx, dsToCreate); err != nil { + return fmt.Errorf("creating helper daemonset: %w", err) + } + ds = dsToCreate + } else { + fmt.Printf("Helper DaemonSet '%s' already exists.\n", switchHelperDaemonSetName) + } + + if err = waitForDaemonSetReady(ctx, rtClient, ds); err != nil { + return fmt.Errorf("waiting for daemonset ready: %w", err) + } + fmt.Printf("✅ Helper DaemonSet is ready (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 9. Wait for all nodes to be prepared + fmt.Println("Waiting for all nodes to complete the preparation step...") + err = waitForNodesPrepared(ctx, rtClient) + if err != nil { + return fmt.Errorf("waiting for nodes to be prepared: %w", err) + } + fmt.Printf("✅ All CNINodeMigrations are created and all nodes are prepared (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 10. 
Update overall status + activeMigration.Status.Conditions = append(activeMigration.Status.Conditions, metav1.Condition{ + Type: "PreparationSucceeded", + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Reason: "AllNodesPrepared", + Message: "All nodes successfully completed the preparation step.", + }) + + err = rtClient.Status().Update(ctx, activeMigration) + if err != nil { + return fmt.Errorf("updating CNIMigration status to prepared: %w", err) + } + + fmt.Printf( + "🎉 Cluster successfully prepared for CNI switch (total time: %s)\n", + time.Since(startTime).Round(time.Second), + ) + fmt.Println("\nYou can now run 'd8 cni-switch switch' to proceed") + + return nil +} + +// generateWebhookCertificates creates a self-signed CA and a server certificate for the webhook. +func generateWebhookCertificates(namespace string) (caCert, serverCert, serverKey []byte, err error) { + caSerialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + caSerialNumber, err := rand.Int(rand.Reader, caSerialNumberLimit) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to generate CA serial number: %w", err) + } + + // CA configuration + ca := &x509.Certificate{ + SerialNumber: caSerialNumber, + Subject: pkix.Name{ + Organization: []string{"deckhouse.io"}, + }, + NotBefore: time.Now(), + NotAfter: time.Now().AddDate(1, 0, 0), + IsCA: true, + ExtKeyUsage: []x509.ExtKeyUsage{ + x509.ExtKeyUsageClientAuth, + x509.ExtKeyUsageServerAuth, + }, + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign, + BasicConstraintsValid: true, + } + + caPrivKey, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + return nil, nil, nil, fmt.Errorf("generating CA private key: %w", err) + } + + caBytes, err := x509.CreateCertificate(rand.Reader, ca, ca, &caPrivKey.PublicKey, caPrivKey) + if err != nil { + return nil, nil, nil, fmt.Errorf("creating CA certificate: %w", err) + } + + caPEM := new(bytes.Buffer) + _ = pem.Encode(caPEM, &pem.Block{ + Type: "CERTIFICATE", + Bytes: caBytes, + }) + + serverSerialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serverSerialNumber, err := rand.Int(rand.Reader, serverSerialNumberLimit) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to generate server serial number: %w", err) + } + + // Server certificate configuration + commonName := fmt.Sprintf("%s.%s.svc", webhookServiceName, namespace) + cert := &x509.Certificate{ + SerialNumber: serverSerialNumber, + Subject: pkix.Name{ + CommonName: commonName, + Organization: []string{"deckhouse.io"}, + }, + DNSNames: []string{commonName}, + NotBefore: time.Now(), + NotAfter: time.Now().AddDate(1, 0, 0), + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + KeyUsage: x509.KeyUsageDigitalSignature, + } + + serverPrivKey, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + return nil, nil, nil, fmt.Errorf("generating server private key: %w", err) + } + + serverCertBytes, err := x509.CreateCertificate(rand.Reader, cert, ca, &serverPrivKey.PublicKey, caPrivKey) + if err != nil { + return nil, nil, nil, fmt.Errorf("creating server certificate: %w", err) + } + + serverCertPEM := new(bytes.Buffer) + _ = pem.Encode(serverCertPEM, &pem.Block{ + Type: "CERTIFICATE", + Bytes: serverCertBytes, + }) + + serverPrivKeyPEM := new(bytes.Buffer) + _ = pem.Encode(serverPrivKeyPEM, &pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(serverPrivKey), + }) + + return caPEM.Bytes(), serverCertPEM.Bytes(), serverPrivKeyPEM.Bytes(), nil +} + +func 
getOrCreateMigrationForPrepare( + ctx context.Context, rtClient client.Client, targetCNI string, +) (*v1alpha1.CNIMigration, error) { + activeMigration, err := FindActiveMigration(ctx, rtClient) + if err != nil { + return nil, fmt.Errorf("failed to find active migration: %w", err) + } + + if activeMigration != nil { + fmt.Printf("Found active CNIMigration '%s'\n", activeMigration.Name) + + // If an active migration is found, ensure its target CNI matches the requested target CNI. + if activeMigration.Spec.TargetCNI != targetCNI { + return nil, fmt.Errorf( + "an active CNI migration to '%s' is already in progress. "+ + "Cannot prepare for '%s'. To change the target CNI, "+ + "please run 'd8 cni-switch cleanup' first to reset the state", + activeMigration.Spec.TargetCNI, + targetCNI, + ) + } + + // Check if preparation is already done + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { + fmt.Println("🎉 Cluster has already been prepared for CNI switch.") + fmt.Println("\nYou can now run 'd8 cni-switch switch' to proceed.") + return nil, nil // Signal to the caller that we can exit gracefully + } + } + + // Ensure the migration is in the 'Prepare' phase + if activeMigration.Spec.Phase != "Prepare" { + return nil, fmt.Errorf( + "an active migration is already in the '%s' phase. "+ + "Cannot run 'prepare' again. To proceed, run 'd8 cni-switch switch'. "+ + "To start over, run 'd8 cni-switch cleanup'", + activeMigration.Spec.Phase, + ) + } + + return activeMigration, nil + } + migrationName := fmt.Sprintf("cni-migration-%s", time.Now().Format("20060102-150405")) + fmt.Printf("No active migration found. Creating a new one...\n") + + newMigration := &v1alpha1.CNIMigration{ + ObjectMeta: metav1.ObjectMeta{ + Name: migrationName, + }, + Spec: v1alpha1.CNIMigrationSpec{ + TargetCNI: targetCNI, + Phase: "Prepare", + }, + } + + err = rtClient.Create(ctx, newMigration) + if err != nil { + if errors.IsAlreadyExists(err) { + fmt.Println("Migration object was created by another process. Getting it.") + err = rtClient.Get(ctx, client.ObjectKey{Name: migrationName}, newMigration) + if err != nil { + return nil, fmt.Errorf("getting existing CNIMigration object: %w", err) + } + return newMigration, nil + } + return nil, fmt.Errorf("creating new CNIMigration object: %w", err) + } + + fmt.Printf("Successfully created CNIMigration object '%s'\n", newMigration.Name) + return newMigration, nil +} + +func waitForDaemonSetReady(ctx context.Context, rtClient client.Client, ds *appsv1.DaemonSet) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + key := client.ObjectKey{Name: ds.Name, Namespace: ds.Namespace} + err := rtClient.Get(ctx, key, ds) + if err != nil { + fmt.Printf("\n⚠️ Warning: could not get DaemonSet status: %v\n", err) + continue + } + + // This is the exit condition for the loop. + if ds.Status.DesiredNumberScheduled == ds.Status.NumberReady && ds.Status.NumberUnavailable == 0 { + fmt.Printf( + "\r Waiting for DaemonSet... %d/%d pods ready\n", + ds.Status.NumberReady, + ds.Status.DesiredNumberScheduled, + ) + return nil + } + + // This is the progress update. + fmt.Printf( + "\r Waiting for DaemonSet... 
%d/%d pods ready", + ds.Status.NumberReady, + ds.Status.DesiredNumberScheduled, + ) + } + } +} + +func waitForDeploymentReady(ctx context.Context, rtClient client.Client, dep *appsv1.Deployment) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + key := client.ObjectKey{Name: dep.Name, Namespace: dep.Namespace} + err := rtClient.Get(ctx, key, dep) + if err != nil { + fmt.Printf("\n⚠️ Warning: could not get Deployment status: %v\n", err) + continue + } + + // This is the exit condition for the loop. + if dep.Spec.Replicas != nil && dep.Status.ReadyReplicas >= + *dep.Spec.Replicas && dep.Status.UnavailableReplicas == 0 { + fmt.Printf( + "\r Waiting for Deployment... %d/%d replicas ready\n", + dep.Status.ReadyReplicas, + *dep.Spec.Replicas, + ) + return nil + } + + // This is the progress update. + if dep.Spec.Replicas != nil { + fmt.Printf( + "\r Waiting for Deployment... %d/%d replicas ready", + dep.Status.ReadyReplicas, + *dep.Spec.Replicas, + ) + } + } + } +} + +func waitForNodesPrepared(ctx context.Context, rtClient client.Client) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + nodes := &corev1.NodeList{} + if err := rtClient.List(ctx, nodes); err != nil { + fmt.Printf("⚠️ Warning: could not list nodes: %v\n", err) + continue + } + totalNodes := len(nodes.Items) + + migrations := &v1alpha1.CNINodeMigrationList{} + if err := rtClient.List(ctx, migrations); err != nil { + fmt.Printf("⚠️ Warning: could not list node migrations: %v\n", err) + continue + } + + readyNodes := 0 + for _, migration := range migrations.Items { + for _, cond := range migration.Status.Conditions { + if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { + readyNodes++ + break + } + } + } + + fmt.Printf("\r Progress: %d/%d nodes prepared...", readyNodes, totalNodes) + + if readyNodes >= totalNodes && totalNodes > 0 { + fmt.Printf("\r Progress: %d/%d nodes prepared...\n", readyNodes, totalNodes) + return nil + } + } + } +} + +func detectCurrentCNI(rtClient client.Client) (string, error) { + var enabledCNIs []string + for _, cniModule := range CNIModuleConfigs { + mc := &unstructured.Unstructured{} + mc.SetGroupVersionKind(moduleConfigGVK) + + err := rtClient.Get(context.Background(), client.ObjectKey{Name: cniModule}, mc) + if err != nil { + if errors.IsNotFound(err) { + continue + } + return "", fmt.Errorf("getting module config %s: %w", cniModule, err) + } + + enabled, found, err := unstructured.NestedBool(mc.Object, "spec", "enabled") + if err != nil { + return "", fmt.Errorf("parsing 'spec.enabled' for module config %s: %w", cniModule, err) + } + + if found && enabled { + cniName := strings.TrimPrefix(cniModule, "cni-") + enabledCNIs = append(enabledCNIs, cniName) + } + } + + if len(enabledCNIs) == 0 { + return "", fmt.Errorf("no enabled CNI module found. Looked for: %s", strings.Join(CNIModuleConfigs, ", ")) + } + + if len(enabledCNIs) > 1 { + return "", fmt.Errorf( + "found multiple enabled CNI modules: %s. 
Please disable all but one", + strings.Join(enabledCNIs, ", "), + ) + } + + return enabledCNIs[0], nil +} diff --git a/internal/cni/resources.go b/internal/cni/resources.go new file mode 100644 index 00000000..f7801ccb --- /dev/null +++ b/internal/cni/resources.go @@ -0,0 +1,537 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +// --- Helper Resources --- + +// getSwitchHelperDaemonSet returns a DaemonSet object. +func getSwitchHelperDaemonSet(namespace, imageName string) *appsv1.DaemonSet { + rootID := int64(0) + truePtr := true + terminationGracePeriodSeconds := int64(5) + mountPropagationBidirectional := corev1.MountPropagationBidirectional + hostPathDirectoryOrCreate := corev1.HostPathDirectoryOrCreate + hostPathDirectory := corev1.HostPathDirectory + + return &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: switchHelperDaemonSetName, + Namespace: namespace, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": switchHelperDaemonSetName}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": switchHelperDaemonSetName}, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: switchHelperServiceAccountName, + Containers: []corev1.Container{ + { + Name: "helper", + Image: imageName, + Ports: []corev1.ContainerPort{ + { + Name: "healthz", + ContainerPort: 8081, + Protocol: corev1.ProtocolTCP, + }, + }, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.FromString("healthz"), + }, + }, + InitialDelaySeconds: 15, + PeriodSeconds: 20, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.FromString("healthz"), + }, + }, + InitialDelaySeconds: 5, + PeriodSeconds: 10, + }, + Env: []corev1.EnvVar{ + { + Name: "NODE_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "spec.nodeName", + }, + }, + }, + { + Name: "KUBERNETES_SERVICE_HOST", + Value: "127.0.0.1", + }, + { + Name: "KUBERNETES_SERVICE_PORT", + Value: "6445", + }, + }, + SecurityContext: &corev1.SecurityContext{ + Privileged: &truePtr, + RunAsUser: &rootID, + RunAsGroup: &rootID, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "host-proc", + MountPath: "/host/proc", + ReadOnly: true, + }, + { + Name: "host-sys", + MountPath: "/host/sys", + ReadOnly: true, + }, + { + Name: "cni-net-d", + MountPath: "/etc/cni/net.d", + }, + { + Name: "cni-bin", + MountPath: "/opt/cni/bin", + }, + { + Name: "host-run", + MountPath: "/run", + }, + { + Name: "host-bpf", + MountPath: "/sys/fs/bpf", + MountPropagation: 
&mountPropagationBidirectional, + }, + { + Name: "host-lib-modules", + MountPath: "/lib/modules", + ReadOnly: true, + }, + { + Name: "host-var-lib-cni", + MountPath: "/var/lib/cni", + }, + }, + }, + }, + Tolerations: []corev1.Toleration{ + { + Operator: corev1.TolerationOpExists, + }, + }, + PriorityClassName: "system-node-critical", + HostNetwork: true, + HostPID: true, + HostIPC: true, + TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, + Volumes: []corev1.Volume{ + { + Name: "host-proc", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/proc"}, + }, + }, + { + Name: "host-sys", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/sys"}, + }, + }, + { + Name: "cni-net-d", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/etc/cni/net.d"}, + }, + }, + { + Name: "cni-bin", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/opt/cni/bin"}, + }, + }, + { + Name: "host-run", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/run"}, + }, + }, + { + Name: "host-bpf", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/sys/fs/bpf", + Type: &hostPathDirectoryOrCreate, + }, + }, + }, + { + Name: "host-lib-modules", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/lib/modules", + Type: &hostPathDirectory, + }, + }, + }, + { + Name: "host-var-lib-cni", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/lib/cni", + Type: &hostPathDirectoryOrCreate, + }, + }, + }, + }, + }, + }, + }, + } +} + +// --- Webhook Resources --- + +// getWebhookTLSSecret returns a Secret object containing the webhook's TLS certificates. +func getWebhookTLSSecret(namespace string, cert, key []byte) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookSecretName, + Namespace: namespace, + }, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{ + corev1.TLSCertKey: cert, + corev1.TLSPrivateKeyKey: key, + }, + } +} + +// getWebhookDeployment returns a Deployment object for the webhook server. 
+func getWebhookDeployment(namespace, imageName, serviceAccountName string) *appsv1.Deployment { + replicas := int32(2) + terminationGracePeriodSeconds := int64(5) + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookDeploymentName, + Namespace: namespace, + Labels: map[string]string{ + "app": webhookDeploymentName, + }, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": webhookDeploymentName, + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "app": webhookDeploymentName, + }, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: serviceAccountName, + HostNetwork: true, + DNSPolicy: corev1.DNSClusterFirstWithHostNet, + TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, + Affinity: &corev1.Affinity{ + PodAntiAffinity: &corev1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": webhookDeploymentName, + }, + }, + TopologyKey: "kubernetes.io/hostname", + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: "webhook", + Image: imageName, + Args: []string{ + "--mode=webhook", + "--health-probe-bind-address=:8082", + }, + Ports: []corev1.ContainerPort{ + { + ContainerPort: webhookPort, + Name: "webhook", + }, + { + Name: "healthz", + ContainerPort: 8082, + Protocol: corev1.ProtocolTCP, + }, + }, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.FromString("healthz"), + }, + }, + InitialDelaySeconds: 15, + PeriodSeconds: 20, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.FromString("healthz"), + }, + }, + InitialDelaySeconds: 5, + PeriodSeconds: 10, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "tls-certs", + MountPath: "/etc/tls", + ReadOnly: true, + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "tls-certs", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: webhookSecretName, + }, + }, + }, + }, + }, + }, + }, + } +} + +// getWebhookService returns a Service object for the webhook. +func getWebhookService(namespace string) *corev1.Service { + return &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookServiceName, + Namespace: namespace, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + "app": webhookDeploymentName, + }, + Ports: []corev1.ServicePort{ + { + Protocol: corev1.ProtocolTCP, + Port: 443, + TargetPort: intstr.FromInt(webhookPort), + }, + }, + }, + } +} + +// getMutatingWebhookConfiguration returns a MutatingWebhookConfiguration object for annotating pods. 
+func getMutatingWebhookConfiguration(namespace string, caBundle []byte) *admissionregistrationv1.MutatingWebhookConfiguration { + path := "/mutate-pod" + failurePolicy := admissionregistrationv1.Ignore + sideEffects := admissionregistrationv1.SideEffectClassNone + + return &admissionregistrationv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookConfigName, + }, + Webhooks: []admissionregistrationv1.MutatingWebhook{ + { + Name: webhookConfigurationName, + AdmissionReviewVersions: []string{"v1"}, + ClientConfig: admissionregistrationv1.WebhookClientConfig{ + Service: &admissionregistrationv1.ServiceReference{ + Name: webhookServiceName, + Namespace: namespace, + Path: &path, + Port: &[]int32{443}[0], + }, + CABundle: caBundle, + }, + Rules: []admissionregistrationv1.RuleWithOperations{ + { + Operations: []admissionregistrationv1.OperationType{admissionregistrationv1.Create}, + Rule: admissionregistrationv1.Rule{ + APIGroups: []string{""}, + APIVersions: []string{"v1"}, + Resources: []string{"pods"}, + }, + }, + }, + NamespaceSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "kubernetes.io/metadata.name", + Operator: metav1.LabelSelectorOpNotIn, + Values: generateExcludedNamespaces(namespace), + }, + }, + }, + FailurePolicy: &failurePolicy, + SideEffects: &sideEffects, + }, + }, + } +} + +// generateExcludedNamespaces creates a list of namespaces to be excluded from webhook processing. +// This includes the webhook's own namespace and CNI module namespaces. +func generateExcludedNamespaces(currentNamespace string) []string { + excluded := []string{currentNamespace} // Exclude the webhook's own namespace (e.g., "cni-switch") + for _, module := range CNIModuleConfigs { + excluded = append(excluded, "d8-"+module) // Exclude "d8-cni-cilium", "d8-cni-flannel", etc. 
+ } + return excluded +} + +// --- RBAC Resources --- + +func getSwitchHelperServiceAccount(namespace string) *corev1.ServiceAccount { + return &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: switchHelperServiceAccountName, + Namespace: namespace, + }, + } +} + +func getWebhookServiceAccount(namespace string) *corev1.ServiceAccount { + return &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookServiceAccountName, + Namespace: namespace, + }, + } +} + +func getSwitchHelperClusterRole() *rbacv1.ClusterRole { + return &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{ + Name: switchHelperClusterRoleName, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cnimigrations"}, + Verbs: []string{"get", "list", "watch"}, + }, + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cnimigrations/status"}, + Verbs: []string{"get"}, + }, + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cninodemigrations"}, + Verbs: []string{"get", "list", "watch", "create", "update", "patch", "delete"}, + }, + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cninodemigrations/status"}, + Verbs: []string{"get", "update", "patch"}, + }, + { + APIGroups: []string{""}, + Resources: []string{"pods"}, + Verbs: []string{"get", "list", "watch", "patch", "update", "delete"}, + }, + }, + } +} + +func getWebhookClusterRole() *rbacv1.ClusterRole { + return &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookClusterRoleName, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{"network.deckhouse.io"}, + Resources: []string{"cnimigrations"}, + Verbs: []string{"get", "list", "watch"}, + }, + }, + } +} + +func getSwitchHelperClusterRoleBinding(namespace string) *rbacv1.ClusterRoleBinding { + return &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: switchHelperClusterRoleBindingName, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: switchHelperClusterRoleName, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: switchHelperServiceAccountName, + Namespace: namespace, + }, + }, + } +} + +func getWebhookClusterRoleBinding(namespace string) *rbacv1.ClusterRoleBinding { + return &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookClusterRoleBindingName, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: webhookClusterRoleName, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: webhookServiceAccountName, + Namespace: namespace, + }, + }, + } +} diff --git a/internal/cni/rollback.go b/internal/cni/rollback.go new file mode 100644 index 00000000..bd383eff --- /dev/null +++ b/internal/cni/rollback.go @@ -0,0 +1,28 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cni + +import ( + "fmt" + "time" +) + +// RunRollback executes the logic for the 'cni-switch rollback' command. +func RunRollback(timeout time.Duration) error { + fmt.Println("Logic for rollback is not implemented yet.") + return nil +} diff --git a/internal/cni/switch.go b/internal/cni/switch.go new file mode 100644 index 00000000..9a7e137b --- /dev/null +++ b/internal/cni/switch.go @@ -0,0 +1,807 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// RunSwitch executes the logic for the 'cni-switch switch' command. +func RunSwitch(timeout time.Duration) error { + // 0. Ask for user confirmation + confirmed, err := AskForConfirmation("switch") + if err != nil { + return fmt.Errorf("asking for confirmation: %w", err) + } + if !confirmed { + fmt.Println("Operation cancelled by user.") + return nil + } + + startTime := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + fmt.Printf("🚀 Starting CNI switch (global timeout: %s)\n", timeout) + + // 1. Create a Kubernetes client + safeClient, err := saferequest.NewSafeClient() + if err != nil { + return fmt.Errorf("creating safe client: %w", err) + } + + rtClient, err := safeClient.NewRTClient(v1alpha1.AddToScheme) + if err != nil { + return fmt.Errorf("creating runtime client: %w", err) + } + fmt.Printf("✅ Kubernetes client created (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 2. Find the active migration + activeMigration, err := FindActiveMigration(ctx, rtClient) + if err != nil { + return fmt.Errorf("failed to find active migration: %w", err) + } + + if activeMigration == nil { + return fmt.Errorf("no active CNI migration found. Please run 'd8 cni-switch prepare' first") + } + + // Check if the switch is already completed successfully. + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "Succeeded" && cond.Status == metav1.ConditionTrue { + fmt.Printf("🎉 CNI switch to '%s' is already completed successfully.\n", activeMigration.Spec.TargetCNI) + fmt.Println("\nYou can run 'd8 cni-switch cleanup' to remove auxiliary resources.") + return nil + } + } + + // 3. 
Check if the preparation step was completed successfully + isPrepared := false + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { + isPrepared = true + break + } + } + + if !isPrepared { + return fmt.Errorf("cluster is not ready for switching. " + + "Please ensure the 'prepare' command completed successfully") + } + + // Verify all resources exist + fmt.Println("Verifying all created resources...") + if err = verifyResourcesExist(ctx, rtClient); err != nil { + return fmt.Errorf("verifying resources: %w", err) + } + fmt.Println("- Verified. All necessary resources are present in the cluster") + + fmt.Printf( + "✅ Working with prepared migration '%s' (total elapsed: %s)\n\n", + activeMigration.Name, + time.Since(startTime).Round(time.Millisecond), + ) + + // 4. Enable target CNI + currentCNI := activeMigration.Status.CurrentCNI + targetCNI := activeMigration.Spec.TargetCNI + + // Check if we already allowed the target CNI to start + targetCNIAllowedToStart := false + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "NodeCleanupSucceeded" && cond.Status == metav1.ConditionTrue { + targetCNIAllowedToStart = true + break + } + } + + fmt.Printf("Enabling target CNI module 'cni-%s'...\n", targetCNI) + if err = toggleModule(ctx, rtClient, "cni-"+strings.ToLower(targetCNI), true); err != nil { + return fmt.Errorf("enabling module '%s': %w", targetCNI, err) + } + + var dsName string + if !targetCNIAllowedToStart { + // Wait for target CNI pods to start initializing + targetModuleName := "cni-" + strings.ToLower(targetCNI) + dsName, err = getDaemonSetNameForCNI(targetModuleName) + if err != nil { + return fmt.Errorf("getting daemonset name for target CNI: %w", err) + } + if err = waitForModulePodsInitializing(ctx, rtClient, targetModuleName, dsName); err != nil { + return fmt.Errorf("waiting for target CNI pods to initialize: %w", err) + } + fmt.Printf("✅ CNI module 'cni-%s' enabled and pods initialized (total elapsed: %s)\n\n", + targetCNI, + time.Since(startTime).Round(time.Millisecond)) + + // 5. Disable current CNI + fmt.Printf("Disabling current CNI module 'cni-%s'...\n", currentCNI) + if err = toggleModule(ctx, rtClient, "cni-"+strings.ToLower(currentCNI), false); err != nil { + return fmt.Errorf("disabling module '%s': %w", currentCNI, err) + } + if err = waitForModule(ctx, rtClient, "cni-"+strings.ToLower(currentCNI), false); err != nil { + return fmt.Errorf("waiting for module '%s' to be disabled: %w", currentCNI, err) + } + + var dsNameCurrent string + dsNameCurrent, err = getDaemonSetNameForCNI("cni-" + strings.ToLower(currentCNI)) + if err != nil { + return fmt.Errorf("getting daemonset name for current CNI: %w", err) + } + if err = waitForModulePodsTermination( + ctx, rtClient, "cni-"+strings.ToLower(currentCNI), dsNameCurrent, + ); err != nil { + return fmt.Errorf("waiting for current CNI pods to terminate: %w", err) + } + + if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ + Type: "OldCNIDisabled", + Status: metav1.ConditionTrue, + Reason: "ModuleDisabled", + Message: fmt.Sprintf("Module 'cni-%s' was successfully disabled.", currentCNI), + }); err != nil { + return fmt.Errorf("updating CNIMigration status: %w", err) + } + fmt.Printf("✅ CNI module 'cni-%s' disabled (total elapsed: %s)\n\n", + currentCNI, + time.Since(startTime).Round(time.Millisecond)) + + // 6. 
Update phase to Migrate (Triggers cleanup on nodes) + fmt.Println("Updating CNIMigration phase to 'Migrate' to trigger node cleanup...") + if err = updateCNIMigrationPhase(ctx, rtClient, activeMigration.Name, "Migrate"); err != nil { + return fmt.Errorf("updating CNIMigration phase: %w", err) + } + fmt.Printf("✅ CNIMigration phase updated (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 7. Wait for nodes to be cleaned up + fmt.Println("Waiting for nodes to be cleaned up by cni-switch-helper...") + if err = waitForNodeConditions(ctx, rtClient, activeMigration, "CleanupSucceeded"); err != nil { + return fmt.Errorf("waiting for node cleanup: %w", err) + } + fmt.Printf("✅ All nodes cleaned up (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 8. This status update is CRITICAL. It unblocks the target CNI's init-container. + fmt.Println("Signaling target CNI pods to proceed by updating CNIMigration status...") + if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ + Type: "NodeCleanupSucceeded", + Status: metav1.ConditionTrue, + Reason: "AllNodesCleanedUp", + Message: "All nodes have been successfully cleaned up from old CNI artifacts.", + }); err != nil { + return fmt.Errorf("updating CNIMigration status: %w", err) + } + fmt.Printf("✅ CNIMigration status updated (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + } else { + fmt.Printf("ℹ️ Skipping wait for init, disable old CNI, cleanup as NodeCleanupSucceeded is already True.\n\n") + } + + // 9. Wait for target CNI to be Ready + // Now that NodeCleanupSucceeded is True, the target CNI pods should unblock and become Ready + if err = waitForModule(ctx, rtClient, "cni-"+strings.ToLower(targetCNI), true); err != nil { + return fmt.Errorf("waiting for module '%s' to be ready: %w", targetCNI, err) + } + fmt.Printf("✅ CNI module 'cni-%s' is now Ready (total elapsed: %s)\n\n", + targetCNI, + time.Since(startTime).Round(time.Millisecond)) + + // 10. Delete Mutating Webhook + fmt.Println("Deleting Mutating Webhook...") + if err = deleteMutatingWebhook(ctx, rtClient); err != nil { + return fmt.Errorf("deleting mutating webhook: %w", err) + } + fmt.Printf("✅ Mutating webhook deleted (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 11. Signal 'NewCNIEnabled' + fmt.Println("Signaling 'NewCNIEnabled' to proceed with pod restart...") + if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ + Type: "NewCNIEnabled", + Status: metav1.ConditionTrue, + Reason: "ModuleEnabled", + Message: fmt.Sprintf("Module 'cni-%s' was successfully enabled.", targetCNI), + }); err != nil { + return fmt.Errorf("updating CNIMigration status: %w", err) + } + fmt.Printf("✅ CNIMigration status updated (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + + // 12. Wait for pods to be restarted + fmt.Println("Waiting for pods to be restarted on all nodes...") + // We do not hard fail here, as per ADR suggestions. + if err = waitForNodeConditions(ctx, rtClient, activeMigration, "PodsRestarted"); err != nil { + fmt.Printf("⚠️ Warning: Timed out waiting for pods to restart: %v\n", err) + fmt.Println("Please check the cluster status manually. The CNI switch is otherwise complete.") + } else { + fmt.Printf("✅ All pods restarted (total elapsed: %s)\n\n", + time.Since(startTime).Round(time.Millisecond)) + } + + // 13. 
Finalize migration + fmt.Println("Finalizing migration...") + + // Update condition 'Succeeded' using the helper + if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ + Type: "Succeeded", + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Reason: "MigrationComplete", + Message: "CNI migration completed successfully.", + }); err != nil { + fmt.Printf("\n ⚠️ Warning: Failed to update status (Succeeded): %v. Proceeding...\n", err) + } + + fmt.Printf( + "🎉 CNI switch to '%s' completed successfully! (total time: %s)\n", + targetCNI, + time.Since(startTime).Round(time.Second), + ) + fmt.Println("\nYou can now run 'd8 cni-switch cleanup' to remove auxiliary resources.") + + return nil +} + +func toggleModule(ctx context.Context, cl client.Client, moduleName string, toggle bool) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + mc := &unstructured.Unstructured{} + mc.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "deckhouse.io", + Version: "v1alpha1", + Kind: "ModuleConfig", + }) + + err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, mc) + if err != nil { + fmt.Printf("\r ⚠️ Error getting module config '%s': %v. Retrying...", moduleName, err) + continue + } + + spec, found, err := unstructured.NestedMap(mc.Object, "spec") + if err != nil { + fmt.Printf("\r ⚠️ Error getting spec from module config '%s': %v. Retrying...", moduleName, err) + continue + } + if !found { + spec = make(map[string]any) + } + + // Skip update if value is already correct + if currentToggle, ok := spec["enabled"].(bool); ok && currentToggle == toggle { + return nil + } + + spec["enabled"] = toggle + + if err := unstructured.SetNestedMap(mc.Object, spec, "spec"); err != nil { + fmt.Printf("\r ⚠️ Error setting spec for module config '%s': %v. Retrying...", moduleName, err) + continue + } + + if err := cl.Update(ctx, mc); err != nil { + fmt.Printf("\r ⚠️ Error updating module config '%s': %v. Retrying...", moduleName, err) + continue + } + return nil + } + } +} + +func waitForModule(ctx context.Context, cl client.Client, moduleName string, shouldBeReady bool) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + module := &unstructured.Unstructured{} + module.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "deckhouse.io", + Version: "v1alpha1", + Kind: "Module", + }) + + err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, module) + + if shouldBeReady { + if err != nil { + if errors.IsNotFound(err) { + fmt.Printf("\r Waiting for module '%s': not found yet...", moduleName) + } else { + fmt.Printf("\r ⚠️ Error getting module '%s': %v. Retrying...", moduleName, err) + } + continue + } + + state, found, err := unstructured.NestedString(module.Object, "status", "phase") + if err != nil || !found { + fmt.Printf("\r Waiting for module '%s': status.phase field not found. 
Retrying...", + moduleName) + continue + } + + if state == "Ready" { + fmt.Printf("Module '%s' is Ready.", moduleName) + fmt.Println() + return nil + } + fmt.Printf("\r Waiting for module '%s' to be Ready, current state: %s", moduleName, state) + + } else { // should NOT be ready (disabled) + err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, module) + if err != nil { + if errors.IsNotFound(err) { + fmt.Printf("\r Module '%s' is not found, assuming disabled.", moduleName) + fmt.Println() + return nil + } + fmt.Printf("\r ⚠️ Error getting module '%s': %v. Retrying...", moduleName, err) + continue + } + + // Check conditions to see if it's disabled + conditions, found, err := unstructured.NestedSlice(module.Object, "status", "conditions") + if err != nil || !found { + fmt.Printf("\r Waiting for module '%s' status conditions. Retrying...", moduleName) + continue + } + + isReadyFound := false + for _, c := range conditions { + condition, ok := c.(map[string]any) + if !ok { + continue + } + + condType, found, err := unstructured.NestedString(condition, "type") + if err != nil || !found { + continue + } + + if condType == "IsReady" { + isReadyFound = true + condStatus, _, _ := unstructured.NestedString(condition, "status") + if condStatus == "False" { + fmt.Printf("\r✅ Module '%s' is disabled (IsReady=False).\n", moduleName) + fmt.Println() + return nil + } + } + } + + if !isReadyFound { + fmt.Printf("\r Waiting for module '%s' to be disabled, 'IsReady' condition not found...", + moduleName) + } else { + fmt.Printf("\r Waiting for module '%s' to be disabled (IsReady=False)...", moduleName) + } + } + } + } +} + +func updateCNIMigrationStatus( + ctx context.Context, cl client.Client, migrationName string, newCondition metav1.Condition, +) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + migration := &v1alpha1.CNIMigration{} + err := cl.Get(ctx, types.NamespacedName{Name: migrationName}, migration) + if err != nil { + fmt.Printf("\r ⚠️ Error getting CNIMigration '%s': %v. Retrying...", migrationName, err) + continue + } + + patchedMigration := migration.DeepCopy() + newCondition.LastTransitionTime = metav1.Now() + + found := false + for i, cond := range patchedMigration.Status.Conditions { + if cond.Type == newCondition.Type { + // Check if update is needed + if cond.Status == newCondition.Status && cond.Reason == newCondition.Reason && + cond.Message == newCondition.Message { + return nil // Already up to date + } + patchedMigration.Status.Conditions[i] = newCondition + found = true + break + } + } + + if !found { + patchedMigration.Status.Conditions = append(patchedMigration.Status.Conditions, newCondition) + } + + err = cl.Status().Patch(ctx, patchedMigration, client.MergeFrom(migration)) + if err == nil { + return nil + } + + if !errors.IsConflict(err) { + fmt.Printf("\r ⚠️ Error patching CNIMigration status: %v. 
Retrying...", err) + } + } + } +} + +func waitForNodeConditions( + ctx context.Context, cl client.Client, cniMigration *v1alpha1.CNIMigration, conditionType string, +) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + reportedFailures := make(map[string]struct{}) + + // Helper to update status stats + updateStats := func(total, succeeded, failed int) { + // Fetch fresh object to avoid conflicts + freshMigration := &v1alpha1.CNIMigration{} + if err := cl.Get(ctx, types.NamespacedName{Name: cniMigration.Name}, freshMigration); err != nil { + return // Ignore error, will retry next tick + } + + if freshMigration.Status.NodesTotal == total && + freshMigration.Status.NodesSucceeded == succeeded && + freshMigration.Status.NodesFailed == failed { + return + } + + patched := freshMigration.DeepCopy() + patched.Status.NodesTotal = total + patched.Status.NodesSucceeded = succeeded + patched.Status.NodesFailed = failed + + _ = cl.Status().Patch(ctx, patched, client.MergeFrom(freshMigration)) + } + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + // Get current total nodes + nodeList := &corev1.NodeList{} + if err := cl.List(ctx, nodeList); err != nil { + fmt.Printf("\r ⚠️ Error listing nodes: %v. Retrying...", err) + continue + } + totalNodes := len(nodeList.Items) + + // Get current node migrations + nodeMigrations := &v1alpha1.CNINodeMigrationList{} + if err := cl.List(ctx, nodeMigrations); err != nil { + fmt.Printf("\r ⚠️ Error listing CNINodeMigration resources: %v. Retrying...", err) + continue + } + + succeededNodes := 0 + failedNodes := 0 + + for _, nm := range nodeMigrations.Items { + if !metav1.IsControlledBy(&nm, cniMigration) { + continue + } + + // Check specific condition success + isSucceeded := false + for _, cond := range nm.Status.Conditions { + if cond.Type == conditionType && cond.Status == metav1.ConditionTrue { + succeededNodes++ + isSucceeded = true + + // Check if it was previously failed + if _, wasFailed := reportedFailures[nm.Name]; wasFailed { + fmt.Printf("\r\033[K") // Clear line + fmt.Printf(" ✅ Node '%s' has recovered.\n", nm.Name) + delete(reportedFailures, nm.Name) + } + break + } + } + + if isSucceeded { + continue + } + + // Check failure + for _, cond := range nm.Status.Conditions { + if cond.Type == "Failed" && cond.Status == metav1.ConditionTrue { + failedNodes++ + if _, reported := reportedFailures[nm.Name]; !reported { + // Clear line, print error, then let the progress bar overwrite next + fmt.Printf("\r\033[K") // Clear current line + fmt.Printf(" ❌ Node '%s' failed: %s\n", nm.Name, cond.Message) + reportedFailures[nm.Name] = struct{}{} + } + break + } + } + } + + // Update status in CNIMigration + go updateStats(totalNodes, succeededNodes, failedNodes) + + // Output progress + fmt.Printf("\r Progress: %d/%d nodes completed, %d failed. ", + succeededNodes, totalNodes, failedNodes) + + // 5. 
Check exit condition + if succeededNodes >= totalNodes { + fmt.Println() + return nil + } + } + } +} + +func deleteMutatingWebhook(ctx context.Context, cl client.Client) error { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + fmt.Printf(" Deleting mutating webhook '%s'...", webhookConfigName) + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + webhook := &admissionregistrationv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{ + Name: webhookConfigName, + }, + } + if err := cl.Delete(ctx, webhook); err != nil { + if errors.IsNotFound(err) { + fmt.Println("\r Mutating webhook not found, assuming already deleted.") + return nil + } + fmt.Printf("\r ⚠️ Error deleting mutating webhook: %v. Retrying...", err) + continue + } + fmt.Println("- Mutating webhook deleted") + return nil + } + } +} + +// getDaemonSetNameForCNI returns the name of the main DaemonSet for a given CNI module. +func getDaemonSetNameForCNI(cniModule string) (string, error) { + switch cniModule { + case "cni-cilium": + return "agent", nil + case "cni-flannel": + return "flannel", nil + case "cni-simple-bridge": + return "simple-bridge", nil + default: + return "", fmt.Errorf("unknown CNI module: %s", cniModule) + } +} + +// waitForModulePodsInitializing waits for all pods of a module's daemonset to be in the 'Initializing' state, +// specifically waiting in the init container. +func waitForModulePodsInitializing(ctx context.Context, cl client.Client, moduleName string, dsName string) error { + fmt.Printf(" Waiting for pods of module '%s' to enter 'Initializing' state...", moduleName) + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + // Get the DaemonSet + ds := &appsv1.DaemonSet{} + err := cl.Get(ctx, types.NamespacedName{Name: dsName, Namespace: "d8-" + moduleName}, ds) + if err != nil { + if errors.IsNotFound(err) { + fmt.Printf("\r Waiting for DaemonSet '%s' in namespace 'd8-%s': not found...", dsName, moduleName) + continue + } + fmt.Printf("\r Error getting DaemonSet '%s' in namespace 'd8-%s': %v. Retrying...", + dsName, moduleName, err) + continue + } + + // Check if pods are scheduled + if ds.Status.DesiredNumberScheduled == 0 { + fmt.Printf("\r Waiting for DaemonSet '%s' to schedule pods...", dsName) + continue + } + + // List pods + podList := &corev1.PodList{} + opts := []client.ListOption{ + client.InNamespace("d8-" + moduleName), + client.MatchingLabels(ds.Spec.Selector.MatchLabels), + } + if err := cl.List(ctx, podList, opts...); err != nil { + fmt.Printf("\r Error listing pods for module '%s': %v. 
Retrying...", moduleName, err) + continue + } + + if len(podList.Items) == 0 { + fmt.Printf("\r Waiting for pods of DaemonSet '%s' to be created...", dsName) + continue + } + + initializingPods := 0 + for _, pod := range podList.Items { + // Check pod status + if pod.Status.Phase == corev1.PodPending || pod.Status.Phase == corev1.PodRunning { + for _, initStatus := range pod.Status.InitContainerStatuses { + if initStatus.Name == "cni-switch-init-checker" && + (initStatus.State.Waiting != nil || initStatus.State.Running != nil) { + // Pod is waiting in or running our init-container + initializingPods++ + break + } + } + } + } + + fmt.Printf("\r Progress: %d/%d pods are in 'Initializing' state.", + initializingPods, ds.Status.DesiredNumberScheduled) + + if int32(initializingPods) >= ds.Status.DesiredNumberScheduled { + fmt.Println("\n- All pods for target CNI are correctly waiting in the init-container.") + return nil + } + } + } +} + +// waitForModulePodsTermination waits for all pods of a module's daemonset to be terminated. +func waitForModulePodsTermination(ctx context.Context, cl client.Client, moduleName string, dsName string) error { + fmt.Printf(" Waiting for pods of module '%s' to terminate...", moduleName) + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + // List pods with label app= in namespace d8- + podList := &corev1.PodList{} + opts := []client.ListOption{ + client.InNamespace("d8-" + moduleName), + client.MatchingLabels(map[string]string{"app": dsName}), + } + if err := cl.List(ctx, podList, opts...); err != nil { + fmt.Printf("\r Error listing pods for module '%s': %v. Retrying...", moduleName, err) + continue + } + + if len(podList.Items) == 0 { + fmt.Println("- All pods for disabled CNI module are terminated.") + return nil + } + + fmt.Printf("\r Waiting for %d pods of module '%s' to terminate...", len(podList.Items), moduleName) + } + } +} + +// verifyResourcesExist checks if all expected resources are present in the cluster. 
+func verifyResourcesExist(ctx context.Context, rtClient client.Client) error { + resources := []struct { + kind string + name string + namespace string + obj client.Object + }{ + {"Namespace", cniSwitchNamespace, "", &corev1.Namespace{}}, + {"ServiceAccount", switchHelperServiceAccountName, cniSwitchNamespace, &corev1.ServiceAccount{}}, + {"ServiceAccount", webhookServiceAccountName, cniSwitchNamespace, &corev1.ServiceAccount{}}, + {"ClusterRole", switchHelperClusterRoleName, "", &rbacv1.ClusterRole{}}, + {"ClusterRole", webhookClusterRoleName, "", &rbacv1.ClusterRole{}}, + {"ClusterRoleBinding", switchHelperClusterRoleBindingName, "", &rbacv1.ClusterRoleBinding{}}, + {"ClusterRoleBinding", webhookClusterRoleBindingName, "", &rbacv1.ClusterRoleBinding{}}, + {"Secret", webhookSecretName, cniSwitchNamespace, &corev1.Secret{}}, + {"Service", webhookServiceName, cniSwitchNamespace, &corev1.Service{}}, + {"MutatingWebhookConfiguration", webhookConfigName, "", &admissionregistrationv1.MutatingWebhookConfiguration{}}, + {"DaemonSet", switchHelperDaemonSetName, cniSwitchNamespace, &appsv1.DaemonSet{}}, + {"Deployment", webhookDeploymentName, cniSwitchNamespace, &appsv1.Deployment{}}, + } + + for _, r := range resources { + key := client.ObjectKey{Name: r.name, Namespace: r.namespace} + if err := rtClient.Get(ctx, key, r.obj); err != nil { + return fmt.Errorf("resource %s '%s' not found: %w", r.kind, r.name, err) + } + } + return nil +} + +func updateCNIMigrationPhase(ctx context.Context, cl client.Client, migrationName string, phase string) error { + update := func() error { // TDEN переписать по типу других + migration := &v1alpha1.CNIMigration{} + if err := cl.Get(ctx, types.NamespacedName{Name: migrationName}, migration); err != nil { + return fmt.Errorf("getting CNIMigration: %w", err) + } + + if migration.Spec.Phase == phase { + return nil + } + + patchedMigration := migration.DeepCopy() + patchedMigration.Spec.Phase = phase + if err := cl.Patch(ctx, patchedMigration, client.MergeFrom(migration)); err != nil { + return fmt.Errorf("patching CNIMigration: %w", err) + } + return nil + } + + if err := update(); err == nil { + return nil + } else { + fmt.Printf("\r ⚠️ %v. Retrying...", err) + } + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + if err := update(); err != nil { + fmt.Printf("\r ⚠️ %v. 
Retrying...", err) + continue + } + return nil + } + } +} From 7e3346fdeb54f6abdd15853c4c39f96a10eb557e Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Thu, 11 Dec 2025 11:21:35 +0400 Subject: [PATCH 02/16] ++ Signed-off-by: Denis Tarabrin --- internal/cni/switch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/cni/switch.go b/internal/cni/switch.go index 9a7e137b..13f3e198 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -765,7 +765,7 @@ func verifyResourcesExist(ctx context.Context, rtClient client.Client) error { } func updateCNIMigrationPhase(ctx context.Context, cl client.Client, migrationName string, phase string) error { - update := func() error { // TDEN переписать по типу других + update := func() error { migration := &v1alpha1.CNIMigration{} if err := cl.Get(ctx, types.NamespacedName{Name: migrationName}, migration); err != nil { return fmt.Errorf("getting CNIMigration: %w", err) From f03c592c4795c5c606c45603e3619d7c1d77db6a Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Thu, 11 Dec 2025 14:54:34 +0400 Subject: [PATCH 03/16] ++ Signed-off-by: Denis Tarabrin --- internal/cni/switch.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/internal/cni/switch.go b/internal/cni/switch.go index 13f3e198..c3c70eff 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -404,8 +404,7 @@ func waitForModule(ctx context.Context, cl client.Client, moduleName string, sho isReadyFound = true condStatus, _, _ := unstructured.NestedString(condition, "status") if condStatus == "False" { - fmt.Printf("\r✅ Module '%s' is disabled (IsReady=False).\n", moduleName) - fmt.Println() + fmt.Printf("\r- Module '%s' is disabled (IsReady=False).\n", moduleName) return nil } } @@ -606,7 +605,6 @@ func deleteMutatingWebhook(ctx context.Context, cl client.Client) error { fmt.Printf("\r ⚠️ Error deleting mutating webhook: %v. Retrying...", err) continue } - fmt.Println("- Mutating webhook deleted") return nil } } @@ -724,7 +722,7 @@ func waitForModulePodsTermination(ctx context.Context, cl client.Client, moduleN } if len(podList.Items) == 0 { - fmt.Println("- All pods for disabled CNI module are terminated.") + fmt.Println("\n- All pods for disabled CNI module are terminated.") return nil } From c3fe1afd5ce7e6b659f705d774d09e23f97a4a50 Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Thu, 11 Dec 2025 15:35:38 +0400 Subject: [PATCH 04/16] ++ Signed-off-by: Denis Tarabrin --- internal/cni/switch.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/internal/cni/switch.go b/internal/cni/switch.go index c3c70eff..ccbaefb8 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -224,7 +224,7 @@ func RunSwitch(timeout time.Duration) error { if err = deleteMutatingWebhook(ctx, rtClient); err != nil { return fmt.Errorf("deleting mutating webhook: %w", err) } - fmt.Printf("✅ Mutating webhook deleted (total elapsed: %s)\n\n", + fmt.Printf("\n✅ Mutating webhook deleted (total elapsed: %s)\n\n", time.Since(startTime).Round(time.Millisecond)) // 11. Signal 'NewCNIEnabled' @@ -253,8 +253,6 @@ func RunSwitch(timeout time.Duration) error { // 13. 
Finalize migration fmt.Println("Finalizing migration...") - - // Update condition 'Succeeded' using the helper if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ Type: "Succeeded", Status: metav1.ConditionTrue, From ab2dee2cdc5aaff4f798b2415ed3ec19ee463375 Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Thu, 11 Dec 2025 16:56:40 +0400 Subject: [PATCH 05/16] ++ Signed-off-by: Denis Tarabrin --- internal/cni/cleanup.go | 42 +++++++++++++++++++++++------------------ internal/cni/switch.go | 1 - 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/internal/cni/cleanup.go b/internal/cni/cleanup.go index 53b15fdc..0051aea3 100644 --- a/internal/cni/cleanup.go +++ b/internal/cni/cleanup.go @@ -80,24 +80,7 @@ func RunCleanup(timeout time.Duration) error { } fmt.Println("✅ Active controllers stopped") - // 4. Delete all CNINodeMigration resources - fmt.Println("\nDeleting all CNINodeMigration resources...") - nodeMigrations := &v1alpha1.CNINodeMigrationList{} - if err = rtClient.List(ctx, nodeMigrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { - return fmt.Errorf("listing CNINodeMigrations: %w", err) - } - for _, nm := range nodeMigrations.Items { - // Remove finalizers to ensure deletion even if controller is down - if err = removeFinalizers(ctx, rtClient, &nm); err != nil { - fmt.Printf("⚠️ Warning: failed to remove finalizers from %s: %v\n", nm.Name, err) - } - if err = deleteAndWait(ctx, rtClient, &nm); err != nil { - return err - } - } - fmt.Println("✅ All CNINodeMigration resources deleted") - - // 5. Delete all CNIMigration resources + // 4. Delete all CNIMigration resources fmt.Println("\nDeleting all CNIMigration resources...") migrations := &v1alpha1.CNIMigrationList{} if err = rtClient.List(ctx, migrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { @@ -114,6 +97,23 @@ func RunCleanup(timeout time.Duration) error { } fmt.Println("✅ All CNIMigration resources deleted") + // 5. Delete all CNINodeMigration resources + fmt.Println("\nDeleting all CNINodeMigration resources...") + nodeMigrations := &v1alpha1.CNINodeMigrationList{} + if err = rtClient.List(ctx, nodeMigrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { + return fmt.Errorf("listing CNINodeMigrations: %w", err) + } + for _, nm := range nodeMigrations.Items { + // Remove finalizers to ensure deletion even if controller is down + if err = removeFinalizers(ctx, rtClient, &nm); err != nil { + fmt.Printf("⚠️ Warning: failed to remove finalizers from %s: %v\n", nm.Name, err) + } + if err = deleteAndWait(ctx, rtClient, &nm); err != nil { + return err + } + } + fmt.Println("✅ All CNINodeMigration resources deleted") + // 6. Remove annotations from all pods if err = removePodAnnotations(ctx, rtClient); err != nil { // Non-fatal, print a warning @@ -262,6 +262,12 @@ func waitForResourceDeletion(ctx context.Context, rtClient client.Client, obj cl fmt.Printf("error: %v\n", err) return fmt.Errorf("getting %s '%s': %w", kind, key.Name, err) } + + // If the resource is still there, check for finalizers and remove them + // to ensure it doesn't get stuck in Terminating state. + if len(obj.GetFinalizers()) > 0 { + _ = removeFinalizers(ctx, rtClient, obj) + } } } } diff --git a/internal/cni/switch.go b/internal/cni/switch.go index ccbaefb8..2d00055c 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -242,7 +242,6 @@ func RunSwitch(timeout time.Duration) error { // 12. 
Wait for pods to be restarted fmt.Println("Waiting for pods to be restarted on all nodes...") - // We do not hard fail here, as per ADR suggestions. if err = waitForNodeConditions(ctx, rtClient, activeMigration, "PodsRestarted"); err != nil { fmt.Printf("⚠️ Warning: Timed out waiting for pods to restart: %v\n", err) fmt.Println("Please check the cluster status manually. The CNI switch is otherwise complete.") From 4dac11168481b0f6516c1e8fdeec62652ad8f76f Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Thu, 11 Dec 2025 18:12:02 +0400 Subject: [PATCH 06/16] ++ Signed-off-by: Denis Tarabrin --- internal/cni/common.go | 2 +- internal/cni/resources.go | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/internal/cni/common.go b/internal/cni/common.go index 0fc646f3..ad867e64 100644 --- a/internal/cni/common.go +++ b/internal/cni/common.go @@ -56,7 +56,7 @@ const ( webhookSecretName = "cni-switch-webhook-tls" webhookConfigName = "cni-switch-pod-annotator" webhookConfigurationName = "annotator.cni-switch.deckhouse.io" - webhookPort = 9443 + webhookPort = 42443 // Annotations EffectiveCNIAnnotation = "effective-cni.network.deckhouse.io" diff --git a/internal/cni/resources.go b/internal/cni/resources.go index f7801ccb..fe47446c 100644 --- a/internal/cni/resources.go +++ b/internal/cni/resources.go @@ -55,10 +55,13 @@ func getSwitchHelperDaemonSet(namespace, imageName string) *appsv1.DaemonSet { { Name: "helper", Image: imageName, + Args: []string{ + "--health-probe-bind-address=:42281", + }, Ports: []corev1.ContainerPort{ { Name: "healthz", - ContainerPort: 8081, + ContainerPort: 42281, Protocol: corev1.ProtocolTCP, }, }, @@ -267,6 +270,11 @@ func getWebhookDeployment(namespace, imageName, serviceAccountName string) *apps HostNetwork: true, DNSPolicy: corev1.DNSClusterFirstWithHostNet, TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, + Tolerations: []corev1.Toleration{ + { + Operator: corev1.TolerationOpExists, + }, + }, Affinity: &corev1.Affinity{ PodAntiAffinity: &corev1.PodAntiAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{ @@ -287,7 +295,7 @@ func getWebhookDeployment(namespace, imageName, serviceAccountName string) *apps Image: imageName, Args: []string{ "--mode=webhook", - "--health-probe-bind-address=:8082", + "--health-probe-bind-address=:42282", }, Ports: []corev1.ContainerPort{ { @@ -296,7 +304,7 @@ func getWebhookDeployment(namespace, imageName, serviceAccountName string) *apps }, { Name: "healthz", - ContainerPort: 8082, + ContainerPort: 42282, Protocol: corev1.ProtocolTCP, }, }, From 525d34ac659ab26bf181c92cd47e75ca79b71812 Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Thu, 11 Dec 2025 19:30:13 +0400 Subject: [PATCH 07/16] ++ Signed-off-by: Denis Tarabrin --- internal/cni/cleanup.go | 2 +- internal/cni/prepare.go | 12 ++++---- internal/cni/switch.go | 68 ++++++++++++++++++++--------------------- 3 files changed, 40 insertions(+), 42 deletions(-) diff --git a/internal/cni/cleanup.go b/internal/cni/cleanup.go index 0051aea3..acce2b42 100644 --- a/internal/cni/cleanup.go +++ b/internal/cni/cleanup.go @@ -299,7 +299,7 @@ func removePodAnnotations(ctx context.Context, rtClient client.Client) error { continue } podsPatched++ - fmt.Printf("\r Patched %d pods...", podsPatched) + fmt.Printf("\r\033[K Patched %d pods...", podsPatched) } } diff --git a/internal/cni/prepare.go b/internal/cni/prepare.go index b23d28eb..7fd01847 100644 --- a/internal/cni/prepare.go +++ b/internal/cni/prepare.go @@ -448,7 +448,7 
@@ func waitForDaemonSetReady(ctx context.Context, rtClient client.Client, ds *apps // This is the exit condition for the loop. if ds.Status.DesiredNumberScheduled == ds.Status.NumberReady && ds.Status.NumberUnavailable == 0 { fmt.Printf( - "\r Waiting for DaemonSet... %d/%d pods ready\n", + "\r\033[K Waiting for DaemonSet... %d/%d pods ready\n", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled, ) @@ -457,7 +457,7 @@ func waitForDaemonSetReady(ctx context.Context, rtClient client.Client, ds *apps // This is the progress update. fmt.Printf( - "\r Waiting for DaemonSet... %d/%d pods ready", + "\r\033[K Waiting for DaemonSet... %d/%d pods ready", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled, ) @@ -485,7 +485,7 @@ func waitForDeploymentReady(ctx context.Context, rtClient client.Client, dep *ap if dep.Spec.Replicas != nil && dep.Status.ReadyReplicas >= *dep.Spec.Replicas && dep.Status.UnavailableReplicas == 0 { fmt.Printf( - "\r Waiting for Deployment... %d/%d replicas ready\n", + "\r\033[K Waiting for Deployment... %d/%d replicas ready\n", dep.Status.ReadyReplicas, *dep.Spec.Replicas, ) @@ -495,7 +495,7 @@ func waitForDeploymentReady(ctx context.Context, rtClient client.Client, dep *ap // This is the progress update. if dep.Spec.Replicas != nil { fmt.Printf( - "\r Waiting for Deployment... %d/%d replicas ready", + "\r\033[K Waiting for Deployment... %d/%d replicas ready", dep.Status.ReadyReplicas, *dep.Spec.Replicas, ) @@ -536,10 +536,10 @@ func waitForNodesPrepared(ctx context.Context, rtClient client.Client) error { } } - fmt.Printf("\r Progress: %d/%d nodes prepared...", readyNodes, totalNodes) + fmt.Printf("\r\033[K Progress: %d/%d nodes prepared...", readyNodes, totalNodes) if readyNodes >= totalNodes && totalNodes > 0 { - fmt.Printf("\r Progress: %d/%d nodes prepared...\n", readyNodes, totalNodes) + fmt.Printf("\r\033[K Progress: %d/%d nodes prepared...\n", readyNodes, totalNodes) return nil } } diff --git a/internal/cni/switch.go b/internal/cni/switch.go index 2d00055c..a66f0a35 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -290,13 +290,13 @@ func toggleModule(ctx context.Context, cl client.Client, moduleName string, togg err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, mc) if err != nil { - fmt.Printf("\r ⚠️ Error getting module config '%s': %v. Retrying...", moduleName, err) + fmt.Printf("\r\033[K ⚠️ Error getting module config '%s': %v. Retrying...", moduleName, err) continue } spec, found, err := unstructured.NestedMap(mc.Object, "spec") if err != nil { - fmt.Printf("\r ⚠️ Error getting spec from module config '%s': %v. Retrying...", moduleName, err) + fmt.Printf("\r\033[K ⚠️ Error getting spec from module config '%s': %v. Retrying...", moduleName, err) continue } if !found { @@ -311,12 +311,12 @@ func toggleModule(ctx context.Context, cl client.Client, moduleName string, togg spec["enabled"] = toggle if err := unstructured.SetNestedMap(mc.Object, spec, "spec"); err != nil { - fmt.Printf("\r ⚠️ Error setting spec for module config '%s': %v. Retrying...", moduleName, err) + fmt.Printf("\r\033[K ⚠️ Error setting spec for module config '%s': %v. Retrying...", moduleName, err) continue } if err := cl.Update(ctx, mc); err != nil { - fmt.Printf("\r ⚠️ Error updating module config '%s': %v. Retrying...", moduleName, err) + fmt.Printf("\r\033[K ⚠️ Error updating module config '%s': %v. 
Retrying...", moduleName, err) continue } return nil @@ -345,16 +345,16 @@ func waitForModule(ctx context.Context, cl client.Client, moduleName string, sho if shouldBeReady { if err != nil { if errors.IsNotFound(err) { - fmt.Printf("\r Waiting for module '%s': not found yet...", moduleName) + fmt.Printf("\r\033[K Waiting for module '%s': not found yet...", moduleName) } else { - fmt.Printf("\r ⚠️ Error getting module '%s': %v. Retrying...", moduleName, err) + fmt.Printf("\r\033[K ⚠️ Error getting module '%s': %v. Retrying...", moduleName, err) } continue } state, found, err := unstructured.NestedString(module.Object, "status", "phase") if err != nil || !found { - fmt.Printf("\r Waiting for module '%s': status.phase field not found. Retrying...", + fmt.Printf("\r\033[K Waiting for module '%s': status.phase field not found. Retrying...", moduleName) continue } @@ -364,24 +364,24 @@ func waitForModule(ctx context.Context, cl client.Client, moduleName string, sho fmt.Println() return nil } - fmt.Printf("\r Waiting for module '%s' to be Ready, current state: %s", moduleName, state) + fmt.Printf("\r\033[K Waiting for module '%s' to be Ready, current state: %s", moduleName, state) } else { // should NOT be ready (disabled) err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, module) if err != nil { if errors.IsNotFound(err) { - fmt.Printf("\r Module '%s' is not found, assuming disabled.", moduleName) + fmt.Printf("\r\033[K Module '%s' is not found, assuming disabled.", moduleName) fmt.Println() return nil } - fmt.Printf("\r ⚠️ Error getting module '%s': %v. Retrying...", moduleName, err) + fmt.Printf("\r\033[K ⚠️ Error getting module '%s': %v. Retrying...", moduleName, err) continue } // Check conditions to see if it's disabled conditions, found, err := unstructured.NestedSlice(module.Object, "status", "conditions") if err != nil || !found { - fmt.Printf("\r Waiting for module '%s' status conditions. Retrying...", moduleName) + fmt.Printf("\r\033[K Waiting for module '%s' status conditions. Retrying...", moduleName) continue } @@ -401,17 +401,17 @@ func waitForModule(ctx context.Context, cl client.Client, moduleName string, sho isReadyFound = true condStatus, _, _ := unstructured.NestedString(condition, "status") if condStatus == "False" { - fmt.Printf("\r- Module '%s' is disabled (IsReady=False).\n", moduleName) + fmt.Printf("\r\033[K- Module '%s' is disabled (IsReady=False).\n", moduleName) return nil } } } if !isReadyFound { - fmt.Printf("\r Waiting for module '%s' to be disabled, 'IsReady' condition not found...", + fmt.Printf("\r\033[K Waiting for module '%s' to be disabled, 'IsReady' condition not found...", moduleName) } else { - fmt.Printf("\r Waiting for module '%s' to be disabled (IsReady=False)...", moduleName) + fmt.Printf("\r\033[K Waiting for module '%s' to be disabled (IsReady=False)...", moduleName) } } } @@ -432,7 +432,7 @@ func updateCNIMigrationStatus( migration := &v1alpha1.CNIMigration{} err := cl.Get(ctx, types.NamespacedName{Name: migrationName}, migration) if err != nil { - fmt.Printf("\r ⚠️ Error getting CNIMigration '%s': %v. Retrying...", migrationName, err) + fmt.Printf("\r\033[K ⚠️ Error getting CNIMigration '%s': %v. Retrying...", migrationName, err) continue } @@ -463,7 +463,7 @@ func updateCNIMigrationStatus( } if !errors.IsConflict(err) { - fmt.Printf("\r ⚠️ Error patching CNIMigration status: %v. Retrying...", err) + fmt.Printf("\r\033[K ⚠️ Error patching CNIMigration status: %v. 
Retrying...", err) } } } @@ -507,7 +507,7 @@ func waitForNodeConditions( // Get current total nodes nodeList := &corev1.NodeList{} if err := cl.List(ctx, nodeList); err != nil { - fmt.Printf("\r ⚠️ Error listing nodes: %v. Retrying...", err) + fmt.Printf("\r\033[K ⚠️ Error listing nodes: %v. Retrying...", err) continue } totalNodes := len(nodeList.Items) @@ -515,7 +515,7 @@ func waitForNodeConditions( // Get current node migrations nodeMigrations := &v1alpha1.CNINodeMigrationList{} if err := cl.List(ctx, nodeMigrations); err != nil { - fmt.Printf("\r ⚠️ Error listing CNINodeMigration resources: %v. Retrying...", err) + fmt.Printf("\r\033[K ⚠️ Error listing CNINodeMigration resources: %v. Retrying...", err) continue } @@ -536,8 +536,7 @@ func waitForNodeConditions( // Check if it was previously failed if _, wasFailed := reportedFailures[nm.Name]; wasFailed { - fmt.Printf("\r\033[K") // Clear line - fmt.Printf(" ✅ Node '%s' has recovered.\n", nm.Name) + fmt.Printf("\r\033[K ✅ Node '%s' has recovered.\n", nm.Name) delete(reportedFailures, nm.Name) } break @@ -554,8 +553,7 @@ func waitForNodeConditions( failedNodes++ if _, reported := reportedFailures[nm.Name]; !reported { // Clear line, print error, then let the progress bar overwrite next - fmt.Printf("\r\033[K") // Clear current line - fmt.Printf(" ❌ Node '%s' failed: %s\n", nm.Name, cond.Message) + fmt.Printf("\r\033[K ❌ Node '%s' failed: %s\n", nm.Name, cond.Message) reportedFailures[nm.Name] = struct{}{} } break @@ -567,7 +565,7 @@ func waitForNodeConditions( go updateStats(totalNodes, succeededNodes, failedNodes) // Output progress - fmt.Printf("\r Progress: %d/%d nodes completed, %d failed. ", + fmt.Printf("\r\033[K Progress: %d/%d nodes completed, %d failed. ", succeededNodes, totalNodes, failedNodes) // 5. Check exit condition @@ -596,10 +594,10 @@ func deleteMutatingWebhook(ctx context.Context, cl client.Client) error { } if err := cl.Delete(ctx, webhook); err != nil { if errors.IsNotFound(err) { - fmt.Println("\r Mutating webhook not found, assuming already deleted.") + fmt.Println("\r\033[K Mutating webhook not found, assuming already deleted.") return nil } - fmt.Printf("\r ⚠️ Error deleting mutating webhook: %v. Retrying...", err) + fmt.Printf("\r\033[K ⚠️ Error deleting mutating webhook: %v. Retrying...", err) continue } return nil @@ -639,17 +637,17 @@ func waitForModulePodsInitializing(ctx context.Context, cl client.Client, module err := cl.Get(ctx, types.NamespacedName{Name: dsName, Namespace: "d8-" + moduleName}, ds) if err != nil { if errors.IsNotFound(err) { - fmt.Printf("\r Waiting for DaemonSet '%s' in namespace 'd8-%s': not found...", dsName, moduleName) + fmt.Printf("\r\033[K Waiting for DaemonSet '%s' in namespace 'd8-%s': not found...", dsName, moduleName) continue } - fmt.Printf("\r Error getting DaemonSet '%s' in namespace 'd8-%s': %v. Retrying...", + fmt.Printf("\r\033[K Error getting DaemonSet '%s' in namespace 'd8-%s': %v. Retrying...", dsName, moduleName, err) continue } // Check if pods are scheduled if ds.Status.DesiredNumberScheduled == 0 { - fmt.Printf("\r Waiting for DaemonSet '%s' to schedule pods...", dsName) + fmt.Printf("\r\033[K Waiting for DaemonSet '%s' to schedule pods...", dsName) continue } @@ -660,12 +658,12 @@ func waitForModulePodsInitializing(ctx context.Context, cl client.Client, module client.MatchingLabels(ds.Spec.Selector.MatchLabels), } if err := cl.List(ctx, podList, opts...); err != nil { - fmt.Printf("\r Error listing pods for module '%s': %v. 
Retrying...", moduleName, err) + fmt.Printf("\r\033[K Error listing pods for module '%s': %v. Retrying...", moduleName, err) continue } if len(podList.Items) == 0 { - fmt.Printf("\r Waiting for pods of DaemonSet '%s' to be created...", dsName) + fmt.Printf("\r\033[K Waiting for pods of DaemonSet '%s' to be created...", dsName) continue } @@ -684,7 +682,7 @@ func waitForModulePodsInitializing(ctx context.Context, cl client.Client, module } } - fmt.Printf("\r Progress: %d/%d pods are in 'Initializing' state.", + fmt.Printf("\r\033[K Progress: %d/%d pods are in 'Initializing' state.", initializingPods, ds.Status.DesiredNumberScheduled) if int32(initializingPods) >= ds.Status.DesiredNumberScheduled { @@ -714,7 +712,7 @@ func waitForModulePodsTermination(ctx context.Context, cl client.Client, moduleN client.MatchingLabels(map[string]string{"app": dsName}), } if err := cl.List(ctx, podList, opts...); err != nil { - fmt.Printf("\r Error listing pods for module '%s': %v. Retrying...", moduleName, err) + fmt.Printf("\r\033[K Error listing pods for module '%s': %v. Retrying...", moduleName, err) continue } @@ -723,7 +721,7 @@ func waitForModulePodsTermination(ctx context.Context, cl client.Client, moduleN return nil } - fmt.Printf("\r Waiting for %d pods of module '%s' to terminate...", len(podList.Items), moduleName) + fmt.Printf("\r\033[K Waiting for %d pods of module '%s' to terminate...", len(podList.Items), moduleName) } } } @@ -781,7 +779,7 @@ func updateCNIMigrationPhase(ctx context.Context, cl client.Client, migrationNam if err := update(); err == nil { return nil } else { - fmt.Printf("\r ⚠️ %v. Retrying...", err) + fmt.Printf("\r\033[K ⚠️ %v. Retrying...", err) } ticker := time.NewTicker(2 * time.Second) @@ -793,7 +791,7 @@ func updateCNIMigrationPhase(ctx context.Context, cl client.Client, migrationNam return ctx.Err() case <-ticker.C: if err := update(); err != nil { - fmt.Printf("\r ⚠️ %v. Retrying...", err) + fmt.Printf("\r\033[K ⚠️ %v. Retrying...", err) continue } return nil From 1f3de542fc7e5b3041a3f407a9322462258d4269 Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Thu, 11 Dec 2025 19:30:24 +0400 Subject: [PATCH 08/16] ++ Signed-off-by: Denis Tarabrin --- internal/cni/switch.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/cni/switch.go b/internal/cni/switch.go index a66f0a35..b62392fd 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -637,7 +637,8 @@ func waitForModulePodsInitializing(ctx context.Context, cl client.Client, module err := cl.Get(ctx, types.NamespacedName{Name: dsName, Namespace: "d8-" + moduleName}, ds) if err != nil { if errors.IsNotFound(err) { - fmt.Printf("\r\033[K Waiting for DaemonSet '%s' in namespace 'd8-%s': not found...", dsName, moduleName) + fmt.Printf("\r\033[K Waiting for DaemonSet '%s' in namespace 'd8-%s': not found...", + dsName, moduleName) continue } fmt.Printf("\r\033[K Error getting DaemonSet '%s' in namespace 'd8-%s': %v. Retrying...", From 001b12879aa764cab0983517732e962793208e1f Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Thu, 11 Dec 2025 20:41:24 +0400 Subject: [PATCH 09/16] ++ Signed-off-by: Denis Tarabrin --- internal/cni/switch.go | 44 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/internal/cni/switch.go b/internal/cni/switch.go index b62392fd..11122280 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -212,8 +212,14 @@ func RunSwitch(timeout time.Duration) error { // 9. 
Wait for target CNI to be Ready // Now that NodeCleanupSucceeded is True, the target CNI pods should unblock and become Ready - if err = waitForModule(ctx, rtClient, "cni-"+strings.ToLower(targetCNI), true); err != nil { - return fmt.Errorf("waiting for module '%s' to be ready: %w", targetCNI, err) + targetModuleName := "cni-" + strings.ToLower(targetCNI) + fmt.Printf("Waiting for pods of module '%s' to become Ready...\n", targetModuleName) + dsName, err = getDaemonSetNameForCNI(targetModuleName) + if err != nil { + return fmt.Errorf("getting daemonset name for target CNI: %w", err) + } + if err = waitForModulePodsReady(ctx, rtClient, targetModuleName, dsName); err != nil { + return fmt.Errorf("waiting for target CNI pods to be ready: %w", err) } fmt.Printf("✅ CNI module 'cni-%s' is now Ready (total elapsed: %s)\n\n", targetCNI, @@ -694,6 +700,40 @@ func waitForModulePodsInitializing(ctx context.Context, cl client.Client, module } } +// waitForModulePodsReady waits for all pods of a module's daemonset to be in the 'Ready' state. +func waitForModulePodsReady(ctx context.Context, cl client.Client, moduleName string, dsName string) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + // Get the DaemonSet + ds := &appsv1.DaemonSet{} + err := cl.Get(ctx, types.NamespacedName{Name: dsName, Namespace: "d8-" + moduleName}, ds) + if err != nil { + fmt.Printf("\r\033[K Error getting DaemonSet '%s': %v. Retrying...", dsName, err) + continue + } + + if ds.Status.DesiredNumberScheduled == 0 { + fmt.Printf("\r\033[K Waiting for DaemonSet '%s' to schedule pods...", dsName) + continue + } + + fmt.Printf("\r\033[K Progress: %d/%d pods are Ready.", + ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) + + if ds.Status.NumberReady >= ds.Status.DesiredNumberScheduled { + fmt.Println("\n- All pods for target CNI are Ready.") + return nil + } + } + } +} + // waitForModulePodsTermination waits for all pods of a module's daemonset to be terminated. func waitForModulePodsTermination(ctx context.Context, cl client.Client, moduleName string, dsName string) error { fmt.Printf(" Waiting for pods of module '%s' to terminate...", moduleName) From ded5e0906bdb2363c04f545b4e658d879495d22c Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Fri, 12 Dec 2025 14:51:58 +0400 Subject: [PATCH 10/16] linter Signed-off-by: Denis Tarabrin --- internal/cni/cleanup.go | 5 +++-- internal/cni/common.go | 3 ++- internal/cni/prepare.go | 7 ++++--- internal/cni/rollback.go | 1 + internal/cni/switch.go | 14 +++++++------- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/internal/cni/cleanup.go b/internal/cni/cleanup.go index acce2b42..4ec771b5 100644 --- a/internal/cni/cleanup.go +++ b/internal/cni/cleanup.go @@ -22,8 +22,6 @@ import ( "strings" "time" - "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" - saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" admissionregistrationv1 "k8s.io/api/admissionregistration/v1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -31,6 +29,9 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" ) // RunCleanup executes the logic for the 'cni-switch cleanup' command. 
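An aside on the escape sequence used throughout the progress output in the hunks above: plain "\r" is consistently replaced with "\r\033[K". "\r" only returns the cursor to column 0, so when the next message is shorter than the previous one the tail of the old text stays on screen; "\033[K" is the ANSI "erase from cursor to end of line" sequence, so every update starts from a clean line. A minimal standalone sketch of the idiom, not taken from the patch:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Going from "10 pods" to "9 pods" would leave a stray trailing character
	// with a bare "\r"; "\033[K" wipes the previous message before reprinting.
	for pods := 10; pods > 0; pods-- {
		fmt.Printf("\r\033[K Waiting for %d pods of module 'cni-flannel' to terminate...", pods)
		time.Sleep(300 * time.Millisecond)
	}
	fmt.Println("\r\033[K- All pods terminated.")
}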
diff --git a/internal/cni/common.go b/internal/cni/common.go index ad867e64..a018f8d8 100644 --- a/internal/cni/common.go +++ b/internal/cni/common.go @@ -23,9 +23,10 @@ import ( "os" "strings" - "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime/schema" "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" ) const ( diff --git a/internal/cni/prepare.go b/internal/cni/prepare.go index 7fd01847..cb72efe8 100644 --- a/internal/cni/prepare.go +++ b/internal/cni/prepare.go @@ -29,14 +29,15 @@ import ( "strings" "time" - "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" - saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" ) // RunPrepare executes the logic for the 'cni-switch prepare' command. @@ -270,7 +271,7 @@ func RunPrepare(targetCNI string, timeout time.Duration) error { } // generateWebhookCertificates creates a self-signed CA and a server certificate for the webhook. -func generateWebhookCertificates(namespace string) (caCert, serverCert, serverKey []byte, err error) { +func generateWebhookCertificates(namespace string) ([]byte, []byte, []byte, error) { caSerialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) caSerialNumber, err := rand.Int(rand.Reader, caSerialNumberLimit) if err != nil { diff --git a/internal/cni/rollback.go b/internal/cni/rollback.go index bd383eff..52999599 100644 --- a/internal/cni/rollback.go +++ b/internal/cni/rollback.go @@ -24,5 +24,6 @@ import ( // RunRollback executes the logic for the 'cni-switch rollback' command. func RunRollback(timeout time.Duration) error { fmt.Println("Logic for rollback is not implemented yet.") + _ = timeout return nil } diff --git a/internal/cni/switch.go b/internal/cni/switch.go index 11122280..39ebef36 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -22,8 +22,6 @@ import ( "strings" "time" - "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" - saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" admissionregistrationv1 "k8s.io/api/admissionregistration/v1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -34,6 +32,9 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" ) // RunSwitch executes the logic for the 'cni-switch switch' command. 
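The "linter" commit makes two recurring mechanical changes: imports are regrouped into standard library, external modules, and the local github.com/deckhouse/deckhouse-cli packages as a separate trailing group, and unused identifiers are silenced (the "_ = timeout" line added to rollback.go above). For the latter, an equivalent option is to blank the parameter name itself, as the cobra handlers in the following "linter2" commit already do for their unused args; an illustrative alternative, not code from the patch:

package cni

import (
	"fmt"
	"time"
)

// RunRollback is not implemented yet. Blanking the parameter name keeps the
// signature identical for callers while marking the value as intentionally
// unused, so no "_ = timeout" assignment is needed in the body.
func RunRollback(_ time.Duration) error {
	fmt.Println("Logic for rollback is not implemented yet.")
	return nil
}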
@@ -371,8 +372,7 @@ func waitForModule(ctx context.Context, cl client.Client, moduleName string, sho return nil } fmt.Printf("\r\033[K Waiting for module '%s' to be Ready, current state: %s", moduleName, state) - - } else { // should NOT be ready (disabled) + } else { err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, module) if err != nil { if errors.IsNotFound(err) { @@ -817,11 +817,11 @@ func updateCNIMigrationPhase(ctx context.Context, cl client.Client, migrationNam return nil } - if err := update(); err == nil { + err := update() + if err == nil { return nil - } else { - fmt.Printf("\r\033[K ⚠️ %v. Retrying...", err) } + fmt.Printf("\r\033[K ⚠️ %v. Retrying...", err) ticker := time.NewTicker(2 * time.Second) defer ticker.Stop() From 4a5da95863b677390b8c55cc675db0ee7749703b Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Fri, 12 Dec 2025 15:05:10 +0400 Subject: [PATCH 11/16] linter2 Signed-off-by: Denis Tarabrin --- cmd/commands/cni.go | 15 ++++++++------- cmd/commands/kubectl.go | 3 ++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cmd/commands/cni.go b/cmd/commands/cni.go index 229a6b86..8d1ac8ab 100644 --- a/cmd/commands/cni.go +++ b/cmd/commands/cni.go @@ -22,11 +22,12 @@ import ( "strings" "time" - "github.com/deckhouse/deckhouse-cli/internal/cni" "github.com/go-logr/logr" "github.com/spf13/cobra" "k8s.io/kubectl/pkg/util/templates" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" + + "github.com/deckhouse/deckhouse-cli/internal/cni" ) var ( @@ -81,7 +82,7 @@ func NewCmdCniPrepare() *cobra.Command { Use: "prepare", Short: "Prepares the cluster for CNI switching", Example: cniPrepareExample, - PreRunE: func(cmd *cobra.Command, args []string) error { + PreRunE: func(cmd *cobra.Command, _ []string) error { targetCNI, _ := cmd.Flags().GetString("to-cni") for _, supported := range supportedCNIs { if strings.ToLower(targetCNI) == supported { @@ -95,7 +96,7 @@ func NewCmdCniPrepare() *cobra.Command { ) }, - Run: func(cmd *cobra.Command, args []string) { + Run: func(cmd *cobra.Command, _ []string) { targetCNI, _ := cmd.Flags().GetString("to-cni") timeout, _ := cmd.Flags().GetDuration("timeout") if err := cni.RunPrepare(targetCNI, timeout); err != nil { @@ -117,7 +118,7 @@ func NewCmdCniSwitch() *cobra.Command { Use: "switch", Short: "Performs the CNI switching", Example: cniSwitchExample, - Run: func(cmd *cobra.Command, args []string) { + Run: func(cmd *cobra.Command, _ []string) { timeout, _ := cmd.Flags().GetDuration("timeout") if err := cni.RunSwitch(timeout); err != nil { log.Fatalf("❌ Error running switch command: %v", err) @@ -132,7 +133,7 @@ func NewCmdCniCleanup() *cobra.Command { Use: "cleanup", Short: "Cleans up resources created during CNI switching", Example: cniCleanupExample, - Run: func(cmd *cobra.Command, args []string) { + Run: func(cmd *cobra.Command, _ []string) { timeout, _ := cmd.Flags().GetDuration("timeout") if err := cni.RunCleanup(timeout); err != nil { log.Fatalf("❌ Error running cleanup command: %v", err) @@ -146,8 +147,8 @@ func NewCmdCniRollback() *cobra.Command { // TDEN It needs to be done! 
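The refactored updateCNIMigrationPhase above now shows most cleanly the loop shape shared by toggleModule and the various wait* helpers in switch.go: attempt the operation once, then poll on a ticker until it succeeds or the context (bounded by the command's --timeout flag) expires. A distilled, self-contained sketch of that pattern; the retryUntil name and the demo operation are illustrative, not from the patch:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// retryUntil runs op immediately, then every interval until it succeeds or
// ctx is cancelled, printing transient errors on a self-clearing line.
func retryUntil(ctx context.Context, interval time.Duration, op func() error) error {
	err := op()
	if err == nil {
		return nil
	}
	fmt.Printf("\r\033[K ⚠️ %v. Retrying...", err)

	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			if err := op(); err != nil {
				fmt.Printf("\r\033[K ⚠️ %v. Retrying...", err)
				continue
			}
			return nil
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	attempts := 0
	err := retryUntil(ctx, 2*time.Second, func() error {
		attempts++
		if attempts < 3 {
			return errors.New("not ready yet")
		}
		return nil
	})
	fmt.Printf("\nfinished after %d attempts: %v\n", attempts, err)
}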
cmd := &cobra.Command{ Use: "rollback", Short: "Rollback all changes and restore previous CNI", - Example: cniCleanupExample, - Run: func(cmd *cobra.Command, args []string) { + Example: cniRollbackExample, + Run: func(cmd *cobra.Command, _ []string) { timeout, _ := cmd.Flags().GetDuration("timeout") if err := cni.RunRollback(timeout); err != nil { log.Fatalf("❌ Error running rollback command: %v", err) diff --git a/cmd/commands/kubectl.go b/cmd/commands/kubectl.go index 5a65cfc7..5f14c148 100644 --- a/cmd/commands/kubectl.go +++ b/cmd/commands/kubectl.go @@ -25,7 +25,6 @@ import ( "regexp" "time" - "github.com/deckhouse/deckhouse-cli/internal/cni" "github.com/spf13/cobra" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/cli-runtime/pkg/genericclioptions" @@ -33,6 +32,8 @@ import ( cliflag "k8s.io/component-base/cli/flag" "k8s.io/component-base/logs" kubecmd "k8s.io/kubectl/pkg/cmd" + + "github.com/deckhouse/deckhouse-cli/internal/cni" ) const ( From 732b2b60a89acd2895b67a9500ae946b34864991 Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Fri, 12 Dec 2025 15:34:16 +0400 Subject: [PATCH 12/16] message fix Signed-off-by: Denis Tarabrin --- internal/cni/switch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/cni/switch.go b/internal/cni/switch.go index 39ebef36..20cc89e5 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -196,7 +196,7 @@ func RunSwitch(timeout time.Duration) error { time.Since(startTime).Round(time.Millisecond)) // 8. This status update is CRITICAL. It unblocks the target CNI's init-container. - fmt.Println("Signaling target CNI pods to proceed by updating CNIMigration status...") + fmt.Println("Signaling 'NodeCleanupSucceeded' to proceed with pod restart...") if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ Type: "NodeCleanupSucceeded", Status: metav1.ConditionTrue, From 010965b0a40a394b4097d29b135a91163ce39bc9 Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Mon, 19 Jan 2026 19:31:07 +0400 Subject: [PATCH 13/16] + Signed-off-by: Denis Tarabrin --- .../cni/api/v1alpha1/cni_migration_types.go | 16 +- .../api/v1alpha1/cni_node_migration_types.go | 12 +- internal/cni/cleanup.go | 297 +------ internal/cni/prepare.go | 525 +----------- internal/cni/switch.go | 799 +----------------- 5 files changed, 76 insertions(+), 1573 deletions(-) diff --git a/internal/cni/api/v1alpha1/cni_migration_types.go b/internal/cni/api/v1alpha1/cni_migration_types.go index 5cf9a76d..3e2bbdd3 100644 --- a/internal/cni/api/v1alpha1/cni_migration_types.go +++ b/internal/cni/api/v1alpha1/cni_migration_types.go @@ -31,23 +31,17 @@ import ( // and tracks the overall progress across all nodes. type CNIMigration struct { metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` + metav1.ObjectMeta `json:"metadata"` // Spec defines the desired state of CNIMigration. - Spec CNIMigrationSpec `json:"spec,omitempty"` + Spec CNIMigrationSpec `json:"spec"` // Status defines the observed state of CNIMigration. - Status CNIMigrationStatus `json:"status,omitempty"` + Status CNIMigrationStatus `json:"status"` } -// CNIMigrationSpec defines the desired state of CNIMigration. -// +k8s:deepcopy-gen=true type CNIMigrationSpec struct { // TargetCNI is the CNI to switch to. - // Set by the d8 cli utility when starting Phase 1. TargetCNI string `json:"targetCNI"` - // Phase is the phase controlled by the d8 cli to command the agents. - // Possible values: Prepare, Migrate, Cleanup, Abort. 
- Phase string `json:"phase"` } // CNIMigrationStatus defines the observed state of CNIMigration. @@ -57,7 +51,7 @@ type CNIMigrationStatus struct { CurrentCNI string `json:"currentCNI,omitempty"` // NodesTotal is the total number of nodes involved in the migration. NodesTotal int `json:"nodesTotal,omitempty"` - // NodesSucceeded is the number of nodes that have successfully completed the current phase. + // NodesSucceeded is the number of nodes that have successfully completed the migration. NodesSucceeded int `json:"nodesSucceeded,omitempty"` // NodesFailed is the number of nodes where an error occurred. NodesFailed int `json:"nodesFailed,omitempty"` @@ -70,6 +64,6 @@ type CNIMigrationStatus struct { // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object type CNIMigrationList struct { metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata,omitempty"` + metav1.ListMeta `json:"metadata"` Items []CNIMigration `json:"items"` } diff --git a/internal/cni/api/v1alpha1/cni_node_migration_types.go b/internal/cni/api/v1alpha1/cni_node_migration_types.go index b8536266..e8be2550 100644 --- a/internal/cni/api/v1alpha1/cni_node_migration_types.go +++ b/internal/cni/api/v1alpha1/cni_node_migration_types.go @@ -31,12 +31,12 @@ import ( // The d8 cli reads these resources to display detailed status. type CNINodeMigration struct { metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` + metav1.ObjectMeta `json:"metadata"` // Spec can be empty, as all configuration is taken from the parent CNIMigration resource. - Spec CNINodeMigrationSpec `json:"spec,omitempty"` + Spec CNINodeMigrationSpec `json:"spec"` // Status defines the observed state of CNINodeMigration. - Status CNINodeMigrationStatus `json:"status,omitempty"` + Status CNINodeMigrationStatus `json:"status"` } // CNINodeMigrationSpec defines the desired state of CNINodeMigration. @@ -45,11 +45,7 @@ type CNINodeMigrationSpec struct { // The spec can be empty, as all configuration is taken from the parent CNIMigration resource. } -// CNINodeMigrationStatus defines the observed state of CNINodeMigration. -// +k8s:deepcopy-gen=true type CNINodeMigrationStatus struct { - // Phase is the phase of this particular node. - Phase string `json:"phase,omitempty"` // Conditions are the detailed conditions reflecting the steps performed on the node. Conditions []metav1.Condition `json:"conditions,omitempty"` } @@ -58,6 +54,6 @@ type CNINodeMigrationStatus struct { // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object type CNINodeMigrationList struct { metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata,omitempty"` + metav1.ListMeta `json:"metadata"` Items []CNINodeMigration `json:"items"` } diff --git a/internal/cni/cleanup.go b/internal/cni/cleanup.go index 4ec771b5..23ac6f56 100644 --- a/internal/cni/cleanup.go +++ b/internal/cni/cleanup.go @@ -19,32 +19,23 @@ package cni import ( "context" "fmt" - "strings" "time" - admissionregistrationv1 "k8s.io/api/admissionregistration/v1" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" ) // RunCleanup executes the logic for the 'cni-switch cleanup' command. 
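A note on the tag changes in the API types above: for struct-typed fields such as spec and status, dropping ",omitempty" does not change what encoding/json emits. The option only suppresses zero values (false, 0, "", nil, and empty slices/maps); a struct value is never treated as empty, so these fields were serialized either way. Code generators such as controller-gen may still read the tag when deciding whether a field is optional in the CRD schema, so the change presumably also signals that spec and status are required. A small demonstration; the type names are illustrative, not from the patch:

package main

import (
	"encoding/json"
	"fmt"
)

type specT struct {
	TargetCNI string `json:"targetCNI"`
}

type withOmitempty struct {
	Spec specT `json:"spec,omitempty"`
}

type withoutOmitempty struct {
	Spec specT `json:"spec"`
}

func main() {
	a, _ := json.Marshal(withOmitempty{})
	b, _ := json.Marshal(withoutOmitempty{})
	// Both print {"spec":{"targetCNI":""}}: omitempty never omits a
	// zero-value struct, so removing it leaves the JSON output unchanged.
	fmt.Println(string(a))
	fmt.Println(string(b))
}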
-// It performs a robust cleanup by deleting a known list of resources -// and waiting for them to be fully terminated. +// In the new architecture, it deletes the CNIMigration resource. func RunCleanup(timeout time.Duration) error { - startTime := time.Now() ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() - fmt.Printf("🚀 Starting CNI switch cleanup (global timeout: %s)\n", timeout) + fmt.Printf("🚀 Starting CNI switch cleanup (timeout: %s)\n", timeout) - // 1. Create Kubernetes client + // 1. Create a Kubernetes client safeClient, err := saferequest.NewSafeClient() if err != nil { return fmt.Errorf("creating safe client: %w", err) @@ -54,284 +45,30 @@ func RunCleanup(timeout time.Duration) error { if err != nil { return fmt.Errorf("creating runtime client: %w", err) } - fmt.Printf("✅ Kubernetes client created\n") - // 2. Delete cluster-scoped resources first, with waiting - fmt.Println("\nDeleting cluster-scoped resources...") - webhookConfig := &admissionregistrationv1.MutatingWebhookConfiguration{ - ObjectMeta: metav1.ObjectMeta{Name: webhookConfigName}, - } - if err = deleteAndWait(ctx, rtClient, webhookConfig); err != nil { - return err - } - - // 3. Stop active controllers first to prevent reconciliation loops - fmt.Printf(" Stopping active controllers in '%s'...\n", cniSwitchNamespace) - helperDs := &appsv1.DaemonSet{ - ObjectMeta: metav1.ObjectMeta{Name: switchHelperDaemonSetName, Namespace: cniSwitchNamespace}, - } - if err = deleteAndWait(ctx, rtClient, helperDs); err != nil { - return err - } - webhookDep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{Name: webhookDeploymentName, Namespace: cniSwitchNamespace}, - } - if err = deleteAndWait(ctx, rtClient, webhookDep); err != nil { - return err - } - fmt.Println("✅ Active controllers stopped") - - // 4. Delete all CNIMigration resources - fmt.Println("\nDeleting all CNIMigration resources...") + // 2. Find and delete all CNIMigration resources migrations := &v1alpha1.CNIMigrationList{} - if err = rtClient.List(ctx, migrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { + if err := rtClient.List(ctx, migrations); err != nil { return fmt.Errorf("listing CNIMigrations: %w", err) } - for _, m := range migrations.Items { - // Remove finalizers to ensure deletion even if controller is down - if err = removeFinalizers(ctx, rtClient, &m); err != nil { - fmt.Printf("⚠️ Warning: failed to remove finalizers from %s: %v\n", m.Name, err) - } - if err = deleteAndWait(ctx, rtClient, &m); err != nil { - return err - } - } - fmt.Println("✅ All CNIMigration resources deleted") - - // 5. Delete all CNINodeMigration resources - fmt.Println("\nDeleting all CNINodeMigration resources...") - nodeMigrations := &v1alpha1.CNINodeMigrationList{} - if err = rtClient.List(ctx, nodeMigrations); err != nil && !strings.Contains(err.Error(), "no matches for kind") { - return fmt.Errorf("listing CNINodeMigrations: %w", err) - } - for _, nm := range nodeMigrations.Items { - // Remove finalizers to ensure deletion even if controller is down - if err = removeFinalizers(ctx, rtClient, &nm); err != nil { - fmt.Printf("⚠️ Warning: failed to remove finalizers from %s: %v\n", nm.Name, err) - } - if err = deleteAndWait(ctx, rtClient, &nm); err != nil { - return err - } - } - fmt.Println("✅ All CNINodeMigration resources deleted") - - // 6. 
Remove annotations from all pods - if err = removePodAnnotations(ctx, rtClient); err != nil { - // Non-fatal, print a warning - fmt.Printf("⚠️ Warning: failed to remove all pod annotations: %v\n", err) - } - - // 7. Delete RBAC resources - fmt.Println("\nDeleting RBAC resources...") - clusterRoleHelper := &rbacv1.ClusterRole{ - ObjectMeta: metav1.ObjectMeta{Name: switchHelperClusterRoleName}, - } - if err = deleteAndWait(ctx, rtClient, clusterRoleHelper); err != nil { - return err - } - clusterRoleWebhook := &rbacv1.ClusterRole{ - ObjectMeta: metav1.ObjectMeta{Name: webhookClusterRoleName}, - } - if err = deleteAndWait(ctx, rtClient, clusterRoleWebhook); err != nil { - return err - } - clusterRoleBindingHelper := &rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{Name: switchHelperClusterRoleBindingName}, - } - if err = deleteAndWait(ctx, rtClient, clusterRoleBindingHelper); err != nil { - return err - } - clusterRoleBindingWebhook := &rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{Name: webhookClusterRoleBindingName}, - } - if err = deleteAndWait(ctx, rtClient, clusterRoleBindingWebhook); err != nil { - return err - } - fmt.Println("✅ Cluster-scoped RBAC resources deleted") - - // 8. Delete remaining namespaced resources - fmt.Printf("\nDeleting remaining namespaced resources in '%s'...\n", cniSwitchNamespace) - webhookService := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{Name: webhookServiceName, Namespace: cniSwitchNamespace}, - } - if err = deleteAndWait(ctx, rtClient, webhookService); err != nil { - return err - } - webhookSecret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{Name: webhookSecretName, Namespace: cniSwitchNamespace}, - } - if err = deleteAndWait(ctx, rtClient, webhookSecret); err != nil { - return err - } - helperSA := &corev1.ServiceAccount{ - ObjectMeta: metav1.ObjectMeta{Name: switchHelperServiceAccountName, Namespace: cniSwitchNamespace}, - } - if err = deleteAndWait(ctx, rtClient, helperSA); err != nil { - return err - } - webhookSA := &corev1.ServiceAccount{ - ObjectMeta: metav1.ObjectMeta{Name: webhookServiceAccountName, Namespace: cniSwitchNamespace}, - } - if err = deleteAndWait(ctx, rtClient, webhookSA); err != nil { - return err - } - fmt.Println("✅ Remaining namespaced resources deleted") - - // 9. Delete the namespace - fmt.Printf("\nDeleting namespace '%s'...\n", cniSwitchNamespace) - ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: cniSwitchNamespace}} - if err = deleteAndWait(ctx, rtClient, ns); err != nil { - return err - } - fmt.Println("✅ Namespace successfully deleted") - - fmt.Printf("\n🎉 CNI switch cleanup successfully completed (total time: %s)\n", - time.Since(startTime).Round(time.Second), - ) - return nil -} -// deleteAndWait deletes a resource and waits for it to be fully terminated. -func deleteAndWait(ctx context.Context, rtClient client.Client, obj client.Object) error { - if err := deleteResource(ctx, rtClient, obj); err != nil { - // This handles the "already deleted" case, so we don't need to check again. - return err - } - return waitForResourceDeletion(ctx, rtClient, obj) -} - -// deleteResource just initiates deletion for a Kubernetes resource. -func deleteResource(ctx context.Context, rtClient client.Client, obj client.Object) error { - kind := getKind(obj) - name := obj.GetName() - ns := obj.GetNamespace() - - fmt.Printf("- Deleting %s '%s%s'... 
", kind, name, func() string { - if ns != "" { - return fmt.Sprintf("' in namespace '%s", ns) - } - return "" - }()) - - err := rtClient.Delete(ctx, obj, client.PropagationPolicy(metav1.DeletePropagationBackground)) - if err != nil { - if errors.IsNotFound(err) { - fmt.Println("already deleted.") - return nil - } - return fmt.Errorf("failed to delete %s '%s': %w", kind, name, err) - } - - fmt.Println("deleted.") - return nil -} - -// waitForResourceDeletion polls until a resource is confirmed to be gone. -func waitForResourceDeletion(ctx context.Context, rtClient client.Client, obj client.Object) error { - key := client.ObjectKey{Name: obj.GetName(), Namespace: obj.GetNamespace()} - kind := getKind(obj) - - // Check if the resource is already gone before starting to wait. - if err := rtClient.Get(ctx, key, obj); err != nil { - if errors.IsNotFound(err) { - // It was already gone, no need to wait. - return nil - } - return fmt.Errorf("getting %s '%s': %w", kind, key.Name, err) - } - - fmt.Printf(" Waiting for %s '%s' to terminate... ", kind, key.Name) - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - fmt.Println("timeout.") - return fmt.Errorf("timed out waiting for %s '%s' to be deleted", kind, key.Name) - case <-ticker.C: - err := rtClient.Get(ctx, key, obj) - if err != nil { - if errors.IsNotFound(err) { - fmt.Println("terminated.") - return nil // Success! - } - if strings.Contains(err.Error(), "no matches for kind") { - fmt.Println("Kind not known, assuming terminated.") - return nil - } - fmt.Printf("error: %v\n", err) - return fmt.Errorf("getting %s '%s': %w", kind, key.Name, err) - } - - // If the resource is still there, check for finalizers and remove them - // to ensure it doesn't get stuck in Terminating state. - if len(obj.GetFinalizers()) > 0 { - _ = removeFinalizers(ctx, rtClient, obj) - } - } - } -} - -func removePodAnnotations(ctx context.Context, rtClient client.Client) error { - fmt.Println("\nRemoving CNI switch annotations from all pods...") - - podList := &corev1.PodList{} - if err := rtClient.List(ctx, podList); err != nil { - return fmt.Errorf("listing all pods: %w", err) + if len(migrations.Items) == 0 { + fmt.Println("✅ No active migrations found.") + return nil } - podsPatched := 0 - for _, pod := range podList.Items { - if pod.Spec.HostNetwork { - continue - } - - if _, ok := pod.Annotations[EffectiveCNIAnnotation]; ok { - // Use a merge patch to remove just the one annotation - patch := []byte(fmt.Sprintf(`{"metadata":{"annotations":{"%s":null}}}`, EffectiveCNIAnnotation)) - err := rtClient.Patch(ctx, &pod, client.RawPatch(client.Merge.Type(), patch)) - if err != nil { - if errors.IsNotFound(err) { - // No need to print warning, just continue - continue - } - fmt.Printf("\n⚠️ Warning: failed to patch pod %s/%s: %v", pod.Namespace, pod.Name, err) - continue + for _, m := range migrations.Items { + fmt.Printf("Deleting CNIMigration '%s'...", m.Name) + if err := rtClient.Delete(ctx, &m); err != nil { + if !errors.IsNotFound(err) { + return fmt.Errorf("deleting CNIMigration %s: %w", m.Name, err) } - podsPatched++ - fmt.Printf("\r\033[K Patched %d pods...", podsPatched) + fmt.Println(" already deleted.") + } else { + fmt.Println(" done.") } } - if podsPatched > 0 { - fmt.Printf("\n✅ Removed annotations from %d pods.\n", podsPatched) - } else { - fmt.Print("✅ No pods with CNI switch annotations were found.\n") - } - + fmt.Println("\n🎉 Cleanup triggered. 
The cluster-internal controllers will handle the rest.") return nil } - -// getKind extracts a user-friendly kind from a runtime object. -func getKind(obj client.Object) string { - kind := obj.GetObjectKind().GroupVersionKind().Kind - if kind == "" { - t := fmt.Sprintf("%T", obj) - parts := strings.Split(t, ".") - if len(parts) > 0 { - return parts[len(parts)-1] - } - } - return kind -} - -// removeFinalizers patches the object to remove all finalizers. -func removeFinalizers(ctx context.Context, rtClient client.Client, obj client.Object) error { - if len(obj.GetFinalizers()) == 0 { - return nil - } - - patch := []byte(`{"metadata":{"finalizers":null}}`) - return rtClient.Patch(ctx, obj, client.RawPatch(client.Merge.Type(), patch)) -} diff --git a/internal/cni/prepare.go b/internal/cni/prepare.go index cb72efe8..268e0752 100644 --- a/internal/cni/prepare.go +++ b/internal/cni/prepare.go @@ -17,24 +17,12 @@ limitations under the License. package cni import ( - "bytes" "context" - "crypto/rand" - "crypto/rsa" - "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" "fmt" - "math/big" - "strings" "time" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "sigs.k8s.io/controller-runtime/pkg/client" "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" @@ -56,8 +44,7 @@ func RunPrepare(targetCNI string, timeout time.Duration) error { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() - fmt.Printf("🚀 Starting CNI switch preparation for target '%s' (global timeout: %s)\n", - targetCNI, timeout) + fmt.Printf("🚀 Starting CNI switch preparation for target '%s'\n", targetCNI) // 1. Create a Kubernetes client safeClient, err := saferequest.NewSafeClient() @@ -69,519 +56,31 @@ func RunPrepare(targetCNI string, timeout time.Duration) error { if err != nil { return fmt.Errorf("creating runtime client: %w", err) } - fmt.Printf("✅ Kubernetes client created (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - // 2. Find an existing migration or create a new one - activeMigration, err := getOrCreateMigrationForPrepare(ctx, rtClient, targetCNI) - if err != nil { - return err - } - if activeMigration == nil { - // This means preparation is already complete, and the user has been notified. - return nil - } - fmt.Printf( - "✅ Working with CNIMigration '%s' (total elapsed: %s)\n\n", - activeMigration.Name, - time.Since(startTime).Round(time.Millisecond), - ) - - // 3. Detect current CNI and update migration status - if activeMigration.Status.CurrentCNI == "" { - var currentCNI string - currentCNI, err = detectCurrentCNI(rtClient) - if err != nil { - return fmt.Errorf("detecting current CNI: %w", err) - } - fmt.Printf("Detected current CNI: '%s'\n", currentCNI) - - if currentCNI == targetCNI { - return fmt.Errorf("target CNI '%s' is the same as the current CNI. Nothing to do", targetCNI) - } - - activeMigration.Status.CurrentCNI = currentCNI - err = rtClient.Status().Update(ctx, activeMigration) - if err != nil { - return fmt.Errorf("updating migration status with current CNI: %w", err) - } - fmt.Printf( - "✅ Added current CNI to migration status (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond), - ) - } - - // 4. 
Create the dedicated namespace - fmt.Printf("Creating dedicated namespace '%s'...\n", cniSwitchNamespace) - ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: cniSwitchNamespace}} - if err = rtClient.Create(ctx, ns); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating namespace %s: %w", cniSwitchNamespace, err) - } - fmt.Printf("✅ Namespace created (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 5. Get the helper image name from the configmap - cm := &corev1.ConfigMap{} - if err = rtClient.Get(ctx, client.ObjectKey{Name: CMDataName, Namespace: CMDataNameSpace}, cm); err != nil { - return fmt.Errorf("getting %s configmap: %w", CMDataName, err) - } - imageName, ok := cm.Data[cmDataHelperImage] - if !ok || imageName == "" { - return fmt.Errorf("%s not found or empty in %s configmap", cmDataHelperImage, CMDataName) - } - - // 6. Apply RBAC - fmt.Println("Applying RBAC...") - // Helper RBAC - helperSA := getSwitchHelperServiceAccount(cniSwitchNamespace) - if err = rtClient.Create(ctx, helperSA); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating helper service account: %w", err) - } - fmt.Printf("- Helper's ServiceAccount '%s' created\n", helperSA.Name) - - helperRole := getSwitchHelperClusterRole() - if err = rtClient.Create(ctx, helperRole); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating cluster role: %w", err) - } - fmt.Printf("- Helper's ClusterRole '%s' created\n", helperRole.Name) - - helperBinding := getSwitchHelperClusterRoleBinding(cniSwitchNamespace) - if err = rtClient.Create(ctx, helperBinding); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating cluster role binding: %w", err) - } - fmt.Printf("- Helper's ClusterRoleBinding '%s' created\n", helperBinding.Name) - - // Webhook RBAC - webhookSA := getWebhookServiceAccount(cniSwitchNamespace) - if err = rtClient.Create(ctx, webhookSA); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating webhook service account: %w", err) - } - fmt.Printf("- Webhook's ServiceAccount '%s' created\n", webhookSA.Name) - - webhookRole := getWebhookClusterRole() - if err = rtClient.Create(ctx, webhookRole); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating webhook cluster role: %w", err) - } - fmt.Printf("- Webhook's ClusterRole '%s' created\n", webhookRole.Name) - - webhookBinding := getWebhookClusterRoleBinding(cniSwitchNamespace) - if err = rtClient.Create(ctx, webhookBinding); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating webhook cluster role binding: %w", err) - } - fmt.Printf("- Webhook's ClusterRoleBinding '%s' created\n", webhookBinding.Name) - fmt.Printf("✅ RBAC applied (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 7. 
Create and wait for the mutating webhook - fmt.Println("Deploying Mutating Webhook for annotating new pods...") - // Generate certificates - caCert, serverCert, serverKey, err := generateWebhookCertificates(cniSwitchNamespace) - if err != nil { - return fmt.Errorf("generating webhook certificates: %w", err) - } - fmt.Printf("- TLS certificate generated\n") - - // Create TLS secret - tlsSecret := getWebhookTLSSecret(cniSwitchNamespace, serverCert, serverKey) - if err = rtClient.Create(ctx, tlsSecret); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating webhook tls secret: %w", err) - } - fmt.Printf("- Secret with TLS certificate '%s' created\n", tlsSecret.Name) - - // Create Deployment - webhookDeployment := getWebhookDeployment(cniSwitchNamespace, imageName, webhookServiceAccountName) - if err = rtClient.Create(ctx, webhookDeployment); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating webhook deployment: %w", err) - } - - // Wait for Deployment to be ready - if err = waitForDeploymentReady(ctx, rtClient, webhookDeployment); err != nil { - return fmt.Errorf("waiting for webhook deployment ready: %w", err) - } - fmt.Printf("- Webhook Deployment '%s' created\n", webhookDeployment.Name) - - // Create Service - webhookService := getWebhookService(cniSwitchNamespace) - if err = rtClient.Create(ctx, webhookService); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating webhook service: %w", err) - } - fmt.Printf("- Webhook Service '%s' created\n", webhookService.Name) - - // Create MutatingWebhookConfiguration - webhookConfig := getMutatingWebhookConfiguration(cniSwitchNamespace, caCert) - if err = rtClient.Create(ctx, webhookConfig); err != nil && !errors.IsAlreadyExists(err) { - return fmt.Errorf("creating mutating webhook configuration: %w", err) - } - fmt.Printf("✅ Mutating Webhook '%s' is active (total elapsed: %s)\n\n", - webhookConfig.Name, time.Since(startTime).Round(time.Millisecond)) - - // 8. Create and wait for the Helper daemonset - dsKey := client.ObjectKey{Name: switchHelperDaemonSetName, Namespace: cniSwitchNamespace} - ds := &appsv1.DaemonSet{} - if err = rtClient.Get(ctx, dsKey, ds); err != nil { - if !errors.IsNotFound(err) { - return fmt.Errorf("getting helper daemonset: %w", err) - } - fmt.Printf("Creating helper DaemonSet '%s'...\n", switchHelperDaemonSetName) - dsToCreate := getSwitchHelperDaemonSet(cniSwitchNamespace, imageName) - if err = rtClient.Create(ctx, dsToCreate); err != nil { - return fmt.Errorf("creating helper daemonset: %w", err) - } - ds = dsToCreate - } else { - fmt.Printf("Helper DaemonSet '%s' already exists.\n", switchHelperDaemonSetName) - } - - if err = waitForDaemonSetReady(ctx, rtClient, ds); err != nil { - return fmt.Errorf("waiting for daemonset ready: %w", err) - } - fmt.Printf("✅ Helper DaemonSet is ready (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 9. Wait for all nodes to be prepared - fmt.Println("Waiting for all nodes to complete the preparation step...") - err = waitForNodesPrepared(ctx, rtClient) - if err != nil { - return fmt.Errorf("waiting for nodes to be prepared: %w", err) - } - fmt.Printf("✅ All CNINodeMigrations are created and all nodes are prepared (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 10. 
Update overall status - activeMigration.Status.Conditions = append(activeMigration.Status.Conditions, metav1.Condition{ - Type: "PreparationSucceeded", - Status: metav1.ConditionTrue, - LastTransitionTime: metav1.Now(), - Reason: "AllNodesPrepared", - Message: "All nodes successfully completed the preparation step.", - }) - - err = rtClient.Status().Update(ctx, activeMigration) - if err != nil { - return fmt.Errorf("updating CNIMigration status to prepared: %w", err) - } - - fmt.Printf( - "🎉 Cluster successfully prepared for CNI switch (total time: %s)\n", - time.Since(startTime).Round(time.Second), - ) - fmt.Println("\nYou can now run 'd8 cni-switch switch' to proceed") - - return nil -} - -// generateWebhookCertificates creates a self-signed CA and a server certificate for the webhook. -func generateWebhookCertificates(namespace string) ([]byte, []byte, []byte, error) { - caSerialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) - caSerialNumber, err := rand.Int(rand.Reader, caSerialNumberLimit) - if err != nil { - return nil, nil, nil, fmt.Errorf("failed to generate CA serial number: %w", err) - } - - // CA configuration - ca := &x509.Certificate{ - SerialNumber: caSerialNumber, - Subject: pkix.Name{ - Organization: []string{"deckhouse.io"}, - }, - NotBefore: time.Now(), - NotAfter: time.Now().AddDate(1, 0, 0), - IsCA: true, - ExtKeyUsage: []x509.ExtKeyUsage{ - x509.ExtKeyUsageClientAuth, - x509.ExtKeyUsageServerAuth, - }, - KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign, - BasicConstraintsValid: true, - } - - caPrivKey, err := rsa.GenerateKey(rand.Reader, 4096) - if err != nil { - return nil, nil, nil, fmt.Errorf("generating CA private key: %w", err) - } - - caBytes, err := x509.CreateCertificate(rand.Reader, ca, ca, &caPrivKey.PublicKey, caPrivKey) - if err != nil { - return nil, nil, nil, fmt.Errorf("creating CA certificate: %w", err) - } - - caPEM := new(bytes.Buffer) - _ = pem.Encode(caPEM, &pem.Block{ - Type: "CERTIFICATE", - Bytes: caBytes, - }) - - serverSerialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) - serverSerialNumber, err := rand.Int(rand.Reader, serverSerialNumberLimit) - if err != nil { - return nil, nil, nil, fmt.Errorf("failed to generate server serial number: %w", err) - } - - // Server certificate configuration - commonName := fmt.Sprintf("%s.%s.svc", webhookServiceName, namespace) - cert := &x509.Certificate{ - SerialNumber: serverSerialNumber, - Subject: pkix.Name{ - CommonName: commonName, - Organization: []string{"deckhouse.io"}, - }, - DNSNames: []string{commonName}, - NotBefore: time.Now(), - NotAfter: time.Now().AddDate(1, 0, 0), - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - KeyUsage: x509.KeyUsageDigitalSignature, - } - - serverPrivKey, err := rsa.GenerateKey(rand.Reader, 4096) - if err != nil { - return nil, nil, nil, fmt.Errorf("generating server private key: %w", err) - } - - serverCertBytes, err := x509.CreateCertificate(rand.Reader, cert, ca, &serverPrivKey.PublicKey, caPrivKey) - if err != nil { - return nil, nil, nil, fmt.Errorf("creating server certificate: %w", err) - } - - serverCertPEM := new(bytes.Buffer) - _ = pem.Encode(serverCertPEM, &pem.Block{ - Type: "CERTIFICATE", - Bytes: serverCertBytes, - }) - - serverPrivKeyPEM := new(bytes.Buffer) - _ = pem.Encode(serverPrivKeyPEM, &pem.Block{ - Type: "RSA PRIVATE KEY", - Bytes: x509.MarshalPKCS1PrivateKey(serverPrivKey), - }) - - return caPEM.Bytes(), serverCertPEM.Bytes(), serverPrivKeyPEM.Bytes(), nil -} - -func getOrCreateMigrationForPrepare( - ctx 
context.Context, rtClient client.Client, targetCNI string, -) (*v1alpha1.CNIMigration, error) { - activeMigration, err := FindActiveMigration(ctx, rtClient) - if err != nil { - return nil, fmt.Errorf("failed to find active migration: %w", err) - } - - if activeMigration != nil { - fmt.Printf("Found active CNIMigration '%s'\n", activeMigration.Name) - - // If an active migration is found, ensure its target CNI matches the requested target CNI. - if activeMigration.Spec.TargetCNI != targetCNI { - return nil, fmt.Errorf( - "an active CNI migration to '%s' is already in progress. "+ - "Cannot prepare for '%s'. To change the target CNI, "+ - "please run 'd8 cni-switch cleanup' first to reset the state", - activeMigration.Spec.TargetCNI, - targetCNI, - ) - } - - // Check if preparation is already done - for _, cond := range activeMigration.Status.Conditions { - if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { - fmt.Println("🎉 Cluster has already been prepared for CNI switch.") - fmt.Println("\nYou can now run 'd8 cni-switch switch' to proceed.") - return nil, nil // Signal to the caller that we can exit gracefully - } - } - - // Ensure the migration is in the 'Prepare' phase - if activeMigration.Spec.Phase != "Prepare" { - return nil, fmt.Errorf( - "an active migration is already in the '%s' phase. "+ - "Cannot run 'prepare' again. To proceed, run 'd8 cni-switch switch'. "+ - "To start over, run 'd8 cni-switch cleanup'", - activeMigration.Spec.Phase, - ) - } - - return activeMigration, nil - } + // 2. Create the CNIMigration resource migrationName := fmt.Sprintf("cni-migration-%s", time.Now().Format("20060102-150405")) - fmt.Printf("No active migration found. Creating a new one...\n") - newMigration := &v1alpha1.CNIMigration{ ObjectMeta: metav1.ObjectMeta{ Name: migrationName, }, Spec: v1alpha1.CNIMigrationSpec{ TargetCNI: targetCNI, - Phase: "Prepare", }, } - err = rtClient.Create(ctx, newMigration) - if err != nil { + if err := rtClient.Create(ctx, newMigration); err != nil { if errors.IsAlreadyExists(err) { - fmt.Println("Migration object was created by another process. Getting it.") - err = rtClient.Get(ctx, client.ObjectKey{Name: migrationName}, newMigration) - if err != nil { - return nil, fmt.Errorf("getting existing CNIMigration object: %w", err) - } - return newMigration, nil - } - return nil, fmt.Errorf("creating new CNIMigration object: %w", err) - } - - fmt.Printf("Successfully created CNIMigration object '%s'\n", newMigration.Name) - return newMigration, nil -} - -func waitForDaemonSetReady(ctx context.Context, rtClient client.Client, ds *appsv1.DaemonSet) error { - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - key := client.ObjectKey{Name: ds.Name, Namespace: ds.Namespace} - err := rtClient.Get(ctx, key, ds) - if err != nil { - fmt.Printf("\n⚠️ Warning: could not get DaemonSet status: %v\n", err) - continue - } - - // This is the exit condition for the loop. - if ds.Status.DesiredNumberScheduled == ds.Status.NumberReady && ds.Status.NumberUnavailable == 0 { - fmt.Printf( - "\r\033[K Waiting for DaemonSet... %d/%d pods ready\n", - ds.Status.NumberReady, - ds.Status.DesiredNumberScheduled, - ) - return nil - } - - // This is the progress update. - fmt.Printf( - "\r\033[K Waiting for DaemonSet... 
%d/%d pods ready", - ds.Status.NumberReady, - ds.Status.DesiredNumberScheduled, - ) - } - } -} - -func waitForDeploymentReady(ctx context.Context, rtClient client.Client, dep *appsv1.Deployment) error { - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - key := client.ObjectKey{Name: dep.Name, Namespace: dep.Namespace} - err := rtClient.Get(ctx, key, dep) - if err != nil { - fmt.Printf("\n⚠️ Warning: could not get Deployment status: %v\n", err) - continue - } - - // This is the exit condition for the loop. - if dep.Spec.Replicas != nil && dep.Status.ReadyReplicas >= - *dep.Spec.Replicas && dep.Status.UnavailableReplicas == 0 { - fmt.Printf( - "\r\033[K Waiting for Deployment... %d/%d replicas ready\n", - dep.Status.ReadyReplicas, - *dep.Spec.Replicas, - ) - return nil - } - - // This is the progress update. - if dep.Spec.Replicas != nil { - fmt.Printf( - "\r\033[K Waiting for Deployment... %d/%d replicas ready", - dep.Status.ReadyReplicas, - *dep.Spec.Replicas, - ) - } + fmt.Printf("ℹ️ Migration '%s' already exists.\n", migrationName) + } else { + return fmt.Errorf("creating CNIMigration: %w", err) } - } -} - -func waitForNodesPrepared(ctx context.Context, rtClient client.Client) error { - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - nodes := &corev1.NodeList{} - if err := rtClient.List(ctx, nodes); err != nil { - fmt.Printf("⚠️ Warning: could not list nodes: %v\n", err) - continue - } - totalNodes := len(nodes.Items) - - migrations := &v1alpha1.CNINodeMigrationList{} - if err := rtClient.List(ctx, migrations); err != nil { - fmt.Printf("⚠️ Warning: could not list node migrations: %v\n", err) - continue - } - - readyNodes := 0 - for _, migration := range migrations.Items { - for _, cond := range migration.Status.Conditions { - if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { - readyNodes++ - break - } - } - } - - fmt.Printf("\r\033[K Progress: %d/%d nodes prepared...", readyNodes, totalNodes) - - if readyNodes >= totalNodes && totalNodes > 0 { - fmt.Printf("\r\033[K Progress: %d/%d nodes prepared...\n", readyNodes, totalNodes) - return nil - } - } - } -} - -func detectCurrentCNI(rtClient client.Client) (string, error) { - var enabledCNIs []string - for _, cniModule := range CNIModuleConfigs { - mc := &unstructured.Unstructured{} - mc.SetGroupVersionKind(moduleConfigGVK) - - err := rtClient.Get(context.Background(), client.ObjectKey{Name: cniModule}, mc) - if err != nil { - if errors.IsNotFound(err) { - continue - } - return "", fmt.Errorf("getting module config %s: %w", cniModule, err) - } - - enabled, found, err := unstructured.NestedBool(mc.Object, "spec", "enabled") - if err != nil { - return "", fmt.Errorf("parsing 'spec.enabled' for module config %s: %w", cniModule, err) - } - - if found && enabled { - cniName := strings.TrimPrefix(cniModule, "cni-") - enabledCNIs = append(enabledCNIs, cniName) - } - } - - if len(enabledCNIs) == 0 { - return "", fmt.Errorf("no enabled CNI module found. Looked for: %s", strings.Join(CNIModuleConfigs, ", ")) + } else { + fmt.Printf("✅ CNIMigration '%s' created.\n", migrationName) } - if len(enabledCNIs) > 1 { - return "", fmt.Errorf( - "found multiple enabled CNI modules: %s. 
Please disable all but one", - strings.Join(enabledCNIs, ", "), - ) - } + fmt.Printf("\n🎉 Preparation triggered (total elapsed: %s)\n", time.Since(startTime).Round(time.Millisecond)) + fmt.Println("The migration is now being handled automatically by the cluster.") + fmt.Println("You can monitor the status using 'd8 cni-switch switch --watch' or by checking the CNIMigration resource.") - return enabledCNIs[0], nil + return nil } diff --git a/internal/cni/switch.go b/internal/cni/switch.go index 20cc89e5..eb833a83 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -19,41 +19,22 @@ package cni import ( "context" "fmt" - "strings" "time" - admissionregistrationv1 "k8s.io/api/admissionregistration/v1" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" - "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" ) // RunSwitch executes the logic for the 'cni-switch switch' command. +// In the new architecture, it acts as a status watcher. func RunSwitch(timeout time.Duration) error { - // 0. Ask for user confirmation - confirmed, err := AskForConfirmation("switch") - if err != nil { - return fmt.Errorf("asking for confirmation: %w", err) - } - if !confirmed { - fmt.Println("Operation cancelled by user.") - return nil - } - startTime := time.Now() ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() - fmt.Printf("🚀 Starting CNI switch (global timeout: %s)\n", timeout) + fmt.Printf("🚀 Monitoring CNI switch progress (timeout: %s)\n", timeout) // 1. Create a Kubernetes client safeClient, err := saferequest.NewSafeClient() @@ -65,777 +46,73 @@ func RunSwitch(timeout time.Duration) error { if err != nil { return fmt.Errorf("creating runtime client: %w", err) } - fmt.Printf("✅ Kubernetes client created (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 2. Find the active migration - activeMigration, err := FindActiveMigration(ctx, rtClient) - if err != nil { - return fmt.Errorf("failed to find active migration: %w", err) - } - - if activeMigration == nil { - return fmt.Errorf("no active CNI migration found. Please run 'd8 cni-switch prepare' first") - } - - // Check if the switch is already completed successfully. - for _, cond := range activeMigration.Status.Conditions { - if cond.Type == "Succeeded" && cond.Status == metav1.ConditionTrue { - fmt.Printf("🎉 CNI switch to '%s' is already completed successfully.\n", activeMigration.Spec.TargetCNI) - fmt.Println("\nYou can run 'd8 cni-switch cleanup' to remove auxiliary resources.") - return nil - } - } - - // 3. Check if the preparation step was completed successfully - isPrepared := false - for _, cond := range activeMigration.Status.Conditions { - if cond.Type == "PreparationSucceeded" && cond.Status == metav1.ConditionTrue { - isPrepared = true - break - } - } - - if !isPrepared { - return fmt.Errorf("cluster is not ready for switching. 
" + - "Please ensure the 'prepare' command completed successfully") - } - - // Verify all resources exist - fmt.Println("Verifying all created resources...") - if err = verifyResourcesExist(ctx, rtClient); err != nil { - return fmt.Errorf("verifying resources: %w", err) - } - fmt.Println("- Verified. All necessary resources are present in the cluster") - - fmt.Printf( - "✅ Working with prepared migration '%s' (total elapsed: %s)\n\n", - activeMigration.Name, - time.Since(startTime).Round(time.Millisecond), - ) - - // 4. Enable target CNI - currentCNI := activeMigration.Status.CurrentCNI - targetCNI := activeMigration.Spec.TargetCNI - - // Check if we already allowed the target CNI to start - targetCNIAllowedToStart := false - for _, cond := range activeMigration.Status.Conditions { - if cond.Type == "NodeCleanupSucceeded" && cond.Status == metav1.ConditionTrue { - targetCNIAllowedToStart = true - break - } - } - - fmt.Printf("Enabling target CNI module 'cni-%s'...\n", targetCNI) - if err = toggleModule(ctx, rtClient, "cni-"+strings.ToLower(targetCNI), true); err != nil { - return fmt.Errorf("enabling module '%s': %w", targetCNI, err) - } - - var dsName string - if !targetCNIAllowedToStart { - // Wait for target CNI pods to start initializing - targetModuleName := "cni-" + strings.ToLower(targetCNI) - dsName, err = getDaemonSetNameForCNI(targetModuleName) - if err != nil { - return fmt.Errorf("getting daemonset name for target CNI: %w", err) - } - if err = waitForModulePodsInitializing(ctx, rtClient, targetModuleName, dsName); err != nil { - return fmt.Errorf("waiting for target CNI pods to initialize: %w", err) - } - fmt.Printf("✅ CNI module 'cni-%s' enabled and pods initialized (total elapsed: %s)\n\n", - targetCNI, - time.Since(startTime).Round(time.Millisecond)) - - // 5. Disable current CNI - fmt.Printf("Disabling current CNI module 'cni-%s'...\n", currentCNI) - if err = toggleModule(ctx, rtClient, "cni-"+strings.ToLower(currentCNI), false); err != nil { - return fmt.Errorf("disabling module '%s': %w", currentCNI, err) - } - if err = waitForModule(ctx, rtClient, "cni-"+strings.ToLower(currentCNI), false); err != nil { - return fmt.Errorf("waiting for module '%s' to be disabled: %w", currentCNI, err) - } - - var dsNameCurrent string - dsNameCurrent, err = getDaemonSetNameForCNI("cni-" + strings.ToLower(currentCNI)) - if err != nil { - return fmt.Errorf("getting daemonset name for current CNI: %w", err) - } - if err = waitForModulePodsTermination( - ctx, rtClient, "cni-"+strings.ToLower(currentCNI), dsNameCurrent, - ); err != nil { - return fmt.Errorf("waiting for current CNI pods to terminate: %w", err) - } - if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ - Type: "OldCNIDisabled", - Status: metav1.ConditionTrue, - Reason: "ModuleDisabled", - Message: fmt.Sprintf("Module 'cni-%s' was successfully disabled.", currentCNI), - }); err != nil { - return fmt.Errorf("updating CNIMigration status: %w", err) - } - fmt.Printf("✅ CNI module 'cni-%s' disabled (total elapsed: %s)\n\n", - currentCNI, - time.Since(startTime).Round(time.Millisecond)) - - // 6. 
Update phase to Migrate (Triggers cleanup on nodes) - fmt.Println("Updating CNIMigration phase to 'Migrate' to trigger node cleanup...") - if err = updateCNIMigrationPhase(ctx, rtClient, activeMigration.Name, "Migrate"); err != nil { - return fmt.Errorf("updating CNIMigration phase: %w", err) - } - fmt.Printf("✅ CNIMigration phase updated (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 7. Wait for nodes to be cleaned up - fmt.Println("Waiting for nodes to be cleaned up by cni-switch-helper...") - if err = waitForNodeConditions(ctx, rtClient, activeMigration, "CleanupSucceeded"); err != nil { - return fmt.Errorf("waiting for node cleanup: %w", err) - } - fmt.Printf("✅ All nodes cleaned up (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 8. This status update is CRITICAL. It unblocks the target CNI's init-container. - fmt.Println("Signaling 'NodeCleanupSucceeded' to proceed with pod restart...") - if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ - Type: "NodeCleanupSucceeded", - Status: metav1.ConditionTrue, - Reason: "AllNodesCleanedUp", - Message: "All nodes have been successfully cleaned up from old CNI artifacts.", - }); err != nil { - return fmt.Errorf("updating CNIMigration status: %w", err) - } - fmt.Printf("✅ CNIMigration status updated (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - } else { - fmt.Printf("ℹ️ Skipping wait for init, disable old CNI, cleanup as NodeCleanupSucceeded is already True.\n\n") - } - - // 9. Wait for target CNI to be Ready - // Now that NodeCleanupSucceeded is True, the target CNI pods should unblock and become Ready - targetModuleName := "cni-" + strings.ToLower(targetCNI) - fmt.Printf("Waiting for pods of module '%s' to become Ready...\n", targetModuleName) - dsName, err = getDaemonSetNameForCNI(targetModuleName) - if err != nil { - return fmt.Errorf("getting daemonset name for target CNI: %w", err) - } - if err = waitForModulePodsReady(ctx, rtClient, targetModuleName, dsName); err != nil { - return fmt.Errorf("waiting for target CNI pods to be ready: %w", err) - } - fmt.Printf("✅ CNI module 'cni-%s' is now Ready (total elapsed: %s)\n\n", - targetCNI, - time.Since(startTime).Round(time.Millisecond)) - - // 10. Delete Mutating Webhook - fmt.Println("Deleting Mutating Webhook...") - if err = deleteMutatingWebhook(ctx, rtClient); err != nil { - return fmt.Errorf("deleting mutating webhook: %w", err) - } - fmt.Printf("\n✅ Mutating webhook deleted (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 11. Signal 'NewCNIEnabled' - fmt.Println("Signaling 'NewCNIEnabled' to proceed with pod restart...") - if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ - Type: "NewCNIEnabled", - Status: metav1.ConditionTrue, - Reason: "ModuleEnabled", - Message: fmt.Sprintf("Module 'cni-%s' was successfully enabled.", targetCNI), - }); err != nil { - return fmt.Errorf("updating CNIMigration status: %w", err) - } - fmt.Printf("✅ CNIMigration status updated (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - - // 12. Wait for pods to be restarted - fmt.Println("Waiting for pods to be restarted on all nodes...") - if err = waitForNodeConditions(ctx, rtClient, activeMigration, "PodsRestarted"); err != nil { - fmt.Printf("⚠️ Warning: Timed out waiting for pods to restart: %v\n", err) - fmt.Println("Please check the cluster status manually. 
The CNI switch is otherwise complete.") - } else { - fmt.Printf("✅ All pods restarted (total elapsed: %s)\n\n", - time.Since(startTime).Round(time.Millisecond)) - } - - // 13. Finalize migration - fmt.Println("Finalizing migration...") - if err = updateCNIMigrationStatus(ctx, rtClient, activeMigration.Name, metav1.Condition{ - Type: "Succeeded", - Status: metav1.ConditionTrue, - LastTransitionTime: metav1.Now(), - Reason: "MigrationComplete", - Message: "CNI migration completed successfully.", - }); err != nil { - fmt.Printf("\n ⚠️ Warning: Failed to update status (Succeeded): %v. Proceeding...\n", err) - } - - fmt.Printf( - "🎉 CNI switch to '%s' completed successfully! (total time: %s)\n", - targetCNI, - time.Since(startTime).Round(time.Second), - ) - fmt.Println("\nYou can now run 'd8 cni-switch cleanup' to remove auxiliary resources.") - - return nil -} - -func toggleModule(ctx context.Context, cl client.Client, moduleName string, toggle bool) error { ticker := time.NewTicker(2 * time.Second) defer ticker.Stop() - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - mc := &unstructured.Unstructured{} - mc.SetGroupVersionKind(schema.GroupVersionKind{ - Group: "deckhouse.io", - Version: "v1alpha1", - Kind: "ModuleConfig", - }) - - err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, mc) - if err != nil { - fmt.Printf("\r\033[K ⚠️ Error getting module config '%s': %v. Retrying...", moduleName, err) - continue - } - - spec, found, err := unstructured.NestedMap(mc.Object, "spec") - if err != nil { - fmt.Printf("\r\033[K ⚠️ Error getting spec from module config '%s': %v. Retrying...", moduleName, err) - continue - } - if !found { - spec = make(map[string]any) - } - - // Skip update if value is already correct - if currentToggle, ok := spec["enabled"].(bool); ok && currentToggle == toggle { - return nil - } - - spec["enabled"] = toggle - - if err := unstructured.SetNestedMap(mc.Object, spec, "spec"); err != nil { - fmt.Printf("\r\033[K ⚠️ Error setting spec for module config '%s': %v. Retrying...", moduleName, err) - continue - } - - if err := cl.Update(ctx, mc); err != nil { - fmt.Printf("\r\033[K ⚠️ Error updating module config '%s': %v. Retrying...", moduleName, err) - continue - } - return nil - } - } -} - -func waitForModule(ctx context.Context, cl client.Client, moduleName string, shouldBeReady bool) error { - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - module := &unstructured.Unstructured{} - module.SetGroupVersionKind(schema.GroupVersionKind{ - Group: "deckhouse.io", - Version: "v1alpha1", - Kind: "Module", - }) - - err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, module) - - if shouldBeReady { - if err != nil { - if errors.IsNotFound(err) { - fmt.Printf("\r\033[K Waiting for module '%s': not found yet...", moduleName) - } else { - fmt.Printf("\r\033[K ⚠️ Error getting module '%s': %v. Retrying...", moduleName, err) - } - continue - } - - state, found, err := unstructured.NestedString(module.Object, "status", "phase") - if err != nil || !found { - fmt.Printf("\r\033[K Waiting for module '%s': status.phase field not found. 
Retrying...", - moduleName) - continue - } - - if state == "Ready" { - fmt.Printf("Module '%s' is Ready.", moduleName) - fmt.Println() - return nil - } - fmt.Printf("\r\033[K Waiting for module '%s' to be Ready, current state: %s", moduleName, state) - } else { - err := cl.Get(ctx, types.NamespacedName{Name: moduleName}, module) - if err != nil { - if errors.IsNotFound(err) { - fmt.Printf("\r\033[K Module '%s' is not found, assuming disabled.", moduleName) - fmt.Println() - return nil - } - fmt.Printf("\r\033[K ⚠️ Error getting module '%s': %v. Retrying...", moduleName, err) - continue - } - - // Check conditions to see if it's disabled - conditions, found, err := unstructured.NestedSlice(module.Object, "status", "conditions") - if err != nil || !found { - fmt.Printf("\r\033[K Waiting for module '%s' status conditions. Retrying...", moduleName) - continue - } - - isReadyFound := false - for _, c := range conditions { - condition, ok := c.(map[string]any) - if !ok { - continue - } - - condType, found, err := unstructured.NestedString(condition, "type") - if err != nil || !found { - continue - } - - if condType == "IsReady" { - isReadyFound = true - condStatus, _, _ := unstructured.NestedString(condition, "status") - if condStatus == "False" { - fmt.Printf("\r\033[K- Module '%s' is disabled (IsReady=False).\n", moduleName) - return nil - } - } - } - - if !isReadyFound { - fmt.Printf("\r\033[K Waiting for module '%s' to be disabled, 'IsReady' condition not found...", - moduleName) - } else { - fmt.Printf("\r\033[K Waiting for module '%s' to be disabled (IsReady=False)...", moduleName) - } - } - } - } -} - -func updateCNIMigrationStatus( - ctx context.Context, cl client.Client, migrationName string, newCondition metav1.Condition, -) error { - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() + var lastCondition string for { select { case <-ctx.Done(): return ctx.Err() case <-ticker.C: - migration := &v1alpha1.CNIMigration{} - err := cl.Get(ctx, types.NamespacedName{Name: migrationName}, migration) + activeMigration, err := FindActiveMigration(ctx, rtClient) if err != nil { - fmt.Printf("\r\033[K ⚠️ Error getting CNIMigration '%s': %v. Retrying...", migrationName, err) + fmt.Printf("\r\033[K⚠️ Error finding active migration: %v", err) continue } - patchedMigration := migration.DeepCopy() - newCondition.LastTransitionTime = metav1.Now() - - found := false - for i, cond := range patchedMigration.Status.Conditions { - if cond.Type == newCondition.Type { - // Check if update is needed - if cond.Status == newCondition.Status && cond.Reason == newCondition.Reason && - cond.Message == newCondition.Message { - return nil // Already up to date - } - patchedMigration.Status.Conditions[i] = newCondition - found = true - break - } - } - - if !found { - patchedMigration.Status.Conditions = append(patchedMigration.Status.Conditions, newCondition) - } - - err = cl.Status().Patch(ctx, patchedMigration, client.MergeFrom(migration)) - if err == nil { + if activeMigration == nil { + fmt.Println("\r\033[Kℹ️ No active migration found.") return nil } - if !errors.IsConflict(err) { - fmt.Printf("\r\033[K ⚠️ Error patching CNIMigration status: %v. 
Retrying...", err) + // Report current status from conditions + currentCond := getLatestCondition(activeMigration) + if currentCond != lastCondition { + fmt.Printf("\r\033[K[%s] %s: %s\n", + time.Now().Format("15:04:05"), + currentCond, + activeMigration.Name) + lastCondition = currentCond } - } - } -} - -func waitForNodeConditions( - ctx context.Context, cl client.Client, cniMigration *v1alpha1.CNIMigration, conditionType string, -) error { - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - reportedFailures := make(map[string]struct{}) - - // Helper to update status stats - updateStats := func(total, succeeded, failed int) { - // Fetch fresh object to avoid conflicts - freshMigration := &v1alpha1.CNIMigration{} - if err := cl.Get(ctx, types.NamespacedName{Name: cniMigration.Name}, freshMigration); err != nil { - return // Ignore error, will retry next tick - } - - if freshMigration.Status.NodesTotal == total && - freshMigration.Status.NodesSucceeded == succeeded && - freshMigration.Status.NodesFailed == failed { - return - } - - patched := freshMigration.DeepCopy() - patched.Status.NodesTotal = total - patched.Status.NodesSucceeded = succeeded - patched.Status.NodesFailed = failed - - _ = cl.Status().Patch(ctx, patched, client.MergeFrom(freshMigration)) - } - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - // Get current total nodes - nodeList := &corev1.NodeList{} - if err := cl.List(ctx, nodeList); err != nil { - fmt.Printf("\r\033[K ⚠️ Error listing nodes: %v. Retrying...", err) - continue - } - totalNodes := len(nodeList.Items) - - // Get current node migrations - nodeMigrations := &v1alpha1.CNINodeMigrationList{} - if err := cl.List(ctx, nodeMigrations); err != nil { - fmt.Printf("\r\033[K ⚠️ Error listing CNINodeMigration resources: %v. Retrying...", err) - continue - } - - succeededNodes := 0 - failedNodes := 0 - - for _, nm := range nodeMigrations.Items { - if !metav1.IsControlledBy(&nm, cniMigration) { - continue - } - - // Check specific condition success - isSucceeded := false - for _, cond := range nm.Status.Conditions { - if cond.Type == conditionType && cond.Status == metav1.ConditionTrue { - succeededNodes++ - isSucceeded = true - - // Check if it was previously failed - if _, wasFailed := reportedFailures[nm.Name]; wasFailed { - fmt.Printf("\r\033[K ✅ Node '%s' has recovered.\n", nm.Name) - delete(reportedFailures, nm.Name) - } - break - } - } - - if isSucceeded { - continue - } - - // Check failure - for _, cond := range nm.Status.Conditions { - if cond.Type == "Failed" && cond.Status == metav1.ConditionTrue { - failedNodes++ - if _, reported := reportedFailures[nm.Name]; !reported { - // Clear line, print error, then let the progress bar overwrite next - fmt.Printf("\r\033[K ❌ Node '%s' failed: %s\n", nm.Name, cond.Message) - reportedFailures[nm.Name] = struct{}{} - } - break - } + // Report node progress if available + if activeMigration.Status.NodesTotal > 0 { + fmt.Printf("\r\033[K Nodes: %d/%d succeeded", + activeMigration.Status.NodesSucceeded, + activeMigration.Status.NodesTotal) + if activeMigration.Status.NodesFailed > 0 { + fmt.Printf(", %d failed", activeMigration.Status.NodesFailed) } } - // Update status in CNIMigration - go updateStats(totalNodes, succeededNodes, failedNodes) - - // Output progress - fmt.Printf("\r\033[K Progress: %d/%d nodes completed, %d failed. ", - succeededNodes, totalNodes, failedNodes) - - // 5. 
Check exit condition - if succeededNodes >= totalNodes { - fmt.Println() - return nil - } - } - } -} - -func deleteMutatingWebhook(ctx context.Context, cl client.Client) error { - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - fmt.Printf(" Deleting mutating webhook '%s'...", webhookConfigName) - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - webhook := &admissionregistrationv1.MutatingWebhookConfiguration{ - ObjectMeta: metav1.ObjectMeta{ - Name: webhookConfigName, - }, - } - if err := cl.Delete(ctx, webhook); err != nil { - if errors.IsNotFound(err) { - fmt.Println("\r\033[K Mutating webhook not found, assuming already deleted.") + // Check for completion + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "Succeeded" && cond.Status == metav1.ConditionTrue { + fmt.Printf("\n\n🎉 CNI switch to '%s' completed successfully! (total time: %s)\n", + activeMigration.Spec.TargetCNI, + time.Since(startTime).Round(time.Second)) return nil } - fmt.Printf("\r\033[K ⚠️ Error deleting mutating webhook: %v. Retrying...", err) - continue - } - return nil - } - } -} - -// getDaemonSetNameForCNI returns the name of the main DaemonSet for a given CNI module. -func getDaemonSetNameForCNI(cniModule string) (string, error) { - switch cniModule { - case "cni-cilium": - return "agent", nil - case "cni-flannel": - return "flannel", nil - case "cni-simple-bridge": - return "simple-bridge", nil - default: - return "", fmt.Errorf("unknown CNI module: %s", cniModule) - } -} - -// waitForModulePodsInitializing waits for all pods of a module's daemonset to be in the 'Initializing' state, -// specifically waiting in the init container. -func waitForModulePodsInitializing(ctx context.Context, cl client.Client, moduleName string, dsName string) error { - fmt.Printf(" Waiting for pods of module '%s' to enter 'Initializing' state...", moduleName) - - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - // Get the DaemonSet - ds := &appsv1.DaemonSet{} - err := cl.Get(ctx, types.NamespacedName{Name: dsName, Namespace: "d8-" + moduleName}, ds) - if err != nil { - if errors.IsNotFound(err) { - fmt.Printf("\r\033[K Waiting for DaemonSet '%s' in namespace 'd8-%s': not found...", - dsName, moduleName) - continue - } - fmt.Printf("\r\033[K Error getting DaemonSet '%s' in namespace 'd8-%s': %v. Retrying...", - dsName, moduleName, err) - continue - } - - // Check if pods are scheduled - if ds.Status.DesiredNumberScheduled == 0 { - fmt.Printf("\r\033[K Waiting for DaemonSet '%s' to schedule pods...", dsName) - continue - } - - // List pods - podList := &corev1.PodList{} - opts := []client.ListOption{ - client.InNamespace("d8-" + moduleName), - client.MatchingLabels(ds.Spec.Selector.MatchLabels), - } - if err := cl.List(ctx, podList, opts...); err != nil { - fmt.Printf("\r\033[K Error listing pods for module '%s': %v. 
Retrying...", moduleName, err) - continue - } - - if len(podList.Items) == 0 { - fmt.Printf("\r\033[K Waiting for pods of DaemonSet '%s' to be created...", dsName) - continue - } - - initializingPods := 0 - for _, pod := range podList.Items { - // Check pod status - if pod.Status.Phase == corev1.PodPending || pod.Status.Phase == corev1.PodRunning { - for _, initStatus := range pod.Status.InitContainerStatuses { - if initStatus.Name == "cni-switch-init-checker" && - (initStatus.State.Waiting != nil || initStatus.State.Running != nil) { - // Pod is waiting in or running our init-container - initializingPods++ - break - } - } - } - } - - fmt.Printf("\r\033[K Progress: %d/%d pods are in 'Initializing' state.", - initializingPods, ds.Status.DesiredNumberScheduled) - - if int32(initializingPods) >= ds.Status.DesiredNumberScheduled { - fmt.Println("\n- All pods for target CNI are correctly waiting in the init-container.") - return nil - } - } - } -} - -// waitForModulePodsReady waits for all pods of a module's daemonset to be in the 'Ready' state. -func waitForModulePodsReady(ctx context.Context, cl client.Client, moduleName string, dsName string) error { - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - // Get the DaemonSet - ds := &appsv1.DaemonSet{} - err := cl.Get(ctx, types.NamespacedName{Name: dsName, Namespace: "d8-" + moduleName}, ds) - if err != nil { - fmt.Printf("\r\033[K Error getting DaemonSet '%s': %v. Retrying...", dsName, err) - continue - } - - if ds.Status.DesiredNumberScheduled == 0 { - fmt.Printf("\r\033[K Waiting for DaemonSet '%s' to schedule pods...", dsName) - continue - } - - fmt.Printf("\r\033[K Progress: %d/%d pods are Ready.", - ds.Status.NumberReady, ds.Status.DesiredNumberScheduled) - - if ds.Status.NumberReady >= ds.Status.DesiredNumberScheduled { - fmt.Println("\n- All pods for target CNI are Ready.") - return nil - } - } - } -} - -// waitForModulePodsTermination waits for all pods of a module's daemonset to be terminated. -func waitForModulePodsTermination(ctx context.Context, cl client.Client, moduleName string, dsName string) error { - fmt.Printf(" Waiting for pods of module '%s' to terminate...", moduleName) - - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - // List pods with label app= in namespace d8- - podList := &corev1.PodList{} - opts := []client.ListOption{ - client.InNamespace("d8-" + moduleName), - client.MatchingLabels(map[string]string{"app": dsName}), } - if err := cl.List(ctx, podList, opts...); err != nil { - fmt.Printf("\r\033[K Error listing pods for module '%s': %v. Retrying...", moduleName, err) - continue - } - - if len(podList.Items) == 0 { - fmt.Println("\n- All pods for disabled CNI module are terminated.") - return nil - } - - fmt.Printf("\r\033[K Waiting for %d pods of module '%s' to terminate...", len(podList.Items), moduleName) - } - } -} - -// verifyResourcesExist checks if all expected resources are present in the cluster. 
-func verifyResourcesExist(ctx context.Context, rtClient client.Client) error { - resources := []struct { - kind string - name string - namespace string - obj client.Object - }{ - {"Namespace", cniSwitchNamespace, "", &corev1.Namespace{}}, - {"ServiceAccount", switchHelperServiceAccountName, cniSwitchNamespace, &corev1.ServiceAccount{}}, - {"ServiceAccount", webhookServiceAccountName, cniSwitchNamespace, &corev1.ServiceAccount{}}, - {"ClusterRole", switchHelperClusterRoleName, "", &rbacv1.ClusterRole{}}, - {"ClusterRole", webhookClusterRoleName, "", &rbacv1.ClusterRole{}}, - {"ClusterRoleBinding", switchHelperClusterRoleBindingName, "", &rbacv1.ClusterRoleBinding{}}, - {"ClusterRoleBinding", webhookClusterRoleBindingName, "", &rbacv1.ClusterRoleBinding{}}, - {"Secret", webhookSecretName, cniSwitchNamespace, &corev1.Secret{}}, - {"Service", webhookServiceName, cniSwitchNamespace, &corev1.Service{}}, - {"MutatingWebhookConfiguration", webhookConfigName, "", &admissionregistrationv1.MutatingWebhookConfiguration{}}, - {"DaemonSet", switchHelperDaemonSetName, cniSwitchNamespace, &appsv1.DaemonSet{}}, - {"Deployment", webhookDeploymentName, cniSwitchNamespace, &appsv1.Deployment{}}, - } - - for _, r := range resources { - key := client.ObjectKey{Name: r.name, Namespace: r.namespace} - if err := rtClient.Get(ctx, key, r.obj); err != nil { - return fmt.Errorf("resource %s '%s' not found: %w", r.kind, r.name, err) } } - return nil } -func updateCNIMigrationPhase(ctx context.Context, cl client.Client, migrationName string, phase string) error { - update := func() error { - migration := &v1alpha1.CNIMigration{} - if err := cl.Get(ctx, types.NamespacedName{Name: migrationName}, migration); err != nil { - return fmt.Errorf("getting CNIMigration: %w", err) - } - - if migration.Spec.Phase == phase { - return nil - } - - patchedMigration := migration.DeepCopy() - patchedMigration.Spec.Phase = phase - if err := cl.Patch(ctx, patchedMigration, client.MergeFrom(migration)); err != nil { - return fmt.Errorf("patching CNIMigration: %w", err) - } - return nil +func getLatestCondition(m *v1alpha1.CNIMigration) string { + if len(m.Status.Conditions) == 0 { + return "Pending" } - - err := update() - if err == nil { - return nil - } - fmt.Printf("\r\033[K ⚠️ %v. Retrying...", err) - - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - if err := update(); err != nil { - fmt.Printf("\r\033[K ⚠️ %v. Retrying...", err) - continue + // Return the latest True condition + latest := m.Status.Conditions[0] + for _, c := range m.Status.Conditions { + if c.Status == metav1.ConditionTrue { + if c.LastTransitionTime.After(latest.LastTransitionTime.Time) { + latest = c } - return nil } } + return latest.Type } From 11a36c2c870d784e0ed827f0d97e14590eae07fc Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Mon, 19 Jan 2026 22:47:09 +0400 Subject: [PATCH 14/16] Refactor CNI switch command logic The CNI switch command has been refactored to leverage an in-cluster controller for automated migration. The CLI tool's role is now to trigger the migration and monitor its status. This commit: - Updates the `cni-switch` command description to reflect the new automated workflow. - Removes the `prepare` command, as its functionality is now integrated into the `switch` command. - Introduces a `watch` command to monitor the migration progress. - Simplifies the `cleanup` and `rollback` commands to align with the new process. 
- Removes unused internal CNI resources and preparation logic. - Introduces basic RBAC for CNI migration controller. - Adds basic `CNIMigration` status reporting and error handling for the watch command. --- cmd/commands/cni.go | 65 +++-- cmd/commands/kubectl.go | 8 +- internal/cni/common.go | 45 ---- internal/cni/prepare.go | 86 ------ internal/cni/resources.go | 545 -------------------------------------- internal/cni/switch.go | 101 +++---- internal/cni/watch.go | 118 +++++++++ 7 files changed, 195 insertions(+), 773 deletions(-) delete mode 100644 internal/cni/prepare.go delete mode 100644 internal/cni/resources.go create mode 100644 internal/cni/watch.go diff --git a/cmd/commands/cni.go b/cmd/commands/cni.go index 8d1ac8ab..a4e884f5 100644 --- a/cmd/commands/cni.go +++ b/cmd/commands/cni.go @@ -34,28 +34,35 @@ var ( cniSwitchLong = templates.LongDesc(` A group of commands to switch the CNI (Container Network Interface) provider in the Deckhouse cluster. -This process is divided into several steps: +The migration process is handled automatically by an in-cluster controller. +This CLI tool is used to trigger the migration and monitor its status. - - 'd8 cni-switch prepare' - STEP 1. Prepares the cluster for CNI migration. - - 'd8 cni-switch switch' - STEP 2. Performs the actual CNI switch. - - 'd8 cni-switch cleanup' - STEP 3. Cleans up resources if the switch is aborted. - - 'd8 cni-switch rollback' - (Optional) Rollback CNI if the switch is aborted.`) +Workflow: + 1. 'd8 cni-switch switch --to-cni ' - Initiates the migration. + This creates a CNIMigration resource, which triggers the deployment of the migration agent. + The agent then performs all necessary steps (validation, node checks, CNI switching). - cniPrepareExample = templates.Examples(` - # Prepare to switch to Cilium CNI - d8 cni-switch prepare --to-cni cilium`) + 2. 'd8 cni-switch watch' - (Optional) Monitors the progress of the migration. + Since the process is automated, this command simply watches the status. + + 3. 'd8 cni-switch cleanup' - Cleans up the migration resources after completion. +`) cniSwitchExample = templates.Examples(` - # Perform the CNI switch after the prepare step is complete - d8 cni-switch switch`) + # Start the migration to Cilium CNI + d8 cni-switch switch --to-cni cilium`) + + cniWatchExample = templates.Examples(` + # Monitor the ongoing migration + d8 cni-switch watch`) cniCleanupExample = templates.Examples(` - # Cleanup resources created by the 'prepare' command + # Cleanup resources created by the 'switch' command d8 cni-switch cleanup`) cniRollbackExample = templates.Examples(` - # Rollback changes, restore previous CNI, cleanup resources created by the 'prepare' command - d8 cni-switch rollback`) + # Rollback changes, restore previous CNI, cleanup resources created by the 'switch' command + d8 cni-switch rollback`) // TODO(tden): Remove functionality, delete internal/cni/rollback.go? 
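
A small aside on the --to-cni validation that follows a few lines below in this file: the PreRunE loop compares strings.ToLower(targetCNI) against supportedCNIs but then hands the raw flag value to cni.RunSwitch. The sketch below shows a variant that normalizes once and validates with the standard library's slices.Contains; it is illustrative only and not part of the patch, it assumes Go 1.21+ for the slices package, the helper name validateTargetCNI is hypothetical, and whether the in-cluster controller expects a lower-case CNI name is an assumption.

// Illustrative sketch, not part of the patch (imports: fmt, strings, and slices from Go 1.21+).
// supportedCNIs is the package-level list defined just below in this file.
func validateTargetCNI(raw string) (string, error) {
	normalized := strings.ToLower(raw)
	if !slices.Contains(supportedCNIs, normalized) {
		return "", fmt.Errorf(
			"invalid --to-cni value %q. Supported values are: %s",
			raw, strings.Join(supportedCNIs, ", "),
		)
	}
	return normalized, nil
}

PreRunE could then store the normalized value back with cmd.Flags().Set("to-cni", normalized) so that Run passes the same string to cni.RunSwitch.
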
supportedCNIs = []string{"cilium", "flannel", "simple-bridge"} ) @@ -70,18 +77,18 @@ func NewCniSwitchCommand() *cobra.Command { Long: cniSwitchLong, } cmd.PersistentFlags().Duration("timeout", 1*time.Hour, "The timeout for the entire operation (e.g., 40m, 1h)") - cmd.AddCommand(NewCmdCniPrepare()) cmd.AddCommand(NewCmdCniSwitch()) + cmd.AddCommand(NewCmdCniWatch()) cmd.AddCommand(NewCmdCniCleanup()) cmd.AddCommand(NewCmdCniRollback()) return cmd } -func NewCmdCniPrepare() *cobra.Command { +func NewCmdCniSwitch() *cobra.Command { cmd := &cobra.Command{ - Use: "prepare", - Short: "Prepares the cluster for CNI switching", - Example: cniPrepareExample, + Use: "switch", + Short: "Initiates the CNI switching", + Example: cniSwitchExample, PreRunE: func(cmd *cobra.Command, _ []string) error { targetCNI, _ := cmd.Flags().GetString("to-cni") for _, supported := range supportedCNIs { @@ -99,8 +106,14 @@ func NewCmdCniPrepare() *cobra.Command { Run: func(cmd *cobra.Command, _ []string) { targetCNI, _ := cmd.Flags().GetString("to-cni") timeout, _ := cmd.Flags().GetDuration("timeout") - if err := cni.RunPrepare(targetCNI, timeout); err != nil { - log.Fatalf("❌ Error running prepare command: %v", err) + + if err := cni.RunSwitch(targetCNI, timeout); err != nil { + log.Fatalf("❌ Error running switch command: %v", err) + } + + fmt.Println() + if err := cni.RunWatch(timeout); err != nil { + log.Fatalf("❌ Error monitoring switch progress: %v", err) } }, } @@ -113,15 +126,15 @@ func NewCmdCniPrepare() *cobra.Command { return cmd } -func NewCmdCniSwitch() *cobra.Command { +func NewCmdCniWatch() *cobra.Command { cmd := &cobra.Command{ - Use: "switch", - Short: "Performs the CNI switching", - Example: cniSwitchExample, + Use: "watch", + Short: "Monitors the CNI switching progress", + Example: cniWatchExample, Run: func(cmd *cobra.Command, _ []string) { timeout, _ := cmd.Flags().GetDuration("timeout") - if err := cni.RunSwitch(timeout); err != nil { - log.Fatalf("❌ Error running switch command: %v", err) + if err := cni.RunWatch(timeout); err != nil { + log.Fatalf("❌ Error running watch command: %v", err) } }, } diff --git a/cmd/commands/kubectl.go b/cmd/commands/kubectl.go index 5f14c148..d029fdc0 100644 --- a/cmd/commands/kubectl.go +++ b/cmd/commands/kubectl.go @@ -32,12 +32,12 @@ import ( cliflag "k8s.io/component-base/cli/flag" "k8s.io/component-base/logs" kubecmd "k8s.io/kubectl/pkg/cmd" - - "github.com/deckhouse/deckhouse-cli/internal/cni" ) const ( - cmImageKey = "debug-container-image" + cmNamespace = "d8-system" + cmName = "debug-container" + cmImageKey = "image" ) var d8CommandRegex = regexp.MustCompile("([\"'`])d8 (\\w+)") @@ -112,7 +112,7 @@ func getDebugImage(cmd *cobra.Command) (string, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - configMap, err := kubeCl.CoreV1().ConfigMaps(cni.CMDataNameSpace).Get(ctx, cni.CMDataName, v1.GetOptions{}) + configMap, err := kubeCl.CoreV1().ConfigMaps(cmNamespace).Get(ctx, cmName, v1.GetOptions{}) if err != nil { return "", ErrGenericImageFetch } diff --git a/internal/cni/common.go b/internal/cni/common.go index a018f8d8..6b43bc89 100644 --- a/internal/cni/common.go +++ b/internal/cni/common.go @@ -23,56 +23,11 @@ import ( "os" "strings" - "k8s.io/apimachinery/pkg/runtime/schema" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" ) -const ( - // Image ConfigMap from Deckhouse - CMDataName = "d8-cli-data" - CMDataNameSpace = "d8-system" - cmDataHelperImage = 
"cni-switch-helper-image" - - // Namespace where the helper and webhook are deployed - cniSwitchNamespace = "cni-switch" - - // Service Account Names - switchHelperServiceAccountName = "cni-switch-helper-sa" - webhookServiceAccountName = "cni-switch-webhook-sa" - - // DaemonSet Names - switchHelperDaemonSetName = "cni-switch-helper" - - // Cluster Role and Binding Names - switchHelperClusterRoleName = "d8:cni-switch-helper" - webhookClusterRoleName = "d8:cni-switch-webhook" - switchHelperClusterRoleBindingName = "d8:cni-switch-helper" - webhookClusterRoleBindingName = "d8:cni-switch-webhook" - - // Webhook Resources - webhookDeploymentName = "cni-switch-webhook" - webhookServiceName = "cni-switch-webhook-service" - webhookSecretName = "cni-switch-webhook-tls" - webhookConfigName = "cni-switch-pod-annotator" - webhookConfigurationName = "annotator.cni-switch.deckhouse.io" - webhookPort = 42443 - - // Annotations - EffectiveCNIAnnotation = "effective-cni.network.deckhouse.io" -) - -var ( - CNIModuleConfigs = []string{"cni-cilium", "cni-flannel", "cni-simple-bridge"} - - moduleConfigGVK = schema.GroupVersionKind{ - Group: "deckhouse.io", - Version: "v1alpha1", - Kind: "ModuleConfig", - } -) - // AskForConfirmation displays a warning and prompts the user for confirmation. func AskForConfirmation(commandName string) (bool, error) { reader := bufio.NewReader(os.Stdin) diff --git a/internal/cni/prepare.go b/internal/cni/prepare.go deleted file mode 100644 index 268e0752..00000000 --- a/internal/cni/prepare.go +++ /dev/null @@ -1,86 +0,0 @@ -/* -Copyright 2025 Flant JSC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package cni - -import ( - "context" - "fmt" - "time" - - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" - saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" -) - -// RunPrepare executes the logic for the 'cni-switch prepare' command. -func RunPrepare(targetCNI string, timeout time.Duration) error { - // 0. Ask for user confirmation - confirmed, err := AskForConfirmation("prepare") - if err != nil { - return fmt.Errorf("asking for confirmation: %w", err) - } - if !confirmed { - fmt.Println("Operation cancelled by user.") - return nil - } - - startTime := time.Now() - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - - fmt.Printf("🚀 Starting CNI switch preparation for target '%s'\n", targetCNI) - - // 1. Create a Kubernetes client - safeClient, err := saferequest.NewSafeClient() - if err != nil { - return fmt.Errorf("creating safe client: %w", err) - } - - rtClient, err := safeClient.NewRTClient(v1alpha1.AddToScheme) - if err != nil { - return fmt.Errorf("creating runtime client: %w", err) - } - - // 2. 
Create the CNIMigration resource - migrationName := fmt.Sprintf("cni-migration-%s", time.Now().Format("20060102-150405")) - newMigration := &v1alpha1.CNIMigration{ - ObjectMeta: metav1.ObjectMeta{ - Name: migrationName, - }, - Spec: v1alpha1.CNIMigrationSpec{ - TargetCNI: targetCNI, - }, - } - - if err := rtClient.Create(ctx, newMigration); err != nil { - if errors.IsAlreadyExists(err) { - fmt.Printf("ℹ️ Migration '%s' already exists.\n", migrationName) - } else { - return fmt.Errorf("creating CNIMigration: %w", err) - } - } else { - fmt.Printf("✅ CNIMigration '%s' created.\n", migrationName) - } - - fmt.Printf("\n🎉 Preparation triggered (total elapsed: %s)\n", time.Since(startTime).Round(time.Millisecond)) - fmt.Println("The migration is now being handled automatically by the cluster.") - fmt.Println("You can monitor the status using 'd8 cni-switch switch --watch' or by checking the CNIMigration resource.") - - return nil -} diff --git a/internal/cni/resources.go b/internal/cni/resources.go deleted file mode 100644 index fe47446c..00000000 --- a/internal/cni/resources.go +++ /dev/null @@ -1,545 +0,0 @@ -/* -Copyright 2025 Flant JSC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package cni - -import ( - admissionregistrationv1 "k8s.io/api/admissionregistration/v1" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" -) - -// --- Helper Resources --- - -// getSwitchHelperDaemonSet returns a DaemonSet object. 
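
One note on the CNIMigration creation that this deleted prepare.go performed (and that RunSwitch takes over later in this series): the resource name embeds a timestamp, so the IsAlreadyExists branch almost never fires, and nothing stops a second invocation from creating a second CNIMigration a minute later. A defensive variant could reuse the FindActiveMigration helper that the watch code already relies on. The sketch below is illustrative only and not part of the patch; the helper name createMigrationIfNone is hypothetical (imports: context, fmt, time, metav1, the controller-runtime client, and the local v1alpha1 package).

// Illustrative sketch, not part of the patch: refuse to start a second migration.
// Reuses FindActiveMigration from internal/cni/common.go and the v1alpha1 types of this repo.
func createMigrationIfNone(ctx context.Context, rtClient client.Client, targetCNI string) error {
	active, err := FindActiveMigration(ctx, rtClient)
	if err != nil {
		return fmt.Errorf("checking for an active migration: %w", err)
	}
	if active != nil {
		return fmt.Errorf(
			"migration %q (target CNI %q) is already in progress; run 'd8 cni-switch watch' or 'd8 cni-switch cleanup' first",
			active.Name, active.Spec.TargetCNI,
		)
	}

	newMigration := &v1alpha1.CNIMigration{
		ObjectMeta: metav1.ObjectMeta{
			Name: fmt.Sprintf("cni-migration-%s", time.Now().Format("20060102-150405")),
		},
		Spec: v1alpha1.CNIMigrationSpec{TargetCNI: targetCNI},
	}
	return rtClient.Create(ctx, newMigration)
}
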
-func getSwitchHelperDaemonSet(namespace, imageName string) *appsv1.DaemonSet { - rootID := int64(0) - truePtr := true - terminationGracePeriodSeconds := int64(5) - mountPropagationBidirectional := corev1.MountPropagationBidirectional - hostPathDirectoryOrCreate := corev1.HostPathDirectoryOrCreate - hostPathDirectory := corev1.HostPathDirectory - - return &appsv1.DaemonSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: switchHelperDaemonSetName, - Namespace: namespace, - }, - Spec: appsv1.DaemonSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"app": switchHelperDaemonSetName}, - }, - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{"app": switchHelperDaemonSetName}, - }, - Spec: corev1.PodSpec{ - ServiceAccountName: switchHelperServiceAccountName, - Containers: []corev1.Container{ - { - Name: "helper", - Image: imageName, - Args: []string{ - "--health-probe-bind-address=:42281", - }, - Ports: []corev1.ContainerPort{ - { - Name: "healthz", - ContainerPort: 42281, - Protocol: corev1.ProtocolTCP, - }, - }, - LivenessProbe: &corev1.Probe{ - ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - Path: "/healthz", - Port: intstr.FromString("healthz"), - }, - }, - InitialDelaySeconds: 15, - PeriodSeconds: 20, - }, - ReadinessProbe: &corev1.Probe{ - ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - Path: "/readyz", - Port: intstr.FromString("healthz"), - }, - }, - InitialDelaySeconds: 5, - PeriodSeconds: 10, - }, - Env: []corev1.EnvVar{ - { - Name: "NODE_NAME", - ValueFrom: &corev1.EnvVarSource{ - FieldRef: &corev1.ObjectFieldSelector{ - FieldPath: "spec.nodeName", - }, - }, - }, - { - Name: "KUBERNETES_SERVICE_HOST", - Value: "127.0.0.1", - }, - { - Name: "KUBERNETES_SERVICE_PORT", - Value: "6445", - }, - }, - SecurityContext: &corev1.SecurityContext{ - Privileged: &truePtr, - RunAsUser: &rootID, - RunAsGroup: &rootID, - }, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "host-proc", - MountPath: "/host/proc", - ReadOnly: true, - }, - { - Name: "host-sys", - MountPath: "/host/sys", - ReadOnly: true, - }, - { - Name: "cni-net-d", - MountPath: "/etc/cni/net.d", - }, - { - Name: "cni-bin", - MountPath: "/opt/cni/bin", - }, - { - Name: "host-run", - MountPath: "/run", - }, - { - Name: "host-bpf", - MountPath: "/sys/fs/bpf", - MountPropagation: &mountPropagationBidirectional, - }, - { - Name: "host-lib-modules", - MountPath: "/lib/modules", - ReadOnly: true, - }, - { - Name: "host-var-lib-cni", - MountPath: "/var/lib/cni", - }, - }, - }, - }, - Tolerations: []corev1.Toleration{ - { - Operator: corev1.TolerationOpExists, - }, - }, - PriorityClassName: "system-node-critical", - HostNetwork: true, - HostPID: true, - HostIPC: true, - TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, - Volumes: []corev1.Volume{ - { - Name: "host-proc", - VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{Path: "/proc"}, - }, - }, - { - Name: "host-sys", - VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{Path: "/sys"}, - }, - }, - { - Name: "cni-net-d", - VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{Path: "/etc/cni/net.d"}, - }, - }, - { - Name: "cni-bin", - VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{Path: "/opt/cni/bin"}, - }, - }, - { - Name: "host-run", - VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{Path: "/run"}, - }, - }, - { - Name: "host-bpf", - 
VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{ - Path: "/sys/fs/bpf", - Type: &hostPathDirectoryOrCreate, - }, - }, - }, - { - Name: "host-lib-modules", - VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{ - Path: "/lib/modules", - Type: &hostPathDirectory, - }, - }, - }, - { - Name: "host-var-lib-cni", - VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{ - Path: "/var/lib/cni", - Type: &hostPathDirectoryOrCreate, - }, - }, - }, - }, - }, - }, - }, - } -} - -// --- Webhook Resources --- - -// getWebhookTLSSecret returns a Secret object containing the webhook's TLS certificates. -func getWebhookTLSSecret(namespace string, cert, key []byte) *corev1.Secret { - return &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: webhookSecretName, - Namespace: namespace, - }, - Type: corev1.SecretTypeTLS, - Data: map[string][]byte{ - corev1.TLSCertKey: cert, - corev1.TLSPrivateKeyKey: key, - }, - } -} - -// getWebhookDeployment returns a Deployment object for the webhook server. -func getWebhookDeployment(namespace, imageName, serviceAccountName string) *appsv1.Deployment { - replicas := int32(2) - terminationGracePeriodSeconds := int64(5) - return &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: webhookDeploymentName, - Namespace: namespace, - Labels: map[string]string{ - "app": webhookDeploymentName, - }, - }, - Spec: appsv1.DeploymentSpec{ - Replicas: &replicas, - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": webhookDeploymentName, - }, - }, - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - "app": webhookDeploymentName, - }, - }, - Spec: corev1.PodSpec{ - ServiceAccountName: serviceAccountName, - HostNetwork: true, - DNSPolicy: corev1.DNSClusterFirstWithHostNet, - TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, - Tolerations: []corev1.Toleration{ - { - Operator: corev1.TolerationOpExists, - }, - }, - Affinity: &corev1.Affinity{ - PodAntiAffinity: &corev1.PodAntiAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{ - { - LabelSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": webhookDeploymentName, - }, - }, - TopologyKey: "kubernetes.io/hostname", - }, - }, - }, - }, - Containers: []corev1.Container{ - { - Name: "webhook", - Image: imageName, - Args: []string{ - "--mode=webhook", - "--health-probe-bind-address=:42282", - }, - Ports: []corev1.ContainerPort{ - { - ContainerPort: webhookPort, - Name: "webhook", - }, - { - Name: "healthz", - ContainerPort: 42282, - Protocol: corev1.ProtocolTCP, - }, - }, - LivenessProbe: &corev1.Probe{ - ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - Path: "/healthz", - Port: intstr.FromString("healthz"), - }, - }, - InitialDelaySeconds: 15, - PeriodSeconds: 20, - }, - ReadinessProbe: &corev1.Probe{ - ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - Path: "/readyz", - Port: intstr.FromString("healthz"), - }, - }, - InitialDelaySeconds: 5, - PeriodSeconds: 10, - }, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "tls-certs", - MountPath: "/etc/tls", - ReadOnly: true, - }, - }, - }, - }, - Volumes: []corev1.Volume{ - { - Name: "tls-certs", - VolumeSource: corev1.VolumeSource{ - Secret: &corev1.SecretVolumeSource{ - SecretName: webhookSecretName, - }, - }, - }, - }, - }, - }, - }, - } -} - -// getWebhookService returns a Service object for the webhook. 
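
For orientation, the cert and key bytes consumed by getWebhookTLSSecret above, and the caBundle consumed by getMutatingWebhookConfiguration below, must come from a certificate whose SAN matches the webhook Service DNS name. The sketch that follows shows one plausible way to mint such a self-signed serving certificate; it is illustrative only, not the generation code actually used in this series, and the function name is hypothetical.

// Illustrative sketch, not part of the patch: a self-signed serving certificate
// for the in-cluster webhook Service. The same PEM certificate can serve as the CA bundle.
package cnisketch

import (
	"crypto/ecdsa"
	"crypto/elliptic"
	"crypto/rand"
	"crypto/x509"
	"crypto/x509/pkix"
	"encoding/pem"
	"fmt"
	"math/big"
	"time"
)

func generateSelfSignedServingCert(serviceName, namespace string) (certPEM, keyPEM []byte, err error) {
	key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
	if err != nil {
		return nil, nil, err
	}

	dnsName := fmt.Sprintf("%s.%s.svc", serviceName, namespace)
	tmpl := &x509.Certificate{
		SerialNumber:          big.NewInt(1),
		Subject:               pkix.Name{CommonName: dnsName},
		DNSNames:              []string{dnsName},
		NotBefore:             time.Now().Add(-time.Hour),
		NotAfter:              time.Now().Add(24 * time.Hour),
		KeyUsage:              x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign,
		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		BasicConstraintsValid: true,
		IsCA:                  true, // self-signed: the cert doubles as its own CA bundle
	}

	der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &key.PublicKey, key)
	if err != nil {
		return nil, nil, err
	}
	keyDER, err := x509.MarshalECPrivateKey(key)
	if err != nil {
		return nil, nil, err
	}

	certPEM = pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der})
	keyPEM = pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER})
	return certPEM, keyPEM, nil
}
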
-func getWebhookService(namespace string) *corev1.Service { - return &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: webhookServiceName, - Namespace: namespace, - }, - Spec: corev1.ServiceSpec{ - Selector: map[string]string{ - "app": webhookDeploymentName, - }, - Ports: []corev1.ServicePort{ - { - Protocol: corev1.ProtocolTCP, - Port: 443, - TargetPort: intstr.FromInt(webhookPort), - }, - }, - }, - } -} - -// getMutatingWebhookConfiguration returns a MutatingWebhookConfiguration object for annotating pods. -func getMutatingWebhookConfiguration(namespace string, caBundle []byte) *admissionregistrationv1.MutatingWebhookConfiguration { - path := "/mutate-pod" - failurePolicy := admissionregistrationv1.Ignore - sideEffects := admissionregistrationv1.SideEffectClassNone - - return &admissionregistrationv1.MutatingWebhookConfiguration{ - ObjectMeta: metav1.ObjectMeta{ - Name: webhookConfigName, - }, - Webhooks: []admissionregistrationv1.MutatingWebhook{ - { - Name: webhookConfigurationName, - AdmissionReviewVersions: []string{"v1"}, - ClientConfig: admissionregistrationv1.WebhookClientConfig{ - Service: &admissionregistrationv1.ServiceReference{ - Name: webhookServiceName, - Namespace: namespace, - Path: &path, - Port: &[]int32{443}[0], - }, - CABundle: caBundle, - }, - Rules: []admissionregistrationv1.RuleWithOperations{ - { - Operations: []admissionregistrationv1.OperationType{admissionregistrationv1.Create}, - Rule: admissionregistrationv1.Rule{ - APIGroups: []string{""}, - APIVersions: []string{"v1"}, - Resources: []string{"pods"}, - }, - }, - }, - NamespaceSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "kubernetes.io/metadata.name", - Operator: metav1.LabelSelectorOpNotIn, - Values: generateExcludedNamespaces(namespace), - }, - }, - }, - FailurePolicy: &failurePolicy, - SideEffects: &sideEffects, - }, - }, - } -} - -// generateExcludedNamespaces creates a list of namespaces to be excluded from webhook processing. -// This includes the webhook's own namespace and CNI module namespaces. -func generateExcludedNamespaces(currentNamespace string) []string { - excluded := []string{currentNamespace} // Exclude the webhook's own namespace (e.g., "cni-switch") - for _, module := range CNIModuleConfigs { - excluded = append(excluded, "d8-"+module) // Exclude "d8-cni-cilium", "d8-cni-flannel", etc. 
- } - return excluded -} - -// --- RBAC Resources --- - -func getSwitchHelperServiceAccount(namespace string) *corev1.ServiceAccount { - return &corev1.ServiceAccount{ - ObjectMeta: metav1.ObjectMeta{ - Name: switchHelperServiceAccountName, - Namespace: namespace, - }, - } -} - -func getWebhookServiceAccount(namespace string) *corev1.ServiceAccount { - return &corev1.ServiceAccount{ - ObjectMeta: metav1.ObjectMeta{ - Name: webhookServiceAccountName, - Namespace: namespace, - }, - } -} - -func getSwitchHelperClusterRole() *rbacv1.ClusterRole { - return &rbacv1.ClusterRole{ - ObjectMeta: metav1.ObjectMeta{ - Name: switchHelperClusterRoleName, - }, - Rules: []rbacv1.PolicyRule{ - { - APIGroups: []string{"network.deckhouse.io"}, - Resources: []string{"cnimigrations"}, - Verbs: []string{"get", "list", "watch"}, - }, - { - APIGroups: []string{"network.deckhouse.io"}, - Resources: []string{"cnimigrations/status"}, - Verbs: []string{"get"}, - }, - { - APIGroups: []string{"network.deckhouse.io"}, - Resources: []string{"cninodemigrations"}, - Verbs: []string{"get", "list", "watch", "create", "update", "patch", "delete"}, - }, - { - APIGroups: []string{"network.deckhouse.io"}, - Resources: []string{"cninodemigrations/status"}, - Verbs: []string{"get", "update", "patch"}, - }, - { - APIGroups: []string{""}, - Resources: []string{"pods"}, - Verbs: []string{"get", "list", "watch", "patch", "update", "delete"}, - }, - }, - } -} - -func getWebhookClusterRole() *rbacv1.ClusterRole { - return &rbacv1.ClusterRole{ - ObjectMeta: metav1.ObjectMeta{ - Name: webhookClusterRoleName, - }, - Rules: []rbacv1.PolicyRule{ - { - APIGroups: []string{"network.deckhouse.io"}, - Resources: []string{"cnimigrations"}, - Verbs: []string{"get", "list", "watch"}, - }, - }, - } -} - -func getSwitchHelperClusterRoleBinding(namespace string) *rbacv1.ClusterRoleBinding { - return &rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: switchHelperClusterRoleBindingName, - }, - RoleRef: rbacv1.RoleRef{ - APIGroup: "rbac.authorization.k8s.io", - Kind: "ClusterRole", - Name: switchHelperClusterRoleName, - }, - Subjects: []rbacv1.Subject{ - { - Kind: "ServiceAccount", - Name: switchHelperServiceAccountName, - Namespace: namespace, - }, - }, - } -} - -func getWebhookClusterRoleBinding(namespace string) *rbacv1.ClusterRoleBinding { - return &rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: webhookClusterRoleBindingName, - }, - RoleRef: rbacv1.RoleRef{ - APIGroup: "rbac.authorization.k8s.io", - Kind: "ClusterRole", - Name: webhookClusterRoleName, - }, - Subjects: []rbacv1.Subject{ - { - Kind: "ServiceAccount", - Name: webhookServiceAccountName, - Namespace: namespace, - }, - }, - } -} diff --git a/internal/cni/switch.go b/internal/cni/switch.go index eb833a83..5364ea68 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -21,6 +21,7 @@ import ( "fmt" "time" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" @@ -28,13 +29,22 @@ import ( ) // RunSwitch executes the logic for the 'cni-switch switch' command. -// In the new architecture, it acts as a status watcher. -func RunSwitch(timeout time.Duration) error { +func RunSwitch(targetCNI string, timeout time.Duration) error { + // 0. 
Ask for user confirmation + confirmed, err := AskForConfirmation("switch") + if err != nil { + return fmt.Errorf("asking for confirmation: %w", err) + } + if !confirmed { + fmt.Println("Operation cancelled by user.") + return nil + } + startTime := time.Now() ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() - fmt.Printf("🚀 Monitoring CNI switch progress (timeout: %s)\n", timeout) + fmt.Printf("🚀 Starting CNI switch for target '%s'\n", targetCNI) // 1. Create a Kubernetes client safeClient, err := saferequest.NewSafeClient() @@ -47,72 +57,29 @@ func RunSwitch(timeout time.Duration) error { return fmt.Errorf("creating runtime client: %w", err) } - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - var lastCondition string - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - activeMigration, err := FindActiveMigration(ctx, rtClient) - if err != nil { - fmt.Printf("\r\033[K⚠️ Error finding active migration: %v", err) - continue - } - - if activeMigration == nil { - fmt.Println("\r\033[Kℹ️ No active migration found.") - return nil - } - - // Report current status from conditions - currentCond := getLatestCondition(activeMigration) - if currentCond != lastCondition { - fmt.Printf("\r\033[K[%s] %s: %s\n", - time.Now().Format("15:04:05"), - currentCond, - activeMigration.Name) - lastCondition = currentCond - } - - // Report node progress if available - if activeMigration.Status.NodesTotal > 0 { - fmt.Printf("\r\033[K Nodes: %d/%d succeeded", - activeMigration.Status.NodesSucceeded, - activeMigration.Status.NodesTotal) - if activeMigration.Status.NodesFailed > 0 { - fmt.Printf(", %d failed", activeMigration.Status.NodesFailed) - } - } - - // Check for completion - for _, cond := range activeMigration.Status.Conditions { - if cond.Type == "Succeeded" && cond.Status == metav1.ConditionTrue { - fmt.Printf("\n\n🎉 CNI switch to '%s' completed successfully! (total time: %s)\n", - activeMigration.Spec.TargetCNI, - time.Since(startTime).Round(time.Second)) - return nil - } - } - } + // 2. 
Create the CNIMigration resource + migrationName := fmt.Sprintf("cni-migration-%s", time.Now().Format("20060102-150405")) + newMigration := &v1alpha1.CNIMigration{ + ObjectMeta: metav1.ObjectMeta{ + Name: migrationName, + }, + Spec: v1alpha1.CNIMigrationSpec{ + TargetCNI: targetCNI, + }, } -} -func getLatestCondition(m *v1alpha1.CNIMigration) string { - if len(m.Status.Conditions) == 0 { - return "Pending" - } - // Return the latest True condition - latest := m.Status.Conditions[0] - for _, c := range m.Status.Conditions { - if c.Status == metav1.ConditionTrue { - if c.LastTransitionTime.After(latest.LastTransitionTime.Time) { - latest = c - } + if err := rtClient.Create(ctx, newMigration); err != nil { + if errors.IsAlreadyExists(err) { + fmt.Printf("ℹ️ Migration '%s' already exists.\n", migrationName) + } else { + return fmt.Errorf("creating CNIMigration: %w", err) } + } else { + fmt.Printf("✅ CNIMigration '%s' created.\n", migrationName) } - return latest.Type + + fmt.Printf("\n🎉 Switch triggered (total elapsed: %s)\n", time.Since(startTime).Round(time.Millisecond)) + fmt.Println("The migration is now being handled automatically by the cluster.") + + return nil } diff --git a/internal/cni/watch.go b/internal/cni/watch.go new file mode 100644 index 00000000..b49c9294 --- /dev/null +++ b/internal/cni/watch.go @@ -0,0 +1,118 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cni + +import ( + "context" + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1" + saferequest "github.com/deckhouse/deckhouse-cli/pkg/libsaferequest/client" +) + +// RunWatch executes the logic for the 'cni-switch watch' command. +// In the new architecture, it acts as a status watcher. +func RunWatch(timeout time.Duration) error { + startTime := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + fmt.Printf("🚀 Monitoring CNI switch progress (timeout: %s)\n", timeout) + + // 1. 
Create a Kubernetes client + safeClient, err := saferequest.NewSafeClient() + if err != nil { + return fmt.Errorf("creating safe client: %w", err) + } + + rtClient, err := safeClient.NewRTClient(v1alpha1.AddToScheme) + if err != nil { + return fmt.Errorf("creating runtime client: %w", err) + } + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + var lastCondition string + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + activeMigration, err := FindActiveMigration(ctx, rtClient) + if err != nil { + fmt.Printf("\r\033[K⚠️ Error finding active migration: %v", err) + continue + } + + if activeMigration == nil { + fmt.Println("\r\033[Kℹ️ No active migration found.") + return nil + } + + // Report current status from conditions + currentCond := getLatestCondition(activeMigration) + if currentCond != lastCondition { + fmt.Printf("\r\033[K[%s] %s: %s\n", + time.Now().Format("15:04:05"), + currentCond, + activeMigration.Name) + lastCondition = currentCond + } + + // Report node progress if available + if activeMigration.Status.NodesTotal > 0 { + fmt.Printf("\r\033[K Nodes: %d/%d succeeded", + activeMigration.Status.NodesSucceeded, + activeMigration.Status.NodesTotal) + if activeMigration.Status.NodesFailed > 0 { + fmt.Printf(", %d failed", activeMigration.Status.NodesFailed) + } + } + + // Check for completion + for _, cond := range activeMigration.Status.Conditions { + if cond.Type == "Succeeded" && cond.Status == metav1.ConditionTrue { + fmt.Printf("\n\n🎉 CNI switch to '%s' completed successfully! (total time: %s)\n", + activeMigration.Spec.TargetCNI, + time.Since(startTime).Round(time.Second)) + return nil + } + } + } + } +} + +func getLatestCondition(m *v1alpha1.CNIMigration) string { + if len(m.Status.Conditions) == 0 { + return "Pending" + } + // Return the latest True condition + latest := m.Status.Conditions[0] + for _, c := range m.Status.Conditions { + if c.Status == metav1.ConditionTrue { + if c.LastTransitionTime.After(latest.LastTransitionTime.Time) { + latest = c + } + } + } + return latest.Type +} From c5f4d98e13123a4f74876d9614b3b3a0f57c256e Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Tue, 20 Jan 2026 22:24:21 +0400 Subject: [PATCH 15/16] go Signed-off-by: Denis Tarabrin --- cmd/commands/cni.go | 18 +++------ internal/cni/cleanup.go | 15 +++---- internal/cni/rollback.go | 4 +- internal/cni/switch.go | 16 +++----- internal/cni/watch.go | 84 ++++++++++++++++++++++------------------ 5 files changed, 66 insertions(+), 71 deletions(-) diff --git a/cmd/commands/cni.go b/cmd/commands/cni.go index a4e884f5..b78b34d6 100644 --- a/cmd/commands/cni.go +++ b/cmd/commands/cni.go @@ -20,7 +20,6 @@ import ( "fmt" "log" "strings" - "time" "github.com/go-logr/logr" "github.com/spf13/cobra" @@ -76,7 +75,6 @@ func NewCniSwitchCommand() *cobra.Command { Short: "A group of commands to switch CNI in the cluster", Long: cniSwitchLong, } - cmd.PersistentFlags().Duration("timeout", 1*time.Hour, "The timeout for the entire operation (e.g., 40m, 1h)") cmd.AddCommand(NewCmdCniSwitch()) cmd.AddCommand(NewCmdCniWatch()) cmd.AddCommand(NewCmdCniCleanup()) @@ -105,14 +103,13 @@ func NewCmdCniSwitch() *cobra.Command { Run: func(cmd *cobra.Command, _ []string) { targetCNI, _ := cmd.Flags().GetString("to-cni") - timeout, _ := cmd.Flags().GetDuration("timeout") - if err := cni.RunSwitch(targetCNI, timeout); err != nil { + if err := cni.RunSwitch(targetCNI); err != nil { log.Fatalf("❌ Error running switch command: %v", err) } fmt.Println() - if err := 
cni.RunWatch(timeout); err != nil { + if err := cni.RunWatch(); err != nil { log.Fatalf("❌ Error monitoring switch progress: %v", err) } }, @@ -132,8 +129,7 @@ func NewCmdCniWatch() *cobra.Command { Short: "Monitors the CNI switching progress", Example: cniWatchExample, Run: func(cmd *cobra.Command, _ []string) { - timeout, _ := cmd.Flags().GetDuration("timeout") - if err := cni.RunWatch(timeout); err != nil { + if err := cni.RunWatch(); err != nil { log.Fatalf("❌ Error running watch command: %v", err) } }, @@ -147,8 +143,7 @@ func NewCmdCniCleanup() *cobra.Command { Short: "Cleans up resources created during CNI switching", Example: cniCleanupExample, Run: func(cmd *cobra.Command, _ []string) { - timeout, _ := cmd.Flags().GetDuration("timeout") - if err := cni.RunCleanup(timeout); err != nil { + if err := cni.RunCleanup(); err != nil { log.Fatalf("❌ Error running cleanup command: %v", err) } }, @@ -156,14 +151,13 @@ func NewCmdCniCleanup() *cobra.Command { return cmd } -func NewCmdCniRollback() *cobra.Command { // TDEN It needs to be done! +func NewCmdCniRollback() *cobra.Command { // TODO(tden): It needs to be done! cmd := &cobra.Command{ Use: "rollback", Short: "Rollback all changes and restore previous CNI", Example: cniRollbackExample, Run: func(cmd *cobra.Command, _ []string) { - timeout, _ := cmd.Flags().GetDuration("timeout") - if err := cni.RunRollback(timeout); err != nil { + if err := cni.RunRollback(); err != nil { log.Fatalf("❌ Error running rollback command: %v", err) } }, diff --git a/internal/cni/cleanup.go b/internal/cni/cleanup.go index 23ac6f56..cbc9d72f 100644 --- a/internal/cni/cleanup.go +++ b/internal/cni/cleanup.go @@ -19,7 +19,6 @@ package cni import ( "context" "fmt" - "time" "k8s.io/apimachinery/pkg/api/errors" @@ -28,14 +27,12 @@ import ( ) // RunCleanup executes the logic for the 'cni-switch cleanup' command. -// In the new architecture, it deletes the CNIMigration resource. -func RunCleanup(timeout time.Duration) error { - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() +func RunCleanup() error { + ctx := context.Background() - fmt.Printf("🚀 Starting CNI switch cleanup (timeout: %s)\n", timeout) + fmt.Println("🚀 Starting CNI switch cleanup") - // 1. Create a Kubernetes client + // Create a Kubernetes client safeClient, err := saferequest.NewSafeClient() if err != nil { return fmt.Errorf("creating safe client: %w", err) @@ -46,7 +43,7 @@ func RunCleanup(timeout time.Duration) error { return fmt.Errorf("creating runtime client: %w", err) } - // 2. Find and delete all CNIMigration resources + // Find and delete all CNIMigration resources migrations := &v1alpha1.CNIMigrationList{} if err := rtClient.List(ctx, migrations); err != nil { return fmt.Errorf("listing CNIMigrations: %w", err) @@ -69,6 +66,6 @@ func RunCleanup(timeout time.Duration) error { } } - fmt.Println("\n🎉 Cleanup triggered. The cluster-internal controllers will handle the rest.") + fmt.Println("🎉 Cleanup triggered. The cluster-internal controllers will handle the rest.") return nil } diff --git a/internal/cni/rollback.go b/internal/cni/rollback.go index 52999599..92428296 100644 --- a/internal/cni/rollback.go +++ b/internal/cni/rollback.go @@ -18,12 +18,10 @@ package cni import ( "fmt" - "time" ) // RunRollback executes the logic for the 'cni-switch rollback' command. 
-func RunRollback(timeout time.Duration) error { +func RunRollback() error { fmt.Println("Logic for rollback is not implemented yet.") - _ = timeout return nil } diff --git a/internal/cni/switch.go b/internal/cni/switch.go index 5364ea68..a07ddd71 100644 --- a/internal/cni/switch.go +++ b/internal/cni/switch.go @@ -29,8 +29,8 @@ import ( ) // RunSwitch executes the logic for the 'cni-switch switch' command. -func RunSwitch(targetCNI string, timeout time.Duration) error { - // 0. Ask for user confirmation +func RunSwitch(targetCNI string) error { + // Ask for user confirmation confirmed, err := AskForConfirmation("switch") if err != nil { return fmt.Errorf("asking for confirmation: %w", err) @@ -40,13 +40,9 @@ func RunSwitch(targetCNI string, timeout time.Duration) error { return nil } - startTime := time.Now() - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - fmt.Printf("🚀 Starting CNI switch for target '%s'\n", targetCNI) - // 1. Create a Kubernetes client + // Create a Kubernetes client safeClient, err := saferequest.NewSafeClient() if err != nil { return fmt.Errorf("creating safe client: %w", err) @@ -57,7 +53,7 @@ func RunSwitch(targetCNI string, timeout time.Duration) error { return fmt.Errorf("creating runtime client: %w", err) } - // 2. Create the CNIMigration resource + // Create the CNIMigration resource migrationName := fmt.Sprintf("cni-migration-%s", time.Now().Format("20060102-150405")) newMigration := &v1alpha1.CNIMigration{ ObjectMeta: metav1.ObjectMeta{ @@ -68,7 +64,7 @@ func RunSwitch(targetCNI string, timeout time.Duration) error { }, } - if err := rtClient.Create(ctx, newMigration); err != nil { + if err := rtClient.Create(context.Background(), newMigration); err != nil { if errors.IsAlreadyExists(err) { fmt.Printf("ℹ️ Migration '%s' already exists.\n", migrationName) } else { @@ -78,7 +74,7 @@ func RunSwitch(targetCNI string, timeout time.Duration) error { fmt.Printf("✅ CNIMigration '%s' created.\n", migrationName) } - fmt.Printf("\n🎉 Switch triggered (total elapsed: %s)\n", time.Since(startTime).Round(time.Millisecond)) + fmt.Println("🎉 Switch triggered.") fmt.Println("The migration is now being handled automatically by the cluster.") return nil diff --git a/internal/cni/watch.go b/internal/cni/watch.go index b49c9294..1ddaaacc 100644 --- a/internal/cni/watch.go +++ b/internal/cni/watch.go @@ -19,6 +19,7 @@ package cni import ( "context" "fmt" + "sort" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -28,15 +29,12 @@ import ( ) // RunWatch executes the logic for the 'cni-switch watch' command. -// In the new architecture, it acts as a status watcher. -func RunWatch(timeout time.Duration) error { - startTime := time.Now() - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() +func RunWatch() error { + ctx := context.Background() - fmt.Printf("🚀 Monitoring CNI switch progress (timeout: %s)\n", timeout) + fmt.Println("🚀 Monitoring CNI switch progress") - // 1. 
Create a Kubernetes client + // Create a Kubernetes client safeClient, err := saferequest.NewSafeClient() if err != nil { return fmt.Errorf("creating safe client: %w", err) @@ -50,7 +48,10 @@ func RunWatch(timeout time.Duration) error { ticker := time.NewTicker(2 * time.Second) defer ticker.Stop() - var lastCondition string + var ( + migrationName string + lastPrintedTime time.Time + ) for { select { @@ -59,23 +60,49 @@ func RunWatch(timeout time.Duration) error { case <-ticker.C: activeMigration, err := FindActiveMigration(ctx, rtClient) if err != nil { - fmt.Printf("\r\033[K⚠️ Error finding active migration: %v", err) + fmt.Printf("\r\033[K⚠️ Error finding active migration: %v", err) continue } if activeMigration == nil { - fmt.Println("\r\033[Kℹ️ No active migration found.") + // If we were watching a migration and it disappeared, it's done or deleted. + if migrationName != "" { + fmt.Println("\r\033[Kℹ️ Migration resource is gone.") + } else { + fmt.Println("\r\033[Kℹ️ No active migration found.") + } return nil } - // Report current status from conditions - currentCond := getLatestCondition(activeMigration) - if currentCond != lastCondition { - fmt.Printf("\r\033[K[%s] %s: %s\n", - time.Now().Format("15:04:05"), - currentCond, - activeMigration.Name) - lastCondition = currentCond + // Print migration name once + if migrationName == "" { + migrationName = activeMigration.Name + fmt.Printf("ℹ️ Monitoring migration resource: %s\n", migrationName) + } + + // Collect and sort valid conditions + var validConditions []metav1.Condition + for _, c := range activeMigration.Status.Conditions { + if c.Status == metav1.ConditionTrue { + validConditions = append(validConditions, c) + } + } + + sort.Slice(validConditions, func(i, j int) bool { + return validConditions[i].LastTransitionTime.Before(&validConditions[j].LastTransitionTime) + }) + + // Print new conditions + for _, c := range validConditions { + if c.LastTransitionTime.Time.After(lastPrintedTime) { + // Clear the progress line before printing log + fmt.Print("\r\033[K") + fmt.Printf("[%s] %s: %s\n", + c.LastTransitionTime.Format("15:04:05"), + c.Type, + c.Message) + lastPrintedTime = c.LastTransitionTime.Time + } } // Report node progress if available @@ -91,28 +118,11 @@ func RunWatch(timeout time.Duration) error { // Check for completion for _, cond := range activeMigration.Status.Conditions { if cond.Type == "Succeeded" && cond.Status == metav1.ConditionTrue { - fmt.Printf("\n\n🎉 CNI switch to '%s' completed successfully! 
(total time: %s)\n", - activeMigration.Spec.TargetCNI, - time.Since(startTime).Round(time.Second)) + fmt.Printf("\n\n🎉 CNI switch to '%s' completed successfully!\n", + activeMigration.Spec.TargetCNI) return nil } } } } } - -func getLatestCondition(m *v1alpha1.CNIMigration) string { - if len(m.Status.Conditions) == 0 { - return "Pending" - } - // Return the latest True condition - latest := m.Status.Conditions[0] - for _, c := range m.Status.Conditions { - if c.Status == metav1.ConditionTrue { - if c.LastTransitionTime.After(latest.LastTransitionTime.Time) { - latest = c - } - } - } - return latest.Type -} From cc2f5e732b12bf32272d1310e591c09a665ed538 Mon Sep 17 00:00:00 2001 From: Denis Tarabrin Date: Wed, 21 Jan 2026 12:33:41 +0400 Subject: [PATCH 16/16] ++ Signed-off-by: Denis Tarabrin --- .../cni/api/v1alpha1/cni_migration_types.go | 9 ++++ .../cni/api/v1alpha1/zz_generated.deepcopy.go | 20 ++++++++ internal/cni/watch.go | 46 +++++++++++++++---- 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/internal/cni/api/v1alpha1/cni_migration_types.go b/internal/cni/api/v1alpha1/cni_migration_types.go index 3e2bbdd3..2869e0c8 100644 --- a/internal/cni/api/v1alpha1/cni_migration_types.go +++ b/internal/cni/api/v1alpha1/cni_migration_types.go @@ -55,11 +55,20 @@ type CNIMigrationStatus struct { NodesSucceeded int `json:"nodesSucceeded,omitempty"` // NodesFailed is the number of nodes where an error occurred. NodesFailed int `json:"nodesFailed,omitempty"` + // FailedSummary contains details about nodes that failed the migration. + FailedSummary []FailedNodeSummary `json:"failedSummary,omitempty"` // Conditions reflect the state of the migration as a whole. // The d8 cli aggregates statuses from all CNINodeMigrations here. Conditions []metav1.Condition `json:"conditions,omitempty"` } +// FailedNodeSummary captures the error state of a specific node. +// +k8s:deepcopy-gen=true +type FailedNodeSummary struct { + Node string `json:"node"` + Reason string `json:"reason"` +} + // CNIMigrationList contains a list of CNIMigration. // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object type CNIMigrationList struct { diff --git a/internal/cni/api/v1alpha1/zz_generated.deepcopy.go b/internal/cni/api/v1alpha1/zz_generated.deepcopy.go index 495df5c2..cec3b4d6 100644 --- a/internal/cni/api/v1alpha1/zz_generated.deepcopy.go +++ b/internal/cni/api/v1alpha1/zz_generated.deepcopy.go @@ -102,6 +102,11 @@ func (in *CNIMigrationSpec) DeepCopy() *CNIMigrationSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CNIMigrationStatus) DeepCopyInto(out *CNIMigrationStatus) { *out = *in + if in.FailedSummary != nil { + in, out := &in.FailedSummary, &out.FailedSummary + *out = make([]FailedNodeSummary, len(*in)) + copy(*out, *in) + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]v1.Condition, len(*in)) @@ -121,6 +126,21 @@ func (in *CNIMigrationStatus) DeepCopy() *CNIMigrationStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FailedNodeSummary) DeepCopyInto(out *FailedNodeSummary) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FailedNodeSummary. 
+func (in *FailedNodeSummary) DeepCopy() *FailedNodeSummary { + if in == nil { + return nil + } + out := new(FailedNodeSummary) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CNINodeMigration) DeepCopyInto(out *CNINodeMigration) { *out = *in diff --git a/internal/cni/watch.go b/internal/cni/watch.go index 1ddaaacc..df1929d6 100644 --- a/internal/cni/watch.go +++ b/internal/cni/watch.go @@ -51,6 +51,7 @@ func RunWatch() error { var ( migrationName string lastPrintedTime time.Time + footerLines int // Number of lines in the dynamic footer (progress + errors) ) for { @@ -60,20 +61,28 @@ func RunWatch() error { case <-ticker.C: activeMigration, err := FindActiveMigration(ctx, rtClient) if err != nil { - fmt.Printf("\r\033[K⚠️ Error finding active migration: %v", err) + // Clean previous footer before printing warning + clearFooter(footerLines) + fmt.Printf("⚠️ Error finding active migration: %v\n", err) + footerLines = 1 // We printed one line continue } if activeMigration == nil { + clearFooter(footerLines) // If we were watching a migration and it disappeared, it's done or deleted. if migrationName != "" { - fmt.Println("\r\033[Kℹ️ Migration resource is gone.") + fmt.Println("ℹ️ Migration resource is gone.") } else { - fmt.Println("\r\033[Kℹ️ No active migration found.") + fmt.Println("ℹ️ No active migration found.") } return nil } + // Clear previous footer to verify if we need to print new logs + clearFooter(footerLines) + footerLines = 0 + // Print migration name once if migrationName == "" { migrationName = activeMigration.Name @@ -95,8 +104,6 @@ func RunWatch() error { // Print new conditions for _, c := range validConditions { if c.LastTransitionTime.Time.After(lastPrintedTime) { - // Clear the progress line before printing log - fmt.Print("\r\033[K") fmt.Printf("[%s] %s: %s\n", c.LastTransitionTime.Format("15:04:05"), c.Type, @@ -105,20 +112,37 @@ func RunWatch() error { } } - // Report node progress if available + // Draw Footer + // 1. Node Progress if activeMigration.Status.NodesTotal > 0 { - fmt.Printf("\r\033[K Nodes: %d/%d succeeded", + fmt.Printf(" Nodes: %d/%d succeeded", activeMigration.Status.NodesSucceeded, activeMigration.Status.NodesTotal) if activeMigration.Status.NodesFailed > 0 { fmt.Printf(", %d failed", activeMigration.Status.NodesFailed) } + fmt.Println() // End of progress line + footerLines++ + + // 2. Failed Nodes Details + if len(activeMigration.Status.FailedSummary) > 0 { + fmt.Println(" ⚠️ Failed Nodes:") + footerLines++ + for _, f := range activeMigration.Status.FailedSummary { + fmt.Printf(" - %s: %s\n", f.Node, f.Reason) + footerLines++ + } + } + } else { + // Placeholder if no nodes stats yet + fmt.Println(" Waiting for node statistics...") + footerLines++ } // Check for completion for _, cond := range activeMigration.Status.Conditions { if cond.Type == "Succeeded" && cond.Status == metav1.ConditionTrue { - fmt.Printf("\n\n🎉 CNI switch to '%s' completed successfully!\n", + fmt.Printf("\n🎉 CNI switch to '%s' completed successfully!\n", activeMigration.Spec.TargetCNI) return nil } @@ -126,3 +150,9 @@ func RunWatch() error { } } } + +func clearFooter(lines int) { + for range lines { + fmt.Print("\033[1A\033[K") // Move up and clear line + } +}
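
For reference, the watcher introduced in this series only reads CNIMigration.status: the node counters, failedSummary entries, and the terminal "Succeeded" condition are expected to be written by the in-cluster controllers, not by the CLI. Below is an illustrative sketch (not part of these patches) of what such a status update could look like, using only the v1alpha1 types added above; the publishProgress helper name and the controller plumbing around it are assumptions.

// Illustrative sketch only — not part of these patches. It shows one way an
// in-cluster controller could publish the status that `d8 cni-switch watch`
// renders, using the v1alpha1 types added in this series. The helper name and
// the surrounding controller wiring are assumptions.
package sketch

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/deckhouse/deckhouse-cli/internal/cni/api/v1alpha1"
)

// publishProgress (hypothetical) fills in the fields RunWatch polls: node
// counters, per-node failure details, and the "Succeeded" condition that
// terminates the watch loop.
func publishProgress(ctx context.Context, c ctrlclient.Client, m *v1alpha1.CNIMigration,
	succeeded, failed, total int, failures []v1alpha1.FailedNodeSummary) error {
	m.Status.NodesTotal = total
	m.Status.NodesSucceeded = succeeded
	m.Status.NodesFailed = failed
	m.Status.FailedSummary = failures

	if total > 0 && succeeded == total && failed == 0 {
		// SetStatusCondition stamps LastTransitionTime, which the watcher uses
		// to order and deduplicate the condition lines it prints.
		meta.SetStatusCondition(&m.Status.Conditions, metav1.Condition{
			Type:    "Succeeded",
			Status:  metav1.ConditionTrue,
			Reason:  "AllNodesMigrated",
			Message: fmt.Sprintf("all %d nodes switched to %s", total, m.Spec.TargetCNI),
		})
	}

	// Persist via the status subresource; RunWatch picks the change up on its
	// next 2-second tick.
	return c.Status().Update(ctx, m)
}

With status modeled this way the CLI stays a thin trigger-and-observe layer: switch creates the CNIMigration object, watch renders its conditions and node counters, cleanup simply deletes the objects, and all node-level work remains with the in-cluster controllers.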