9 changes: 5 additions & 4 deletions api/v1/hypervisor_types.go
@@ -57,10 +57,11 @@ const (
ConditionReasonReadyEvicted = "Evicted"

// ConditionTypeOnboarding reasons
ConditionReasonInitial = "Initial"
ConditionReasonOnboarding = "Onboarding"
ConditionReasonTesting = "Testing"
ConditionReasonAborted = "Aborted"
ConditionReasonInitial = "Initial"
ConditionReasonOnboarding = "Onboarding"
ConditionReasonTesting = "Testing"
ConditionReasonAborted = "Aborted"
ConditionReasonDecommissioning = "Decommissioning"
)

// HypervisorSpec defines the desired state of Hypervisor
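Editor's note, not part of the diff: the new ConditionReasonDecommissioning reason is set on the Ready condition by the decommission controller further down. A minimal sketch of how a client or test could distinguish that state; the helper name isDecommissioning is illustrative, while the condition type, status, and reason come from the constants above.

package controller

import (
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
)

// isDecommissioning reports whether the Ready condition is currently False
// with the new Decommissioning reason. Helper name is illustrative only.
func isDecommissioning(hv *kvmv1.Hypervisor) bool {
	cond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeReady)
	return cond != nil &&
		cond.Status == metav1.ConditionFalse &&
		cond.Reason == kvmv1.ConditionReasonDecommissioning
}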
12 changes: 12 additions & 0 deletions charts/openstack-hypervisor-operator/crds/hypervisor-crd.yaml
@@ -315,6 +315,14 @@ spec:
firmwareVersion:
description: FirmwareVersion
type: string
gardenLinuxCommitID:
description: Represents the Garden Linux build commit id
type: string
gardenLinuxFeatures:
description: Represents the Garden Linux Feature Set
items:
type: string
type: array
hardwareModel:
description: HardwareModel
type: string
@@ -336,6 +344,10 @@
prettyVersion:
description: PrettyVersion
type: string
variantID:
description: Identifying a specific variant or edition of the
operating system
type: string
version:
description: Represents the Operating System version.
type: string
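Editor's note, not part of the diff: the Go types backing these CRD properties are not included in this view. A sketch of the status fields the new properties presumably correspond to in api/v1; the struct name and json tags are assumptions derived from the CRD property names and descriptions.

package v1

// OperatingSystem is an assumed struct name; only the three new fields and
// their CRD property names are confirmed by this diff.
type OperatingSystem struct {
	// Represents the Garden Linux build commit id
	GardenLinuxCommitID string `json:"gardenLinuxCommitID,omitempty"`
	// Represents the Garden Linux Feature Set
	GardenLinuxFeatures []string `json:"gardenLinuxFeatures,omitempty"`
	// Identifying a specific variant or edition of the operating system
	VariantID string `json:"variantID,omitempty"`
}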
20 changes: 1 addition & 19 deletions internal/controller/aggregates_controller.go
@@ -70,7 +70,7 @@ func (ac *AggregatesController) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{}, nil
}

aggs, err := aggregatesByName(ctx, ac.computeClient)
aggs, err := openstack.GetAggregatesByName(ctx, ac.computeClient)
if err != nil {
err = fmt.Errorf("failed listing aggregates: %w", err)
if err2 := ac.setErrorCondition(ctx, hv, err.Error()); err2 != nil {
@@ -163,24 +163,6 @@ func (ac *AggregatesController) SetupWithManager(mgr ctrl.Manager) error {
Complete(ac)
}

func aggregatesByName(ctx context.Context, serviceClient *gophercloud.ServiceClient) (map[string]*aggregates.Aggregate, error) {
pages, err := aggregates.List(serviceClient).AllPages(ctx)
if err != nil {
return nil, fmt.Errorf("cannot list aggregates due to %w", err)
}

aggs, err := aggregates.ExtractAggregates(pages)
if err != nil {
return nil, fmt.Errorf("cannot list aggregates due to %w", err)
}

aggregateMap := make(map[string]*aggregates.Aggregate, len(aggs))
for _, aggregate := range aggs {
aggregateMap[aggregate.Name] = &aggregate
}
return aggregateMap, nil
}

func addToAggregate(ctx context.Context, serviceClient *gophercloud.ServiceClient, aggs map[string]*aggregates.Aggregate, host, name, zone string) (err error) {
aggregate, found := aggs[name]
log := logger.FromContext(ctx)
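Editor's note, not part of the diff: the shared helper this controller now calls lives in the internal/openstack package, which is not shown here. Presumably it is the removed local aggregatesByName relocated more or less verbatim; only the signature is confirmed by the call sites, so treat the body below as a sketch.

package openstack

import (
	"context"
	"fmt"

	"github.com/gophercloud/gophercloud/v2"
	"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/aggregates"
)

// GetAggregatesByName lists all host aggregates and indexes them by name.
// Body assumed from the removed local helper; the &aggregate capture relies
// on Go 1.22+ per-iteration loop variables.
func GetAggregatesByName(ctx context.Context, serviceClient *gophercloud.ServiceClient) (map[string]*aggregates.Aggregate, error) {
	pages, err := aggregates.List(serviceClient).AllPages(ctx)
	if err != nil {
		return nil, fmt.Errorf("cannot list aggregates due to %w", err)
	}
	aggs, err := aggregates.ExtractAggregates(pages)
	if err != nil {
		return nil, fmt.Errorf("cannot extract aggregates due to %w", err)
	}
	aggregateMap := make(map[string]*aggregates.Aggregate, len(aggs))
	for _, aggregate := range aggs {
		aggregateMap[aggregate.Name] = &aggregate
	}
	return aggregateMap, nil
}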
160 changes: 62 additions & 98 deletions internal/controller/decomission_controller.go
@@ -26,25 +26,24 @@ import (

"github.com/gophercloud/gophercloud/v2"
"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/aggregates"
"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/hypervisors"
"github.com/gophercloud/gophercloud/v2/openstack/compute/v2/services"
"github.com/gophercloud/gophercloud/v2/openstack/placement/v1/resourceproviders"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/util/retry"
ctrl "sigs.k8s.io/controller-runtime"
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
logger "sigs.k8s.io/controller-runtime/pkg/log"

kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/openstack"
"github.com/cobaltcore-dev/openstack-hypervisor-operator/internal/utils"
)

const (
decommissionFinalizerName = "cobaltcore.cloud.sap/decommission-hypervisor"
DecommissionControllerName = "nodeDecommission"
DecommissionControllerName = "decommission"
)

type NodeDecommissionReconciler struct {
@@ -57,145 +56,108 @@ type NodeDecommissionReconciler struct {
// The counter-side in gardener is here:
// https://github.com/gardener/machine-controller-manager/blob/rel-v0.56/pkg/util/provider/machinecontroller/machine.go#L646

// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;patch;update
// +kubebuilder:rbac:groups="",resources=nodes/finalizers,verbs=update
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors,verbs=get;list;watch
// +kubebuilder:rbac:groups=kvm.cloud.sap,resources=hypervisors/status,verbs=get;list;watch;update;patch

func (r *NodeDecommissionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
hostname := req.Name
log := logger.FromContext(ctx).WithName(req.Name).WithValues("hostname", hostname)
ctx = logger.IntoContext(ctx, log)
log := logger.FromContext(ctx).WithName(req.Name)
hv := &kvmv1.Hypervisor{}
if err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
if err := r.Get(ctx, req.NamespacedName, hv); err != nil {
// ignore not found errors, could be deleted
return k8sclient.IgnoreNotFound(err)
}

node := &corev1.Node{}
if err := r.Get(ctx, req.NamespacedName, node); err != nil {
return ctrl.Result{}, k8sclient.IgnoreNotFound(err)
}
setDecommissioningCondition := func(msg string) {
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
Type: kvmv1.ConditionTypeReady,
Status: metav1.ConditionFalse,
Reason: kvmv1.ConditionReasonDecommissioning,
Message: msg,
})
}

// Fetch HV to check if lifecycle management is enabled
hv := &kvmv1.Hypervisor{}
if err := r.Get(ctx, k8sclient.ObjectKey{Name: hostname}, hv); err != nil {
// ignore not found errors, could be deleted
return ctrl.Result{}, k8sclient.IgnoreNotFound(err)
}
if !hv.Spec.LifecycleEnabled {
// Get out of the way
return r.removeFinalizer(ctx, node)
}
if meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeReady) {
setDecommissioningCondition("Node is being decommissioned, removing host from nova")
return r.Status().Update(ctx, hv)
}

if !controllerutil.ContainsFinalizer(node, decommissionFinalizerName) {
return ctrl.Result{}, retry.RetryOnConflict(retry.DefaultRetry, func() error {
patch := k8sclient.MergeFrom(node.DeepCopy())
controllerutil.AddFinalizer(node, decommissionFinalizerName)
if err := r.Patch(ctx, node, patch); err != nil {
return fmt.Errorf("failed to add finalizer due to %w", err)
hypervisor, err := openstack.GetHypervisorByName(ctx, r.computeClient, hv.Name, true)
if err != nil {
if errors.Is(err, openstack.ErrNoHypervisor) {
// We are (hopefully) done
setDecommissioningCondition("Node not registered in nova anymore, proceeding with deletion")
hv.Status.Evicted = true
return r.Status().Update(ctx, hv)
}
log.Info("Added finalizer")
return nil
})
}

// Not yet deleting hv, nothing more to do
if node.DeletionTimestamp.IsZero() {
return ctrl.Result{}, nil
}
setDecommissioningCondition(fmt.Sprintf("Failed to get %q from openstack: %v", hv.Name, err))
return r.Status().Update(ctx, hv)
}

// Someone is just deleting the hv, without going through termination
// See: https://github.com/gardener/machine-controller-manager/blob/rel-v0.56/pkg/util/provider/machinecontroller/machine.go#L658-L659
if !IsNodeConditionTrue(node.Status.Conditions, "Terminating") {
log.Info("removing finalizer since not terminating")
// So we just get out of the way for now
return r.removeFinalizer(ctx, node)
}
if err = r.doDecomission(ctx, hv, hypervisor); err != nil {
log.Error(err, "Failed to decomission node", "node", hv.Name)
setDecommissioningCondition(err.Error())
return r.Status().Update(ctx, hv)
}

if meta.IsStatusConditionTrue(hv.Status.Conditions, kvmv1.ConditionTypeReady) {
return r.setDecommissioningCondition(ctx, hv, "Node is being decommissioned, removing host from nova")
// Decommissioning succeeded, proceed with deletion
hv.Status.Evicted = true
return r.Status().Update(ctx, hv)
}); err != nil {
return ctrl.Result{}, err
}

log.Info("removing host from nova")

hypervisor, err := openstack.GetHypervisorByName(ctx, r.computeClient, hostname, true)
if errors.Is(err, openstack.ErrNoHypervisor) {
// We are (hopefully) done
return r.removeFinalizer(ctx, node)
}
return ctrl.Result{RequeueAfter: utils.ShortRetryTime}, nil
}

func (r *NodeDecommissionReconciler) doDecomission(ctx context.Context, hv *kvmv1.Hypervisor, hypervisor *hypervisors.Hypervisor) error {
// TODO: remove since RunningVMs is only available until micro-version 2.87, and also is updated asynchronously
// so it might be not accurate
if hypervisor.RunningVMs > 0 {
// Still running VMs, cannot delete the service
msg := fmt.Sprintf("Node is being decommissioned, but still has %d running VMs", hypervisor.RunningVMs)
return r.setDecommissioningCondition(ctx, hv, msg)
return fmt.Errorf("node is being decommissioned, but still has %d running VMs", hypervisor.RunningVMs)
}

if hypervisor.Servers != nil && len(*hypervisor.Servers) > 0 {
// Still VMs assigned to the host, cannot delete the service
msg := fmt.Sprintf("Node is being decommissioned, but still has %d assigned VMs, "+
"check with `openstack server list --all-projects --host %s`", len(*hypervisor.Servers), hostname)
return r.setDecommissioningCondition(ctx, hv, msg)
return fmt.Errorf("node is being decommissioned, but still has %d assigned VMs, "+
"check with `openstack server list --all-projects --host %s`", len(*hypervisor.Servers), hv.Name)
}

// Before removing the service, first take the node out of the aggregates,
// so when the node comes back, it doesn't up with the old associations
aggs, err := aggregatesByName(ctx, r.computeClient)
// Before removing the service, first take the hypervisor out of the aggregates,
// so when the hypervisor comes back, it doesn't up with the old associations
aggs, err := openstack.GetAggregatesByName(ctx, r.computeClient)
if err != nil {
return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot list aggregates due to %v", err))
return fmt.Errorf("cannot list aggregates due to: %w", err)
}

host := node.Name
host := hv.Name
for name, aggregate := range aggs {
if slices.Contains(aggregate.Hosts, host) {
opts := aggregates.RemoveHostOpts{Host: host}
if err = aggregates.RemoveHost(ctx, r.computeClient, aggregate.ID, opts).Err; err != nil {
msg := fmt.Sprintf("failed to remove host %v from aggregate %v due to %v", name, host, err)
return r.setDecommissioningCondition(ctx, hv, msg)
return fmt.Errorf("failed to remove host %v from aggregate %v due to %w", name, host, err)
}
}
}

// Deleting and evicted, so better delete the service
err = services.Delete(ctx, r.computeClient, hypervisor.Service.ID).ExtractErr()
if err != nil && !gophercloud.ResponseCodeIs(err, http.StatusNotFound) {
msg := fmt.Sprintf("cannot delete service %s due to %v", hypervisor.Service.ID, err)
return r.setDecommissioningCondition(ctx, hv, msg)
return fmt.Errorf("cannot delete service %s due to %w", hypervisor.Service.ID, err)
}

rp, err := resourceproviders.Get(ctx, r.placementClient, hypervisor.ID).Extract()
if err != nil && !gophercloud.ResponseCodeIs(err, http.StatusNotFound) {
return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot get resource provider: %v", err))
return fmt.Errorf("cannot get resource provider %s due to %w", hypervisor.ID, err)
}

if err = openstack.CleanupResourceProvider(ctx, r.placementClient, rp); err != nil {
return r.setDecommissioningCondition(ctx, hv, fmt.Sprintf("cannot clean up resource provider: %v", err))
}

return r.removeFinalizer(ctx, node)
}

func (r *NodeDecommissionReconciler) removeFinalizer(ctx context.Context, node *corev1.Node) (ctrl.Result, error) {
if !controllerutil.ContainsFinalizer(node, decommissionFinalizerName) {
return ctrl.Result{}, nil
return fmt.Errorf("cannot cleanup resource provider: %w", err)
}

nodeBase := node.DeepCopy()
controllerutil.RemoveFinalizer(node, decommissionFinalizerName)
err := r.Patch(ctx, node, k8sclient.MergeFromWithOptions(nodeBase,
k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName))
return ctrl.Result{}, err
}

func (r *NodeDecommissionReconciler) setDecommissioningCondition(ctx context.Context, hv *kvmv1.Hypervisor, message string) (ctrl.Result, error) {
base := hv.DeepCopy()
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
Type: kvmv1.ConditionTypeReady,
Status: metav1.ConditionFalse,
Reason: "Decommissioning",
Message: message,
})
if err := r.Status().Patch(ctx, hv, k8sclient.MergeFromWithOptions(base,
k8sclient.MergeFromWithOptimisticLock{}), k8sclient.FieldOwner(DecommissionControllerName)); err != nil {
return ctrl.Result{}, fmt.Errorf("cannot update hypervisor status due to %w", err)
}
return ctrl.Result{RequeueAfter: shortRetryTime}, nil
return nil
}

// SetupWithManager sets up the controller with the Manager.
@@ -217,6 +179,8 @@ func (r *NodeDecommissionReconciler) SetupWithManager(mgr ctrl.Manager) error {

return ctrl.NewControllerManagedBy(mgr).
Named(DecommissionControllerName).
For(&corev1.Node{}).
For(&kvmv1.Hypervisor{}).
WithEventFilter(utils.HypervisorTerminationPredicate).
WithEventFilter(utils.LifecycleEnabledPredicate).
Complete(r)
}
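
Editor's note, not part of the diff: the event filters wired up in SetupWithManager live in internal/utils and are not shown in this view. LifecycleEnabledPredicate presumably admits only Hypervisor objects with lifecycle management enabled, roughly along these lines; the Spec.LifecycleEnabled field name is taken from the removed code above, and the implementation itself is an assumption.

package utils

import (
	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/predicate"

	kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
)

// Assumed implementation: pass events through only for Hypervisor objects
// that have lifecycle management enabled in their spec.
var LifecycleEnabledPredicate = predicate.NewPredicateFuncs(func(obj k8sclient.Object) bool {
	hv, ok := obj.(*kvmv1.Hypervisor)
	return ok && hv.Spec.LifecycleEnabled
})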