From 912519525c98643916fd0fd79c9fd778d4a30216 Mon Sep 17 00:00:00 2001
From: Jason Lynch
Date: Wed, 28 Jan 2026 12:02:37 -0500
Subject: [PATCH 1/3] chore: add disaster simulation script

Adds a script to simulate losing a host. This script has three different
ways of simulating that loss to enable us to develop recovery steps for
Swarm and Control Plane/Etcd in parallel.
---
 hack/simulate-disaster.sh | 130 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100755 hack/simulate-disaster.sh

diff --git a/hack/simulate-disaster.sh b/hack/simulate-disaster.sh
new file mode 100755
index 00000000..c8def776
--- /dev/null
+++ b/hack/simulate-disaster.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+
+set -o errexit
+set -o pipefail
+
+# Simulates losing a Docker Swarm node, but retains all Etcd data so that we can
+# focus on just the Docker Swarm and database instance recovery steps
+simulate_swarm_node_loss() {
+	local host_id
+
+	for host_id in $@; do
+		echo "=== simulating swarm node loss on ${host_id} ==="
+		echo
+
+		ssh -T -F ~/.lima/${host_id}/ssh.config lima-${host_id} <<-'EOF'
+			if [[ $(docker info --format '{{.Swarm.LocalNodeState}}') == "active" ]]; then
+				docker swarm leave --force
+			else
+				echo "node already left swarm"
+			fi
+			echo "removing instances data directory"
+			sudo rm -rf /data/control-plane/instances
+EOF
+		echo
+	done
+
+}
+
+# Simulates losing an Etcd node, but retains Docker Swarm so that we can focus
+# on just the Control Plane and database instance recovery steps
+simulate_etcd_node_loss() {
+	local host_id
+
+	for host_id in $@; do
+		echo "=== simulating etcd node loss on ${host_id} ==="
+		echo
+
+		# We're using xargs here to gracefully ignore when the service does not
+		# exist
+		ssh -T -F ~/.lima/${host_id}/ssh.config lima-${host_id} <<-EOF
+			echo "removing control-plane swarm service"
+			docker service ls \
+				--filter 'name=control-plane_${host_id}' \
+				--format '{{ .Name }}' \
+				| xargs -r docker service rm
+			echo "removing control-plane data directory"
+			sudo rm -rf /data/control-plane
+EOF
+		echo
+	done
+
+}
+
+# This is most similar to a real disaster recovery scenario. We're losing the
+# entire machine as well as all of its storage. Whatever replacement machine we
+# start up may or may not have the same IP address.
+simulate_full_loss() {
+	local host_id
+
+	for host_id in $@; do
+		echo "=== simulating full loss of ${host_id} ==="
+		echo
+
+		limactl stop ${host_id}
+		limactl delete ${host_id}
+
+		echo
+	done
+}
+
+usage() {
+cat <<EOF
+Usage: $1 <swarm|etcd|full> <host-id> [host-id ...]
+
+Simulates disasters against the Lima test fixtures. Supports three different
+types of disasters to enable us to develop some recovery steps in
+parallel:
+
+- swarm: simulates losing a Swarm node and database instance data without losing
+  Etcd data
+- etcd: simulates losing a Control Plane/Etcd instance without losing Swarm
+  quorum.
+- full: simulates losing an entire host, affecting both Swarm and Control
+  Plane/Etcd.
+
+NOTE: This is only intended to be run against swarm manager/etcd server hosts.
+
+Examples:
+	# Simulating losing Swarm on one host
+	$1 swarm host-1
+
+	# Simulate losing Swarm on two hosts in order to lose quorum
+	$1 swarm host-1 host-3
+
+	# Simulate losing Control Plane/Etcd on one host
+	$1 etcd host-1
+
+	# Simulate losing Control Plane/Etcd on two hosts in order to lose quorum
+	$1 etcd host-1 host-3
+
+	# Simulate full loss of one host
+	$1 full host-1
+
+	# Simulate full loss of two hosts to lose quorum
+	$1 full host-1 host-3
+EOF
+}
+
+main() {
+	case $1 in
+		swarm)
+			simulate_swarm_node_loss ${@:2}
+			;;
+		etcd)
+			simulate_etcd_node_loss ${@:2}
+			;;
+		full)
+			simulate_full_loss ${@:2}
+			;;
+		--help|-h)
+			usage $0
+			;;
+		*)
+			usage $0
+			exit 1
+			;;
+	esac
+}
+
+main $@

From 2385face6b9a50a304367ccfd796dac71626e42a Mon Sep 17 00:00:00 2001
From: Jason Lynch
Date: Thu, 29 Jan 2026 09:55:28 -0500
Subject: [PATCH 2/3] chore: add reset function to simulate-disaster.sh

Adds an option to reset the Lima E2E fixture back to its initial state
without tearing it down entirely. This can save a significant amount of
time between tests.
---
 hack/simulate-disaster.sh | 50 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/hack/simulate-disaster.sh b/hack/simulate-disaster.sh
index c8def776..9627d790 100755
--- a/hack/simulate-disaster.sh
+++ b/hack/simulate-disaster.sh
@@ -2,6 +2,12 @@
 
 set -o errexit
 set -o pipefail
+set -x
+
+script_dir=$( cd -- "$(dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+fixtures_dir="${script_dir}/../e2e/fixtures"
+fixture_variant="${FIXTURE_VARIANT:-large}"
+fixture_extra_vars="${FIXTURE_EXTRA_VARS}"
 
 # Simulates losing a Docker Swarm node, but retains all Etcd data so that we can
 # focus on just the Docker Swarm and database instance recovery steps
@@ -23,7 +29,6 @@ simulate_swarm_node_loss() {
 EOF
 		echo
 	done
-
 }
 
 # Simulates losing an Etcd node, but retains Docker Swarm so that we can focus
@@ -48,7 +53,6 @@ simulate_etcd_node_loss() {
 EOF
 		echo
 	done
-
 }
 
 # This is most similar to a real disaster recovery scenario. We're losing the
@@ -68,6 +72,39 @@ simulate_full_loss() {
 	done
 }
 
+# Resets Swarm and the Control Plane on all hosts and returns the Control Plane
+# to an uninitialized state.
+reset() {
+	echo "=== resetting all hosts ==="
+	echo
+
+	VARIANT="${fixture_variant}" \
+		EXTRA_VARS="${fixture_extra_vars}" \
+		make -C "${fixtures_dir}" \
+		deploy-lima-machines
+
+	for host_id in $(limactl ls | awk '$1~/^host-/ && $2 == "Running" { print $1 }'); do
+		echo "resetting swarm on ${host_id}"
+
+		ssh -T -F ~/.lima/${host_id}/ssh.config lima-${host_id} <<-'EOF'
+			if [[ $(docker info --format '{{.Swarm.LocalNodeState}}') == "active" ]]; then
+				docker swarm leave --force
+			else
+				echo "node already left swarm"
+			fi
+			echo "removing control-plane data directory"
+			sudo rm -rf /data/control-plane
+		EOF
+	done
+
+	VARIANT="${fixture_variant}" \
+		EXTRA_VARS="${fixture_extra_vars}" \
+		make -C "${fixtures_dir}" \
+		setup-lima-hosts \
+		teardown-lima-control-plane \
+		deploy-lima-control-plane
+}
+
 usage() {
 cat <<EOF
 Usage: $1 <swarm|etcd|full> <host-id> [host-id ...]
@@ -103,6 +140,12 @@ Examples:
 
 	# Simulate full loss of two hosts to lose quorum
 	$1 full host-1 host-3
+
+	# Reset the fixture back to its initial state
+	$1 reset
+
+	# Remember to include the fixture variant if you're using a non-default one
+	FIXTURE_VARIANT=small $1 reset
 EOF
 }
 
@@ -117,6 +160,9 @@ main() {
 		full)
 			simulate_full_loss ${@:2}
 			;;
+		reset)
+			reset
+			;;
 		--help|-h)
 			usage $0
 			;;

From 07d41dca78e023f2ad2e889f542e59d2bb82490c Mon Sep 17 00:00:00 2001
From: Jason Lynch
Date: Thu, 29 Jan 2026 13:36:12 -0500
Subject: [PATCH 3/3] chore: simulate-disaster improvements

- Fix etcd simulation for client-mode servers
- Remove database services in etcd simulation
- Rebuild control-plane in reset
- Remove `set -x`
---
 hack/simulate-disaster.sh | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/hack/simulate-disaster.sh b/hack/simulate-disaster.sh
index 9627d790..4f03402c 100755
--- a/hack/simulate-disaster.sh
+++ b/hack/simulate-disaster.sh
@@ -2,10 +2,10 @@
 
 set -o errexit
 set -o pipefail
-set -x
 
 script_dir=$( cd -- "$(dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
-fixtures_dir="${script_dir}/../e2e/fixtures"
+project_dir="${script_dir}/.."
+fixtures_dir="${project_dir}/e2e/fixtures"
 fixture_variant="${FIXTURE_VARIANT:-large}"
 fixture_extra_vars="${FIXTURE_EXTRA_VARS}"
 
@@ -40,14 +40,23 @@ simulate_etcd_node_loss() {
 		echo "=== simulating etcd node loss on ${host_id} ==="
 		echo
 
-		# We're using xargs here to gracefully ignore when the service does not
+		# We're using xargs here to gracefully ignore when the services do not
 		# exist
-		ssh -T -F ~/.lima/${host_id}/ssh.config lima-${host_id} <<-EOF
+		ssh -T -F ~/.lima/host-1/ssh.config lima-host-1 <<-EOF
 			echo "removing control-plane swarm service"
 			docker service ls \
 				--filter 'name=control-plane_${host_id}' \
 				--format '{{ .Name }}' \
 				| xargs -r docker service rm
+
+			echo "removing all database swarm services"
+			docker service ls \
+				--filter 'label=pgedge.host.id=${host_id}' \
+				--format '{{ .Name }}' \
+				| xargs -r docker service rm
+EOF
+
+		ssh -T -F ~/.lima/${host_id}/ssh.config lima-${host_id} <<-EOF
 			echo "removing control-plane data directory"
 			sudo rm -rf /data/control-plane
 EOF
@@ -97,6 +106,8 @@ reset() {
 		EOF
 	done
 
+	make -C "${project_dir}" goreleaser-build
+
 	VARIANT="${fixture_variant}" \
 		EXTRA_VARS="${fixture_extra_vars}" \
 		make -C "${fixtures_dir}" \