Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions hack/simulate-disaster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
#!/usr/bin/env bash

set -o errexit
set -o pipefail

script_dir=$( cd -- "$(dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
project_dir="${script_dir}/.."
fixtures_dir="${project_dir}/e2e/fixtures"
fixture_variant="${FIXTURE_VARIANT:-large}"
fixture_extra_vars="${FIXTURE_EXTRA_VARS}"

# Simulates losing a Docker Swarm node, but retains all Etcd data so that we can
# focus on just the Docker Swarm and database instance recovery steps
simulate_swarm_node_loss() {
local host_id

for host_id in $@; do
echo "=== simulating swarm node loss on ${host_id} ==="
echo

ssh -T -F ~/.lima/${host_id}/ssh.config lima-${host_id} <<-'EOF'
if [[ $(docker info --format '{{.Swarm.LocalNodeState}}') == "active" ]]; then
docker swarm leave --force
else
echo "node already left swarm"
fi
echo "removing instances data directory"
sudo rm -rf /data/control-plane/instances
EOF
echo
done
}

# Simulates losing an Etcd node, but retains Docker Swarm so that we can focus
# on just the Control Plane and database instance recovery steps
simulate_etcd_node_loss() {
local host_id

for host_id in $@; do
echo "=== simulating etcd node loss on ${host_id} ==="
echo

# We're using xargs here to gracefully ignore when the services do not
# exist
ssh -T -F ~/.lima/host-1/ssh.config lima-host-1 <<-EOF
echo "removing control-plane swarm service"
docker service ls \
--filter 'name=control-plane_${host_id}' \
--format '{{ .Name }}' \
| xargs -r docker service rm

echo "removing all database swarm services"
docker service ls \
--filter 'label=pgedge.host.id=${host_id}' \
--format '{{ .Name }}' \
| xargs -r docker service rm
EOF

ssh -T -F ~/.lima/${host_id}/ssh.config lima-${host_id} <<-EOF
echo "removing control-plane data directory"
sudo rm -rf /data/control-plane
EOF
echo
done
}

# This is most similar to a real disaster recovery scenario. We're losing the
# entire machine as well as all of its storage. Whatever replacement machine we
# start up may or may not have the same IP address.
simulate_full_loss() {
local host_id

for host_id in $@; do
echo "=== simulating full loss of ${host_id} ==="
echo

limactl stop ${host_id}
limactl delete ${host_id}

echo
done
}

# Resets Swarm and the Control Plane on all hosts and returns the Control Plane
# to an uninitialized state.
reset() {
echo "=== resetting all hosts ==="
echo

VARIANT="${fixture_variant}" \
EXTRA_VARS="${fixture_extra_vars}" \
make -C "${fixtures_dir}" \
deploy-lima-machines

for host_id in $(limactl ls | awk '$1~/^host-/ && $2 == "Running" { print $1 }'); do
echo "resetting swarm on ${host_id}"

ssh -T -F ~/.lima/${host_id}/ssh.config lima-${host_id} <<-'EOF'
if [[ $(docker info --format '{{.Swarm.LocalNodeState}}') == "active" ]]; then
docker swarm leave --force
else
echo "node already left swarm"
fi
echo "removing control-plane data directory"
sudo rm -rf /data/control-plane
EOF
done

make -C "${project_dir}" goreleaser-build

VARIANT="${fixture_variant}" \
EXTRA_VARS="${fixture_extra_vars}" \
make -C "${fixtures_dir}" \
setup-lima-hosts \
teardown-lima-control-plane \
deploy-lima-control-plane
}

usage() {
cat <<EOF
Usage: $1 <swarm|etcd|full> <host-id> [host-id ...]

Simulates disasters against the Lima test fixtures. Supports three different
different types of disasters to enable us to develop some recovery steps in
parallel:

- swarm: simulates losing a Swarm node and database instance data without losing
Etcd data
- etcd: simulates losing a Control Plane/Etcd instance without losing Swarm
quorum.
- full: simulates losing an entire host, affecting both Swarm and Control
Plane/Etcd.

NOTE: This is only intended to be run against swarm manager/etcd server hosts.

Examples:
# Simulating losing Swarm on one host
$1 swarm host-1

# Simulate losing Swarm on two hosts in order to lose quorum
$1 swarm host-1 host-3

# Simulate losing Control Plane/Etcd on one host
$1 etcd host-1

# Simulate losing Control Plane/Etcd on two hosts in order to lose quorum
$1 etcd host-1 host-3

# Simulate full loss of one host
$1 full host-1

# Simulate full loss of two hosts to lose quorum
$1 full host-1 host-3

# Reset the fixture back to its initial state
$1 reset

# Remember to include the fixture variant if you're using a non-default one
FIXTURE_VARIANT=small $1 reset
EOF
}

main() {
case $1 in
swarm)
simulate_swarm_node_loss ${@:2}
;;
etcd)
simulate_etcd_node_loss ${@:2}
;;
full)
simulate_full_node_loss ${@:2}
;;
reset)
reset
;;
--help|-h)
usage $0
;;
*)
usage $0
exit 1
;;
esac
}

main $@