From b248219e84fbcd87219e049fcb1ea31a6db9c84c Mon Sep 17 00:00:00 2001 From: Jason Lynch Date: Sun, 25 Jan 2026 11:43:02 -0500 Subject: [PATCH 1/2] ci: rerun flaky tests Simplifies our Makefile by combining some targets and adds an option to rerun flaky tests. This option is useful because we have some occasional port conflict issues when the OS assigns an ephemeral port that we're going to use for an instance of Etcd or other service. These issues are extremely difficult to avoid because we can't easily eliminate the gap between our port allocation and when the service binds to the ports. Simply re-running the tests is a much easier option. --- .circleci/config.yml | 6 ++-- Makefile | 67 ++++++++++++++++++++------------------------ common.mk | 2 +- 3 files changed, 35 insertions(+), 40 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4b7d77e3..3392ea8f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -63,19 +63,19 @@ jobs: - common_setup - run: name: Run CI checks - command: make ci + command: make ci TEST_RERUN_FAILS=2 - run: name: Run cluster tests command: | make docker-swarm-init make start-local-registry make buildx-init - make test-cluster-ci CONTROL_PLANE_IMAGE_REPO=172.17.0.1:5000/control-plane + make test-cluster CONTROL_PLANE_IMAGE_REPO=172.17.0.1:5000/control-plane TEST_RERUN_FAILS=2 - run: name: Run e2e tests with Docker Compose command: | make ci-compose-detached - make test-e2e-ci E2E_PARALLEL=4 E2E_DEBUG=1 E2E_FIXTURE=ci + make test-e2e E2E_PARALLEL=4 E2E_DEBUG=1 E2E_FIXTURE=ci TEST_RERUN_FAILS=2 - run: name: Archive debug output command: | diff --git a/Makefile b/Makefile index 47ed38aa..edb6c90f 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,12 @@ include common.mk # Overridable vars +CI ?= false DEBUG ?= 0 LOG_LEVEL ?= info DEV_IMAGE_REPO ?= ghcr.io/pgedge CONTROL_PLANE_IMAGE_REPO ?= host.docker.internal:5000/control-plane +TEST_RERUN_FAILS ?= 0 E2E_FIXTURE ?= E2E_PARALLEL ?= 8 E2E_RUN ?= @@ -30,7 +32,7 
@@ docker_compose_dev=WORKSPACE_DIR=$(shell pwd) \ DEV_IMAGE_REPO=$(DEV_IMAGE_REPO) \ docker compose -f ./docker/control-plane-dev/docker-compose.yaml docker_compose_ci=docker compose -f ./docker/control-plane-ci/docker-compose.yaml -e2e_args=-tags=e2e_test -count=1 -timeout=45m ./e2e/... \ +e2e_args=-tags=e2e_test -count=1 -timeout=45m \ $(if $(E2E_PARALLEL),-parallel $(E2E_PARALLEL)) \ $(if $(E2E_RUN),-run $(E2E_RUN)) \ -args \ @@ -39,7 +41,7 @@ e2e_args=-tags=e2e_test -count=1 -timeout=45m ./e2e/... \ $(if $(filter 1,$(E2E_DEBUG)),-debug) \ $(if $(E2E_DEBUG_DIR),-debug-dir $(E2E_DEBUG_DIR)) -cluster_test_args=-tags=cluster_test -count=1 -timeout=10m ./clustertest/... \ +cluster_test_args=-tags=cluster_test -count=1 -timeout=10m \ $(if $(CLUSTER_TEST_PARALLEL),-parallel $(CLUSTER_TEST_PARALLEL)) \ $(if $(CLUSTER_TEST_RUN),-run $(CLUSTER_TEST_RUN)) \ -args \ @@ -48,6 +50,14 @@ cluster_test_args=-tags=cluster_test -count=1 -timeout=10m ./clustertest/... \ $(if $(CLUSTER_TEST_IMAGE_TAG),-image-tag $(CLUSTER_TEST_IMAGE_TAG)) \ $(if $(CLUSTER_TEST_DATA_DIR),-data-dir $(CLUSTER_TEST_DATA_DIR)) +# Automatically adds junit output named after the rule, e.g. +# 'test-e2e-results.xml' in CI environment. +gotestsum=$(gobin)/gotestsum \ + $(if $(filter true,$(CI)),--junitfile $@-results.xml) + +golangci-lint=$(gobin)/golangci-lint \ + $(if $(filter true,$(CI)),--output.junit-xml.path $@-results.xml) + .DEFAULT_GOAL := build ########### @@ -58,47 +68,45 @@ cluster_test_args=-tags=cluster_test -count=1 -timeout=10m ./clustertest/... \ test: $(gotestsum) \ --format-hide-empty-pkg \ - ./... + --rerun-fails=$(TEST_RERUN_FAILS) \ + --packages='./...' .PHONY: test-etcd test-etcd-lifecycle: $(gotestsum) \ --format-hide-empty-pkg \ + --rerun-fails=$(TEST_RERUN_FAILS) \ + --packages='./server/internal/etcd/...' \ -- \ - -tags=etcd_lifecycle_test \ - ./server/internal/etcd/... 
+ -tags=etcd_lifecycle_test .PHONY: test-workflows-backend test-workflows-backend: $(gotestsum) \ --format-hide-empty-pkg \ + --rerun-fails=$(TEST_RERUN_FAILS) \ + --packages='./server/internal/workflows/backend/etcd/...' \ -- \ - -tags=workflows_backend_test \ - ./server/internal/workflows/backend/etcd/... + -tags=workflows_backend_test .PHONY: test-ci test-ci: $(gotestsum) \ --format-hide-empty-pkg \ --junitfile test-results.xml \ + --rerun-fails=$(TEST_RERUN_FAILS) \ + --packages='./...' \ -- \ - -tags=workflows_backend_test,etcd_lifecycle_test \ - ./... + -tags=workflows_backend_test,etcd_lifecycle_test .PHONY: test-e2e test-e2e: $(gotestsum) \ --format-hide-empty-pkg \ --format standard-verbose \ - -- \ - $(e2e_args) - -.PHONY: test-e2e-ci -test-e2e-ci: - $(gotestsum) \ - --format-hide-empty-pkg \ - --format standard-verbose \ - --junitfile e2e-test-results.xml \ + --rerun-fails=$(TEST_RERUN_FAILS) \ + --rerun-fails-max-failures=4 \ + --packages='./e2e/...' \ -- \ $(e2e_args) @@ -108,28 +116,15 @@ test-cluster: $(gotestsum) \ --format-hide-empty-pkg \ --format standard-verbose \ - -- \ - $(cluster_test_args) - -.PHONY: test-cluster-ci -test-cluster-ci: - CONTROL_PLANE_VERSION="$(CONTROL_PLANE_VERSION)" \ - $(gotestsum) \ - --format-hide-empty-pkg \ - --format standard-verbose \ - --junitfile cluster-test-results.xml \ + --rerun-fails=$(TEST_RERUN_FAILS) \ + --rerun-fails-max-failures=4 \ + --packages='./clustertest/...' \ -- \ $(cluster_test_args) .PHONY: lint lint: - $(golangcilint) run ./... - -.PHONY: lint-ci -lint-ci: - $(golangcilint) run \ - --output.junit-xml.path lint-results.xml \ - ./... + $(golangci-lint) run ./... # Exclude some dependencies from NOTICE.txt generation # - github.com/pgEdge/control-plane is our own code @@ -155,7 +150,7 @@ licenses-ci: licenses @echo "NOTICE.txt is up to date." 
.PHONY: ci -ci: test-ci lint-ci licenses-ci +ci: test-ci lint licenses-ci ################ # e2e fixtures # diff --git a/common.mk b/common.mk index 06187977..3a924e89 100644 --- a/common.mk +++ b/common.mk @@ -2,7 +2,7 @@ first_gopath=$(firstword $(subst :, ,$(shell go env GOPATH))) gobin=$(or $(shell go env GOBIN),$(first_gopath)/bin) gotestsum=$(gobin)/gotestsum -golangcilint=$(gobin)/golangci-lint +golangci-lint=$(gobin)/golangci-lint goa=$(gobin)/goa goreleaser=$(gobin)/goreleaser changie=$(gobin)/changie From 146650c96113ef0a87a31ed590783d9b7358e842 Mon Sep 17 00:00:00 2001 From: Jason Lynch Date: Sun, 25 Jan 2026 11:48:07 -0500 Subject: [PATCH 2/2] test: improve cluster test logging --- clustertest/cluster_test.go | 6 +++++- clustertest/host_test.go | 28 +++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/clustertest/cluster_test.go b/clustertest/cluster_test.go index 9e45f17f..b2fd5dd6 100644 --- a/clustertest/cluster_test.go +++ b/clustertest/cluster_test.go @@ -3,6 +3,7 @@ package clustertest import ( + "encoding/json" "maps" "testing" @@ -78,7 +79,10 @@ func (c *Cluster) AssertHealthy(t testing.TB) { foundHosts := map[string]bool{} for _, host := range resp.Hosts { - assert.Equal(t, "healthy", host.Status.State) + components, err := json.MarshalIndent(host.Status.Components, "\t", " ") + require.NoError(t, err) + + assert.Equal(t, "healthy", host.Status.State, "host '%s' has unhealthy status '%s', components: %s", host.ID, host.Status.State, string(components)) foundHosts[string(host.ID)] = true } for hostID := range c.hosts { diff --git a/clustertest/host_test.go b/clustertest/host_test.go index c559f31f..375cfdca 100644 --- a/clustertest/host_test.go +++ b/clustertest/host_test.go @@ -118,7 +118,10 @@ func NewHost(t testing.TB, config HostConfig) *Host { Started: true, }, ) - require.NoError(t, err) + if err != nil { + printContainerLogs(t.Context(), t, id, ctr) + t.Fatal(err) + } h := &Host{ id: id, @@ -133,12 
+136,8 @@ func NewHost(t testing.TB, config HostConfig) *Host {
 		defer cancel()
 
 		if t.Failed() {
-			logs, err := containerLogs(ctx, t, h.container)
-			if err != nil {
-				tLogf(t, "failed to extract container logs: %s", err)
-			} else {
-				tLogf(t, "host %s logs: %s", id, logs)
-			}
+
+			printContainerLogs(ctx, t, id, h.container)
 		}
 
 		if testConfig.skipCleanup {
@@ -288,6 +287,28 @@ func (h *Host) RecreateWithMode(t testing.TB, newMode EtcdMode) {
 	h.port = ports[0]
 }
 
+// printContainerLogs writes the logs of the container backing hostID to the
+// test log. A nil container, or a failure to extract the logs, is reported via
+// tLog/tLogf and the function returns without doing anything else.
+//
+// The logs are fetched with the caller's ctx, not t.Context(): t.Context() is
+// canceled just before Cleanup-registered functions run, so a caller that
+// prints logs during cleanup has to be able to supply a context that is live.
+func printContainerLogs(ctx context.Context, t testing.TB, hostID string, container testcontainers.Container) {
+	t.Helper()
+
+	if container == nil {
+		tLog(t, "container is nil")
+		return
+	}
+	logs, err := containerLogs(ctx, t, container)
+	if err != nil {
+		tLogf(t, "failed to extract container logs: %s", err)
+		return
+	}
+	tLogf(t, "host %s logs: %s", hostID, logs)
+}
+
 func containerLogs(ctx context.Context, t testing.TB, container testcontainers.Container) (string, error) {
 	t.Helper()
 