From 8607f03b341266c22c5e09e14f56cafacff8ba10 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 17 Sep 2025 22:20:02 -0700 Subject: [PATCH 1/3] feat: initial commit for perf testing milo --- Taskfile.yaml | 136 +++ config/apiserver/deployment.yaml | 2 +- .../servicemonitor-etcd.yaml | 18 + config/dependencies/etcd/helmrelease.yaml | 2 +- .../components/auth/auth-tokens-secret.yaml | 2 +- test/performance/README.md | 121 +++ test/performance/config/perf-cleanup-job.yaml | 52 + test/performance/config/perf-runner-job.yaml | 76 ++ test/performance/config/perf-runner-rbac.yaml | 29 + test/performance/scripts/perf_run.py | 905 ++++++++++++++++++ 10 files changed, 1340 insertions(+), 3 deletions(-) create mode 100644 config/components/prometheus-monitoring/servicemonitor-etcd.yaml create mode 100644 test/performance/README.md create mode 100644 test/performance/config/perf-cleanup-job.yaml create mode 100644 test/performance/config/perf-runner-job.yaml create mode 100644 test/performance/config/perf-runner-rbac.yaml create mode 100644 test/performance/scripts/perf_run.py diff --git a/Taskfile.yaml b/Taskfile.yaml index 14b1bcb9..096173b8 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -497,3 +497,139 @@ tasks: echo "🎉 All Prometheus rule tests passed." fi silent: false + + perf:run: + desc: Run Milo end-to-end performance scenario and download results + silent: true + cmds: + - | + set -euo pipefail + # Parse CLI key=value overrides passed after -- and export as env + for kv in {{.CLI_ARGS}}; do + case "$kv" in + *=*) key="${kv%%=*}"; val="${kv#*=}"; export "$key=$val" ;; + *) : ;; # ignore non key=value tokens + esac + done + NS="${NS:-milo-system}" + MILO_NS="${MILO_NAMESPACE:-milo-system}" + VM_NS="${VM_NAMESPACE:-telemetry-system}" + VM_SVC_NAME="${VM_SERVICE_NAME:-vmsingle-telemetry-system-vm-victoria-metrics-k8s-stack}" + VM_PORT="${VM_PORT:-8428}" + VM_BASE_URL="${VM_BASE_URL:-http://${VM_SVC_NAME}.${VM_NS}.svc.cluster.local:${VM_PORT}}" + APISERVER_REGEX="${APISERVER_POD_REGEX:-milo-apiserver.*}" + ETCD_REGEX="${ETCD_POD_REGEX:-etcd.*}" + MILO_KUBECONFIG_SECRET_NAME="${MILO_KUBECONFIG_SECRET_NAME:-milo-controller-manager-kubeconfig}" + MILO_KUBECONFIG_SECRET_KEY="${MILO_KUBECONFIG_SECRET_KEY:-kubeconfig}" + MILO_KUBECONFIG_PATH="${MILO_KUBECONFIG_PATH:-/work/milo-kubeconfig}" + NUM_PROJECTS="${NUM_PROJECTS:-{{default "100" .NUM_PROJECTS}}}" + NUM_SECRETS_PER_PROJECT="${NUM_SECRETS_PER_PROJECT:-{{default "100" .NUM_SECRETS_PER_PROJECT}}}" + NUM_CONFIGMAPS_PER_PROJECT="${NUM_CONFIGMAPS_PER_PROJECT:-{{default "100" .NUM_CONFIGMAPS_PER_PROJECT}}}" + PROJECT_CONCURRENCY="${PROJECT_CONCURRENCY:-{{default "4" .PROJECT_CONCURRENCY}}}" + OBJECT_CONCURRENCY="${OBJECT_CONCURRENCY:-{{default "8" .OBJECT_CONCURRENCY}}}" + RUN_OBJECTS_PHASE="${RUN_OBJECTS_PHASE:-{{default "true" .RUN_OBJECTS_PHASE}}}" + OUT_DIR="${OUT_DIR:-{{default "/work/out" .OUT_DIR}}}" + STABILIZE_SECONDS="${STABILIZE_SECONDS:-{{default "90" .STABILIZE_SECONDS}}}" + MEASURE_WINDOW="${MEASURE_WINDOW:-{{default "2m" .MEASURE_WINDOW}}}" + ORG_NAME="${ORG_NAME:-{{default "" .ORG_NAME}}}" + + echo "🔎 Checking Milo kubeconfig …" + if [ ! -f ".milo/kubeconfig" ]; then + echo "Error: .milo/kubeconfig not found. Run 'task dev:setup' first." 
>&2 + exit 1 + fi + + echo "🔐 Ensuring perf-runner RBAC is applied …" + sed "s/NAMESPACE_PLACEHOLDER/${NS}/g" test/performance/config/perf-runner-rbac.yaml | task test-infra:kubectl -- apply -f - + + echo "🗂 Publishing perf script as ConfigMap …" + task test-infra:kubectl -- -n ${NS} create configmap perf-script \ + --from-file=perf_run.py=test/performance/scripts/perf_run.py \ + --dry-run=client -o yaml | task test-infra:kubectl -- apply -f - + + echo "🚀 Launching perf runner Job …" + sed \ + -e "s/MILO_NAMESPACE_PLACEHOLDER/${MILO_NS}/g" \ + -e "s/NAMESPACE_PLACEHOLDER/${NS}/g" \ + -e "s#VM_BASE_URL_PLACEHOLDER#${VM_BASE_URL}#g" \ + -e "s/APISERVER_REGEX_PLACEHOLDER/${APISERVER_REGEX}/g" \ + -e "s/ETCD_REGEX_PLACEHOLDER/${ETCD_REGEX}/g" \ + -e "s/NUM_PROJECTS_PLACEHOLDER/${NUM_PROJECTS}/g" \ + -e "s/NUM_SECRETS_PLACEHOLDER/${NUM_SECRETS_PER_PROJECT}/g" \ + -e "s/NUM_CONFIGMAPS_PLACEHOLDER/${NUM_CONFIGMAPS_PER_PROJECT}/g" \ + -e "s/STABILIZE_SECONDS_PLACEHOLDER/${STABILIZE_SECONDS}/g" \ + -e "s/MEASURE_WINDOW_PLACEHOLDER/${MEASURE_WINDOW}/g" \ + -e "s/ORG_NAME_PLACEHOLDER/${ORG_NAME}/g" \ + -e "s/PROJECT_CONCURRENCY_PLACEHOLDER/${PROJECT_CONCURRENCY}/g" \ + -e "s/OBJECT_CONCURRENCY_PLACEHOLDER/${OBJECT_CONCURRENCY}/g" \ + -e "s/RUN_OBJECTS_PHASE_PLACEHOLDER/${RUN_OBJECTS_PHASE}/g" \ + -e "s/MILO_KUBECONFIG_SECRET_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_NAME}/g" \ + -e "s/MILO_KUBECONFIG_KEY_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_KEY}/g" \ + test/performance/scripts/perf-runner-job.yaml | task test-infra:kubectl -- apply -f - + + echo "⏳ Waiting for Job completion …" + task test-infra:kubectl -- -n ${NS} wait --for=condition=Complete job/perf-runner --timeout=45m + + echo "⬇️ Downloading results …" + mkdir -p reports/perf + # Prefer ConfigMap (works even if pod already terminated) + TEST_ID=$(task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.test_id}' 2>/dev/null || true) + OUT_DIR_LOCAL="reports/perf/${TEST_ID:-latest}" + mkdir -p "$OUT_DIR_LOCAL" + task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.results\.json}' > "$OUT_DIR_LOCAL/results.json" || true + task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.report\.html}' > "$OUT_DIR_LOCAL/report.html" || true + # Fallback to copying from the pod if ConfigMap wasn't available + if [ ! -s "$OUT_DIR_LOCAL/results.json" ] || [ ! -s "$OUT_DIR_LOCAL/report.html" ]; then + POD=$(task test-infra:kubectl -- -n ${NS} get pods -l job-name=perf-runner -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + if [ -n "$POD" ]; then + task test-infra:kubectl -- -n ${NS} cp "$POD:/work/out/results.json" "$OUT_DIR_LOCAL/results.json" || true + task test-infra:kubectl -- -n ${NS} cp "$POD:/work/out/report.html" "$OUT_DIR_LOCAL/report.html" || true + fi + fi + echo "✅ Results saved to $OUT_DIR_LOCAL" + + perf:cleanup: + desc: Cleanup resources created by the last perf run (org/projects/secrets/configmaps) + silent: true + cmds: + - | + set -euo pipefail + NS="${NS:-milo-system}" + MILO_KUBECONFIG_SECRET_NAME="${MILO_KUBECONFIG_SECRET_NAME:-milo-controller-manager-kubeconfig}" + MILO_KUBECONFIG_SECRET_KEY="${MILO_KUBECONFIG_SECRET_KEY:-kubeconfig}" + MILO_KUBECONFIG_PATH="${MILO_KUBECONFIG_PATH:-/work/milo-kubeconfig}" + + if [ ! -f ".milo/kubeconfig" ]; then + echo "Error: .milo/kubeconfig not found. Run 'task dev:setup' first." 
>&2 + exit 1 + fi + + echo "🔎 Discovering last test identifiers …" + # Allow override from CLI envs if ConfigMap isn't present + TEST_ID_CM=$(task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.test_id}' 2>/dev/null || true) + ORG_NAME_CM=$(task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.org_name}' 2>/dev/null || true) + TEST_ID="${TEST_ID:-$TEST_ID_CM}" + ORG_NAME="${ORG_NAME:-$ORG_NAME_CM}" + if [ -z "${TEST_ID}" ] || [ -z "${ORG_NAME}" ]; then + echo "No existing results found in namespace ${NS} (ConfigMap perf-results). Nothing to cleanup." + exit 0 + fi + + echo "🚮 Launching cleanup Job for test ${TEST_ID} …" + sed \ + -e "s/NAMESPACE_PLACEHOLDER/${NS}/g" \ + -e "s/TEST_ID_PLACEHOLDER/${TEST_ID}/g" \ + -e "s/ORG_NAME_PLACEHOLDER/${ORG_NAME}/g" \ + -e "s/MILO_KUBECONFIG_SECRET_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_NAME}/g" \ + -e "s/MILO_KUBECONFIG_KEY_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_KEY}/g" \ + test/performance/config/perf-cleanup-job.yaml | task test-infra:kubectl -- apply -f - + + echo "⏳ Waiting for cleanup Job completion …" + task test-infra:kubectl -- -n ${NS} wait --for=condition=Complete job/perf-cleanup --timeout=30m + + echo "🧹 Removing runner artifacts (keeping downloaded results) …" + task test-infra:kubectl -- -n ${NS} delete job/perf-runner --ignore-not-found + task test-infra:kubectl -- -n ${NS} delete job/perf-cleanup --ignore-not-found + task test-infra:kubectl -- -n ${NS} delete configmap perf-script --ignore-not-found + task test-infra:kubectl -- -n ${NS} delete configmap perf-results --ignore-not-found + echo "✅ Cleanup complete." diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml index dd4ee1e7..28717530 100644 --- a/config/apiserver/deployment.yaml +++ b/config/apiserver/deployment.yaml @@ -135,7 +135,7 @@ spec: memory: 128Mi limits: cpu: 500m - memory: 512Mi + memory: 2G startupProbe: failureThreshold: 3 httpGet: diff --git a/config/components/prometheus-monitoring/servicemonitor-etcd.yaml b/config/components/prometheus-monitoring/servicemonitor-etcd.yaml new file mode 100644 index 00000000..f08607e0 --- /dev/null +++ b/config/components/prometheus-monitoring/servicemonitor-etcd.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: etcd-metrics + namespace: milo-system +spec: + namespaceSelector: + matchNames: ["milo-system"] + selector: + matchLabels: + app.kubernetes.io/component: etcd + app.kubernetes.io/name: etcd + endpoints: + - port: client + path: /metrics + scheme: http + interval: 15s + diff --git a/config/dependencies/etcd/helmrelease.yaml b/config/dependencies/etcd/helmrelease.yaml index d6c25628..b4a4e506 100644 --- a/config/dependencies/etcd/helmrelease.yaml +++ b/config/dependencies/etcd/helmrelease.yaml @@ -35,7 +35,7 @@ spec: resources: limits: cpu: 500m - memory: 512Mi + memory: 2G requests: cpu: 200m memory: 256Mi diff --git a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml index dd9d855f..d349d947 100644 --- a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml +++ b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml @@ -5,5 +5,5 @@ metadata: type: Opaque stringData: tokens.csv: | - test-admin-token,admin,1001,"system:masters" + test-admin-token,admin,admin,"system:masters" test-user-token,test-user,1002,"system:authenticated" diff --git a/test/performance/README.md b/test/performance/README.md 
new file mode 100644 index 00000000..882fee18 --- /dev/null +++ b/test/performance/README.md @@ -0,0 +1,121 @@ +### Milo performance runner + +This performance suite provisions Milo/Etcd service monitors and measures CPU/Memory snapshots from VictoriaMetrics. + +Files and structure: +- performance/scripts/perf_run.py: runner script executed inside a Kubernetes Job +- performance/config/perf-runner-job.yaml: Job template for the run phase +- performance/config/perf-cleanup-job.yaml: Job template for cleanup +- performance/config/perf-runner-rbac.yaml: ServiceAccount/Role/RoleBinding used by the jobs + +#### Summary + +- Creates a Milo `Organization`, then N `Projects`, waits for all to be Ready, and times it. +- Takes metrics snapshots before (baseline), after projects are ready, and optionally after per-project object creation. +- Optionally creates M `Secrets` and K `ConfigMaps` in each Project (parallelized), then measures again. +- Saves results to a ConfigMap and downloads a local HTML report and JSON. + +#### Prerequisites + +1) Bring up dev stack and observability: + +```bash +task dev:setup && task dev:install-observability +``` + +2) Ensure a Milo kubeconfig secret exists in your cluster. By default the tasks mount `Secret/milo-controller-manager-kubeconfig` (key `kubeconfig`). You can override via env (see knobs below). + +#### How to run + +- Full run (org + projects + objects) with defaults: + +```bash +task perf:run +``` + +- Projects-only (skip secrets/configmaps) and higher parallelism: + +```bash +task perf:run -- RUN_OBJECTS_PHASE=false PROJECT_CONCURRENCY=10 +``` + +- Cleanup all resources from the last run: + +```bash +task perf:cleanup +``` + +#### Outputs + +- In-cluster: ConfigMap `perf-results` in `NS` (default `milo-system`) with keys `results.json`, `report.html`, `test_id`, `org_name`. +- Local: `reports/perf//results.json` and `report.html` downloaded by the task after the Job completes. The HTML report includes grouped bar charts (CPU cores and Memory MB) and per-project delta KPIs for apiserver and etcd. + +#### What the runner does + +1) Baseline: query VictoriaMetrics for Milo apiserver and etcd CPU/memory. +2) Create Organization (no wait), then create N Projects, wait for all Projects Ready; record duration. +3) Stabilize, then snapshot “after projects”. +4) If enabled, create per-Project objects (Secrets/ConfigMaps) concurrently; stabilize, then snapshot “after secrets+configmaps”. + +Snapshots come from VictoriaMetrics using `container_cpu_usage_seconds_total` (rate) and `container_memory_working_set_bytes` (avg_over_time) for pods matching the configured namespace and pod name regexes. + +#### Configuration knobs (env vars) + +Pass on the `task perf:run -- KEY=value ...` command line. Defaults shown in parentheses. + +- Resource selection + - `NS` (milo-system): Namespace to run Job and store results ConfigMap + - `MILO_NAMESPACE` (milo-system): Namespace to measure apiserver/etcd pods + - `APISERVER_POD_REGEX` (milo-apiserver.*): Regex for apiserver pods + - `ETCD_POD_REGEX` (etcd.*): Regex for etcd pods + +- Metrics source (VictoriaMetrics) + - `VM_NAMESPACE` (telemetry-system) + - `VM_SERVICE_NAME` (vmsingle-telemetry-system-vm-victoria-metrics-k8s-stack) + - `VM_PORT` (8428) + - `VM_BASE_URL` (optional override, e.g. http://hostname:8428). Default uses in-cluster FQDN: `http://..svc.cluster.local:8428`. 
+ - `MEASURE_WINDOW` (2m): Range window for rate/avg_over_time + +- Scale and workload + - `NUM_PROJECTS` (100) + - `RUN_OBJECTS_PHASE` (true): Toggle per-project Secrets/ConfigMaps phase + - `NUM_SECRETS_PER_PROJECT` (100) + - `NUM_CONFIGMAPS_PER_PROJECT` (100) + - `PROJECT_CONCURRENCY` (4): Number of projects processed in parallel when creating objects + - `OBJECT_CONCURRENCY` (8): Secrets/ConfigMaps parallelism inside each project + +- Stabilization windows + - `STABILIZE_SECONDS` (90): Sleep before snapshots after Projects and after Objects + +- Identity / scoping + - `ORG_NAME` (auto-generated): Name of Organization to create + - `MILO_KUBECONFIG_SECRET_NAME` (milo-controller-manager-kubeconfig): Secret containing Milo kubeconfig + - `MILO_KUBECONFIG_SECRET_KEY` (kubeconfig): Secret key with kubeconfig content + - `MILO_KUBECONFIG_PATH` (/work/milo-kubeconfig): In-container path to mount kubeconfig + - `AUTH_BEARER_TOKEN` (optional): Override token injected into kubeconfig user for troubleshooting + +#### Examples + +- Measure project-only impact: + +```bash +task perf:run -- RUN_OBJECTS_PHASE=false STABILIZE_SECONDS=60 NUM_PROJECTS=200 +``` + +- Heavier objects phase, more parallelism: + +```bash +task perf:run -- NUM_SECRETS_PER_PROJECT=500 NUM_CONFIGMAPS_PER_PROJECT=500 PROJECT_CONCURRENCY=12 OBJECT_CONCURRENCY=24 +``` + +- Point to a custom VictoriaMetrics endpoint: + +```bash +task perf:run -- VM_BASE_URL=http://vm.my-domain.local:8428 +``` + +- Use a specific Organization name: + +```bash +task perf:run -- ORG_NAME=perf-cow +``` diff --git a/test/performance/config/perf-cleanup-job.yaml b/test/performance/config/perf-cleanup-job.yaml new file mode 100644 index 00000000..7bc1b327 --- /dev/null +++ b/test/performance/config/perf-cleanup-job.yaml @@ -0,0 +1,52 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: perf-cleanup + namespace: NAMESPACE_PLACEHOLDER +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 0 + template: + spec: + serviceAccountName: perf-runner + restartPolicy: Never + containers: + - name: cleanup + image: python:3.11 + imagePullPolicy: IfNotPresent + env: + - name: TARGET_NAMESPACE + value: NAMESPACE_PLACEHOLDER + - name: RUN_MODE + value: cleanup + - name: TEST_ID + value: "TEST_ID_PLACEHOLDER" + - name: ORG_NAME + value: "ORG_NAME_PLACEHOLDER" + - name: MILO_KUBECONFIG_PATH + value: "/work/milo-kubeconfig" + volumeMounts: + - name: script + mountPath: /work/perf_run.py + subPath: perf_run.py + readOnly: true + - name: milo-kubeconfig + mountPath: /work/milo-kubeconfig + subPath: MILO_KUBECONFIG_KEY_PLACEHOLDER + readOnly: true + command: ["bash","-lc"] + args: + - >- + python -m pip install --no-cache-dir kubernetes requests pyyaml && + python -u /work/perf_run.py + volumes: + - name: script + configMap: + name: perf-script + defaultMode: 0444 + - name: milo-kubeconfig + secret: + secretName: MILO_KUBECONFIG_SECRET_PLACEHOLDER + defaultMode: 0400 + + diff --git a/test/performance/config/perf-runner-job.yaml b/test/performance/config/perf-runner-job.yaml new file mode 100644 index 00000000..f9c1da5c --- /dev/null +++ b/test/performance/config/perf-runner-job.yaml @@ -0,0 +1,76 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: perf-runner + namespace: NAMESPACE_PLACEHOLDER +spec: + ttlSecondsAfterFinished: 600 + backoffLimit: 0 + template: + spec: + serviceAccountName: perf-runner + restartPolicy: Never + containers: + - name: runner + image: python:3.11 + imagePullPolicy: IfNotPresent + env: + - name: TARGET_NAMESPACE + value: NAMESPACE_PLACEHOLDER 
+ - name: VM_BASE_URL + value: VM_BASE_URL_PLACEHOLDER + - name: MILO_NAMESPACE + value: MILO_NAMESPACE_PLACEHOLDER + - name: APISERVER_POD_REGEX + value: APISERVER_REGEX_PLACEHOLDER + - name: ETCD_POD_REGEX + value: ETCD_REGEX_PLACEHOLDER + - name: NUM_PROJECTS + value: "NUM_PROJECTS_PLACEHOLDER" + - name: NUM_SECRETS_PER_PROJECT + value: "NUM_SECRETS_PLACEHOLDER" + - name: NUM_CONFIGMAPS_PER_PROJECT + value: "NUM_CONFIGMAPS_PLACEHOLDER" + - name: STABILIZE_SECONDS + value: "STABILIZE_SECONDS_PLACEHOLDER" + - name: MEASURE_WINDOW + value: "MEASURE_WINDOW_PLACEHOLDER" + - name: ORG_NAME + value: "ORG_NAME_PLACEHOLDER" + - name: PROJECT_CONCURRENCY + value: "PROJECT_CONCURRENCY_PLACEHOLDER" + - name: OBJECT_CONCURRENCY + value: "OBJECT_CONCURRENCY_PLACEHOLDER" + - name: RUN_OBJECTS_PHASE + value: "RUN_OBJECTS_PHASE_PLACEHOLDER" + - name: OUT_DIR + value: "/work/out" + - name: MPLBACKEND + value: Agg + - name: MILO_KUBECONFIG_PATH + value: "/work/milo-kubeconfig" + volumeMounts: + - name: script + mountPath: /work/perf_run.py + subPath: perf_run.py + readOnly: true + - name: milo-kubeconfig + mountPath: /work/milo-kubeconfig + subPath: MILO_KUBECONFIG_KEY_PLACEHOLDER + readOnly: true + command: ["bash","-lc"] + args: + - >- + python -m pip install --no-cache-dir kubernetes requests pyyaml matplotlib && + python -u /work/perf_run.py + volumes: + - name: script + configMap: + name: perf-script + defaultMode: 0444 + - name: milo-kubeconfig + secret: + secretName: MILO_KUBECONFIG_SECRET_PLACEHOLDER + defaultMode: 0400 + + diff --git a/test/performance/config/perf-runner-rbac.yaml b/test/performance/config/perf-runner-rbac.yaml new file mode 100644 index 00000000..9fad73bb --- /dev/null +++ b/test/performance/config/perf-runner-rbac.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: perf-runner + namespace: NAMESPACE_PLACEHOLDER +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: perf-results-manager + namespace: NAMESPACE_PLACEHOLDER +rules: +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: perf-results-manager-binding + namespace: NAMESPACE_PLACEHOLDER +subjects: +- kind: ServiceAccount + name: perf-runner + namespace: NAMESPACE_PLACEHOLDER +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: perf-results-manager diff --git a/test/performance/scripts/perf_run.py b/test/performance/scripts/perf_run.py new file mode 100644 index 00000000..12990ffd --- /dev/null +++ b/test/performance/scripts/perf_run.py @@ -0,0 +1,905 @@ +import base64 +import json +import concurrent.futures +import os +import sys +import time +import uuid +from datetime import datetime, timezone +from io import BytesIO + +import requests +import urllib3 +import yaml +from kubernetes import client as k8s_client +from kubernetes import config as k8s_config +from kubernetes.client import ApiException + + +def get_env(name: str, default: str | None = None) -> str: + value = os.getenv(name, default) + if value is None: + print(f"Missing required env var: {name}", file=sys.stderr) + sys.exit(1) + return value + + +def parse_bool(value: str | None, default: bool = True) -> bool: + if value is None: + return default + return value.strip().lower() in {"1", "true", "t", "yes", "y", "on"} + + +def load_yaml_file(path: str) -> dict: + with open(path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) + + +# Reduce noisy TLS 
warnings from in-cluster/self-signed configs +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +try: + # Also quiet requests' vendored urllib3, if present + requests.packages.urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning) # type: ignore +except Exception: + pass + + +def log(message: str) -> None: + now = datetime.now(timezone.utc).strftime("%H:%M:%S") + print(f"[{now}] {message}", flush=True) + + +def should_retry_api_exception(e: ApiException) -> bool: + try: + if e.status in (429, 500): + return True + body = getattr(e, 'body', '') or '' + msg = str(body) + # Handle webhook EOF/transient failures + if 'Internal error occurred' in msg or 'failed calling webhook' in msg or 'EOF' in msg: + return True + except Exception: + pass + return False + + +def retry_with_backoff(action_name: str, fn, *args, **kwargs): + delay = 1.0 + for attempt in range(1, 7): + try: + return fn(*args, **kwargs) + except ApiException as e: + if should_retry_api_exception(e) and attempt < 6: + log(f"{action_name} failed (attempt {attempt}/6): status={e.status} retrying in {delay:.0f}s …") + time.sleep(delay) + delay = min(delay * 2, 16) + continue + raise + + +def save_results_configmap(namespace: str, name: str, data: dict[str, str]) -> None: + # Uses in-cluster config to write to Kubernetes ConfigMap + try: + k8s_config.load_incluster_config() + except Exception: + # Fallback to default kubeconfig for local runs + k8s_config.load_kube_config() + + v1 = k8s_client.CoreV1Api() + metadata = k8s_client.V1ObjectMeta(name=name) + cm = k8s_client.V1ConfigMap(metadata=metadata, data=data) + + try: + existing = v1.read_namespaced_config_map(name=name, namespace=namespace) + existing.data = data + v1.replace_namespaced_config_map(name=name, namespace=namespace, body=existing) + except ApiException as e: + if e.status == 404: + v1.create_namespaced_config_map(namespace=namespace, body=cm) + else: + raise + + +def save_checkpoint(namespace: str, test_id: str, org_name: str, phase: str, extra: dict | None = None) -> None: + data: dict[str, str] = { + "test_id": test_id, + "org_name": org_name, + "phase": phase, + } + if extra: + for k, v in extra.items(): + try: + data[k] = json.dumps(v) if isinstance(v, (dict, list)) else str(v) + except Exception: + data[k] = str(v) + save_results_configmap(namespace, "perf-results", data) + + +def http_get_json(url: str, params: dict | None = None) -> dict: + # Basic retry for transient VM connectivity (EOF, connection reset, etc.) 
+ last_err: Exception | None = None + for attempt in range(6): + try: + resp = requests.get(url, params=params, timeout=30) + resp.raise_for_status() + return resp.json() + except Exception as e: + last_err = e + sleep_s = 2 * (attempt + 1) + log(f"[metrics] request failed (attempt {attempt+1}/6): {e}; retrying in {sleep_s}s") + time.sleep(sleep_s) + assert last_err is not None + raise last_err + + +def prom_query(base_url: str, query: str, context: str | None = None) -> float: + url = f"{base_url.rstrip('/')}/api/v1/query" + start = time.time() + data = http_get_json(url, params={"query": query}) + duration = time.time() - start + if context: + log(f"[metrics] {context} took {duration:.1f}s") + if data.get("status") != "success": + raise RuntimeError(f"Prom query failed: {data}") + result = data.get("data", {}).get("result", []) + if not result: + if context: + log(f"[metrics] {context} returned empty result") + return 0.0 + # Use the first scalar/vector value (sum/avg queries should return single series) + value = float(result[0]["value"][1]) + if context: + log(f"[metrics] {context} value={value}") + return value + + +def measure_metrics(base_url: str, namespace: str, apiserver_regex: str, etcd_regex: str, window: str) -> dict: + # CPU is in cores (rate over window). Memory in bytes (avg over window) + log(f"[metrics] VM_BASE_URL={base_url} namespace={namespace} window={window}") + log("[metrics] querying apiserver cpu/memory and etcd cpu/memory …") + # Pre-flight series counts to aid debugging + try: + ns_cpu_series = prom_query( + base_url, + f'count(container_cpu_usage_seconds_total{{namespace="{namespace}"}})', + context="series_count_ns_cpu", + ) + ns_mem_series = prom_query( + base_url, + f'count(container_memory_working_set_bytes{{namespace="{namespace}"}})', + context="series_count_ns_mem", + ) + apiserver_series = prom_query( + base_url, + f'count(container_cpu_usage_seconds_total{{namespace="{namespace}",pod=~"{apiserver_regex}"}})', + context="series_count_apiserver", + ) + etcd_series = prom_query( + base_url, + f'count(container_cpu_usage_seconds_total{{namespace="{namespace}",pod=~"{etcd_regex}"}})', + context="series_count_etcd", + ) + log( + f"[metrics] series counts: ns_cpu={ns_cpu_series} ns_mem={ns_mem_series} apiserver={apiserver_series} etcd={etcd_series}" + ) + except Exception as e: + log(f"[metrics] pre-flight series counts failed: {e}") + def run_queries(pod_label: str, include_container_filter: bool) -> dict[str, float]: + container_filter = 'container!="",container!="POD"' if include_container_filter else '' + comma = ',' if include_container_filter else '' + # Build label selectors without f-strings to avoid brace escaping issues + apiserver_selector = '{namespace="%s",%s=~"%s"%s%s}' % ( + namespace, + pod_label, + apiserver_regex, + comma, + container_filter, + ) + etcd_selector = '{namespace="%s",%s=~"%s"%s%s}' % ( + namespace, + pod_label, + etcd_regex, + comma, + container_filter, + ) + queries: dict[str, str] = { + "apiserver_cpu_cores": 'sum(rate(container_cpu_usage_seconds_total%s[%s]))' % (apiserver_selector, window), + "apiserver_mem_bytes": 'sum(avg_over_time(container_memory_working_set_bytes%s[%s]))' % (apiserver_selector, window), + "etcd_cpu_cores": 'sum(rate(container_cpu_usage_seconds_total%s[%s]))' % (etcd_selector, window), + "etcd_mem_bytes": 'sum(avg_over_time(container_memory_working_set_bytes%s[%s]))' % (etcd_selector, window), + } + for k, q in queries.items(): + log(f"[metrics] query[{k}] (label={pod_label}, 
filter={include_container_filter}): {q}") + results: dict[str, float] = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + future_to_key = {executor.submit(prom_query, base_url, query, k): k for k, query in queries.items()} + for future in concurrent.futures.as_completed(future_to_key): + k = future_to_key[future] + try: + results[k] = future.result() + except Exception as e: + log(f"[metrics] query failed for {k}: {e}; using 0") + results[k] = 0.0 + return results + + # Try multiple label variants and filters; return first non-zero set + for pod_label, with_filter in [("pod", True), ("pod", False), ("pod_name", True), ("pod_name", False)]: + log(f"[metrics] attempting with pod_label={pod_label} filter={with_filter}") + results = run_queries(pod_label, with_filter) + if any(v != 0.0 for v in results.values()): + return results + + log("[metrics] all query variants returned 0; returning zeros") + return {"apiserver_cpu_cores": 0.0, "apiserver_mem_bytes": 0.0, "etcd_cpu_cores": 0.0, "etcd_mem_bytes": 0.0} + + +def find_condition(conditions: list[dict] | None, ctype: str) -> dict | None: + if not conditions: + return None + for c in conditions: + if c.get("type") == ctype: + return c + return None + + +def wait_condition_ready( + coapi: k8s_client.CustomObjectsApi, + group: str, + version: str, + plural: str, + name: str, + timeout_s: int = 600, + log_context: str | None = None, +) -> None: + start = time.time() + deadline = start + timeout_s + last_log = 0.0 + while time.time() < deadline: + obj = coapi.get_cluster_custom_object(group=group, version=version, plural=plural, name=name) + cond = find_condition(obj.get("status", {}).get("conditions"), "Ready") + if cond and str(cond.get("status")) == "True": + return + # Periodic detail log to explain why we're still waiting + now = time.time() + if now - start >= 10 and now - last_log >= 15: + last_log = now + if cond: + reason = cond.get("reason", "") + message = cond.get("message", "") + status = cond.get("status", "Unknown") + ctx = f"{plural}/{name}" if not log_context else f"{log_context} ({plural}/{name})" + log(f"waiting for {ctx}: Ready={status} reason={reason} message={message}") + else: + ctx = f"{plural}/{name}" if not log_context else f"{log_context} ({plural}/{name})" + log(f"waiting for {ctx}: no Ready condition yet") + time.sleep(2) + raise TimeoutError(f"Timed out waiting for {plural}/{name} Ready") + + +def build_scoped_kubeconfig(base_cfg: dict, scope_path: str, new_name: str) -> dict: + cfg = yaml.safe_load(yaml.safe_dump(base_cfg)) # deep copy + for c in cfg.get("clusters", []): + server = c["cluster"].get("server", "").rstrip("/") + c["name"] = new_name + c["cluster"]["server"] = f"{server}{scope_path}" + # context names follow cluster names + if cfg.get("contexts"): + for ctx in cfg["contexts"]: + ctx["name"] = new_name + ctx["context"]["cluster"] = new_name + if cfg.get("current-context"): + cfg["current-context"] = new_name + return cfg + + +def kube_client_from_config(cfg: dict): + # Load a kubernetes client from a kubeconfig dict (not a file) + loader = k8s_config.kube_config.KubeConfigLoader(config_dict=cfg) + configuration = k8s_client.Configuration() + loader.load_and_set(configuration) + return k8s_client.ApiClient(configuration) + + +def create_org_and_projects(milo_kubeconfig_path: str, org_name: str, num_projects: int, labels: dict[str, str]) -> tuple[dict, list[str], float]: + base_cfg = load_yaml_file(milo_kubeconfig_path) + # Optional override to inject a bearer token for auth 
troubleshooting + override_token = os.getenv("AUTH_BEARER_TOKEN") + if override_token: + try: + if base_cfg.get("users"): + base_cfg["users"][0]["user"]["token"] = override_token + log("Using AUTH_BEARER_TOKEN override for kubeconfig user[0]") + except Exception: + pass + # Client for Milo API server (cluster-scoped CRDs) + api_client = kube_client_from_config(base_cfg) + coapi = k8s_client.CustomObjectsApi(api_client) + try: + cluster_server = base_cfg.get("clusters", [{}])[0].get("cluster", {}).get("server", "") + user_name = base_cfg.get("users", [{}])[0].get("name", "") + log(f"Using cluster-scoped kubeconfig: server={cluster_server} user={user_name}") + except Exception: + pass + + # Create Organization + log(f"Creating Organization '{org_name}' …") + org_body = { + "apiVersion": "resourcemanager.miloapis.com/v1alpha1", + "kind": "Organization", + "metadata": {"name": org_name, "labels": labels}, + "spec": {"type": "Standard"}, + } + try: + retry_with_backoff( + "create Organization", + coapi.create_cluster_custom_object, + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="organizations", + body=org_body, + ) + except ApiException as e: + if e.status != 409: + raise + + # Do not wait for Organization readiness; proceed immediately to projects + log(f"Organization '{org_name}' created") + + # Build an organization-scoped kubeconfig so requests carry parent context + org_scope_path = f"/apis/resourcemanager.miloapis.com/v1alpha1/organizations/{org_name}/control-plane" + org_cfg = build_scoped_kubeconfig(base_cfg, org_scope_path, new_name=f"organization-{org_name}") + org_client = kube_client_from_config(org_cfg) + org_coapi = k8s_client.CustomObjectsApi(org_client) + try: + org_server = org_cfg.get("clusters", [{}])[0].get("cluster", {}).get("server", "") + org_user = org_cfg.get("users", [{}])[0].get("name", "") + log(f"Using organization-scoped kubeconfig: server={org_server} user={org_user}") + except Exception: + pass + + # Create Projects + project_names: list[str] = [] + start = time.time() + log(f"Creating {num_projects} Projects …") + for i in range(1, num_projects + 1): + pname = f"{org_name}-p-{i:03d}" + project_names.append(pname) + proj_body = { + "apiVersion": "resourcemanager.miloapis.com/v1alpha1", + "kind": "Project", + "metadata": {"name": pname, "labels": labels}, + "spec": {"ownerRef": {"kind": "Organization", "name": org_name}}, + } + try: + retry_with_backoff( + f"create Project {pname}", + org_coapi.create_cluster_custom_object, + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="projects", + body=proj_body, + ) + except ApiException as e: + if e.status != 409: + log(f"error creating Project '{pname}': {getattr(e, 'body', e)}") + raise + if i % 10 == 0 or i == num_projects: + log(f"Created {i}/{num_projects} Projects …") + + # Wait for all projects Ready + ready = 0 + log("Waiting for Projects to become Ready …") + for pname in project_names: + wait_condition_ready(org_coapi, "resourcemanager.miloapis.com", "v1alpha1", "projects", pname, timeout_s=900, log_context="Project") + ready += 1 + if ready % 10 == 0 or ready == len(project_names): + log(f"Projects Ready: {ready}/{len(project_names)} …") + end = time.time() + total_seconds = end - start + + # Return base kubeconfig (for building scoped configs), project names, and duration + return base_cfg, project_names, total_seconds + + +def create_objects_in_projects( + base_cfg: dict, + org_name: str, + project_names: list[str], + num_secrets: int, + num_configmaps: int, + 
labels: dict[str, str], + project_concurrency: int, + object_concurrency: int, +) -> None: + def work_project(pname: str) -> None: + scope_path = f"/apis/resourcemanager.miloapis.com/v1alpha1/projects/{pname}/control-plane" + proj_cfg = build_scoped_kubeconfig(base_cfg, scope_path, new_name=f"project-{pname}") + proj_client = kube_client_from_config(proj_cfg) + v1 = k8s_client.CoreV1Api(proj_client) + + def create_secret(i: int) -> None: + sname = f"perf-secret-{i:03d}" + body = k8s_client.V1Secret( + metadata=k8s_client.V1ObjectMeta(name=sname, labels=labels), + string_data={"note": f"secret {i} for {pname}"}, + type="Opaque", + ) + try: + v1.create_namespaced_secret(namespace="default", body=body) + except ApiException as e: + if e.status != 409: + raise + + def create_configmap(i: int) -> None: + cname = f"perf-configmap-{i:03d}" + body = k8s_client.V1ConfigMap( + metadata=k8s_client.V1ObjectMeta(name=cname, labels=labels), + data={"note": f"configmap {i} for {pname}"}, + ) + try: + v1.create_namespaced_config_map(namespace="default", body=body) + except ApiException as e: + if e.status != 409: + raise + + log(f"[{pname}] Creating {num_secrets} Secrets (concurrency={object_concurrency}) …") + if num_secrets > 0: + with concurrent.futures.ThreadPoolExecutor(max_workers=object_concurrency) as ex: + list(ex.map(create_secret, range(1, num_secrets + 1))) + log(f"[{pname}] Secrets created: {num_secrets}/{num_secrets}") + + log(f"[{pname}] Creating {num_configmaps} ConfigMaps (concurrency={object_concurrency}) …") + if num_configmaps > 0: + with concurrent.futures.ThreadPoolExecutor(max_workers=object_concurrency) as ex: + list(ex.map(create_configmap, range(1, num_configmaps + 1))) + log(f"[{pname}] ConfigMaps created: {num_configmaps}/{num_configmaps}") + + # Run multiple projects in parallel + if project_concurrency <= 1: + for pname in project_names: + work_project(pname) + else: + log(f"Creating objects across projects (concurrency={project_concurrency}) …") + with concurrent.futures.ThreadPoolExecutor(max_workers=project_concurrency) as ex: + list(ex.map(work_project, project_names)) + + +def generate_report_html( + metrics_before: dict, + metrics_after_projects: dict, + metrics_after_secrets: dict | None, + num_projects: int, + projects_ready_seconds: float, +) -> str: + # Minimal inline charts using simple ASCII bars as fallback if matplotlib unavailable + try: + import matplotlib.pyplot as plt # type: ignore + + def grouped_bars_png( + series: list[tuple[str, list[float]]], + categories: list[str], + title: str, + ylabel: str, + ) -> str: + fig, ax = plt.subplots(figsize=(7.5, 3.5)) + num_series = len(series) + x = range(len(categories)) + total_bar_width = 0.8 + bar_width = total_bar_width / max(1, num_series) + offsets = [(-total_bar_width / 2) + (i + 0.5) * bar_width for i in range(num_series)] + colors = ["#4c78a8", "#f58518", "#54a24b"] + for idx, (label, values) in enumerate(series): + ax.bar([xi + offsets[idx] for xi in x], values, width=bar_width, label=label, color=colors[idx % len(colors)]) + ax.set_title(title) + ax.set_ylabel(ylabel) + ax.set_xticks(list(x)) + ax.set_xticklabels(categories) + ax.legend(loc="upper left", fontsize=8) + plt.tight_layout() + buf = BytesIO() + plt.savefig(buf, format="png") + plt.close(fig) + b64 = base64.b64encode(buf.getvalue()).decode("ascii") + return f"{title}" + + # Build series for CPU (cores) and Memory (MB) + cpu_series: list[tuple[str, list[float]]] = [ + ("baseline", [metrics_before["apiserver_cpu_cores"], 
metrics_before["etcd_cpu_cores"]]), + ("after-projects", [metrics_after_projects["apiserver_cpu_cores"], metrics_after_projects["etcd_cpu_cores"]]), + ] + mem_series: list[tuple[str, list[float]]] = [ + ( + "baseline", + [metrics_before["apiserver_mem_bytes"] / (1024 * 1024), metrics_before["etcd_mem_bytes"] / (1024 * 1024)], + ), + ( + "after-projects", + [ + metrics_after_projects["apiserver_mem_bytes"] / (1024 * 1024), + metrics_after_projects["etcd_mem_bytes"] / (1024 * 1024), + ], + ), + ] + if metrics_after_secrets is not None: + cpu_series.append( + ( + "after-objects", + [metrics_after_secrets["apiserver_cpu_cores"], metrics_after_secrets["etcd_cpu_cores"]], + ) + ) + mem_series.append( + ( + "after-objects", + [ + metrics_after_secrets["apiserver_mem_bytes"] / (1024 * 1024), + metrics_after_secrets["etcd_mem_bytes"] / (1024 * 1024), + ], + ) + ) + + cpu_img = grouped_bars_png(cpu_series, ["apiserver", "etcd"], "CPU (cores)", "cores") + mem_img = grouped_bars_png(mem_series, ["apiserver", "etcd"], "Memory (MB)", "MB") + + # Quick stats and deltas + t_total = projects_ready_seconds + per_project_s = (t_total / num_projects) if num_projects > 0 else 0.0 + def delta(a: float, b: float) -> float: + return b - a + apiserver_mem_delta_mb = delta( + metrics_before["apiserver_mem_bytes"] / (1024 * 1024), + metrics_after_projects["apiserver_mem_bytes"] / (1024 * 1024), + ) + etcd_mem_delta_mb = delta( + metrics_before["etcd_mem_bytes"] / (1024 * 1024), + metrics_after_projects["etcd_mem_bytes"] / (1024 * 1024), + ) + apiserver_cpu_delta = delta(metrics_before["apiserver_cpu_cores"], metrics_after_projects["apiserver_cpu_cores"]) + etcd_cpu_delta = delta(metrics_before["etcd_cpu_cores"], metrics_after_projects["etcd_cpu_cores"]) + + # Per-project implications (naive average impact per created Project) + per_proj_cpu_apiserver = (apiserver_cpu_delta / num_projects) if num_projects > 0 else 0.0 + per_proj_cpu_etcd = (etcd_cpu_delta / num_projects) if num_projects > 0 else 0.0 + per_proj_mem_apiserver_mb = (apiserver_mem_delta_mb / num_projects) if num_projects > 0 else 0.0 + per_proj_mem_etcd_mb = (etcd_mem_delta_mb / num_projects) if num_projects > 0 else 0.0 + + after_objects_stats = "" + if metrics_after_secrets is not None: + apiserver_mem_delta_mb_obj = delta( + metrics_after_projects["apiserver_mem_bytes"] / (1024 * 1024), + metrics_after_secrets["apiserver_mem_bytes"] / (1024 * 1024), + ) + etcd_mem_delta_mb_obj = delta( + metrics_after_projects["etcd_mem_bytes"] / (1024 * 1024), + metrics_after_secrets["etcd_mem_bytes"] / (1024 * 1024), + ) + apiserver_cpu_delta_obj = delta( + metrics_after_projects["apiserver_cpu_cores"], metrics_after_secrets["apiserver_cpu_cores"] + ) + etcd_cpu_delta_obj = delta( + metrics_after_projects["etcd_cpu_cores"], metrics_after_secrets["etcd_cpu_cores"] + ) + per_proj_cpu_apiserver_obj = (apiserver_cpu_delta_obj / num_projects) if num_projects > 0 else 0.0 + per_proj_cpu_etcd_obj = (etcd_cpu_delta_obj / num_projects) if num_projects > 0 else 0.0 + per_proj_mem_apiserver_mb_obj = (apiserver_mem_delta_mb_obj / num_projects) if num_projects > 0 else 0.0 + per_proj_mem_etcd_mb_obj = (etcd_mem_delta_mb_obj / num_projects) if num_projects > 0 else 0.0 + after_objects_stats = f""" +
+<div class="section">
+  <h3>After objects (per-project deltas)</h3>
+  <div class="kpis">
+    <div class="kpi"><span>CPU apiserver</span> <strong>{per_proj_cpu_apiserver_obj:+.4f} cores</strong></div>
+    <div class="kpi"><span>CPU etcd</span> <strong>{per_proj_cpu_etcd_obj:+.4f} cores</strong></div>
+    <div class="kpi"><span>MEM apiserver</span> <strong>{per_proj_mem_apiserver_mb_obj:+.2f} MB</strong></div>
+    <div class="kpi"><span>MEM etcd</span> <strong>{per_proj_mem_etcd_mb_obj:+.2f} MB</strong></div>
+  </div>
+</div>
+"""
+
+        html = """
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="utf-8"/>
+    <title>Milo Performance Report</title>
+  </head>
+  <body style="font-family: sans-serif; margin: 24px;">
+    <h1>Milo Performance Report</h1>
+
+    <p>Projects: {num_projects} • Time to Ready: {t_total:.1f}s ({per_project_s:.2f}s/project)</p>
+
+    <div class="charts">
+      <div>{cpu_img}</div>
+      <div>{mem_img}</div>
+    </div>
+
+    <div class="section">
+      <h3>After-projects (per-project deltas)</h3>
+      <div class="kpis">
+        <div class="kpi"><span>CPU apiserver</span> <strong>{per_proj_cpu_apiserver:+.4f} cores</strong></div>
+        <div class="kpi"><span>CPU etcd</span> <strong>{per_proj_cpu_etcd:+.4f} cores</strong></div>
+        <div class="kpi"><span>MEM apiserver</span> <strong>{per_proj_mem_apiserver_mb:+.2f} MB</strong></div>
+        <div class="kpi"><span>MEM etcd</span> <strong>{per_proj_mem_etcd_mb:+.2f} MB</strong></div>
+      </div>
+    </div>
+
+    {after_objects_stats}
+  </body>
+</html>
+"""
+        return html.format(
+            cpu_img=cpu_img,
+            mem_img=mem_img,
+            num_projects=num_projects,
+            t_total=t_total,
+            per_project_s=per_project_s,
+            apiserver_mem_delta_mb=apiserver_mem_delta_mb,
+            etcd_mem_delta_mb=etcd_mem_delta_mb,
+            apiserver_cpu_delta=apiserver_cpu_delta,
+            etcd_cpu_delta=etcd_cpu_delta,
+            per_proj_cpu_apiserver=per_proj_cpu_apiserver,
+            per_proj_cpu_etcd=per_proj_cpu_etcd,
+            per_proj_mem_apiserver_mb=per_proj_mem_apiserver_mb,
+            per_proj_mem_etcd_mb=per_proj_mem_etcd_mb,
+            after_objects_stats=after_objects_stats,
+        )
+    except Exception as e:  # Fallback text-only report with error context
+        payload: dict[str, object] = {
+            "baseline": metrics_before,
+            "after_projects": metrics_after_projects,
+            "num_projects": num_projects,
+            "projects_ready_seconds": projects_ready_seconds,
+        }
+        if metrics_after_secrets is not None:
+            payload["after_secrets"] = metrics_after_secrets
+        return (
+            "<pre>chart rendering unavailable; showing raw metrics\n\n"
+            + "error: "
+            + str(e)
+            + "\n\n"
+            + json.dumps(payload, indent=2)
+            + "</pre>
" + ) + + +def cleanup_resources(milo_kubeconfig_path: str, test_id: str, org_name: str) -> None: + base_cfg = load_yaml_file(milo_kubeconfig_path) + api_client = kube_client_from_config(base_cfg) + coapi = k8s_client.CustomObjectsApi(api_client) + + # Use organization-scoped client for project-scoped operations (admission context) + org_scope_path = f"/apis/resourcemanager.miloapis.com/v1alpha1/organizations/{org_name}/control-plane" + org_cfg = build_scoped_kubeconfig(base_cfg, org_scope_path, new_name=f"organization-{org_name}") + org_client = kube_client_from_config(org_cfg) + org_coapi = k8s_client.CustomObjectsApi(org_client) + + # List and delete projects by label + proj_list = coapi.list_cluster_custom_object( + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="projects", + label_selector=f"app=milo-perf,test-id={test_id}", + ) + for item in proj_list.get("items", []): + pname = item["metadata"]["name"] + log(f"[cleanup] project {pname}: deleting Secrets/ConfigMaps and Project …") + # Delete per-project objects first (tolerate not found) + try: + proj_scope_path = f"/apis/resourcemanager.miloapis.com/v1alpha1/projects/{pname}/control-plane" + proj_cfg = build_scoped_kubeconfig(base_cfg, proj_scope_path, new_name=f"project-{pname}") + proj_client = kube_client_from_config(proj_cfg) + v1 = k8s_client.CoreV1Api(proj_client) + label_sel = f"app=milo-perf,test-id={test_id}" + # Secrets + try: + sl = v1.list_namespaced_secret(namespace="default", label_selector=label_sel) + for s in sl.items or []: + try: + v1.delete_namespaced_secret(name=s.metadata.name, namespace="default") + except ApiException as e: + if e.status != 404: + raise + except ApiException: + pass + # ConfigMaps + try: + cml = v1.list_namespaced_config_map(namespace="default", label_selector=label_sel) + for cm in cml.items or []: + try: + v1.delete_namespaced_config_map(name=cm.metadata.name, namespace="default") + except ApiException as e: + if e.status != 404: + raise + except ApiException: + pass + except Exception: + # Keep going even if per-project object cleanup fails + pass + + # Delete the Project (use org-scoped API for proper parent context) + try: + org_coapi.delete_cluster_custom_object( + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="projects", + name=pname, + ) + except ApiException as e: + if e.status not in (404, 409): + raise + + # Delete organization last + try: + coapi.delete_cluster_custom_object( + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="organizations", + name=org_name, + ) + except ApiException as e: + if e.status not in (404, 409): + raise + + +def main() -> None: + run_mode = os.getenv("RUN_MODE", "run").lower() + target_ns = get_env("TARGET_NAMESPACE", "milo-system") + + if run_mode == "cleanup": + milo_kubeconfig_path = get_env("MILO_KUBECONFIG_PATH", "/work/milo-kubeconfig") + test_id = get_env("TEST_ID") + org_name = get_env("ORG_NAME") + cleanup_resources(milo_kubeconfig_path, test_id=test_id, org_name=org_name) + # Remove results ConfigMap if present + try: + k8s_config.load_incluster_config() + except Exception: + k8s_config.load_kube_config() + v1 = k8s_client.CoreV1Api() + try: + v1.delete_namespaced_config_map(name="perf-results", namespace=target_ns) + except ApiException as e: + if e.status != 404: + raise + print("Cleanup complete") + return + + # RUN + milo_kubeconfig_path = get_env("MILO_KUBECONFIG_PATH", "/work/milo-kubeconfig") + milo_metrics_ns = get_env("MILO_NAMESPACE", "milo-system") + vm_base_url = 
get_env("VM_BASE_URL") + apiserver_regex = get_env("APISERVER_POD_REGEX", "milo-apiserver.*") + etcd_regex = get_env("ETCD_POD_REGEX", "etcd.*") + window = get_env("MEASURE_WINDOW", "2m") + stabilize_seconds = int(get_env("STABILIZE_SECONDS", "90")) + num_projects = int(get_env("NUM_PROJECTS", "100")) + num_secrets = int(get_env("NUM_SECRETS_PER_PROJECT", "100")) + num_configmaps = int(get_env("NUM_CONFIGMAPS_PER_PROJECT", "100")) + project_concurrency = int(os.getenv("PROJECT_CONCURRENCY", "4")) + object_concurrency = int(os.getenv("OBJECT_CONCURRENCY", "8")) + run_objects_phase = parse_bool(os.getenv("RUN_OBJECTS_PHASE", "true"), default=True) + out_dir = os.getenv("OUT_DIR", "/work/out") + + test_id = uuid.uuid4().hex[:8] + _org_env = os.getenv("ORG_NAME") + org_name = _org_env.strip() if (_org_env is not None and _org_env.strip() != "") else f"perf-{test_id}" + labels = {"app": "milo-perf", "test-id": test_id} + + # Initial checkpoint (so cleanup works even if run aborts early) + save_checkpoint(target_ns, test_id, org_name, phase="init", extra={"num_projects": num_projects}) + + # Baseline metrics (no pre-stabilization) + log("Measuring baseline metrics …") + baseline = measure_metrics(vm_base_url, milo_metrics_ns, apiserver_regex, etcd_regex, window) + + # Create org & projects + log(f"Creating org '{org_name}' and {num_projects} projects …") + base_cfg, project_names, projects_ready_seconds = create_org_and_projects( + milo_kubeconfig_path, org_name, num_projects, labels + ) + # Update checkpoint that org exists (projects will be created next) + save_checkpoint(target_ns, test_id, org_name, phase="org-created") + + # After projects metrics + if stabilize_seconds > 0: + log(f"Stabilizing for {stabilize_seconds}s after projects are Ready …") + time.sleep(stabilize_seconds) + log("Measuring metrics after projects are Ready …") + after_projects = measure_metrics(vm_base_url, milo_metrics_ns, apiserver_regex, etcd_regex, window) + # Update checkpoint after projects are ready + save_checkpoint(target_ns, test_id, org_name, phase="projects-ready", extra={"num_projects": num_projects}) + + after_secrets = None + if run_objects_phase: + # Create objects within each project + log(f"Creating {num_secrets} secrets and {num_configmaps} configmaps per project …") + create_objects_in_projects( + base_cfg, + org_name, + project_names, + num_secrets, + num_configmaps, + labels, + project_concurrency, + object_concurrency, + ) + + # After secrets/configmaps metrics + if stabilize_seconds > 0: + log(f"Stabilizing for {stabilize_seconds}s after secrets/configmaps …") + time.sleep(stabilize_seconds) + log("Measuring metrics after creating secrets/configmaps …") + after_secrets = measure_metrics(vm_base_url, milo_metrics_ns, apiserver_regex, etcd_regex, window) + + # Build results + now_iso = datetime.now(timezone.utc).isoformat() + results = { + "test_id": test_id, + "timestamp": now_iso, + "org_name": org_name, + "num_projects": num_projects, + "num_secrets_per_project": num_secrets, + "num_configmaps_per_project": num_configmaps, + "projects_ready_seconds": projects_ready_seconds, + "metrics": { + "baseline": baseline, + "after_projects": after_projects, + "after_secrets": after_secrets, + }, + } + + report_html = generate_report_html( + baseline, + after_projects, + after_secrets, + num_projects, + projects_ready_seconds, + ) + + # Persist results to files (Task will copy locally and publish a ConfigMap) + try: + os.makedirs(out_dir, exist_ok=True) + with open(os.path.join(out_dir, "results.json"), 
"w", encoding="utf-8") as f: + json.dump(results, f, indent=2) + with open(os.path.join(out_dir, "report.html"), "w", encoding="utf-8") as f: + f.write(report_html) + with open(os.path.join(out_dir, "meta.txt"), "w", encoding="utf-8") as f: + f.write(f"test_id={test_id}\norg_name={org_name}\n") + log(f"Results written to {out_dir}") + except Exception as e: + log(f"Failed to write results to {out_dir}: {e}") + + # Best-effort attempt to also write a ConfigMap (may fail if SA lacks RBAC) + try: + cm_data = { + "results.json": json.dumps(results, indent=2), + "report.html": report_html, + "test_id": test_id, + "org_name": org_name, + } + save_results_configmap(target_ns, "perf-results", cm_data) + log("Also saved results to ConfigMap 'perf-results'") + except Exception as e: + log(f"Skipping ConfigMap save (insufficient RBAC?): {e}") + + log("Perf run complete") + + +if __name__ == "__main__": + main() + + From 6b5df83ade0928c0b8551d399360368b4a06606e Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 17 Sep 2025 22:46:22 -0700 Subject: [PATCH 2/3] fix: path typo --- Taskfile.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Taskfile.yaml b/Taskfile.yaml index 096173b8..6512c2af 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -565,7 +565,7 @@ tasks: -e "s/RUN_OBJECTS_PHASE_PLACEHOLDER/${RUN_OBJECTS_PHASE}/g" \ -e "s/MILO_KUBECONFIG_SECRET_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_NAME}/g" \ -e "s/MILO_KUBECONFIG_KEY_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_KEY}/g" \ - test/performance/scripts/perf-runner-job.yaml | task test-infra:kubectl -- apply -f - + test/performance/config/perf-runner-job.yaml | task test-infra:kubectl -- apply -f - echo "⏳ Waiting for Job completion …" task test-infra:kubectl -- -n ${NS} wait --for=condition=Complete job/perf-runner --timeout=45m From 5c200d99d52319b9348eac548f9f5aff4fb8fe4a Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Thu, 18 Sep 2025 17:18:29 -0700 Subject: [PATCH 3/3] fix: use helm monitor and fix auth token --- .../servicemonitor-etcd.yaml | 18 ------------------ config/dependencies/etcd/helmrelease.yaml | 2 +- .../components/auth/auth-tokens-secret.yaml | 2 +- 3 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 config/components/prometheus-monitoring/servicemonitor-etcd.yaml diff --git a/config/components/prometheus-monitoring/servicemonitor-etcd.yaml b/config/components/prometheus-monitoring/servicemonitor-etcd.yaml deleted file mode 100644 index f08607e0..00000000 --- a/config/components/prometheus-monitoring/servicemonitor-etcd.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: etcd-metrics - namespace: milo-system -spec: - namespaceSelector: - matchNames: ["milo-system"] - selector: - matchLabels: - app.kubernetes.io/component: etcd - app.kubernetes.io/name: etcd - endpoints: - - port: client - path: /metrics - scheme: http - interval: 15s - diff --git a/config/dependencies/etcd/helmrelease.yaml b/config/dependencies/etcd/helmrelease.yaml index b4a4e506..f52f798d 100644 --- a/config/dependencies/etcd/helmrelease.yaml +++ b/config/dependencies/etcd/helmrelease.yaml @@ -57,7 +57,7 @@ spec: metrics: enabled: true serviceMonitor: - enabled: false + enabled: true # Logging configuration extraEnvVars: diff --git a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml index d349d947..b76392b4 100644 --- a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml 
+++ b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml @@ -6,4 +6,4 @@ type: Opaque stringData: tokens.csv: | test-admin-token,admin,admin,"system:masters" - test-user-token,test-user,1002,"system:authenticated" + test-user-token,test-user,test-user,"system:authenticated"
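
The VictoriaMetrics queries the runner builds can also be spot-checked by hand before or after a run. A minimal sketch, assuming the default `telemetry-system` VictoriaMetrics service and `milo-system` pod regexes from the Taskfile, plus a local `kubectl port-forward` (adjust the regex and window to match your overrides):

```bash
# Forward the VictoriaMetrics HTTP port locally (default service name from the Taskfile).
kubectl -n telemetry-system port-forward \
  svc/vmsingle-telemetry-system-vm-victoria-metrics-k8s-stack 8428:8428 &

# Apiserver CPU in cores over the default 2m window — same shape as the runner's rate query.
curl -sG 'http://127.0.0.1:8428/api/v1/query' \
  --data-urlencode 'query=sum(rate(container_cpu_usage_seconds_total{namespace="milo-system",pod=~"milo-apiserver.*",container!="",container!="POD"}[2m]))'

# Apiserver working-set memory in bytes, averaged over the same window.
curl -sG 'http://127.0.0.1:8428/api/v1/query' \
  --data-urlencode 'query=sum(avg_over_time(container_memory_working_set_bytes{namespace="milo-system",pod=~"milo-apiserver.*",container!="",container!="POD"}[2m]))'
```

Non-empty `result` arrays here indicate the ServiceMonitors are scraping, so the runner's snapshots should come back non-zero.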