From 8607f03b341266c22c5e09e14f56cafacff8ba10 Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 17 Sep 2025 22:20:02 -0700 Subject: [PATCH 1/3] feat: initial commit for perf testing milo --- Taskfile.yaml | 136 +++ config/apiserver/deployment.yaml | 2 +- .../servicemonitor-etcd.yaml | 18 + config/dependencies/etcd/helmrelease.yaml | 2 +- .../components/auth/auth-tokens-secret.yaml | 2 +- test/performance/README.md | 121 +++ test/performance/config/perf-cleanup-job.yaml | 52 + test/performance/config/perf-runner-job.yaml | 76 ++ test/performance/config/perf-runner-rbac.yaml | 29 + test/performance/scripts/perf_run.py | 905 ++++++++++++++++++ 10 files changed, 1340 insertions(+), 3 deletions(-) create mode 100644 config/components/prometheus-monitoring/servicemonitor-etcd.yaml create mode 100644 test/performance/README.md create mode 100644 test/performance/config/perf-cleanup-job.yaml create mode 100644 test/performance/config/perf-runner-job.yaml create mode 100644 test/performance/config/perf-runner-rbac.yaml create mode 100644 test/performance/scripts/perf_run.py diff --git a/Taskfile.yaml b/Taskfile.yaml index 14b1bcb9..096173b8 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -497,3 +497,139 @@ tasks: echo "🎉 All Prometheus rule tests passed." fi silent: false + + perf:run: + desc: Run Milo end-to-end performance scenario and download results + silent: true + cmds: + - | + set -euo pipefail + # Parse CLI key=value overrides passed after -- and export as env + for kv in {{.CLI_ARGS}}; do + case "$kv" in + *=*) key="${kv%%=*}"; val="${kv#*=}"; export "$key=$val" ;; + *) : ;; # ignore non key=value tokens + esac + done + NS="${NS:-milo-system}" + MILO_NS="${MILO_NAMESPACE:-milo-system}" + VM_NS="${VM_NAMESPACE:-telemetry-system}" + VM_SVC_NAME="${VM_SERVICE_NAME:-vmsingle-telemetry-system-vm-victoria-metrics-k8s-stack}" + VM_PORT="${VM_PORT:-8428}" + VM_BASE_URL="${VM_BASE_URL:-http://${VM_SVC_NAME}.${VM_NS}.svc.cluster.local:${VM_PORT}}" + APISERVER_REGEX="${APISERVER_POD_REGEX:-milo-apiserver.*}" + ETCD_REGEX="${ETCD_POD_REGEX:-etcd.*}" + MILO_KUBECONFIG_SECRET_NAME="${MILO_KUBECONFIG_SECRET_NAME:-milo-controller-manager-kubeconfig}" + MILO_KUBECONFIG_SECRET_KEY="${MILO_KUBECONFIG_SECRET_KEY:-kubeconfig}" + MILO_KUBECONFIG_PATH="${MILO_KUBECONFIG_PATH:-/work/milo-kubeconfig}" + NUM_PROJECTS="${NUM_PROJECTS:-{{default "100" .NUM_PROJECTS}}}" + NUM_SECRETS_PER_PROJECT="${NUM_SECRETS_PER_PROJECT:-{{default "100" .NUM_SECRETS_PER_PROJECT}}}" + NUM_CONFIGMAPS_PER_PROJECT="${NUM_CONFIGMAPS_PER_PROJECT:-{{default "100" .NUM_CONFIGMAPS_PER_PROJECT}}}" + PROJECT_CONCURRENCY="${PROJECT_CONCURRENCY:-{{default "4" .PROJECT_CONCURRENCY}}}" + OBJECT_CONCURRENCY="${OBJECT_CONCURRENCY:-{{default "8" .OBJECT_CONCURRENCY}}}" + RUN_OBJECTS_PHASE="${RUN_OBJECTS_PHASE:-{{default "true" .RUN_OBJECTS_PHASE}}}" + OUT_DIR="${OUT_DIR:-{{default "/work/out" .OUT_DIR}}}" + STABILIZE_SECONDS="${STABILIZE_SECONDS:-{{default "90" .STABILIZE_SECONDS}}}" + MEASURE_WINDOW="${MEASURE_WINDOW:-{{default "2m" .MEASURE_WINDOW}}}" + ORG_NAME="${ORG_NAME:-{{default "" .ORG_NAME}}}" + + echo "🔎 Checking Milo kubeconfig …" + if [ ! -f ".milo/kubeconfig" ]; then + echo "Error: .milo/kubeconfig not found. Run 'task dev:setup' first." 
>&2 + exit 1 + fi + + echo "🔐 Ensuring perf-runner RBAC is applied …" + sed "s/NAMESPACE_PLACEHOLDER/${NS}/g" test/performance/config/perf-runner-rbac.yaml | task test-infra:kubectl -- apply -f - + + echo "🗂 Publishing perf script as ConfigMap …" + task test-infra:kubectl -- -n ${NS} create configmap perf-script \ + --from-file=perf_run.py=test/performance/scripts/perf_run.py \ + --dry-run=client -o yaml | task test-infra:kubectl -- apply -f - + + echo "🚀 Launching perf runner Job …" + sed \ + -e "s/MILO_NAMESPACE_PLACEHOLDER/${MILO_NS}/g" \ + -e "s/NAMESPACE_PLACEHOLDER/${NS}/g" \ + -e "s#VM_BASE_URL_PLACEHOLDER#${VM_BASE_URL}#g" \ + -e "s/APISERVER_REGEX_PLACEHOLDER/${APISERVER_REGEX}/g" \ + -e "s/ETCD_REGEX_PLACEHOLDER/${ETCD_REGEX}/g" \ + -e "s/NUM_PROJECTS_PLACEHOLDER/${NUM_PROJECTS}/g" \ + -e "s/NUM_SECRETS_PLACEHOLDER/${NUM_SECRETS_PER_PROJECT}/g" \ + -e "s/NUM_CONFIGMAPS_PLACEHOLDER/${NUM_CONFIGMAPS_PER_PROJECT}/g" \ + -e "s/STABILIZE_SECONDS_PLACEHOLDER/${STABILIZE_SECONDS}/g" \ + -e "s/MEASURE_WINDOW_PLACEHOLDER/${MEASURE_WINDOW}/g" \ + -e "s/ORG_NAME_PLACEHOLDER/${ORG_NAME}/g" \ + -e "s/PROJECT_CONCURRENCY_PLACEHOLDER/${PROJECT_CONCURRENCY}/g" \ + -e "s/OBJECT_CONCURRENCY_PLACEHOLDER/${OBJECT_CONCURRENCY}/g" \ + -e "s/RUN_OBJECTS_PHASE_PLACEHOLDER/${RUN_OBJECTS_PHASE}/g" \ + -e "s/MILO_KUBECONFIG_SECRET_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_NAME}/g" \ + -e "s/MILO_KUBECONFIG_KEY_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_KEY}/g" \ + test/performance/scripts/perf-runner-job.yaml | task test-infra:kubectl -- apply -f - + + echo "⏳ Waiting for Job completion …" + task test-infra:kubectl -- -n ${NS} wait --for=condition=Complete job/perf-runner --timeout=45m + + echo "⬇️ Downloading results …" + mkdir -p reports/perf + # Prefer ConfigMap (works even if pod already terminated) + TEST_ID=$(task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.test_id}' 2>/dev/null || true) + OUT_DIR_LOCAL="reports/perf/${TEST_ID:-latest}" + mkdir -p "$OUT_DIR_LOCAL" + task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.results\.json}' > "$OUT_DIR_LOCAL/results.json" || true + task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.report\.html}' > "$OUT_DIR_LOCAL/report.html" || true + # Fallback to copying from the pod if ConfigMap wasn't available + if [ ! -s "$OUT_DIR_LOCAL/results.json" ] || [ ! -s "$OUT_DIR_LOCAL/report.html" ]; then + POD=$(task test-infra:kubectl -- -n ${NS} get pods -l job-name=perf-runner -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + if [ -n "$POD" ]; then + task test-infra:kubectl -- -n ${NS} cp "$POD:/work/out/results.json" "$OUT_DIR_LOCAL/results.json" || true + task test-infra:kubectl -- -n ${NS} cp "$POD:/work/out/report.html" "$OUT_DIR_LOCAL/report.html" || true + fi + fi + echo "✅ Results saved to $OUT_DIR_LOCAL" + + perf:cleanup: + desc: Cleanup resources created by the last perf run (org/projects/secrets/configmaps) + silent: true + cmds: + - | + set -euo pipefail + NS="${NS:-milo-system}" + MILO_KUBECONFIG_SECRET_NAME="${MILO_KUBECONFIG_SECRET_NAME:-milo-controller-manager-kubeconfig}" + MILO_KUBECONFIG_SECRET_KEY="${MILO_KUBECONFIG_SECRET_KEY:-kubeconfig}" + MILO_KUBECONFIG_PATH="${MILO_KUBECONFIG_PATH:-/work/milo-kubeconfig}" + + if [ ! -f ".milo/kubeconfig" ]; then + echo "Error: .milo/kubeconfig not found. Run 'task dev:setup' first." 
>&2 + exit 1 + fi + + echo "🔎 Discovering last test identifiers …" + # Allow override from CLI envs if ConfigMap isn't present + TEST_ID_CM=$(task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.test_id}' 2>/dev/null || true) + ORG_NAME_CM=$(task test-infra:kubectl -- -n ${NS} get cm perf-results -o jsonpath='{.data.org_name}' 2>/dev/null || true) + TEST_ID="${TEST_ID:-$TEST_ID_CM}" + ORG_NAME="${ORG_NAME:-$ORG_NAME_CM}" + if [ -z "${TEST_ID}" ] || [ -z "${ORG_NAME}" ]; then + echo "No existing results found in namespace ${NS} (ConfigMap perf-results). Nothing to cleanup." + exit 0 + fi + + echo "🚮 Launching cleanup Job for test ${TEST_ID} …" + sed \ + -e "s/NAMESPACE_PLACEHOLDER/${NS}/g" \ + -e "s/TEST_ID_PLACEHOLDER/${TEST_ID}/g" \ + -e "s/ORG_NAME_PLACEHOLDER/${ORG_NAME}/g" \ + -e "s/MILO_KUBECONFIG_SECRET_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_NAME}/g" \ + -e "s/MILO_KUBECONFIG_KEY_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_KEY}/g" \ + test/performance/config/perf-cleanup-job.yaml | task test-infra:kubectl -- apply -f - + + echo "⏳ Waiting for cleanup Job completion …" + task test-infra:kubectl -- -n ${NS} wait --for=condition=Complete job/perf-cleanup --timeout=30m + + echo "🧹 Removing runner artifacts (keeping downloaded results) …" + task test-infra:kubectl -- -n ${NS} delete job/perf-runner --ignore-not-found + task test-infra:kubectl -- -n ${NS} delete job/perf-cleanup --ignore-not-found + task test-infra:kubectl -- -n ${NS} delete configmap perf-script --ignore-not-found + task test-infra:kubectl -- -n ${NS} delete configmap perf-results --ignore-not-found + echo "✅ Cleanup complete." diff --git a/config/apiserver/deployment.yaml b/config/apiserver/deployment.yaml index dd4ee1e7..28717530 100644 --- a/config/apiserver/deployment.yaml +++ b/config/apiserver/deployment.yaml @@ -135,7 +135,7 @@ spec: memory: 128Mi limits: cpu: 500m - memory: 512Mi + memory: 2G startupProbe: failureThreshold: 3 httpGet: diff --git a/config/components/prometheus-monitoring/servicemonitor-etcd.yaml b/config/components/prometheus-monitoring/servicemonitor-etcd.yaml new file mode 100644 index 00000000..f08607e0 --- /dev/null +++ b/config/components/prometheus-monitoring/servicemonitor-etcd.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: etcd-metrics + namespace: milo-system +spec: + namespaceSelector: + matchNames: ["milo-system"] + selector: + matchLabels: + app.kubernetes.io/component: etcd + app.kubernetes.io/name: etcd + endpoints: + - port: client + path: /metrics + scheme: http + interval: 15s + diff --git a/config/dependencies/etcd/helmrelease.yaml b/config/dependencies/etcd/helmrelease.yaml index d6c25628..b4a4e506 100644 --- a/config/dependencies/etcd/helmrelease.yaml +++ b/config/dependencies/etcd/helmrelease.yaml @@ -35,7 +35,7 @@ spec: resources: limits: cpu: 500m - memory: 512Mi + memory: 2G requests: cpu: 200m memory: 256Mi diff --git a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml index dd9d855f..d349d947 100644 --- a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml +++ b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml @@ -5,5 +5,5 @@ metadata: type: Opaque stringData: tokens.csv: | - test-admin-token,admin,1001,"system:masters" + test-admin-token,admin,admin,"system:masters" test-user-token,test-user,1002,"system:authenticated" diff --git a/test/performance/README.md b/test/performance/README.md 
new file mode 100644 index 00000000..882fee18 --- /dev/null +++ b/test/performance/README.md @@ -0,0 +1,121 @@ +### Milo performance runner + +This performance suite provisions Milo/Etcd service monitors and measures CPU/Memory snapshots from VictoriaMetrics. + +Files and structure: +- performance/scripts/perf_run.py: runner script executed inside a Kubernetes Job +- performance/config/perf-runner-job.yaml: Job template for the run phase +- performance/config/perf-cleanup-job.yaml: Job template for cleanup +- performance/config/perf-runner-rbac.yaml: ServiceAccount/Role/RoleBinding used by the jobs + +#### Summary + +- Creates a Milo `Organization`, then N `Projects`, waits for all to be Ready, and times it. +- Takes metrics snapshots before (baseline), after projects are ready, and optionally after per-project object creation. +- Optionally creates M `Secrets` and K `ConfigMaps` in each Project (parallelized), then measures again. +- Saves results to a ConfigMap and downloads a local HTML report and JSON. + +#### Prerequisites + +1) Bring up dev stack and observability: + +```bash +task dev:setup && task dev:install-observability +``` + +2) Ensure a Milo kubeconfig secret exists in your cluster. By default the tasks mount `Secret/milo-controller-manager-kubeconfig` (key `kubeconfig`). You can override via env (see knobs below). + +#### How to run + +- Full run (org + projects + objects) with defaults: + +```bash +task perf:run +``` + +- Projects-only (skip secrets/configmaps) and higher parallelism: + +```bash +task perf:run -- RUN_OBJECTS_PHASE=false PROJECT_CONCURRENCY=10 +``` + +- Cleanup all resources from the last run: + +```bash +task perf:cleanup +``` + +#### Outputs + +- In-cluster: ConfigMap `perf-results` in `NS` (default `milo-system`) with keys `results.json`, `report.html`, `test_id`, `org_name`. +- Local: `reports/perf//results.json` and `report.html` downloaded by the task after the Job completes. The HTML report includes grouped bar charts (CPU cores and Memory MB) and per-project delta KPIs for apiserver and etcd. + +#### What the runner does + +1) Baseline: query VictoriaMetrics for Milo apiserver and etcd CPU/memory. +2) Create Organization (no wait), then create N Projects, wait for all Projects Ready; record duration. +3) Stabilize, then snapshot “after projects”. +4) If enabled, create per-Project objects (Secrets/ConfigMaps) concurrently; stabilize, then snapshot “after secrets+configmaps”. + +Snapshots come from VictoriaMetrics using `container_cpu_usage_seconds_total` (rate) and `container_memory_working_set_bytes` (avg_over_time) for pods matching the configured namespace and pod name regexes. + +#### Configuration knobs (env vars) + +Pass on the `task perf:run -- KEY=value ...` command line. Defaults shown in parentheses. + +- Resource selection + - `NS` (milo-system): Namespace to run Job and store results ConfigMap + - `MILO_NAMESPACE` (milo-system): Namespace to measure apiserver/etcd pods + - `APISERVER_POD_REGEX` (milo-apiserver.*): Regex for apiserver pods + - `ETCD_POD_REGEX` (etcd.*): Regex for etcd pods + +- Metrics source (VictoriaMetrics) + - `VM_NAMESPACE` (telemetry-system) + - `VM_SERVICE_NAME` (vmsingle-telemetry-system-vm-victoria-metrics-k8s-stack) + - `VM_PORT` (8428) + - `VM_BASE_URL` (optional override, e.g. http://hostname:8428). Default uses in-cluster FQDN: `http://..svc.cluster.local:8428`. 
+ - `MEASURE_WINDOW` (2m): Range window for rate/avg_over_time + +- Scale and workload + - `NUM_PROJECTS` (100) + - `RUN_OBJECTS_PHASE` (true): Toggle per-project Secrets/ConfigMaps phase + - `NUM_SECRETS_PER_PROJECT` (100) + - `NUM_CONFIGMAPS_PER_PROJECT` (100) + - `PROJECT_CONCURRENCY` (4): Number of projects processed in parallel when creating objects + - `OBJECT_CONCURRENCY` (8): Secrets/ConfigMaps parallelism inside each project + +- Stabilization windows + - `STABILIZE_SECONDS` (90): Sleep before snapshots after Projects and after Objects + +- Identity / scoping + - `ORG_NAME` (auto-generated): Name of Organization to create + - `MILO_KUBECONFIG_SECRET_NAME` (milo-controller-manager-kubeconfig): Secret containing Milo kubeconfig + - `MILO_KUBECONFIG_SECRET_KEY` (kubeconfig): Secret key with kubeconfig content + - `MILO_KUBECONFIG_PATH` (/work/milo-kubeconfig): In-container path to mount kubeconfig + - `AUTH_BEARER_TOKEN` (optional): Override token injected into kubeconfig user for troubleshooting + +#### Examples + +- Measure project-only impact: + +```bash +task perf:run -- RUN_OBJECTS_PHASE=false STABILIZE_SECONDS=60 NUM_PROJECTS=200 +``` + +- Heavier objects phase, more parallelism: + +```bash +task perf:run -- NUM_SECRETS_PER_PROJECT=500 NUM_CONFIGMAPS_PER_PROJECT=500 PROJECT_CONCURRENCY=12 OBJECT_CONCURRENCY=24 +``` + +- Point to a custom VictoriaMetrics endpoint: + +```bash +task perf:run -- VM_BASE_URL=http://vm.my-domain.local:8428 +``` + +- Use a specific Organization name: + +```bash +task perf:run -- ORG_NAME=perf-cow +``` diff --git a/test/performance/config/perf-cleanup-job.yaml b/test/performance/config/perf-cleanup-job.yaml new file mode 100644 index 00000000..7bc1b327 --- /dev/null +++ b/test/performance/config/perf-cleanup-job.yaml @@ -0,0 +1,52 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: perf-cleanup + namespace: NAMESPACE_PLACEHOLDER +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 0 + template: + spec: + serviceAccountName: perf-runner + restartPolicy: Never + containers: + - name: cleanup + image: python:3.11 + imagePullPolicy: IfNotPresent + env: + - name: TARGET_NAMESPACE + value: NAMESPACE_PLACEHOLDER + - name: RUN_MODE + value: cleanup + - name: TEST_ID + value: "TEST_ID_PLACEHOLDER" + - name: ORG_NAME + value: "ORG_NAME_PLACEHOLDER" + - name: MILO_KUBECONFIG_PATH + value: "/work/milo-kubeconfig" + volumeMounts: + - name: script + mountPath: /work/perf_run.py + subPath: perf_run.py + readOnly: true + - name: milo-kubeconfig + mountPath: /work/milo-kubeconfig + subPath: MILO_KUBECONFIG_KEY_PLACEHOLDER + readOnly: true + command: ["bash","-lc"] + args: + - >- + python -m pip install --no-cache-dir kubernetes requests pyyaml && + python -u /work/perf_run.py + volumes: + - name: script + configMap: + name: perf-script + defaultMode: 0444 + - name: milo-kubeconfig + secret: + secretName: MILO_KUBECONFIG_SECRET_PLACEHOLDER + defaultMode: 0400 + + diff --git a/test/performance/config/perf-runner-job.yaml b/test/performance/config/perf-runner-job.yaml new file mode 100644 index 00000000..f9c1da5c --- /dev/null +++ b/test/performance/config/perf-runner-job.yaml @@ -0,0 +1,76 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: perf-runner + namespace: NAMESPACE_PLACEHOLDER +spec: + ttlSecondsAfterFinished: 600 + backoffLimit: 0 + template: + spec: + serviceAccountName: perf-runner + restartPolicy: Never + containers: + - name: runner + image: python:3.11 + imagePullPolicy: IfNotPresent + env: + - name: TARGET_NAMESPACE + value: NAMESPACE_PLACEHOLDER 
+ - name: VM_BASE_URL + value: VM_BASE_URL_PLACEHOLDER + - name: MILO_NAMESPACE + value: MILO_NAMESPACE_PLACEHOLDER + - name: APISERVER_POD_REGEX + value: APISERVER_REGEX_PLACEHOLDER + - name: ETCD_POD_REGEX + value: ETCD_REGEX_PLACEHOLDER + - name: NUM_PROJECTS + value: "NUM_PROJECTS_PLACEHOLDER" + - name: NUM_SECRETS_PER_PROJECT + value: "NUM_SECRETS_PLACEHOLDER" + - name: NUM_CONFIGMAPS_PER_PROJECT + value: "NUM_CONFIGMAPS_PLACEHOLDER" + - name: STABILIZE_SECONDS + value: "STABILIZE_SECONDS_PLACEHOLDER" + - name: MEASURE_WINDOW + value: "MEASURE_WINDOW_PLACEHOLDER" + - name: ORG_NAME + value: "ORG_NAME_PLACEHOLDER" + - name: PROJECT_CONCURRENCY + value: "PROJECT_CONCURRENCY_PLACEHOLDER" + - name: OBJECT_CONCURRENCY + value: "OBJECT_CONCURRENCY_PLACEHOLDER" + - name: RUN_OBJECTS_PHASE + value: "RUN_OBJECTS_PHASE_PLACEHOLDER" + - name: OUT_DIR + value: "/work/out" + - name: MPLBACKEND + value: Agg + - name: MILO_KUBECONFIG_PATH + value: "/work/milo-kubeconfig" + volumeMounts: + - name: script + mountPath: /work/perf_run.py + subPath: perf_run.py + readOnly: true + - name: milo-kubeconfig + mountPath: /work/milo-kubeconfig + subPath: MILO_KUBECONFIG_KEY_PLACEHOLDER + readOnly: true + command: ["bash","-lc"] + args: + - >- + python -m pip install --no-cache-dir kubernetes requests pyyaml matplotlib && + python -u /work/perf_run.py + volumes: + - name: script + configMap: + name: perf-script + defaultMode: 0444 + - name: milo-kubeconfig + secret: + secretName: MILO_KUBECONFIG_SECRET_PLACEHOLDER + defaultMode: 0400 + + diff --git a/test/performance/config/perf-runner-rbac.yaml b/test/performance/config/perf-runner-rbac.yaml new file mode 100644 index 00000000..9fad73bb --- /dev/null +++ b/test/performance/config/perf-runner-rbac.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: perf-runner + namespace: NAMESPACE_PLACEHOLDER +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: perf-results-manager + namespace: NAMESPACE_PLACEHOLDER +rules: +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: perf-results-manager-binding + namespace: NAMESPACE_PLACEHOLDER +subjects: +- kind: ServiceAccount + name: perf-runner + namespace: NAMESPACE_PLACEHOLDER +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: perf-results-manager diff --git a/test/performance/scripts/perf_run.py b/test/performance/scripts/perf_run.py new file mode 100644 index 00000000..12990ffd --- /dev/null +++ b/test/performance/scripts/perf_run.py @@ -0,0 +1,905 @@ +import base64 +import json +import concurrent.futures +import os +import sys +import time +import uuid +from datetime import datetime, timezone +from io import BytesIO + +import requests +import urllib3 +import yaml +from kubernetes import client as k8s_client +from kubernetes import config as k8s_config +from kubernetes.client import ApiException + + +def get_env(name: str, default: str | None = None) -> str: + value = os.getenv(name, default) + if value is None: + print(f"Missing required env var: {name}", file=sys.stderr) + sys.exit(1) + return value + + +def parse_bool(value: str | None, default: bool = True) -> bool: + if value is None: + return default + return value.strip().lower() in {"1", "true", "t", "yes", "y", "on"} + + +def load_yaml_file(path: str) -> dict: + with open(path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) + + +# Reduce noisy TLS 
warnings from in-cluster/self-signed configs +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +try: + # Also quiet requests' vendored urllib3, if present + requests.packages.urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning) # type: ignore +except Exception: + pass + + +def log(message: str) -> None: + now = datetime.now(timezone.utc).strftime("%H:%M:%S") + print(f"[{now}] {message}", flush=True) + + +def should_retry_api_exception(e: ApiException) -> bool: + try: + if e.status in (429, 500): + return True + body = getattr(e, 'body', '') or '' + msg = str(body) + # Handle webhook EOF/transient failures + if 'Internal error occurred' in msg or 'failed calling webhook' in msg or 'EOF' in msg: + return True + except Exception: + pass + return False + + +def retry_with_backoff(action_name: str, fn, *args, **kwargs): + delay = 1.0 + for attempt in range(1, 7): + try: + return fn(*args, **kwargs) + except ApiException as e: + if should_retry_api_exception(e) and attempt < 6: + log(f"{action_name} failed (attempt {attempt}/6): status={e.status} retrying in {delay:.0f}s …") + time.sleep(delay) + delay = min(delay * 2, 16) + continue + raise + + +def save_results_configmap(namespace: str, name: str, data: dict[str, str]) -> None: + # Uses in-cluster config to write to Kubernetes ConfigMap + try: + k8s_config.load_incluster_config() + except Exception: + # Fallback to default kubeconfig for local runs + k8s_config.load_kube_config() + + v1 = k8s_client.CoreV1Api() + metadata = k8s_client.V1ObjectMeta(name=name) + cm = k8s_client.V1ConfigMap(metadata=metadata, data=data) + + try: + existing = v1.read_namespaced_config_map(name=name, namespace=namespace) + existing.data = data + v1.replace_namespaced_config_map(name=name, namespace=namespace, body=existing) + except ApiException as e: + if e.status == 404: + v1.create_namespaced_config_map(namespace=namespace, body=cm) + else: + raise + + +def save_checkpoint(namespace: str, test_id: str, org_name: str, phase: str, extra: dict | None = None) -> None: + data: dict[str, str] = { + "test_id": test_id, + "org_name": org_name, + "phase": phase, + } + if extra: + for k, v in extra.items(): + try: + data[k] = json.dumps(v) if isinstance(v, (dict, list)) else str(v) + except Exception: + data[k] = str(v) + save_results_configmap(namespace, "perf-results", data) + + +def http_get_json(url: str, params: dict | None = None) -> dict: + # Basic retry for transient VM connectivity (EOF, connection reset, etc.) 
+ last_err: Exception | None = None + for attempt in range(6): + try: + resp = requests.get(url, params=params, timeout=30) + resp.raise_for_status() + return resp.json() + except Exception as e: + last_err = e + sleep_s = 2 * (attempt + 1) + log(f"[metrics] request failed (attempt {attempt+1}/6): {e}; retrying in {sleep_s}s") + time.sleep(sleep_s) + assert last_err is not None + raise last_err + + +def prom_query(base_url: str, query: str, context: str | None = None) -> float: + url = f"{base_url.rstrip('/')}/api/v1/query" + start = time.time() + data = http_get_json(url, params={"query": query}) + duration = time.time() - start + if context: + log(f"[metrics] {context} took {duration:.1f}s") + if data.get("status") != "success": + raise RuntimeError(f"Prom query failed: {data}") + result = data.get("data", {}).get("result", []) + if not result: + if context: + log(f"[metrics] {context} returned empty result") + return 0.0 + # Use the first scalar/vector value (sum/avg queries should return single series) + value = float(result[0]["value"][1]) + if context: + log(f"[metrics] {context} value={value}") + return value + + +def measure_metrics(base_url: str, namespace: str, apiserver_regex: str, etcd_regex: str, window: str) -> dict: + # CPU is in cores (rate over window). Memory in bytes (avg over window) + log(f"[metrics] VM_BASE_URL={base_url} namespace={namespace} window={window}") + log("[metrics] querying apiserver cpu/memory and etcd cpu/memory …") + # Pre-flight series counts to aid debugging + try: + ns_cpu_series = prom_query( + base_url, + f'count(container_cpu_usage_seconds_total{{namespace="{namespace}"}})', + context="series_count_ns_cpu", + ) + ns_mem_series = prom_query( + base_url, + f'count(container_memory_working_set_bytes{{namespace="{namespace}"}})', + context="series_count_ns_mem", + ) + apiserver_series = prom_query( + base_url, + f'count(container_cpu_usage_seconds_total{{namespace="{namespace}",pod=~"{apiserver_regex}"}})', + context="series_count_apiserver", + ) + etcd_series = prom_query( + base_url, + f'count(container_cpu_usage_seconds_total{{namespace="{namespace}",pod=~"{etcd_regex}"}})', + context="series_count_etcd", + ) + log( + f"[metrics] series counts: ns_cpu={ns_cpu_series} ns_mem={ns_mem_series} apiserver={apiserver_series} etcd={etcd_series}" + ) + except Exception as e: + log(f"[metrics] pre-flight series counts failed: {e}") + def run_queries(pod_label: str, include_container_filter: bool) -> dict[str, float]: + container_filter = 'container!="",container!="POD"' if include_container_filter else '' + comma = ',' if include_container_filter else '' + # Build label selectors without f-strings to avoid brace escaping issues + apiserver_selector = '{namespace="%s",%s=~"%s"%s%s}' % ( + namespace, + pod_label, + apiserver_regex, + comma, + container_filter, + ) + etcd_selector = '{namespace="%s",%s=~"%s"%s%s}' % ( + namespace, + pod_label, + etcd_regex, + comma, + container_filter, + ) + queries: dict[str, str] = { + "apiserver_cpu_cores": 'sum(rate(container_cpu_usage_seconds_total%s[%s]))' % (apiserver_selector, window), + "apiserver_mem_bytes": 'sum(avg_over_time(container_memory_working_set_bytes%s[%s]))' % (apiserver_selector, window), + "etcd_cpu_cores": 'sum(rate(container_cpu_usage_seconds_total%s[%s]))' % (etcd_selector, window), + "etcd_mem_bytes": 'sum(avg_over_time(container_memory_working_set_bytes%s[%s]))' % (etcd_selector, window), + } + for k, q in queries.items(): + log(f"[metrics] query[{k}] (label={pod_label}, 
filter={include_container_filter}): {q}") + results: dict[str, float] = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + future_to_key = {executor.submit(prom_query, base_url, query, k): k for k, query in queries.items()} + for future in concurrent.futures.as_completed(future_to_key): + k = future_to_key[future] + try: + results[k] = future.result() + except Exception as e: + log(f"[metrics] query failed for {k}: {e}; using 0") + results[k] = 0.0 + return results + + # Try multiple label variants and filters; return first non-zero set + for pod_label, with_filter in [("pod", True), ("pod", False), ("pod_name", True), ("pod_name", False)]: + log(f"[metrics] attempting with pod_label={pod_label} filter={with_filter}") + results = run_queries(pod_label, with_filter) + if any(v != 0.0 for v in results.values()): + return results + + log("[metrics] all query variants returned 0; returning zeros") + return {"apiserver_cpu_cores": 0.0, "apiserver_mem_bytes": 0.0, "etcd_cpu_cores": 0.0, "etcd_mem_bytes": 0.0} + + +def find_condition(conditions: list[dict] | None, ctype: str) -> dict | None: + if not conditions: + return None + for c in conditions: + if c.get("type") == ctype: + return c + return None + + +def wait_condition_ready( + coapi: k8s_client.CustomObjectsApi, + group: str, + version: str, + plural: str, + name: str, + timeout_s: int = 600, + log_context: str | None = None, +) -> None: + start = time.time() + deadline = start + timeout_s + last_log = 0.0 + while time.time() < deadline: + obj = coapi.get_cluster_custom_object(group=group, version=version, plural=plural, name=name) + cond = find_condition(obj.get("status", {}).get("conditions"), "Ready") + if cond and str(cond.get("status")) == "True": + return + # Periodic detail log to explain why we're still waiting + now = time.time() + if now - start >= 10 and now - last_log >= 15: + last_log = now + if cond: + reason = cond.get("reason", "") + message = cond.get("message", "") + status = cond.get("status", "Unknown") + ctx = f"{plural}/{name}" if not log_context else f"{log_context} ({plural}/{name})" + log(f"waiting for {ctx}: Ready={status} reason={reason} message={message}") + else: + ctx = f"{plural}/{name}" if not log_context else f"{log_context} ({plural}/{name})" + log(f"waiting for {ctx}: no Ready condition yet") + time.sleep(2) + raise TimeoutError(f"Timed out waiting for {plural}/{name} Ready") + + +def build_scoped_kubeconfig(base_cfg: dict, scope_path: str, new_name: str) -> dict: + cfg = yaml.safe_load(yaml.safe_dump(base_cfg)) # deep copy + for c in cfg.get("clusters", []): + server = c["cluster"].get("server", "").rstrip("/") + c["name"] = new_name + c["cluster"]["server"] = f"{server}{scope_path}" + # context names follow cluster names + if cfg.get("contexts"): + for ctx in cfg["contexts"]: + ctx["name"] = new_name + ctx["context"]["cluster"] = new_name + if cfg.get("current-context"): + cfg["current-context"] = new_name + return cfg + + +def kube_client_from_config(cfg: dict): + # Load a kubernetes client from a kubeconfig dict (not a file) + loader = k8s_config.kube_config.KubeConfigLoader(config_dict=cfg) + configuration = k8s_client.Configuration() + loader.load_and_set(configuration) + return k8s_client.ApiClient(configuration) + + +def create_org_and_projects(milo_kubeconfig_path: str, org_name: str, num_projects: int, labels: dict[str, str]) -> tuple[dict, list[str], float]: + base_cfg = load_yaml_file(milo_kubeconfig_path) + # Optional override to inject a bearer token for auth 
troubleshooting + override_token = os.getenv("AUTH_BEARER_TOKEN") + if override_token: + try: + if base_cfg.get("users"): + base_cfg["users"][0]["user"]["token"] = override_token + log("Using AUTH_BEARER_TOKEN override for kubeconfig user[0]") + except Exception: + pass + # Client for Milo API server (cluster-scoped CRDs) + api_client = kube_client_from_config(base_cfg) + coapi = k8s_client.CustomObjectsApi(api_client) + try: + cluster_server = base_cfg.get("clusters", [{}])[0].get("cluster", {}).get("server", "") + user_name = base_cfg.get("users", [{}])[0].get("name", "") + log(f"Using cluster-scoped kubeconfig: server={cluster_server} user={user_name}") + except Exception: + pass + + # Create Organization + log(f"Creating Organization '{org_name}' …") + org_body = { + "apiVersion": "resourcemanager.miloapis.com/v1alpha1", + "kind": "Organization", + "metadata": {"name": org_name, "labels": labels}, + "spec": {"type": "Standard"}, + } + try: + retry_with_backoff( + "create Organization", + coapi.create_cluster_custom_object, + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="organizations", + body=org_body, + ) + except ApiException as e: + if e.status != 409: + raise + + # Do not wait for Organization readiness; proceed immediately to projects + log(f"Organization '{org_name}' created") + + # Build an organization-scoped kubeconfig so requests carry parent context + org_scope_path = f"/apis/resourcemanager.miloapis.com/v1alpha1/organizations/{org_name}/control-plane" + org_cfg = build_scoped_kubeconfig(base_cfg, org_scope_path, new_name=f"organization-{org_name}") + org_client = kube_client_from_config(org_cfg) + org_coapi = k8s_client.CustomObjectsApi(org_client) + try: + org_server = org_cfg.get("clusters", [{}])[0].get("cluster", {}).get("server", "") + org_user = org_cfg.get("users", [{}])[0].get("name", "") + log(f"Using organization-scoped kubeconfig: server={org_server} user={org_user}") + except Exception: + pass + + # Create Projects + project_names: list[str] = [] + start = time.time() + log(f"Creating {num_projects} Projects …") + for i in range(1, num_projects + 1): + pname = f"{org_name}-p-{i:03d}" + project_names.append(pname) + proj_body = { + "apiVersion": "resourcemanager.miloapis.com/v1alpha1", + "kind": "Project", + "metadata": {"name": pname, "labels": labels}, + "spec": {"ownerRef": {"kind": "Organization", "name": org_name}}, + } + try: + retry_with_backoff( + f"create Project {pname}", + org_coapi.create_cluster_custom_object, + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="projects", + body=proj_body, + ) + except ApiException as e: + if e.status != 409: + log(f"error creating Project '{pname}': {getattr(e, 'body', e)}") + raise + if i % 10 == 0 or i == num_projects: + log(f"Created {i}/{num_projects} Projects …") + + # Wait for all projects Ready + ready = 0 + log("Waiting for Projects to become Ready …") + for pname in project_names: + wait_condition_ready(org_coapi, "resourcemanager.miloapis.com", "v1alpha1", "projects", pname, timeout_s=900, log_context="Project") + ready += 1 + if ready % 10 == 0 or ready == len(project_names): + log(f"Projects Ready: {ready}/{len(project_names)} …") + end = time.time() + total_seconds = end - start + + # Return base kubeconfig (for building scoped configs), project names, and duration + return base_cfg, project_names, total_seconds + + +def create_objects_in_projects( + base_cfg: dict, + org_name: str, + project_names: list[str], + num_secrets: int, + num_configmaps: int, + 
labels: dict[str, str], + project_concurrency: int, + object_concurrency: int, +) -> None: + def work_project(pname: str) -> None: + scope_path = f"/apis/resourcemanager.miloapis.com/v1alpha1/projects/{pname}/control-plane" + proj_cfg = build_scoped_kubeconfig(base_cfg, scope_path, new_name=f"project-{pname}") + proj_client = kube_client_from_config(proj_cfg) + v1 = k8s_client.CoreV1Api(proj_client) + + def create_secret(i: int) -> None: + sname = f"perf-secret-{i:03d}" + body = k8s_client.V1Secret( + metadata=k8s_client.V1ObjectMeta(name=sname, labels=labels), + string_data={"note": f"secret {i} for {pname}"}, + type="Opaque", + ) + try: + v1.create_namespaced_secret(namespace="default", body=body) + except ApiException as e: + if e.status != 409: + raise + + def create_configmap(i: int) -> None: + cname = f"perf-configmap-{i:03d}" + body = k8s_client.V1ConfigMap( + metadata=k8s_client.V1ObjectMeta(name=cname, labels=labels), + data={"note": f"configmap {i} for {pname}"}, + ) + try: + v1.create_namespaced_config_map(namespace="default", body=body) + except ApiException as e: + if e.status != 409: + raise + + log(f"[{pname}] Creating {num_secrets} Secrets (concurrency={object_concurrency}) …") + if num_secrets > 0: + with concurrent.futures.ThreadPoolExecutor(max_workers=object_concurrency) as ex: + list(ex.map(create_secret, range(1, num_secrets + 1))) + log(f"[{pname}] Secrets created: {num_secrets}/{num_secrets}") + + log(f"[{pname}] Creating {num_configmaps} ConfigMaps (concurrency={object_concurrency}) …") + if num_configmaps > 0: + with concurrent.futures.ThreadPoolExecutor(max_workers=object_concurrency) as ex: + list(ex.map(create_configmap, range(1, num_configmaps + 1))) + log(f"[{pname}] ConfigMaps created: {num_configmaps}/{num_configmaps}") + + # Run multiple projects in parallel + if project_concurrency <= 1: + for pname in project_names: + work_project(pname) + else: + log(f"Creating objects across projects (concurrency={project_concurrency}) …") + with concurrent.futures.ThreadPoolExecutor(max_workers=project_concurrency) as ex: + list(ex.map(work_project, project_names)) + + +def generate_report_html( + metrics_before: dict, + metrics_after_projects: dict, + metrics_after_secrets: dict | None, + num_projects: int, + projects_ready_seconds: float, +) -> str: + # Minimal inline charts using simple ASCII bars as fallback if matplotlib unavailable + try: + import matplotlib.pyplot as plt # type: ignore + + def grouped_bars_png( + series: list[tuple[str, list[float]]], + categories: list[str], + title: str, + ylabel: str, + ) -> str: + fig, ax = plt.subplots(figsize=(7.5, 3.5)) + num_series = len(series) + x = range(len(categories)) + total_bar_width = 0.8 + bar_width = total_bar_width / max(1, num_series) + offsets = [(-total_bar_width / 2) + (i + 0.5) * bar_width for i in range(num_series)] + colors = ["#4c78a8", "#f58518", "#54a24b"] + for idx, (label, values) in enumerate(series): + ax.bar([xi + offsets[idx] for xi in x], values, width=bar_width, label=label, color=colors[idx % len(colors)]) + ax.set_title(title) + ax.set_ylabel(ylabel) + ax.set_xticks(list(x)) + ax.set_xticklabels(categories) + ax.legend(loc="upper left", fontsize=8) + plt.tight_layout() + buf = BytesIO() + plt.savefig(buf, format="png") + plt.close(fig) + b64 = base64.b64encode(buf.getvalue()).decode("ascii") + return f"{title}" + + # Build series for CPU (cores) and Memory (MB) + cpu_series: list[tuple[str, list[float]]] = [ + ("baseline", [metrics_before["apiserver_cpu_cores"], 
metrics_before["etcd_cpu_cores"]]), + ("after-projects", [metrics_after_projects["apiserver_cpu_cores"], metrics_after_projects["etcd_cpu_cores"]]), + ] + mem_series: list[tuple[str, list[float]]] = [ + ( + "baseline", + [metrics_before["apiserver_mem_bytes"] / (1024 * 1024), metrics_before["etcd_mem_bytes"] / (1024 * 1024)], + ), + ( + "after-projects", + [ + metrics_after_projects["apiserver_mem_bytes"] / (1024 * 1024), + metrics_after_projects["etcd_mem_bytes"] / (1024 * 1024), + ], + ), + ] + if metrics_after_secrets is not None: + cpu_series.append( + ( + "after-objects", + [metrics_after_secrets["apiserver_cpu_cores"], metrics_after_secrets["etcd_cpu_cores"]], + ) + ) + mem_series.append( + ( + "after-objects", + [ + metrics_after_secrets["apiserver_mem_bytes"] / (1024 * 1024), + metrics_after_secrets["etcd_mem_bytes"] / (1024 * 1024), + ], + ) + ) + + cpu_img = grouped_bars_png(cpu_series, ["apiserver", "etcd"], "CPU (cores)", "cores") + mem_img = grouped_bars_png(mem_series, ["apiserver", "etcd"], "Memory (MB)", "MB") + + # Quick stats and deltas + t_total = projects_ready_seconds + per_project_s = (t_total / num_projects) if num_projects > 0 else 0.0 + def delta(a: float, b: float) -> float: + return b - a + apiserver_mem_delta_mb = delta( + metrics_before["apiserver_mem_bytes"] / (1024 * 1024), + metrics_after_projects["apiserver_mem_bytes"] / (1024 * 1024), + ) + etcd_mem_delta_mb = delta( + metrics_before["etcd_mem_bytes"] / (1024 * 1024), + metrics_after_projects["etcd_mem_bytes"] / (1024 * 1024), + ) + apiserver_cpu_delta = delta(metrics_before["apiserver_cpu_cores"], metrics_after_projects["apiserver_cpu_cores"]) + etcd_cpu_delta = delta(metrics_before["etcd_cpu_cores"], metrics_after_projects["etcd_cpu_cores"]) + + # Per-project implications (naive average impact per created Project) + per_proj_cpu_apiserver = (apiserver_cpu_delta / num_projects) if num_projects > 0 else 0.0 + per_proj_cpu_etcd = (etcd_cpu_delta / num_projects) if num_projects > 0 else 0.0 + per_proj_mem_apiserver_mb = (apiserver_mem_delta_mb / num_projects) if num_projects > 0 else 0.0 + per_proj_mem_etcd_mb = (etcd_mem_delta_mb / num_projects) if num_projects > 0 else 0.0 + + after_objects_stats = "" + if metrics_after_secrets is not None: + apiserver_mem_delta_mb_obj = delta( + metrics_after_projects["apiserver_mem_bytes"] / (1024 * 1024), + metrics_after_secrets["apiserver_mem_bytes"] / (1024 * 1024), + ) + etcd_mem_delta_mb_obj = delta( + metrics_after_projects["etcd_mem_bytes"] / (1024 * 1024), + metrics_after_secrets["etcd_mem_bytes"] / (1024 * 1024), + ) + apiserver_cpu_delta_obj = delta( + metrics_after_projects["apiserver_cpu_cores"], metrics_after_secrets["apiserver_cpu_cores"] + ) + etcd_cpu_delta_obj = delta( + metrics_after_projects["etcd_cpu_cores"], metrics_after_secrets["etcd_cpu_cores"] + ) + per_proj_cpu_apiserver_obj = (apiserver_cpu_delta_obj / num_projects) if num_projects > 0 else 0.0 + per_proj_cpu_etcd_obj = (etcd_cpu_delta_obj / num_projects) if num_projects > 0 else 0.0 + per_proj_mem_apiserver_mb_obj = (apiserver_mem_delta_mb_obj / num_projects) if num_projects > 0 else 0.0 + per_proj_mem_etcd_mb_obj = (etcd_mem_delta_mb_obj / num_projects) if num_projects > 0 else 0.0 + after_objects_stats = f""" +
+<div class="section">
+  <h3>After objects (per-project deltas)</h3>
+  <div class="kpis">
+    <div class="kpi"><span>CPU apiserver</span> <strong>{per_proj_cpu_apiserver_obj:+.4f} cores</strong></div>
+    <div class="kpi"><span>CPU etcd</span> <strong>{per_proj_cpu_etcd_obj:+.4f} cores</strong></div>
+    <div class="kpi"><span>MEM apiserver</span> <strong>{per_proj_mem_apiserver_mb_obj:+.2f} MB</strong></div>
+    <div class="kpi"><span>MEM etcd</span> <strong>{per_proj_mem_etcd_mb_obj:+.2f} MB</strong></div>
+  </div>
+</div>
+"""
+
+        html = """
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="utf-8"/>
+    <title>Milo Performance Report</title>
+  </head>
+  <body style="font-family: sans-serif; margin: 24px;">
+    <h1>Milo Performance Report</h1>
+
+    <p>Projects: {num_projects} • Time to Ready: {t_total:.1f}s ({per_project_s:.2f}s/project)</p>
+
+    <div class="charts">
+      <div>{cpu_img}</div>
+      <div>{mem_img}</div>
+    </div>
+
+    <div class="section">
+      <h3>After-projects (per-project deltas)</h3>
+      <div class="kpis">
+        <div class="kpi"><span>CPU apiserver</span> <strong>{per_proj_cpu_apiserver:+.4f} cores</strong></div>
+        <div class="kpi"><span>CPU etcd</span> <strong>{per_proj_cpu_etcd:+.4f} cores</strong></div>
+        <div class="kpi"><span>MEM apiserver</span> <strong>{per_proj_mem_apiserver_mb:+.2f} MB</strong></div>
+        <div class="kpi"><span>MEM etcd</span> <strong>{per_proj_mem_etcd_mb:+.2f} MB</strong></div>
+      </div>
+    </div>
+
+    {after_objects_stats}
+  </body>
+</html>
+"""
+        return html.format(
+            cpu_img=cpu_img,
+            mem_img=mem_img,
+            num_projects=num_projects,
+            t_total=t_total,
+            per_project_s=per_project_s,
+            apiserver_mem_delta_mb=apiserver_mem_delta_mb,
+            etcd_mem_delta_mb=etcd_mem_delta_mb,
+            apiserver_cpu_delta=apiserver_cpu_delta,
+            etcd_cpu_delta=etcd_cpu_delta,
+            per_proj_cpu_apiserver=per_proj_cpu_apiserver,
+            per_proj_cpu_etcd=per_proj_cpu_etcd,
+            per_proj_mem_apiserver_mb=per_proj_mem_apiserver_mb,
+            per_proj_mem_etcd_mb=per_proj_mem_etcd_mb,
+            after_objects_stats=after_objects_stats,
+        )
+    except Exception as e:  # Fallback text-only report with error context
+        payload: dict[str, object] = {
+            "baseline": metrics_before,
+            "after_projects": metrics_after_projects,
+            "num_projects": num_projects,
+            "projects_ready_seconds": projects_ready_seconds,
+        }
+        if metrics_after_secrets is not None:
+            payload["after_secrets"] = metrics_after_secrets
+        return (
+            "<pre>chart rendering unavailable; showing raw metrics\n\n"
+            + "error: "
+            + str(e)
+            + "\n\n"
+            + json.dumps(payload, indent=2)
+            + "</pre>
" + ) + + +def cleanup_resources(milo_kubeconfig_path: str, test_id: str, org_name: str) -> None: + base_cfg = load_yaml_file(milo_kubeconfig_path) + api_client = kube_client_from_config(base_cfg) + coapi = k8s_client.CustomObjectsApi(api_client) + + # Use organization-scoped client for project-scoped operations (admission context) + org_scope_path = f"/apis/resourcemanager.miloapis.com/v1alpha1/organizations/{org_name}/control-plane" + org_cfg = build_scoped_kubeconfig(base_cfg, org_scope_path, new_name=f"organization-{org_name}") + org_client = kube_client_from_config(org_cfg) + org_coapi = k8s_client.CustomObjectsApi(org_client) + + # List and delete projects by label + proj_list = coapi.list_cluster_custom_object( + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="projects", + label_selector=f"app=milo-perf,test-id={test_id}", + ) + for item in proj_list.get("items", []): + pname = item["metadata"]["name"] + log(f"[cleanup] project {pname}: deleting Secrets/ConfigMaps and Project …") + # Delete per-project objects first (tolerate not found) + try: + proj_scope_path = f"/apis/resourcemanager.miloapis.com/v1alpha1/projects/{pname}/control-plane" + proj_cfg = build_scoped_kubeconfig(base_cfg, proj_scope_path, new_name=f"project-{pname}") + proj_client = kube_client_from_config(proj_cfg) + v1 = k8s_client.CoreV1Api(proj_client) + label_sel = f"app=milo-perf,test-id={test_id}" + # Secrets + try: + sl = v1.list_namespaced_secret(namespace="default", label_selector=label_sel) + for s in sl.items or []: + try: + v1.delete_namespaced_secret(name=s.metadata.name, namespace="default") + except ApiException as e: + if e.status != 404: + raise + except ApiException: + pass + # ConfigMaps + try: + cml = v1.list_namespaced_config_map(namespace="default", label_selector=label_sel) + for cm in cml.items or []: + try: + v1.delete_namespaced_config_map(name=cm.metadata.name, namespace="default") + except ApiException as e: + if e.status != 404: + raise + except ApiException: + pass + except Exception: + # Keep going even if per-project object cleanup fails + pass + + # Delete the Project (use org-scoped API for proper parent context) + try: + org_coapi.delete_cluster_custom_object( + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="projects", + name=pname, + ) + except ApiException as e: + if e.status not in (404, 409): + raise + + # Delete organization last + try: + coapi.delete_cluster_custom_object( + group="resourcemanager.miloapis.com", + version="v1alpha1", + plural="organizations", + name=org_name, + ) + except ApiException as e: + if e.status not in (404, 409): + raise + + +def main() -> None: + run_mode = os.getenv("RUN_MODE", "run").lower() + target_ns = get_env("TARGET_NAMESPACE", "milo-system") + + if run_mode == "cleanup": + milo_kubeconfig_path = get_env("MILO_KUBECONFIG_PATH", "/work/milo-kubeconfig") + test_id = get_env("TEST_ID") + org_name = get_env("ORG_NAME") + cleanup_resources(milo_kubeconfig_path, test_id=test_id, org_name=org_name) + # Remove results ConfigMap if present + try: + k8s_config.load_incluster_config() + except Exception: + k8s_config.load_kube_config() + v1 = k8s_client.CoreV1Api() + try: + v1.delete_namespaced_config_map(name="perf-results", namespace=target_ns) + except ApiException as e: + if e.status != 404: + raise + print("Cleanup complete") + return + + # RUN + milo_kubeconfig_path = get_env("MILO_KUBECONFIG_PATH", "/work/milo-kubeconfig") + milo_metrics_ns = get_env("MILO_NAMESPACE", "milo-system") + vm_base_url = 
get_env("VM_BASE_URL") + apiserver_regex = get_env("APISERVER_POD_REGEX", "milo-apiserver.*") + etcd_regex = get_env("ETCD_POD_REGEX", "etcd.*") + window = get_env("MEASURE_WINDOW", "2m") + stabilize_seconds = int(get_env("STABILIZE_SECONDS", "90")) + num_projects = int(get_env("NUM_PROJECTS", "100")) + num_secrets = int(get_env("NUM_SECRETS_PER_PROJECT", "100")) + num_configmaps = int(get_env("NUM_CONFIGMAPS_PER_PROJECT", "100")) + project_concurrency = int(os.getenv("PROJECT_CONCURRENCY", "4")) + object_concurrency = int(os.getenv("OBJECT_CONCURRENCY", "8")) + run_objects_phase = parse_bool(os.getenv("RUN_OBJECTS_PHASE", "true"), default=True) + out_dir = os.getenv("OUT_DIR", "/work/out") + + test_id = uuid.uuid4().hex[:8] + _org_env = os.getenv("ORG_NAME") + org_name = _org_env.strip() if (_org_env is not None and _org_env.strip() != "") else f"perf-{test_id}" + labels = {"app": "milo-perf", "test-id": test_id} + + # Initial checkpoint (so cleanup works even if run aborts early) + save_checkpoint(target_ns, test_id, org_name, phase="init", extra={"num_projects": num_projects}) + + # Baseline metrics (no pre-stabilization) + log("Measuring baseline metrics …") + baseline = measure_metrics(vm_base_url, milo_metrics_ns, apiserver_regex, etcd_regex, window) + + # Create org & projects + log(f"Creating org '{org_name}' and {num_projects} projects …") + base_cfg, project_names, projects_ready_seconds = create_org_and_projects( + milo_kubeconfig_path, org_name, num_projects, labels + ) + # Update checkpoint that org exists (projects will be created next) + save_checkpoint(target_ns, test_id, org_name, phase="org-created") + + # After projects metrics + if stabilize_seconds > 0: + log(f"Stabilizing for {stabilize_seconds}s after projects are Ready …") + time.sleep(stabilize_seconds) + log("Measuring metrics after projects are Ready …") + after_projects = measure_metrics(vm_base_url, milo_metrics_ns, apiserver_regex, etcd_regex, window) + # Update checkpoint after projects are ready + save_checkpoint(target_ns, test_id, org_name, phase="projects-ready", extra={"num_projects": num_projects}) + + after_secrets = None + if run_objects_phase: + # Create objects within each project + log(f"Creating {num_secrets} secrets and {num_configmaps} configmaps per project …") + create_objects_in_projects( + base_cfg, + org_name, + project_names, + num_secrets, + num_configmaps, + labels, + project_concurrency, + object_concurrency, + ) + + # After secrets/configmaps metrics + if stabilize_seconds > 0: + log(f"Stabilizing for {stabilize_seconds}s after secrets/configmaps …") + time.sleep(stabilize_seconds) + log("Measuring metrics after creating secrets/configmaps …") + after_secrets = measure_metrics(vm_base_url, milo_metrics_ns, apiserver_regex, etcd_regex, window) + + # Build results + now_iso = datetime.now(timezone.utc).isoformat() + results = { + "test_id": test_id, + "timestamp": now_iso, + "org_name": org_name, + "num_projects": num_projects, + "num_secrets_per_project": num_secrets, + "num_configmaps_per_project": num_configmaps, + "projects_ready_seconds": projects_ready_seconds, + "metrics": { + "baseline": baseline, + "after_projects": after_projects, + "after_secrets": after_secrets, + }, + } + + report_html = generate_report_html( + baseline, + after_projects, + after_secrets, + num_projects, + projects_ready_seconds, + ) + + # Persist results to files (Task will copy locally and publish a ConfigMap) + try: + os.makedirs(out_dir, exist_ok=True) + with open(os.path.join(out_dir, "results.json"), 
"w", encoding="utf-8") as f: + json.dump(results, f, indent=2) + with open(os.path.join(out_dir, "report.html"), "w", encoding="utf-8") as f: + f.write(report_html) + with open(os.path.join(out_dir, "meta.txt"), "w", encoding="utf-8") as f: + f.write(f"test_id={test_id}\norg_name={org_name}\n") + log(f"Results written to {out_dir}") + except Exception as e: + log(f"Failed to write results to {out_dir}: {e}") + + # Best-effort attempt to also write a ConfigMap (may fail if SA lacks RBAC) + try: + cm_data = { + "results.json": json.dumps(results, indent=2), + "report.html": report_html, + "test_id": test_id, + "org_name": org_name, + } + save_results_configmap(target_ns, "perf-results", cm_data) + log("Also saved results to ConfigMap 'perf-results'") + except Exception as e: + log(f"Skipping ConfigMap save (insufficient RBAC?): {e}") + + log("Perf run complete") + + +if __name__ == "__main__": + main() + + From 6b5df83ade0928c0b8551d399360368b4a06606e Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Wed, 17 Sep 2025 22:46:22 -0700 Subject: [PATCH 2/3] fix: path typo --- Taskfile.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Taskfile.yaml b/Taskfile.yaml index 096173b8..6512c2af 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -565,7 +565,7 @@ tasks: -e "s/RUN_OBJECTS_PHASE_PLACEHOLDER/${RUN_OBJECTS_PHASE}/g" \ -e "s/MILO_KUBECONFIG_SECRET_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_NAME}/g" \ -e "s/MILO_KUBECONFIG_KEY_PLACEHOLDER/${MILO_KUBECONFIG_SECRET_KEY}/g" \ - test/performance/scripts/perf-runner-job.yaml | task test-infra:kubectl -- apply -f - + test/performance/config/perf-runner-job.yaml | task test-infra:kubectl -- apply -f - echo "⏳ Waiting for Job completion …" task test-infra:kubectl -- -n ${NS} wait --for=condition=Complete job/perf-runner --timeout=45m From 5c200d99d52319b9348eac548f9f5aff4fb8fe4a Mon Sep 17 00:00:00 2001 From: Zach Smith Date: Thu, 18 Sep 2025 17:18:29 -0700 Subject: [PATCH 3/3] fix: use helm monitor and fix auth token --- .../servicemonitor-etcd.yaml | 18 ------------------ config/dependencies/etcd/helmrelease.yaml | 2 +- .../components/auth/auth-tokens-secret.yaml | 2 +- 3 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 config/components/prometheus-monitoring/servicemonitor-etcd.yaml diff --git a/config/components/prometheus-monitoring/servicemonitor-etcd.yaml b/config/components/prometheus-monitoring/servicemonitor-etcd.yaml deleted file mode 100644 index f08607e0..00000000 --- a/config/components/prometheus-monitoring/servicemonitor-etcd.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: etcd-metrics - namespace: milo-system -spec: - namespaceSelector: - matchNames: ["milo-system"] - selector: - matchLabels: - app.kubernetes.io/component: etcd - app.kubernetes.io/name: etcd - endpoints: - - port: client - path: /metrics - scheme: http - interval: 15s - diff --git a/config/dependencies/etcd/helmrelease.yaml b/config/dependencies/etcd/helmrelease.yaml index b4a4e506..f52f798d 100644 --- a/config/dependencies/etcd/helmrelease.yaml +++ b/config/dependencies/etcd/helmrelease.yaml @@ -57,7 +57,7 @@ spec: metrics: enabled: true serviceMonitor: - enabled: false + enabled: true # Logging configuration extraEnvVars: diff --git a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml index d349d947..b76392b4 100644 --- a/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml 
+++ b/config/overlays/test-infra/components/auth/auth-tokens-secret.yaml @@ -6,4 +6,4 @@ type: Opaque stringData: tokens.csv: | test-admin-token,admin,admin,"system:masters" - test-user-token,test-user,1002,"system:authenticated" + test-user-token,test-user,test-user,"system:authenticated"
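
The VictoriaMetrics queries the runner builds can also be spot-checked by hand before or after a run. A minimal sketch, assuming the default `telemetry-system` VictoriaMetrics service and `milo-system` pod regexes from the Taskfile, plus a local `kubectl port-forward` (adjust the regex and window to match your overrides):

```bash
# Forward the VictoriaMetrics HTTP port locally (default service name from the Taskfile).
kubectl -n telemetry-system port-forward \
  svc/vmsingle-telemetry-system-vm-victoria-metrics-k8s-stack 8428:8428 &

# Apiserver CPU in cores over the default 2m window — same shape as the runner's rate query.
curl -sG 'http://127.0.0.1:8428/api/v1/query' \
  --data-urlencode 'query=sum(rate(container_cpu_usage_seconds_total{namespace="milo-system",pod=~"milo-apiserver.*",container!="",container!="POD"}[2m]))'

# Apiserver working-set memory in bytes, averaged over the same window.
curl -sG 'http://127.0.0.1:8428/api/v1/query' \
  --data-urlencode 'query=sum(avg_over_time(container_memory_working_set_bytes{namespace="milo-system",pod=~"milo-apiserver.*",container!="",container!="POD"}[2m]))'
```

Non-empty `result` arrays here indicate the ServiceMonitors are scraping, so the runner's snapshots should come back non-zero.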