From 7c4a15736f86f715d2e2bc1f3aea51043363f0ac Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Fri, 5 Dec 2025 11:19:42 +0000 Subject: [PATCH 01/12] feat: automatic Prometheus VM setup in test driver --- rs/tests/driver/src/driver/context.rs | 3 + rs/tests/driver/src/driver/group.rs | 80 +++++++++++++++++++-- rs/tests/driver/src/driver/prometheus_vm.rs | 23 +++--- 3 files changed, 91 insertions(+), 15 deletions(-) diff --git a/rs/tests/driver/src/driver/context.rs b/rs/tests/driver/src/driver/context.rs index b97c7b3c770d..a6806290112c 100644 --- a/rs/tests/driver/src/driver/context.rs +++ b/rs/tests/driver/src/driver/context.rs @@ -24,6 +24,7 @@ pub struct GroupContext { pub debug_keepalive: bool, pub no_farm_keepalive: bool, pub group_base_name: String, + pub metrics_enabled: bool, pub logs_enabled: bool, pub exclude_logs: Vec, pub quiet: bool, @@ -42,6 +43,7 @@ impl GroupContext { debug_keepalive: bool, no_farm_keepalive: bool, group_base_name: String, + metrics_enabled: bool, logs_enabled: bool, exclude_logs: Vec, quiet: bool, @@ -68,6 +70,7 @@ impl GroupContext { debug_keepalive, no_farm_keepalive, group_base_name, + metrics_enabled, logs_enabled, exclude_logs, quiet, diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index 635c0266da86..c7e0b10b6761 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -14,6 +14,7 @@ use crate::driver::{ dsl::{SubprocessFn, TestFunction}, event::TaskId, plan::{EvalOrder, Plan}, + prometheus_vm::{HasPrometheus, PrometheusVm}, report::Outcome, task::{DebugKeepaliveTask, EmptyTask}, task_scheduler::TaskTable, @@ -67,6 +68,7 @@ const DEBUG_KEEPALIVE_TASK_NAME: &str = "debug_keepalive"; const REPORT_TASK_NAME: &str = "report"; const KEEPALIVE_TASK_NAME: &str = "keepalive"; const UVMS_LOGS_STREAM_TASK_NAME: &str = "uvms_logs_stream"; +const METRICS_TASK_NAME: &str = "metrics"; const VECTOR_TASK_NAME: &str = "vector_logging"; const SETUP_TASK_NAME: &str = "setup"; const TEARDOWN_TASK_NAME: &str = "teardown"; @@ -133,6 +135,12 @@ pub struct CliArgs { )] pub required_host_features: Option>, + #[clap( + long = "enable-metrics", + help = "If set, a PrometheusVm will be spawned running both p8s configured to scrape the testnet & Grafana." + )] + pub enable_metrics: bool, + #[clap(long = "no-logs", help = "If set, the vector vm will not be spawned.")] pub no_logs: bool, @@ -206,7 +214,17 @@ impl TestEnvAttribute for SetupResult { } pub fn is_task_visible_to_user(task_id: &TaskId) -> bool { - matches!(task_id, TaskId::Test(task_name) if task_name.ne(REPORT_TASK_NAME) && task_name.ne(KEEPALIVE_TASK_NAME) && task_name.ne(UVMS_LOGS_STREAM_TASK_NAME) && task_name.ne(VECTOR_TASK_NAME) && !task_name.starts_with(LIFETIME_GUARD_TASK_PREFIX) && !task_name.starts_with("dummy(")) + matches!( + task_id, + TaskId::Test(task_name) + if task_name.ne(REPORT_TASK_NAME) && + task_name.ne(KEEPALIVE_TASK_NAME) && + task_name.ne(UVMS_LOGS_STREAM_TASK_NAME) && + task_name.ne(METRICS_TASK_NAME) && + task_name.ne(VECTOR_TASK_NAME) && + !task_name.starts_with(LIFETIME_GUARD_TASK_PREFIX) && + !task_name.starts_with("dummy(") + ) } pub struct ComposeContext<'a> { @@ -718,6 +736,45 @@ impl SystemTestGroup { Box::from(EmptyTask::new(keepalive_task_id)) as Box }; + let metrics_task_id = TaskId::Test(String::from(METRICS_TASK_NAME)); + let metrics_task = if group_ctx.metrics_enabled { + let logger = group_ctx.logger().clone(); + let group_ctx = group_ctx.clone(); + + let metrics_task = subproc( + metrics_task_id, + move || { + debug!(logger, ">>> metrics_fn"); + + let setup_dir = group_ctx.group_dir.join(constants::GROUP_SETUP_DIR); + let env = + TestEnv::new_without_duplicating_logger(setup_dir.clone(), logger.clone()); + while !setup_dir.exists() || env.prep_dir("").is_none() { + info!(logger, "Setup and/or prep directories not created yet."); + std::thread::sleep(KEEPALIVE_INTERVAL); + } + + PrometheusVm::default() + .start(&env) + .expect("failed to start prometheus VM"); + loop { + if let Err(e) = env.sync_with_prometheus_result() { + warn!(logger, "Failed to sync with PrometheusVm due to: {:?}", e); + } + + std::thread::sleep(KEEPALIVE_INTERVAL); + } + }, + &mut compose_ctx, + quiet, + ); + + Box::from(metrics_task) as Box + } else { + debug!(group_ctx.logger(), "Not spawning metrics task"); + Box::from(EmptyTask::new(metrics_task_id)) as Box + }; + let logging_task_id = TaskId::Test(String::from(VECTOR_TASK_NAME)); let log_task = if group_ctx.logs_enabled { let logger = group_ctx.logger().clone(); @@ -850,6 +907,13 @@ impl SystemTestGroup { &mut compose_ctx, ); + let metrics_plan = compose( + Some(metrics_task), + EvalOrder::Sequential, + vec![logs_plan], + &mut compose_ctx, + ); + let report_plan = Ok(compose( Some(Box::new(EmptyTask::new(TaskId::Test( REPORT_TASK_NAME.to_string(), @@ -857,13 +921,13 @@ impl SystemTestGroup { EvalOrder::Sequential, vec![if let Some(overall_timeout) = self.overall_timeout { timed( - logs_plan, + metrics_plan, overall_timeout, Some(String::from("::group")), &mut compose_ctx, ) } else { - logs_plan + metrics_plan }], &mut compose_ctx, )); @@ -893,6 +957,13 @@ impl SystemTestGroup { &mut compose_ctx, ); + let metrics_plan = compose( + Some(metrics_task), + EvalOrder::Sequential, + vec![logs_plan], + &mut compose_ctx, + ); + let report_plan = compose( Some(Box::new(EmptyTask::new(TaskId::Test( REPORT_TASK_NAME.to_string(), @@ -917,7 +988,7 @@ impl SystemTestGroup { Ok(compose( Some(keepalive_task), EvalOrder::Parallel, - vec![report_plan, logs_plan], + vec![report_plan, metrics_plan], &mut compose_ctx, )) } @@ -942,6 +1013,7 @@ impl SystemTestGroup { args.debug_keepalive, args.no_farm_keepalive || args.no_group_ttl, args.group_base_name, + args.enable_metrics, !args.no_logs, args.exclude_logs, args.quiet, diff --git a/rs/tests/driver/src/driver/prometheus_vm.rs b/rs/tests/driver/src/driver/prometheus_vm.rs index b8c12db85af7..7ea586c553b7 100644 --- a/rs/tests/driver/src/driver/prometheus_vm.rs +++ b/rs/tests/driver/src/driver/prometheus_vm.rs @@ -348,9 +348,11 @@ chown -R {SSH_USERNAME}:users {PROMETHEUS_SCRAPING_TARGETS_DIR} /// configuring its scraping targets based on the latest IC topology /// and finally downloading its data directory. pub trait HasPrometheus { + /// Same as `sync_with_prometheus_result` but panics in case it fails. + fn sync_with_prometheus(&self); /// Retrieves a topology snapshot, converts it into p8s scraping target /// JSON files and scps them to the prometheus VM. - fn sync_with_prometheus(&self); + fn sync_with_prometheus_result(&self) -> Result<()>; /// Downloads prometheus' data directory to the test artifacts /// such that we can run a local p8s on that later. @@ -363,6 +365,10 @@ pub trait HasPrometheus { impl HasPrometheus for TestEnv { fn sync_with_prometheus(&self) { + self.sync_with_prometheus_result().unwrap() + } + + fn sync_with_prometheus_result(&self) -> Result<()> { let vm_name = PROMETHEUS_VM_NAME.to_string(); // Write the scraping target JSON files to the local prometheus config directory. let prometheus_config_dir = self.get_universal_vm_config_dir(&vm_name); @@ -379,21 +385,15 @@ impl HasPrometheus for TestEnv { group_name.clone(), self.topology_snapshot(), &playnet_domain, - ) - .expect("Failed to synchronize prometheus config with the latest IC topology!"); + )?; sync_prometheus_config_dir_with_ic_gateways( self, prometheus_config_dir.clone(), group_name, - ) - .expect( - "Failed to synchronize prometheus config with the last deployments of the ic-gateways", - ); + )?; // Setup an SSH session to the prometheus VM which we'll use to scp the JSON files. - let deployed_prometheus_vm = self.get_deployed_universal_vm(&vm_name).unwrap(); - let session = deployed_prometheus_vm - .block_on_ssh_session() - .unwrap_or_else(|e| panic!("Failed to setup SSH session to {vm_name} because: {e:?}!")); + let deployed_prometheus_vm = self.get_deployed_universal_vm(&vm_name)?; + let session = deployed_prometheus_vm.block_on_ssh_session()?; // scp the scraping target JSON files to prometheus VM. let mut target_json_files = vec![ REPLICA_PROMETHEUS_TARGET, @@ -416,6 +416,7 @@ impl HasPrometheus for TestEnv { let to = Path::new(PROMETHEUS_SCRAPING_TARGETS_DIR).join(file); scp_send_to(self.logger(), &session, &from, &to, 0o644); } + Ok(()) } fn download_prometheus_data_dir_if_exists(&self) { From 54161608aa2cac79f5ccb9c0bd030cee197ad9fd Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Tue, 9 Dec 2025 11:27:13 +0000 Subject: [PATCH 02/12] wip --- rs/ethereum/cketh/mainnet/README.md | 4 -- rs/tests/boundary_nodes/BUILD.bazel | 3 ++ .../performance_test_common/src/lib.rs | 7 --- rs/tests/consensus/BUILD.bazel | 1 + .../catch_up_loop_prevention_test.rs | 9 ---- rs/tests/driver/src/driver/context.rs | 3 -- rs/tests/driver/src/driver/group.rs | 28 ++++++++---- rs/tests/driver/src/driver/prometheus_vm.rs | 43 ++++++++++++++++--- rs/tests/system_tests.bzl | 16 ++++++- rs/tests/testnets/BUILD.bazel | 1 + rs/tests/testnets/single_large_node.rs | 6 --- 11 files changed, 78 insertions(+), 43 deletions(-) diff --git a/rs/ethereum/cketh/mainnet/README.md b/rs/ethereum/cketh/mainnet/README.md index 7ef60105a297..f593e4b0ae35 100644 --- a/rs/ethereum/cketh/mainnet/README.md +++ b/rs/ethereum/cketh/mainnet/README.md @@ -186,9 +186,6 @@ To test the proposals with a testnet that uses the same canister IDs as in the p The simplest is to tweak the setup from [small](https://sourcegraph.com/github.com/dfinity/ic@7313a15e21d8fb06fa119ef3ab9371da47c2cddc/-/blob/rs/tests/idx/testnets/small.rs?L62) ```rust pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); let mut ic = InternetComputer::new().add_subnet(Subnet::new(SubnetType::System).add_nodes(1)); for _ in 0..36 { ic = ic.add_subnet(Subnet::new(SubnetType::Application).add_nodes(1)); @@ -200,7 +197,6 @@ pub fn setup(env: TestEnv) { env.topology_snapshot(), NnsCustomizations::default(), ); - env.sync_with_prometheus(); } ``` diff --git a/rs/tests/boundary_nodes/BUILD.bazel b/rs/tests/boundary_nodes/BUILD.bazel index 5375ebd31c1d..626791774c68 100644 --- a/rs/tests/boundary_nodes/BUILD.bazel +++ b/rs/tests/boundary_nodes/BUILD.bazel @@ -137,6 +137,7 @@ system_test_nns( "memory_kibibytes": 512142680, "boot_image_minimal_size_gibibytes": 500, }, + prometheus_vm_required_host_features = ["performance"], tags = ["manual"], runtime_deps = GRAFANA_RUNTIME_DEPS + COUNTER_CANISTER_RUNTIME_DEPS, deps = [ @@ -157,6 +158,7 @@ system_test( "boot_image_minimal_size_gibibytes": 500, }, env_inherit = ["BOUNDARY_NODE_IPV6"], + prometheus_vm_required_host_features = ["performance"], tags = [ "manual", ], @@ -178,6 +180,7 @@ system_test( "boot_image_minimal_size_gibibytes": 500, }, env_inherit = ["BOUNDARY_NODE_IPV6"], + prometheus_vm_required_host_features = ["performance"], tags = [ "manual", ], diff --git a/rs/tests/boundary_nodes/performance_test_common/src/lib.rs b/rs/tests/boundary_nodes/performance_test_common/src/lib.rs index 5e69049629a1..9a398d5c59cb 100644 --- a/rs/tests/boundary_nodes/performance_test_common/src/lib.rs +++ b/rs/tests/boundary_nodes/performance_test_common/src/lib.rs @@ -10,7 +10,6 @@ use ic_system_test_driver::{ driver::{ farm::HostFeature, ic::{AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, NnsInstallationBuilder, @@ -40,10 +39,6 @@ const MAX_RUNTIME_THREADS: usize = 64; const MAX_RUNTIME_BLOCKING_THREADS: usize = MAX_RUNTIME_THREADS; pub fn setup(env: TestEnv) { - PrometheusVm::default() - .with_required_host_features(vec![HostFeature::Performance]) - .start(&env) - .expect("failed to start prometheus VM"); InternetComputer::new() .with_required_host_features(vec![HostFeature::Performance]) .add_subnet(Subnet::new(SubnetType::System).add_nodes(1)) @@ -68,8 +63,6 @@ pub fn setup(env: TestEnv) { NnsInstallationBuilder::new() .install(&nns_node, &env) .expect("Could not install NNS canisters"); - - env.sync_with_prometheus(); } // Execute update calls (without polling) with an increasing req/s rate, against a counter canister via the boundary node agent. diff --git a/rs/tests/consensus/BUILD.bazel b/rs/tests/consensus/BUILD.bazel index c33c77081733..85652369a6f1 100644 --- a/rs/tests/consensus/BUILD.bazel +++ b/rs/tests/consensus/BUILD.bazel @@ -44,6 +44,7 @@ rust_library( system_test( name = "catch_up_loop_prevention_test", + prometheus_vm_scrape_interval_secs = 5, # TODO(NET-1683): Adjust test for faster p2p tags = [ "long_test", diff --git a/rs/tests/consensus/catch_up_loop_prevention_test.rs b/rs/tests/consensus/catch_up_loop_prevention_test.rs index 2c643abab6c0..c933d12d06da 100644 --- a/rs/tests/consensus/catch_up_loop_prevention_test.rs +++ b/rs/tests/consensus/catch_up_loop_prevention_test.rs @@ -25,7 +25,6 @@ use ic_consensus_system_test_catch_up_test_common::test_catch_up_possible; use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::systest; @@ -34,7 +33,6 @@ use ic_types::Height; use ic_types::malicious_behavior::MaliciousBehavior; use std::time::Duration; -const PROMETHEUS_SCRAPE_INTERVAL: Duration = Duration::from_secs(5); const TIMEOUT: Duration = Duration::from_secs(30 * 60); const EXECUTION_DELAY_FACTOR: f64 = 1.2; @@ -47,11 +45,6 @@ fn setup(env: TestEnv) { let execution_delay_ms = (EXECUTION_DELAY_FACTOR * TARGET_FR_MS as f64) as u64; let state_sync_delay_ms = (STATE_SYNC_DELAY_FACTOR * DKG_INTERVAL_TIME_MS as f64) as u64; - PrometheusVm::default() - .with_scrape_interval(PROMETHEUS_SCRAPE_INTERVAL) - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .add_subnet( Subnet::new(SubnetType::System) @@ -70,8 +63,6 @@ fn setup(env: TestEnv) { ) .setup_and_start(&env) .expect("failed to setup IC under test"); - - env.sync_with_prometheus(); } fn main() -> Result<()> { diff --git a/rs/tests/driver/src/driver/context.rs b/rs/tests/driver/src/driver/context.rs index a6806290112c..b97c7b3c770d 100644 --- a/rs/tests/driver/src/driver/context.rs +++ b/rs/tests/driver/src/driver/context.rs @@ -24,7 +24,6 @@ pub struct GroupContext { pub debug_keepalive: bool, pub no_farm_keepalive: bool, pub group_base_name: String, - pub metrics_enabled: bool, pub logs_enabled: bool, pub exclude_logs: Vec, pub quiet: bool, @@ -43,7 +42,6 @@ impl GroupContext { debug_keepalive: bool, no_farm_keepalive: bool, group_base_name: String, - metrics_enabled: bool, logs_enabled: bool, exclude_logs: Vec, quiet: bool, @@ -70,7 +68,6 @@ impl GroupContext { debug_keepalive, no_farm_keepalive, group_base_name, - metrics_enabled, logs_enabled, exclude_logs, quiet, diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index c7e0b10b6761..9b9ae3cb9069 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -135,12 +135,6 @@ pub struct CliArgs { )] pub required_host_features: Option>, - #[clap( - long = "enable-metrics", - help = "If set, a PrometheusVm will be spawned running both p8s configured to scrape the testnet & Grafana." - )] - pub enable_metrics: bool, - #[clap(long = "no-logs", help = "If set, the vector vm will not be spawned.")] pub no_logs: bool, @@ -737,8 +731,12 @@ impl SystemTestGroup { }; let metrics_task_id = TaskId::Test(String::from(METRICS_TASK_NAME)); - let metrics_task = if group_ctx.metrics_enabled { + let metrics_enabled: bool = std::env::var("ENABLE_METRICS") + .map(|v| v == "1" || v.to_lowercase() == "true") + .unwrap_or(false); + let metrics_task = if metrics_enabled { let logger = group_ctx.logger().clone(); + info!(logger, "Setting up PrometheusVm ..."); let group_ctx = group_ctx.clone(); let metrics_task = subproc( @@ -754,7 +752,22 @@ impl SystemTestGroup { std::thread::sleep(KEEPALIVE_INTERVAL); } + let host_features: Vec = + std::env::var("PROMETHEUS_VM_REQUIRED_HOST_FEATURES") + .map_err(|e| e.to_string()) + .and_then(|s| serde_json::from_str(&s).map_err(|e| e.to_string())) + .unwrap_or_default(); + + let prometheus_scrape_interval = + std::env::var("PROMETHEUS_SCRAPE_INTERVAL_SECS") + .ok() + .and_then(|s| s.parse::().ok()) + .map(Duration::from_secs) + .unwrap_or(Duration::from_secs(10)); + PrometheusVm::default() + .with_required_host_features(host_features) + .with_scrape_interval(prometheus_scrape_interval) .start(&env) .expect("failed to start prometheus VM"); loop { @@ -1013,7 +1026,6 @@ impl SystemTestGroup { args.debug_keepalive, args.no_farm_keepalive || args.no_group_ttl, args.group_base_name, - args.enable_metrics, !args.no_logs, args.exclude_logs, args.quiet, diff --git a/rs/tests/driver/src/driver/prometheus_vm.rs b/rs/tests/driver/src/driver/prometheus_vm.rs index 7ea586c553b7..ad9f03c409b8 100644 --- a/rs/tests/driver/src/driver/prometheus_vm.rs +++ b/rs/tests/driver/src/driver/prometheus_vm.rs @@ -8,9 +8,10 @@ use std::{ }; use anyhow::{Context, Result}; +use ic_crypto_sha2::Sha256; use maplit::hashmap; use reqwest::Url; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use serde_json::json; use slog::{Logger, debug, info, warn}; @@ -91,6 +92,17 @@ pub struct PrometheusVm { scrape_interval: Duration, } +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct PrometheusConfigHash { + hash: String, +} + +impl TestEnvAttribute for PrometheusConfigHash { + fn attribute_name() -> String { + "prometheus_config_hash".to_string() + } +} + impl Default for PrometheusVm { fn default() -> Self { PrometheusVm::new(PROMETHEUS_VM_NAME.to_string()) @@ -391,10 +403,7 @@ impl HasPrometheus for TestEnv { prometheus_config_dir.clone(), group_name, )?; - // Setup an SSH session to the prometheus VM which we'll use to scp the JSON files. - let deployed_prometheus_vm = self.get_deployed_universal_vm(&vm_name)?; - let session = deployed_prometheus_vm.block_on_ssh_session()?; - // scp the scraping target JSON files to prometheus VM. + let mut target_json_files = vec![ REPLICA_PROMETHEUS_TARGET, ORCHESTRATOR_PROMETHEUS_TARGET, @@ -411,11 +420,35 @@ impl HasPrometheus for TestEnv { target_json_files.push(DOGECOIN_MAINNET_CANISTER_PROMETHEUS_TARGET); target_json_files.push(DOGECOIN_TESTNET_CANISTER_PROMETHEUS_TARGET); } + + // Hash the contents of the scraping target JSON files and exit early if nothing changed compared to the last time we synced. + let mut hasher = Sha256::new(); + for file_name in &target_json_files { + let file_path = prometheus_config_dir.join(file_name); + let mut file = File::open(file_path)?; + std::io::copy(&mut file, &mut hasher)?; + } + let new_hash = hex::encode(hasher.finish()); + + if let Ok(stored_hash) = PrometheusConfigHash::try_read_attribute(self) + && stored_hash.hash == new_hash + { + info!( + self.logger(), + "No changes in Prometheus scraping targets detected, skipping sync." + ); + return Ok(()); + } + + // scp the scraping target JSON files to prometheus VM. + let deployed_prometheus_vm = self.get_deployed_universal_vm(&vm_name)?; + let session = deployed_prometheus_vm.block_on_ssh_session()?; for file in &target_json_files { let from = prometheus_config_dir.join(file); let to = Path::new(PROMETHEUS_SCRAPING_TARGETS_DIR).join(file); scp_send_to(self.logger(), &session, &from, &to, 0o644); } + PrometheusConfigHash { hash: new_hash }.write_attribute(self); Ok(()) } diff --git a/rs/tests/system_tests.bzl b/rs/tests/system_tests.bzl index e7d3e65cbab5..409c716cefb2 100644 --- a/rs/tests/system_tests.bzl +++ b/rs/tests/system_tests.bzl @@ -195,6 +195,9 @@ def system_test( tags = [], test_timeout = "long", flaky = False, + enable_metrics = False, + prometheus_vm_required_host_features = [], + prometheus_vm_scrape_interval_secs = 10, colocated_test_driver_vm_resources = default_vm_resources, colocated_test_driver_vm_required_host_features = [], colocated_test_driver_vm_enable_ipv4 = False, @@ -233,6 +236,9 @@ def system_test( tags: additional tags for the system_test. test_timeout: bazel test timeout (short, moderate, long or eternal). flaky: rerun in case of failure (up to 3 times). + enable_metrics: if True, a PrometheusVm will be spawned running both p8s (configured to scrape the testnet) & Grafana. + prometheus_vm_required_host_features: a list of strings specifying the required host features of the PrometheusVm. + prometheus_vm_scrape_interval_secs: the scrape interval in seconds for the PrometheusVm. Defaults to 10 seconds. colocated_test_driver_vm_resources: a structure describing the required resources of the colocated test-driver VM. For example: { @@ -435,6 +441,14 @@ def system_test( name: "$(rootpath {})".format(dep) for name, dep in _env_deps.items() } + + if enable_metrics: + env |= {"ENABLE_METRICS": "1"} + + env |= { + "PROMETHEUS_VM_REQUIRED_HOST_FEATURES": json.encode(prometheus_vm_required_host_features), + "PROMETHEUS_VM_SCRAPE_INTERVAL_SECS": json.encode(prometheus_vm_scrape_interval_secs), + } for dep in _env_deps.values(): if dep not in deps: deps.append(dep) @@ -455,7 +469,7 @@ def system_test( exclude_logs = exclude_logs, ) - env = env | { + env |= { "COLOCATED_TEST": test_name, "COLOCATED_TEST_DRIVER_VM_REQUIRED_HOST_FEATURES": json.encode(colocated_test_driver_vm_required_host_features), "COLOCATED_TEST_DRIVER_VM_RESOURCES": json.encode(colocated_test_driver_vm_resources), diff --git a/rs/tests/testnets/BUILD.bazel b/rs/tests/testnets/BUILD.bazel index 97a88d0dc05a..ca14d919136d 100644 --- a/rs/tests/testnets/BUILD.bazel +++ b/rs/tests/testnets/BUILD.bazel @@ -11,6 +11,7 @@ IC_GATEWAY_UVM_IMAGE = [ system_test( name = "single_large_node", + enable_metrics = True, tags = [ "dynamic_testnet", "manual", diff --git a/rs/tests/testnets/single_large_node.rs b/rs/tests/testnets/single_large_node.rs index 85b15177480f..b657d3c5e08c 100644 --- a/rs/tests/testnets/single_large_node.rs +++ b/rs/tests/testnets/single_large_node.rs @@ -41,7 +41,6 @@ use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{ AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources, }; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; fn main() -> Result<()> { @@ -52,10 +51,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .add_subnet( Subnet::new(SubnetType::System) @@ -68,5 +63,4 @@ pub fn setup(env: TestEnv) { ) .setup_and_start(&env) .expect("failed to setup IC under test"); - env.sync_with_prometheus(); } From f118d9ac79d682653ca4f02fcd27471968b060b8 Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Tue, 30 Dec 2025 12:29:41 +0000 Subject: [PATCH 03/12] wip --- rs/tests/driver/BUILD.bazel | 1 - rs/tests/driver/Cargo.toml | 1 - rs/tests/driver/src/driver/prometheus_vm.rs | 37 ++++++++++----------- rs/tests/testnets/BUILD.bazel | 1 + rs/tests/testnets/single_app_small_node.rs | 6 ---- 5 files changed, 19 insertions(+), 27 deletions(-) diff --git a/rs/tests/driver/BUILD.bazel b/rs/tests/driver/BUILD.bazel index bbde67ed392e..4518b68598c9 100644 --- a/rs/tests/driver/BUILD.bazel +++ b/rs/tests/driver/BUILD.bazel @@ -109,7 +109,6 @@ rust_library( "@crate_index//:json5", "@crate_index//:lazy_static", "@crate_index//:macaddr", - "@crate_index//:maplit", "@crate_index//:nix", "@crate_index//:num_cpus", "@crate_index//:once_cell", diff --git a/rs/tests/driver/Cargo.toml b/rs/tests/driver/Cargo.toml index 8541dc2c26fb..95d392c31025 100644 --- a/rs/tests/driver/Cargo.toml +++ b/rs/tests/driver/Cargo.toml @@ -84,7 +84,6 @@ itertools = { workspace = true } json5 = "0.4.1" lazy_static = { workspace = true } macaddr = { workspace = true } -maplit = "1.0.2" nix = { workspace = true } num_cpus = "1.13.1" on_wire = { path = "../../rust_canisters/on_wire" } diff --git a/rs/tests/driver/src/driver/prometheus_vm.rs b/rs/tests/driver/src/driver/prometheus_vm.rs index ad9f03c409b8..71d389cfbac5 100644 --- a/rs/tests/driver/src/driver/prometheus_vm.rs +++ b/rs/tests/driver/src/driver/prometheus_vm.rs @@ -1,5 +1,5 @@ use std::{ - collections::HashMap, + collections::BTreeMap, fs::{self, File}, net::Ipv6Addr, path::{Path, PathBuf}, @@ -9,7 +9,6 @@ use std::{ use anyhow::{Context, Result}; use ic_crypto_sha2::Sha256; -use maplit::hashmap; use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_json::json; @@ -429,8 +428,8 @@ impl HasPrometheus for TestEnv { std::io::copy(&mut file, &mut hasher)?; } let new_hash = hex::encode(hasher.finish()); - - if let Ok(stored_hash) = PrometheusConfigHash::try_read_attribute(self) + let opt_stored_hash = PrometheusConfigHash::try_read_attribute(self); + if let Ok(stored_hash) = opt_stored_hash && stored_hash.hash == new_hash { info!( @@ -505,7 +504,8 @@ sudo systemctl start prometheus.service #[derive(Serialize)] struct PrometheusStaticConfig { targets: Vec, - labels: HashMap, + // A BTreeMap is used to ensure a deterministic key ordering in JSON output. + labels: BTreeMap, } fn write_prometheus_config_dir(config_dir: PathBuf, scrape_interval: Duration) -> Result<()> { @@ -664,12 +664,11 @@ fn sync_prometheus_config_dir_with_ic_gateways( .collect::>()?; for (name, ipv6) in ic_gateways.iter() { - let labels: HashMap = [ + let labels: BTreeMap = [ ("ic".to_string(), group_name.clone()), ("gateways".to_string(), name.to_string()), ] - .iter() - .cloned() + .into_iter() .collect(); ic_gateways_p8s_static_configs.push(PrometheusStaticConfig { targets: vec![format!("[{:?}]:{:?}", ipv6, IC_GATEWAY_METRICS_PORT)], @@ -697,13 +696,12 @@ fn sync_prometheus_config_dir( let mut node_exporter_p8s_static_configs: Vec = Vec::new(); for subnet in topology_snapshot.subnets() { for node in subnet.nodes() { - let labels: HashMap = [ + let labels: BTreeMap = [ ("ic".to_string(), group_name.clone()), ("ic_node".to_string(), node.node_id.to_string()), ("ic_subnet".to_string(), subnet.subnet_id.to_string()), ] - .iter() - .cloned() + .into_iter() .collect(); replica_p8s_static_configs.push(PrometheusStaticConfig { targets: vec![scraping_target_url(&node, REPLICA_METRICS_PORT)], @@ -720,12 +718,11 @@ fn sync_prometheus_config_dir( } } for node in topology_snapshot.unassigned_nodes() { - let labels: HashMap = [ + let labels: BTreeMap = [ ("ic".to_string(), group_name.clone()), ("ic_node".to_string(), node.node_id.to_string()), ] - .iter() - .cloned() + .into_iter() .collect(); orchestrator_p8s_static_configs.push(PrometheusStaticConfig { targets: vec![scraping_target_url(&node, ORCHESTRATOR_METRICS_PORT)], @@ -738,13 +735,12 @@ fn sync_prometheus_config_dir( } for node in topology_snapshot.api_boundary_nodes() { - let labels: HashMap = [ + let labels: BTreeMap = [ ("ic".to_string(), group_name.clone()), ("ic_node".to_string(), node.node_id.to_string()), ("ic_api_bn".to_string(), "1".to_string()), ] - .iter() - .cloned() + .into_iter() .collect(); orchestrator_p8s_static_configs.push(PrometheusStaticConfig { targets: vec![scraping_target_url(&node, ORCHESTRATOR_METRICS_PORT)], @@ -766,7 +762,10 @@ fn sync_prometheus_config_dir( &File::create(prometheus_config_dir.join(LEDGER_CANISTER_PROMETHEUS_TARGET))?, &vec![PrometheusStaticConfig { targets: vec![format!("ryjl3-tyaaa-aaaaa-aaaba-cai.raw.{}", domain)], - labels: hashmap! {"ic".to_string() => group_name.clone(), "token".to_string() => "icp".to_string()}, + labels: BTreeMap::from([ + ("ic".to_string(), group_name.clone()), + ("token".to_string(), "icp".to_string()), + ]), }], )?; // Bitcoin and Dogecoin canisters @@ -800,7 +799,7 @@ fn sync_prometheus_config_dir( &File::create(prometheus_config_dir.join(prometheus_target))?, &vec![PrometheusStaticConfig { targets: vec![format!("{canister_id}.raw.{domain}")], - labels: hashmap! {"ic".to_string() => group_name.clone()}, + labels: BTreeMap::from([("ic".to_string(), group_name.clone())]), }], )?; } diff --git a/rs/tests/testnets/BUILD.bazel b/rs/tests/testnets/BUILD.bazel index b10176c0166e..2ee5093d0102 100644 --- a/rs/tests/testnets/BUILD.bazel +++ b/rs/tests/testnets/BUILD.bazel @@ -56,6 +56,7 @@ system_test_nns( system_test( name = "single_app_small_node", + enable_metrics = True, tags = [ "dynamic_testnet", "manual", diff --git a/rs/tests/testnets/single_app_small_node.rs b/rs/tests/testnets/single_app_small_node.rs index ad7fd5ad45cc..67f3390af185 100644 --- a/rs/tests/testnets/single_app_small_node.rs +++ b/rs/tests/testnets/single_app_small_node.rs @@ -41,7 +41,6 @@ use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{ AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources, }; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; fn main() -> Result<()> { @@ -52,10 +51,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .add_subnet( Subnet::new(SubnetType::Application) @@ -68,5 +63,4 @@ pub fn setup(env: TestEnv) { ) .setup_and_start(&env) .expect("failed to setup IC under test"); - env.sync_with_prometheus(); } From 1370bfa004cdb6b23e43cf10c2f5a852fa88c6b5 Mon Sep 17 00:00:00 2001 From: IDX GitHub Automation Date: Tue, 30 Dec 2025 12:31:58 +0000 Subject: [PATCH 04/12] Automatically updated Cargo*.lock --- Cargo.lock | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 9427306c0e09..ddd078e37308 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14869,7 +14869,6 @@ dependencies = [ "json5", "lazy_static", "macaddr", - "maplit", "nix 0.24.3", "num_cpus", "on_wire", From a31994d4b69aa680180802378371b7ae58dd8bf8 Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Fri, 2 Jan 2026 12:29:17 +0000 Subject: [PATCH 05/12] wip --- rs/tests/driver/src/driver/group.rs | 139 +++++++++++------- ...{metrics_task.rs => metrics_setup_task.rs} | 24 +-- .../driver/src/driver/metrics_sync_task.rs | 30 ++++ rs/tests/driver/src/driver/mod.rs | 3 +- rs/tests/idx/BUILD.bazel | 1 + rs/tests/idx/basic_health_test.rs | 5 - 6 files changed, 128 insertions(+), 74 deletions(-) rename rs/tests/driver/src/driver/{metrics_task.rs => metrics_setup_task.rs} (65%) create mode 100644 rs/tests/driver/src/driver/metrics_sync_task.rs diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index 8b7e280e7974..a2e8811e47a1 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -16,7 +16,8 @@ use crate::driver::{ }; use crate::driver::{ keepalive_task::{KEEPALIVE_TASK_NAME, keepalive_task}, - metrics_task::{METRICS_TASK_NAME, metrics_task}, + metrics_setup_task::{METRICS_SETUP_TASK_NAME, metrics_setup_task}, + metrics_sync_task::{METRICS_SYNC_TASK_NAME, metrics_sync_task}, report::SystemTestGroupError, subprocess_task::SubprocessTask, task::{SkipTestTask, Task}, @@ -33,6 +34,7 @@ use crate::driver::{ use anyhow::{Result, bail}; use chrono::Utc; use clap::Parser; +use itertools::Itertools; use regex::Regex; use serde::{Deserialize, Serialize}; use slog::{Logger, debug, info, trace}; @@ -189,7 +191,8 @@ pub fn is_task_visible_to_user(task_id: &TaskId) -> bool { if task_name.ne(REPORT_TASK_NAME) && task_name.ne(KEEPALIVE_TASK_NAME) && task_name.ne(UVMS_LOGS_STREAM_TASK_NAME) - && task_name.ne(METRICS_TASK_NAME) + && task_name.ne(METRICS_SETUP_TASK_NAME) + && task_name.ne(METRICS_SYNC_TASK_NAME) && task_name.ne(VECTOR_LOGGING_TASK_NAME) && !task_name.starts_with(LIFETIME_GUARD_TASK_PREFIX) && !task_name.starts_with("dummy(") @@ -221,24 +224,30 @@ fn subproc( } fn timed( - plan: Plan>, + children: Vec>>, + ordering: EvalOrder, timeout: Duration, descriptor: Option, ctx: &mut ComposeContext, ) -> Plan> { trace!( ctx.logger, - "timed(plan={:?}, timeout={:?})", &plan, &timeout + "timed(children={:?}, timeout={:?})", &children, &timeout ); let timeout_task = TimeoutTask::new( ctx.rh.clone(), timeout, - TaskId::Timeout(descriptor.unwrap_or_else(|| plan.root_task_id().name())), + TaskId::Timeout(descriptor.unwrap_or_else(|| { + children + .iter() + .map(|child| child.root_task_id().name()) + .join(", ") + })), ); Plan::Supervised { supervisor: Box::from(timeout_task) as Box, - ordering: EvalOrder::Sequential, // the order is irrelevant since there is only one child - children: vec![plan], + ordering, + children, } } @@ -389,9 +398,10 @@ impl SystemTestSubGroup { } }; timed( - Plan::Leaf { + vec![Plan::Leaf { task: Box::from(subproc(task_id, closure, ctx, false)), - }, + }], + EvalOrder::Sequential, ctx.timeout_per_test, None, ctx, @@ -561,7 +571,8 @@ impl SystemTestGroup { } fn make_plan(self, rh: &Handle, group_ctx: GroupContext) -> Result>> { - debug!(group_ctx.log(), "SystemTestGroup.make_plan"); + let logger = group_ctx.logger(); + debug!(logger, "SystemTestGroup.make_plan"); let start_time = Utc::now(); let quiet = group_ctx.quiet; @@ -570,7 +581,7 @@ impl SystemTestGroup { rh, group_ctx: group_ctx.clone(), empty_task_counter: 0, - logger: group_ctx.logger().clone(), + logger: logger.clone(), timeout_per_test: self.effective_timeout_per_test(), }; @@ -600,25 +611,40 @@ impl SystemTestGroup { Box::from(EmptyTask::new(keepalive_task_id)) as Box }; - let metrics_task_id = TaskId::Test(String::from(METRICS_TASK_NAME)); + let metrics_setup_task_id = TaskId::Test(String::from(METRICS_SETUP_TASK_NAME)); + let metrics_sync_task_id = TaskId::Test(String::from(METRICS_SYNC_TASK_NAME)); let metrics_enabled: bool = std::env::var("ENABLE_METRICS") .map(|v| v == "1" || v.to_lowercase() == "true") .unwrap_or(false); - let metrics_task = if metrics_enabled { - let metrics_task = subproc( - metrics_task_id, + let (metrics_setup_task, metrics_sync_task) = if metrics_enabled { + let metrics_setup_task = subproc( + metrics_setup_task_id, { let group_ctx = group_ctx.clone(); - move || metrics_task(group_ctx) + move || metrics_setup_task(group_ctx) }, &mut compose_ctx, quiet, ); - - Box::from(metrics_task) as Box + let metrics_sync_task = subproc( + metrics_sync_task_id, + { + let group_ctx = group_ctx.clone(); + move || metrics_sync_task(group_ctx) + }, + &mut compose_ctx, + quiet, + ); + ( + Box::from(metrics_setup_task) as Box, + Box::from(metrics_sync_task) as Box, + ) } else { - debug!(group_ctx.logger(), "Not spawning metrics task"); - Box::from(EmptyTask::new(metrics_task_id)) as Box + debug!(logger, "Not spawning metrics tasks"); + ( + Box::from(EmptyTask::new(metrics_setup_task_id)) as Box, + Box::from(EmptyTask::new(metrics_sync_task_id)) as Box, + ) }; let vector_logging_task_id = TaskId::Test(String::from(VECTOR_LOGGING_TASK_NAME)); @@ -637,18 +663,18 @@ impl SystemTestGroup { Box::from(vector_logging_task) as Box } else { - debug!(group_ctx.logger(), "Not spawning vector logging task"); + debug!(logger, "Not spawning vector logging task"); Box::from(EmptyTask::new(vector_logging_task_id)) as Box }; - let setup_plan = { - let logger = group_ctx.logger().clone(); - let group_ctx = group_ctx.clone(); - let setup_fn = self - .setup - .unwrap_or_else(|| panic!("setup function not specified for SystemTestGroup.")); - let setup_task = subproc( - TaskId::Test(String::from(SETUP_TASK_NAME)), + let setup_fn = self + .setup + .unwrap_or_else(|| panic!("setup function not specified for SystemTestGroup.")); + let setup_task = subproc( + TaskId::Test(String::from(SETUP_TASK_NAME)), + { + let group_ctx = group_ctx.clone(); + let logger = logger.clone(); move || { debug!(logger, ">>> setup_fn"); let cli_arguments = CliArguments { @@ -662,22 +688,14 @@ impl SystemTestGroup { setup_fn(env.clone()); SetupResult {}.write_attribute(&env); - }, - &mut compose_ctx, - false, - ); - timed( - Plan::Leaf { - task: Box::from(setup_task), - }, - compose_ctx.timeout_per_test, - None, - &mut compose_ctx, - ) - }; + } + }, + &mut compose_ctx, + false, + ); let teardown_plan = self.teardown.map(|teardown_fn| { - let logger = group_ctx.logger().clone(); + let logger = logger.clone(); let group_ctx = group_ctx.clone(); let teardown_task = subproc( TaskId::Test(String::from(TEARDOWN_TASK_NAME)), @@ -690,15 +708,31 @@ impl SystemTestGroup { false, ); timed( - Plan::Leaf { + vec![Plan::Leaf { task: Box::from(teardown_task), - }, + }], + EvalOrder::Sequential, compose_ctx.timeout_per_test, None, &mut compose_ctx, ) }); + let setup_plan = timed( + vec![ + Plan::Leaf { + task: Box::from(setup_task), + }, + Plan::Leaf { + task: metrics_setup_task, + }, + ], + EvalOrder::Parallel, + compose_ctx.timeout_per_test, + None, + &mut compose_ctx, + ); + // normal case: no keepalive, overall timeout is active if !group_ctx.keepalive { let keepalive_plan = compose( @@ -734,8 +768,8 @@ impl SystemTestGroup { &mut compose_ctx, ); - let metrics_plan = compose( - Some(metrics_task), + let metrics_sync_plan = compose( + Some(metrics_sync_task), EvalOrder::Sequential, vec![logs_plan], &mut compose_ctx, @@ -748,13 +782,14 @@ impl SystemTestGroup { EvalOrder::Sequential, vec![if let Some(overall_timeout) = self.overall_timeout { timed( - metrics_plan, + vec![metrics_sync_plan], + EvalOrder::Sequential, overall_timeout, Some(String::from("::group")), &mut compose_ctx, ) } else { - metrics_plan + metrics_sync_plan }], &mut compose_ctx, )); @@ -784,8 +819,8 @@ impl SystemTestGroup { &mut compose_ctx, ); - let metrics_plan = compose( - Some(metrics_task), + let metrics_sync_plan = compose( + Some(metrics_sync_task), EvalOrder::Sequential, vec![logs_plan], &mut compose_ctx, @@ -815,7 +850,7 @@ impl SystemTestGroup { Ok(compose( Some(keepalive_task), EvalOrder::Parallel, - vec![report_plan, metrics_plan], + vec![report_plan, metrics_sync_plan], &mut compose_ctx, )) } diff --git a/rs/tests/driver/src/driver/metrics_task.rs b/rs/tests/driver/src/driver/metrics_setup_task.rs similarity index 65% rename from rs/tests/driver/src/driver/metrics_task.rs rename to rs/tests/driver/src/driver/metrics_setup_task.rs index 3f637e792763..b79f6939be74 100644 --- a/rs/tests/driver/src/driver/metrics_task.rs +++ b/rs/tests/driver/src/driver/metrics_setup_task.rs @@ -1,24 +1,21 @@ use crate::driver::test_env::HasIcPrepDir; use crate::driver::{ - constants::{GROUP_SETUP_DIR, KEEPALIVE_INTERVAL}, - context::GroupContext, - farm::HostFeature, - prometheus_vm::{HasPrometheus, PrometheusVm}, - test_env::TestEnv, + constants::GROUP_SETUP_DIR, context::GroupContext, farm::HostFeature, + prometheus_vm::PrometheusVm, test_env::TestEnv, }; -use slog::{debug, info, warn}; +use slog::{debug, info}; use std::time::Duration; -pub(crate) const METRICS_TASK_NAME: &str = "metrics"; +pub(crate) const METRICS_SETUP_TASK_NAME: &str = "metrics_setup"; -pub(crate) fn metrics_task(group_ctx: GroupContext) -> () { +pub(crate) fn metrics_setup_task(group_ctx: GroupContext) { let logger = group_ctx.logger().clone(); - debug!(logger, ">>> metrics_fn"); + debug!(logger, ">>> metrics_setup_fn"); let setup_dir = group_ctx.group_dir.join(GROUP_SETUP_DIR); let env = TestEnv::new_without_duplicating_logger(setup_dir.clone(), logger.clone()); while !setup_dir.exists() || env.prep_dir("").is_none() { info!(logger, "Setup and/or prep directories not created yet."); - std::thread::sleep(KEEPALIVE_INTERVAL); + std::thread::sleep(Duration::from_secs(2)); } let host_features: Vec = std::env::var("PROMETHEUS_VM_REQUIRED_HOST_FEATURES") @@ -39,11 +36,6 @@ pub(crate) fn metrics_task(group_ctx: GroupContext) -> () { .with_scrape_interval(prometheus_scrape_interval) .start(&env) .expect("failed to start prometheus VM"); - loop { - if let Err(e) = env.sync_with_prometheus_result() { - warn!(logger, "Failed to sync with PrometheusVm due to: {:?}", e); - } - std::thread::sleep(KEEPALIVE_INTERVAL); - } + info!(logger, "PrometheusVm setup complete."); } diff --git a/rs/tests/driver/src/driver/metrics_sync_task.rs b/rs/tests/driver/src/driver/metrics_sync_task.rs new file mode 100644 index 000000000000..9b9d82302aed --- /dev/null +++ b/rs/tests/driver/src/driver/metrics_sync_task.rs @@ -0,0 +1,30 @@ +use crate::driver::test_env::HasIcPrepDir; +use crate::driver::{ + constants::GROUP_SETUP_DIR, context::GroupContext, prometheus_vm::HasPrometheus, + test_env::TestEnv, +}; +use slog::{debug, info, warn}; +use std::time::Duration; + +pub(crate) const METRICS_SYNC_TASK_NAME: &str = "metrics_sync"; + +pub(crate) fn metrics_sync_task(group_ctx: GroupContext) { + let logger = group_ctx.logger().clone(); + debug!(logger, ">>> metrics_sync_fn"); + let setup_dir = group_ctx.group_dir.join(GROUP_SETUP_DIR); + let env = TestEnv::new_without_duplicating_logger(setup_dir.clone(), logger.clone()); + while !setup_dir.exists() || env.prep_dir("").is_none() { + info!(logger, "Setup and/or prep directories not created yet."); + std::thread::sleep(Duration::from_secs(2)); + } + loop { + if let Err(e) = env.sync_with_prometheus_result() { + warn!( + logger, + "Failed to sync with PrometheusVm due to: {}", + e.to_string() + ); + } + std::thread::sleep(Duration::from_secs(10)); + } +} diff --git a/rs/tests/driver/src/driver/mod.rs b/rs/tests/driver/src/driver/mod.rs index 47f6c97dbc47..c2e08e692c07 100644 --- a/rs/tests/driver/src/driver/mod.rs +++ b/rs/tests/driver/src/driver/mod.rs @@ -17,7 +17,8 @@ pub mod uvms_logs_stream_task; mod ic_images; pub mod log_events; pub mod logger; -pub mod metrics_task; +pub mod metrics_setup_task; +pub mod metrics_sync_task; pub mod nested; pub mod node_software_version; pub mod plan; diff --git a/rs/tests/idx/BUILD.bazel b/rs/tests/idx/BUILD.bazel index 39273dc9761d..d52af9d45b29 100644 --- a/rs/tests/idx/BUILD.bazel +++ b/rs/tests/idx/BUILD.bazel @@ -27,6 +27,7 @@ rust_binary( system_test( name = "basic_health_test", + enable_metrics = True, env = UNIVERSAL_CANISTER_ENV, flaky = True, # flakiness rate of 1.22% over the month from 2025-02-11 till 2025-03-11 tags = [ diff --git a/rs/tests/idx/basic_health_test.rs b/rs/tests/idx/basic_health_test.rs index 8ce6335b4d8c..6af953d3371e 100644 --- a/rs/tests/idx/basic_health_test.rs +++ b/rs/tests/idx/basic_health_test.rs @@ -35,7 +35,6 @@ use anyhow::bail; use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::*; use ic_system_test_driver::systest; @@ -53,9 +52,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); InternetComputer::new() .add_subnet( Subnet::new(SubnetType::System) @@ -69,7 +65,6 @@ pub fn setup(env: TestEnv) { ) .setup_and_start(&env) .expect("failed to setup IC under test"); - env.sync_with_prometheus(); } const MSG: &[u8] = b"this beautiful prose should be persisted for future generations"; From 140e15de9e39e5f7aac732f4997e3d361a2b1c24 Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Fri, 2 Jan 2026 18:26:03 +0000 Subject: [PATCH 06/12] wip --- rs/tests/boundary_nodes/BUILD.bazel | 2 + rs/tests/consensus/BUILD.bazel | 3 ++ rs/tests/consensus/catch_up_possible_test.rs | 9 ----- rs/tests/consensus/consensus_performance.rs | 8 +--- rs/tests/consensus/tecdsa/BUILD.bazel | 25 ++++++++++++ .../tecdsa_performance_test_template.rs | 8 +--- rs/tests/dre/utils/lib.rs | 7 ---- .../driver/src/driver/metrics_setup_task.rs | 8 +++- rs/tests/execution/BUILD.bazel | 2 + .../fill_execution_rounds_workload.rs | 8 +--- .../rejoin_test_large_state.rs | 7 ---- .../state_sync_malicious_chunk_test.rs | 5 --- .../xnet/slo_test_lib/xnet_slo_test_lib.rs | 17 -------- .../xnet/xnet_compatibility.rs | 39 +----------------- .../xnet/xnet_malicious_slices.rs | 40 ++++++------------- rs/tests/networking/BUILD.bazel | 12 ++++++ .../networking/canister_http/canister_http.rs | 8 ---- .../networking/cloner_canister_workload.rs | 7 +--- rs/tests/networking/network_large_test.rs | 5 --- rs/tests/networking/p2p_performance_test.rs | 7 +--- rs/tests/networking/state_sync_performance.rs | 7 ---- .../subnet_update_workload/src/lib.rs | 7 ---- rs/tests/nns/sns/lib/src/sns_deployment.rs | 14 +------ rs/tests/system_tests.bzl | 8 ++++ rs/tests/testnets/BUILD.bazel | 21 ++++++++++ rs/tests/testnets/io_perf_benchmark.rs | 8 ---- rs/tests/testnets/large.rs | 6 --- rs/tests/testnets/mainnet_nns.rs | 25 +----------- rs/tests/testnets/medium.rs | 5 --- rs/tests/testnets/nns_recovery.rs | 7 ---- rs/tests/testnets/single_app_large_node.rs | 6 --- .../single_app_large_node_with_nns.rs | 6 --- rs/tests/testnets/small.rs | 5 --- rs/tests/testnets/small_bitcoin.rs | 6 --- rs/tests/testnets/small_high_perf.rs | 6 --- rs/tests/testnets/small_nns.rs | 7 ---- rs/tests/testnets/small_with_query_stats.rs | 6 --- rs/tests/testnets/sns_testing.rs | 6 --- rs/tests/testnets/src_testing.rs | 7 ---- 39 files changed, 103 insertions(+), 287 deletions(-) diff --git a/rs/tests/boundary_nodes/BUILD.bazel b/rs/tests/boundary_nodes/BUILD.bazel index ea3bc620e3a2..df1a2cb2eed7 100644 --- a/rs/tests/boundary_nodes/BUILD.bazel +++ b/rs/tests/boundary_nodes/BUILD.bazel @@ -27,6 +27,8 @@ system_test_nns( "vcpus": 16, }, enable_head_nns_variant = False, + enable_metrics = True, + prometheus_vm_required_host_features = ["performance"], tags = [ "colocate", ], diff --git a/rs/tests/consensus/BUILD.bazel b/rs/tests/consensus/BUILD.bazel index 60285eea20bb..69ef54a694b2 100644 --- a/rs/tests/consensus/BUILD.bazel +++ b/rs/tests/consensus/BUILD.bazel @@ -64,6 +64,7 @@ system_test( system_test( name = "catch_up_possible_test", guestos = "malicious", + prometheus_vm_scrape_interval_secs = 5, # TODO(NET-1683): Adjust test for faster p2p tags = [ "long_test", @@ -353,7 +354,9 @@ system_test_nns( "//conditions:default": [], }), enable_head_nns_variant = False, + enable_metrics = True, env = {"ENV_DEPS__IC_VERSION_FILE": "$(rootpath //bazel:version.txt)"}, + prometheus_vm_required_host_features = ["performance"], tags = [ "colocate", "manual", diff --git a/rs/tests/consensus/catch_up_possible_test.rs b/rs/tests/consensus/catch_up_possible_test.rs index 11ac394eda45..d7b040bcc4a5 100644 --- a/rs/tests/consensus/catch_up_possible_test.rs +++ b/rs/tests/consensus/catch_up_possible_test.rs @@ -25,7 +25,6 @@ use ic_consensus_system_test_catch_up_test_common::test_catch_up_possible; use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::systest; @@ -34,7 +33,6 @@ use ic_types::Height; use ic_types::malicious_behavior::MaliciousBehavior; use std::time::Duration; -const PROMETHEUS_SCRAPE_INTERVAL: Duration = Duration::from_secs(5); const TIMEOUT: Duration = Duration::from_secs(30 * 60); const EXECUTION_DELAY_FACTOR: f64 = 0.8; @@ -47,11 +45,6 @@ fn setup(env: TestEnv) { let execution_delay_ms = (EXECUTION_DELAY_FACTOR * TARGET_FR_MS as f64) as u64; let state_sync_delay_ms = (STATE_SYNC_DELAY_FACTOR * DKG_INTERVAL_TIME_MS as f64) as u64; - PrometheusVm::default() - .with_scrape_interval(PROMETHEUS_SCRAPE_INTERVAL) - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .add_subnet( Subnet::new(SubnetType::System) @@ -70,8 +63,6 @@ fn setup(env: TestEnv) { ) .setup_and_start(&env) .expect("failed to setup IC under test"); - - env.sync_with_prometheus(); } fn main() -> Result<()> { diff --git a/rs/tests/consensus/consensus_performance.rs b/rs/tests/consensus/consensus_performance.rs index e1b895be40a8..92a7267b6866 100644 --- a/rs/tests/consensus/consensus_performance.rs +++ b/rs/tests/consensus/consensus_performance.rs @@ -67,7 +67,7 @@ use ic_system_test_driver::driver::test_env_api::get_current_branch_version; use ic_system_test_driver::driver::{ farm::HostFeature, ic::{AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, + prometheus_vm::HasPrometheus, simulate_network::{FixedNetworkSimulation, SimulateNetwork}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, NnsCustomizations}, @@ -99,11 +99,6 @@ const NETWORK_SIMULATION: FixedNetworkSimulation = FixedNetworkSimulation::new() const SHOULD_SPAWN_JAEGER_VM: bool = false; fn setup(env: TestEnv) { - PrometheusVm::default() - .with_required_host_features(vec![HostFeature::Performance]) - .start(&env) - .expect("Failed to start prometheus VM"); - let mut ic_builder = InternetComputer::new(); if SHOULD_SPAWN_JAEGER_VM { @@ -141,7 +136,6 @@ fn setup(env: TestEnv) { env.topology_snapshot(), NnsCustomizations::default(), ); - env.sync_with_prometheus(); let topology_snapshot = env.topology_snapshot(); let (app_subnet, _) = get_app_subnet_and_node(&topology_snapshot); diff --git a/rs/tests/consensus/tecdsa/BUILD.bazel b/rs/tests/consensus/tecdsa/BUILD.bazel index e5fba1ed886f..d54c3ace5744 100644 --- a/rs/tests/consensus/tecdsa/BUILD.bazel +++ b/rs/tests/consensus/tecdsa/BUILD.bazel @@ -300,6 +300,11 @@ tecdsa_performance_test_template = system_test_nns( "//bazel:upload_perf_systest_results_enabled": ["upload_perf_systest_results"], "//conditions:default": [], }), + enable_metrics = True, + prometheus_vm_required_host_features = [ + "performance", + "spm", + ], tags = [ "colocate", "manual", @@ -342,11 +347,16 @@ system_test_nns( "boot_image_minimal_size_gibibytes": 500, }, enable_head_nns_variant = False, + enable_metrics = True, env = MESSAGE_CANISTER_ENV | SIGNER_CANISTER_ENV | { "TECDSA_PERFORMANCE_TEST_KEY_IDS": "ecdsa_secp256k1", "BENCHMARK_NAME": "tecdsa_performance_test", "DEFAULT_NODES_COUNT": DEFAULT_NODES_COUNT, }, + prometheus_vm_required_host_features = [ + "performance", + "spm", + ], tags = [ "colocate", "manual", @@ -371,11 +381,16 @@ system_test_nns( "boot_image_minimal_size_gibibytes": 500, }, enable_head_nns_variant = False, + enable_metrics = True, env = MESSAGE_CANISTER_ENV | SIGNER_CANISTER_ENV | { "TECDSA_PERFORMANCE_TEST_KEY_IDS": "schnorr_ed25519", "BENCHMARK_NAME": "tschnorr_ed25519_performance_test", "DEFAULT_NODES_COUNT": DEFAULT_NODES_COUNT, }, + prometheus_vm_required_host_features = [ + "performance", + "spm", + ], tags = [ "colocate", "manual", @@ -400,11 +415,16 @@ system_test_nns( "boot_image_minimal_size_gibibytes": 500, }, enable_head_nns_variant = False, + enable_metrics = True, env = MESSAGE_CANISTER_ENV | SIGNER_CANISTER_ENV | { "TECDSA_PERFORMANCE_TEST_KEY_IDS": "schnorr_bip340", "BENCHMARK_NAME": "tschnorr_bip340_performance_test", "DEFAULT_NODES_COUNT": DEFAULT_NODES_COUNT, }, + prometheus_vm_required_host_features = [ + "performance", + "spm", + ], tags = [ "colocate", "manual", @@ -429,11 +449,16 @@ system_test_nns( "boot_image_minimal_size_gibibytes": 500, }, enable_head_nns_variant = False, + enable_metrics = True, env = MESSAGE_CANISTER_ENV | SIGNER_CANISTER_ENV | { "TECDSA_PERFORMANCE_TEST_KEY_IDS": "vetkd_bls12_381_g2", "BENCHMARK_NAME": "vetkd_performance_test", "DEFAULT_NODES_COUNT": DEFAULT_NODES_COUNT, }, + prometheus_vm_required_host_features = [ + "performance", + "spm", + ], tags = [ "colocate", "manual", diff --git a/rs/tests/consensus/tecdsa/tecdsa_performance_test_template.rs b/rs/tests/consensus/tecdsa/tecdsa_performance_test_template.rs index da773bf7f646..65dbc82afb18 100644 --- a/rs/tests/consensus/tecdsa/tecdsa_performance_test_template.rs +++ b/rs/tests/consensus/tecdsa/tecdsa_performance_test_template.rs @@ -70,7 +70,7 @@ use ic_system_test_driver::driver::test_env_api::HasPublicApiUrl; use ic_system_test_driver::driver::{ farm::HostFeature, ic::{AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, + prometheus_vm::HasPrometheus, simulate_network::{FixedNetworkSimulation, SimulateNetwork}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, IcNodeContainer, NnsCustomizations}, @@ -181,11 +181,6 @@ pub fn setup(env: TestEnv) { let key_ids = make_key_ids(); info!(env.logger(), "Running the test with key ids: {:?}", key_ids); - PrometheusVm::default() - .with_required_host_features(vec![HostFeature::Performance, HostFeature::Supermicro]) - .start(&env) - .expect("Failed to start prometheus VM"); - let vm_resources = VmResources { vcpus: Some(NrOfVCPUs::new(64)), memory_kibibytes: Some(AmountOfMemoryKiB::new(512_142_680)), @@ -229,7 +224,6 @@ pub fn setup(env: TestEnv) { env.topology_snapshot(), NnsCustomizations::default(), ); - env.sync_with_prometheus(); } pub fn test(env: TestEnv) { diff --git a/rs/tests/dre/utils/lib.rs b/rs/tests/dre/utils/lib.rs index aff3613c35b0..bd1c10caccf8 100644 --- a/rs/tests/dre/utils/lib.rs +++ b/rs/tests/dre/utils/lib.rs @@ -5,7 +5,6 @@ use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::{ ic::{InternetComputer, Node, Subnet}, node_software_version::NodeSoftwareVersion, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, NnsCustomizations, READY_WAIT_TIMEOUT, RETRY_BACKOFF}, }; @@ -71,10 +70,6 @@ pub fn setup(env: TestEnv, config: IcConfig) { .for_each(|un| ic = ic.clone().with_api_boundary_node(un)), } } - - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); ic.setup_and_start(&env) .expect("Failed to setup IC under test"); @@ -82,8 +77,6 @@ pub fn setup(env: TestEnv, config: IcConfig) { env.topology_snapshot(), NnsCustomizations::default(), ); - - env.sync_with_prometheus(); } #[derive(Deserialize, Debug)] diff --git a/rs/tests/driver/src/driver/metrics_setup_task.rs b/rs/tests/driver/src/driver/metrics_setup_task.rs index b79f6939be74..13ddb50e5ea0 100644 --- a/rs/tests/driver/src/driver/metrics_setup_task.rs +++ b/rs/tests/driver/src/driver/metrics_setup_task.rs @@ -1,6 +1,6 @@ use crate::driver::test_env::HasIcPrepDir; use crate::driver::{ - constants::GROUP_SETUP_DIR, context::GroupContext, farm::HostFeature, + constants::GROUP_SETUP_DIR, context::GroupContext, farm::HostFeature, ic::VmResources, prometheus_vm::PrometheusVm, test_env::TestEnv, }; use slog::{debug, info}; @@ -23,6 +23,11 @@ pub(crate) fn metrics_setup_task(group_ctx: GroupContext) { .and_then(|s| serde_json::from_str(&s).map_err(|e| e.to_string())) .unwrap_or_default(); + let vm_resources: VmResources = std::env::var("PROMETHEUS_VM_RESOURCES") + .map_err(|e| e.to_string()) + .and_then(|s| serde_json::from_str(&s).map_err(|e| e.to_string())) + .unwrap_or_default(); + let prometheus_scrape_interval = std::env::var("PROMETHEUS_SCRAPE_INTERVAL_SECS") .ok() .and_then(|s| s.parse::().ok()) @@ -33,6 +38,7 @@ pub(crate) fn metrics_setup_task(group_ctx: GroupContext) { PrometheusVm::default() .with_required_host_features(host_features) + .with_vm_resources(vm_resources) .with_scrape_interval(prometheus_scrape_interval) .start(&env) .expect("failed to start prometheus VM"); diff --git a/rs/tests/execution/BUILD.bazel b/rs/tests/execution/BUILD.bazel index bf5e0a8691ca..a6e7a0040e94 100644 --- a/rs/tests/execution/BUILD.bazel +++ b/rs/tests/execution/BUILD.bazel @@ -190,6 +190,8 @@ system_test( system_test( name = "fill_execution_rounds_workload", + enable_metrics = True, + prometheus_vm_required_host_features = ["performance"], tags = [ "manual", ], diff --git a/rs/tests/execution/fill_execution_rounds_workload.rs b/rs/tests/execution/fill_execution_rounds_workload.rs index f35703c84464..12d942f8e198 100644 --- a/rs/tests/execution/fill_execution_rounds_workload.rs +++ b/rs/tests/execution/fill_execution_rounds_workload.rs @@ -17,7 +17,7 @@ use ic_system_test_driver::{ farm::HostFeature, group::SystemTestGroup, ic::{AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, + prometheus_vm::HasPrometheus, simulate_network::{ProductionSubnetTopology, SimulateNetwork}, test_env::TestEnv, test_env_api::{ @@ -90,11 +90,6 @@ const MAX_CANISTERS_INSTALLING_IN_PARALLEL: usize = 10; pub fn setup(env: TestEnv, subnet_size: usize, initial_notary_delay: Duration) { let logger = env.logger(); - PrometheusVm::default() - .with_required_host_features(vec![HostFeature::Performance]) - .start(&env) - .expect("failed to start prometheus VM"); - let path = get_dependency_path("rs/tests/jaeger_uvm_config_image.zst"); UniversalVm::new(JAEGER_VM_NAME.to_string()) @@ -133,7 +128,6 @@ pub fn setup(env: TestEnv, subnet_size: usize, initial_notary_delay: Duration) { ) .setup_and_start(&env) .expect("Failed to setup IC under test."); - env.sync_with_prometheus(); // Await Replicas info!(&logger, "Checking readiness of all replica nodes..."); diff --git a/rs/tests/message_routing/rejoin_test_large_state.rs b/rs/tests/message_routing/rejoin_test_large_state.rs index 36bb57b91a01..ab312273bc28 100644 --- a/rs/tests/message_routing/rejoin_test_large_state.rs +++ b/rs/tests/message_routing/rejoin_test_large_state.rs @@ -27,7 +27,6 @@ use ic_system_test_driver::driver::ic::{ AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, Subnet, VmResources, }; use ic_system_test_driver::driver::pot_dsl::{PotSetupFn, SysTestFn}; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, @@ -94,10 +93,6 @@ fn setup(env: TestEnv, config: Config) { config.nodes_count >= 4, "at least 4 nodes are required for state sync" ); - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .add_subnet( Subnet::new(SubnetType::System) @@ -123,8 +118,6 @@ fn setup(env: TestEnv, config: Config) { .nodes() .for_each(|node| node.await_status_is_healthy().unwrap()) }); - - env.sync_with_prometheus(); } fn test(env: TestEnv, config: Config) { diff --git a/rs/tests/message_routing/state_sync_malicious_chunk_test.rs b/rs/tests/message_routing/state_sync_malicious_chunk_test.rs index d268f24eb6f1..c52ce59a4e50 100644 --- a/rs/tests/message_routing/state_sync_malicious_chunk_test.rs +++ b/rs/tests/message_routing/state_sync_malicious_chunk_test.rs @@ -25,7 +25,6 @@ use ic_system_test_driver::driver::ic::{ AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, Subnet, VmResources, }; use ic_system_test_driver::driver::pot_dsl::{PotSetupFn, SysTestFn}; -use ic_system_test_driver::driver::prometheus_vm::PrometheusVm; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, @@ -122,10 +121,6 @@ impl Config { } } fn setup(env: TestEnv, config: Config) { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .add_subnet( Subnet::new(SubnetType::System) diff --git a/rs/tests/message_routing/xnet/slo_test_lib/xnet_slo_test_lib.rs b/rs/tests/message_routing/xnet/slo_test_lib/xnet_slo_test_lib.rs index ae4220018e02..3f2896d2b3fa 100644 --- a/rs/tests/message_routing/xnet/slo_test_lib/xnet_slo_test_lib.rs +++ b/rs/tests/message_routing/xnet/slo_test_lib/xnet_slo_test_lib.rs @@ -30,7 +30,6 @@ use futures::future::join_all; use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::ic::{InternetComputer, Subnet, VmResources}; use ic_system_test_driver::driver::pot_dsl::{PotSetupFn, SysTestFn}; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, NnsInstallationBuilder, @@ -70,7 +69,6 @@ pub struct Config { canisters_per_subnet: usize, canister_to_subnet_rate: usize, vm_resources: Option, - with_prometheus: bool, } impl Config { @@ -122,7 +120,6 @@ impl Config { canisters_per_subnet, canister_to_subnet_rate, vm_resources: None, - with_prometheus: false, } } @@ -131,12 +128,6 @@ impl Config { self } - pub fn with_prometheus(self) -> Self { - let mut config = self.clone(); - config.with_prometheus = true; - config - } - pub fn with_call_timeouts(self, timeouts_seconds: &[Option]) -> Self { let mut config = self.clone(); config.call_timeouts_seconds = timeouts_seconds.to_vec(); @@ -186,19 +177,11 @@ fn setup(env: TestEnv, config: Config) { .setup_and_start(&env) .expect("failed to setup IC under test"); - if config.with_prometheus { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - } env.topology_snapshot().subnets().for_each(|subnet| { subnet .nodes() .for_each(|node| node.await_status_is_healthy().unwrap()) }); - if config.with_prometheus { - env.sync_with_prometheus(); - } } pub fn test(env: TestEnv, config: Config) { diff --git a/rs/tests/message_routing/xnet/xnet_compatibility.rs b/rs/tests/message_routing/xnet/xnet_compatibility.rs index d03ec2d6b08e..c73d5798bec5 100644 --- a/rs/tests/message_routing/xnet/xnet_compatibility.rs +++ b/rs/tests/message_routing/xnet/xnet_compatibility.rs @@ -31,8 +31,6 @@ use ic_consensus_system_test_utils::upgrade::{ use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; -use ic_system_test_driver::driver::pot_dsl::{PotSetupFn, SysTestFn}; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, IcNodeSnapshot, get_guestos_img_version, @@ -53,10 +51,8 @@ const DKG_INTERVAL: u64 = 9; const NODES_PER_SUBNET: usize = 1; fn main() -> Result<()> { - let config = Config::default(); - let test = config.clone().test(); SystemTestGroup::new() - .with_setup(config.build()) + .with_setup(setup) .add_test(systest!(test)) .with_timeout_per_test(PER_TASK_TIMEOUT) // each task (including the setup function) may take up to `per_task_timeout`. .with_overall_timeout(OVERALL_TIMEOUT) // the entire group may take up to `overall_timeout`. @@ -64,31 +60,8 @@ fn main() -> Result<()> { Ok(()) } -#[derive(Clone, Debug, Default)] -pub struct Config { - with_prometheus: bool, -} - -impl Config { - pub fn with_prometheus(self) -> Self { - Self { - with_prometheus: true, - } - } - - /// Builds the IC instance. - pub fn build(self) -> impl PotSetupFn { - move |env: TestEnv| setup(env, self) - } - - /// Returns a test function based on this configuration. - pub fn test(self) -> impl SysTestFn { - move |env: TestEnv| test(env) - } -} - // Generic setup -fn setup(env: TestEnv, config: Config) { +fn setup(env: TestEnv) { fn subnet(subnet_type: SubnetType, custom_dkg: Option) -> Subnet { let mut subnet = Subnet::new(subnet_type).add_nodes(NODES_PER_SUBNET); if let Some(dkg_interval) = custom_dkg { @@ -96,11 +69,6 @@ fn setup(env: TestEnv, config: Config) { } subnet } - if config.with_prometheus { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - } let ic = InternetComputer::new(); ic.add_subnet(subnet(SubnetType::System, None)) .add_subnet(subnet(SubnetType::Application, Some(DKG_INTERVAL))) @@ -113,9 +81,6 @@ fn setup(env: TestEnv, config: Config) { .for_each(|node| node.await_status_is_healthy().unwrap()) }); install_nns_and_check_progress(env.topology_snapshot()); - if config.with_prometheus { - env.sync_with_prometheus(); - } } pub fn test(env: TestEnv) { diff --git a/rs/tests/message_routing/xnet/xnet_malicious_slices.rs b/rs/tests/message_routing/xnet/xnet_malicious_slices.rs index c19356d7ef69..073b98877499 100644 --- a/rs/tests/message_routing/xnet/xnet_malicious_slices.rs +++ b/rs/tests/message_routing/xnet/xnet_malicious_slices.rs @@ -26,7 +26,6 @@ use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; use ic_system_test_driver::driver::pot_dsl::{PotSetupFn, SysTestFn}; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, SubnetSnapshot, @@ -89,33 +88,20 @@ impl Config { // Generic setup fn setup(env: TestEnv, config: Config, malicious_behavior: MaliciousBehavior) { - std::thread::scope(|s| { - s.spawn(|| { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - }); - - s.spawn(|| { - (0..config.subnets) - .fold(InternetComputer::new(), |ic, _idx| { - ic.add_subnet( - Subnet::new(SubnetType::Application).add_malicious_nodes( - config.nodes_per_subnet, - malicious_behavior.clone(), - ), - ) - }) - .setup_and_start(&env) - .expect("failed to setup IC under test"); - env.topology_snapshot().subnets().for_each(|subnet| { - subnet - .nodes() - .for_each(|node| node.await_status_is_healthy().unwrap()) - }); - }); + (0..config.subnets) + .fold(InternetComputer::new(), |ic, _idx| { + ic.add_subnet( + Subnet::new(SubnetType::Application) + .add_malicious_nodes(config.nodes_per_subnet, malicious_behavior.clone()), + ) + }) + .setup_and_start(&env) + .expect("failed to setup IC under test"); + env.topology_snapshot().subnets().for_each(|subnet| { + subnet + .nodes() + .for_each(|node| node.await_status_is_healthy().unwrap()) }); - env.sync_with_prometheus(); } pub fn test(env: TestEnv, config: Config) { diff --git a/rs/tests/networking/BUILD.bazel b/rs/tests/networking/BUILD.bazel index 9521145d7992..53a1e2f0d41f 100644 --- a/rs/tests/networking/BUILD.bazel +++ b/rs/tests/networking/BUILD.bazel @@ -66,6 +66,7 @@ system_test_nns( system_test_nns( name = "canister_http_stress_test", + enable_metrics = True, env = { "PROXY_WASM_PATH": "$(rootpath //rs/rust_canisters/proxy_canister:proxy_canister)", }, @@ -87,6 +88,7 @@ system_test_nns( system_test_nns( name = "canister_http_soak_test", + enable_metrics = True, env = { "PROXY_WASM_PATH": "$(rootpath //rs/rust_canisters/proxy_canister:proxy_canister)", }, @@ -359,6 +361,8 @@ system_test_nns( system_test_nns( name = "p2p_performance_test", + enable_metrics = True, + prometheus_vm_required_host_features = ["performance"], tags = [ "manual", ], @@ -381,7 +385,9 @@ system_test_nns( "vcpus": 16, }, enable_head_nns_variant = False, + enable_metrics = True, flaky = True, # flakiness rate of over 1.1% over the month from 2025-02-11 till 2025-03-11. + prometheus_vm_required_host_features = ["performance"], tags = [ "colocate", ], @@ -394,10 +400,13 @@ system_test_nns( system_test_nns( name = "state_sync_performance", + colocated_test_driver_vm_required_host_features = ["performance"], enable_head_nns_variant = False, # only run this test with the mainnet NNS canisters. + enable_metrics = True, env = UNIVERSAL_CANISTER_ENV | { "STATESYNC_TEST_CANISTER_WASM_PATH": "$(rootpath //rs/rust_canisters/statesync_test:statesync-test-canister)", }, + prometheus_vm_required_host_features = ["performance"], tags = [ "colocate", "manual", @@ -429,7 +438,9 @@ system_test_nns( "vcpus": 16, }, enable_head_nns_variant = False, + enable_metrics = True, flaky = True, # flakiness rate of 1.83% over the month from 2025-02-11 till 2025-03-11. + prometheus_vm_required_host_features = ["performance"], tags = [ "colocate", ], @@ -442,6 +453,7 @@ system_test_nns( system_test( name = "cloner_canister_workload", + enable_metrics = True, env = { "CLONER_CANISTER_WASM_PATH": "$(rootpath //rs/tests/networking/canisters:cloner_canister)", }, diff --git a/rs/tests/networking/canister_http/canister_http.rs b/rs/tests/networking/canister_http/canister_http.rs index 6884b82e7aed..9dc99d9a2407 100644 --- a/rs/tests/networking/canister_http/canister_http.rs +++ b/rs/tests/networking/canister_http/canister_http.rs @@ -4,8 +4,6 @@ use ic_registry_subnet_features::SubnetFeatures; use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::farm::HostFeature; use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; -use ic_system_test_driver::driver::prometheus_vm::HasPrometheus; -use ic_system_test_driver::driver::prometheus_vm::PrometheusVm; use ic_system_test_driver::driver::simulate_network::ProductionSubnetTopology; use ic_system_test_driver::driver::simulate_network::SimulateNetwork; use ic_system_test_driver::driver::test_env_api::{ @@ -113,10 +111,6 @@ pub fn setup(env: TestEnv) { } pub fn stress_setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); - UniversalVm::new(String::from(UNIVERSAL_VM_NAME)) .with_config_img(get_dependency_path( "rs/tests/networking/canister_http/http_uvm_config_image.zst", @@ -154,8 +148,6 @@ pub fn stress_setup(env: TestEnv) { 13 => s.apply_network_settings(ProductionSubnetTopology::IO67), _ => {} }); - - env.sync_with_prometheus(); } pub fn get_universal_vm_address(env: &TestEnv) -> Ipv6Addr { diff --git a/rs/tests/networking/cloner_canister_workload.rs b/rs/tests/networking/cloner_canister_workload.rs index e447905488a9..c1f1df7a2c1c 100644 --- a/rs/tests/networking/cloner_canister_workload.rs +++ b/rs/tests/networking/cloner_canister_workload.rs @@ -27,7 +27,7 @@ use ic_system_test_driver::{ farm::HostFeature, group::SystemTestGroup, ic::{AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, + prometheus_vm::HasPrometheus, test_env::TestEnv, test_env_api::{HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, load_wasm}, }, @@ -65,10 +65,6 @@ fn main() -> Result<()> { pub fn setup(env: TestEnv) { let logger = env.logger(); - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - info!( &logger, "Step 1: Starting the IC with a subnet of size {SUBNET_SIZE}.", @@ -91,7 +87,6 @@ pub fn setup(env: TestEnv) { ) .setup_and_start(&env) .expect("Failed to setup IC under test."); - env.sync_with_prometheus(); // Await Replicas info!( diff --git a/rs/tests/networking/network_large_test.rs b/rs/tests/networking/network_large_test.rs index af763e914a63..dd89402cb833 100644 --- a/rs/tests/networking/network_large_test.rs +++ b/rs/tests/networking/network_large_test.rs @@ -19,7 +19,6 @@ use ic_system_test_driver::{ driver::{ group::SystemTestGroup, ic::{AmountOfMemoryKiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, HasVm, IcNodeContainer, NnsInstallationBuilder, @@ -53,9 +52,6 @@ pub fn setup(env: TestEnv) { memory_kibibytes: Some(AmountOfMemoryKiB::new(4195000)), // 4GiB boot_image_minimal_size_gibibytes: None, }; - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); InternetComputer::new() .add_subnet( Subnet::new(SubnetType::System) @@ -72,7 +68,6 @@ pub fn setup(env: TestEnv) { ) .setup_and_start(&env) .expect("Failed to setup IC under test."); - env.sync_with_prometheus(); } pub fn test(env: TestEnv) { diff --git a/rs/tests/networking/p2p_performance_test.rs b/rs/tests/networking/p2p_performance_test.rs index 590824e224d0..3e69cdea32ec 100644 --- a/rs/tests/networking/p2p_performance_test.rs +++ b/rs/tests/networking/p2p_performance_test.rs @@ -7,7 +7,7 @@ use ic_system_test_driver::{ farm::HostFeature, group::SystemTestGroup, ic::{AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, + prometheus_vm::HasPrometheus, simulate_network::{ProductionSubnetTopology, SimulateNetwork}, test_env::TestEnv, test_env_api::{ @@ -58,10 +58,6 @@ pub fn setup( boot_image_minimal_size_gibibytes: Option, ) { let logger = env.logger(); - PrometheusVm::default() - .with_required_host_features(vec![HostFeature::Performance]) - .start(&env) - .expect("failed to start prometheus VM"); let path = get_dependency_path("rs/tests/jaeger_uvm_config_image.zst"); @@ -105,7 +101,6 @@ pub fn setup( ) .setup_and_start(&env) .expect("Failed to setup IC under test."); - env.sync_with_prometheus(); info!(logger, "Step 1: Installing NNS canisters ..."); let nns_node = env .topology_snapshot() diff --git a/rs/tests/networking/state_sync_performance.rs b/rs/tests/networking/state_sync_performance.rs index 6b3e2daee12d..176f5511d5a1 100644 --- a/rs/tests/networking/state_sync_performance.rs +++ b/rs/tests/networking/state_sync_performance.rs @@ -23,7 +23,6 @@ use ic_system_test_driver::{ farm::HostFeature, group::SystemTestGroup, ic::{ImageSizeGiB, InternetComputer, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, simulate_network::{FixedNetworkSimulation, SimulateNetwork}, test_env::TestEnv, test_env_api::{ @@ -58,10 +57,6 @@ pub const SUCCESSFUL_STATE_SYNC_DURATION_SECONDS_COUNT: &str = "state_sync_duration_seconds_count{status=\"ok\"}"; fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .with_default_vm_resources(VmResources { boot_image_minimal_size_gibibytes: Some(ImageSizeGiB::new( @@ -100,7 +95,6 @@ fn setup(env: TestEnv) { .with_latency(LATENCY) .with_bandwidth(BANDWIDTH_MBITS), ); - env.sync_with_prometheus(); let nns_node = topology.root_subnet().nodes().next().unwrap(); NnsInstallationBuilder::new() @@ -177,7 +171,6 @@ fn test(env: TestEnv) { .block_for_newer_registry_version() .await .expect("Failed to wait for new topology version"); - env.sync_with_prometheus(); // Wait for the new nodes to report healthy for subnet in topology.subnets() { diff --git a/rs/tests/networking/subnet_update_workload/src/lib.rs b/rs/tests/networking/subnet_update_workload/src/lib.rs index 600392461558..656dace327c0 100644 --- a/rs/tests/networking/subnet_update_workload/src/lib.rs +++ b/rs/tests/networking/subnet_update_workload/src/lib.rs @@ -26,7 +26,6 @@ use ic_system_test_driver::{ driver::{ farm::HostFeature, ic::{ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, NnsInstallationBuilder, @@ -66,10 +65,6 @@ pub fn setup( required_host_features: Vec, ) { let logger = env.logger(); - PrometheusVm::default() - .with_required_host_features(required_host_features.clone()) - .start(&env) - .expect("failed to start prometheus VM"); let vm_resources = VmResources { vcpus: Some(NrOfVCPUs::new(16)), memory_kibibytes: None, @@ -114,8 +109,6 @@ pub fn setup( .await_status_is_healthy() .expect("API boundary node did not come up healthy."); } - - env.sync_with_prometheus(); } // Run a test with configurable number of update requests per second, diff --git a/rs/tests/nns/sns/lib/src/sns_deployment.rs b/rs/tests/nns/sns/lib/src/sns_deployment.rs index dc6612318614..8b5162a6ba15 100644 --- a/rs/tests/nns/sns/lib/src/sns_deployment.rs +++ b/rs/tests/nns/sns/lib/src/sns_deployment.rs @@ -23,7 +23,6 @@ use ic_system_test_driver::{ }, canister_requests, driver::{ - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{ GetFirstHealthyNodeSnapshot, HasPublicApiUrl, HasTopologySnapshot, IcNodeSnapshot, @@ -320,15 +319,8 @@ pub fn setup( } /// Sets up and starts the IC, and creates two subnets (one system subnet and -/// one application subnet). If `fast_test_setup` is false, also sets up -/// Prometheus. +/// one application subnet). fn setup_ic(env: &TestEnv, fast_test_setup: bool) { - if !fast_test_setup { - PrometheusVm::default() - .start(env) - .expect("failed to start prometheus VM"); - } - let mut ic = InternetComputer::new() // NNS .add_subnet( @@ -357,10 +349,6 @@ fn setup_ic(env: &TestEnv, fast_test_setup: bool) { } ic.setup_and_start(env) .expect("failed to setup IC under test"); - - if !fast_test_setup { - env.sync_with_prometheus(); - } } /// Sets up an SNS using "openchat-ish" parameters. diff --git a/rs/tests/system_tests.bzl b/rs/tests/system_tests.bzl index 9d34b2495d41..f7d925407a56 100644 --- a/rs/tests/system_tests.bzl +++ b/rs/tests/system_tests.bzl @@ -133,6 +133,7 @@ def system_test( flaky = False, enable_metrics = False, prometheus_vm_required_host_features = [], + prometheus_vm_resources = default_vm_resources, prometheus_vm_scrape_interval_secs = 10, colocated_test_driver_vm_resources = default_vm_resources, colocated_test_driver_vm_required_host_features = [], @@ -163,6 +164,12 @@ def system_test( flaky: rerun in case of failure (up to 3 times). enable_metrics: if True, a PrometheusVm will be spawned running both p8s (configured to scrape the testnet) & Grafana. prometheus_vm_required_host_features: a list of strings specifying the required host features of the PrometheusVm. + prometheus_vm_resources: a structure describing the required resources of the PrometheusVm. For example: + { + "vcpus": 32, + "memory_kibibytes": 125000000, + "boot_image_minimal_size_gibibytes": 500, + } prometheus_vm_scrape_interval_secs: the scrape interval in seconds for the PrometheusVm. Defaults to 10 seconds. colocated_test_driver_vm_resources: a structure describing the required resources of the colocated test-driver VM. For example: @@ -368,6 +375,7 @@ def system_test( env |= { "PROMETHEUS_VM_REQUIRED_HOST_FEATURES": json.encode(prometheus_vm_required_host_features), + "PROMETHEUS_VM_RESOURCES": json.encode(prometheus_vm_resources), "PROMETHEUS_VM_SCRAPE_INTERVAL_SECS": json.encode(prometheus_vm_scrape_interval_secs), } for dep in _env_deps.values(): diff --git a/rs/tests/testnets/BUILD.bazel b/rs/tests/testnets/BUILD.bazel index 2ee5093d0102..2e3a2b375931 100644 --- a/rs/tests/testnets/BUILD.bazel +++ b/rs/tests/testnets/BUILD.bazel @@ -27,6 +27,7 @@ system_test( system_test( name = "single_app_large_node", + enable_metrics = True, tags = [ "dynamic_testnet", "manual", @@ -41,6 +42,7 @@ system_test( system_test_nns( name = "single_app_large_node_with_nns", + enable_metrics = True, tags = [ "dynamic_testnet", "manual", @@ -71,6 +73,7 @@ system_test( system_test_nns( name = "small", + enable_metrics = True, tags = [ "dynamic_testnet", "manual", @@ -87,6 +90,7 @@ system_test_nns( system_test_nns( name = "small_bitcoin", + enable_metrics = True, tags = [ "dynamic_testnet", "manual", @@ -104,6 +108,7 @@ system_test_nns( system_test_nns( name = "small_high_perf", + enable_metrics = True, tags = [ "dynamic_testnet", "manual", @@ -137,6 +142,7 @@ system_test( system_test_nns( name = "small_nns", + enable_metrics = True, env = { "IC_ICRC1_LEDGER_WASM_PATH": "$(rootpath //rs/ledger_suite/icrc1/ledger:ledger_canister)", "II_WASM_PATH": "$(rootpath @ii_dev_canister//file)", @@ -165,6 +171,7 @@ system_test_nns( system_test_nns( name = "small_with_query_stats", + enable_metrics = True, env = { "IC_ICRC1_LEDGER_WASM_PATH": "$(rootpath //rs/ledger_suite/icrc1/ledger:ledger_canister)", "II_WASM_PATH": "$(rootpath @ii_dev_canister//file)", @@ -192,6 +199,7 @@ system_test_nns( system_test_nns( name = "sns_testing", + enable_metrics = True, env = SNS_CANISTER_ENV | { "II_WASM_PATH": "$(rootpath @ii_dev_canister//file)", "NNS_DAPP_WASM_PATH": "$(rootpath @nns_dapp_canister//file)", @@ -221,6 +229,7 @@ system_test_nns( system_test_nns( name = "medium", + enable_metrics = True, tags = [ "dynamic_testnet", "manual", @@ -237,6 +246,7 @@ system_test_nns( system_test_nns( name = "large", + enable_metrics = True, env = SNS_CANISTER_ENV | { "II_WASM_PATH": "$(rootpath @ii_dev_canister//file)", "NNS_DAPP_WASM_PATH": "$(rootpath @nns_dapp_canister//file)", @@ -265,6 +275,8 @@ system_test_nns( system_test_nns( name = "io_perf_benchmark", + enable_metrics = True, + prometheus_vm_required_host_features = ["performance"], tags = [ "dynamic_testnet", "manual", @@ -283,6 +295,7 @@ system_test_nns( system_test_nns( name = "src_testing", + enable_metrics = True, env = SNS_CANISTER_ENV | { "XRC_WASM_PATH": "$(rootpath //rs/rust_canisters/xrc_mock:xrc_mock_canister)", "II_WASM_PATH": "$(rootpath @ii_dev_canister//file)", @@ -318,6 +331,7 @@ system_test_nns( system_test_nns( name = "nns_recovery", + enable_metrics = True, guestos_update = "test", tags = [ "dynamic_testnet", @@ -339,6 +353,7 @@ system_test_nns( system_test_nns( name = "mainnet_nns", + enable_metrics = True, env = MAINNET_ENV | { # TODO (CON-1624): Remove those dependencies once #8170 reached mainnet NNS "IC_RECOVERY_PATH": "$(rootpath //rs/recovery:ic-recovery)", @@ -346,6 +361,12 @@ system_test_nns( }, env_inherit = ["SSH_AUTH_SOCK"], guestos = "mainnet_nns", + # Requires more resources to scrape mainnet topology + prometheus_vm_resources = { + "vcpus": 32, + "memory_kibibytes": 125000000, + "boot_image_minimal_size_gibibytes": 500, + }, tags = [ "dynamic_testnet", "manual", diff --git a/rs/tests/testnets/io_perf_benchmark.rs b/rs/tests/testnets/io_perf_benchmark.rs index a06b028158d3..6716c685930f 100644 --- a/rs/tests/testnets/io_perf_benchmark.rs +++ b/rs/tests/testnets/io_perf_benchmark.rs @@ -59,7 +59,6 @@ use ic_system_test_driver::driver::pot_dsl::PotSetupFn; use ic_system_test_driver::driver::{ farm::HostFeature, group::SystemTestGroup, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, IcNodeContainer}, }; @@ -211,12 +210,6 @@ impl Config { } pub fn setup(env: TestEnv, config: Config) { - // start p8s for metrics and dashboards - PrometheusVm::default() - .with_required_host_features(vec![HostFeature::Performance]) - .start(&env) - .expect("Failed to start prometheus VM"); - let mut ic = InternetComputer::new() .with_api_boundary_nodes(1) .add_subnet(Subnet::new(SubnetType::System).add_nodes(1)); @@ -303,5 +296,4 @@ pub fn setup(env: TestEnv, config: Config) { .start(&env) .expect("failed to setup ic-gateway"); } - env.sync_with_prometheus(); } diff --git a/rs/tests/testnets/large.rs b/rs/tests/testnets/large.rs index 7885031a5f93..0435cda6b465 100644 --- a/rs/tests/testnets/large.rs +++ b/rs/tests/testnets/large.rs @@ -48,7 +48,6 @@ use ic_system_test_driver::driver::ic::{ use ic_system_test_driver::driver::ic_gateway_vm::{HasIcGatewayVm, IcGatewayVm}; use ic_system_test_driver::driver::{ group::SystemTestGroup, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, IcNodeContainer}, }; @@ -70,10 +69,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - // start p8s for metrics and dashboards - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); // set up IC overriding the default resources to be more powerful let vm_resources = VmResources { vcpus: Some(NrOfVCPUs::new(64)), @@ -110,7 +105,6 @@ pub fn setup(env: TestEnv) { } let ic_gateway = env.get_deployed_ic_gateway("ic-gateway-0").unwrap(); let ic_gateway_url = ic_gateway.get_public_url(); - env.sync_with_prometheus(); // pick an SNS subnet among the application subnets let topology = env.topology_snapshot(); diff --git a/rs/tests/testnets/mainnet_nns.rs b/rs/tests/testnets/mainnet_nns.rs index a2d61306234d..ebb43e41378a 100644 --- a/rs/tests/testnets/mainnet_nns.rs +++ b/rs/tests/testnets/mainnet_nns.rs @@ -36,35 +36,14 @@ // Happy testing! use anyhow::Result; -use ic_system_test_driver::driver::{ - group::SystemTestGroup, - ic::{AmountOfMemoryKiB, ImageSizeGiB, NrOfVCPUs, VmResources}, - prometheus_vm::{HasPrometheus, PrometheusVm}, - test_env::TestEnv, -}; +use ic_system_test_driver::driver::group::SystemTestGroup; use ic_testnet_mainnet_nns::setup as setup_mainnet_nns; use std::time::Duration; -fn setup(env: TestEnv) { - // Requires more resources to scrape mainnet topology - PrometheusVm::default() - .with_vm_resources(VmResources { - vcpus: Some(NrOfVCPUs::new(32)), - memory_kibibytes: Some(AmountOfMemoryKiB::new(125000000)), // ~128 GiB - boot_image_minimal_size_gibibytes: Some(ImageSizeGiB::new(500)), - }) - .start(&env) - .expect("Failed to start prometheus VM"); - - setup_mainnet_nns(env.clone()); - - env.sync_with_prometheus(); -} - fn main() -> Result<()> { SystemTestGroup::new() .with_timeout_per_test(Duration::from_secs(90 * 60)) - .with_setup(setup) + .with_setup(setup_mainnet_nns) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/testnets/medium.rs b/rs/tests/testnets/medium.rs index 31c5706bec50..92b8fe1e43e8 100644 --- a/rs/tests/testnets/medium.rs +++ b/rs/tests/testnets/medium.rs @@ -54,7 +54,6 @@ use ic_system_test_driver::driver::{ group::SystemTestGroup, ic::{InternetComputer, Subnet}, ic_gateway_vm::{IC_GATEWAY_VM_NAME, IcGatewayVm}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, NnsCustomizations}, }; @@ -67,9 +66,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); InternetComputer::new() .add_subnet(Subnet::new(SubnetType::System).add_nodes(4)) .add_subnet(Subnet::new(SubnetType::Application).add_nodes(4)) @@ -84,5 +80,4 @@ pub fn setup(env: TestEnv) { IcGatewayVm::new(IC_GATEWAY_VM_NAME) .start(&env) .expect("failed to setup ic-gateway"); - env.sync_with_prometheus(); } diff --git a/rs/tests/testnets/nns_recovery.rs b/rs/tests/testnets/nns_recovery.rs index 8069ace507d0..721618186cea 100644 --- a/rs/tests/testnets/nns_recovery.rs +++ b/rs/tests/testnets/nns_recovery.rs @@ -39,7 +39,6 @@ use ic_nested_nns_recovery_common::{ SetupConfig, grant_backup_access_to_all_nns_nodes, replace_nns_with_unassigned_nodes, }; use ic_system_test_driver::driver::nested::HasNestedVms; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::{TestEnv, TestEnvAttribute}; use ic_system_test_driver::driver::test_env_api::*; use ic_system_test_driver::driver::test_setup::GroupSetup; @@ -58,10 +57,6 @@ fn setup(env: TestEnv) { .and_then(|s| s.parse::().ok()) .unwrap_or(DKG_INTERVAL_HEIGHT); - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - ic_nested_nns_recovery_common::setup( env.clone(), SetupConfig { @@ -104,8 +99,6 @@ fn log_instructions(env: TestEnv) { replace_nns_with_unassigned_nodes(&env); grant_backup_access_to_all_nns_nodes(&env, &backup_auth, &ssh_backup_pub_key); - env.sync_with_prometheus(); - let upgrade_version = get_guestos_update_img_version(); let upgrade_image_url = get_guestos_update_img_url(); let upgrade_image_hash = get_guestos_update_img_sha256(); diff --git a/rs/tests/testnets/single_app_large_node.rs b/rs/tests/testnets/single_app_large_node.rs index 9bb1af523753..e4b62a9e5a5f 100644 --- a/rs/tests/testnets/single_app_large_node.rs +++ b/rs/tests/testnets/single_app_large_node.rs @@ -41,7 +41,6 @@ use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{ AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources, }; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; fn main() -> Result<()> { @@ -52,10 +51,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .add_subnet( Subnet::new(SubnetType::Application) @@ -68,5 +63,4 @@ pub fn setup(env: TestEnv) { ) .setup_and_start(&env) .expect("failed to setup IC under test"); - env.sync_with_prometheus(); } diff --git a/rs/tests/testnets/single_app_large_node_with_nns.rs b/rs/tests/testnets/single_app_large_node_with_nns.rs index 7b7adab6bcb7..53aaefdead07 100644 --- a/rs/tests/testnets/single_app_large_node_with_nns.rs +++ b/rs/tests/testnets/single_app_large_node_with_nns.rs @@ -53,7 +53,6 @@ use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{ AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources, }; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::{HasTopologySnapshot, NnsCustomizations}; @@ -65,10 +64,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - InternetComputer::new() .add_subnet(Subnet::new(SubnetType::System).add_nodes(1)) .add_subnet( @@ -86,5 +81,4 @@ pub fn setup(env: TestEnv) { env.topology_snapshot(), NnsCustomizations::default(), ); - env.sync_with_prometheus(); } diff --git a/rs/tests/testnets/small.rs b/rs/tests/testnets/small.rs index 96b077c6fa92..28d2eb674533 100644 --- a/rs/tests/testnets/small.rs +++ b/rs/tests/testnets/small.rs @@ -42,7 +42,6 @@ use ic_system_test_driver::driver::{ group::SystemTestGroup, ic::{InternetComputer, Subnet}, ic_gateway_vm::{IC_GATEWAY_VM_NAME, IcGatewayVm}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, NnsCustomizations}, }; @@ -55,9 +54,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); InternetComputer::new() .add_subnet(Subnet::new(SubnetType::System).add_nodes(1)) .add_subnet(Subnet::new(SubnetType::Application).add_nodes(1)) @@ -72,5 +68,4 @@ pub fn setup(env: TestEnv) { IcGatewayVm::new(IC_GATEWAY_VM_NAME) .start(&env) .expect("failed to setup ic-gateway"); - env.sync_with_prometheus(); } diff --git a/rs/tests/testnets/small_bitcoin.rs b/rs/tests/testnets/small_bitcoin.rs index 0e880836638d..83ed127d51da 100644 --- a/rs/tests/testnets/small_bitcoin.rs +++ b/rs/tests/testnets/small_bitcoin.rs @@ -44,7 +44,6 @@ use ic_system_test_driver::driver::{ group::SystemTestGroup, ic::{AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, ic_gateway_vm::{IC_GATEWAY_VM_NAME, IcGatewayVm}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, NnsInstallationBuilder}, }; @@ -59,10 +58,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); - InternetComputer::new() .use_specified_ids_allocation_range() .add_subnet( @@ -82,7 +77,6 @@ pub fn setup(env: TestEnv) { IcGatewayVm::new(IC_GATEWAY_VM_NAME) .start(&env) .expect("failed to setup ic-gateway"); - env.sync_with_prometheus(); } fn await_nodes_healthy(env: &TestEnv) { diff --git a/rs/tests/testnets/small_high_perf.rs b/rs/tests/testnets/small_high_perf.rs index c997d0d4b308..35f0b5e843d7 100644 --- a/rs/tests/testnets/small_high_perf.rs +++ b/rs/tests/testnets/small_high_perf.rs @@ -43,7 +43,6 @@ use ic_system_test_driver::driver::{ group::SystemTestGroup, ic::{AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources}, ic_gateway_vm::{IC_GATEWAY_VM_NAME, IcGatewayVm}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, NnsCustomizations}, }; @@ -56,10 +55,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); - InternetComputer::new() .add_subnet(Subnet::new(SubnetType::System).add_nodes(1)) .add_subnet( @@ -81,5 +76,4 @@ pub fn setup(env: TestEnv) { IcGatewayVm::new(IC_GATEWAY_VM_NAME) .start(&env) .expect("failed to setup ic-gateway"); - env.sync_with_prometheus(); } diff --git a/rs/tests/testnets/small_nns.rs b/rs/tests/testnets/small_nns.rs index de8371314911..a8a5bef8397e 100644 --- a/rs/tests/testnets/small_nns.rs +++ b/rs/tests/testnets/small_nns.rs @@ -42,7 +42,6 @@ use ic_system_test_driver::driver::{ group::SystemTestGroup, ic::{InternetComputer, Subnet}, ic_gateway_vm::{HasIcGatewayVm, IC_GATEWAY_VM_NAME, IcGatewayVm}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::HasTopologySnapshot, }; @@ -58,10 +57,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); - InternetComputer::new() .add_subnet(Subnet::new(SubnetType::System).add_nodes(1)) .add_subnet(Subnet::new(SubnetType::Application).add_nodes(1)) @@ -78,8 +73,6 @@ pub fn setup(env: TestEnv) { .expect("failed to setup ic-gateway"); let ic_gateway = env.get_deployed_ic_gateway(IC_GATEWAY_VM_NAME).unwrap(); let ic_gateway_url = ic_gateway.get_public_url(); - env.sync_with_prometheus(); - install_ii_nns_dapp_and_subnet_rental(&env, &ic_gateway_url, None); set_authorized_subnets(&env); } diff --git a/rs/tests/testnets/small_with_query_stats.rs b/rs/tests/testnets/small_with_query_stats.rs index 6447aa8c5262..c5b9ca3164b1 100644 --- a/rs/tests/testnets/small_with_query_stats.rs +++ b/rs/tests/testnets/small_with_query_stats.rs @@ -8,7 +8,6 @@ use ic_system_test_driver::driver::{ group::SystemTestGroup, ic::{InternetComputer, Subnet}, ic_gateway_vm::{IC_GATEWAY_VM_NAME, IcGatewayVm}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, NnsCustomizations}, }; @@ -21,10 +20,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); - InternetComputer::new() .add_subnet( Subnet::new(SubnetType::System) @@ -47,5 +42,4 @@ pub fn setup(env: TestEnv) { IcGatewayVm::new(IC_GATEWAY_VM_NAME) .start(&env) .expect("failed to setup ic-gateway"); - env.sync_with_prometheus(); } diff --git a/rs/tests/testnets/sns_testing.rs b/rs/tests/testnets/sns_testing.rs index e7bd15714be4..b44b5afd563d 100644 --- a/rs/tests/testnets/sns_testing.rs +++ b/rs/tests/testnets/sns_testing.rs @@ -44,7 +44,6 @@ use ic_system_test_driver::driver::ic_gateway_vm::{ use ic_system_test_driver::driver::{ group::SystemTestGroup, ic::{InternetComputer, Subnet}, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasTopologySnapshot, IcNodeContainer}, }; @@ -63,10 +62,6 @@ fn main() -> Result<()> { } pub fn setup(env: TestEnv) { - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); - InternetComputer::new() .add_subnet(Subnet::new(SubnetType::System).add_nodes(1)) .add_subnet(Subnet::new(SubnetType::Application).add_nodes(1)) @@ -85,7 +80,6 @@ pub fn setup(env: TestEnv) { .expect("failed to setup ic-gateway"); let ic_gateway = env.get_deployed_ic_gateway(IC_GATEWAY_VM_NAME).unwrap(); let ic_gateway_url = ic_gateway.get_public_url(); - env.sync_with_prometheus(); let topology = env.topology_snapshot(); let mut app_subnets = topology diff --git a/rs/tests/testnets/src_testing.rs b/rs/tests/testnets/src_testing.rs index ce69f0369c81..75b1d4a6fd7d 100644 --- a/rs/tests/testnets/src_testing.rs +++ b/rs/tests/testnets/src_testing.rs @@ -49,7 +49,6 @@ use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; use ic_system_test_driver::driver::ic_gateway_vm::{HasIcGatewayVm, IcGatewayVm}; use ic_system_test_driver::driver::{ group::SystemTestGroup, - prometheus_vm::{HasPrometheus, PrometheusVm}, test_env::TestEnv, test_env_api::{HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer}, }; @@ -100,11 +99,6 @@ fn new_icp_cxdr_mock_exchange_rate_canister_init_payload(rate: u64) -> XrcMockIn } pub fn setup(env: TestEnv) { - // start p8s for metrics and dashboards - PrometheusVm::default() - .start(&env) - .expect("Failed to start prometheus VM"); - // set up IC let mut ic = InternetComputer::new().with_api_boundary_nodes(1); // the following subnets are gonna have IDs 1, 2, 3, ... @@ -140,7 +134,6 @@ pub fn setup(env: TestEnv) { .expect("failed to setup ic-gateway"); let ic_gateway = env.get_deployed_ic_gateway(ic_gatewway_name).unwrap(); let ic_gateway_url = ic_gateway.get_public_url(); - env.sync_with_prometheus(); // install II, NNS dapp, and Subnet Rental Canister install_ii_nns_dapp_and_subnet_rental(&env, &ic_gateway_url, None); From bce34cc2f6e85d4d71321183fc38e72ab8b38ad7 Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Mon, 5 Jan 2026 12:42:32 +0000 Subject: [PATCH 07/12] fix --- rs/tests/driver/src/driver/group.rs | 59 +++++++++++++++-------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index a2e8811e47a1..76ebecdec395 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -611,21 +611,12 @@ impl SystemTestGroup { Box::from(EmptyTask::new(keepalive_task_id)) as Box }; - let metrics_setup_task_id = TaskId::Test(String::from(METRICS_SETUP_TASK_NAME)); - let metrics_sync_task_id = TaskId::Test(String::from(METRICS_SYNC_TASK_NAME)); let metrics_enabled: bool = std::env::var("ENABLE_METRICS") .map(|v| v == "1" || v.to_lowercase() == "true") .unwrap_or(false); - let (metrics_setup_task, metrics_sync_task) = if metrics_enabled { - let metrics_setup_task = subproc( - metrics_setup_task_id, - { - let group_ctx = group_ctx.clone(); - move || metrics_setup_task(group_ctx) - }, - &mut compose_ctx, - quiet, - ); + + let metrics_sync_task_id = TaskId::Test(String::from(METRICS_SYNC_TASK_NAME)); + let metrics_sync_task = if metrics_enabled { let metrics_sync_task = subproc( metrics_sync_task_id, { @@ -635,16 +626,9 @@ impl SystemTestGroup { &mut compose_ctx, quiet, ); - ( - Box::from(metrics_setup_task) as Box, - Box::from(metrics_sync_task) as Box, - ) + Box::from(metrics_sync_task) as Box } else { - debug!(logger, "Not spawning metrics tasks"); - ( - Box::from(EmptyTask::new(metrics_setup_task_id)) as Box, - Box::from(EmptyTask::new(metrics_sync_task_id)) as Box, - ) + Box::from(EmptyTask::new(metrics_sync_task_id)) as Box }; let vector_logging_task_id = TaskId::Test(String::from(VECTOR_LOGGING_TASK_NAME)); @@ -718,15 +702,32 @@ impl SystemTestGroup { ) }); - let setup_plan = timed( - vec![ - Plan::Leaf { - task: Box::from(setup_task), - }, - Plan::Leaf { - task: metrics_setup_task, + let setup_plan: Plan> = Plan::Leaf { + task: Box::from(setup_task), + }; + + let setup_tasks = if metrics_enabled { + let metrics_setup_task_id = TaskId::Test(String::from(METRICS_SETUP_TASK_NAME)); + let metrics_setup_task = subproc( + metrics_setup_task_id, + { + let group_ctx = group_ctx.clone(); + move || metrics_setup_task(group_ctx) }, - ], + &mut compose_ctx, + quiet, + ); + let metrics_setup_task = Box::from(metrics_setup_task) as Box; + let metrics_setup_plan = Plan::Leaf { + task: metrics_setup_task, + }; + vec![setup_plan, metrics_setup_plan] + } else { + vec![setup_plan] + }; + + let setup_plan = timed( + setup_tasks, EvalOrder::Parallel, compose_ctx.timeout_per_test, None, From ef049cb13cd4ccf28522dc9eeb342c799f140044 Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Mon, 5 Jan 2026 13:23:25 +0000 Subject: [PATCH 08/12] wip --- rs/tests/boundary_nodes/BUILD.bazel | 3 +++ rs/tests/message_routing/BUILD.bazel | 1 + rs/tests/message_routing/rejoin_test_long_rounds.rs | 7 ------- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/rs/tests/boundary_nodes/BUILD.bazel b/rs/tests/boundary_nodes/BUILD.bazel index df1a2cb2eed7..0432a764b0be 100644 --- a/rs/tests/boundary_nodes/BUILD.bazel +++ b/rs/tests/boundary_nodes/BUILD.bazel @@ -137,6 +137,7 @@ system_test_nns( "memory_kibibytes": 512142680, "boot_image_minimal_size_gibibytes": 500, }, + enable_metrics = True, prometheus_vm_required_host_features = ["performance"], tags = ["manual"], runtime_deps = COUNTER_CANISTER_RUNTIME_DEPS, @@ -157,6 +158,7 @@ system_test( "memory_kibibytes": 512142680, "boot_image_minimal_size_gibibytes": 500, }, + enable_metrics = True, env_inherit = ["BOUNDARY_NODE_IPV6"], prometheus_vm_required_host_features = ["performance"], tags = [ @@ -179,6 +181,7 @@ system_test( "memory_kibibytes": 512142680, "boot_image_minimal_size_gibibytes": 500, }, + enable_metrics = True, env_inherit = ["BOUNDARY_NODE_IPV6"], prometheus_vm_required_host_features = ["performance"], tags = [ diff --git a/rs/tests/message_routing/BUILD.bazel b/rs/tests/message_routing/BUILD.bazel index 18a1168559e5..d307b53e7d76 100644 --- a/rs/tests/message_routing/BUILD.bazel +++ b/rs/tests/message_routing/BUILD.bazel @@ -97,6 +97,7 @@ system_test_nns( system_test_nns( name = "rejoin_test_long_rounds", enable_head_nns_variant = False, # only run this test with the mainnet NNS canisters. + enable_metrics = True, env = UNIVERSAL_CANISTER_ENV | { "STATESYNC_TEST_CANISTER_WASM_PATH": "$(rootpath //rs/rust_canisters/statesync_test:statesync-test-canister)", }, diff --git a/rs/tests/message_routing/rejoin_test_long_rounds.rs b/rs/tests/message_routing/rejoin_test_long_rounds.rs index e0bf9678f915..bdcd5a793cb6 100644 --- a/rs/tests/message_routing/rejoin_test_long_rounds.rs +++ b/rs/tests/message_routing/rejoin_test_long_rounds.rs @@ -24,7 +24,6 @@ use ic_system_test_driver::driver::ic::{ AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources, }; use ic_system_test_driver::driver::pot_dsl::{PotSetupFn, SysTestFn}; -use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::{ HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, @@ -80,10 +79,6 @@ impl Config { } fn setup(env: TestEnv, config: Config) { - PrometheusVm::default() - .start(&env) - .expect("failed to start prometheus VM"); - // VM resources are as for the "large" testnet. let vm_resources = VmResources { vcpus: Some(NrOfVCPUs::new(64)), @@ -106,8 +101,6 @@ fn setup(env: TestEnv, config: Config) { .nodes() .for_each(|node| node.await_status_is_healthy().unwrap()) }); - - env.sync_with_prometheus(); } fn test(env: TestEnv, config: Config) { From 2dd116ca83a4fa75b0cfbff6983a168671c144e3 Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Mon, 5 Jan 2026 13:27:39 +0000 Subject: [PATCH 09/12] wip --- rs/tests/driver/src/driver/metrics_sync_task.rs | 2 +- rs/tests/driver/src/driver/prometheus_vm.rs | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/rs/tests/driver/src/driver/metrics_sync_task.rs b/rs/tests/driver/src/driver/metrics_sync_task.rs index 9b9d82302aed..a63a9599f144 100644 --- a/rs/tests/driver/src/driver/metrics_sync_task.rs +++ b/rs/tests/driver/src/driver/metrics_sync_task.rs @@ -18,7 +18,7 @@ pub(crate) fn metrics_sync_task(group_ctx: GroupContext) { std::thread::sleep(Duration::from_secs(2)); } loop { - if let Err(e) = env.sync_with_prometheus_result() { + if let Err(e) = env.sync_with_prometheus() { warn!( logger, "Failed to sync with PrometheusVm due to: {}", diff --git a/rs/tests/driver/src/driver/prometheus_vm.rs b/rs/tests/driver/src/driver/prometheus_vm.rs index 71d389cfbac5..93a5371eb580 100644 --- a/rs/tests/driver/src/driver/prometheus_vm.rs +++ b/rs/tests/driver/src/driver/prometheus_vm.rs @@ -359,11 +359,9 @@ chown -R {SSH_USERNAME}:users {PROMETHEUS_SCRAPING_TARGETS_DIR} /// configuring its scraping targets based on the latest IC topology /// and finally downloading its data directory. pub trait HasPrometheus { - /// Same as `sync_with_prometheus_result` but panics in case it fails. - fn sync_with_prometheus(&self); /// Retrieves a topology snapshot, converts it into p8s scraping target /// JSON files and scps them to the prometheus VM. - fn sync_with_prometheus_result(&self) -> Result<()>; + fn sync_with_prometheus(&self) -> Result<()>; /// Downloads prometheus' data directory to the test artifacts /// such that we can run a local p8s on that later. @@ -375,11 +373,7 @@ pub trait HasPrometheus { } impl HasPrometheus for TestEnv { - fn sync_with_prometheus(&self) { - self.sync_with_prometheus_result().unwrap() - } - - fn sync_with_prometheus_result(&self) -> Result<()> { + fn sync_with_prometheus(&self) -> Result<()> { let vm_name = PROMETHEUS_VM_NAME.to_string(); // Write the scraping target JSON files to the local prometheus config directory. let prometheus_config_dir = self.get_universal_vm_config_dir(&vm_name); From fe131918dde0da9e2f1e8acee0e22601a829c7d3 Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Mon, 5 Jan 2026 14:45:25 +0000 Subject: [PATCH 10/12] fix colocated tests --- rs/tests/driver/src/driver/context.rs | 3 +++ rs/tests/driver/src/driver/group.rs | 15 +++++++++------ rs/tests/driver/src/driver/subprocess_task.rs | 4 ++++ rs/tests/idx/colocate_test.rs | 6 ++++++ rs/tests/system_tests.bzl | 5 +++++ 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/rs/tests/driver/src/driver/context.rs b/rs/tests/driver/src/driver/context.rs index c7632abfacc9..61d66208c5ff 100644 --- a/rs/tests/driver/src/driver/context.rs +++ b/rs/tests/driver/src/driver/context.rs @@ -24,6 +24,7 @@ pub struct GroupContext { pub keepalive: bool, pub no_farm_keepalive: bool, pub group_base_name: String, + pub enable_metrics: bool, pub logs_enabled: bool, pub exclude_logs: Vec, pub quiet: bool, @@ -42,6 +43,7 @@ impl GroupContext { keepalive: bool, no_farm_keepalive: bool, group_base_name: String, + enable_metrics: bool, logs_enabled: bool, exclude_logs: Vec, quiet: bool, @@ -68,6 +70,7 @@ impl GroupContext { keepalive, no_farm_keepalive, group_base_name, + enable_metrics, logs_enabled, exclude_logs, quiet, diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index 76ebecdec395..110ba23375d3 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -112,6 +112,12 @@ pub struct CliArgs { )] pub required_host_features: Option>, + #[clap( + long = "enable-metrics", + help = "If set, the PrometheusVm, running Prometheus and Grafana, will be spawned." + )] + pub enable_metrics: bool, + #[clap(long = "no-logs", help = "If set, the vector vm will not be spawned.")] pub no_logs: bool, @@ -611,12 +617,8 @@ impl SystemTestGroup { Box::from(EmptyTask::new(keepalive_task_id)) as Box }; - let metrics_enabled: bool = std::env::var("ENABLE_METRICS") - .map(|v| v == "1" || v.to_lowercase() == "true") - .unwrap_or(false); - let metrics_sync_task_id = TaskId::Test(String::from(METRICS_SYNC_TASK_NAME)); - let metrics_sync_task = if metrics_enabled { + let metrics_sync_task = if group_ctx.enable_metrics { let metrics_sync_task = subproc( metrics_sync_task_id, { @@ -706,7 +708,7 @@ impl SystemTestGroup { task: Box::from(setup_task), }; - let setup_tasks = if metrics_enabled { + let setup_tasks = if group_ctx.enable_metrics { let metrics_setup_task_id = TaskId::Test(String::from(METRICS_SETUP_TASK_NAME)); let metrics_setup_task = subproc( metrics_setup_task_id, @@ -876,6 +878,7 @@ impl SystemTestGroup { args.keepalive, args.no_farm_keepalive || args.no_group_ttl, args.group_base_name, + args.enable_metrics, !args.no_logs, args.exclude_logs, args.quiet, diff --git a/rs/tests/driver/src/driver/subprocess_task.rs b/rs/tests/driver/src/driver/subprocess_task.rs index 7b87fe808e9b..8c96f1fdb58f 100644 --- a/rs/tests/driver/src/driver/subprocess_task.rs +++ b/rs/tests/driver/src/driver/subprocess_task.rs @@ -74,6 +74,10 @@ impl Task for SubprocessTask { .arg("--group-base-name") .arg(self.group_ctx.group_base_name.clone()); + if self.group_ctx.enable_metrics { + child_cmd.arg("--enable-metrics"); + } + if !self.group_ctx.logs_enabled { child_cmd.arg("--no-logs"); } diff --git a/rs/tests/idx/colocate_test.rs b/rs/tests/idx/colocate_test.rs index 298eee6e6947..0b68e94b4cd7 100644 --- a/rs/tests/idx/colocate_test.rs +++ b/rs/tests/idx/colocate_test.rs @@ -242,6 +242,11 @@ fn setup(env: TestEnv) { let forward_ssh_agent = env::var("COLOCATED_TEST_DRIVER_VM_FORWARD_SSH_AGENT").unwrap_or("".to_string()); + let metrics_flag = match env::var("ENABLE_METRICS") { + Ok(val) if val == "1" || val.eq_ignore_ascii_case("true") => "--enable-metrics".to_string(), + _ => "".to_string(), + }; + let logs_flag = if env::var("VECTOR_VM_PATH").is_err() { "--no-logs".to_string() } else { @@ -298,6 +303,7 @@ docker run \ --no-delete-farm-group --no-farm-keepalive \ {required_host_features} \ --group-base-name {colocated_test} \ + {metrics_flag} \ {logs_flag} \ {exclude_logs_args} \ run diff --git a/rs/tests/system_tests.bzl b/rs/tests/system_tests.bzl index f7d925407a56..f4dfb44286fd 100644 --- a/rs/tests/system_tests.bzl +++ b/rs/tests/system_tests.bzl @@ -69,6 +69,11 @@ def _run_system_test(ctx): # set some extra arguments for the test driver extra_args = [] + # we enable metrics _if_ the ENABLE_METRICS env var is set to "1" or "true" but only if it's _not_ a colocated test + # Since the colocated driver will --enable-metrics itself based on the ENABLE_METRICS env var. + if ctx.attr.env.get("ENABLE_METRICS", "0") in ("1", "true") and ctx.executable.colocated_test_bin == None: + extra_args.append("--enable-metrics") + # we enable logs _if_ the VECTOR_VM_PATH is set, but only if it's _not_ a colocated test # (colocated tests have their own vector VM logic) enable_logs = ("VECTOR_VM_PATH" in ctx.attr.env) and ctx.executable.colocated_test_bin == None From c2a44f65948d93133d6cd1b8d0a938e74534473e Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Tue, 6 Jan 2026 13:07:25 +0000 Subject: [PATCH 11/12] extra docs --- rs/tests/driver/src/driver/group.rs | 4 ++++ rs/tests/driver/src/driver/prometheus_vm.rs | 1 + rs/tests/system_tests.bzl | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index 110ba23375d3..f06fd93fba46 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -617,6 +617,7 @@ impl SystemTestGroup { Box::from(EmptyTask::new(keepalive_task_id)) as Box }; + // The metrics_sync_task periodically syncs the targets in the current IC topology with Prometheus. let metrics_sync_task_id = TaskId::Test(String::from(METRICS_SYNC_TASK_NAME)); let metrics_sync_task = if group_ctx.enable_metrics { let metrics_sync_task = subproc( @@ -708,6 +709,9 @@ impl SystemTestGroup { task: Box::from(setup_task), }; + // The setup_tasks always includes the setup_task which executes the setup function. + // In case metrics is enabled it also includes the metrics_setup_task which sets up the PrometheusVm. + // These tasks are executed in parallel as part of the setup_plan below. let setup_tasks = if group_ctx.enable_metrics { let metrics_setup_task_id = TaskId::Test(String::from(METRICS_SETUP_TASK_NAME)); let metrics_setup_task = subproc( diff --git a/rs/tests/driver/src/driver/prometheus_vm.rs b/rs/tests/driver/src/driver/prometheus_vm.rs index 93a5371eb580..22996d283a88 100644 --- a/rs/tests/driver/src/driver/prometheus_vm.rs +++ b/rs/tests/driver/src/driver/prometheus_vm.rs @@ -91,6 +91,7 @@ pub struct PrometheusVm { scrape_interval: Duration, } +/// Stores a hash of the Prometheus scraping target JSON files #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct PrometheusConfigHash { hash: String, diff --git a/rs/tests/system_tests.bzl b/rs/tests/system_tests.bzl index f4dfb44286fd..a61b5c149a59 100644 --- a/rs/tests/system_tests.bzl +++ b/rs/tests/system_tests.bzl @@ -70,7 +70,8 @@ def _run_system_test(ctx): extra_args = [] # we enable metrics _if_ the ENABLE_METRICS env var is set to "1" or "true" but only if it's _not_ a colocated test - # Since the colocated driver will --enable-metrics itself based on the ENABLE_METRICS env var. + # Since the colocated driver will --enable-metrics itself based on the ENABLE_METRICS env var + # we should not enable it in the outer driver since you would end up with two Prometheus VMs. if ctx.attr.env.get("ENABLE_METRICS", "0") in ("1", "true") and ctx.executable.colocated_test_bin == None: extra_args.append("--enable-metrics") From 0073081f745a61b4d290f1c3dd40bde69c818cdd Mon Sep 17 00:00:00 2001 From: Bas van Dijk Date: Tue, 6 Jan 2026 14:56:59 +0000 Subject: [PATCH 12/12] Don't crash on a failing topology snapshot --- rs/tests/driver/src/driver/prometheus_vm.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rs/tests/driver/src/driver/prometheus_vm.rs b/rs/tests/driver/src/driver/prometheus_vm.rs index 22996d283a88..535630a53f51 100644 --- a/rs/tests/driver/src/driver/prometheus_vm.rs +++ b/rs/tests/driver/src/driver/prometheus_vm.rs @@ -386,10 +386,12 @@ impl HasPrometheus for TestEnv { None }; + let topology_snapshot = self.safe_topology_snapshot()?; + sync_prometheus_config_dir( prometheus_config_dir.clone(), group_name.clone(), - self.topology_snapshot(), + topology_snapshot, &playnet_domain, )?; sync_prometheus_config_dir_with_ic_gateways(