From e1c58a33002b8b3721f0a1619f0ee884786c8183 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 09:10:24 +0000 Subject: [PATCH 1/5] added amd-smi interface --- codecarbon/core/gpu.py | 193 ++++++++++++++++++++++++++------ codecarbon/core/util.py | 20 ++++ codecarbon/emissions_tracker.py | 7 +- 3 files changed, 185 insertions(+), 35 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 70a81cabc..97158c4c4 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -1,10 +1,23 @@ +from collections import namedtuple from dataclasses import dataclass, field -import pynvml - from codecarbon.core.units import Energy, Power, Time +from codecarbon.core.util import is_amd_system, is_nvidia_system from codecarbon.external.logger import logger +USE_AMDSMI = False +USE_PYNVML = False + +if is_nvidia_system(): + import pynvml + + USE_PYNVML = True + +if is_amd_system(): + import amdsmi + + USE_AMDSMI = True + @dataclass class GPUDevice: @@ -92,46 +105,105 @@ def _get_total_energy_consumption(self): """Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a """ - return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) + elif USE_AMDSMI: + # returns energy in microjoules (amd-smi metric --energy) + return amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + else: + raise Exception("No GPU interface available") def _get_gpu_name(self): """Returns the name of the GPU device https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481 """ - name = pynvml.nvmlDeviceGetName(self.handle) + if USE_PYNVML: + name = pynvml.nvmlDeviceGetName(self.handle) + elif USE_AMDSMI: + name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] + else: + raise Exception("No GPU interface available") + return self._to_utf8(name) def _get_uuid(self): """Returns the globally unique GPU device UUID https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g72710fb20f30f0c2725ce31579832654 """ - uuid = pynvml.nvmlDeviceGetUUID(self.handle) + if USE_PYNVML: + uuid = pynvml.nvmlDeviceGetUUID(self.handle) + elif USE_AMDSMI: + uuid = amdsmi.amdsmi_get_device_uuid(self.handle) + else: + raise Exception("No GPU interface available") + return self._to_utf8(uuid) def _get_memory_info(self): """Returns memory info in bytes https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2dfeb1db82aa1de91aa6edf941c85ca8 """ - return pynvml.nvmlDeviceGetMemoryInfo(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetMemoryInfo(self.handle) + elif USE_AMDSMI: + # returns memory in megabytes (amd-smi metric --mem-usage) + memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) + AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) + return AMDMemory( + total=memory_info["vram_total"] * 1024 * 1024, + used=memory_info["vram_used"] * 1024 * 1024, + free=(memory_info["vram_total"] - memory_info["vram_used"]) + * 1024 + * 1024, + ) + else: + raise Exception("No GPU interface available") def _get_temperature(self): """Returns degrees in the Celsius scale 
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121 """ - return pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU) + if USE_PYNVML: + return pynvml.nvmlDeviceGetTemperature( + self.handle, + sensor=pynvml.NVML_TEMPERATURE_GPU, + ) + elif USE_AMDSMI: + return amdsmi.amdsmi_dev_get_temp_metric( + self.handle, + sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, + metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, + ) + else: + raise Exception("No GPU interface available") def _get_power_usage(self): """Returns power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87 """ - return pynvml.nvmlDeviceGetPowerUsage(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetPowerUsage(self.handle) + elif USE_AMDSMI: + # returns power in Watts (amd-smi metric --power) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] + * 1000 + ) + else: + raise Exception("No GPU interface available") def _get_power_limit(self): """Returns max power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad """ try: - return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + elif USE_AMDSMI: + # returns power limit in Watts (amd-smi static --limit) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 + ) except Exception: return None @@ -139,51 +211,100 @@ def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t """ - return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu + if USE_PYNVML: + return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu + elif USE_AMDSMI: + return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] + else: + raise Exception("No GPU interface available") def _get_compute_mode(self): """Returns the compute mode of the GPU https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gbed1b88f2e3ba39070d31d1db4340233 """ - return pynvml.nvmlDeviceGetComputeMode(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetComputeMode(self.handle) + elif USE_AMDSMI: + return None + else: + raise Exception("No GPU interface available") def _get_compute_processes(self): - """Returns the list of processes ids having a compute context on the - device with the memory used + """Returns the list of processes ids having a compute context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68 """ try: - processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) - - return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] - except pynvml.NVMLError: + if USE_PYNVML: + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) + return [ + {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes + ] + elif USE_AMDSMI: + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_info = [ + amdsmi.amdsmi_get_process_info(self.handle, p) + for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": 
p["memory_usage"]["vram_usage"]} + for p in processes_info + ] + except Exception: return [] def _get_graphics_processes(self): - """Returns the list of processes ids having a graphics context on the - device with the memory used + """Returns the list of processes ids having a graphics context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e """ try: - processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) - - return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] - except pynvml.NVMLError: + if USE_PYNVML: + processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) + return [ + {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes + ] + elif USE_AMDSMI: + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_info = [ + amdsmi.amdsmi_get_process_info(self.handle, p) + for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} + for p in processes_info + if p["engine_usage"]["gfx"] > 0 + ] + except Exception: return [] class AllGPUDevices: def __init__(self): if is_gpu_details_available(): - logger.debug("GPU available. Starting setup") - self.device_count = pynvml.nvmlDeviceGetCount() + if USE_PYNVML: + logger.debug("Nvidia GPU available. Starting setup") + pynvml.nvmlInit() + self.device_count = pynvml.nvmlDeviceGetCount() + elif USE_AMDSMI: + logger.debug("AMD GPU available. Starting setup") + amdsmi.amdsmi_init() + self.device_count = len(amdsmi.amdsmi_get_device_handles()) + else: + logger.error("No GPU interface available") + self.device_count = 0 else: logger.error("There is no GPU available") self.device_count = 0 self.devices = [] for i in range(self.device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) + if USE_PYNVML: + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + gpu_device = GPUDevice(handle=handle, gpu_index=i) + elif USE_AMDSMI: + handle = amdsmi.amdsmi_get_device_handles()[i] + gpu_device = GPUDevice(handle=handle, gpu_index=i) + else: + raise Exception("No GPU interface available") + self.devices.append(gpu_device) def get_gpu_static_info(self): @@ -206,7 +327,7 @@ def get_gpu_static_info(self): devices_static_info.append(gpu_device.get_static_details()) return devices_static_info - except pynvml.NVMLError: + except Exception: logger.warning("Failed to retrieve gpu static info", exc_info=True) return [] @@ -238,7 +359,7 @@ def get_gpu_details(self): devices_info.append(gpu_device.get_gpu_details()) return devices_info - except pynvml.NVMLError: + except Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] @@ -261,7 +382,7 @@ def get_delta(self, last_duration: Time): devices_info.append(gpu_device.delta(last_duration)) return devices_info - except pynvml.NVMLError: + except Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] @@ -269,8 +390,14 @@ def get_delta(self, last_duration: Time): def is_gpu_details_available(): """Returns True if the GPU details are available.""" try: - pynvml.nvmlInit() - return True + if USE_PYNVML: + pynvml.nvmlInit() + return True + elif USE_AMDSMI: + amdsmi.amdsmi_init() + return True + else: + return False - except pynvml.NVMLError: + except Exception: return False diff --git a/codecarbon/core/util.py b/codecarbon/core/util.py index 7bf66edb3..ef1d7b81b 100644 --- 
a/codecarbon/core/util.py +++ b/codecarbon/core/util.py @@ -117,3 +117,23 @@ def count_cpus() -> int: num_cpus = num_cpus_matches[0].replace("NumCPUs=", "") logger.debug(f"Detected {num_cpus} cpus available on SLURM.") return int(num_cpus) + + +def is_amd_system(): + """Returns True if the system has an amd-smi interface.""" + try: + # Check if amd-smi is available + subprocess.check_output(["amd-smi", "--help"]) + return True + except subprocess.CalledProcessError: + return False + + +def is_nvidia_system(): + """Returns True if the system has an nvidia-smi interface.""" + try: + # Check if nvidia-smi is available + subprocess.check_output(["nvidia-smi", "--help"]) + return True + except Exception: + return False diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index b16bbf0de..249dd8e9c 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -18,7 +18,7 @@ from codecarbon.core.config import get_hierarchical_config, parse_gpu_ids from codecarbon.core.emissions import Emissions from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import count_cpus, suppress +from codecarbon.core.util import count_cpus, is_amd_system, is_nvidia_system, suppress from codecarbon.external.geography import CloudMetadata, GeoMetadata from codecarbon.external.hardware import CPU, GPU, RAM from codecarbon.external.logger import logger, set_logger_format, set_logger_level @@ -280,7 +280,10 @@ def __init__( # Hardware detection logger.info("[setup] GPU Tracking...") if gpu.is_gpu_details_available(): - logger.info("Tracking Nvidia GPU via pynvml") + if is_nvidia_system(): + logger.info("Tracking Nvidia GPU via pynvml") + elif is_amd_system(): + logger.info("Tracking AMD GPU via amdsmi") gpu_devices = GPU.from_utils(self._gpu_ids) self._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] From fc93306ee9d8d14265b71efe75b748d4cb91d264 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 09:46:50 +0000 Subject: [PATCH 2/5] fix energy unit --- codecarbon/core/gpu.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 97158c4c4..fe0edc8e7 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -108,8 +108,12 @@ def _get_total_energy_consumption(self): if USE_PYNVML: return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) elif USE_AMDSMI: - # returns energy in microjoules (amd-smi metric --energy) - return amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + # returns energy in "Energy Status Units" which is equivalent to 15.3 microjoules (amd-smi metric --energy) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + * 15.3 + / 1000 + ) else: raise Exception("No GPU interface available") From 0626e4b6a4753ff3ee9b6c684010a94702110444 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 10:04:52 +0000 Subject: [PATCH 3/5] use counter_resolution instead of hard coding it --- codecarbon/core/gpu.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index fe0edc8e7..6fe632a2c 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -108,12 +108,9 @@ def _get_total_energy_consumption(self): if USE_PYNVML: return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) elif USE_AMDSMI: - # returns energy in "Energy Status Units" which is 
equivalent to 15.3 microjoules (amd-smi metric --energy) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] - * 15.3 - / 1000 - ) + # returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules + energy = amdsmi.amdsmi_dev_get_energy_count(self.handle) + return energy["power"] * energy["counter_resolution"] / 1000 else: raise Exception("No GPU interface available") From 37f07ecf9b4ac5781389747fd0a2a92d35ce1197 Mon Sep 17 00:00:00 2001 From: benoit-cty <4-benoit-cty@users.noreply.git.leximpact.dev> Date: Fri, 26 Jan 2024 12:40:28 +0100 Subject: [PATCH 4/5] wip : handle AMD and Nvidia at the same time --- codecarbon/core/gpu.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 6fe632a2c..a69c98086 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -279,34 +279,35 @@ def _get_graphics_processes(self): class AllGPUDevices: + devices = [] + device_count:int = 0 + def __init__(self): + self.devices = [] if is_gpu_details_available(): if USE_PYNVML: logger.debug("Nvidia GPU available. Starting setup") pynvml.nvmlInit() self.device_count = pynvml.nvmlDeviceGetCount() - elif USE_AMDSMI: + for i in range(self.device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + gpu_device = GPUDevice(handle=handle, gpu_index=i) + self.devices.append(gpu_device) + if USE_AMDSMI: logger.debug("AMD GPU available. Starting setup") amdsmi.amdsmi_init() self.device_count = len(amdsmi.amdsmi_get_device_handles()) + for i in range(self.device_count): + handle = amdsmi.amdsmi_get_device_handles()[i] + gpu_device = GPUDevice(handle=handle, gpu_index=i) + self.devices.append(gpu_device) else: logger.error("No GPU interface available") - self.device_count = 0 else: logger.error("There is no GPU available") - self.device_count = 0 - self.devices = [] - for i in range(self.device_count): - if USE_PYNVML: - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) - elif USE_AMDSMI: - handle = amdsmi.amdsmi_get_device_handles()[i] - gpu_device = GPUDevice(handle=handle, gpu_index=i) - else: - raise Exception("No GPU interface available") + self.device_count = len(self.devices) - self.devices.append(gpu_device) + def get_gpu_static_info(self): """Get all GPUs static information. 
From 0002c2e57681ac3afec8b8c09f32249434deb735 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 29 Jan 2024 14:36:50 +0000 Subject: [PATCH 5/5] added support for amd and nvidia at the same time --- codecarbon/core/gpu.py | 335 +++++++++++++++----------------- codecarbon/core/util.py | 20 -- codecarbon/emissions_tracker.py | 13 +- 3 files changed, 169 insertions(+), 199 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index a69c98086..c846badf9 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -1,32 +1,66 @@ +import subprocess +from typing import List, Any from collections import namedtuple from dataclasses import dataclass, field + from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import is_amd_system, is_nvidia_system from codecarbon.external.logger import logger -USE_AMDSMI = False -USE_PYNVML = False -if is_nvidia_system(): +def is_rocm_system(): + """Returns True if the system has an rocm-smi interface.""" + try: + # Check if rocm-smi is available + subprocess.check_output(["rocm-smi", "--help"]) + return True + except subprocess.CalledProcessError: + return False + + +def is_nvidia_system(): + """Returns True if the system has an nvidia-smi interface.""" + try: + # Check if nvidia-smi is available + subprocess.check_output(["nvidia-smi", "--help"]) + return True + except Exception: + return False + + +try: import pynvml - USE_PYNVML = True + PYNVML_AVAILABLE = True +except ImportError: + if is_nvidia_system(): + logger.warning( + "Nvidia GPU detected but pynvml is not available. " + "Please install pynvml to get GPU metrics." + ) + PYNVML_AVAILABLE = False -if is_amd_system(): +try: import amdsmi - USE_AMDSMI = True + AMDSMI_AVAILABLE = True +except ImportError: + if is_rocm_system(): + logger.warning( + "AMD GPU detected but amdsmi is not available. " + "Please install amdsmi to get GPU metrics." 
+ ) + AMDSMI_AVAILABLE = False @dataclass class GPUDevice: - handle: any + handle: Any gpu_index: int - # Energy consumed in kWh - energy_delta: Energy = field(default_factory=lambda: Energy(0)) # Power based on reading power: Power = field(default_factory=lambda: Power(0)) + # Energy consumed in kWh + energy_delta: Energy = field(default_factory=lambda: Energy(0)) # Last energy reading in kWh last_energy: Energy = field(default_factory=lambda: Energy(0)) @@ -101,213 +135,184 @@ def _to_utf8(self, str_or_bytes): return str_or_bytes + +@dataclass +class NvidiaGPUDevice(GPUDevice): def _get_total_energy_consumption(self): """Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) - elif USE_AMDSMI: - # returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules - energy = amdsmi.amdsmi_dev_get_energy_count(self.handle) - return energy["power"] * energy["counter_resolution"] / 1000 - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) def _get_gpu_name(self): """Returns the name of the GPU device https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481 """ - if USE_PYNVML: - name = pynvml.nvmlDeviceGetName(self.handle) - elif USE_AMDSMI: - name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] - else: - raise Exception("No GPU interface available") - + name = pynvml.nvmlDeviceGetName(self.handle) return self._to_utf8(name) def _get_uuid(self): """Returns the globally unique GPU device UUID https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g72710fb20f30f0c2725ce31579832654 """ - if USE_PYNVML: - uuid = pynvml.nvmlDeviceGetUUID(self.handle) - elif USE_AMDSMI: - uuid = amdsmi.amdsmi_get_device_uuid(self.handle) - else: - raise Exception("No GPU interface available") - + uuid = pynvml.nvmlDeviceGetUUID(self.handle) return self._to_utf8(uuid) def _get_memory_info(self): """Returns memory info in bytes https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2dfeb1db82aa1de91aa6edf941c85ca8 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetMemoryInfo(self.handle) - elif USE_AMDSMI: - # returns memory in megabytes (amd-smi metric --mem-usage) - memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) - AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) - return AMDMemory( - total=memory_info["vram_total"] * 1024 * 1024, - used=memory_info["vram_used"] * 1024 * 1024, - free=(memory_info["vram_total"] - memory_info["vram_used"]) - * 1024 - * 1024, - ) - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetMemoryInfo(self.handle) def _get_temperature(self): """Returns degrees in the Celsius scale https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetTemperature( - self.handle, - sensor=pynvml.NVML_TEMPERATURE_GPU, - ) - elif USE_AMDSMI: - return amdsmi.amdsmi_dev_get_temp_metric( - self.handle, - sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, - metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, - ) - else: - raise 
Exception("No GPU interface available") + return pynvml.nvmlDeviceGetTemperature( + self.handle, sensor=pynvml.NVML_TEMPERATURE_GPU + ) def _get_power_usage(self): """Returns power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetPowerUsage(self.handle) - elif USE_AMDSMI: - # returns power in Watts (amd-smi metric --power) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] - * 1000 - ) - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetPowerUsage(self.handle) def _get_power_limit(self): """Returns max power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad """ - try: - if USE_PYNVML: - return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) - elif USE_AMDSMI: - # returns power limit in Watts (amd-smi static --limit) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 - ) - except Exception: - return None + return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu - elif USE_AMDSMI: - return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu def _get_compute_mode(self): """Returns the compute mode of the GPU https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gbed1b88f2e3ba39070d31d1db4340233 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetComputeMode(self.handle) - elif USE_AMDSMI: - return None - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetComputeMode(self.handle) def _get_compute_processes(self): """Returns the list of processes ids having a compute context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68 """ - try: - if USE_PYNVML: - processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) - return [ - {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes - ] - elif USE_AMDSMI: - processes_handles = amdsmi.amdsmi_get_process_list(self.handle) - processes_info = [ - amdsmi.amdsmi_get_process_info(self.handle, p) - for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} - for p in processes_info - ] - except Exception: - return [] + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) + return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] def _get_graphics_processes(self): """Returns the list of processes ids having a graphics context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e """ - try: - if USE_PYNVML: - processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) - return [ - {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes - ] - elif USE_AMDSMI: - processes_handles = 
amdsmi.amdsmi_get_process_list(self.handle) - processes_info = [ - amdsmi.amdsmi_get_process_info(self.handle, p) - for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} - for p in processes_info - if p["engine_usage"]["gfx"] > 0 - ] - except Exception: - return [] + processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) + return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] + + +class AMDGPUDevice(GPUDevice): + def _get_total_energy_consumption(self): + """Returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules""" + energy_count = amdsmi.amdsmi_dev_get_energy_count(self.handle) + energy = energy_count["power"] * energy_count["counter_resolution"] / 1000 + return energy + + def _get_gpu_name(self): + """Returns the name of the GPU device""" + name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] + return self._to_utf8(name) + + def _get_uuid(self): + """Returns the globally unique GPU device UUID""" + uuid = amdsmi.amdsmi_get_device_uuid(self.handle) + return self._to_utf8(uuid) + + def _get_memory_info(self): + """Returns memory info in bytes""" + memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) + AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) + return AMDMemory( + total=memory_info["vram_total"] * 1024 * 1024, + used=memory_info["vram_used"] * 1024 * 1024, + free=(memory_info["vram_total"] - memory_info["vram_used"]) * 1024 * 1024, + ) + + def _get_temperature(self): + """Returns degrees in the Celsius scale""" + return amdsmi.amdsmi_dev_get_temp_metric( + self.handle, + sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, + metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, + ) + + def _get_power_usage(self): + """Returns power usage in milliwatts""" + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] * 1000 + ) + + def _get_power_limit(self): + """Returns max power usage in milliwatts""" + return amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 + + def _get_gpu_utilization(self): + """Returns the % of utilization of the kernels during the last sample""" + return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] + + def _get_compute_mode(self): + """Returns the compute mode of the GPU""" + return None + + def _get_compute_processes(self): + """Returns the list of processes ids having a compute context on the device with the memory used""" + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_infos = [ + amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_mem"]} + for p in processes_infos + ] + + def _get_graphics_processes(self): + """Returns the list of processes ids having a graphics context on the device with the memory used""" + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_infos = [ + amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} + for p in processes_infos + if p["engine_usage"]["gfx"] > 0 + ] class AllGPUDevices: - devices = [] - device_count:int = 0 - + device_count: int + devices: List[GPUDevice] + def __init__(self): self.devices = [] - if is_gpu_details_available(): - if USE_PYNVML: - logger.debug("Nvidia GPU available. 
Starting setup") - pynvml.nvmlInit() - self.device_count = pynvml.nvmlDeviceGetCount() - for i in range(self.device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) - self.devices.append(gpu_device) - if USE_AMDSMI: - logger.debug("AMD GPU available. Starting setup") - amdsmi.amdsmi_init() - self.device_count = len(amdsmi.amdsmi_get_device_handles()) - for i in range(self.device_count): - handle = amdsmi.amdsmi_get_device_handles()[i] - gpu_device = GPUDevice(handle=handle, gpu_index=i) - self.devices.append(gpu_device) - else: - logger.error("No GPU interface available") - else: - logger.error("There is no GPU available") - self.device_count = len(self.devices) - + if is_nvidia_system() and PYNVML_AVAILABLE: + logger.debug("PyNVML available. Starting setup") + pynvml.nvmlInit() + nvidia_devices_count = pynvml.nvmlDeviceGetCount() + for i in range(nvidia_devices_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i) + self.devices.append(nvidia_gpu_device) + + if is_rocm_system() and AMDSMI_AVAILABLE: + logger.debug("AMDSMI available. Starting setup") + amdsmi.amdsmi_init() + amd_devices_handles = amdsmi.amdsmi_get_device_handles() + for i, handle in enumerate(amd_devices_handles): + amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) + self.devices.append(amd_gpu_device) + + self.device_count = len(self.devices) def get_gpu_static_info(self): """Get all GPUs static information. @@ -357,7 +362,7 @@ def get_gpu_details(self): try: devices_info = [] for i in range(self.device_count): - gpu_device: GPUDevice = self.devices[i] + gpu_device = self.devices[i] devices_info.append(gpu_device.get_gpu_details()) return devices_info @@ -380,26 +385,10 @@ def get_delta(self, last_duration: Time): try: devices_info = [] for i in range(self.device_count): - gpu_device: GPUDevice = self.devices[i] + gpu_device = self.devices[i] devices_info.append(gpu_device.delta(last_duration)) return devices_info except Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] - - -def is_gpu_details_available(): - """Returns True if the GPU details are available.""" - try: - if USE_PYNVML: - pynvml.nvmlInit() - return True - elif USE_AMDSMI: - amdsmi.amdsmi_init() - return True - else: - return False - - except Exception: - return False diff --git a/codecarbon/core/util.py b/codecarbon/core/util.py index ef1d7b81b..7bf66edb3 100644 --- a/codecarbon/core/util.py +++ b/codecarbon/core/util.py @@ -117,23 +117,3 @@ def count_cpus() -> int: num_cpus = num_cpus_matches[0].replace("NumCPUs=", "") logger.debug(f"Detected {num_cpus} cpus available on SLURM.") return int(num_cpus) - - -def is_amd_system(): - """Returns True if the system has an amd-smi interface.""" - try: - # Check if amd-smi is available - subprocess.check_output(["amd-smi", "--help"]) - return True - except subprocess.CalledProcessError: - return False - - -def is_nvidia_system(): - """Returns True if the system has an nvidia-smi interface.""" - try: - # Check if nvidia-smi is available - subprocess.check_output(["nvidia-smi", "--help"]) - return True - except Exception: - return False diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 249dd8e9c..cb8469e37 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -18,7 +18,7 @@ from codecarbon.core.config import get_hierarchical_config, parse_gpu_ids from codecarbon.core.emissions 
import Emissions from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import count_cpus, is_amd_system, is_nvidia_system, suppress +from codecarbon.core.util import count_cpus, suppress from codecarbon.external.geography import CloudMetadata, GeoMetadata from codecarbon.external.hardware import CPU, GPU, RAM from codecarbon.external.logger import logger, set_logger_format, set_logger_level @@ -279,11 +279,12 @@ def __init__( # Hardware detection logger.info("[setup] GPU Tracking...") - if gpu.is_gpu_details_available(): - if is_nvidia_system(): - logger.info("Tracking Nvidia GPU via pynvml") - elif is_amd_system(): - logger.info("Tracking AMD GPU via amdsmi") + if gpu.is_nvidia_system() or gpu.is_rocm_system(): + if gpu.is_nvidia_system(): + logger.info("Tracking Nvidia GPUs via PyNVML") + elif gpu.is_rocm_system(): + logger.info("Tracking AMD GPUs via AMDSMI") + gpu_devices = GPU.from_utils(self._gpu_ids) self._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()]
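
Reviewer note on PATCH 2/5 and PATCH 3/5 (energy unit): the AMD energy counter is reported in "Energy Status Units" ticks, and the series converts ticks to millijoules so that the AMD path returns the same unit as nvmlDeviceGetTotalEnergyConsumption. Below is a minimal worked example of that arithmetic, assuming the amdsmi_dev_get_energy_count() return shape used in PATCH 3/5 ("power" = accumulated ticks, "counter_resolution" = microjoules per tick, roughly 15.3 on current hardware); the sample reading is made up:

    # Illustrative only: mirrors the AMD branch of _get_total_energy_consumption.
    # Dict keys follow the shape assumed in PATCH 3/5, not an independently verified amdsmi API.
    reading = {"power": 1_000_000, "counter_resolution": 15.3}  # hypothetical counter read

    microjoules = reading["power"] * reading["counter_resolution"]  # ticks -> uJ
    millijoules = microjoules / 1000                                # uJ -> mJ (NVML's unit)
    kilowatt_hours = millijoules / 3.6e9                            # 1 kWh = 3.6e9 mJ

    print(millijoules)     # 15300.0 mJ
    print(kilowatt_hours)  # ~4.25e-06 kWh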
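
Reviewer note on PATCH 5/5 (resulting API): a minimal sketch of how the reworked AllGPUDevices container is exercised once this series is applied, on a host where pynvml and/or amdsmi import successfully. Only names defined in codecarbon/core/gpu.py after PATCH 5/5 are used; "name" is the only get_gpu_static_info() field relied on here, and the printing is illustrative:

    # Sketch against this branch; not part of the series itself.
    from codecarbon.core.gpu import AllGPUDevices

    gpus = AllGPUDevices()  # initializes NVML and/or AMD SMI, whichever is available
    print(f"tracking {gpus.device_count} GPU(s)")

    for static in gpus.get_gpu_static_info():
        print(static["name"])  # vendor-reported device/board name

    for details in gpus.get_gpu_details():
        print(details)  # per-device live metrics as returned by GPUDevice.get_gpu_details()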