From 0eef0811a0a4f57ce2b0eff69b1d23598ff7eb24 Mon Sep 17 00:00:00 2001
From: Hubert Zhang
Date: Tue, 20 Jan 2026 03:37:16 +0800
Subject: [PATCH 1/2] misc: translate cuda error code to string when pin and unpin

---
 checkpoint_engine/pin_memory.py | 4 +++-
 checkpoint_engine/ps.py         | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/checkpoint_engine/pin_memory.py b/checkpoint_engine/pin_memory.py
index 3caa934..f31495d 100644
--- a/checkpoint_engine/pin_memory.py
+++ b/checkpoint_engine/pin_memory.py
@@ -209,7 +209,9 @@ def _pin(t: torch.Tensor):
         torch.cuda.set_device(device_index)
         cudart = torch.cuda.cudart()
         r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
-        assert r == 0, f"pin memory error, error code: {r}"
+        if r != 0:
+            error_msg = cudart.cudaGetErrorString(r)
+            raise RuntimeError(f"pin memory error, error code: {r}, error message: {error_msg}")
 
     # TODO: should only support /dev/shm? but we found files in disk also work?
     size = os.stat(file_path).st_size
diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py
index 20f5be6..b5512c3 100644
--- a/checkpoint_engine/ps.py
+++ b/checkpoint_engine/ps.py
@@ -391,7 +391,11 @@ def _unpin(t: torch.Tensor):
             )
             cudart = torch.cuda.cudart()
             r = cudart.cudaHostUnregister(t.data_ptr())
-            assert r == 0, f"unpin memory error, error code: {r}"
+            if r != 0:
+                error_msg = cudart.cudaGetErrorString(r)
+                raise RuntimeError(
+                    f"unpin memory error, error code: {r}, error message: {error_msg}"
+                )
 
         # if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
         try:

From b4bed4b0c3eb930a6ae62c56a99b9eb51223aaa6 Mon Sep 17 00:00:00 2001
From: Hubert Zhang
Date: Tue, 20 Jan 2026 03:43:19 +0800
Subject: [PATCH 2/2] bugfix: skip empty safetensors file when inplace pin memory

---
 checkpoint_engine/pin_memory.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/checkpoint_engine/pin_memory.py b/checkpoint_engine/pin_memory.py
index f31495d..09c6b03 100644
--- a/checkpoint_engine/pin_memory.py
+++ b/checkpoint_engine/pin_memory.py
@@ -256,6 +256,12 @@ def _pin(t: torch.Tensor):
     # Remove the file after successfully loading. This will avoid doubling the memory usage.
     # We assume files in /dev/shm/ are temporary files. So it's safe to remove them after loading.
     os.remove(file_path)
+    if not metas:
+        # TODO: should we still return this buffer?
+        assert buffer.nbytes == 0, f"buffer nbytes {buffer.nbytes} should be 0"
+        logger.warning(f"[rank{rank}] no metas found in {file_path}, skip pin memory")
+        return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=[], manually_pinned=False)
+
     _pin(buffer)
     logger.info(
         f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
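
Note (not part of the patches above): a minimal standalone sketch of the error-translation
pattern that PATCH 1/2 introduces. It assumes a CUDA-capable host and uses only the cudart
calls already present in the hunks above (cudaHostRegister, cudaHostUnregister,
cudaGetErrorString); the pin_tensor / unpin_tensor helpers and the double-registration
trigger are illustrative, not code from the repository.

    import torch

    def pin_tensor(t: torch.Tensor) -> None:
        # Page-lock the CPU tensor's memory, mirroring _pin() above.
        cudart = torch.cuda.cudart()
        r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
        if r != 0:
            # Translate the numeric cudaError_t into a readable message.
            raise RuntimeError(
                f"pin memory error, error code: {r}, error message: {cudart.cudaGetErrorString(r)}"
            )

    def unpin_tensor(t: torch.Tensor) -> None:
        # Undo the registration, mirroring _unpin() above.
        cudart = torch.cuda.cudart()
        r = cudart.cudaHostUnregister(t.data_ptr())
        if r != 0:
            raise RuntimeError(
                f"unpin memory error, error code: {r}, error message: {cudart.cudaGetErrorString(r)}"
            )

    if __name__ == "__main__":
        buf = torch.empty(1 << 20, dtype=torch.uint8)  # 1 MiB CPU buffer
        pin_tensor(buf)
        try:
            # Registering the same range twice should fail (typically with
            # cudaErrorHostMemoryAlreadyRegistered), which now surfaces as a
            # readable RuntimeError instead of a bare assert on the error code.
            pin_tensor(buf)
        except RuntimeError as e:
            print(e)
        finally:
            unpin_tensor(buf)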