diff --git a/checkpoint_engine/pin_memory.py b/checkpoint_engine/pin_memory.py
index 3caa934..09c6b03 100644
--- a/checkpoint_engine/pin_memory.py
+++ b/checkpoint_engine/pin_memory.py
@@ -209,7 +209,9 @@ def _pin(t: torch.Tensor):
         torch.cuda.set_device(device_index)
         cudart = torch.cuda.cudart()
         r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
-        assert r == 0, f"pin memory error, error code: {r}"
+        if r != 0:
+            error_msg = cudart.cudaGetErrorString(r)
+            raise RuntimeError(f"pin memory error, error code: {r}, error message: {error_msg}")
 
     # TODO: should only support /dev/shm? but we found files in disk also work?
     size = os.stat(file_path).st_size
@@ -254,6 +256,12 @@ def _pin(t: torch.Tensor):
     # Remove the file after successfully loading. This will avoid doubling the memory usage.
     # We assume files in /dev/shm/ are temporary files. So it's safe to remove them after loading.
     os.remove(file_path)
+    if not metas:
+        # TODO: should we still return this buffer?
+        assert buffer.nbytes == 0, f"buffer nbytes {buffer.nbytes} should be 0"
+        logger.warning(f"[rank{rank}] no metas found in {file_path}, skip pin memory")
+        return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=[], manually_pinned=False)
+
     _pin(buffer)
     logger.info(
         f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py
index 20f5be6..b5512c3 100644
--- a/checkpoint_engine/ps.py
+++ b/checkpoint_engine/ps.py
@@ -391,7 +391,11 @@ def _unpin(t: torch.Tensor):
         )
         cudart = torch.cuda.cudart()
         r = cudart.cudaHostUnregister(t.data_ptr())
-        assert r == 0, f"unpin memory error, error code: {r}"
+        if r != 0:
+            error_msg = cudart.cudaGetErrorString(r)
+            raise RuntimeError(
+                f"unpin memory error, error code: {r}, error message: {error_msg}"
+            )
 
     # if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
     try:
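
The diff's core pattern: instead of asserting that cudaHostRegister/cudaHostUnregister returned 0, check the return code and raise a RuntimeError that includes the cudaGetErrorString message, so failures report a human-readable reason rather than a bare numeric code. Below is a minimal standalone sketch of that pattern outside the repo's helpers; the function names pin_host_memory/unpin_host_memory are illustrative, not part of the patch, and a CUDA-capable PyTorch build is assumed.

import torch

def pin_host_memory(t: torch.Tensor) -> None:
    # Page-lock (pin) the host memory backing `t` via cudaHostRegister,
    # mirroring the error handling introduced in the diff above.
    cudart = torch.cuda.cudart()
    r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
    if r != 0:
        # cudaGetErrorString maps the numeric cudaError_t to a readable
        # message, which is far more actionable than the bare code.
        raise RuntimeError(
            f"pin memory error, error code: {r}, error message: {cudart.cudaGetErrorString(r)}"
        )

def unpin_host_memory(t: torch.Tensor) -> None:
    # Undo a manual cudaHostRegister before the buffer is freed.
    cudart = torch.cuda.cudart()
    r = cudart.cudaHostUnregister(t.data_ptr())
    if r != 0:
        raise RuntimeError(
            f"unpin memory error, error code: {r}, error message: {cudart.cudaGetErrorString(r)}"
        )

if __name__ == "__main__":
    buf = torch.empty(1 << 20, dtype=torch.uint8)  # 1 MiB CPU tensor
    pin_host_memory(buf)
    unpin_host_memory(buf)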