From b1569d09a9d4af189bceeff87723b4bb382b48dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ferenc=20T=C3=BCk=C3=B6r?= <ferenc.tukor@radwaytech.com>
Date: Tue, 27 Jan 2026 11:07:38 +0100
Subject: [PATCH 1/4] Add flag to LPRec to turn on/off memory estimator

---
 httomolibgpu/recon/algorithm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index 8dd91339..7e2c545e 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -204,6 +204,7 @@ def LPRec3d_tomobar(
     power_of_2_cropping: Optional[bool] = False,
     min_mem_usage_filter: Optional[bool] = True,
     min_mem_usage_ifft2: Optional[bool] = True,
+    calc_peak_gpu_mem: bool = False,
 ) -> cp.ndarray:
     """
     Fourier direct inversion in 3D on unequally spaced (also called as Log-Polar) grids using
@@ -232,6 +233,8 @@ def LPRec3d_tomobar(
         The radius of the circular mask that applies to the reconstructed slice in order to crop
         out some undesirable artifacts. The values outside the given diameter will be set to zero.
         To implement the cropping one can use the range [0.7-1.0] or set to 2.0 when no cropping required.
+    calc_peak_gpu_mem: bool
+        Parameter to support memory estimation in HTTomo. Irrelevant to the method itself and can be ignored by user.
 
     Returns
     -------
@@ -253,6 +256,7 @@ def LPRec3d_tomobar(
         power_of_2_cropping=power_of_2_cropping,
         min_mem_usage_filter=min_mem_usage_filter,
         min_mem_usage_ifft2=min_mem_usage_ifft2,
+        calc_peak_gpu_mem=calc_peak_gpu_mem,
     )
     cp._default_memory_pool.free_all_blocks()
     return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C")

From 9a23b438b107d945208ac855424b294659664154 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ferenc=20T=C3=BCk=C3=B6r?= <ferenc.tukor@radwaytech.com>
Date: Wed, 28 Jan 2026 12:21:05 +0100
Subject: [PATCH 2/4] Return the estimated memory in estimator mode

---
 httomolibgpu/recon/algorithm.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index 7e2c545e..541c886a 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -246,7 +246,7 @@ def LPRec3d_tomobar(
         data, angles, center, detector_pad, recon_size, 0
     )
 
-    reconstruction = RecToolsCP.FOURIER_INV(
+    result = RecToolsCP.FOURIER_INV(
         data,
         recon_mask_radius=recon_mask_radius,
         data_axes_labels_order=input_data_axis_labels,
@@ -259,7 +259,11 @@ def LPRec3d_tomobar(
         calc_peak_gpu_mem=calc_peak_gpu_mem,
     )
     cp._default_memory_pool.free_all_blocks()
-    return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C")
+
+    if calc_peak_gpu_mem:
+        return result
+
+    return cp.require(cp.swapaxes(result, 0, 1), requirements="C")
 
 
 ## %%%%%%%%%%%%%%%%%%%%%%% SIRT reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##

From 4b071ad3e592b69dcc0f8156aa04ceeb67924551 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ferenc=20T=C3=BCk=C3=B6r?= <ferenc.tukor@radwaytech.com>
Date: Tue, 3 Feb 2026 13:38:36 +0100
Subject: [PATCH 3/4] Use DeviceMemStack from ToMoBAR

---
 httomolibgpu/memory_estimator_helpers.py | 24 --------------
 httomolibgpu/prep/phase.py               |  6 ++--
 httomolibgpu/prep/stripe.py              | 41 ++++++------------------
 httomolibgpu/recon/algorithm.py          |  5 ++-
 4 files changed, 14 insertions(+), 62 deletions(-)
 delete mode 100644 httomolibgpu/memory_estimator_helpers.py

diff --git a/httomolibgpu/memory_estimator_helpers.py b/httomolibgpu/memory_estimator_helpers.py
deleted file mode 100644
index 4c63a188..00000000
--- a/httomolibgpu/memory_estimator_helpers.py
+++ /dev/null
@@ -1,24 +0,0 @@
-ALLOCATION_UNIT_SIZE = 512
-
-
-class _DeviceMemStack:
-    def __init__(self) -> None:
-        self.allocations = []
-        self.current = 0
-        self.highwater = 0
-
-    def malloc(self, bytes):
-        self.allocations.append(bytes)
-        allocated = self._round_up(bytes)
-        self.current += allocated
-        self.highwater = max(self.current, self.highwater)
-
-    def free(self, bytes):
-        assert bytes in self.allocations
-        self.allocations.remove(bytes)
-        self.current -= self._round_up(bytes)
-        assert self.current >= 0
-
-    def _round_up(self, size):
-        size = (size + ALLOCATION_UNIT_SIZE - 1) // ALLOCATION_UNIT_SIZE
-        return size * ALLOCATION_UNIT_SIZE
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
index 41f8468a..9dc01335 100644
--- a/httomolibgpu/prep/phase.py
+++ b/httomolibgpu/prep/phase.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 from httomolibgpu import cupywrapper
-from httomolibgpu.memory_estimator_helpers import _DeviceMemStack
+from tomobar.supp.memory_estimator_helpers import DeviceMemStack
 
 cp = cupywrapper.cp
 cupy_run = cupywrapper.cupy_run
@@ -91,7 +91,7 @@ def paganin_filter(
     cp.ndarray
         The 3D array of Paganin phase-filtered projection images.
     """
-    mem_stack = _DeviceMemStack() if calc_peak_gpu_mem else None
+    mem_stack = DeviceMemStack() if calc_peak_gpu_mem else None
     # Check the input data is valid
     if not mem_stack and tomo.ndim != 3:
         raise ValueError(
@@ -301,7 +301,7 @@ def _pad_projections(
         "next_power_of_2", "next_fast_length", "use_pad_x_y"
     ],
     pad_x_y: Optional[list],
-    mem_stack: Optional[_DeviceMemStack],
+    mem_stack: Optional[DeviceMemStack],
 ) -> Tuple[cp.ndarray, Tuple[int, int]]:
     """
     Performs padding of each projection to a size optimal for FFT.
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
index c46c33d9..d3134c3e 100644
--- a/httomolibgpu/prep/stripe.py
+++ b/httomolibgpu/prep/stripe.py
@@ -30,6 +30,7 @@
 from unittest.mock import Mock
 
 if cupy_run:
+    from tomobar.supp.memory_estimator_helpers import DeviceMemStack
     from cupyx.scipy.ndimage import median_filter, binary_dilation, uniform_filter1d
     from cupyx.scipy.fft import fft2, ifft2, fftshift
     from cupyx.scipy.fftpack import get_fft_plan
@@ -204,32 +205,8 @@ def _reflect(x: np.ndarray, minx: float, maxx: float) -> np.ndarray:
     return np.array(out, dtype=x.dtype)
 
 
-class _DeviceMemStack:
-    def __init__(self) -> None:
-        self.allocations = []
-        self.current = 0
-        self.highwater = 0
-
-    def malloc(self, bytes):
-        self.allocations.append(bytes)
-        allocated = self._round_up(bytes)
-        self.current += allocated
-        self.highwater = max(self.current, self.highwater)
-
-    def free(self, bytes):
-        assert bytes in self.allocations
-        self.allocations.remove(bytes)
-        self.current -= self._round_up(bytes)
-        assert self.current >= 0
-
-    def _round_up(self, size):
-        ALLOCATION_UNIT_SIZE = 512
-        size = (size + ALLOCATION_UNIT_SIZE - 1) // ALLOCATION_UNIT_SIZE
-        return size * ALLOCATION_UNIT_SIZE
-
-
 def _mypad(
-    x: cp.ndarray, pad: Tuple[int, int, int, int], mem_stack: Optional[_DeviceMemStack]
+    x: cp.ndarray, pad: Tuple[int, int, int, int], mem_stack: Optional[DeviceMemStack]
 ) -> cp.ndarray:
     """Function to do numpy like padding on Arrays. Only works for 2-D
     padding.
@@ -272,7 +249,7 @@ def _conv2d(
     w: np.ndarray,
     stride: Tuple[int, int],
     groups: int,
-    mem_stack: Optional[_DeviceMemStack],
+    mem_stack: Optional[DeviceMemStack],
 ) -> cp.ndarray:
     """Convolution (equivalent pytorch.conv2d)"""
     b, ci, hi, wi = x.shape if not mem_stack else x
@@ -355,7 +332,7 @@ def _conv_transpose2d(
     stride: Tuple[int, int],
     pad: Tuple[int, int],
     groups: int,
-    mem_stack: Optional[_DeviceMemStack],
+    mem_stack: Optional[DeviceMemStack],
 ) -> cp.ndarray:
     """Transposed convolution (equivalent pytorch.conv_transpose2d)"""
     b, co, ho, wo = x.shape if not mem_stack else x
@@ -428,7 +405,7 @@ def _afb1d(
     h0: np.ndarray,
     h1: np.ndarray,
     dim: int,
-    mem_stack: Optional[_DeviceMemStack],
+    mem_stack: Optional[DeviceMemStack],
 ) -> cp.ndarray:
     """1D analysis filter bank (along one dimension only) of an image
 
@@ -476,7 +453,7 @@ def _sfb1d(
     g0: np.ndarray,
     g1: np.ndarray,
     dim: int,
-    mem_stack: Optional[_DeviceMemStack],
+    mem_stack: Optional[DeviceMemStack],
 ) -> cp.ndarray:
     """1D synthesis filter bank of an image Array"""
 
@@ -520,7 +497,7 @@ def __init__(self, wave: str):
         self.h1_row = np.array(h1_row).astype("float32")[::-1].reshape((1, 1, 1, -1))
 
     def apply(
-        self, x: cp.ndarray, mem_stack: Optional[_DeviceMemStack] = None
+        self, x: cp.ndarray, mem_stack: Optional[DeviceMemStack] = None
     ) -> Tuple[cp.ndarray, cp.ndarray]:
         """Forward pass of the DWT.
 
@@ -582,7 +559,7 @@ def __init__(self, wave: str):
     def apply(
         self,
         coeffs: Tuple[cp.ndarray, cp.ndarray],
-        mem_stack: Optional[_DeviceMemStack] = None,
+        mem_stack: Optional[DeviceMemStack] = None,
     ) -> cp.ndarray:
         """
         Args:
@@ -672,7 +649,7 @@ def remove_stripe_fw(
     sli_shape = [nz, 1, nproj_pad, ni]
 
     if calc_peak_gpu_mem:
-        mem_stack = _DeviceMemStack()
+        mem_stack = DeviceMemStack()
         # A data copy is assumed when invoking the function
         mem_stack.malloc(np.prod(data) * np.float32().itemsize)
         mem_stack.malloc(np.prod(sli_shape) * np.float32().itemsize)
diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index 541c886a..4050182f 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -29,6 +29,7 @@
 from unittest.mock import Mock
 
 if cupy_run:
+    from tomobar.supp.memory_estimator_helpers import DeviceMemStack
     from tomobar.methodsDIR import RecToolsDIR
     from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
     from tomobar.methodsIR_CuPy import RecToolsIRCuPy
@@ -204,7 +205,6 @@ def LPRec3d_tomobar(
     power_of_2_cropping: Optional[bool] = False,
     min_mem_usage_filter: Optional[bool] = True,
     min_mem_usage_ifft2: Optional[bool] = True,
-    calc_peak_gpu_mem: bool = False,
 ) -> cp.ndarray:
     """
     Fourier direct inversion in 3D on unequally spaced (also called as Log-Polar) grids using
@@ -256,11 +256,10 @@ def LPRec3d_tomobar(
         power_of_2_cropping=power_of_2_cropping,
         min_mem_usage_filter=min_mem_usage_filter,
         min_mem_usage_ifft2=min_mem_usage_ifft2,
-        calc_peak_gpu_mem=calc_peak_gpu_mem,
     )
     cp._default_memory_pool.free_all_blocks()
 
-    if calc_peak_gpu_mem:
+    if DeviceMemStack.instance():
         return result
 
     return cp.require(cp.swapaxes(result, 0, 1), requirements="C")

From 49c1ba8d49ace8c54e1f41234ee6d793486ef727 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ferenc=20T=C3=BCk=C3=B6r?= <ferenc.tukor@radwaytech.com>
Date: Wed, 4 Feb 2026 17:21:17 +0100
Subject: [PATCH 4/4] Update arguments for memory estimator

---
 httomolibgpu/recon/algorithm.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index 4050182f..10b6cc55 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -39,7 +39,7 @@
     RecToolsIRCuPy = Mock()
 
 from numpy import float32
-from typing import Optional, Type, Union
+from typing import Optional, Tuple, Type, Union
 
 
 __all__ = [
@@ -193,7 +193,7 @@ def FBP3d_tomobar(
 
 ## %%%%%%%%%%%%%%%%%%%%%%% LPRec  %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
 def LPRec3d_tomobar(
-    data: cp.ndarray,
+    data: cp.ndarray | Tuple[int, int, int],
     angles: np.ndarray,
     center: Optional[float] = None,
     detector_pad: Union[bool, int] = False,
@@ -205,6 +205,7 @@ def LPRec3d_tomobar(
     power_of_2_cropping: Optional[bool] = False,
     min_mem_usage_filter: Optional[bool] = True,
     min_mem_usage_ifft2: Optional[bool] = True,
+    **kwargs,
 ) -> cp.ndarray:
     """
     Fourier direct inversion in 3D on unequally spaced (also called as Log-Polar) grids using
@@ -256,6 +257,7 @@ def LPRec3d_tomobar(
         power_of_2_cropping=power_of_2_cropping,
         min_mem_usage_filter=min_mem_usage_filter,
         min_mem_usage_ifft2=min_mem_usage_ifft2,
+        **kwargs,
     )
     cp._default_memory_pool.free_all_blocks()
 
@@ -498,7 +500,7 @@ def FISTA3d_tomobar(
 
 ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
 def _instantiate_direct_recon_class(
-    data: cp.ndarray,
+    data: cp.ndarray | Tuple[int, int, int],
     angles: np.ndarray,
     center: Optional[float] = None,
     detector_pad: Union[bool, int] = False,
@@ -518,19 +520,22 @@ def _instantiate_direct_recon_class(
     Returns:
         Type[RecToolsDIRCuPy]: an instance of the direct recon class
     """
+
+    data_shape = data if isinstance(data, tuple) else data.shape
+
     if center is None:
-        center = data.shape[2] // 2  # making a crude guess
+        center = data_shape[2] // 2  # making a crude guess
     if recon_size is None:
-        recon_size = data.shape[2]
+        recon_size = data_shape[2]
     if detector_pad is True:
-        detector_pad = __estimate_detectorHoriz_padding(data.shape[2])
+        detector_pad = __estimate_detectorHoriz_padding(data_shape[2])
     elif detector_pad is False:
         detector_pad = 0
     RecToolsCP = RecToolsDIRCuPy(
-        DetectorsDimH=data.shape[2],  # Horizontal detector dimension
+        DetectorsDimH=data_shape[2],  # Horizontal detector dimension
         DetectorsDimH_pad=detector_pad,  # padding for horizontal detector
-        DetectorsDimV=data.shape[1],  # Vertical detector dimension (3D case)
-        CenterRotOffset=data.shape[2] / 2
+        DetectorsDimV=data_shape[1],  # Vertical detector dimension (3D case)
+        CenterRotOffset=data_shape[2] / 2
         - center
         - 0.5,  # Center of Rotation scalar or a vector
         AnglesVec=-angles,  # A vector of projection angles in radians