Skip to content

Update estimator to latest changes #61

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ def _calc_memory_bytes_LPRec3d_tomobar(
angles_tot = non_slice_dims_shape[0]
DetectorsLengthH = non_slice_dims_shape[1]
SLICES = 200 # dummy multiplier+divisor to pass large batch size threshold
ACTUAL_SLICE_COUNT = 10

n = DetectorsLengthH

Expand All @@ -194,74 +195,152 @@ def _calc_memory_bytes_LPRec3d_tomobar(
sorted_theta_cpu = np.sort(angles)
theta_full_range = abs(sorted_theta_cpu[angles_tot-1] - sorted_theta_cpu[0])
angle_range_pi_count = 1 + int(np.ceil(theta_full_range / math.pi))
angle_range_pi_count += 1 # account for difference from actual algorithm
else:
angle_range_pi_count = 1 + int(np.ceil(2)) # assume a 2 * PI projection angle range

if "chunk_count" in kwargs:
chunk_count = kwargs["chunk_count"]
else:
chunk_count = 2

output_dims = __calc_output_dim_recon(non_slice_dims_shape, **kwargs)
if odd_horiz:
output_dims = tuple(x + 1 for x in output_dims)

# Debug helper: print an estimated allocation size in KB or MB.
#   ln        - source line number in the real algorithm this estimate mirrors
#               (-1 when there is no corresponding line)
#   name      - label for the allocation being reported
#   size      - size in bytes (per single slice when per_slice is True)
#   per_slice - if True, scale by ACTUAL_SLICE_COUNT (enclosing-scope constant)
#               to show the total across the assumed slice batch
# NOTE(review): reads ACTUAL_SLICE_COUNT from the enclosing function's scope.
def debug_print(ln: int, name: str, size: int, per_slice: bool):
    slice_multiplier = ACTUAL_SLICE_COUNT if per_slice else 1
    size_kb = size / 1024
    size_mb = size_kb / 1024
    # Report in KB for sub-megabyte sizes, otherwise in MB.
    if size_mb < 1:
        print(f"{ln} {name} {size_kb * slice_multiplier} KB")
    else:
        print(f"{ln} {name} {size_mb * slice_multiplier} MB")

in_slice_size = np.prod(non_slice_dims_shape) * dtype.itemsize
padded_in_slice_size = np.prod(non_slice_dims_shape) * np.float32().itemsize
debug_print(-1, "in_slice_size", in_slice_size, True)
padded_in_slice_size = angles_tot * n * np.float32().itemsize
debug_print(233, "padded_in_slice_size", padded_in_slice_size, True)

theta_size = angles_tot * np.float32().itemsize
sorted_theta_indices_size = angles_tot * np.int64().itemsize
sorted_theta_size = angles_tot * np.float32().itemsize
recon_output_size = (n + 1) * (n + 1) * np.float32().itemsize if odd_horiz else n * n * np.float32().itemsize # 264
linspace_size = n * np.float32().itemsize
meshgrid_size = 2 * n * n * np.float32().itemsize
phi_size = 6 * n * n * np.float32().itemsize
angle_range_size = center_size * center_size * 1 + angle_range_pi_count * 2 * np.int32().itemsize
c1dfftshift_size = n * np.int8().itemsize
c2dfftshift_slice_size = 4 * n * n * np.int8().itemsize
debug_print(240, "theta", theta_size, False)
filter_size = (n // 2 + 1) * np.float32().itemsize
debug_print(259, "filter_size", filter_size, False)
rfftfreq_size = filter_size
debug_print(262, "rfftfreq_size", rfftfreq_size, False)
scaled_filter_size = filter_size
tmp_p_input_slice = np.prod(non_slice_dims_shape) * np.float32().itemsize
padded_tmp_p_input_slice = angles_tot * (n + padding_m * 2) * dtype.itemsize
rfft_result_size = padded_tmp_p_input_slice
filtered_rfft_result_size = rfft_result_size
rfft_plan_slice_size = cufft_estimate_1d(nx=(n + padding_m * 2),fft_type=CufftType.CUFFT_R2C,batch=angles_tot * SLICES) / SLICES
irfft_result_size = filtered_rfft_result_size
irfft_scratch_memory_size = filtered_rfft_result_size
irfft_plan_slice_size = cufft_estimate_1d(nx=(n + padding_m * 2),fft_type=CufftType.CUFFT_C2R,batch=angles_tot * SLICES) / SLICES
conversion_to_complex_size = np.prod(non_slice_dims_shape) * np.complex64().itemsize / 2
datac_size = np.prod(non_slice_dims_shape) * np.complex64().itemsize / 2
debug_print(263, "scaled_filter_size", scaled_filter_size, False)

tmp_p_input_slice = angles_tot * n * np.float32().itemsize
debug_print(266, "tmp_p_input", tmp_p_input_slice, True)

padded_tmp_p_input_slice = angles_tot * (n + padding_m * 2) * dtype.itemsize / chunk_count
debug_print(273, "padded_tmp_p_input_slice", padded_tmp_p_input_slice, True)
rfft_plan_slice_size = cufft_estimate_1d(nx=(n + padding_m * 2),fft_type=CufftType.CUFFT_R2C,batch=angles_tot * SLICES) / SLICES / chunk_count
debug_print(279, "rfft_plan", rfft_plan_slice_size, True)
rfft_result_size = padded_tmp_p_input_slice / chunk_count
debug_print(279, "rfft_result", rfft_result_size, True)
filtered_rfft_result_size = rfft_result_size / chunk_count
debug_print(279, "filtered_rfft_result", filtered_rfft_result_size, True)
irfft_scratch_memory_size = filtered_rfft_result_size / chunk_count
debug_print(280, "irfft_scratch", irfft_scratch_memory_size, True)

datac_size = angles_tot * n * np.complex64().itemsize / 2
debug_print(290, "datac_size", datac_size, True)
fde_size = (2 * m + 2 * n) * (2 * m + 2 * n) * np.complex64().itemsize / 2
shifted_datac_size = datac_size
fft_result_size = datac_size
backshifted_datac_size = datac_size
scaled_backshifted_datac_size = datac_size
debug_print(293, "fde_size", fde_size, True)
fft_plan_slice_size = cufft_estimate_1d(nx=n,fft_type=CufftType.CUFFT_C2C,batch=angles_tot * SLICES) / SLICES
fde_view_size = 4 * n * n * np.complex64().itemsize / 2
shifted_fde_view_size = fde_view_size
ifft2_scratch_memory_size = fde_view_size
ifft2_plan_slice_size = cufft_estimate_2d(nx=(2 * n),ny=(2 * n),fft_type=CufftType.CUFFT_C2C) / 2
fde2_size = n * n * np.complex64().itemsize / 2
concatenate_size = fde2_size
circular_mask_size = np.prod(output_dims) / 2 * np.int64().itemsize * 4
debug_print(309, "fft_plan", fft_plan_slice_size, True)

sorted_theta_indices_size = angles_tot * np.int64().itemsize
debug_print(345, "sorted_indices", sorted_theta_indices_size, False)
sorted_theta_size = angles_tot * np.float32().itemsize
debug_print(346, "sorted_theta", sorted_theta_size, False)
angle_range_size = center_size * center_size * (1 + angle_range_pi_count * 2) * np.int16().itemsize
debug_print(351, "angle_range", angle_range_size, False)

recon_output_size = n * n * np.float32().itemsize
debug_print(434, "recon_up", recon_output_size, True)
ifft2_plan_slice_size = cufft_estimate_2d(nx=(2 * m + 2 * n),ny=(2 * m + 2 * n),fft_type=CufftType.CUFFT_C2C) / 2
debug_print(449, "ifft2_plan", ifft2_plan_slice_size, True)
circular_mask_size = np.prod(output_dims) / 2 * np.int64().itemsize * 4
debug_print(485, "circular_mask", circular_mask_size, False)
after_recon_swapaxis_slice = np.prod(non_slice_dims_shape) * np.float32().itemsize

tot_memory_bytes = int(
max(
in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + tmp_p_input_slice + padded_tmp_p_input_slice + rfft_result_size + filtered_rfft_result_size + irfft_result_size + irfft_scratch_memory_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + tmp_p_input_slice + datac_size + conversion_to_complex_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + fft_plan_slice_size + fde_size + datac_size + shifted_datac_size + fft_result_size + backshifted_datac_size + scaled_backshifted_datac_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + fft_plan_slice_size + ifft2_plan_slice_size + shifted_fde_view_size + ifft2_scratch_memory_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + fft_plan_slice_size + ifft2_plan_slice_size + fde2_size + concatenate_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + fft_plan_slice_size + ifft2_plan_slice_size + after_recon_swapaxis_slice
)
)

fixed_amount = int(
max(
theta_size + phi_size + linspace_size + meshgrid_size
, theta_size + sorted_theta_indices_size + sorted_theta_size + phi_size + angle_range_size + c1dfftshift_size + c2dfftshift_slice_size + filter_size + rfftfreq_size + scaled_filter_size
, theta_size + sorted_theta_indices_size + sorted_theta_size + phi_size + circular_mask_size
)
)

return (tot_memory_bytes * 1.1, fixed_amount)
debug_print(-1, "swapaxis", after_recon_swapaxis_slice, True)


tot_memory_bytes = 0
current_tot_memory_bytes = 0

fixed_amount = 0
current_fixed_amount = 0

# Running-peak accumulator: add (or, with a negative amount, release) an
# allocation and track the high-water mark of the per-slice and fixed
# memory pools separately.
#   ln        - source line number in the real algorithm this step mirrors
#   amount    - bytes to add (positive = allocate, negative = free)
#   per_slice - True updates the per-slice pool (tot_memory_bytes),
#               False updates the slice-independent pool (fixed_amount)
# NOTE(review): mutates the enclosing function's accumulators via `nonlocal`
# and calls the sibling debug_print helper; relies on ACTUAL_SLICE_COUNT
# from the enclosing scope for the printed totals.
def bump_memory(ln, amount, per_slice: bool):
    nonlocal tot_memory_bytes
    nonlocal current_tot_memory_bytes
    nonlocal fixed_amount
    nonlocal current_fixed_amount

    if per_slice:
        current_tot_memory_bytes += amount
        # Peak is the max of the running total, never decreased by frees.
        tot_memory_bytes = max(tot_memory_bytes, current_tot_memory_bytes)
    else:
        current_fixed_amount += amount
        fixed_amount = max(fixed_amount, current_fixed_amount)

    debug_print(ln, "tot_memory_bytes", tot_memory_bytes, True)
    debug_print(ln, "current_tot_memory_bytes", current_tot_memory_bytes, True)
    debug_print(ln, "fixed_amount", fixed_amount, False)
    debug_print(ln, "current_fixed_amount", current_fixed_amount, False)
    debug_print(ln, "peak", tot_memory_bytes * ACTUAL_SLICE_COUNT + fixed_amount, False)
    debug_print(ln, "current_mem", current_tot_memory_bytes * ACTUAL_SLICE_COUNT + current_fixed_amount, False)
    print("****************")


# bump_memory(-1, in_slice_size, True)
bump_memory(233, padded_in_slice_size, True)

bump_memory(240, theta_size, False)
bump_memory(345, sorted_theta_indices_size, False)
bump_memory(346, sorted_theta_size, False)
bump_memory(351, angle_range_size, False)
bump_memory(259, filter_size, False)
bump_memory(262, rfftfreq_size, False)
bump_memory(263, scaled_filter_size, False)

bump_memory(266, tmp_p_input_slice, True)
bump_memory(273, padded_tmp_p_input_slice, True)
bump_memory(273, padded_tmp_p_input_slice, True)
bump_memory(279, rfft_plan_slice_size, True)
bump_memory(279, rfft_result_size, True)
bump_memory(279, filtered_rfft_result_size, True)
bump_memory(279, -rfft_result_size, True)
bump_memory(279, -filtered_rfft_result_size, True)
bump_memory(280, irfft_scratch_memory_size, True)
bump_memory(283, -padded_tmp_p_input_slice, True)
bump_memory(286, -padded_in_slice_size, True)

bump_memory(286, -filter_size, False)
bump_memory(286, -rfftfreq_size, False)
bump_memory(286, -scaled_filter_size, False)

bump_memory(290, datac_size, True)
bump_memory(293, fde_size, True)
bump_memory(307, -tmp_p_input_slice, True)
bump_memory(309, fft_plan_slice_size, True)
bump_memory(309, datac_size, True)

bump_memory(424, -datac_size, True)
bump_memory(434, recon_output_size, True)
bump_memory(449, ifft2_plan_slice_size, True)
bump_memory(483, -fde_size, True)

bump_memory(485, circular_mask_size, False)
bump_memory(-1, after_recon_swapaxis_slice, True)

print(f"tot_memory_bytes: {tot_memory_bytes / 1024 / 1024} MB, fixed_amount: {fixed_amount / 1024 / 1024} MB")
return (tot_memory_bytes*1.25, fixed_amount)
# return (tot_memory_bytes * 1.25, fixed_amount)



Expand Down
41 changes: 30 additions & 11 deletions tests/test_httomolibgpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,16 @@ def test_recon_FBP3d_tomobar_memoryhook(


@pytest.mark.cupy
# @pytest.mark.parametrize("projections", [2560])
# @pytest.mark.parametrize("detX_size", [2560])
# @pytest.mark.parametrize("slices", [10])
# @pytest.mark.parametrize("projection_angle_range", [(0, np.pi)])

# @pytest.mark.parametrize("projections", [1801])
# @pytest.mark.parametrize("detX_size", [2560])
# @pytest.mark.parametrize("slices", [3])
# @pytest.mark.parametrize("projection_angle_range", [(0, np.pi)])

@pytest.mark.parametrize("projections", [1500, 1801, 2560])
@pytest.mark.parametrize("detX_size", [2560])
@pytest.mark.parametrize("slices", [3, 4, 5, 10])
Expand All @@ -585,7 +595,7 @@ def test_recon_LPRec3d_tomobar_0_pi_memoryhook(slices, detX_size, projections, p
@pytest.mark.full
@pytest.mark.cupy
@pytest.mark.parametrize("projections", [1500, 1801, 2560, 3601])
@pytest.mark.parametrize("detX_size", [2560, 4593])
@pytest.mark.parametrize("detX_size", [2560])
@pytest.mark.parametrize("slices", [3, 4, 5, 10, 15, 20])
@pytest.mark.parametrize("projection_angle_range", [(0, np.pi)])
def test_recon_LPRec3d_tomobar_0_pi_memoryhook_full(slices, detX_size, projections, projection_angle_range, ensure_clean_memory):
Expand All @@ -594,7 +604,7 @@ def test_recon_LPRec3d_tomobar_0_pi_memoryhook_full(slices, detX_size, projectio
@pytest.mark.full
@pytest.mark.cupy
@pytest.mark.parametrize("projections", [1500, 1801, 2560, 3601])
@pytest.mark.parametrize("detX_size", [2560, 4593])
@pytest.mark.parametrize("detX_size", [2560])
@pytest.mark.parametrize("slices", [3, 4, 5, 10, 15, 20])
@pytest.mark.parametrize("projection_angle_range", [(0, np.pi), (0, 2 * np.pi), (-np.pi / 2, np.pi / 2)])
def test_recon_LPRec3d_tomobar_memoryhook_full(slices, detX_size, projections, projection_angle_range, ensure_clean_memory):
Expand All @@ -609,9 +619,13 @@ def __test_recon_LPRec3d_tomobar_memoryhook_common(slices, detX_size, projection
kwargs["recon_size"] = detX_size
kwargs["recon_mask_radius"] = 0.8


hook = MaxMemoryHook()
with hook:
hook2 = PeakMemoryLineProfileHook(running_peak_root_file_names=["methodsDIR_CuPy.py"])
with hook, hook2:
recon_data = LPRec3d_tomobar(cp.copy(data), **kwargs)
hook2.print_report()
# hook.print_report()

# make sure estimator function is within range (80% min, 100% max)
max_mem = (
Expand All @@ -626,27 +640,32 @@ def __test_recon_LPRec3d_tomobar_memoryhook_common(slices, detX_size, projection
non_slice_dims_shape, dtype=input_data_type, **kwargs
)

even_slice_count = True
padded_slices = slices
if (slices % 2) != 0:
even_slice_count = False
padded_slices += 1
odd_horiz = bool(detX_size % 2)
odd_vert = bool(slices % 2)

slices += odd_vert

if even_slice_count:
if not odd_horiz and not odd_vert:
input_slice_size = np.prod(non_slice_dims_shape) * input_data_type.itemsize
estimated_memory_bytes -= input_slice_size

estimated_memory_mb = round(padded_slices * estimated_memory_bytes / (1024**2), 2)
print(f"slice: {slices}")
estimated_memory_mb = round(slices * estimated_memory_bytes / (1024**2), 2)
max_mem_mb = round(max_mem / (1024**2), 2)
print(f"max_mem_mb before: {max_mem_mb}")
max_mem -= subtract_bytes
max_mem_mb = round(max_mem / (1024**2), 2)

# now we compare both memory estimations
print(f"estimated_memory_mb: {estimated_memory_mb}")
print(f"max_mem_mb: {max_mem_mb}")
difference_mb = abs(estimated_memory_mb - max_mem_mb)
print(f"difference_mb: {difference_mb}")
percents_relative_maxmem = round((difference_mb / max_mem_mb) * 100)
# the estimated_memory_mb should be LARGER or EQUAL to max_mem_mb
# the resulting percent value should not deviate from max_mem on more than 20%
assert estimated_memory_mb >= max_mem_mb
assert percents_relative_maxmem <= 35
assert percents_relative_maxmem <= 40


@pytest.mark.cupy
Expand Down
Loading