diff --git a/experiments/README.md b/experiments/README.md
index 8c8fd6d..981f2b4 100644
--- a/experiments/README.md
+++ b/experiments/README.md
@@ -55,9 +55,9 @@ For GPU,
 - $ pip install https://download.pytorch.org/whl/nightly/cu121/torch-2.2.0.dev20231117%2Bcu121-cp310-cp310-linux_x86_64.whl
 - $ pip install https://download.pytorch.org/whl/nightly/cu121/torchvision-0.17.0.dev20231117%2Bcu121-cp310-cp310-linux_x86_64.whl
 For CPU,
-- $ pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240509%2Bcpu-cp310-cp310-linux_x86_64.whl
-- $ pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240509%2Bcpu-cp310-cp310-linux_x86_64.whl
-- $ pip install triton
+- $ pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240530%2Bcpu-cp310-cp310-linux_x86_64.whl
+- $ pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240530%2Bcpu-cp310-cp310-linux_x86_64.whl
+- Install triton by following the quick-installation instructions at https://github.com/triton-lang/triton?tab=readme-ov-file#quick-installation
 
 $ git clone https://github.com/cpuhrsch/segment-anything.git
 $ cd segment-anything
diff --git a/experiments/eval_combo.py b/experiments/eval_combo.py
index d0ca231..0b32d8f 100644
--- a/experiments/eval_combo.py
+++ b/experiments/eval_combo.py
@@ -6,6 +6,7 @@
 import math
 import segment_anything_fast
 import time
+import resource
 
 torch._dynamo.config.cache_size_limit = 50000
 
@@ -257,7 +258,10 @@ def profile_top_runner(fn, *args, **kwargs):
                         torch.profiler.ProfilerActivity.CUDA],
             record_shapes=True) as prof:
         result = fn(*args, **kwargs)
-    print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+    if torch.cuda.is_available():
+        print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+    else:
+        print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))
     return result
 
 
@@ -444,15 +448,22 @@ def run(
         batch_ms_batch_size = (avg_ms_per_img * num_images) / num_batches / batch_size
 
     mIoU = calculate_miou(results, mask_debug_out_dir, True, cat_id_to_cat)
-    max_memory_allocated_bytes = torch.cuda.max_memory_allocated()
-    _, total_memory = torch.cuda.mem_get_info()
-    max_memory_allocated_percentage = int(100 * (max_memory_allocated_bytes / total_memory))
-    max_memory_allocated_bytes = max_memory_allocated_bytes >> 20
+    if torch.cuda.is_available():
+        max_memory_allocated_bytes = torch.cuda.max_memory_allocated()
+        _, total_memory = torch.cuda.mem_get_info()
+        max_memory_allocated_percentage = int(100 * (max_memory_allocated_bytes / total_memory))
+        max_memory_allocated_bytes = max_memory_allocated_bytes >> 20
+    else:
+        import psutil
+        total_memory = psutil.virtual_memory().total
+        max_memory_allocated_bytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+        max_memory_allocated_percentage = int(100 * (max_memory_allocated_bytes / (total_memory >> 10)))
+        max_memory_allocated_bytes = max_memory_allocated_bytes >> 10
 
     if print_header:
-        print(",".join(["sam_model_type", "batch_size", "memory(MiB)", "memory(%)", "img_s(avg)", "batch_ms(avg)/batch_size", "mIoU", "use_compile",
+        print(",".join(["device", "sam_model_type", "batch_size", "memory(MiB)", "memory(%)", "img_s(avg)", "batch_ms(avg)/batch_size", "mIoU", "use_compile",
               "use_half", "compress", "epilogue_fusion_first", "use_compile_decoder", "use_nested_tensor", "use_rel_pos", "pad_input_image_batch", "num_workers", "num_batches", "num_images", "profile_path", "memory_path"]))
-    print(",".join(map(str, [sam_model_type, batch_size, max_memory_allocated_bytes, max_memory_allocated_percentage, img_s, batch_ms_batch_size, mIoU, use_compile,
+    print(",".join(map(str, [device, sam_model_type, batch_size, max_memory_allocated_bytes, max_memory_allocated_percentage, img_s, batch_ms_batch_size, mIoU, use_compile,
           use_half, compress, epilogue_fusion_first, use_compile_decoder, use_nested_tensor, use_rel_pos, pad_input_image_batch, num_workers, num_batches, num_images, profile_path, memory_path])))