|
65 | 65 |
|
66 | 66 | _LOGGER: logging.Logger = logging.getLogger(__name__)
|
67 | 67 |
|
| 68 | +import tracemalloc |
| 69 | +import linecache |
| 70 | +import objgraph |
68 | 71 |
|
69 | 72 | if TYPE_CHECKING:
|
70 | 73 | # dummy type if not available from transformers
|
71 | 74 | CompressedTensorsConfig = TypeVar("CompressedTensorsConfig")
|
72 | 75 |
|
def display_top(snapshot, key_type='lineno', limit=3):
    """Print a short report of the top memory-allocating source lines.

    Follows the stdlib ``tracemalloc`` recipe: filters out importlib and
    unknown frames, groups allocations by ``key_type``, prints the top
    ``limit`` entries with their source line, then a remainder and total.

    :param snapshot: a ``tracemalloc.Snapshot`` to analyze
    :param key_type: statistics grouping key (e.g. ``'lineno'``, ``'filename'``)
    :param limit: how many top entries to print individually
    """
    # Drop noise frames from the import machinery and unknown locations.
    filtered = snapshot.filter_traces((
        tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
        tracemalloc.Filter(False, "<unknown>"),
    ))
    stats = filtered.statistics(key_type)

    print("Top %s lines" % limit)
    for rank, entry in enumerate(stats[:limit], start=1):
        top_frame = entry.traceback[0]
        mib = entry.size / (1024 * 1024)
        print("#%s: %s:%s: %.1f MB"
              % (rank, top_frame.filename, top_frame.lineno, mib))
        # Show the offending source line when linecache can resolve it.
        source_line = linecache.getline(top_frame.filename, top_frame.lineno).strip()
        if source_line:
            print('    %s' % source_line)

    remainder = stats[limit:]
    if remainder:
        remainder_size = sum(entry.size for entry in remainder)
        print("%s other: %.1f MB" % (len(remainder), remainder_size / (1024 * 1024)))
    total = sum(entry.size for entry in stats)
    print(f"Total Python-tracked memory: {total / (1024 * 1024):.2f} MB")
73 | 99 |
|
74 | 100 | class ModelCompressor:
|
75 | 101 | """
|
@@ -362,25 +388,37 @@ def get_unexpected_file_keys(self, model: Module) -> List[str]:
|
362 | 388 | def compress(
|
363 | 389 | self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None
|
364 | 390 | ) -> Dict[str, Tensor]:
|
| 391 | + from torch.profiler import profile, ProfilerActivity |
| 392 | + from .track_tensor_memory import TrackTensorAllocations |
365 | 393 | """
|
366 | 394 | Compresses a dense state dict or model with sparsity and/or quantization
|
367 | 395 |
|
368 | 396 | :param model: uncompressed model to compress
|
369 | 397 | :param state_dict: optional uncompressed state_dict to insert into model
|
370 | 398 | :return: compressed state dict
|
371 | 399 | """
|
| 400 | + |
372 | 401 | if state_dict is None:
|
373 | 402 | state_dict = model.state_dict()
|
374 | 403 |
|
375 | 404 | if self.quantization_compressor is not None:
|
376 |
| - module_to_scheme = map_module_to_scheme(model) |
377 |
| - state_dict = self.quantization_compressor.compress( |
378 |
| - state_dict, names_to_scheme=module_to_scheme |
379 |
| - ) |
380 |
| - if self.quantization_config.format != CompressionFormat.dense.value: |
381 |
| - self.quantization_config.quantization_status = ( |
382 |
| - QuantizationStatus.COMPRESSED |
| 405 | + #with profile(activities=[ProfilerActivity.CUDA], profile_memory=True, record_shapes=True, with_stack=True) as prof: |
| 406 | + with TrackTensorAllocations() as prof: |
| 407 | + module_to_scheme = map_module_to_scheme(model) |
| 408 | + state_dict = self.quantization_compressor.compress( |
| 409 | + state_dict, names_to_scheme=module_to_scheme |
383 | 410 | )
|
| 411 | + print(prof.total_tensor_memory_mib) |
| 412 | + breakpoint() |
| 413 | + # if self.quantization_config.format != CompressionFormat.dense.value: |
| 414 | + # self.quantization_config.quantization_status = ( |
| 415 | + # QuantizationStatus.COMPRESSED |
| 416 | + # ) |
| 417 | + |
| 418 | + #prof.export_memory_timeline("memory.html") |
| 419 | + #print(prof.key_averages().table(sort_by="self_device_memory_usage", row_limit=3)) |
| 420 | + #breakpoint() |
| 421 | + return state_dict |
384 | 422 |
|
385 | 423 | if self.sparsity_compressor is not None:
|
386 | 424 | sparse_compression_targets: Set[str] = expand_target_names(
|
|
0 commit comments