import re
from contextlib import contextmanager
from copy import deepcopy
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar, Union

import compressed_tensors
+ from compressed_tensors.linear.compressed_linear import CompressedLinear
+ from compressed_tensors.utils.offload import update_offload_parameter
import torch
import transformers
from compressed_tensors.base import (

_LOGGER: logging.Logger = logging.getLogger(__name__)

- import tracemalloc
- import linecache
- import objgraph
+ def module_replace_dfs(
+     module: Module,
+     func: Callable[[Module], Module],
+     pre: bool = True,
+     progress: Union[bool, tqdm] = False,
+ ) -> Module:
+     """
+     Replace modules in a depth-first traversal, optionally with a progress bar.
+
+     :param module: root module whose tree is traversed
+     :param func: mapping applied to every module; its return value replaces the module
+     :param pre: if True, apply ``func`` before recursing into children (pre-order),
+         otherwise after (post-order)
+     :param progress: True to create a progress bar, or an existing tqdm instance
+     :return: the (possibly replaced) root module
+     """
+     if progress is True:
+         total = len(list(module.modules()))
+         progress = tqdm(total=total)
+
+     if pre:
+         module = func(module)
+
+     for name, child in list(module.named_children()):
+         module.add_module(name, module_replace_dfs(child, func, pre, progress))
+
+     if not pre:
+         module = func(module)
+
+     if isinstance(progress, tqdm):
+         progress.update(1)
+
+     return module
+
+
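# Illustrative usage sketch (hypothetical, not part of this diff): the visitor's
# return value replaces each module, so an identity visitor leaves the tree
# unchanged while still touching every node.
#
#     def report_linear(module: Module) -> Module:
#         if isinstance(module, torch.nn.Linear):
#             print(f"visited Linear({module.in_features} -> {module.out_features})")
#         return module
#
#     toy = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
#     toy = module_replace_dfs(toy, report_linear, progress=True)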

if TYPE_CHECKING:
    # dummy type if not available from transformers
    CompressedTensorsConfig = TypeVar("CompressedTensorsConfig")

- def display_top(snapshot, key_type='lineno', limit=3):
-     snapshot = snapshot.filter_traces((
-         tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
-         tracemalloc.Filter(False, "<unknown>"),
-     ))
-     top_stats = snapshot.statistics(key_type)
-
-     print("Top %s lines" % limit)
-     for index, stat in enumerate(top_stats[:limit], 1):
-         frame = stat.traceback[0]
-         print("#%s: %s:%s: %.1f MB"
-               % (index, frame.filename, frame.lineno, stat.size / (1024 * 1024)))
-         line = linecache.getline(frame.filename, frame.lineno).strip()
-         if line:
-             print('    %s' % line)
-
-     other = top_stats[limit:]
-     if other:
-         size = sum(stat.size for stat in other)
-         print("%s other: %.1f MB" % (len(other), size / (1024 * 1024)))
-     total = sum(stat.size for stat in top_stats)
-     print(f"Total Python-tracked memory: {total / (1024 * 1024):.2f} MB")
-

class ModelCompressor:
    """
@@ -384,6 +385,30 @@ def get_unexpected_file_keys(self, model: Module) -> List[str]:
        )

        return list(unexpected_keys)
+
+     def apply_compression_status(self, model: Module) -> Module:
+         """
+         Replace the model's quantized Linear modules with CompressedLinear
+         modules whose parameters hold the compressed weights.
+         """
+         quantization_format = self.quantization_config.format
+
+         def replace_with_compressed(module: Module) -> Module:
+             scheme = getattr(module, "quantization_scheme", None)
+             if isinstance(module, torch.nn.Linear) and scheme is not None:
+                 module = CompressedLinear.from_linear(
+                     module,
+                     quantization_scheme=scheme,
+                     quantization_format=quantization_format,
+                 )
+                 # compress the parameters registered by CompressedLinear;
+                 # "" maps the module itself to its scheme
+                 state_dict = module.compressor.compress(module.state_dict(), {"": scheme})
+
+                 for name, value in state_dict.items():
+                     update_offload_parameter(module, name, value)
+
+             return module
+
+         progress = tqdm(total=len(list(model.modules())))
+         return module_replace_dfs(model, replace_with_compressed, progress=progress)
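# Hypothetical usage sketch: compress a calibrated model in place. This assumes
# the quantized Linear layers already carry `quantization_scheme` attributes and
# that `ModelCompressor.from_pretrained` finds a quantization config (it returns
# None otherwise); the model name is made up.
#
#     compressor = ModelCompressor.from_pretrained("org/quantized-model")
#     model = compressor.apply_compression_status(model)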

    def compress(
        self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None
@@ -403,13 +428,11 @@ def compress(

        if self.quantization_compressor is not None:
-            with TrackTensorAllocations() as prof:
-                module_to_scheme = map_module_to_scheme(model)
-                state_dict = self.quantization_compressor.compress(
-                    state_dict, names_to_scheme=module_to_scheme
-                )
-                print(prof.total_tensor_memory_mib)
-                breakpoint()
+            module_to_scheme = map_module_to_scheme(model)
+            state_dict = self.quantization_compressor.compress(
+                state_dict, names_to_scheme=module_to_scheme
+            )

        # if self.quantization_config.format != CompressionFormat.dense.value:
        #     self.quantization_config.quantization_status = (
        #         QuantizationStatus.COMPRESSED
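# Hypothetical end-to-end sketch: `compress` returns a compressed state dict
# keyed by the same module names, which can then be saved alongside an updated
# config; `save_path` and the `update_config` step mirror typical usage but are
# assumptions here.
#
#     compressed_state_dict = compressor.compress(model)
#     model.save_pretrained(save_path, state_dict=compressed_state_dict)
#     compressor.update_config(save_path)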
@@ -559,13 +582,11 @@ def map_module_to_scheme(model: Module) -> Dict[str, QuantizationScheme]:
    """
    Returns a dictionary which maps quantized module names to their quantization schemes
    """
-    quantized_modules_to_args = {}
-    for name, submodule in iter_named_leaf_modules(model):
-        if is_module_quantized(submodule):
-            name = fix_fsdp_module_name(name)
-            quantized_modules_to_args[name] = submodule.quantization_scheme
-
-    return quantized_modules_to_args
+    return {
+        fix_fsdp_module_name(name): module.quantization_scheme
+        for name, module in iter_named_leaf_modules(model)
+        if is_module_quantized(module)
+    }
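# Illustrative sketch (hypothetical module name): for a model with a single
# quantized projection, the comprehension above yields a flat mapping:
#
#     map_module_to_scheme(model)
#     # -> {"model.layers.0.self_attn.q_proj": QuantizationScheme(...)}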
# HACK: Override the dtype_byte_size function in transformers to support float8 types