From 487885ce77db4f7676e6db50541a4b4842817d79 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 2 Jun 2025 17:54:15 -0400
Subject: [PATCH 1/3] use weakref to model

Signed-off-by: Kyle Sayers
---
 .../sparsification/compressed_tensors_utils.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
index 0129075d5..1d8fcdd8e 100644
--- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
+++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
@@ -56,6 +56,10 @@ def save_pretrained_compressed(save_pretrained_method):
         model_class = model_ref().__class__
         del save_pretrained_method
 
+        # hotfix: create a weak reference to the model to avoid circular dep
+        # TODO: determine why circular dep is not collected and how to clean up this fn
+        model_ref = weakref.ref(model)
+
         @wraps(original_save_pretrained)
         def save_pretrained_wrapper(
             save_directory: str,
@@ -95,11 +99,11 @@ def save_pretrained_wrapper(
             state_dict = kwargs.pop("state_dict", None)
             if state_dict is None:
                 logger.info("Fetching state_dict - this may take some time")
-                state_dict = get_state_dict_offloaded_model(model)
+                state_dict = get_state_dict_offloaded_model(model_ref())
 
             logger.info("Fetching compressor")
             compressor = get_model_compressor(
-                model=model,
+                model=model_ref(),
                 sparsity_config=sparsity_config,
                 quantization_format=quantization_format,
                 save_compressed=save_compressed,
@@ -111,7 +115,7 @@ def save_pretrained_wrapper(
             if compressor is None:
                 # model is not compressed or quantized, save as normal
                 original_save_pretrained_func = original_save_pretrained.__get__(
-                    model, model_class
+                    model_ref(), model_class
                 )
                 original_save_pretrained_func(
                     save_directory, state_dict=state_dict, **kwargs
@@ -121,10 +125,10 @@ def save_pretrained_wrapper(
             # make sure we're on the main process when saving
             if state_dict is not None and len(state_dict) > 0:
                 compressed_state_dict = compressor.compress(
-                    model, state_dict, show_progress=True
+                    model_ref(), state_dict, show_progress=True
                 )
                 logger.info("Saving compressed model to disk")
-                original_save_pretrained.__get__(model, model_class)(
+                original_save_pretrained.__get__(model_ref(), model_class)(
                     save_directory,
                     state_dict=compressed_state_dict,
                     safe_serialization=safe_serialization,
@@ -133,10 +137,10 @@ def save_pretrained_wrapper(
                 compressor.update_config(save_directory)
 
             # update existing recipe
-            update_and_save_recipe(model.name_or_path, save_directory)
+            update_and_save_recipe(model_ref().name_or_path, save_directory)
 
             # copy python files from cache dir to save_path if any
-            copy_python_files_from_model_cache(model, save_directory)
+            copy_python_files_from_model_cache(model_ref(), save_directory)
 
         save_pretrained_wrapper._overridden = True
         return save_pretrained_wrapper
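
The cycle this first patch works around comes from assigning the wrapper back onto the model: the closure captures the model, and the model's instance dict holds the closure. Below is a minimal, self-contained sketch of the weakref technique, using hypothetical names (`Holder`, `wrap`) rather than the repo's own:

```python
import weakref


class Holder:
    """Hypothetical stand-in for a PreTrainedModel."""


def wrap(save_pretrained_method):
    # keep only a weak reference to the instance; a strong capture would
    # form a cycle once the wrapper is assigned back onto the instance
    # (instance -> wrapper -> instance)
    model_ref = weakref.ref(save_pretrained_method.__self__)
    original = save_pretrained_method.__func__
    del save_pretrained_method  # drop the bound method's strong ref

    def wrapper(*args, **kwargs):
        model = model_ref()  # resolves to None once the instance is freed
        if model is None:
            raise RuntimeError("model was garbage collected before save")
        return original(model, *args, **kwargs)

    return wrapper


Holder.save_pretrained = lambda self: "saved"  # hypothetical method
holder = Holder()
holder.save_pretrained = wrap(holder.save_pretrained)
assert holder.save_pretrained() == "saved"

probe = weakref.ref(holder)
del holder
# no strong cycle remains, so CPython's refcounting frees the instance
# immediately, without waiting for the cycle collector
assert probe() is None
```

With the weak reference, dropping the last external reference frees the model deterministically; the second patch in the series takes the alternative route of keeping plain references and forcing the cycle collector to run in test teardown.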
From 71820e191eefdeb96dd582d7582d09963d59b2b2 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Tue, 3 Jun 2025 15:25:17 -0400
Subject: [PATCH 2/3] explicitly call gc collect

Signed-off-by: Kyle Sayers
---
 .../sparsification/compressed_tensors_utils.py       | 18 +++++++-----------
 tests/llmcompressor/conftest.py                       |  6 +++++-
 .../sparse/gpu/llama_7b_sparse_second.yaml            |  7 +++++++
 3 files changed, 19 insertions(+), 12 deletions(-)
 create mode 100644 tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml

diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
index 1d8fcdd8e..0129075d5 100644
--- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
+++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py
@@ -56,10 +56,6 @@ def save_pretrained_compressed(save_pretrained_method):
         model_class = model_ref().__class__
         del save_pretrained_method
 
-        # hotfix: create a weak reference to the model to avoid circular dep
-        # TODO: determine why circular dep is not collected and how to clean up this fn
-        model_ref = weakref.ref(model)
-
         @wraps(original_save_pretrained)
         def save_pretrained_wrapper(
             save_directory: str,
@@ -99,11 +95,11 @@ def save_pretrained_wrapper(
             state_dict = kwargs.pop("state_dict", None)
             if state_dict is None:
                 logger.info("Fetching state_dict - this may take some time")
-                state_dict = get_state_dict_offloaded_model(model_ref())
+                state_dict = get_state_dict_offloaded_model(model)
 
             logger.info("Fetching compressor")
             compressor = get_model_compressor(
-                model=model_ref(),
+                model=model,
                 sparsity_config=sparsity_config,
                 quantization_format=quantization_format,
                 save_compressed=save_compressed,
@@ -115,7 +111,7 @@ def save_pretrained_wrapper(
             if compressor is None:
                 # model is not compressed or quantized, save as normal
                 original_save_pretrained_func = original_save_pretrained.__get__(
-                    model_ref(), model_class
+                    model, model_class
                 )
                 original_save_pretrained_func(
                     save_directory, state_dict=state_dict, **kwargs
@@ -125,10 +121,10 @@ def save_pretrained_wrapper(
             # make sure we're on the main process when saving
             if state_dict is not None and len(state_dict) > 0:
                 compressed_state_dict = compressor.compress(
-                    model_ref(), state_dict, show_progress=True
+                    model, state_dict, show_progress=True
                 )
                 logger.info("Saving compressed model to disk")
-                original_save_pretrained.__get__(model_ref(), model_class)(
+                original_save_pretrained.__get__(model, model_class)(
                     save_directory,
                     state_dict=compressed_state_dict,
                     safe_serialization=safe_serialization,
@@ -137,10 +133,10 @@ def save_pretrained_wrapper(
                 compressor.update_config(save_directory)
 
             # update existing recipe
-            update_and_save_recipe(model_ref().name_or_path, save_directory)
+            update_and_save_recipe(model.name_or_path, save_directory)
 
             # copy python files from cache dir to save_path if any
-            copy_python_files_from_model_cache(model_ref(), save_directory)
+            copy_python_files_from_model_cache(model, save_directory)
 
         save_pretrained_wrapper._overridden = True
         return save_pretrained_wrapper
diff --git a/tests/llmcompressor/conftest.py b/tests/llmcompressor/conftest.py
index f078fd0ae..f13076407 100644
--- a/tests/llmcompressor/conftest.py
+++ b/tests/llmcompressor/conftest.py
@@ -1,3 +1,4 @@
+import gc
 import os
 import shutil
 import tempfile
@@ -80,7 +81,7 @@ def check_for_created_files():
 
 
 @pytest.fixture(autouse=True, scope="function")
-def setup_fresh_session():
+def llm_compressor_setup_teardown():
     """
     setup any state tied to the execution of the given method in a class.
     setup_method is invoked for every test method of a class.
@@ -92,3 +93,6 @@ def setup_fresh_session():
     yield
     # reset the session after each test
     reset_session()
+    # explicitly collect memory to catch memory bugs,
+    # see https://github.com/vllm-project/llm-compressor/pull/1503
+    gc.collect()
diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml
new file mode 100644
index 000000000..8bea1fbf3
--- /dev/null
+++ b/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: "meta-llama/Llama-2-7b-hf"
+dataset: open_platypus
+recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml"
+sparsity: 0.3
+device: "cuda:0"
\ No newline at end of file
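
Taken on its own, the teardown pattern this commit adds to `conftest.py` looks like the following, a condensed, standalone sketch (the fixture name is illustrative; the real fixture also resets the session and checks for stray files):

```python
import gc

import pytest


@pytest.fixture(autouse=True, scope="function")
def collect_after_each_test():
    # run the test body first
    yield
    # force a full collection so objects kept alive only by reference
    # cycles are reclaimed before the next test allocates a new model
    gc.collect()
```

Because the fixture is `autouse`, every test gets the teardown without opting in, and running `gc.collect()` per test makes a leak surface near the test that caused it rather than as a later out-of-memory failure.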
From 4ff874bf5a386b52ec06c24018f397394c651fe0 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Tue, 3 Jun 2025 15:25:40 -0400
Subject: [PATCH 3/3] remove unused test

Signed-off-by: Kyle Sayers
---
 .../obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml

diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml
deleted file mode 100644
index 8bea1fbf3..000000000
--- a/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse_second.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-cadence: "nightly"
-test_type: "regression"
-model: "meta-llama/Llama-2-7b-hf"
-dataset: open_platypus
-recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml"
-sparsity: 0.3
-device: "cuda:0"
\ No newline at end of file
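
The teardown collect exists to surface leaks like the one the first patch chased. A hypothetical regression check built from the same ingredients (a `weakref` probe plus a forced collection; `DummyModel` is a stand-in, not a repo class):

```python
import gc
import weakref


class DummyModel:
    """Stand-in for a model whose save_pretrained has been wrapped."""

    def save_pretrained(self, save_directory: str) -> None:
        pass  # a real model would serialize weights here


def test_model_is_freed_after_save(tmp_path="/tmp"):
    model = DummyModel()
    model.save_pretrained(str(tmp_path))  # exercise the wrapped method
    probe = weakref.ref(model)
    del model
    gc.collect()  # reclaim anything held only by reference cycles
    assert probe() is None, "model leaked: something still references it"


test_model_is_freed_after_save()
```

If the save wrapper (or anything else) still held a strong reference to the model, `probe()` would return the live object and the assertion would fail, pinpointing the leak.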