From 54d3560daf1dc50a390c722e5175aab62333d1ca Mon Sep 17 00:00:00 2001 From: Anmol Mekala <49127549+molereddy@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:13:50 -0500 Subject: [PATCH 01/10] Fix hyperlinks in README (#2) * testing commit * Fixes * cleanup --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dce38e5..a04d486 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ We provide several variants for each of the components in the unlearning pipelin ## πŸ“Œ Table of Contents - πŸ“– [Overview](#-overview) -- πŸ—ƒοΈ [Available Components](#-available-components) +- πŸ—ƒοΈ [Available Components](#%EF%B8%8F-available-components) - ⚑ [Quickstart](#-quickstart) - πŸ› οΈ [Environment Setup](#-environment-setup) - πŸ’Ύ [Data Setup](#-data-setup) @@ -56,7 +56,7 @@ We provide several variants for each of the components in the unlearning pipelin - βž• [How to Add New Components](#-how-to-add-new-components) - πŸ“š [Further Documentation](#-further-documentation) - πŸ”— [Support & Contributors](#-support--contributors) -- πŸ“ [Citing this work](#-citating-this-work) +- πŸ“ [Citing this work](#-citing-this-work) - 🀝 [Acknowledgements](#-acknowledgements) - πŸ“„ [License](#-license) @@ -198,7 +198,7 @@ If you use OpenUnlearning in your research, please cite: --- -### 🀝 Acknowledgments +### 🀝 Acknowledgements - This repo is inspired from [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory). - The [TOFU](https://github.com/locuslab/tofu) and [MUSE](https://github.com/jaechan-repo/muse_bench) benchmarks served as the foundation for our re-implementation. From 4c36e4f5a39d979280efd20452be7ba5ff54e40a Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Sun, 2 Mar 2025 18:37:19 +0000 Subject: [PATCH 02/10] Fixed DPO command --- scripts/tofu_unlearn.sh | 56 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index a556bd1..eba38ff 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -9,12 +9,6 @@ models=( "Llama-3.2-3B-Instruct" "Llama-3.1-8B-Instruct" ) -trainers_experiments=( - "GradAscent unlearn/tofu/default.yaml" - "GradDiff unlearn/tofu/default.yaml" - "NPO unlearn/tofu/default.yaml" - "DPO unlearn/tofu/default.yaml" -) forget_retain_splits=( "forget01 retain99" "forget05 retain95" @@ -29,7 +23,57 @@ gradient_accumulation_steps=4 ########################################### Unlearn TOFU models ######################################################## ######################################################################################################################## +trainers_experiments=( + "GradAscent unlearn/tofu/default.yaml" + "GradDiff unlearn/tofu/default.yaml" + "NPO unlearn/tofu/default.yaml" +) +for split in "${forget_retain_splits[@]}"; do + forget_split=$(echo $split | cut -d' ' -f1) + retain_split=$(echo $split | cut -d' ' -f2) + for model in "${models[@]}"; do + for trainer_experiment in "${trainers_experiments[@]}"; do + trainer=$(echo $trainer_experiment | cut -d' ' -f1) + experiment=$(echo $trainer_experiment | cut -d' ' -f2) + + task_name=tofu_${model}_${forget_split}_${trainer} + model_path=open-unlearning/tofu_${model}_full + echo ${task_name}: Unlearning ${model_path} using ${trainer} + + # Unlearn + CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \ + src/train.py --config-name=unlearn.yaml \ + 
experiment=${experiment} \ + trainer=${trainer} \ + task_name=${task_name} \ + model=${model} \ + forget_split=${forget_split} \ + retain_split=${retain_split} \ + model.model_args.pretrained_model_name_or_path=${model_path} \ + retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ + trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ + trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ + trainer.args.ddp_find_unused_parameters=true \ + trainer.args.gradient_checkpointing=true + + # Eval + CUDA_VISIBLE_DEVICES=0 python src/eval.py \ + experiment=eval/tofu/default.yaml \ + forget_split=${forget_split} \ + model=${model} \ + task_name=${task_name} \ + model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ + paths.output_dir=saves/unlearn/${task_name}/evals \ + retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json + done + done +done + + +trainers_experiments=( + "DPO unlearn/tofu/idk.yaml" +) for split in "${forget_retain_splits[@]}"; do forget_split=$(echo $split | cut -d' ' -f1) retain_split=$(echo $split | cut -d' ' -f2) From f7a69dee40cfe918e32250ce7c1ba7564205c9b4 Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Sun, 2 Mar 2025 18:50:47 +0000 Subject: [PATCH 03/10] download idk --- setup_data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/setup_data.py b/setup_data.py index 48de0ad..358779c 100644 --- a/setup_data.py +++ b/setup_data.py @@ -1,8 +1,17 @@ from huggingface_hub import snapshot_download +# Setup retain model metrics snapshot_download( repo_id="open-unlearning/eval", allow_patterns="*.json", repo_type="dataset", local_dir="saves/eval", ) + +# Setup data +snapshot_download( + repo_id="open-unlearning/idk", + allow_patterns="*.jsonl", + repo_type="dataset", + local_dir="data", +) \ No newline at end of file From 332af36c4772eb1e836b767adbd102951693b60c Mon Sep 17 00:00:00 2001 From: Vineeth <48151992+Dornavineeth@users.noreply.github.com> Date: Sun, 2 Mar 2025 14:13:43 -0500 Subject: [PATCH 04/10] Revert "Dpo fix" --- scripts/tofu_unlearn.sh | 56 +++++------------------------------------ setup_data.py | 9 ------- 2 files changed, 6 insertions(+), 59 deletions(-) diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index eba38ff..a556bd1 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -9,6 +9,12 @@ models=( "Llama-3.2-3B-Instruct" "Llama-3.1-8B-Instruct" ) +trainers_experiments=( + "GradAscent unlearn/tofu/default.yaml" + "GradDiff unlearn/tofu/default.yaml" + "NPO unlearn/tofu/default.yaml" + "DPO unlearn/tofu/default.yaml" +) forget_retain_splits=( "forget01 retain99" "forget05 retain95" @@ -23,57 +29,7 @@ gradient_accumulation_steps=4 ########################################### Unlearn TOFU models ######################################################## ######################################################################################################################## -trainers_experiments=( - "GradAscent unlearn/tofu/default.yaml" - "GradDiff unlearn/tofu/default.yaml" - "NPO unlearn/tofu/default.yaml" -) -for split in "${forget_retain_splits[@]}"; do - forget_split=$(echo $split | cut -d' ' -f1) - retain_split=$(echo $split | cut -d' ' -f2) - for model in "${models[@]}"; do - for trainer_experiment in "${trainers_experiments[@]}"; do - trainer=$(echo $trainer_experiment | cut -d' ' -f1) - experiment=$(echo $trainer_experiment | cut -d' ' -f2) - - task_name=tofu_${model}_${forget_split}_${trainer} - 
model_path=open-unlearning/tofu_${model}_full - echo ${task_name}: Unlearning ${model_path} using ${trainer} - - # Unlearn - CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \ - src/train.py --config-name=unlearn.yaml \ - experiment=${experiment} \ - trainer=${trainer} \ - task_name=${task_name} \ - model=${model} \ - forget_split=${forget_split} \ - retain_split=${retain_split} \ - model.model_args.pretrained_model_name_or_path=${model_path} \ - retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json \ - trainer.args.per_device_train_batch_size=$per_device_train_batch_size \ - trainer.args.gradient_accumulation_steps=$gradient_accumulation_steps \ - trainer.args.ddp_find_unused_parameters=true \ - trainer.args.gradient_checkpointing=true - - # Eval - CUDA_VISIBLE_DEVICES=0 python src/eval.py \ - experiment=eval/tofu/default.yaml \ - forget_split=${forget_split} \ - model=${model} \ - task_name=${task_name} \ - model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ - paths.output_dir=saves/unlearn/${task_name}/evals \ - retain_logs_path=saves/eval/tofu_${model}_${retain_split}/TOFU_EVAL.json - done - done -done - - -trainers_experiments=( - "DPO unlearn/tofu/idk.yaml" -) for split in "${forget_retain_splits[@]}"; do forget_split=$(echo $split | cut -d' ' -f1) retain_split=$(echo $split | cut -d' ' -f2) diff --git a/setup_data.py b/setup_data.py index 358779c..48de0ad 100644 --- a/setup_data.py +++ b/setup_data.py @@ -1,17 +1,8 @@ from huggingface_hub import snapshot_download -# Setup retain model metrics snapshot_download( repo_id="open-unlearning/eval", allow_patterns="*.json", repo_type="dataset", local_dir="saves/eval", ) - -# Setup data -snapshot_download( - repo_id="open-unlearning/idk", - allow_patterns="*.jsonl", - repo_type="dataset", - local_dir="data", -) \ No newline at end of file From f468efb9eaa0c737e6e8b4e64abb411131ff7a99 Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Sun, 2 Mar 2025 19:22:21 +0000 Subject: [PATCH 05/10] download idk data --- setup_data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/setup_data.py b/setup_data.py index 48de0ad..358779c 100644 --- a/setup_data.py +++ b/setup_data.py @@ -1,8 +1,17 @@ from huggingface_hub import snapshot_download +# Setup retain model metrics snapshot_download( repo_id="open-unlearning/eval", allow_patterns="*.json", repo_type="dataset", local_dir="saves/eval", ) + +# Setup data +snapshot_download( + repo_id="open-unlearning/idk", + allow_patterns="*.jsonl", + repo_type="dataset", + local_dir="data", +) \ No newline at end of file From ca8d5038b07ca8b8bbf0a71bf8a9a5502899f154 Mon Sep 17 00:00:00 2001 From: Dornavineeth Date: Sun, 2 Mar 2025 19:22:45 +0000 Subject: [PATCH 06/10] fix dpo experiment config --- scripts/tofu_unlearn.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index a556bd1..1794c9b 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -13,7 +13,7 @@ trainers_experiments=( "GradAscent unlearn/tofu/default.yaml" "GradDiff unlearn/tofu/default.yaml" "NPO unlearn/tofu/default.yaml" - "DPO unlearn/tofu/default.yaml" + "DPO unlearn/tofu/idk.yaml" ) forget_retain_splits=( "forget01 retain99" From 8b073d66a7a08b4923eeffa6cbb485ffcc3246eb Mon Sep 17 00:00:00 2001 From: Vineeth <48151992+Dornavineeth@users.noreply.github.com> Date: Sun, 9 Mar 2025 15:32:43 -0400 Subject: [PATCH 07/10] RMU (#6) * IdkDPO 
script fix in tofu_unlearn.sh (#65) * Fix hyperlinks in README * Download I don't know data in setup_data.py * Fix tofu_unlearn.sh for IdkDPO --------- Co-authored-by: Anmol Mekala <49127549+molereddy@users.noreply.github.com> * overwrite=True * RMU added * Fix ref model device * ruff fix * RMU updated * Update rmu.py * Update README.md: add RMU * Added references and renamed functions --------- Co-authored-by: Anmol Mekala <49127549+molereddy@users.noreply.github.com> --- README.md | 4 +- configs/experiment/unlearn/muse/default.yaml | 1 + .../experiment/unlearn/muse/scalability.yaml | 1 + .../unlearn/muse/sustainabilty.yaml | 1 + configs/experiment/unlearn/tofu/default.yaml | 1 + configs/experiment/unlearn/tofu/idk.yaml | 1 + configs/trainer/RMU.yaml | 14 ++ docs/results.md | 37 ++++- scripts/tofu_unlearn.sh | 1 + src/trainer/__init__.py | 2 + src/trainer/unlearn/grad_diff.py | 2 +- src/trainer/unlearn/rmu.py | 142 ++++++++++++++++++ 12 files changed, 203 insertions(+), 4 deletions(-) create mode 100644 configs/trainer/RMU.yaml create mode 100644 src/trainer/unlearn/rmu.py diff --git a/README.md b/README.md index a04d486..4c2cf8a 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ ## πŸ“– Overview -We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 5 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 7+ LLMs. Each of these can be easily extended to incorporate more variants. +We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 7+ LLMs. Each of these can be easily extended to incorporate more variants. We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field. 
@@ -35,7 +35,7 @@ We provide several variants for each of the components in the unlearning pipelin | **Component** | **Available Options** | |------------------------|----------------------| | **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/) | -| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO | +| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU | | **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, QA-ROUGE, MIA Attacks, TruthRatio, Model Utility | | **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) | | **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2, ICLM; Additional: Phi-3.5, Phi-1.5, Gemma | diff --git a/configs/experiment/unlearn/muse/default.yaml b/configs/experiment/unlearn/muse/default.yaml index 454a84e..b4bdbe0 100644 --- a/configs/experiment/unlearn/muse/default.yaml +++ b/configs/experiment/unlearn/muse/default.yaml @@ -34,6 +34,7 @@ eval: muse: data_split: ${data_split} retain_logs_path: ${retain_logs_path} + overwrite: true trainer: args: diff --git a/configs/experiment/unlearn/muse/scalability.yaml b/configs/experiment/unlearn/muse/scalability.yaml index 11d90f5..b19e0cb 100644 --- a/configs/experiment/unlearn/muse/scalability.yaml +++ b/configs/experiment/unlearn/muse/scalability.yaml @@ -34,6 +34,7 @@ eval: muse: data_split: ${data_split} retain_logs_path: ${retain_logs_path} + overwrite: true trainer: args: diff --git a/configs/experiment/unlearn/muse/sustainabilty.yaml b/configs/experiment/unlearn/muse/sustainabilty.yaml index e5d7968..9a0a03e 100644 --- a/configs/experiment/unlearn/muse/sustainabilty.yaml +++ b/configs/experiment/unlearn/muse/sustainabilty.yaml @@ -34,6 +34,7 @@ eval: muse: data_split: ${data_split} retain_logs_path: ${retain_logs_path} + overwrite: true trainer: args: diff --git a/configs/experiment/unlearn/tofu/default.yaml b/configs/experiment/unlearn/tofu/default.yaml index 5f7c475..f2e0ab1 100644 --- a/configs/experiment/unlearn/tofu/default.yaml +++ b/configs/experiment/unlearn/tofu/default.yaml @@ -20,6 +20,7 @@ eval: tofu: forget_split: ${forget_split} retain_logs_path: ${retain_logs_path} + overwrite: true data: anchor: forget diff --git a/configs/experiment/unlearn/tofu/idk.yaml b/configs/experiment/unlearn/tofu/idk.yaml index 61a365d..5fcb85d 100644 --- a/configs/experiment/unlearn/tofu/idk.yaml +++ b/configs/experiment/unlearn/tofu/idk.yaml @@ -20,6 +20,7 @@ eval: tofu: forget_split: ${forget_split} retain_logs_path: ${retain_logs_path} + overwrite: true data: anchor: forget diff --git a/configs/trainer/RMU.yaml b/configs/trainer/RMU.yaml new file mode 100644 index 0000000..7e1f902 --- /dev/null +++ b/configs/trainer/RMU.yaml @@ -0,0 +1,14 @@ +defaults: + - GradDiff + +handler: RMU +method_args: + # The params here are more dependent on model and dataset. 
Tune them carefully to work + gamma: 1.0 + steering_coeff: 2 + retain_loss_type: EMBED_DIFF + alpha: 1 + module_regex: model\.layers\.7 + trainable_params_regex: + - .* # update all parameters (as done in https://github.com/tmlr-group/G-effect/blob/ef368eea3b2c6dba1e090b9ebb021ac9f047e0ae/dataloader.py#L271) + # - model\.layers\.(5|6|7)\.mlp\.down_proj\.weight # If you want to update only these weights (as done in https://github.com/centerforaisafety/wmdp/blob/bc5e1ba0367ea826caeeeaa50656336a1e87acfb/rmu/unlearn.py#L26) \ No newline at end of file diff --git a/docs/results.md b/docs/results.md index 3f7cc7c..3af7cb6 100644 --- a/docs/results.md +++ b/docs/results.md @@ -23,7 +23,7 @@ For all the experiments below, we used the following setup | **Hyperparameters** | Learning Rate (lr) = 1e-5
Ξ± = 1, Ξ³ = 1, Ξ² = 0.1 (where applicable)
Number of Epochs = 10
Optimizer: [paged_adamw_32bit](https://huggingface.co/docs/bitsandbytes/main/en/reference/optim/adamw#bitsandbytes.optim.PagedAdamW) | __Note:__ -1. Results may vary even with the same effective hyperparameters when trained with modifications to the distributed training setup, including when training on a single GPU. For example: methods such as SimNPO, can be significantly improved with careful tuning. **Please use these numbers only for reproducibility purposes**. +1. Results may vary even with the same effective hyperparameters when trained with modifications to the distributed training setup, including when training on a single GPU. For example: methods such as SimNPO & RMU can be significantly improved with careful tuning. **Please use these numbers only for reproducibility purposes**. 2. NPO in MUSE: for NPO, the MUSE implementation is inconsistent with the [original paper](https://github.com/licong-lin/negative-preference-optimization) as discussed [here]( https://github.com/jaechan-repo/muse_bench/issues/2). This inconsistency is carried over into implementations like [SimNPO](https://github.com/OPTML-Group/Unlearn-Simple/issues/5). Here, we use the original NPO implementation with the same loss function expression across datasets. @@ -140,6 +140,18 @@ __Note:__ 0.6 3.17e-04 + + RMU + 0.4 + 0.62 + 0.64 + 9.59e-10 + 0.02 + 0.81 + 6.92e-21 + 0.03 + 0.81 + @@ -257,6 +269,18 @@ __Note:__ 0.54 1.07e-05 + + RMU + 0.16 + 0.55 + 0.70 + 4.87e-10 + 0.58 + 0.77 + 3.15e-15 + 0.59 + 0.76 + @@ -354,6 +378,17 @@ __Note:__ -54.26 0.54 + + RMU + 0.48 + 0.05 + 56.36 + 0.51 + 0.29 + 0.79 + -60.52 + 0.48 + \ No newline at end of file diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index 1794c9b..ae33189 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -14,6 +14,7 @@ trainers_experiments=( "GradDiff unlearn/tofu/default.yaml" "NPO unlearn/tofu/default.yaml" "DPO unlearn/tofu/idk.yaml" + "RMU unlearn/tofu/default.yaml" ) forget_retain_splits=( "forget01 retain99" diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py index 1c769bf..7e195fa 100644 --- a/src/trainer/__init__.py +++ b/src/trainer/__init__.py @@ -9,6 +9,7 @@ from trainer.unlearn.npo import NPO from trainer.unlearn.dpo import DPO from trainer.unlearn.simnpo import SimNPO +from trainer.unlearn.rmu import RMU TRAINER_REGISTRY: Dict[str, Any] = {} @@ -79,3 +80,4 @@ def load_trainer( _register_trainer(NPO) _register_trainer(DPO) _register_trainer(SimNPO) +_register_trainer(RMU) diff --git a/src/trainer/unlearn/grad_diff.py b/src/trainer/unlearn/grad_diff.py index e11c7a7..bfecc19 100644 --- a/src/trainer/unlearn/grad_diff.py +++ b/src/trainer/unlearn/grad_diff.py @@ -14,7 +14,7 @@ def __init__(self, gamma=1.0, alpha=1.0, retain_loss_type="NLL", *args, **kwargs self.ref_model = self._prepare_ref_model(self.model) def _prepare_ref_model(self, model): - ref_model = copy.deepcopy(model).to("cuda") + ref_model = copy.deepcopy(model).to(self.accelerator.device) ref_model.eval() if self.is_deepspeed_enabled: ref_model = self._prepare_deepspeed(ref_model) diff --git a/src/trainer/unlearn/rmu.py b/src/trainer/unlearn/rmu.py new file mode 100644 index 0000000..391bd6a --- /dev/null +++ b/src/trainer/unlearn/rmu.py @@ -0,0 +1,142 @@ +"""Borrowed implementation from https://github.com/centerforaisafety/wmdp/blob/main/rmu/unlearn.py""" + +import re +import torch +import deepspeed +from trainer.unlearn.grad_diff import GradDiff + + +class RMU(GradDiff): + def __init__(self, + module_regex="model\.layers\.7", + 
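# module_regex selects the single transformer block whose output activations RMU
# steers (forget term) and matches against the reference model (retain term);
# trainable_params_regex, the next argument, limits which weights the optimizer updates.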
trainable_params_regex=["model\.layers\.(5|6|7)\.mlp\.down_proj\.weight"], + steering_coeff=20, + *args, **kwargs): + """ + RMU Trainer that fine-tunes only specific layers and parameters using regex-based filtering. + + Args: + module_path (str): Regex pattern to match module names. + trainable_param_paths (list of str): List of regex patterns for trainable parameters. + """ + super().__init__(*args, **kwargs) + + # Create reference model if not already set + if self.ref_model is None: + self.ref_model = self._prepare_ref_model(self.model) + + # Unfreeze only the selected parameters + self.trainable_params_regex = trainable_params_regex # Regex for selecting params + + # Get actual module references + self.module_regex = module_regex # Regex for selecting modules + self.model_module = self._get_matching_module(self.model, self.module_regex) + self.ref_module = self._get_matching_module(self.ref_model, self.module_regex) + self.steering_coeff = steering_coeff + self.control_vec = None + + + def create_optimizer(self): + self._freeze_all_params(self.model, False) + # This makes the optimizer to select only trainable params + self._set_trainable_params(self.model, self.trainable_params_regex, True) + super().create_optimizer() + self._freeze_all_params(self.model, True) + + + def _get_matching_module(self, model, module_regex): + """Returns a single module matching the given regex from a DeepSpeed/DDP-wrapped model.""" + # Handle DeepSpeed and DDP-wrapped models by accessing the underlying module + if isinstance(model, deepspeed.DeepSpeedEngine): + model = model.module # Extract the actual PyTorch model inside + + matched_modules = {name: module for name, module in model.named_modules() if re.fullmatch(module_regex, name)} + + if len(matched_modules) > 1: + raise ValueError(f"More than one module matched with {module_regex}: {list(matched_modules.keys())}") + elif not matched_modules: + raise ValueError(f"No module matched with {module_regex}") + + return next(iter(matched_modules.values())) # Return the single matched module + + def _freeze_all_params(self, model, requires_grad=True): + """Freeze all parameters in the model initially.""" + for param in model.parameters(): + param.requires_grad = requires_grad + + def _set_trainable_params(self, model, trainable_params_regex, requires_grad=True): + """Unfreeze specific parameters that match the regex patterns.""" + for name, param in model.named_parameters(): + if any(re.fullmatch(pattern, name) for pattern in trainable_params_regex): + param.requires_grad = requires_grad + # print(f"{name}:requires_grad\t{requires_grad}") + + def forward_with_cache(self, model, inputs, module, no_grad=True): + """Performs a forward pass while caching the output of a specified module.""" + cache = [] + def hook(module, input, output): + if isinstance(output, tuple): + cache.append(output[0]) + else: + cache.append(output) + return None + + hook_handle = module.register_forward_hook(hook) + with torch.set_grad_enabled(not(no_grad)): + outputs = model(**inputs) + hook_handle.remove() + return cache[0], outputs + + def get_control_vector(self, dim): + if self.control_vec is None: + random_vector = torch.rand(1,1, dim) + self.control_vec = random_vector / torch.norm(random_vector) * self.steering_coeff + return self.control_vec + + + def compute_activation_loss(self, activation1, activation2, mask): + squared_diff = torch.nn.functional.mse_loss(activation1, activation2, reduction="none") # Shape (b, s, d) + expanded_mask = 
mask.unsqueeze(-1).expand_as(squared_diff) # Shape: [b, s, d] + squared_diff_sum = (squared_diff * expanded_mask).mean(dim=2).sum(dim=(1)) # Shape: [b, 1] + num_tokens = mask.sum(dim=-1, keepdim=True) # Sum over seq_len, Shape: [b, 1] + return (squared_diff_sum / num_tokens).mean() + + def compute_retain_loss(self, model, retain_inputs): + retain_loss = 0.0 + + if self.retain_loss_type == "EMBED_DIFF": + model_retain_activations, _ = self.forward_with_cache(model, retain_inputs, module=self.model_module, no_grad=False) + ref_retain_activations, _ = self.forward_with_cache(self.ref_model, retain_inputs, module=self.ref_module, no_grad=True) + mask = (retain_inputs['labels'] != -100) # Shape: [b, s] + retain_loss = self.compute_activation_loss(model_retain_activations, ref_retain_activations.to(model_retain_activations.device), mask) + else: + retain_loss = super().compute_retain_loss(model, retain_inputs) + return retain_loss + + def compute_loss(self, model, inputs, return_outputs=False): + forget_inputs = inputs["forget"] + forget_inputs = { + "input_ids": forget_inputs["input_ids"], + "attention_mask": forget_inputs["attention_mask"], + "labels": forget_inputs["labels"], + } + + model_forget_activations, forget_outputs = self.forward_with_cache(model, forget_inputs, self.model_module, no_grad=False) + # If multiple datasets or concepts need unlearning, pass the control vector during processing; otherwise, default to a random vector during training. + control_vec = forget_inputs.get("control_vec", self.get_control_vector(model_forget_activations.shape[-1])) + control_vec = control_vec.to(dtype=model_forget_activations.dtype, device=model_forget_activations.device) + control_vec = control_vec.expand_as(model_forget_activations) + mask = (forget_inputs['labels'] != -100) # Shape: [b, s] + forget_loss = self.compute_activation_loss(model_forget_activations, control_vec, mask) + + retain_inputs = inputs["retain"] + retain_inputs = { + "input_ids": retain_inputs["input_ids"], + "attention_mask": retain_inputs["attention_mask"], + "labels": retain_inputs["labels"], + } + retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs) + + loss = self.gamma * forget_loss + self.alpha * retain_loss + + return (loss, forget_outputs) if return_outputs else loss From dccb831590114c7714f540f19c4d1b9c60b26fea Mon Sep 17 00:00:00 2001 From: Vineeth <48151992+Dornavineeth@users.noreply.github.com> Date: Thu, 27 Mar 2025 16:12:33 -0400 Subject: [PATCH 08/10] Add structure to contributions, setup leaderboard, update documentation (#8) * docs: updates, small corrections, re-formats * modified ruff commands * modified ruff commands * CI/CD minor updates * added contributing + leaderboard * fix minor spelling misatkes * docs: bunch of minor updates * docs fixes --------- Co-authored-by: molereddy --- .github/ISSUE_TEMPLATE/feature-request.yaml | 9 + .github/PULL_REQUEST_TEMPLATE.md | 5 +- .github/workflows/tests.yml | 7 +- .pre-commit-config.yaml | 6 +- Makefile | 6 +- README.md | 50 +++-- community/benchmarks/template/README.md | 51 +++++ community/benchmarks/template/run.sh | 18 ++ community/leaderboard.md | 168 ++++++++++++++++ community/methods/template/README.md | 39 ++++ community/methods/template/run.sh | 13 ++ docs/components.md | 4 +- docs/contributing.md | 207 ++++++++++++++++++++ docs/experiments.md | 4 +- docs/{results.md => repro.md} | 2 + setup.py | 9 +- setup_data.py | 2 +- src/trainer/unlearn/rmu.py | 105 ++++++---- 18 files changed, 620 insertions(+), 85 deletions(-) create 
mode 100644 community/benchmarks/template/README.md create mode 100644 community/benchmarks/template/run.sh create mode 100644 community/leaderboard.md create mode 100644 community/methods/template/README.md create mode 100644 community/methods/template/run.sh create mode 100644 docs/contributing.md rename docs/{results.md => repro.md} (98%) diff --git a/.github/ISSUE_TEMPLATE/feature-request.yaml b/.github/ISSUE_TEMPLATE/feature-request.yaml index 22ec671..e43e913 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yaml +++ b/.github/ISSUE_TEMPLATE/feature-request.yaml @@ -32,3 +32,12 @@ body: label: Motivation description: | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. + + - type: textarea + id: implementation + validations: + required: false + attributes: + label: Implementation + description: | + Please describe your proposed solution in detail. Outline the implementation approach, including any key technical considerations. If there are challenges or blockers preventing implementation, specify them along with potential workarounds or dependencies. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index fc69076..fbc9a07 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -6,6 +6,5 @@ Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). -- [ ] Have you gone through the documentation of adding new [components](../docs/components.md)? -- [ ] Did you make sure to update the documentation with your changes? Here are the pointers to documentation - [documentation guidelines](../README.md#-further-documentation). \ No newline at end of file +- [ ] Have you gone through the contributions [guide](../docs/contributing.md)? +- [ ] Are your changes documented? Read documentation guidelines [here](../README.md#-further-documentation). 
\ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4277042..f1b0afa 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -46,16 +46,11 @@ jobs: run: | python -m pip install --upgrade pip pip install ruff==0.6.6 - # python -m pip install git+https://github.com/huggingface/transformers.git - # python -m pip install ".[torch,dev]" - name: Check Quality run: make quality - # - name: Check Style - # run: make style - # - name: Test with pytest # run: | - # cd LLaMA-Factory + # cd # make test diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 97a7341..97b7cdf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,6 @@ repos: rev: v0.6.9 hooks: - id: ruff - args: [--fix, scripts, src] - - id: ruff-format - args: [scripts, src] \ No newline at end of file + args: [check, --fix, scripts, src, setup.py, setup_data.py] + - id: ruff + args: [format, scripts, src, setup.py setup_data.py] \ No newline at end of file diff --git a/Makefile b/Makefile index 179c2d5..11bdbb1 100644 --- a/Makefile +++ b/Makefile @@ -3,10 +3,12 @@ check_dirs := scripts src #setup.py quality: - ruff check $(check_dirs) + ruff check $(check_dirs) setup.py setup_data.py + ruff format --check $(check_dirs) setup.py setup_data.py style: - ruff --format $(check_dirs) + ruff check $(check_dirs) setup.py setup_data.py --fix + ruff format $(check_dirs) setup.py setup_data.py test: CUDA_VISIBLE_DEVICES= pytest tests/ diff --git a/README.md b/README.md index 403754b..986b874 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,17 @@ ## πŸ“– Overview -We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 7+ LLMs. Each of these can be easily extended to incorporate more variants. +We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 6+ LLM architectures. Each of these can be easily extended to incorporate more variants. We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field. -> ⚠️ **Notice (Updated: February 27, 2025)** -> This repository replaces the original TOFU codebase, which can be found at [`github.com/locuslab/tofu`](https://github.com/locuslab/tofu) and isn't maintained anymore. +### πŸ“’ Updates + +#### [Mar 27, 2025] +- **Easier contributions, leaderboard and reproducibility**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details. + +#### [Feb 27, 2025] +⚠️ **Repository Update**: This repo replaces the original TOFU codebase at [`github.com/locuslab/tofu`](https://github.com/locuslab/tofu), which is no longer maintained. 
## πŸ—ƒοΈ Available Components @@ -38,22 +43,21 @@ We provide several variants for each of the components in the unlearning pipelin | **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU | | **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, QA-ROUGE, MIA Attacks, TruthRatio, Model Utility | | **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) | -| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2, ICLM; Additional: Phi-3.5, Phi-1.5, Gemma | +| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma | --- ## πŸ“Œ Table of Contents - πŸ“– [Overview](#-overview) +- πŸ“’ [Updates](#-updates) - πŸ—ƒοΈ [Available Components](#%EF%B8%8F-available-components) - ⚑ [Quickstart](#-quickstart) - - πŸ› οΈ [Environment Setup](#-environment-setup) - - πŸ’Ύ [Data Setup](#-data-setup) - πŸ”„ [Updated TOFU benchmark](#-updated-tofu-benchmark) - πŸ§ͺ [Running Experiments](#-running-experiments) - πŸš€ [Perform Unlearning](#-perform-unlearning) - πŸ“Š [Perform an Evaluation](#-perform-an-evaluation) - πŸ“œ [Running Baseline Experiments](#-running-baseline-experiments) -- βž• [How to Add New Components](#-how-to-add-new-components) +- βž• [How to Contribute](#-how-to-contribute) - πŸ“š [Further Documentation](#-further-documentation) - πŸ”— [Support & Contributors](#-support--contributors) - πŸ“ [Citing this work](#-citing-this-work) @@ -64,20 +68,16 @@ We provide several variants for each of the components in the unlearning pipelin ## ⚑ Quickstart -### πŸ› οΈ Environment Setup - ```bash +# environment setup conda create -n unlearning python=3.11 conda activate unlearning pip install . pip install --no-build-isolation flash-attn==2.6.3 -``` - -### πŸ’Ύ Data Setup -Download the log files containing metric results from the models used in the supported benchmarks (including the retain model logs used to compare the unlearned models against). -```bash -python setup_data.py # populates saves/eval with evaluation results of the uploaded models +# data setup +python setup_data.py # saves/eval now contains evaluation results of the uploaded models +# Downloads log files with metric eval results (incl retain model logs) from the models used in the supported benchmarks. ``` --- @@ -103,7 +103,7 @@ python src/train.py --config-name=unlearn.yaml experiment=unlearn/tofu/default \ forget_split=forget10 retain_split=retain90 trainer=GradAscent task_name=SAMPLE_UNLEARN ``` -- `experiment`- Path to the Hydra config file [`configs/experiment/unlearn/muse/default.yaml`](configs/experiment/unlearn/tofu/default.yaml) with default experimental settings for TOFU unlearning, e.g. train dataset, eval benchmark details, model paths etc.. +- `experiment`- Path to the Hydra config file [`configs/experiment/unlearn/tofu/default.yaml`](configs/experiment/unlearn/tofu/default.yaml) with default experimental settings for TOFU unlearning, e.g. train dataset, eval benchmark details, model paths etc.. - `forget_split/retain_split`- Sets the forget and retain dataset splits. - `trainer`- Load [`configs/trainer/GradAscent.yaml`](configs/trainer/GradAscent.yaml) and override the unlearning method with the handler (see config) implemented in [`src/trainer/unlearn/grad_ascent.py`](src/trainer/unlearn/grad_ascent.py). 
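The same Hydra dot-path syntax also overrides nested trainer settings. Below is a minimal sketch — the `trainer.args.*` keys are the ones used in `scripts/tofu_unlearn.sh`, and the values are illustrative rather than tuned recommendations:

```bash
# Sketch: unlearning run with inline trainer hyperparameter overrides.
# Override keys mirror scripts/tofu_unlearn.sh; values are illustrative.
python src/train.py --config-name=unlearn.yaml experiment=unlearn/tofu/default \
  forget_split=forget10 retain_split=retain90 trainer=GradAscent task_name=SAMPLE_UNLEARN \
  trainer.args.per_device_train_batch_size=4 \
  trainer.args.gradient_accumulation_steps=4 \
  trainer.args.gradient_checkpointing=true
```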
@@ -126,24 +126,21 @@ For more details about creating and running evaluations, refer [`docs/evaluation ### πŸ“œ Running Baseline Experiments -The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks. The expected results for these are in [`docs/results.md`](docs/results.md). +The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks. The expected results for these are in [`docs/repro.md`](docs/repro.md). ```bash bash scripts/tofu_unlearn.sh bash scripts/muse_unlearn.sh ``` ---- +The above scripts are not tuned and uses default hyper parameter settings. We encourage you to tune your methods and add your final results in [`community/leaderboard.md`](community/leaderboard.md). -## βž• How to Add New Components +--- -Adding a new component (trainer, evaluation metric, benchmark, model, or dataset) requires defining a new class, registering it, and creating a configuration file. Learn more about adding new components in [`docs/components.md`](docs/components.md). +## βž• How to Contribute -Please feel free to raise a pull request for any new features after setting up the environment in development mode. +If you are interested in contributing to our work, please have a look at [`contributing.md`](docs/contributing.md) guide. -```bash -pip install .[dev] -``` ## πŸ“š Further Documentation @@ -151,11 +148,12 @@ For more in-depth information on specific aspects of the framework, refer to the | **Documentation** | **Contains** | |------------------------------------------------|--------------------------------------------------------------------------------------------------------------------| -| [`docs/components.md`](docs/components.md) | Instructions on how to add new components such as trainers, benchmarks, metrics, models, datasets, etc. | +| [`docs/contributing.md`](docs/contributing.md) | Instructions on how to add new methods, benchmarks, components such as trainers, benchmarks, metrics, models, datasets, etc. | | [`docs/evaluation.md`](docs/evaluation.md) | Detailed instructions on creating and running evaluation metrics and benchmarks. | | [`docs/experiments.md`](docs/experiments.md) | Guide on running experiments in various configurations and settings, including distributed training, fine-tuning, and overriding arguments. | | [`docs/hydra.md`](docs/hydra.md) | Explanation of the Hydra features used in configuration management for experiments. | -| [`docs/results.md`](docs/results.md) | Reference results from various unlearning methods run using this framework on TOFU and MUSE benchmarks. | +| [`community/leaderboard.md`](community/leaderboard.md) | Reference results from various unlearning methods run using this framework on TOFU and MUSE benchmarks. | +| [`docs/repro.md`](docs/repro.md) (deprecated) | Results are provided solely for reproducibility purposes, without any parameter tuning. | --- ## πŸ”— Support & Contributors diff --git a/community/benchmarks/template/README.md b/community/benchmarks/template/README.md new file mode 100644 index 0000000..855952f --- /dev/null +++ b/community/benchmarks/template/README.md @@ -0,0 +1,51 @@ +# TITLE + +- Paper title, authors, links. + +Provide a concise summary of your benchmark details and its contributions. Please avoid using images to keep the repository size manageable. + +# Datasets + +Use a clear and consistent naming convention for dataset splits. 
+ +- [ ] Provide a link to find/download the datasets (preferably HuggingFace). + +# Models + + +- [ ] Upload any unlearning target or reference retain models for unlearning preferably on HuggingFace and provide the path. +- [ ] Model creation details and how they fit in benchmark. + +# Baselines & Results + +Discuss the baselines used and their results. + + +## Setup +Please include the experimental setup for the baselines + +- [ ] **Hyperparameters & Search Space:** Specify key hyperparameters, their search ranges, number of trials etc. +- [ ] **Computational Setup:** Mention the type and number of GPUs used. +- [ ] **DeepSpeed Configuration:** If any modifications were made to the default DeepSpeed config, specify them here. (You may include the config as a code block.) +- [ ] **Other Details:** Any additional setup details crucial for reproducing your method. + +To replicate your results, provide a `run.sh` script that contains all necessary commands to reproduce the final results. Ensure the script is well-documented. + + +# Citation + + +If you use this work, please cite: + +```bibtex + + + +@misc{openunlearning2025, + title={OpenUnlearning: A Unified Framework for LLM Unlearning Benchmarks}, + author={Dorna, Vineeth and Mekala, Anmol and Zhao, Wenlong and McCallum, Andrew and Kolter, J Zico and Maini, Pratyush}, + year={2025}, + howpublished={\url{https://github.com/locuslab/open-unlearning}}, + note={Accessed: February 27, 2025} +} +``` \ No newline at end of file diff --git a/community/benchmarks/template/run.sh b/community/benchmarks/template/run.sh new file mode 100644 index 0000000..a5335b2 --- /dev/null +++ b/community/benchmarks/template/run.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +######################################################################################################################## +########################################### RETAIN Finetuned ####$###################################################### +######################################################################################################################## + + + +######################################################################################################################### +############################################ FULL Finetuned models ###################################################### +######################################################################################################################### + + + + +######################################################################################################################### +############################################ Baseline methods ####$###################################################### +######################################################################################################################### diff --git a/community/leaderboard.md b/community/leaderboard.md new file mode 100644 index 0000000..8803932 --- /dev/null +++ b/community/leaderboard.md @@ -0,0 +1,168 @@ +
# Leaderboard

We encourage the community to develop new methods, optimize them for specific benchmarks, and compare results with existing approaches.

To implement a new method, refer to our [contributing guide](../docs/contributing.md).

> **Note:** The [repro.md](../docs/repro.md) file is maintained for reproducibility purposes. However, we encourage contributors to update the leaderboard table instead of the reproducibility table. We will continue refining and tuning baseline methods to keep the leaderboard up to date.

### TOFU unlearning on the `Llama-3.2-1B-Instruct` architecture
| Method    | forget01 forget_quality | forget01 model_utility | forget05 forget_quality | forget05 model_utility | forget10 forget_quality | forget10 model_utility |
|-----------|-------------------------|------------------------|-------------------------|------------------------|-------------------------|------------------------|
| Finetuned | 0.01                    | 0.60                   | 2.96e-13                | 0.6                    | 8.08e-22                | 0.6                    |
| Retain    | 1.0                     | 0.60                   | 1.0                     | 0.6                    | 1.0                     | 0.59                   |
### MUSE unlearning on the benchmark's target models

| Method    | News forget_knowmem_ROUGE | News forget_verbmem_ROUGE | News privleak | News retain_knowmem_ROUGE | Books forget_knowmem_ROUGE | Books forget_verbmem_ROUGE | Books privleak | Books retain_knowmem_ROUGE |
|-----------|---------------------------|---------------------------|---------------|---------------------------|----------------------------|----------------------------|----------------|----------------------------|
| Finetuned | 0.64                      | 0.58                      | -99.81        | 0.55                      | 0.47                       | 1.0                        | -57.26         | 0.69                       |
| Retain    | 0.33                      | 0.21                      | 0             | 0.56                      | 0.3                        | 0.14                       | 0              | 0.69                       |
diff --git a/community/methods/template/README.md b/community/methods/template/README.md new file mode 100644 index 0000000..7facb01 --- /dev/null +++ b/community/methods/template/README.md @@ -0,0 +1,39 @@ +# TITLE + +- Paper title, authors, links. + + +Provide a concise summary of your method details and its contributions. Please avoid using images to keep the repository size manageable. + +# Setup + +Please include the experimental setup such as + +- [ ] **Hyperparameters & Search Space:** Specify key hyperparameters, their search ranges, number of trials etc. +- [ ] **Computational Setup:** Mention the type and number of GPUs used. +- [ ] **DeepSpeed Configuration:** If any modifications were made to the default DeepSpeed config, specify them here. (You may include the config as a code block.) +- [ ] **Other Details:** Any additional setup details crucial for reproducing your method. + +# Results + +To replicate your results, provide a `run.sh` script that contains all necessary commands to reproduce the final results. Ensure the script is well-documented. + +It would be appreciated if you can upload the final unlearned model(s) along with their `evals` folders to HuggingFace and provide the link(s) here. As the evaluations are updated, this would help us re-evaluate your model(s). + +# Citation + + +If you use this work, please cite: + +```bibtex + + + +@misc{openunlearning2025, + title={OpenUnlearning: A Unified Framework for LLM Unlearning Benchmarks}, + author={Dorna, Vineeth and Mekala, Anmol and Zhao, Wenlong and McCallum, Andrew and Kolter, J Zico and Maini, Pratyush}, + year={2025}, + howpublished={\url{https://github.com/locuslab/open-unlearning}}, + note={Accessed: February 27, 2025} +} +``` \ No newline at end of file diff --git a/community/methods/template/run.sh b/community/methods/template/run.sh new file mode 100644 index 0000000..8c165e5 --- /dev/null +++ b/community/methods/template/run.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +######################################################################################################################## +########################################### Hyper parameter tuning ##################################################### +######################################################################################################################## + +# Optional + +######################################################################################################################## +########################################### Final best parameters ##################################################### +######################################################################################################################## + +# Required to replicate your results \ No newline at end of file diff --git a/docs/components.md b/docs/components.md index e75f8df..016932c 100644 --- a/docs/components.md +++ b/docs/components.md @@ -19,7 +19,7 @@ This process involves three main steps: 6. [Collator](#collator) - Handles data collation logic 7. [Experiment](#experiment) - Combines components into a final experiment config -__Note:__ adding each component requires Hydra config management features, which are documented in [`docs/hydra.md`](../docs/hydra.md). +> [!Note] adding each component requires Hydra config management features, which are documented in [`docs/hydra.md`](../docs/hydra.md). 
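For concreteness, here is a minimal sketch of the class-plus-registration pattern for a new trainer, following the RMU wiring visible earlier in this patch (`src/trainer/unlearn/rmu.py` and `src/trainer/__init__.py`). `MyMethod` and `my_weight` are hypothetical names for illustration; only `GradDiff`, `TRAINER_REGISTRY`, and `_register_trainer` appear in the diffs above.

```python
# src/trainer/unlearn/my_method.py (hypothetical)
from trainer.unlearn.grad_diff import GradDiff


class MyMethod(GradDiff):
    def __init__(self, my_weight=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)  # inherits gamma, alpha, retain_loss_type
        self.my_weight = my_weight  # hypothetical method-specific hyperparameter

    def compute_loss(self, model, inputs, return_outputs=False):
        # Forget term: plain gradient ascent on the forget split (illustrative).
        forget_inputs = inputs["forget"]
        forget_outputs = model(
            input_ids=forget_inputs["input_ids"],
            attention_mask=forget_inputs["attention_mask"],
            labels=forget_inputs["labels"],
        )
        forget_loss = -forget_outputs.loss
        # Retain term: reuse GradDiff's retain loss on the retain split.
        retain_loss = self.compute_retain_loss(model=model, retain_inputs=inputs["retain"])
        loss = self.gamma * forget_loss + self.alpha * self.my_weight * retain_loss
        return (loss, forget_outputs) if return_outputs else loss


# src/trainer/__init__.py — register it next to the existing trainers:
#   from trainer.unlearn.my_method import MyMethod
#   _register_trainer(MyMethod)
# and add a configs/trainer/MyMethod.yaml with `handler: MyMethod`, mirroring RMU.yaml.
```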
--- @@ -147,7 +147,7 @@ To add a new model architecture: ### Implement and register a handler For all the models currently supported, HuggingFace's `AutoModelForCausalLM` and `AutoTokenizer` are used, and therefore the user doesn't need to create or register any handler. -__Note__: Currently, we do not support loading models modified with LoRA and related variants. If you wish use such features, please create define and register model handlers for this logic in [`src/model`](../src/model) and provide the config info as discussed next. +> [!Note]: Currently, we do not support loading models modified with LoRA and related variants. If you wish use such features, please create define and register model handlers for this logic in [`src/model`](../src/model) and provide the config info as discussed next. ### Add to configs Model configurations contain details required to load the model+tokenizer such as paths, chat templating arguments, LoRA parameters etc. in [`configs/models`](../configs/models/). diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..3e398f5 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,207 @@ +# Contributing + +Everyone is welcome to contribute, and every contribution is valued. Aside from coding components, answering questions, assisting others, and improving documentation are all appreciated. + +You can also help by spreading the word! If you find this project useful, please share it with others, cite it, link it on your repositories and posts, or simply ⭐️ the repo to show your support. + +> 🀝 This guide is heavily borrowed from awesome [transformers](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md) guide to contributing. + +## Ways to Contribute + +There are several ways you can contribute to OpenUnlearning: + +* Fix issues with the existing code. +* Submit issues related to bugs or desired new features. +* Support new components (models, datasets, collator etc). +* Implement new unlearning methods. +* Implement new evaluations. +* Contribute to the documentation. + +## Fixing Issues + +If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](#create-a-pull-request) and open a Pull Request! + +## Submitting a Bug-Related Issue or Feature Request + +Do your best to follow these guidelines when submitting a bug-related issue or a feature request. It will make it easier for us to come back to you quickly and with good feedback. + +### Did You Find a Bug? + +Before you report an issue, we would really appreciate it if you could **make sure the bug was not already reported** (use the search bar on GitHub under Issues). Please try to ensure that the bug is in OpenUnlearning itself, and not your code. + +Please include the following information in your issue so we can quickly resolve it: + +* A short, self-contained, code snippet that allows us to reproduce the bug. +* The **full** traceback if an exception is raised. +* The hardware used to run the experiment, including specifications such as the number and type of GPUs etc. +* The hydra config file corresponding to the experiment if needed (since these files ae long you may link them or use a markdown dropdown in your issue). +* Attach any other additional information, like screenshots, you think may help. + +### Do You Want a New Feature? + +If there is a new feature you'd like to see in OpenUnlearning, please open an issue and describe: + +1. What is the *motivation* behind this feature? 
Is it related to a problem or frustration with the library? Is it something you worked on and think it could benefit the community? + + Whatever it is, we'd love to hear about it! + +2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better we'll be able to help you. +3. Provide a *code snippet* that demonstrates the features usage. +4. If the feature is related to a paper, please include a link. + +## Do You Want to Support New Components? + +Adding a new component listed below requires defining a new class, registering it, and creating a configuration file. Learn more about adding new components in [`docs/components.md`](docs/components.md). + +1. [Trainer](components#trainer) - Algorithm used in LLM training or unlearning +2. [Dataset](components#dataset) - Dataset class for preprocessing raw data +3. [Evaluation Metric](components#evaluation-metric) - Metric class implementing model evaluation +4. [Benchmark](components#benchmark) - Suite combining multiple evaluation metrics +5. [Model](components#model) - LLM used in unlearning +6. [Collator](components#collator) - Handles data collation logic +7. [Experiment](components#experiment) - Combines components into a final experiment config + +> **IMPORTANT** πŸš€ +> **We especially encourage** contributions of methods and benchmarks that you've created, since you best understand them and know how to use them. We are ready to expedite their integration into OpenUnlearning. +> When facing difficulties implementing any component, please contact the maintainers to join our discord where we can go in detail with the implementations. + +## Contributing a New Unlearning Method + +### 1. Implement an Unlearning Trainer + +Your method might require a custom loss function, or other trainer related modifications which go here. +Refer to our [Trainer implementation guide](components.md#trainer) to ensure your method integrates well with our framework. + +### 2. Detail Commands to Be Run + +Some methods might involve multiple commands or steps while unlearning: ensure you write a clear `.sh` file that documents this. + +### 3. Run and Tune Your Method on Relevant Benchmarks + +- Once implemented, evaluate your method on applicable benchmarks using the best possible parameters. +- Create a folder [`community/methods/`](../community/methods) and include a README file in it, explaining the method details, hyper-parameters, strategy/logic for selecting the best model for unlearning etc. +- Include a bash script `run.sh` with the exact bash command needed to replicate your results. + +### 4. Update Leaderboard and Upload Model + +Don't forget to add your results to the [leaderboard](results.md) and upload your unlearned model to HuggingFace for broader accessibility and reproducibility. + +```bash +pip install huggingface_hub +huggingface-cli login + +huggingface-cli repo create {benchmark}-{model}-{datasplit}-{method} +cd + +git init +git remote add origin https://huggingface.co//{benchmark}-{model}-{datasplit}-{method} +git add . +git commit -m "Initial commit" +git push origin main +``` + +--- + +## Contributing to Unlearning Benchmark Evaluations + +Evaluating LLM unlearning is essential for assessing the effectiveness of different unlearning methods. While various benchmarks and metrics exist, identifying the most suitable ones for capturing the nuances of unlearning remains an open challenge. 
+ +Your contributions toward defining or improving evaluation methods can significantly advance unlearning research. By proposing reliable benchmarks, you help ensure that unlearning methods are both effective and aligned with real-world requirements. + +- To add a new unlearning evaluation metric, refer to our [Metric Implementation Guide]((components.md#evaluation-metric).). +- To integrate new datasets and models, follow our [Components Guide](components.md). + +### Steps to add a new Unlearning Benchmark + +1. **Prepare Datasets & Models** – Create your dataset and train models to generate fine-tuned or retained models. +2. **Define a New Benchmark** (if needed) – Follow the [Benchmark Guide]((components.md#benchmark)) to implement a new evaluation benchmark. +3. **Run and Tune Baseline Methods** – Evaluate existing unlearning methods on your benchmark and optimize them. +4. **Document & Share Findings** – Provide detailed steps for reproduction in [`community/benchmarks/`](../community/benchmarks). + +--- + +## Do You Want to Add Documentation? + +We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be happy to make the changes or help you make a contribution! + +--- + +## Create a Pull Request + +Before writing any code, we strongly advise you to search through the existing PRs or issues to make sure nobody is already working on the same thing. If you are unsure, it is always a good idea to open an issue to get some feedback. + +Follow the steps below to start contributing: + +1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code under your GitHub user account. + +2. Clone your fork to your local disk, and add the base repository as a remote: + + ```bash + git clone git@github.com:/open-unlearning.git + cd open-unlearning + git remote add upstream https://github.com/locuslab/open-unlearning.git + ``` + +3. You can work on the forked main branch or create a new branch to hold your development changes: + + ```bash + git checkout -b a-descriptive-name-for-my-changes + ``` + +4. Set up the environment in dev mode after following steps in [Quick Start](../README.md#-quickstart). This installs other packages such as `ruff`, `precommit` etc. + + ```bash + pip install .[dev] + ``` + +5. Develop the features in your fork/branch. + + As you work on your code, you should make sure the code is linted and formatted correctly. + + OpenUnlearning relies on `ruff` to lint & format its source code consistently. After you make changes, to check the quality of code, run + + ```bash + make quality + ``` + + If you prefer to apply the style corrections: + + ```bash + make style + ``` + + Once you're happy with your changes, add the changed files with `git add` and record your changes locally with `git commit`: + + ```bash + git add modified_file.py + git commit + ``` + + Please remember to write [good commit messages](https://chris.beams.io/posts/git-commit/) to clearly communicate the changes you made! 
+ + To keep your copy of the code up to date with the original repository, rebase your branch on `upstream/branch` *before* you open a pull request or if requested by a maintainer: + + ```bash + git fetch upstream + git rebase upstream/main + ``` + + Push your changes to your branch: + + ```bash + git push -u origin a-descriptive-name-for-my-changes + ``` + + If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally. + +6. Now you can go to your fork of the repository on GitHub and click on **Pull Request** to open a pull request. Make sure you tick off all the boxes on our [checklist](#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review. + +7. Please bear with us maintainers regarding the changes we request! We want to ensure we keep the repository clean and easily extensible. As you make your updates, you may want to work in your local branch and push the changes to your fork, since everyone can see the changes in the pull request. Changes pushed to the fork will automatically appear in the pull request. + +### Pull Request Checklist + +☐ The pull request title should summarize your contribution. +☐ If your pull request addresses an issue, please mention the issue number in the pull request description to make sure they are linked (and people viewing the issue know you are working on it). +☐ To indicate a work in progress, please prefix the title with `[WIP]`. These are useful to avoid duplicated work, and to differentiate it from PRs ready to be merged. +☐ Make sure existing tests and checks, if any, pass. +☐ Make sure methods have informative docstrings. \ No newline at end of file diff --git a/docs/experiments.md b/docs/experiments.md index 8610961..b570d3e 100644 --- a/docs/experiments.md +++ b/docs/experiments.md @@ -59,7 +59,7 @@ paths.output_dir=saves/unlearn/NPO/evals ``` -**Note:** The unlearning experiments support evaluation during the unlearning training. But this is supported only on a single GPU, evaluation can be performed during unlearning itself. When multiple GPUs are used to train, checkpoints must be stored and evaluated after training. +> [!Note]: The unlearning experiments support evaluation during the unlearning finetuning. But this is supported only on a single GPU. When multiple GPUs are used to train, checkpoints must be stored and evaluated after training. --- @@ -242,7 +242,7 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \ src/train.py --config-name=unlearn.yaml experiment=unlearn/muse/default.yaml task_name=DISTRIBUTED_TRAIN ``` -**Note:** Evaluation runs are designed to work only a single GPU (this includes running evaluation during training). To run an evaluation job, modify your command to make only one GPU visible (assuming one GPU is enough for inference): +> [!Note]: Evaluation runs are designed to work on only a single GPU (this includes running evaluation during training). 
To run an evaluation job, modify your command to make only one GPU visible (assuming one GPU is enough for inference): ```bash CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/muse/default.yaml task_name=SAMPLE_EVAL diff --git a/docs/results.md b/docs/repro.md similarity index 98% rename from docs/results.md rename to docs/repro.md index 3af7cb6..ac64ac3 100644 --- a/docs/results.md +++ b/docs/repro.md @@ -4,6 +4,8 @@ +>​For results where methods have been tuned for optimal performance, please refer to the [`community/leaderboard`](../community/leaderboard.md). + The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks. ```bash bash scripts/tofu_unlearn.sh diff --git a/setup.py b/setup.py index 79c6dbc..209335c 100644 --- a/setup.py +++ b/setup.py @@ -7,8 +7,8 @@ setup( name="open-unlearning", version="0.1.0", - author="Vineeth Dorna, Anmol Reddy Mekala", - author_email="vineethdornal@gmail.com, m.anmolreddy@gmail.com", + author="Vineeth Dorna, Anmol Mekala", + author_email="vineethdorna@gmail.com, m.anmolreddy@gmail.com", description="A library for machine unlearning in LLMs.", long_description=open("README.md").read(), long_description_content_type="text/markdown", @@ -16,7 +16,10 @@ packages=find_packages(), install_requires=requirements, # Uses requirements.txt extras_require={ - "dev": ["pre-commit==4.0.1"], # Install using `pip install .[dev]` + "dev": [ + "pre-commit==4.0.1", + "ruff==0.6.9", + ], # Install using `pip install .[dev]` }, python_requires=">=3.11", ) diff --git a/setup_data.py b/setup_data.py index 358779c..760679b 100644 --- a/setup_data.py +++ b/setup_data.py @@ -14,4 +14,4 @@ allow_patterns="*.jsonl", repo_type="dataset", local_dir="data", -) \ No newline at end of file +) diff --git a/src/trainer/unlearn/rmu.py b/src/trainer/unlearn/rmu.py index 391bd6a..d990d3a 100644 --- a/src/trainer/unlearn/rmu.py +++ b/src/trainer/unlearn/rmu.py @@ -7,11 +7,14 @@ class RMU(GradDiff): - def __init__(self, - module_regex="model\.layers\.7", - trainable_params_regex=["model\.layers\.(5|6|7)\.mlp\.down_proj\.weight"], - steering_coeff=20, - *args, **kwargs): + def __init__( + self, + module_regex="model\.layers\.7", + trainable_params_regex=["model\.layers\.(5|6|7)\.mlp\.down_proj\.weight"], + steering_coeff=20, + *args, + **kwargs, + ): """ RMU Trainer that fine-tunes only specific layers and parameters using regex-based filtering. @@ -20,40 +23,46 @@ def __init__(self, trainable_param_paths (list of str): List of regex patterns for trainable parameters. 
""" super().__init__(*args, **kwargs) - + # Create reference model if not already set if self.ref_model is None: self.ref_model = self._prepare_ref_model(self.model) # Unfreeze only the selected parameters - self.trainable_params_regex = trainable_params_regex # Regex for selecting params - + self.trainable_params_regex = ( + trainable_params_regex # Regex for selecting params + ) + # Get actual module references self.module_regex = module_regex # Regex for selecting modules self.model_module = self._get_matching_module(self.model, self.module_regex) self.ref_module = self._get_matching_module(self.ref_model, self.module_regex) self.steering_coeff = steering_coeff self.control_vec = None - - + def create_optimizer(self): self._freeze_all_params(self.model, False) # This makes the optimizer to select only trainable params self._set_trainable_params(self.model, self.trainable_params_regex, True) super().create_optimizer() self._freeze_all_params(self.model, True) - - + def _get_matching_module(self, model, module_regex): """Returns a single module matching the given regex from a DeepSpeed/DDP-wrapped model.""" # Handle DeepSpeed and DDP-wrapped models by accessing the underlying module if isinstance(model, deepspeed.DeepSpeedEngine): model = model.module # Extract the actual PyTorch model inside - matched_modules = {name: module for name, module in model.named_modules() if re.fullmatch(module_regex, name)} + matched_modules = { + name: module + for name, module in model.named_modules() + if re.fullmatch(module_regex, name) + } if len(matched_modules) > 1: - raise ValueError(f"More than one module matched with {module_regex}: {list(matched_modules.keys())}") + raise ValueError( + f"More than one module matched with {module_regex}: {list(matched_modules.keys())}" + ) elif not matched_modules: raise ValueError(f"No module matched with {module_regex}") @@ -70,45 +79,59 @@ def _set_trainable_params(self, model, trainable_params_regex, requires_grad=Tru if any(re.fullmatch(pattern, name) for pattern in trainable_params_regex): param.requires_grad = requires_grad # print(f"{name}:requires_grad\t{requires_grad}") - + def forward_with_cache(self, model, inputs, module, no_grad=True): """Performs a forward pass while caching the output of a specified module.""" cache = [] + def hook(module, input, output): if isinstance(output, tuple): cache.append(output[0]) else: cache.append(output) - return None - + return None + hook_handle = module.register_forward_hook(hook) - with torch.set_grad_enabled(not(no_grad)): + with torch.set_grad_enabled(not (no_grad)): outputs = model(**inputs) hook_handle.remove() return cache[0], outputs - + def get_control_vector(self, dim): if self.control_vec is None: - random_vector = torch.rand(1,1, dim) - self.control_vec = random_vector / torch.norm(random_vector) * self.steering_coeff + random_vector = torch.rand(1, 1, dim) + self.control_vec = ( + random_vector / torch.norm(random_vector) * self.steering_coeff + ) return self.control_vec - def compute_activation_loss(self, activation1, activation2, mask): - squared_diff = torch.nn.functional.mse_loss(activation1, activation2, reduction="none") # Shape (b, s, d) + squared_diff = torch.nn.functional.mse_loss( + activation1, activation2, reduction="none" + ) # Shape (b, s, d) expanded_mask = mask.unsqueeze(-1).expand_as(squared_diff) # Shape: [b, s, d] - squared_diff_sum = (squared_diff * expanded_mask).mean(dim=2).sum(dim=(1)) # Shape: [b, 1] + squared_diff_sum = ( + (squared_diff * expanded_mask).mean(dim=2).sum(dim=(1)) + 
) # Shape: [b, 1] num_tokens = mask.sum(dim=-1, keepdim=True) # Sum over seq_len, Shape: [b, 1] return (squared_diff_sum / num_tokens).mean() - + def compute_retain_loss(self, model, retain_inputs): retain_loss = 0.0 - + if self.retain_loss_type == "EMBED_DIFF": - model_retain_activations, _ = self.forward_with_cache(model, retain_inputs, module=self.model_module, no_grad=False) - ref_retain_activations, _ = self.forward_with_cache(self.ref_model, retain_inputs, module=self.ref_module, no_grad=True) - mask = (retain_inputs['labels'] != -100) # Shape: [b, s] - retain_loss = self.compute_activation_loss(model_retain_activations, ref_retain_activations.to(model_retain_activations.device), mask) + model_retain_activations, _ = self.forward_with_cache( + model, retain_inputs, module=self.model_module, no_grad=False + ) + ref_retain_activations, _ = self.forward_with_cache( + self.ref_model, retain_inputs, module=self.ref_module, no_grad=True + ) + mask = retain_inputs["labels"] != -100 # Shape: [b, s] + retain_loss = self.compute_activation_loss( + model_retain_activations, + ref_retain_activations.to(model_retain_activations.device), + mask, + ) else: retain_loss = super().compute_retain_loss(model, retain_inputs) return retain_loss @@ -121,14 +144,22 @@ def compute_loss(self, model, inputs, return_outputs=False): "labels": forget_inputs["labels"], } - model_forget_activations, forget_outputs = self.forward_with_cache(model, forget_inputs, self.model_module, no_grad=False) + model_forget_activations, forget_outputs = self.forward_with_cache( + model, forget_inputs, self.model_module, no_grad=False + ) # If multiple datasets or concepts need unlearning, pass the control vector during processing; otherwise, default to a random vector during training. - control_vec = forget_inputs.get("control_vec", self.get_control_vector(model_forget_activations.shape[-1])) - control_vec = control_vec.to(dtype=model_forget_activations.dtype, device=model_forget_activations.device) + control_vec = forget_inputs.get( + "control_vec", self.get_control_vector(model_forget_activations.shape[-1]) + ) + control_vec = control_vec.to( + dtype=model_forget_activations.dtype, device=model_forget_activations.device + ) control_vec = control_vec.expand_as(model_forget_activations) - mask = (forget_inputs['labels'] != -100) # Shape: [b, s] - forget_loss = self.compute_activation_loss(model_forget_activations, control_vec, mask) - + mask = forget_inputs["labels"] != -100 # Shape: [b, s] + forget_loss = self.compute_activation_loss( + model_forget_activations, control_vec, mask + ) + retain_inputs = inputs["retain"] retain_inputs = { "input_ids": retain_inputs["input_ids"], @@ -136,7 +167,7 @@ def compute_loss(self, model, inputs, return_outputs=False): "labels": retain_inputs["labels"], } retain_loss = self.compute_retain_loss(model=model, retain_inputs=retain_inputs) - + loss = self.gamma * forget_loss + self.alpha * retain_loss return (loss, forget_outputs) if return_outputs else loss From 9cbabb36ad2a223affca2d59605b359b4b632137 Mon Sep 17 00:00:00 2001 From: Anmol Mekala <49127549+molereddy@users.noreply.github.com> Date: Sun, 6 Apr 2025 22:20:01 -0400 Subject: [PATCH 09/10] Add MIMIR MIA attacks support, add MIA to TOFU (#7) * update docs * setup: add license * Update results docs * re-factor MIA probs extraction code * Ensure model and tokenizer are compatible wrt vocab size * Model-tokenizer vocab check: treat as warning * Document metric usage * Merge main properly * pre-commit enforcement to be set only in dev mode * 
revert vocab size warning * Revert "revert vocab size warning" This reverts commit eded904b272c98d11a46f782b9842ceb8a3fead2. * Revert "pre-commit enforcement to be set only in dev mode" This reverts commit 2bb5e192b33d1886c316a5e1dd715a43b4797f3b. * small fixes: keep pre-commit only in dev mode prereqs, remove vocab size check * privleak changes: change relative_auc handler to privleak. move min_k logic to metric file from utils * Simplified privleak version * Access key option for datasets * feat: MIA attack implementation from mimir, pipelined into open-unlearning format * Tokenwise, example wise log probs computation * configs: add MIA new version of privleak support * feat: privleak in TOFU * More hyperparam details: bsz * load and register mia metrics, cleanup old * MIA init bug fixes + convert numpy scalars to floats * fix: load ref model for ref mia * misc: comments and nits * fix: ref attack load to same device, log things * describe standard mia score signage * fix: handle division by 0 * standardize auc convention (fix muse's) make relative diff metric separate and clear * generalise the ks-test usage beyond forget quality * update docs: repo updates, clarify example commands, small fixes * fix: wrong command in readme * fit gradnorm to conventions * updated list of metrics * Add MIA attack configs * load holdout dataset from local path (must be updated) * docs: Update MIA count * remove hanging comments * fixes: docs, comments etc. addressing comments * Replace scrollable with dropdown * Ruff style changes * Clarify metrics kwargs, small docs updates * Delete todos.txt * Alert rendering * Mark alerts again * More alert rendering * more rendering changes * Add star history * Make heading smaller * Add holdout dataset to metrics, datasets and experiment configs * Add holdout evals to scripts * Fix cfg spell mistake (#9) * fix spelling of holdout * fix typo * Add links documentation * Update misc documentation * Small docs updates * Clean up links * Clarify new target models * Add ES, EM metrics + Minor fixes (#10) * Add seed in eval * Add ES, EM metrics * Add tqdm in MIA * Fix get_data * Ruff format fix (pre-commit and code) * Update docs * Update Readme * Evaluation and leaderboard modifications (#11) * Remove different splits in leaderboard * update default metrics * Fix: Only required metrics in summary * Minor fixes in docs and gitignore --------- Co-authored-by: Vineeth <48151992+Dornavineeth@users.noreply.github.com> Co-authored-by: Dornavineeth --- .pre-commit-config.yaml | 2 +- README.md | 45 +++++-- community/leaderboard.md | 45 ++----- configs/data/datasets/MUSE_MIA.yaml | 22 ++++ .../data/datasets/MUSE_forget_privleak.yaml | 10 -- .../data/datasets/MUSE_holdout_privleak.yaml | 10 -- .../data/datasets/MUSE_retain_privleak.yaml | 9 -- configs/data/datasets/TOFU_MIA.yaml | 22 ++++ configs/eval.yaml | 3 +- configs/eval/muse.yaml | 8 ++ ...g_logprob.yaml => exact_memorization.yaml} | 11 +- .../muse_metrics/extraction_strength.yaml | 12 ++ .../holdout_minKpc_neg_logprob.yaml | 13 --- configs/eval/muse_metrics/mia_gradnorm.yaml | 18 +++ configs/eval/muse_metrics/mia_loss.yaml | 17 +++ configs/eval/muse_metrics/mia_min_k.yaml | 18 +++ .../muse_metrics/mia_min_k_plus_plus.yaml | 18 +++ configs/eval/muse_metrics/mia_reference.yaml | 18 +++ configs/eval/muse_metrics/mia_zlib.yaml | 17 +++ configs/eval/muse_metrics/privleak.yaml | 15 +-- configs/eval/tofu.yaml | 11 +- .../eval/tofu_metrics/exact_memorization.yaml | 14 +++ .../tofu_metrics/extraction_strength.yaml | 14 +++ 
configs/eval/tofu_metrics/forget_quality.yaml | 2 +- configs/eval/tofu_metrics/mia_gradnorm.yaml | 18 +++ configs/eval/tofu_metrics/mia_loss.yaml | 16 +++ configs/eval/tofu_metrics/mia_min_k.yaml | 17 +++ .../tofu_metrics/mia_min_k_plus_plus.yaml | 17 +++ configs/eval/tofu_metrics/mia_reference.yaml | 17 +++ configs/eval/tofu_metrics/mia_zlib.yaml | 16 +++ configs/eval/tofu_metrics/privleak.yaml | 17 +++ configs/experiment/eval/tofu/default.yaml | 2 + configs/experiment/examples/muse_unlearn.yaml | 6 +- configs/experiment/finetune/tofu/default.yaml | 14 +++ configs/experiment/unlearn/tofu/default.yaml | 2 + docs/components.md | 6 +- docs/contributing.md | 8 +- docs/evaluation.md | 28 +++-- docs/experiments.md | 10 +- docs/links.md | 62 ++++++++++ docs/repro.md | 27 +++-- requirements.txt | 1 - scripts/tofu_finetune.sh | 20 ++-- scripts/tofu_unlearn.sh | 16 ++- setup.py | 1 + src/data/__init__.py | 5 +- src/eval.py | 2 + src/evals/base.py | 21 ++-- src/evals/metrics/__init__.py | 30 +++-- src/evals/metrics/base.py | 2 +- src/evals/metrics/memorization.py | 64 ++++++++++ src/evals/metrics/mia/__init__.py | 100 ++++++++++++++++ src/evals/metrics/mia/all_attacks.py | 63 ++++++++++ src/evals/metrics/mia/gradnorm.py | 36 ++++++ src/evals/metrics/mia/loss.py | 16 +++ src/evals/metrics/mia/min_k.py | 26 +++++ src/evals/metrics/mia/min_k_plus_plus.py | 39 +++++++ src/evals/metrics/mia/reference.py | 25 ++++ src/evals/metrics/mia/utils.py | 70 +++++++++++ src/evals/metrics/mia/zlib.py | 29 +++++ src/evals/metrics/privacy.py | 110 ++++++------------ src/evals/metrics/utils.py | 84 ++++++++++--- src/train.py | 2 + src/trainer/utils.py | 11 ++ 64 files changed, 1161 insertions(+), 269 deletions(-) create mode 100644 configs/data/datasets/MUSE_MIA.yaml delete mode 100644 configs/data/datasets/MUSE_forget_privleak.yaml delete mode 100644 configs/data/datasets/MUSE_holdout_privleak.yaml delete mode 100644 configs/data/datasets/MUSE_retain_privleak.yaml create mode 100644 configs/data/datasets/TOFU_MIA.yaml rename configs/eval/muse_metrics/{forget_minKpc_neg_logprob.yaml => exact_memorization.yaml} (50%) create mode 100644 configs/eval/muse_metrics/extraction_strength.yaml delete mode 100644 configs/eval/muse_metrics/holdout_minKpc_neg_logprob.yaml create mode 100644 configs/eval/muse_metrics/mia_gradnorm.yaml create mode 100644 configs/eval/muse_metrics/mia_loss.yaml create mode 100644 configs/eval/muse_metrics/mia_min_k.yaml create mode 100644 configs/eval/muse_metrics/mia_min_k_plus_plus.yaml create mode 100644 configs/eval/muse_metrics/mia_reference.yaml create mode 100644 configs/eval/muse_metrics/mia_zlib.yaml create mode 100644 configs/eval/tofu_metrics/exact_memorization.yaml create mode 100644 configs/eval/tofu_metrics/extraction_strength.yaml create mode 100644 configs/eval/tofu_metrics/mia_gradnorm.yaml create mode 100644 configs/eval/tofu_metrics/mia_loss.yaml create mode 100644 configs/eval/tofu_metrics/mia_min_k.yaml create mode 100644 configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml create mode 100644 configs/eval/tofu_metrics/mia_reference.yaml create mode 100644 configs/eval/tofu_metrics/mia_zlib.yaml create mode 100644 configs/eval/tofu_metrics/privleak.yaml create mode 100644 docs/links.md create mode 100644 src/evals/metrics/mia/__init__.py create mode 100644 src/evals/metrics/mia/all_attacks.py create mode 100644 src/evals/metrics/mia/gradnorm.py create mode 100644 src/evals/metrics/mia/loss.py create mode 100644 src/evals/metrics/mia/min_k.py create mode 100644 
src/evals/metrics/mia/min_k_plus_plus.py create mode 100644 src/evals/metrics/mia/reference.py create mode 100644 src/evals/metrics/mia/utils.py create mode 100644 src/evals/metrics/mia/zlib.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 97b7cdf..408374b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,4 +6,4 @@ repos: - id: ruff args: [check, --fix, scripts, src, setup.py, setup_data.py] - id: ruff - args: [format, scripts, src, setup.py setup_data.py] \ No newline at end of file + args: [format, --check, scripts, src, setup.py setup_data.py] \ No newline at end of file diff --git a/README.md b/README.md index 986b874..77f2cd3 100644 --- a/README.md +++ b/README.md @@ -25,14 +25,32 @@ We provide efficient and streamlined implementations of the TOFU, MUSE unlearnin We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field. +--- + ### πŸ“’ Updates -#### [Mar 27, 2025] +#### [Apr 6, 2025] +- **Metrics**: Added 6 Membership Inference Attacks (MIA)β€”LOSS, ZLib, Reference, GradNorm, MinK, and MinK++β€”along with ES and EM as additional evaluation metrics. +- **TOFU Benchmark**: Now includes a holdout set and supports MIA attack-based evaluation. You can now compute MUSE's privleak on TOFU. + +#### [Mar 27, 2025] - **Easier contributions, leaderboard and reproducibility**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details. + +
+Older Updates + +#### [Mar 9, 2025] +- **Unlearning Methods**: Added support for [RMU](https://arxiv.org/abs/2403.03218) (representation-engineering based unlearning). + #### [Feb 27, 2025] ⚠️ **Repository Update**: This repo replaces the original TOFU codebase at [`github.com/locuslab/tofu`](https://github.com/locuslab/tofu), which is no longer maintained. +
+ + +--- + ## πŸ—ƒοΈ Available Components We provide several variants for each of the components in the unlearning pipeline. @@ -41,7 +59,7 @@ We provide several variants for each of the components in the unlearning pipelin |------------------------|----------------------| | **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/) | | **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU | -| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, QA-ROUGE, MIA Attacks, TruthRatio, Model Utility | +| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, QA-ROUGE, 6 MIA Attacks, TruthRatio, Model Utility | | **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) | | **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma | @@ -77,14 +95,15 @@ pip install --no-build-isolation flash-attn==2.6.3 # data setup python setup_data.py # saves/eval now contains evaluation results of the uploaded models -# Downloads log files with metric eval results (incl retain model logs) from the models used in the supported benchmarks. +# Downloads log files with metric eval results (incl retain model logs) from the models +# used in the supported benchmarks. ``` --- ### πŸ”„ Updated TOFU benchmark -We've updated Open-Unlearning's TOFU benchmark target models to use a wider variety of newer architectures with sizes varying from 1B to 8B. These include LLaMA 3.2 1B, LLaMA 3.2 3B, LLaMA 3.1 8B, and the original LLaMA-2 7B from [the old version of TOFU](github.com/locuslab/tofu). +We've updated Open-Unlearning's TOFU benchmark target models to use a wider variety of newer architectures with sizes varying from 1B to 8B. These include LLaMA 3.2 1B, LLaMA 3.2 3B, LLaMA 3.1 8B, and the original LLaMA-2 7B (re-created) target models from [the old version of TOFU](github.com/locuslab/tofu). For each architecture, we have finetuned with four different splits of the TOFU datasets: `full`, `retain90`, `retain95`, `retain99`, for a total of 16 finetuned models. The first serves as the target (base model for unlearning) and the rest are retain models used to measure performance against for each forget split. These models are on [HuggingFace](`https://huggingface.co/collections/open-unlearning/tofu-new-models-67bcf636334ea81727573a9f0`) and the paths to these models can be set in the experimental configs or in command-line overrides. @@ -112,15 +131,18 @@ python src/train.py --config-name=unlearn.yaml experiment=unlearn/tofu/default \ An example command for launching a TOFU evaluation process on `forget10` split: ```bash +model=Llama-3.2-1B-Instruct python src/eval.py --config-name=eval.yaml experiment=eval/tofu/default \ - model=Llama-3.2-1B-Instruct \ - model.model_args.pretrained_model_name_or_path=open-unlearning/tofu_Llama-3.2-1B-Instruct_full \ + model=${model} \ + model.model_args.pretrained_model_name_or_path=open-unlearning/tofu_${model}_full \ + retain_logs_path=saves/eval/tofu_${model}_retain90/TOFU_EVAL.json \ task_name=SAMPLE_EVAL ``` - `experiment`- Path to the evaluation configuration [`configs/experiment/eval/tofu/default.yaml`](configs/experiment/eval/tofu/default.yaml). - `model`- Sets up the model and tokenizer configs for the `Llama-3.2-1B-Instruct` model. - `model.model_args.pretrained_model_name_or_path`- Overrides the default experiment config to evaluate a model from a HuggingFace ID (can use a local model checkpoint path as well). 
+- `retain_logs_path`- Sets the path to the reference model eval logs that is needed to compute reference model based metrics like `forget_quality` in TOFU. For more details about creating and running evaluations, refer [`docs/evaluation.md`](docs/evaluation.md). @@ -153,7 +175,8 @@ For more in-depth information on specific aspects of the framework, refer to the | [`docs/experiments.md`](docs/experiments.md) | Guide on running experiments in various configurations and settings, including distributed training, fine-tuning, and overriding arguments. | | [`docs/hydra.md`](docs/hydra.md) | Explanation of the Hydra features used in configuration management for experiments. | | [`community/leaderboard.md`](community/leaderboard.md) | Reference results from various unlearning methods run using this framework on TOFU and MUSE benchmarks. | -| [`docs/repro.md`](docs/repro.md) (deprecated) | Results are provided solely for reproducibility purposes, without any parameter tuning. | +| [`docs/links.md`](docs/links.md) | List of all links to the research papers or other sources the implemented features are sourced from. | +| [`docs/repro.md`](docs/repro.md) | Results are provided solely for reproducibility purposes, without any parameter tuning. | --- ## πŸ”— Support & Contributors @@ -197,9 +220,15 @@ If you use OpenUnlearning in your research, please cite OpenUnlearning and the b ### 🀝 Acknowledgements - This repo is inspired from [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory). -- The [TOFU](https://github.com/locuslab/tofu) and [MUSE](https://github.com/jaechan-repo/muse_bench) benchmarks served as the foundation for our re-implementation. +- The [TOFU](https://github.com/locuslab/tofu) and [MUSE](https://github.com/swj0419/muse_bench) benchmarks served as the foundation for our re-implementation. --- ### πŸ“„ License This project is licensed under the MIT License. See the [`LICENSE`](LICENSE) file for details. + +--- + +### Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=locuslab/open-unlearning&type=Date)](https://www.star-history.com/#locuslab/open-unlearning&Date) diff --git a/community/leaderboard.md b/community/leaderboard.md index 8803932..56c007e 100644 --- a/community/leaderboard.md +++ b/community/leaderboard.md @@ -8,48 +8,35 @@ We encourage the community to develop new methods, optimize them for specific be To implement a new method, refer to our [contributing guide](../docs/contributing.md). -> **Note:** The [results.md](../docs/results.md) file is maintained for reproducibility purposes. However, we encourage contributors to update the leaderboard table instead of the reproducibility table. We will continue refining and tuning baseline methods to keep the leaderboard up to date. +> [!NOTE] +> The [results.md](../docs/results.md) file is maintained for reproducibility purposes. However, we encourage contributors to update the leaderboard table instead of the reproducibility table. We will continue refining and tuning baseline methods to keep the leaderboard up to date. -### TOFU unlearning on the `Llama-3.2-1B-Instruct` architecture +### TOFU unlearning on the `Llama-2-7b-hf-chat` architecture
- - - - - - - - - - - - + + - - - - - + @@ -70,37 +57,23 @@ To implement a new method, refer to our [contributing guide](../docs/contributin - - - - - - - - - - - + - - - - @@ -143,7 +116,7 @@ To implement a new method, refer to our [contributing guide](../docs/contributin - + @@ -152,7 +125,7 @@ To implement a new method, refer to our [contributing guide](../docs/contributin - + diff --git a/configs/data/datasets/MUSE_MIA.yaml b/configs/data/datasets/MUSE_MIA.yaml new file mode 100644 index 0000000..66e818c --- /dev/null +++ b/configs/data/datasets/MUSE_MIA.yaml @@ -0,0 +1,22 @@ +MUSE_MIA_holdout: + access_key: holdout + handler: CompletionDataset + args: + hf_args: + path: "muse-bench/MUSE-News" + name: "privleak" + split: "holdout" + prefix_key: "prompt" # doesn't exist in dataset + text_key: "text" + max_length: 2048 +MUSE_MIA_forget: + access_key: forget + handler: CompletionDataset + args: + hf_args: + path: "muse-bench/MUSE-News" + name: "privleak" + split: "forget" + prefix_key: "prompt" # doesn't exist in dataset + text_key: "text" + max_length: 2048 \ No newline at end of file diff --git a/configs/data/datasets/MUSE_forget_privleak.yaml b/configs/data/datasets/MUSE_forget_privleak.yaml deleted file mode 100644 index 4013eb0..0000000 --- a/configs/data/datasets/MUSE_forget_privleak.yaml +++ /dev/null @@ -1,10 +0,0 @@ -MUSE_forget_privleak: - handler: CompletionDataset - args: - hf_args: - path: "muse-bench/MUSE-News" - name: "privleak" - split: "forget" - prefix_key: "prompt" # doesn't exist in dataset - text_key: "text" - max_length: 2048 \ No newline at end of file diff --git a/configs/data/datasets/MUSE_holdout_privleak.yaml b/configs/data/datasets/MUSE_holdout_privleak.yaml deleted file mode 100644 index 4fcda6e..0000000 --- a/configs/data/datasets/MUSE_holdout_privleak.yaml +++ /dev/null @@ -1,10 +0,0 @@ -MUSE_holdout_privleak: - handler: CompletionDataset - args: - hf_args: - path: "muse-bench/MUSE-News" - name: "privleak" - split: "holdout" - prefix_key: "prompt" # doesn't exist in dataset - text_key: "text" - max_length: 2048 \ No newline at end of file diff --git a/configs/data/datasets/MUSE_retain_privleak.yaml b/configs/data/datasets/MUSE_retain_privleak.yaml deleted file mode 100644 index e52813c..0000000 --- a/configs/data/datasets/MUSE_retain_privleak.yaml +++ /dev/null @@ -1,9 +0,0 @@ -MUSE_retain_privleak: - handler: PretrainingDataset - args: - hf_args: - path: "muse-bench/MUSE-News" - name: "privleak" - split: "retain" - text_key: "text" - max_length: 2048 \ No newline at end of file diff --git a/configs/data/datasets/TOFU_MIA.yaml b/configs/data/datasets/TOFU_MIA.yaml new file mode 100644 index 0000000..20b6c97 --- /dev/null +++ b/configs/data/datasets/TOFU_MIA.yaml @@ -0,0 +1,22 @@ +TOFU_QA_forget: + access_key: forget + handler: QADataset + args: + hf_args: + name: "forget10" + split: "train" + path: "locuslab/TOFU" + question_key: "question" + answer_key: "answer" + max_length: 512 +TOFU_QA_holdout: + access_key: holdout + handler: QADataset + args: + hf_args: + name: "holdout10" + path: "locuslab/TOFU" + split: "train" + question_key: "question" + answer_key: "answer" + max_length: 512 \ No newline at end of file diff --git a/configs/eval.yaml b/configs/eval.yaml index 06e90a3..fef5ed4 100644 --- a/configs/eval.yaml +++ b/configs/eval.yaml @@ -13,4 +13,5 @@ model: device_map: cuda mode: eval -task_name: ??? \ No newline at end of file +task_name: ??? 
+seed: 0 \ No newline at end of file diff --git a/configs/eval/muse.yaml b/configs/eval/muse.yaml index 6350e42..ecdd98f 100644 --- a/configs/eval/muse.yaml +++ b/configs/eval/muse.yaml @@ -7,6 +7,14 @@ defaults: - retain_knowmem_ROUGE - forget_verbmem_ROUGE - privleak + - extraction_strength + # - exact_memorization + # - mia_min_k_plus_plus + # - mia_min_k + # - mia_loss + # - mia_reference + # - mia_zlib + # - mia_gradnorm handler: MUSEEvaluator output_dir: ${paths.output_dir} # set to default eval directory diff --git a/configs/eval/muse_metrics/forget_minKpc_neg_logprob.yaml b/configs/eval/muse_metrics/exact_memorization.yaml similarity index 50% rename from configs/eval/muse_metrics/forget_minKpc_neg_logprob.yaml rename to configs/eval/muse_metrics/exact_memorization.yaml index d9829df..68b940d 100644 --- a/configs/eval/muse_metrics/forget_minKpc_neg_logprob.yaml +++ b/configs/eval/muse_metrics/exact_memorization.yaml @@ -1,13 +1,12 @@ -# @package eval.muse.metrics.forget_minKpc_neg_logprob +# @package eval.muse.metrics.exact_memorization defaults: - - ../../data/datasets@datasets: MUSE_forget_privleak + - ../../data/datasets@datasets: MUSE_forget_verbmem - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex -handler: minKpc_negative_logprob -batch_size: 8 -percentile_K: 40 +handler: exact_memorization +batch_size: 8 datasets: - MUSE_forget_privleak: + MUSE_forget_verbmem: args: hf_args: path: muse-bench/MUSE-${eval.muse.data_split} \ No newline at end of file diff --git a/configs/eval/muse_metrics/extraction_strength.yaml b/configs/eval/muse_metrics/extraction_strength.yaml new file mode 100644 index 0000000..18d716a --- /dev/null +++ b/configs/eval/muse_metrics/extraction_strength.yaml @@ -0,0 +1,12 @@ +# @package eval.muse.metrics.extraction_strength +defaults: + - ../../data/datasets@datasets: MUSE_forget_verbmem + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex + +handler: extraction_strength +batch_size: 8 +datasets: + MUSE_forget_verbmem: + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} \ No newline at end of file diff --git a/configs/eval/muse_metrics/holdout_minKpc_neg_logprob.yaml b/configs/eval/muse_metrics/holdout_minKpc_neg_logprob.yaml deleted file mode 100644 index 4d3d1fa..0000000 --- a/configs/eval/muse_metrics/holdout_minKpc_neg_logprob.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# @package eval.muse.metrics.holdout_minKpc_neg_logprob -defaults: - - ../../data/datasets@datasets: MUSE_holdout_privleak - - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex -handler: minKpc_negative_logprob -batch_size: 8 -percentile_K: 40 - -datasets: - MUSE_holdout_privleak : - args: - hf_args: - path: muse-bench/MUSE-${eval.muse.data_split} \ No newline at end of file diff --git a/configs/eval/muse_metrics/mia_gradnorm.yaml b/configs/eval/muse_metrics/mia_gradnorm.yaml new file mode 100644 index 0000000..89cb8c5 --- /dev/null +++ b/configs/eval/muse_metrics/mia_gradnorm.yaml @@ -0,0 +1,18 @@ +# @package eval.muse.metrics.mia_gradnorm +defaults: + - ../../data/datasets@datasets: MUSE_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +datasets: + MUSE_MIA_holdout: + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + MUSE_MIA_forget: + access_key: forget + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + +handler: mia_gradnorm +batch_size: 1 +p: 2 \ No newline at end of file diff --git a/configs/eval/muse_metrics/mia_loss.yaml 
b/configs/eval/muse_metrics/mia_loss.yaml new file mode 100644 index 0000000..dfca113 --- /dev/null +++ b/configs/eval/muse_metrics/mia_loss.yaml @@ -0,0 +1,17 @@ +# @package eval.muse.metrics.mia_loss +defaults: + - ../../data/datasets@datasets: MUSE_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +datasets: + MUSE_MIA_holdout: + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + MUSE_MIA_forget: + access_key: forget + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + +batch_size: 8 +handler: mia_loss diff --git a/configs/eval/muse_metrics/mia_min_k.yaml b/configs/eval/muse_metrics/mia_min_k.yaml new file mode 100644 index 0000000..9b2b14c --- /dev/null +++ b/configs/eval/muse_metrics/mia_min_k.yaml @@ -0,0 +1,18 @@ +# @package eval.muse.metrics.mia_min_k +defaults: + - ../../data/datasets@datasets: MUSE_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +datasets: + MUSE_MIA_holdout: + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + MUSE_MIA_forget: + access_key: forget + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + +batch_size: 8 +handler: mia_min_k +k: 0.4 \ No newline at end of file diff --git a/configs/eval/muse_metrics/mia_min_k_plus_plus.yaml b/configs/eval/muse_metrics/mia_min_k_plus_plus.yaml new file mode 100644 index 0000000..e497c20 --- /dev/null +++ b/configs/eval/muse_metrics/mia_min_k_plus_plus.yaml @@ -0,0 +1,18 @@ +# @package eval.muse.metrics.mia_min_k_plus_plus +defaults: + - ../../data/datasets@datasets: MUSE_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +datasets: + MUSE_MIA_holdout: + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + MUSE_MIA_forget: + access_key: forget + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + +batch_size: 8 +handler: mia_min_k_plus_plus +k: 0.4 \ No newline at end of file diff --git a/configs/eval/muse_metrics/mia_reference.yaml b/configs/eval/muse_metrics/mia_reference.yaml new file mode 100644 index 0000000..ea5d38c --- /dev/null +++ b/configs/eval/muse_metrics/mia_reference.yaml @@ -0,0 +1,18 @@ +# @package eval.muse.metrics.mia_reference +defaults: + - ../../data/datasets@datasets: MUSE_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +datasets: + MUSE_MIA_holdout: + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + MUSE_MIA_forget: + access_key: forget + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + +batch_size: 8 +handler: mia_reference +reference_model_path: muse-bench/MUSE-${eval.muse.data_split}_retrain # modify appropriately diff --git a/configs/eval/muse_metrics/mia_zlib.yaml b/configs/eval/muse_metrics/mia_zlib.yaml new file mode 100644 index 0000000..d881332 --- /dev/null +++ b/configs/eval/muse_metrics/mia_zlib.yaml @@ -0,0 +1,17 @@ +# @package eval.muse.metrics.mia_zlib +defaults: + - ../../data/datasets@datasets: MUSE_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +datasets: + MUSE_MIA_holdout: + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + MUSE_MIA_forget: + access_key: forget + args: + hf_args: + path: muse-bench/MUSE-${eval.muse.data_split} + +batch_size: 8 +handler: mia_zlib \ No newline at end of file diff --git a/configs/eval/muse_metrics/privleak.yaml b/configs/eval/muse_metrics/privleak.yaml index d16aa75..048f946 100644 --- a/configs/eval/muse_metrics/privleak.yaml +++ b/configs/eval/muse_metrics/privleak.yaml 
@@ -1,22 +1,17 @@ # @package eval.muse.metrics.privleak defaults: - - .@pre_compute.forget_minKpc_neg_logprob: forget_minKpc_neg_logprob - - .@pre_compute.holdout_minKpc_neg_logprob: holdout_minKpc_neg_logprob + - .@pre_compute.mia_min_k: mia_min_k pre_compute: - forget_minKpc_neg_logprob: + mia_min_k: access_key: forget - holdout_minKpc_neg_logprob: - access_key: holdout reference_logs: retain_model_logs: path: ${eval.muse.retain_logs_path} include: - forget_minKpc_neg_logprob: + mia_min_k: access_key: retain - holdout_minKpc_neg_logprob: - access_key: holdout -handler: relative_auc -ref_value: 0.5 +handler: privleak +ref_value: 0.5 \ No newline at end of file diff --git a/configs/eval/tofu.yaml b/configs/eval/tofu.yaml index 61aaddf..bbfea26 100644 --- a/configs/eval/tofu.yaml +++ b/configs/eval/tofu.yaml @@ -4,11 +4,19 @@ defaults: # include all defined metrics files - tofu_metrics: # When you import a metric here, its configuration automatically populates the # metric key below, enabled by the @package directive at the top of each configuration file. - - forget_quality - forget_Q_A_Prob - forget_Q_A_ROUGE - model_utility # populated in the metrics key as metrics.model_utility + - privleak + - extraction_strength + # - exact_memorization + # - mia_min_k_plus_plus + # - mia_min_k + # - mia_loss + # - mia_zlib + # - mia_gradnorm + # - mia_reference # set reference model path appropriately handler: TOFUEvaluator output_dir: ${paths.output_dir} # set to default eval directory @@ -16,4 +24,5 @@ metrics: {} # lists a mapping from each evaluation metric to its config # populated through the first (@package) line in each metric config overwrite: false forget_split: forget10 +holdout_split: holdout10 retain_logs_path: null \ No newline at end of file diff --git a/configs/eval/tofu_metrics/exact_memorization.yaml b/configs/eval/tofu_metrics/exact_memorization.yaml new file mode 100644 index 0000000..c8ebb7a --- /dev/null +++ b/configs/eval/tofu_metrics/exact_memorization.yaml @@ -0,0 +1,14 @@ +# @package eval.tofu.metrics.exact_memorization +defaults: + - ../../data/datasets@datasets: TOFU_QA_forget + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex + # ^ get default dataset and generation config information + +handler: exact_memorization +batch_size: 32 + +datasets: + TOFU_QA_forget: + args: + hf_args: + name: ${eval.tofu.forget_split} \ No newline at end of file diff --git a/configs/eval/tofu_metrics/extraction_strength.yaml b/configs/eval/tofu_metrics/extraction_strength.yaml new file mode 100644 index 0000000..654da68 --- /dev/null +++ b/configs/eval/tofu_metrics/extraction_strength.yaml @@ -0,0 +1,14 @@ +# @package eval.tofu.metrics.extraction_strength +defaults: + - ../../data/datasets@datasets: TOFU_QA_forget + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex + # ^ get default dataset and generation config information + +handler: extraction_strength +batch_size: 32 + +datasets: + TOFU_QA_forget: + args: + hf_args: + name: ${eval.tofu.forget_split} \ No newline at end of file diff --git a/configs/eval/tofu_metrics/forget_quality.yaml b/configs/eval/tofu_metrics/forget_quality.yaml index 5119a5d..888e817 100644 --- a/configs/eval/tofu_metrics/forget_quality.yaml +++ b/configs/eval/tofu_metrics/forget_quality.yaml @@ -13,4 +13,4 @@ pre_compute: forget_truth_ratio: access_key: forget -handler: forget_quality \ No newline at end of file +handler: ks_test \ No newline at end of file diff --git a/configs/eval/tofu_metrics/mia_gradnorm.yaml 
b/configs/eval/tofu_metrics/mia_gradnorm.yaml new file mode 100644 index 0000000..1f2c3b2 --- /dev/null +++ b/configs/eval/tofu_metrics/mia_gradnorm.yaml @@ -0,0 +1,18 @@ +# @package eval.tofu.metrics.mia_gradnorm +defaults: + - ../../data/datasets@datasets: TOFU_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex + +handler: mia_gradnorm +batch_size: 1 +p: 2 + +datasets: + TOFU_QA_forget: + args: + hf_args: + name: ${eval.tofu.forget_split} + TOFU_QA_holdout: + args: + hf_args: + name: ${eval.tofu.holdout_split} \ No newline at end of file diff --git a/configs/eval/tofu_metrics/mia_loss.yaml b/configs/eval/tofu_metrics/mia_loss.yaml new file mode 100644 index 0000000..ee60537 --- /dev/null +++ b/configs/eval/tofu_metrics/mia_loss.yaml @@ -0,0 +1,16 @@ +# @package eval.tofu.metrics.mia_loss +defaults: + - ../../data/datasets@datasets: TOFU_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +batch_size: 32 +handler: mia_loss + +datasets: + TOFU_QA_forget: + args: + hf_args: + name: ${eval.tofu.forget_split} + TOFU_QA_holdout: + args: + hf_args: + name: ${eval.tofu.holdout_split} \ No newline at end of file diff --git a/configs/eval/tofu_metrics/mia_min_k.yaml b/configs/eval/tofu_metrics/mia_min_k.yaml new file mode 100644 index 0000000..fb87080 --- /dev/null +++ b/configs/eval/tofu_metrics/mia_min_k.yaml @@ -0,0 +1,17 @@ +# @package eval.tofu.metrics.mia_min_k +defaults: + - ../../data/datasets@datasets: TOFU_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +batch_size: 32 +handler: mia_min_k +k: 0.4 + +datasets: + TOFU_QA_forget: + args: + hf_args: + name: ${eval.tofu.forget_split} + TOFU_QA_holdout: + args: + hf_args: + name: ${eval.tofu.holdout_split} \ No newline at end of file diff --git a/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml b/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml new file mode 100644 index 0000000..c95be8f --- /dev/null +++ b/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml @@ -0,0 +1,17 @@ +# @package eval.tofu.metrics.mia_min_k_plus_plus +defaults: + - ../../data/datasets@datasets: TOFU_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +batch_size: 32 +k: 0.4 +handler: mia_min_k_plus_plus + +datasets: + TOFU_QA_forget: + args: + hf_args: + name: ${eval.tofu.forget_split} + TOFU_QA_holdout: + args: + hf_args: + name: ${eval.tofu.holdout_split} \ No newline at end of file diff --git a/configs/eval/tofu_metrics/mia_reference.yaml b/configs/eval/tofu_metrics/mia_reference.yaml new file mode 100644 index 0000000..b571d19 --- /dev/null +++ b/configs/eval/tofu_metrics/mia_reference.yaml @@ -0,0 +1,17 @@ +# @package eval.tofu.metrics.mia_reference +defaults: + - ../../data/datasets@datasets: TOFU_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +batch_size: 32 +handler: mia_reference +reference_model_path: ??? 
# modify appropriately for example open-unlearning/tofu_Llama-3.2-1B-Instruct_retain90 + +datasets: + TOFU_QA_forget: + args: + hf_args: + name: ${eval.tofu.forget_split} + TOFU_QA_holdout: + args: + hf_args: + name: ${eval.tofu.holdout_split} \ No newline at end of file diff --git a/configs/eval/tofu_metrics/mia_zlib.yaml b/configs/eval/tofu_metrics/mia_zlib.yaml new file mode 100644 index 0000000..68fbe2d --- /dev/null +++ b/configs/eval/tofu_metrics/mia_zlib.yaml @@ -0,0 +1,16 @@ +# @package eval.tofu.metrics.mia_zlib +defaults: + - ../../data/datasets@datasets: TOFU_MIA + - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex +batch_size: 32 +handler: mia_zlib + +datasets: + TOFU_QA_forget: + args: + hf_args: + name: ${eval.tofu.forget_split} + TOFU_QA_holdout: + args: + hf_args: + name: ${eval.tofu.holdout_split} \ No newline at end of file diff --git a/configs/eval/tofu_metrics/privleak.yaml b/configs/eval/tofu_metrics/privleak.yaml new file mode 100644 index 0000000..44a461e --- /dev/null +++ b/configs/eval/tofu_metrics/privleak.yaml @@ -0,0 +1,17 @@ +# @package eval.tofu.metrics.privleak +defaults: + - .@pre_compute.mia_min_k: mia_min_k + +pre_compute: + mia_min_k: + access_key: forget + +reference_logs: + retain_model_logs: + path: ${eval.tofu.retain_logs_path} + include: + mia_min_k: + access_key: retain + +handler: privleak +ref_value: 0.5 diff --git a/configs/experiment/eval/tofu/default.yaml b/configs/experiment/eval/tofu/default.yaml index 600f2bf..b2be020 100644 --- a/configs/experiment/eval/tofu/default.yaml +++ b/configs/experiment/eval/tofu/default.yaml @@ -5,6 +5,7 @@ defaults: - override /eval: tofu forget_split: forget10 +holdout_split: holdout10 retain_logs_path: null model: @@ -14,6 +15,7 @@ model: eval: tofu: forget_split: ${forget_split} + holdout_split: ${holdout_split} retain_logs_path: ${retain_logs_path} task_name: ??? 
\ No newline at end of file diff --git a/configs/experiment/examples/muse_unlearn.yaml b/configs/experiment/examples/muse_unlearn.yaml index 1535b92..07c6d12 100644 --- a/configs/experiment/examples/muse_unlearn.yaml +++ b/configs/experiment/examples/muse_unlearn.yaml @@ -140,7 +140,7 @@ eval: index: index handler: minKpc_negative_logprob batch_size: 8 - percentile_K: 40 + k: 0.4 access_key: forget holdout_minKpc_neg_logprob: datasets: @@ -161,7 +161,7 @@ eval: index: index handler: minKpc_negative_logprob batch_size: 8 - percentile_K: 40 + k: 0.4 access_key: holdout reference_logs: retain_model_logs: @@ -171,7 +171,7 @@ eval: access_key: retain holdout_minKpc_neg_logprob: access_key: holdout - handler: relative_auc + handler: privleak ref_value: 0.5 handler: MUSEEvaluator device: cuda diff --git a/configs/experiment/finetune/tofu/default.yaml b/configs/experiment/finetune/tofu/default.yaml index 072e86d..ce4d62b 100644 --- a/configs/experiment/finetune/tofu/default.yaml +++ b/configs/experiment/finetune/tofu/default.yaml @@ -4,6 +4,7 @@ defaults: - override /model: Llama-3.2-1B-Instruct - override /trainer: finetune - override /data/datasets@data.train: TOFU_QA_full + - override /eval: tofu mode: finetune trainer: @@ -13,4 +14,17 @@ trainer: warmup_epochs: 1.0 # custom parameter num_train_epochs: 5 + +forget_split: forget10 +holdout_split: holdout10 +retain_logs_path: null + +eval: + tofu: + forget_split: ${forget_split} + holdout_split: ${holdout_split} + retain_logs_path: ${retain_logs_path} + overwrite: true + + task_name: tofu_Llama-3.2-1B-Instruct_full \ No newline at end of file diff --git a/configs/experiment/unlearn/tofu/default.yaml b/configs/experiment/unlearn/tofu/default.yaml index f2e0ab1..3ea7b4f 100644 --- a/configs/experiment/unlearn/tofu/default.yaml +++ b/configs/experiment/unlearn/tofu/default.yaml @@ -14,11 +14,13 @@ model: forget_split: forget10 retain_split: retain90 +holdout_split: holdout10 retain_logs_path: null eval: tofu: forget_split: ${forget_split} + holdout_split: ${holdout_split} retain_logs_path: ${retain_logs_path} overwrite: true diff --git a/docs/components.md b/docs/components.md index 016932c..0c889ef 100644 --- a/docs/components.md +++ b/docs/components.md @@ -19,7 +19,8 @@ This process involves three main steps: 6. [Collator](#collator) - Handles data collation logic 7. [Experiment](#experiment) - Combines components into a final experiment config -> [!Note] adding each component requires Hydra config management features, which are documented in [`docs/hydra.md`](../docs/hydra.md). +> [!NOTE] +> Adding each component requires Hydra config management features, which are documented in [`docs/hydra.md`](../docs/hydra.md). --- @@ -147,7 +148,8 @@ To add a new model architecture: ### Implement and register a handler For all the models currently supported, HuggingFace's `AutoModelForCausalLM` and `AutoTokenizer` are used, and therefore the user doesn't need to create or register any handler. -> [!Note]: Currently, we do not support loading models modified with LoRA and related variants. If you wish use such features, please create define and register model handlers for this logic in [`src/model`](../src/model) and provide the config info as discussed next. +> [!NOTE] +Currently, we do not support loading models modified with LoRA and related variants. If you wish use such features, please create define and register model handlers for this logic in [`src/model`](../src/model) and provide the config info as discussed next. 
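For illustration only, here is a minimal sketch of what such a registered model handler could look like. The `register_model` helper, the `MODEL_REGISTRY` dict, and the `adapter_name_or_path` config key are assumptions made for this sketch, not the repository's actual API:

```python
# Hypothetical model handler sketch (e.g., in src/model/) for LoRA-style models.
# register_model, MODEL_REGISTRY, and the config keys below are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel  # assumed extra dependency for adapter loading

MODEL_REGISTRY = {}

def register_model(name):
    """Minimal registry decorator, mirroring how other components are registered."""
    def decorator(fn):
        MODEL_REGISTRY[name] = fn
        return fn
    return decorator

@register_model("PeftCausalLM")
def load_peft_causal_lm(model_args: dict, tokenizer_args: dict):
    """Load a base causal LM, attach LoRA adapters if configured, return (model, tokenizer)."""
    model = AutoModelForCausalLM.from_pretrained(
        model_args["pretrained_model_name_or_path"]
    )
    adapter_path = model_args.get("adapter_name_or_path")  # hypothetical config key
    if adapter_path is not None:
        model = PeftModel.from_pretrained(model, adapter_path)
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_args["pretrained_model_name_or_path"]
    )
    return model, tokenizer
```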
### Add to configs Model configurations contain details required to load the model+tokenizer such as paths, chat templating arguments, LoRA parameters etc. in [`configs/models`](../configs/models/). diff --git a/docs/contributing.md b/docs/contributing.md index 3e398f5..583d3f2 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -17,6 +17,8 @@ There are several ways you can contribute to OpenUnlearning: * Implement new evaluations. * Contribute to the documentation. +Once your feature is added, you may also link the relevant paper in [`docs/links.md`](../docs/links.md). + ## Fixing Issues If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](#create-a-pull-request) and open a Pull Request! @@ -61,7 +63,7 @@ Adding a new component listed below requires defining a new class, registering i 6. [Collator](components#collator) - Handles data collation logic 7. [Experiment](components#experiment) - Combines components into a final experiment config -> **IMPORTANT** πŸš€ +> [!IMPORTANT] > **We especially encourage** contributions of methods and benchmarks that you've created, since you best understand them and know how to use them. We are ready to expedite their integration into OpenUnlearning. > When facing difficulties implementing any component, please contact the maintainers to join our Discord, where we can go over the implementation in detail. @@ -84,7 +86,7 @@ Some methods might involve multiple commands or steps while unlearning: ensure y ### 4. Update Leaderboard and Upload Model -Don't forget to add your results to the [leaderboard](results.md) and upload your unlearned model to HuggingFace for broader accessibility and reproducibility. +Don't forget to add your results to the [leaderboard](results.md) and upload your unlearned model to HuggingFace for broader accessibility and reproducibility. Also, if applicable, add a link to your paper in [`docs/links.md`](../docs/links.md). @@ -116,7 +118,7 @@ Your contributions toward defining or improving evaluation methods can significa 1. **Prepare Datasets & Models** – Create your dataset and train models to generate fine-tuned or retained models. 2. **Define a New Benchmark** (if needed) – Follow the [Benchmark Guide](components.md#benchmark) to implement a new evaluation benchmark. 3. **Run and Tune Baseline Methods** – Evaluate existing unlearning methods on your benchmark and optimize them. -4. **Document & Share Findings** – Provide detailed steps for reproduction in [`community/benchmarks/`](../community/benchmarks). +4. **Document & Share Findings** – Provide detailed steps for reproduction in [`community/benchmarks/`](../community/benchmarks). Also, if applicable, add a link to your paper in [`docs/links.md`](../docs/links.md). --- diff --git a/docs/evaluation.md b/docs/evaluation.md index ec41a86..ecce4d2 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -46,12 +46,12 @@ Other metrics like TOFU's Forget Quality (which is a single score computed over ### Steps to create new metrics: #### 1. Implement a handler -Metric handlers are implemented in [`src/evals/metrics`](../src/evals/metrics/), where we define handlers for `probability`, `rouge`, `forget_quality` etc. +Metric handlers are implemented in [`src/evals/metrics`](../src/evals/metrics/), where we define handlers for `probability`, `rouge`, `privleak` etc. A metric handler is implemented as a function decorated with `@unlearning_metric`. 
This decorator wraps the function into an UnlearningMetric object. This provides functionality to automatically load and prepare datasets and collators for `probability` as specified in the eval config ([example](../configs/eval/tofu_metrics/forget_Q_A_Prob.yaml)), so they are readily available for use in the function. -Example: implementing the `rouge` and `forget_quality` handlers +Example: implementing the `rouge` and `privleak` handlers ```python # in src/evals/metrics/memorization.py @@ -72,16 +72,19 @@ def rouge(model, **kwargs): } # in src/evals/metrics/privacy.py -@unlearning_metric(name="forget_quality") -def forget_quality(model, **kwargs): - # the forget quality metric is aggregated from computed statistics of - # other metrics like truth ratio, which is provided through kwargs +@unlearning_metric(name="privleak") +def privleak(model, **kwargs): + # the privleak quality metric is found from computed statistics of + # other metrics like MIA attack scores, which is provided through kwargs ... - return {"agg_value": pvalue} + return {'agg_value': (score-ref)/(ref+1e-10)*100} ``` - `@unlearning_metric(name="rouge")` - Defines a `rouge` handler. +> [!NOTE] +`kwargs` contains many important attributes that are useful while computing metrics. It will contain all the metric-specific parameters defined in the metric's yaml file, and also contain the created objects corresponding to the other attributes mentioned in the metric config: such as the `"tokenizer"`, `"data"` (the preprocessed torch dataset), `"batch_size"`, `"collator"`, `"generation_args"`, `"pre_compute"` (prior metrics the current metric depends on), and `"reference_logs"` (evals from a reference model the current metric can use). + #### 2. Register the metric handler Register the handler to link the class to the configs via the class name in [`METRIC_REGISTRY`](../src/evals/metrics/__init__.py). @@ -98,8 +101,7 @@ Metric configurations are in [`configs/eval/tofu_metrics`](../configs/eval/tofu_ Example 1: Creating the config for MUSE's `forget_verbmem_ROUGE` ([`configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml`](../configs/eval/muse_metrics/forget_knowmem_ROUGE.yaml)). - + ```yaml # @package eval.muse.metrics.forget_verbmem_ROUGE @@ -128,13 +130,9 @@ collators: generation_args: max_new_tokens: 128 ``` - Example 2: Creating the config for TOFU's `forget_quality` ([`configs/eval/tofu_metrics/forget_quality.yaml`](../configs/eval/tofu_metrics/forget_quality.yaml)). - - ```yaml # @package eval.tofu.metrics.forget_quality defaults: @@ -155,9 +153,9 @@ pre_compute: forget_truth_ratio: access_key: forget -handler: forget_quality +handler: ks_test # the handler with logic that is registered in code ``` - + ### Designing metrics that depend on other metrics diff --git a/docs/experiments.md b/docs/experiments.md index b570d3e..d61009b 100644 --- a/docs/experiments.md +++ b/docs/experiments.md @@ -59,7 +59,8 @@ paths.output_dir=saves/unlearn/NPO/evals ``` -> [!Note]: The unlearning experiments support evaluation during the unlearning finetuning. But this is supported only on a single GPU When multiple GPUs are used to train, checkpoints must be stored and evaluated after training. +> [!NOTE] +The unlearning experiments support evaluation during the unlearning finetuning. But this is supported only on a single GPU When multiple GPUs are used to train, checkpoints must be stored and evaluated after training. 
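Tying together the handler, registration, and `pre_compute` pieces from the evaluation docs above, here is a hedged sketch of a metric that aggregates another metric's output, in the style of `privleak`. The `@unlearning_metric` decorator and the `agg_value` return convention come from the documentation; the exact nesting of the `pre_compute` and `reference_logs` dictionaries, and the import path, are assumptions for illustration:

```python
# Sketch of a dependent metric handler, modeled on the privleak example above.
# The @unlearning_metric decorator and agg_value convention are documented;
# the key layout inside pre_compute / reference_logs is an assumption.
from evals.metrics import unlearning_metric  # import path assumed

@unlearning_metric(name="relative_mia_leakage")
def relative_mia_leakage(model, **kwargs):
    # Score of the pre-computed MIA metric on the unlearned model (access key "forget").
    score = kwargs["pre_compute"]["forget"]["agg_value"]
    # The same statistic logged for the retain model, falling back to a fixed reference.
    retain_logs = kwargs.get("reference_logs", {}).get("retain_model_logs")
    ref = retain_logs["retain"]["agg_value"] if retain_logs else kwargs["ref_value"]
    # Signed relative difference as a percentage, mirroring privleak's formula.
    return {"agg_value": (score - ref) / (ref + 1e-10) * 100}
```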
--- @@ -70,7 +71,7 @@ To understand the structure of an evaluation config and the kind of available pa To understand the structure of an unlearning config and the kind of available parameters for overriding, refer to: [`configs/experiment/examples/muse_unlearn.yaml`](../configs/experiment/examples/muse_unlearn.yaml). The following tables list the most commonly used arguments while running experiments. - + ###

Model Settings

<!-- flattened results-table residue: Finetuned and Retain rows (forget_quality / model_utility, plus privleak-style values) across the forget01/forget05/forget10 splits; the original HTML cell structure is not recoverable here -->
@@ -242,7 +243,8 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \ src/train.py --config-name=unlearn.yaml experiment=unlearn/muse/default.yaml task_name=DISTRIBUTED_TRAIN ``` -> [!Note]: Evaluation runs are designed to work only a single GPU (this includes running evaluation during training). To run an evaluation job, modify your command to make only one GPU visible (assuming one GPU is enough for inference): +> [!CAUTION] +> Evaluation runs are designed to work only a single GPU (this includes running evaluation during training). To run an evaluation job, modify your command to make only one GPU visible (assuming one GPU is enough for inference), as shown below ```bash CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/muse/default.yaml task_name=SAMPLE_EVAL diff --git a/docs/links.md b/docs/links.md new file mode 100644 index 0000000..9a651a1 --- /dev/null +++ b/docs/links.md @@ -0,0 +1,62 @@ +# πŸ”— Links and References + +Links to research papers and resources corresponding to implemented features in this repository. Please feel free to fill in any missing references! + +--- + +## πŸ“Œ Table of Contents +- [Implemented Methods](#implemented-methods) +- [Benchmarks](#benchmarks) +- [Evaluation Metrics](#evaluation-metrics) +- [Useful Links](#useful-links) + - [Survey Papers](#survey-papers) + - [Other GitHub Repositories](#other-github-repositories) + +--- + +## πŸ“— Implemented Methods + +| Method | Resource | +|-----------------|----------| +| GradAscent, GradDiff | Naive baselines found in many papers including MUSE, TOFU etc. | +| NPO | Paper [πŸ“„](https://arxiv.org/abs/2404.05868), Code [πŸ™](https://github.com/licong-lin/negative-preference-optimization) | +| SimNPO | Paper [πŸ“„](https://arxiv.org/abs/2410.07163), Code [πŸ™](https://github.com/OPTML-Group/Unlearn-Simple) | +| IdkDPO | TOFU ([πŸ“„](https://arxiv.org/abs/2401.06121)) | +| RMU | WMDP paper ([πŸ™](https://github.com/centerforaisafety/wmdp/tree/main/rmu), [🌐](https://www.wmdp.ai/)), later used in G-effect ([πŸ™](https://github.com/tmlr-group/G-effect/blob/main/dataloader.py)) | + +--- + +## πŸ“˜ Benchmarks + +| Benchmark | Resource | +|-----------|----------| +| TOFU | Paper [πŸ“„](https://arxiv.org/abs/2401.06121) | +| MUSE | Paper [πŸ“„](https://arxiv.org/abs/2407.06460) | + +--- + +## πŸ“™ Evaluation Metrics + +| Metric | Resource | +|--------|----------| +| Verbatim Probability / ROUGE, simple QA-ROUGE | Naive metrics found in many papers including MUSE, TOFU etc. 
| +| Membership Inference Attacks (LOSS, ZLib, Reference, GradNorm, MinK, MinK++) | MIMIR ([πŸ™](https://github.com/iamgroot42/mimir)), MUSE ([πŸ“„](https://arxiv.org/abs/2407.06460)) | +| PrivLeak | MUSE ([πŸ“„](https://arxiv.org/abs/2407.06460)) | +| Forget Quality, Truth Ratio, Model Utility | TOFU ([πŸ“„](https://arxiv.org/abs/2401.06121)) | +| Extraction Strength (ES) | Carlini et al., 2021 ([πŸ“„](https://www.usenix.org/conference/usenixsecurity21/presentation/carlini-extracting)), used for unlearning in Wang et al., 2025 ([πŸ“„](https://openreview.net/pdf?id=wUtCieKuQU)) | +| Exact Memorization (EM) | Tirumala et al., 2022 ([πŸ“„](https://proceedings.neurips.cc/paper_files/paper/2022/hash/fa0509f4dab6807e2cb465715bf2d249-Abstract-Conference.html)), used for unlearning in Wang et al., 2025 ([πŸ“„](https://openreview.net/pdf?id=wUtCieKuQU)) | + +--- + +## 🌐 Useful Links + +### πŸ“š Surveys +- [Machine Unlearning in 2024](https://ai.stanford.edu/~kzliu/blog/unlearning) +- [Rethinking Machine Unlearning for Large Language Models](https://arxiv.org/abs/2402.08787) + +### πŸ™ Other GitHub Repositories +- [TOFU Benchmark (original)](https://github.com/locuslab/tofu) +- [MUSE Benchmark (original)](https://github.com/swj0419/muse_bench) +- [Awesome LLM Unlearning](https://github.com/chrisliu298/awesome-llm-unlearning) +- [Awesome Machine Unlearning](https://github.com/tamlhp/awesome-machine-unlearning) +- [Awesome GenAI Unlearning](https://github.com/franciscoliu/Awesome-GenAI-Unlearning) \ No newline at end of file diff --git a/docs/repro.md b/docs/repro.md index ac64ac3..9bd5103 100644 --- a/docs/repro.md +++ b/docs/repro.md @@ -4,7 +4,8 @@ ->​For results where methods have been tuned for optimal performance, please refer to the [`community/leaderboard`](../community/leaderboard.md). +> [!TIP] +> ​This page is for reproducibility. For results where methods have been tuned for optimal performance, please refer to the [`community/leaderboard`](../community/leaderboard.md). The scripts below execute standard baseline unlearning experiments on the TOFU and MUSE datasets, evaluated using their corresponding benchmarks. ```bash @@ -22,11 +23,13 @@ For all the experiments below, we used the following setup |-------------------------|------------| | **Hardware** | 2 Γ— L40s GPUs (48GB each) | | **Distributed Computing** | [DeepSpeed ZeRO Stage 3 (Accelerate)](https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed) | -| **Hyperparameters** | Learning Rate (lr) = 1e-5
Ξ± = 1, Ξ³ = 1, Ξ² = 0.1 (where applicable)
Number of Epochs = 10
Optimizer: [paged_adamw_32bit](https://huggingface.co/docs/bitsandbytes/main/en/reference/optim/adamw#bitsandbytes.optim.PagedAdamW) | +| **Hyperparameters** | Learning Rate (lr) = 1e-5
Ξ± = 1, Ξ³ = 1, Ξ² = 0.1 (where applicable)
Effective batch size 32: per-device batch size 8 with 4 gradient-accumulation steps <br>
Number of Epochs = 10
Optimizer: [paged_adamw_32bit](https://huggingface.co/docs/bitsandbytes/main/en/reference/optim/adamw#bitsandbytes.optim.PagedAdamW) | -__Note:__ -1. Results may vary even with the same effective hyperparameters when trained with modifications to the distributed training setup, including when training on a single GPU. For example: methods such as SimNPO & RMU can be significantly improved with careful tuning. **Please use these numbers only for reproducibility purposes**. -2. NPO in MUSE: for NPO, the MUSE implementation is inconsistent with the [original paper](https://github.com/licong-lin/negative-preference-optimization) as discussed [here]( https://github.com/jaechan-repo/muse_bench/issues/2). This inconsistency is carried over into implementations like [SimNPO](https://github.com/OPTML-Group/Unlearn-Simple/issues/5). Here, we use the original NPO implementation with the same loss function expression across datasets. + +> [!NOTE] +> 1. The results in the next section display only some important subsets of metrics for each benchmark. For more examples of available evaluation metrics, see the `muse*/*_SUMMARY.json`, `tofu*/evals*/*_SUMMARY.json` files on the [HuggingFace space](https://huggingface.co/datasets/open-unlearning/eval). +> 2. Results may vary even with the same effective hyperparameters when trained with modifications to the distributed training setup, including when training on a single GPU. For example: methods such as SimNPO & RMU can be significantly improved with careful tuning. **Please use the below numbers only for reproducibility purposes**. +> 3. __NPO inconsistency__: for NPO, the MUSE implementation is inconsistent with the [original paper](https://github.com/licong-lin/negative-preference-optimization) as discussed [here](https://github.com/jaechan-repo/muse_bench/issues/2). This inconsistency is carried over into implementations like [SimNPO](https://github.com/OPTML-Group/Unlearn-Simple/issues/5). Here, we use the original NPO implementation with the same loss function expression across datasets. 
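Because point 3 turns on which NPO loss expression is used, a minimal sketch of the original paper's forget loss may help. The function name and the per-sequence log-probability inputs are assumptions for illustration, not the repository's trainer code.

```python
import torch
import torch.nn.functional as F


def npo_forget_loss(policy_logps: torch.Tensor, ref_logps: torch.Tensor, beta: float = 0.1) -> torch.Tensor:
    """Sketch of the original NPO objective on forget data.

    policy_logps / ref_logps: per-sequence log-probabilities of the forget
    completions under the current model and the frozen reference model.
    """
    # L_NPO = (2 / beta) * E[log(1 + (pi_theta / pi_ref)^beta)]
    #       = (2 / beta) * E[softplus(beta * (log pi_theta - log pi_ref))]
    log_ratio = policy_logps - ref_logps
    return (2.0 / beta) * F.softplus(beta * log_ratio).mean()
```

As Ξ² β†’ 0 this objective reduces to plain gradient ascent on the forget set, which is why seemingly small changes to the expression can matter when comparing implementations.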
@@ -60,7 +63,7 @@ __Note:__ - + @@ -187,12 +190,12 @@ __Note:__ - + - + - + @@ -203,7 +206,7 @@ __Note:__ - + @@ -316,7 +319,7 @@ __Note:__ - + @@ -325,7 +328,7 @@ __Note:__ - + diff --git a/requirements.txt b/requirements.txt index 147f515..2f39c76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,6 @@ datasets==3.0.1 accelerate==0.34.2 bitsandbytes==0.44.1 rouge-score==0.1.2 -pre-commit==4.0.1 scipy==1.14.1 tensorboard==2.18.0 scikit-learn==1.5.2 diff --git a/scripts/tofu_finetune.sh b/scripts/tofu_finetune.sh index bb18fa4..dfb3a69 100644 --- a/scripts/tofu_finetune.sh +++ b/scripts/tofu_finetune.sh @@ -11,10 +11,10 @@ models=( ) per_device_train_batch_size=4 # Effective batch size 32 on two GPUs with gradent_accumulation_steps=8 -forget_retain_splits=( - "forget01 retain99" - "forget05 retain95" - "forget10 retain90" +splits=( + "forget01 holdout01 retain99" + "forget05 holdout05 retain95" + "forget10 holdout10 retain90" ) @@ -23,9 +23,10 @@ forget_retain_splits=( ########################################### RETAIN Finetuned TOFU ###################################################### ######################################################################################################################## -for split in "${forget_retain_splits[@]}"; do +for split in "${splits[@]}"; do forget_split=$(echo $split | cut -d' ' -f1) - retain_split=$(echo $split | cut -d' ' -f2) + holdout_split=$(echo $split | cut -d' ' -f2) + retain_split=$(echo $split | cut -d' ' -f3) for model in "${models[@]}"; do CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file configs/accelerate/default_config.yaml --main_process_port $MASTER_PORT \ @@ -41,6 +42,7 @@ for split in "${forget_retain_splits[@]}"; do CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/tofu/default.yaml \ forget_split=${forget_split} \ + holdout_split=${holdout_split} \ task_name=tofu_${model}_${retain_split} \ model=${model} \ model.model_args.pretrained_model_name_or_path=saves/finetune/tofu_${model}_${retain_split} @@ -65,12 +67,14 @@ for model in "${models[@]}"; do trainer.args.gradient_checkpointing=true # Evaluate the full models on each forget split - for split in "${forget_retain_splits[@]}"; do + for split in "${splits[@]}"; do forget_split=$(echo $split | cut -d' ' -f1) - retain_split=$(echo $split | cut -d' ' -f2) + holdout_split=$(echo $split | cut -d' ' -f2) + retain_split=$(echo $split | cut -d' ' -f3) CUDA_VISIBLE_DEVICES=0 python src/eval.py experiment=eval/tofu/default.yaml \ forget_split=${forget_split} \ + holdout_split=${holdout_split} \ task_name=tofu_${model}_full_${forget_split} \ model=${model} \ model.model_args.pretrained_model_name_or_path=saves/finetune/tofu_${model}_full \ diff --git a/scripts/tofu_unlearn.sh b/scripts/tofu_unlearn.sh index ae33189..87d9a7c 100644 --- a/scripts/tofu_unlearn.sh +++ b/scripts/tofu_unlearn.sh @@ -16,12 +16,13 @@ trainers_experiments=( "DPO unlearn/tofu/idk.yaml" "RMU unlearn/tofu/default.yaml" ) -forget_retain_splits=( - "forget01 retain99" - "forget05 retain95" - "forget10 retain90" +splits=( + "forget01 holdout01 retain99" + "forget05 holdout05 retain95" + "forget10 holdout10 retain90" ) + per_device_train_batch_size=4 # on two gpus would make effective batch size 32 gradient_accumulation_steps=4 @@ -31,9 +32,11 @@ gradient_accumulation_steps=4 ######################################################################################################################## -for split in "${forget_retain_splits[@]}"; do +for split in "${splits[@]}"; do 
forget_split=$(echo $split | cut -d' ' -f1) - retain_split=$(echo $split | cut -d' ' -f2) + holdout_split=$(echo $split | cut -d' ' -f2) + retain_split=$(echo $split | cut -d' ' -f3) + for model in "${models[@]}"; do for trainer_experiment in "${trainers_experiments[@]}"; do trainer=$(echo $trainer_experiment | cut -d' ' -f1) @@ -63,6 +66,7 @@ for split in "${forget_retain_splits[@]}"; do CUDA_VISIBLE_DEVICES=0 python src/eval.py \ experiment=eval/tofu/default.yaml \ forget_split=${forget_split} \ + holdout_split=${holdout_split} \ model=${model} \ task_name=${task_name} \ model.model_args.pretrained_model_name_or_path=saves/unlearn/${task_name} \ diff --git a/setup.py b/setup.py index 209335c..b02a348 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ long_description=open("README.md").read(), long_description_content_type="text/markdown", url="https://github.com/locuslab/open-unlearning", + license="MIT", packages=find_packages(), install_requires=requirements, # Uses requirements.txt extras_require={ diff --git a/src/data/__init__.py b/src/data/__init__.py index e838800..a67ce7d 100644 --- a/src/data/__init__.py +++ b/src/data/__init__.py @@ -40,9 +40,8 @@ def _load_single_dataset(dataset_name, dataset_cfg: DictConfig, **kwargs): def get_datasets(dataset_cfgs: Union[Dict, DictConfig], **kwargs): dataset = {} for dataset_name, dataset_cfg in dataset_cfgs.items(): - dataset[dataset_name] = _load_single_dataset( - dataset_name, dataset_cfg, **kwargs - ) + access_name = dataset_cfg.get("access_key", dataset_name) + dataset[access_name] = _load_single_dataset(dataset_name, dataset_cfg, **kwargs) if len(dataset) == 1: # return a single dataset return list(dataset.values())[0] diff --git a/src/eval.py b/src/eval.py index 82aae67..066c8da 100644 --- a/src/eval.py +++ b/src/eval.py @@ -1,6 +1,7 @@ import hydra from omegaconf import DictConfig +from trainer.utils import seed_everything from model import get_model from evals import get_evaluators @@ -11,6 +12,7 @@ def main(cfg: DictConfig): Args: cfg (DictConfig): Config to train """ + seed_everything(cfg.seed) model_cfg = cfg.model template_args = model_cfg.template_args assert model_cfg is not None, "Invalid model yaml passed in train config." 
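Among the source changes above, the `access_key` remap in `get_datasets` is easy to miss. A toy illustration of the new behavior follows; the dataset names are hypothetical and only the remapping logic is the point.

```python
from omegaconf import OmegaConf

dataset_cfgs = OmegaConf.create(
    {
        # hypothetical entries: the same dataset class can now be exposed to
        # metrics and trainers under a caller-chosen key via access_key
        "TOFU_QA_forget": {"access_key": "forget"},
        "TOFU_QA_holdout": {"access_key": "holdout"},
    }
)

for dataset_name, dataset_cfg in dataset_cfgs.items():
    access_name = dataset_cfg.get("access_key", dataset_name)
    print(f"{dataset_name} -> loaded under key '{access_name}'")
# TOFU_QA_forget -> loaded under key 'forget'
# TOFU_QA_holdout -> loaded under key 'holdout'
```

This is what lets the MIA metrics added in this patch series look up `data["forget"]` and `data["holdout"]` regardless of the underlying dataset names.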
diff --git a/src/evals/base.py b/src/evals/base.py index 992c249..3d794d5 100644 --- a/src/evals/base.py +++ b/src/evals/base.py @@ -32,8 +32,11 @@ def save_logs(self, logs, file): """Save the logs in a json file""" logs = dict(sorted(logs.items())) os.makedirs(os.path.dirname(file), exist_ok=True) - with open(file, "w") as f: - json.dump(logs, f, indent=4) + try: + with open(file, "w") as f: + json.dump(logs, f, indent=4) + except Exception as e: + raise RuntimeError(f"Failed to save {file}: {e}") def prepare_model(self, model): """Prepare model for evaluation""" @@ -49,6 +52,8 @@ def summarize(self, logs): """Summarize the metrics results""" metric_summary = {} for metric_name, metric_results in logs.items(): + if metric_name not in self.metrics: + continue agg_value = metric_results.get("agg_value", None) if agg_value is not None: metric_summary[metric_name] = agg_value @@ -77,6 +82,7 @@ def evaluate(self, model, output_dir=None, overwrite=None, **kwargs): logger.info( f"Result for metric {metric_name}:\t{logs[metric_name]['agg_value']}" ) + self.save_logs(self.summarize(logs), summary_file_path) continue _ = logs.pop(metric_name, None) # overwriting existing evals if present kwargs = { @@ -94,12 +100,7 @@ def evaluate(self, model, output_dir=None, overwrite=None, **kwargs): ) if "agg_value" in result: logger.info(f"Result for metric {metric_name}:\t{result['agg_value']}") - try: - self.save_logs(logs, logs_file_path) - except Exception as e: - raise RuntimeError(f"Failed to save logs: {e}") - try: - self.save_logs(self.summarize(logs), summary_file_path) - except Exception as e: - raise RuntimeError(f"Failed to save summary: {e}") + + self.save_logs(logs, logs_file_path) + self.save_logs(self.summarize(logs), summary_file_path) return logs diff --git a/src/evals/metrics/__init__.py b/src/evals/metrics/__init__.py index 7a4099e..9441c8d 100644 --- a/src/evals/metrics/__init__.py +++ b/src/evals/metrics/__init__.py @@ -7,11 +7,17 @@ rouge, truth_ratio, hm_aggregate, + extraction_strength, + exact_memorization, ) -from evals.metrics.privacy import ( - forget_quality, - minKpc_negative_logprob, - relative_auc, +from evals.metrics.privacy import ks_test, privleak, rel_diff +from evals.metrics.mia import ( + mia_loss, + mia_min_k, + mia_min_k_plus_plus, + mia_gradnorm, + mia_zlib, + mia_reference, ) METRICS_REGISTRY: Dict[str, UnlearningMetric] = {} @@ -47,7 +53,17 @@ def get_metrics(metric_cfgs: DictConfig, **kwargs): _register_metric(probability_w_options) _register_metric(rouge) _register_metric(truth_ratio) -_register_metric(forget_quality) +_register_metric(ks_test) _register_metric(hm_aggregate) -_register_metric(minKpc_negative_logprob) -_register_metric(relative_auc) +_register_metric(privleak) +_register_metric(rel_diff) +_register_metric(exact_memorization) +_register_metric(extraction_strength) + +# Register MIA metrics +_register_metric(mia_loss) +_register_metric(mia_min_k) +_register_metric(mia_min_k_plus_plus) +_register_metric(mia_gradnorm) +_register_metric(mia_zlib) +_register_metric(mia_reference) diff --git a/src/evals/metrics/base.py b/src/evals/metrics/base.py index 334db5c..105ff1c 100644 --- a/src/evals/metrics/base.py +++ b/src/evals/metrics/base.py @@ -120,7 +120,7 @@ def prepare_kwargs_evaluate_metric(self, model, metric_name, cache={}, **kwargs) reference_logs[reference_log_name][access_name] = _results if _results is None: logger.warning( - f"{key} not present in the {path}, setting it to None!" 
+ f"{key} evals not present in the {path}, setting it to None, may result in error soon if code attempts to access." ) if reference_logs: kwargs.update({"reference_logs": reference_logs}) diff --git a/src/evals/metrics/memorization.py b/src/evals/metrics/memorization.py index 9587809..90cd3c2 100644 --- a/src/evals/metrics/memorization.py +++ b/src/evals/metrics/memorization.py @@ -1,4 +1,5 @@ import logging +import torch import numpy as np import scipy as sc from torch.utils.data import DataLoader @@ -9,6 +10,7 @@ evaluate_probability, eval_text_similarity, run_batchwise_evals, + tokenwise_vocab_logprobs, ) from evals.metrics.base import unlearning_metric @@ -136,3 +138,65 @@ def true_better(arr): def hm_aggregate(model, **kwargs): values = [result["agg_value"] for _, result in kwargs["pre_compute"].items()] return {"agg_value": sc.stats.hmean(values)} + + +@unlearning_metric(name="exact_memorization") +def exact_memorization(model, **kwargs): + data = kwargs["data"] + collator = kwargs["collators"] + batch_size = kwargs["batch_size"] + dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator) + + def _exact_memorization(model, batch): + log_probs_batch, labels_batch = tokenwise_vocab_logprobs( + model, batch, grad=False, return_labels=True + ) + em_batch = [] + for log_probs, labels in zip(log_probs_batch, labels_batch): + assert len(log_probs) == len(labels) + preds = torch.argmax(log_probs, dim=-1) + em_score = (preds == labels).sum() / len(labels) + em_batch.append({"score": em_score.item()}) + return em_batch + + fun_args = {} + scores_by_index = run_batchwise_evals( + model, dataloader, _exact_memorization, fun_args, "Calculating EM" + ) + em_values = np.array([evals["score"] for evals in scores_by_index.values()]) + em_values = aggregate_to_1D(em_values) + return {"agg_value": np.mean(em_values), "value_by_index": scores_by_index} + + +@unlearning_metric(name="extraction_strength") +def extraction_strength(model, **kwargs): + data = kwargs["data"] + collator = kwargs["collators"] + batch_size = kwargs["batch_size"] + dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator) + + def _extraction_strength(model, batch): + log_probs_batch, labels_batch = tokenwise_vocab_logprobs( + model, batch, grad=False, return_labels=True + ) + es_batch = [] + for log_probs, labels in zip(log_probs_batch, labels_batch): + assert len(log_probs) == len(labels) + valid_len = len(labels) + preds = torch.argmax(log_probs, dim=-1) + for k in range(valid_len): + suff_preds = preds[k:] + suff_labels = labels[k:] + if torch.equal(suff_preds, suff_labels): + break + es_score = 1 - (k / valid_len) + es_batch.append({"score": es_score}) + return es_batch + + fun_args = {} + scores_by_index = run_batchwise_evals( + model, dataloader, _extraction_strength, fun_args, "Calculating ES" + ) + es_values = np.array([evals["score"] for evals in scores_by_index.values()]) + es_values = aggregate_to_1D(es_values) + return {"agg_value": np.mean(es_values), "value_by_index": scores_by_index} diff --git a/src/evals/metrics/mia/__init__.py b/src/evals/metrics/mia/__init__.py new file mode 100644 index 0000000..5ab869f --- /dev/null +++ b/src/evals/metrics/mia/__init__.py @@ -0,0 +1,100 @@ +""" +Attack implementations. 
+""" + +from transformers import AutoModelForCausalLM + +from evals.metrics.base import unlearning_metric +from evals.metrics.mia.loss import LOSSAttack +from evals.metrics.mia.min_k import MinKProbAttack +from evals.metrics.mia.min_k_plus_plus import MinKPlusPlusAttack +from evals.metrics.mia.gradnorm import GradNormAttack +from evals.metrics.mia.zlib import ZLIBAttack +from evals.metrics.mia.reference import ReferenceAttack + +from evals.metrics.mia.utils import mia_auc +import logging + +logger = logging.getLogger("metrics") + +## NOTE: all MIA attack statistics are signed as required in order to show the +# same trends as loss (higher the score on an example, less likely the membership) + + +@unlearning_metric(name="mia_loss") +def mia_loss(model, **kwargs): + return mia_auc( + LOSSAttack, + model, + data=kwargs["data"], + collator=kwargs["collators"], + batch_size=kwargs["batch_size"], + ) + + +@unlearning_metric(name="mia_min_k") +def mia_min_k(model, **kwargs): + return mia_auc( + MinKProbAttack, + model, + data=kwargs["data"], + collator=kwargs["collators"], + batch_size=kwargs["batch_size"], + k=kwargs["k"], + ) + + +@unlearning_metric(name="mia_min_k_plus_plus") +def mia_min_k_plus_plus(model, **kwargs): + return mia_auc( + MinKPlusPlusAttack, + model, + data=kwargs["data"], + collator=kwargs["collators"], + batch_size=kwargs["batch_size"], + k=kwargs["k"], + ) + + +@unlearning_metric(name="mia_gradnorm") +def mia_gradnorm(model, **kwargs): + return mia_auc( + GradNormAttack, + model, + data=kwargs["data"], + collator=kwargs["collators"], + batch_size=kwargs["batch_size"], + p=kwargs["p"], + ) + + +@unlearning_metric(name="mia_zlib") +def mia_zlib(model, **kwargs): + return mia_auc( + ZLIBAttack, + model, + data=kwargs["data"], + collator=kwargs["collators"], + batch_size=kwargs["batch_size"], + tokenizer=kwargs.get("tokenizer"), + ) + + +@unlearning_metric(name="mia_reference") +def mia_reference(model, **kwargs): + if "reference_model_path" not in kwargs: + raise ValueError("Reference model must be provided in kwargs") + logger.info(f"Loading reference model from {kwargs['reference_model_path']}") + reference_model = AutoModelForCausalLM.from_pretrained( + kwargs["reference_model_path"], + torch_dtype=model.dtype, + device_map={"": model.device}, + ) + return mia_auc( + ReferenceAttack, + model, + data=kwargs["data"], + collator=kwargs["collators"], + batch_size=kwargs["batch_size"], + reference_model=reference_model, + ) diff --git a/src/evals/metrics/mia/all_attacks.py b/src/evals/metrics/mia/all_attacks.py new file mode 100644 index 0000000..f2d074c --- /dev/null +++ b/src/evals/metrics/mia/all_attacks.py @@ -0,0 +1,63 @@ +""" +Enum class for attacks. Also contains the base attack class. 
+""" + +from enum import Enum +from torch.utils.data import DataLoader +import numpy as np +from tqdm import tqdm + + +# Attack definitions +class AllAttacks(str, Enum): + LOSS = "loss" + REFERENCE_BASED = "ref" + ZLIB = "zlib" + MIN_K = "min_k" + MIN_K_PLUS_PLUS = "min_k++" + GRADNORM = "gradnorm" + RECALL = "recall" + + +# Base attack class +class Attack: + def __init__(self, model, data, collator, batch_size, **kwargs): + """Initialize attack with model and create dataloader.""" + self.model = model + self.dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator) + self.setup(**kwargs) + + def setup(self, **kwargs): + """Setup attack-specific parameters.""" + pass + + def compute_batch_values(self, batch): + """Process a batch through model to get needed statistics.""" + raise NotImplementedError + + def compute_score(self, sample_stats): + """Compute MIA score for a single sample.""" + raise NotImplementedError + + def attack(self): + """Run full MIA attack.""" + all_scores = [] + all_indices = [] + + for batch in tqdm(self.dataloader, total=len(self.dataloader)): + indices = batch.pop("index").cpu().numpy().tolist() + batch_values = self.compute_batch_values(batch) + scores = [self.compute_score(values) for values in batch_values] + + all_scores.extend(scores) + all_indices.extend(indices) + + scores_by_index = { + str(idx): {"score": float(score)} + for idx, score in zip(all_indices, all_scores) + } + + return { + "agg_value": float(np.mean(all_scores)), + "value_by_index": scores_by_index, + } diff --git a/src/evals/metrics/mia/gradnorm.py b/src/evals/metrics/mia/gradnorm.py new file mode 100644 index 0000000..dcb529c --- /dev/null +++ b/src/evals/metrics/mia/gradnorm.py @@ -0,0 +1,36 @@ +""" +Gradient-norm attack. Proposed for MIA in multiple settings, and particularly +experimented for pre-training data and LLMs in https://arxiv.org/abs/2402.17012 +""" + +import torch +from evals.metrics.mia.all_attacks import Attack +from evals.metrics.utils import tokenwise_logprobs + + +# DO NOT use gradnorm in a way so that it runs when your accumulated gradients during training aren't used yet +# gradnorm zeros out the gradients of the model during its computation +class GradNormAttack(Attack): + def setup(self, p, **kwargs): + if p not in [1, 2, float("inf")]: + raise ValueError(f"Invalid p-norm value: {p}") + self.p = p + + def compute_batch_values(self, batch): + """Compute gradients of examples w.r.t model parameters. 
More grad norm => more loss.""" + batch_log_probs = tokenwise_logprobs(self.model, batch, grad=True) + batch_loss = [-torch.mean(lps) for lps in batch_log_probs] + batch_grad_norms = [] + for sample_loss in batch_loss: + sample_grad_norms = [] + self.model.zero_grad() + sample_loss.backward() + for param in self.model.parameters(): + if param.grad is not None: + sample_grad_norms.append(param.grad.detach().norm(p=self.p)) + batch_grad_norms.append(torch.stack(sample_grad_norms).mean()) + return batch_grad_norms + + def compute_score(self, sample_stats): + """Return negative gradient norm as the attack score.""" + return sample_stats.cpu().to(torch.float32).numpy() diff --git a/src/evals/metrics/mia/loss.py b/src/evals/metrics/mia/loss.py new file mode 100644 index 0000000..bcfd204 --- /dev/null +++ b/src/evals/metrics/mia/loss.py @@ -0,0 +1,16 @@ +""" +Straight-forward LOSS attack, as described in https://ieeexplore.ieee.org/abstract/document/8429311 +""" + +from evals.metrics.mia.all_attacks import Attack +from evals.metrics.utils import evaluate_probability + + +class LOSSAttack(Attack): + def compute_batch_values(self, batch): + """Compute probabilities and losses for the batch.""" + return evaluate_probability(self.model, batch) + + def compute_score(self, sample_stats): + """Return the average loss for the sample.""" + return sample_stats["avg_loss"] diff --git a/src/evals/metrics/mia/min_k.py b/src/evals/metrics/mia/min_k.py new file mode 100644 index 0000000..8b8d4ec --- /dev/null +++ b/src/evals/metrics/mia/min_k.py @@ -0,0 +1,26 @@ +""" +Min-k % Prob Attack: https://arxiv.org/pdf/2310.16789.pdf +""" + +import numpy as np +from evals.metrics.mia.all_attacks import Attack +from evals.metrics.utils import tokenwise_logprobs + + +class MinKProbAttack(Attack): + def setup(self, k=0.2, **kwargs): + self.k = k + + def compute_batch_values(self, batch): + """Get token-wise log probabilities for the batch.""" + return tokenwise_logprobs(self.model, batch, grad=False) + + def compute_score(self, sample_stats): + """Score single sample using min-k negative log probs scores attack.""" + lp = sample_stats.cpu().numpy() + if lp.size == 0: + return 0 + + num_k = max(1, int(len(lp) * self.k)) + sorted_vals = np.sort(lp) + return -np.mean(sorted_vals[:num_k]) diff --git a/src/evals/metrics/mia/min_k_plus_plus.py b/src/evals/metrics/mia/min_k_plus_plus.py new file mode 100644 index 0000000..cfc85de --- /dev/null +++ b/src/evals/metrics/mia/min_k_plus_plus.py @@ -0,0 +1,39 @@ +import torch as torch +import numpy as np +from evals.metrics.mia.min_k import MinKProbAttack +from evals.metrics.utils import tokenwise_vocab_logprobs, tokenwise_logprobs + + +class MinKPlusPlusAttack(MinKProbAttack): + def compute_batch_values(self, batch): + """Get both token-wise and vocab-wise log probabilities for the batch.""" + vocab_log_probs = tokenwise_vocab_logprobs(self.model, batch, grad=False) + token_log_probs = tokenwise_logprobs(self.model, batch, grad=False) + return [ + {"vocab_log_probs": vlp, "token_log_probs": tlp} + for vlp, tlp in zip(vocab_log_probs, token_log_probs) + ] + + def compute_score(self, sample_stats): + """Score using min-k negative log probs scores with vocab-wise normalization.""" + all_probs = sample_stats["vocab_log_probs"] + target_prob = sample_stats["token_log_probs"] + + if len(target_prob) == 0: + return 0 + + # Compute normalized scores using vocab distribution + mu = (torch.exp(all_probs) * all_probs).sum(-1) + sigma = (torch.exp(all_probs) * torch.square(all_probs)).sum(-1) - 
torch.square( + mu + ) + + # Handle numerical stability + sigma = torch.clamp(sigma, min=1e-6) + scores = (target_prob.cpu().numpy() - mu.cpu().numpy()) / torch.sqrt( + sigma + ).cpu().numpy() + + # Take bottom k% as the attack score + num_k = max(1, int(len(scores) * self.k)) + return -np.mean(sorted(scores)[:num_k]) diff --git a/src/evals/metrics/mia/reference.py b/src/evals/metrics/mia/reference.py new file mode 100644 index 0000000..3faeb6d --- /dev/null +++ b/src/evals/metrics/mia/reference.py @@ -0,0 +1,25 @@ +""" +Reference-based attacks. +""" + +from evals.metrics.mia.all_attacks import Attack +from evals.metrics.utils import evaluate_probability + + +class ReferenceAttack(Attack): + def setup(self, reference_model, **kwargs): + """Setup reference model.""" + self.reference_model = reference_model + + def compute_batch_values(self, batch): + """Compute loss scores for both target and reference models.""" + ref_results = evaluate_probability(self.reference_model, batch) + target_results = evaluate_probability(self.model, batch) + return [ + {"target_loss": t["avg_loss"], "ref_loss": r["avg_loss"]} + for t, r in zip(target_results, ref_results) + ] + + def compute_score(self, sample_stats): + """Score using difference between target and reference model losses.""" + return sample_stats["target_loss"] - sample_stats["ref_loss"] diff --git a/src/evals/metrics/mia/utils.py b/src/evals/metrics/mia/utils.py new file mode 100644 index 0000000..15eb19b --- /dev/null +++ b/src/evals/metrics/mia/utils.py @@ -0,0 +1,70 @@ +from evals.metrics.mia.all_attacks import AllAttacks +from evals.metrics.mia.loss import LOSSAttack +from evals.metrics.mia.reference import ReferenceAttack +from evals.metrics.mia.zlib import ZLIBAttack +from evals.metrics.mia.min_k import MinKProbAttack +from evals.metrics.mia.min_k_plus_plus import MinKPlusPlusAttack +from evals.metrics.mia.gradnorm import GradNormAttack + +from sklearn.metrics import roc_auc_score + + +import numpy as np + + +def get_attacker(attack: str): + mapping = { + AllAttacks.LOSS: LOSSAttack, + AllAttacks.REFERENCE_BASED: ReferenceAttack, + AllAttacks.ZLIB: ZLIBAttack, + AllAttacks.MIN_K: MinKProbAttack, + AllAttacks.MIN_K_PLUS_PLUS: MinKPlusPlusAttack, + AllAttacks.GRADNORM: GradNormAttack, + } + attack_cls = mapping.get(attack, None) + if attack_cls is None: + raise ValueError(f"Attack {attack} not found") + return attack_cls + + +def mia_auc(attack_cls, model, data, collator, batch_size, **kwargs): + """ + Compute the MIA AUC and accuracy. + + Parameters: + - attack_cls: the attack class to use. + - model: the target model. + - data: a dict with keys "forget" and "holdout". + - collator: data collator. + - batch_size: batch size. + - kwargs: additional optional parameters (e.g. k, p, tokenizer, reference_model). + + Returns a dict containing the attack outputs, including "acc" and "auc". + + Note on convention: auc is 1 when the forget data is much more likely than the holdout data + """ + # Build attack arguments from common parameters and any extras. 
+ attack_args = { + "model": model, + "collator": collator, + "batch_size": batch_size, + } + attack_args.update(kwargs) + + output = { + "forget": attack_cls(data=data["forget"], **attack_args).attack(), + "holdout": attack_cls(data=data["holdout"], **attack_args).attack(), + } + forget_scores = [ + elem["score"] for elem in output["forget"]["value_by_index"].values() + ] + holdout_scores = [ + elem["score"] for elem in output["holdout"]["value_by_index"].values() + ] + scores = np.array(forget_scores + holdout_scores) + labels = np.array( + [0] * len(forget_scores) + [1] * len(holdout_scores) + ) # see note above + auc_value = roc_auc_score(labels, scores) + output["auc"], output["agg_value"] = auc_value, auc_value + return output diff --git a/src/evals/metrics/mia/zlib.py b/src/evals/metrics/mia/zlib.py new file mode 100644 index 0000000..5a8f7ba --- /dev/null +++ b/src/evals/metrics/mia/zlib.py @@ -0,0 +1,29 @@ +""" +zlib-normalization Attack: https://www.usenix.org/system/files/sec21-carlini-extracting.pdf +""" + +import zlib + +from evals.metrics.mia.all_attacks import Attack +from evals.metrics.utils import ( + evaluate_probability, + extract_target_texts_from_processed_data, +) + + +class ZLIBAttack(Attack): + def setup(self, tokenizer=None, **kwargs): + """Setup tokenizer.""" + self.tokenizer = tokenizer or self.model.tokenizer + + def compute_batch_values(self, batch): + """Get loss and text for batch.""" + eval_results = evaluate_probability(self.model, batch) + texts = extract_target_texts_from_processed_data(self.tokenizer, batch) + return [{"loss": r["avg_loss"], "text": t} for r, t in zip(eval_results, texts)] + + def compute_score(self, sample_stats): + """Score using loss normalized by compressed text length.""" + text = sample_stats["text"] + zlib_entropy = len(zlib.compress(text.encode("utf-8"))) + return sample_stats["loss"] / zlib_entropy diff --git a/src/evals/metrics/privacy.py b/src/evals/metrics/privacy.py index fcaab25..1d9bdfb 100644 --- a/src/evals/metrics/privacy.py +++ b/src/evals/metrics/privacy.py @@ -1,14 +1,12 @@ import numpy as np from scipy.stats import ks_2samp -from torch.utils.data import DataLoader -from sklearn.metrics import auc as get_auc, roc_curve as get_roc_curve - from evals.metrics.base import unlearning_metric, logger -from evals.metrics.utils import run_batchwise_evals, eval_minKpc_neg_logprob -@unlearning_metric(name="forget_quality") -def forget_quality(model, **kwargs): +@unlearning_metric(name="ks_test") +def ks_test(model, **kwargs): + """Compare two forget and retain model distributions with a 2-sample KS-test and report the p-value. 
+ Used in the TOFU benchmark as forget_quality when computed over the truth_ratio statistic.""" forget_tr_stats = np.array( [ evals["score"] @@ -17,12 +15,11 @@ def forget_quality(model, **kwargs): ) reference_logs = kwargs.get("reference_logs", None) if reference_logs: + reference_logs = reference_logs["retain_model_logs"] retain_tr_stats = np.array( [ evals["score"] - for evals in kwargs["reference_logs"]["retain_model_logs"]["retain"][ - "value_by_index" - ].values() + for evals in reference_logs["retain"]["value_by_index"].values() ] ) fq = ks_2samp(forget_tr_stats, retain_tr_stats) @@ -35,72 +32,35 @@ def forget_quality(model, **kwargs): return {"agg_value": pvalue} -@unlearning_metric(name="minKpc_negative_logprob") -def minKpc_negative_logprob(model, **kwargs): - """Compute the min-k percentile average of token-wise model probabilities by data points""" - data = kwargs["data"] - collator = kwargs["collators"] - batch_size = kwargs["batch_size"] - - dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collator) - - fun_args = {"percentile": kwargs["percentile_K"]} - return { - "value_by_index": run_batchwise_evals( - model, - dataloader, - eval_minKpc_neg_logprob, - fun_args, - "Calculating avg token-wise lowest K% percentile logprobs across batches", +@unlearning_metric(name="privleak") +def privleak(model, **kwargs): + """Compare two forget and retain model scores using a relative comparison of a single statistic. + To be used for MIA AUC scores in ensuring consistency and reproducibility of the MUSE benchmark. + This function is similar to the rel_diff function below, but due to the MUSE benchmark reporting AUC + scores as (1-x) when the more conventional way is x, we do adjustments here to our MIA AUC scores. + calculations in the reverse way,""" + score = kwargs["pre_compute"]["forget"]["agg_value"] + try: + ref = kwargs["reference_logs"]["retain_model_logs"]["retain"]["agg_value"] + except Exception as _: + logger.warning( + f"retain_model_logs evals not provided for privleak, using default retain auc of {kwargs['ref_value']}" ) - } - - -@unlearning_metric(name="relative_auc") -def relative_auc(model, **kwargs): - """Compute the auc score of an MIA attack wrt model scores on a victim and holdout set""" - - def sweep(ppl, y): - fpr, tpr, _ = get_roc_curve(y, -ppl) - acc = np.max(1 - (fpr + (1 - tpr)) / 2) - return fpr, tpr, get_auc(fpr, tpr), acc - - forget_scores = kwargs["pre_compute"]["forget"]["value_by_index"].values() - forget_scores = [elem["score"] for elem in forget_scores] - forget_holdout_scores = kwargs["pre_compute"]["holdout"]["value_by_index"].values() - forget_holdout_scores = [elem["score"] for elem in forget_holdout_scores] - scores = np.array(forget_scores + forget_holdout_scores) - # in MUSE the scores are -mean(min k% log-probs) for some reason so flip the 1 and 0 - labels = np.array([0] * len(forget_scores) + [1] * len(forget_holdout_scores)) + ref = kwargs["ref_value"] + score = 1 - score + ref = 1 - ref + return {"agg_value": (score - ref) / (ref + 1e-10) * 100} - _, _, auc_score, acc = sweep(scores, labels) - output = { - "acc": acc, - "auc": auc_score, - } - retain_auc_score = kwargs["ref_value"] - - reference_logs = kwargs.get("reference_logs", None) - if reference_logs: - retain_scores = reference_logs["retain_model_logs"]["retain"][ - "value_by_index" - ].values() - retain_scores = [elem["score"] for elem in retain_scores] - retain_holdout_scores = reference_logs["retain_model_logs"]["holdout"][ - "value_by_index" - ].values() - 
retain_holdout_scores = [elem["score"] for elem in retain_holdout_scores] - scores = np.array(retain_scores + retain_holdout_scores) - labels = np.array([0] * len(retain_scores) + [1] * len(retain_holdout_scores)) - _, _, retain_auc_score, retain_acc = sweep(scores, labels) - output.update({"retain_acc": retain_acc, "retain_auc_score": retain_auc_score}) - - output.update( - { - "agg_value": (auc_score - retain_auc_score) - / (retain_auc_score) - * 100 # privleak score in muse - } - ) - return output +@unlearning_metric(name="rel_diff") +def rel_diff(model, **kwargs): + """Compare two forget and retain model scores using a relative comparison of a single statistic.""" + score = kwargs["pre_compute"]["forget"]["agg_value"] + try: + ref = kwargs["reference_logs"]["retain_model_logs"]["retain"]["agg_value"] + except Exception as _: + logger.warning( + f"retain_model_logs evals not provided for privleak, using default retain auc of {kwargs['ref_value']}" + ) + ref = kwargs["ref_value"] + return {"agg_value": (score - ref) / (ref + 1e-10) * 100} diff --git a/src/evals/metrics/utils.py b/src/evals/metrics/utils.py index 2fdbe20..92c51bf 100644 --- a/src/evals/metrics/utils.py +++ b/src/evals/metrics/utils.py @@ -103,25 +103,35 @@ def evaluate_probability(model, batch): ] -def eval_minKpc_neg_logprob(model, batch, percentile): - """Compute minK% attack score for each sample in a batch.""" +def tokenwise_logprobs(model, batch, grad=False, return_labels=False): + """ + Compute token-wise next token prediction logprobs for all labeled tokens for each sample in a batch. + `grad` decides whether gradients are turned on + Returns + log_probs_batch (List[Tensor]): Tensors of size seq_len where seq_len is length of labeled tokens + labels_batch (List[Tensor]): List of tensors of length N. Returned only if return_labels is True + """ batch = {k: v.to(model.device) for k, v in batch.items()} - with torch.no_grad(): + + model.train(mode=grad) + with torch.set_grad_enabled(grad): output = model(**batch) + logits = output.logits bsz, seq_len, V = logits.shape log_probs = torch.nn.functional.log_softmax(logits, dim=-1)[:, :-1, :] # ^ we don't predict next token for last token, bsz x seq_len-1 x V next_tokens = batch["input_ids"][:, 1:].unsqueeze(-1) # bsz x seq_len-1 x 1 target_log_probs = torch.gather(log_probs, dim=2, index=next_tokens).squeeze(-1) - mink_means = [] + log_probs_batch = [] + labels_batch = [] for i in range(bsz): labels = batch["labels"][i][:-1] # only focus on tokens which have loss on them (i.e. 
used in labels) actual_indices = (labels != IGNORE_INDEX).nonzero(as_tuple=True)[0] num_actual_tokens = actual_indices.numel() if num_actual_tokens == 0: - mink_means.append(0) + log_probs_batch.append(torch.tensor([0.0], device=labels.device)) continue start_idx, end_idx = actual_indices[0].item(), actual_indices[-1].item() if start_idx == 0: @@ -129,14 +139,52 @@ def eval_minKpc_neg_logprob(model, batch, percentile): "Index 0 in a datapoint's input_ids must not have loss (unignored labels) on it", UserWarning, ) - actual_seq_log_probs = ( - target_log_probs[i, start_idx - 1 : end_idx].cpu().numpy() - ) - sorted_probs = np.sort(actual_seq_log_probs) - top_k = max(1, int(percentile / 100 * len(actual_seq_log_probs))) - mink_mean = -1 * np.mean(sorted_probs[:top_k]) - mink_means.append(mink_mean) - return [{"score": float(neglogprob)} for neglogprob in mink_means] + log_probs_batch.append(target_log_probs[i, start_idx - 1 : end_idx]) + labels_batch.append(labels[actual_indices]) + + return (log_probs_batch, labels_batch) if return_labels else log_probs_batch + + +def tokenwise_vocab_logprobs(model, batch, grad=False, return_labels=False): + """Get vocabulary-wise log probabilities for each token in the sequence. + + Returns: + log_probs_batch (List[Tensor]): List of tensors of shape (N, V) containing log probabilities + for each sequence, where N is the length of labeled tokens and V is vocab size. + labels_batch (List[Tensor]): List of tensors of length N. Returned only if return_labels is True + """ + batch = {k: v.to(model.device) for k, v in batch.items()} + model.train(mode=grad) + with torch.set_grad_enabled(grad): + output = model(**batch) + + logits = output.logits + bsz, seq_len, V = logits.shape + log_probs = torch.nn.functional.log_softmax(logits, dim=-1)[ + :, :-1, : + ] # Don't predict for last token + + # Process each sequence in batch separately + log_probs_batch = [] + labels_batch = [] + for i in range(bsz): + labels = batch["labels"][i][:-1] + # Only include positions that have labels + actual_indices = (labels != IGNORE_INDEX).nonzero(as_tuple=True)[0] + if len(actual_indices) == 0: + log_probs_batch.append(torch.zeros(1, V, device=labels.device)) + continue + start_idx, end_idx = actual_indices[0].item(), actual_indices[-1].item() + if start_idx == 0: + warnings.warn( + "Index 0 in a datapoint's input_ids must not have loss (unignored labels) on it", + UserWarning, + ) + # Return full distribution for each position: shape (N, V) + log_probs_batch.append(log_probs[i, start_idx - 1 : end_idx]) + labels_batch.append(labels[actual_indices]) + + return (log_probs_batch, labels_batch) if return_labels else log_probs_batch class MultiTokenEOSCriteria(StoppingCriteria): @@ -277,3 +325,13 @@ def eval_rouge_recall_batch(gen_outputs, ground_truths): ) ] return scores + + +def extract_target_texts_from_processed_data(tokenizer, batch): + """Extract and detokenize text from activated positions in the batch.""" + labels = batch["labels"] + labels = [elem[elem != -100] for elem in labels] + texts = [ + tokenizer.decode(elem.tolist(), skip_special_tokens=True) for elem in labels + ] + return texts diff --git a/src/train.py b/src/train.py index 989e9cf..a9048e3 100644 --- a/src/train.py +++ b/src/train.py @@ -4,6 +4,7 @@ from model import get_model from trainer import load_trainer from evals import get_evaluator +from trainer.utils import seed_everything @hydra.main(version_base=None, config_path="../configs", config_name="train.yaml") @@ -12,6 +13,7 @@ def main(cfg: DictConfig): Args: 
cfg (DictConfig): Config to train """ + seed_everything(cfg.trainer.args.seed) mode = cfg.get("mode", "train") model_cfg = cfg.model template_args = model_cfg.template_args diff --git a/src/trainer/utils.py b/src/trainer/utils.py index c5125b7..dfb6876 100644 --- a/src/trainer/utils.py +++ b/src/trainer/utils.py @@ -1,8 +1,19 @@ import torch +import random +import numpy as np from torch import nn import torch.nn.functional as F +def seed_everything(seed=42): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + def compute_kl_divergence(model, target_model, inputs): with torch.no_grad(): ref_outputs = target_model(**inputs) From 3e4200304e42227af636543ab34d8a4ad7525c5a Mon Sep 17 00:00:00 2001 From: Anmol Mekala <49127549+molereddy@users.noreply.github.com> Date: Sun, 6 Apr 2025 22:50:25 -0400 Subject: [PATCH 10/10] Update README.md (describe new updates) --- README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 77f2cd3..de808d9 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ ## πŸ“– Overview -We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 6+ evaluation metrics, and 6+ LLM architectures. Each of these can be easily extended to incorporate more variants. +We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 9+ evaluation metrics, and 6+ LLM architectures. Each of these can be easily extended to incorporate more variants. We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field. @@ -30,18 +30,19 @@ We invite the LLM unlearning community to collaborate by adding new benchmarks, ### πŸ“’ Updates #### [Apr 6, 2025] -- **Metrics**: Added 6 Membership Inference Attacks (MIA)β€”LOSS, ZLib, Reference, GradNorm, MinK, and MinK++β€”along with ES and EM as additional evaluation metrics. -- **TOFU Benchmark**: Now includes a holdout set and supports MIA attack-based evaluation. You can now compute MUSE's privleak on TOFU. - -#### [Mar 27, 2025] -- **Easier contributions, leaderboard and reproducibility**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details. +- **More Metrics!** Added 6 Membership Inference Attacks (MIA) (LOSS, ZLib, Reference, GradNorm, MinK, and MinK++), along with Extraction Strength (ES) and Exact Memorization (EM) as additional evaluation metrics. +- **More TOFU Evaluations!** Now includes a holdout set and supports MIA attack-based evaluation. You can now compute MUSE's privleak on TOFU. +- **More Documentation!** [`docs/links.md`](docs/links.md) contains resources for each of the implemented features and other useful LLM unlearning resources.
Older Updates +#### [Mar 27, 2025] +- **More Documentation: easy contributions and the leaderboard functionality**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details. + #### [Mar 9, 2025] -- **Unlearning Methods**: Added support for [RMU](https://arxiv.org/abs/2403.03218) (representation-engineering based unlearning). +- **More Methods!** Added support for [RMU](https://arxiv.org/abs/2403.03218) (representation-engineering based unlearning). #### [Feb 27, 2025] ⚠️ **Repository Update**: This repo replaces the original TOFU codebase at [`github.com/locuslab/tofu`](https://github.com/locuslab/tofu), which is no longer maintained. @@ -59,7 +60,7 @@ We provide several variants for each of the components in the unlearning pipelin |------------------------|----------------------| | **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/) | | **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU | -| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, QA-ROUGE, 6 MIA Attacks, TruthRatio, Model Utility | +| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks | | **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) | | **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma |
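The components table above now lists Extraction Strength (ES) and Exact Memorization (EM). As implemented earlier in this patch series (`exact_memorization` and `extraction_strength` in `src/evals/metrics/memorization.py`), both are computed per example from teacher-forced greedy predictions, not free-running generations. Informally, with labels y and argmax predictions Ε· over the N labeled tokens:

```math
\mathrm{EM} = \frac{1}{N}\sum_{t=1}^{N} \mathbf{1}\!\left[\hat{y}_t = y_t\right],
\qquad
k^{*} = \min\{\, k \ge 0 : \hat{y}_{k+1:N} = y_{k+1:N} \,\},
\qquad
\mathrm{ES} = 1 - \frac{k^{*}}{N}
```

That is, EM is the fraction of label tokens the model predicts exactly, and ES measures how small a prefix suffices before the model reproduces the rest of the sequence verbatim.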