diff --git a/README.md b/README.md index ab03fac..48a7168 100644 --- a/README.md +++ b/README.md @@ -26,15 +26,20 @@ We invite the LLM unlearning community to collaborate by adding new benchmarks, ### 📢 Updates +#### [May 12, 2025] + +- **Another benchmark!** We now support running the [`WMDP`](https://wmdp.ai/) benchmark with its `Zephyr` task model. +- **More evaluations!** The [`lm-evaluation-harness`](https://github.com/EleutherAI/lm-evaluation-harness) toolkit has been integrated into OpenUnlearning, enabling WMDP evaluations and support for popular general LLM benchmarks, including MMLU, GSM8K, and others. + +
+<details>
+<summary>Older Updates</summary>

 #### [Apr 6, 2025]
-🚨🚨 **IMPORTANT:** 🚨🚨 Be sure to run `python setup_data.py` immediately after merging the latest version. This is required to refresh the downloaded eval log files and ensure they're compatible with the latest evaluation metrics.
- **More Metrics!** Added 6 Membership Inference Attacks (MIA) (LOSS, ZLib, Reference, GradNorm, MinK, and MinK++), along with Extraction Strength (ES) and Exact Memorization (EM) as additional evaluation metrics.
- **More TOFU Evaluations!** Now includes a holdout set and supports MIA attack-based evaluation. You can now compute MUSE's privleak on TOFU.
- **More Documentation!** [`docs/links.md`](docs/links.md) contains resources for each of the implemented features and other useful LLM unlearning resources.
-
-<details>
-<summary>Older Updates</summary>
+Be sure to run `python setup_data.py --eval` immediately after merging the latest version. This is required to refresh the downloaded eval log files and ensure they're compatible with the latest evaluation metrics.

 #### [Mar 27, 2025]
- **More Documentation: easy contributions and the leaderboard functionality**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details.
@@ -56,11 +61,11 @@ We provide several variants for each of the components in the unlearning pipelin

 | **Component** | **Available Options** |
|------------------------|----------------------|
-| **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/) |
+| **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/), [WMDP](https://www.wmdp.ai/) |
| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU |
-| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks |
+| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) |
| **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) |
-| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma |
+| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma, Zephyr |

 ---

@@ -89,13 +94,15 @@ We provide several variants for each of the components in the unlearning pipelin
 # Environment setup
 conda create -n unlearning python=3.11
 conda activate unlearning
-pip install .
+pip install .[lm_eval]
 pip install --no-build-isolation flash-attn==2.6.3

 # Data setup
-python setup_data.py # saves/eval now contains evaluation results of the uploaded models
-# Downloads log files with metric eval results (incl retain model logs) from the models
-# used in the supported benchmarks.
+python setup_data.py --eval # saves/eval now contains evaluation results of the uploaded models
+# This downloads log files with evaluation results (including retain model logs)
+# into `saves/eval`, used for evaluating unlearning across supported benchmarks.
+# Additional datasets (e.g., WMDP) are supported — run below for options:
+# python setup_data.py --help
```

---

@@ -202,14 +209,13 @@ If you use OpenUnlearning in your research, please cite OpenUnlearning and the b
 booktitle={First Conference on Language Modeling},
 year={2024}
}
-@article{shi2024muse,
-  title={MUSE: Machine Unlearning Six-Way Evaluation for Language Models},
+@inproceedings{
+ shi2025muse,
+ title={{MUSE}: Machine Unlearning Six-Way Evaluation for Language Models},
 author={Weijia Shi and Jaechan Lee and Yangsibo Huang and Sadhika Malladi and Jieyu Zhao and Ari Holtzman and Daogao Liu and Luke Zettlemoyer and Noah A. 
Smith and Chiyuan Zhang}, - year={2024}, - eprint={2407.06460}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2407.06460}, + booktitle={The Thirteenth International Conference on Learning Representations}, + year={2025}, + url={https://openreview.net/forum?id=TArmA033BU} } ```
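For reviewers who want to exercise the new WMDP path end to end, a minimal sketch follows. Only `setup_data.py --wmdp`, the `unlearn/wmdp/default` and `eval/wmdp/default` experiment configs, and the `data_split` override come from this PR; the entry points (`src/train.py --config-name=unlearn.yaml`, `src/eval.py --config-name=eval.yaml`), the `model.model_args.pretrained_model_name_or_path` override, and the `task_name`/checkpoint values are assumed from the repo's existing conventions and are purely illustrative.

```bash
# Fetch the password-protected WMDP corpora into data/wmdp (flag added in this PR)
python setup_data.py --wmdp

# Unlearn the cyber split from Zephyr-7B-beta with RMU; the experiment config schedules
# lm-eval (wmdp_cyber + mmlu) at each eval step. task_name is a placeholder.
python src/train.py --config-name=unlearn.yaml \
  experiment=unlearn/wmdp/default \
  data_split=cyber \
  task_name=zephyr_rmu_wmdp_cyber

# Re-score a saved checkpoint with the lm-eval harness only (path is a placeholder)
python src/eval.py --config-name=eval.yaml \
  experiment=eval/wmdp/default \
  model.model_args.pretrained_model_name_or_path=saves/unlearn/zephyr_rmu_wmdp_cyber \
  task_name=zephyr_rmu_wmdp_cyber_eval
```

Because the unlearning experiment config routes `wmdp_${data_split}` and `mmlu` through the new `LMEvalEvaluator` during training, forgetting and general capability are tracked together in the summary logs.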
diff --git a/configs/data/datasets/WMDP_forget.yaml b/configs/data/datasets/WMDP_forget.yaml new file mode 100644 index 0000000..4bd260f --- /dev/null +++ b/configs/data/datasets/WMDP_forget.yaml @@ -0,0 +1,9 @@ +WMDP_forget: + handler: PretrainingDataset + args: + hf_args: + path: "text" + data_files: "data/wmdp/wmdp-corpora/cyber-forget-corpus.jsonl" + split: "train" + text_key: "text" + max_length: 512 \ No newline at end of file diff --git a/configs/data/datasets/WMDP_retain.yaml b/configs/data/datasets/WMDP_retain.yaml new file mode 100644 index 0000000..53d87a7 --- /dev/null +++ b/configs/data/datasets/WMDP_retain.yaml @@ -0,0 +1,9 @@ +WMDP_retain: + handler: PretrainingDataset + args: + hf_args: + path: "text" + data_files: "data/wmdp/wmdp-corpora/cyber-retain-corpus.jsonl" + split: "train" + text_key: "text" + max_length: 512 \ No newline at end of file diff --git a/configs/eval/lm_eval.yaml b/configs/eval/lm_eval.yaml new file mode 100644 index 0000000..f67487d --- /dev/null +++ b/configs/eval/lm_eval.yaml @@ -0,0 +1,20 @@ +# @package eval.lm_eval +# NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/ + +handler: LMEvalEvaluator +output_dir: ${paths.output_dir} # set to default eval directory +overwrite: false + +# Define evaluation tasks here +tasks: + - mmlu + # - task: gsm8k + # dataset_path: gsm8k + # # define the entire task config. + # # ^ Example: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml + + +simple_evaluate_args: + batch_size: 16 + system_instruction: null + apply_chat_template: false \ No newline at end of file diff --git a/configs/eval/muse.yaml b/configs/eval/muse.yaml index ecdd98f..0b30777 100644 --- a/configs/eval/muse.yaml +++ b/configs/eval/muse.yaml @@ -15,6 +15,7 @@ defaults: # - mia_reference # - mia_zlib # - mia_gradnorm + # - forget_gibberish handler: MUSEEvaluator output_dir: ${paths.output_dir} # set to default eval directory diff --git a/configs/eval/muse_metrics/forget_gibberish.yaml b/configs/eval/muse_metrics/forget_gibberish.yaml new file mode 100644 index 0000000..a4163d0 --- /dev/null +++ b/configs/eval/muse_metrics/forget_gibberish.yaml @@ -0,0 +1,20 @@ +# @package eval.muse.metrics.forget_gibberish +defaults: + - .@pre_compute.forget_verbmem_ROUGE: forget_verbmem_ROUGE + +pre_compute: + forget_verbmem_ROUGE: + access_key: text + +handler: classifier_prob +batch_size: 32 +max_length: 512 +class_id: 0 +text_key: generation +device: cuda + +classifier_model_args: + pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" + +classifier_tokenization_args: + pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" diff --git a/configs/eval/tofu.yaml b/configs/eval/tofu.yaml index bbfea26..984f2c9 100644 --- a/configs/eval/tofu.yaml +++ b/configs/eval/tofu.yaml @@ -17,6 +17,7 @@ defaults: # include all defined metrics files # - mia_zlib # - mia_gradnorm # - mia_reference # set reference model path appropriately + # - forget_Q_A_gibberish handler: TOFUEvaluator output_dir: ${paths.output_dir} # set to default eval directory diff --git a/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml b/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml new file mode 100644 index 0000000..86c661f --- /dev/null +++ b/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml @@ -0,0 +1,20 @@ +# @package eval.tofu.metrics.forget_Q_A_gibberish +defaults: + - 
.@pre_compute.forget_Q_A_ROUGE: forget_Q_A_ROUGE + +pre_compute: + forget_Q_A_ROUGE: + access_key: text + +handler: classifier_prob +batch_size: 32 +max_length: 512 +class_id: 0 +text_key: generation +device: cuda + +classifier_model_args: + pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" + +classifier_tokenization_args: + pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" diff --git a/configs/experiment/eval/wmdp/default.yaml b/configs/experiment/eval/wmdp/default.yaml new file mode 100644 index 0000000..983ad28 --- /dev/null +++ b/configs/experiment/eval/wmdp/default.yaml @@ -0,0 +1,15 @@ +# @package _global_ + +defaults: + - override /model: zephyr-7b-beta + - override /eval: lm_eval + +data_split: cyber + +eval: + lm_eval: + tasks: + - wmdp_${data_split} + - mmlu + +task_name: ??? \ No newline at end of file diff --git a/configs/experiment/unlearn/wmdp/default.yaml b/configs/experiment/unlearn/wmdp/default.yaml new file mode 100644 index 0000000..8b126ac --- /dev/null +++ b/configs/experiment/unlearn/wmdp/default.yaml @@ -0,0 +1,58 @@ +# @package _global_ + +defaults: + - override /model: zephyr-7b-beta + - override /trainer: RMU + - override /data: unlearn + - override /data/datasets@data.forget: WMDP_forget + - override /data/datasets@data.retain: WMDP_retain + - override /eval: lm_eval + +data_split: cyber + +data: + anchor: forget + forget: + WMDP_forget: + args: + hf_args: + data_files: data/wmdp/wmdp-corpora/${data_split}-forget-corpus.jsonl + retain: + WMDP_retain: + args: + hf_args: + data_files: data/wmdp/wmdp-corpora/${data_split}-retain-corpus.jsonl + +eval: + lm_eval: + tasks: + - wmdp_${data_split} + - mmlu + + +collator: + DataCollatorForSupervisedDataset: + args: + padding_side: left # Usually left but for mistral and zephyr its right (https://github.com/hongshi97/CAD/issues/2) + +trainer: + args: + per_device_train_batch_size: 1 + gradient_accumulation_steps: 16 + learning_rate: 5e-5 + eval_strategy: steps + eval_steps: 0.5 + max_steps: 80 + lr_scheduler_type: constant + + method_args: + # The params here are more dependent on model and dataset. Tune them carefully to work + gamma: 1.0 + steering_coeff: 2 + retain_loss_type: EMBED_DIFF + alpha: 1 + module_regex: model\.layers\.7 + trainable_params_regex: + - model\.layers\.(5|6|7)\.mlp\.down_proj\.weight # If you want to update only these weights (as done in https://github.com/centerforaisafety/wmdp/blob/bc5e1ba0367ea826caeeeaa50656336a1e87acfb/rmu/unlearn.py#L26) + +task_name: ??? \ No newline at end of file diff --git a/configs/model/zephyr-7b-beta.yaml b/configs/model/zephyr-7b-beta.yaml new file mode 100644 index 0000000..24f6cef --- /dev/null +++ b/configs/model/zephyr-7b-beta.yaml @@ -0,0 +1,15 @@ +model_args: + pretrained_model_name_or_path: "HuggingFaceH4/zephyr-7b-beta" + attn_implementation: 'flash_attention_2' + torch_dtype: bfloat16 +tokenizer_args: + pretrained_model_name_or_path: "HuggingFaceH4/zephyr-7b-beta" +template_args: + apply_chat_template: True + system_prompt: You are a helpful assistant. 
+ system_prompt_with_special_tokens: "<|system|>\nYou are a helpful assistant.\n" + user_start_tag: "<|user|>\n" + user_end_tag: "" + asst_start_tag: "<|assistant|>\n" + asst_end_tag: "" + date_string: 10 Apr 2025 \ No newline at end of file diff --git a/docs/evaluation.md b/docs/evaluation.md index 4e61130..d460c6e 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -240,3 +240,33 @@ metrics: {} # lists a mapping from each evaluation metric listed above to its co output_dir: ${paths.output_dir} # set to default eval directory forget_split: forget10 ``` + +## lm-evaluation-harness + +To evaluate model capabilities after unlearning, we support running [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) using our custom evaluator: [LMEvalEvaluator](../src/evals/lm_eval.py). +All evaluation tasks should be defined under the `tasks` in [lm_eval.yaml](../configs/eval/lm_eval.yaml) + +```yaml +# @package eval.lm_eval +# NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/ + +handler: LMEvalEvaluator +output_dir: ${paths.output_dir} # set to default eval directory +overwrite: false + +# Define evaluation tasks here +tasks: + - mmlu + - wmdp_cyber + - task: gsm8k + dataset_path: gsm8k + # define the entire task config. + # ^ Example: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml + + + +simple_evaluate_args: + batch_size: 16 + system_instruction: null + apply_chat_template: false +``` diff --git a/docs/links.md b/docs/links.md index 9a651a1..c41f325 100644 --- a/docs/links.md +++ b/docs/links.md @@ -5,12 +5,14 @@ Links to research papers and resources corresponding to implemented features in --- ## 📌 Table of Contents -- [Implemented Methods](#implemented-methods) -- [Benchmarks](#benchmarks) -- [Evaluation Metrics](#evaluation-metrics) -- [Useful Links](#useful-links) - - [Survey Papers](#survey-papers) - - [Other GitHub Repositories](#other-github-repositories) +- [🔗 Links and References](#-links-and-references) + - [📌 Table of Contents](#-table-of-contents) + - [📗 Implemented Methods](#-implemented-methods) + - [📘 Benchmarks](#-benchmarks) + - [📙 Evaluation Metrics](#-evaluation-metrics) + - [🌐 Useful Links](#-useful-links) + - [📚 Surveys](#-surveys) + - [🐙 Other GitHub Repositories](#-other-github-repositories) --- @@ -32,6 +34,7 @@ Links to research papers and resources corresponding to implemented features in |-----------|----------| | TOFU | Paper [📄](https://arxiv.org/abs/2401.06121) | | MUSE | Paper [📄](https://arxiv.org/abs/2407.06460) | +| WMDP | Paper [📄](https://arxiv.org/abs/2403.03218) | --- @@ -45,6 +48,7 @@ Links to research papers and resources corresponding to implemented features in | Forget Quality, Truth Ratio, Model Utility | TOFU ([📄](https://arxiv.org/abs/2401.06121)) | | Extraction Strength (ES) | Carlini et al., 2021 ([📄](https://www.usenix.org/conference/usenixsecurity21/presentation/carlini-extracting)), used for unlearning in Wang et al., 2025 ([📄](https://openreview.net/pdf?id=wUtCieKuQU)) | | Exact Memorization (EM) | Tirumala et al., 2022 ([📄](https://proceedings.neurips.cc/paper_files/paper/2022/hash/fa0509f4dab6807e2cb465715bf2d249-Abstract-Conference.html)), used for unlearning in Wang et al., 2025 ([📄](https://openreview.net/pdf?id=wUtCieKuQU)) | +| lm-evaluation-harness | [💻](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) | --- diff --git a/setup.py b/setup.py index 
b02a348..6a5c99c 100644
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,9 @@
     packages=find_packages(),
     install_requires=requirements,  # Uses requirements.txt
     extras_require={
+        "lm-eval": [
+            "lm-eval==0.4.8",
+        ],  # Install using `pip install .[lm-eval]`
         "dev": [
             "pre-commit==4.0.1",
             "ruff==0.6.9",
diff --git a/setup_data.py b/setup_data.py
index 760679b..a137394 100644
--- a/setup_data.py
+++ b/setup_data.py
@@ -1,17 +1,64 @@
+import argparse
+import os
+import subprocess
 from huggingface_hub import snapshot_download
 
-# Setup retain model metrics
-snapshot_download(
-    repo_id="open-unlearning/eval",
-    allow_patterns="*.json",
-    repo_type="dataset",
-    local_dir="saves/eval",
-)
-
-# Setup data
-snapshot_download(
-    repo_id="open-unlearning/idk",
-    allow_patterns="*.jsonl",
-    repo_type="dataset",
-    local_dir="data",
-)
+
+def download_eval_data():
+    snapshot_download(
+        repo_id="open-unlearning/eval",
+        allow_patterns="*.json",
+        repo_type="dataset",
+        local_dir="saves/eval",
+    )
+
+
+def download_idk_data():
+    snapshot_download(
+        repo_id="open-unlearning/idk",
+        allow_patterns="*.jsonl",
+        repo_type="dataset",
+        local_dir="data",
+    )
+
+
+def download_wmdp():
+    url = "https://cais-wmdp.s3.us-west-1.amazonaws.com/wmdp-corpora.zip"
+    dest_dir = "data/wmdp"
+    zip_path = os.path.join(dest_dir, "wmdp-corpora.zip")
+
+    os.makedirs(dest_dir, exist_ok=True)
+    subprocess.run(["wget", url, "-O", zip_path], check=True)
+    subprocess.run(["unzip", "-P", "wmdpcorpora", zip_path, "-d", dest_dir], check=True)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Download and set up evaluation data.")
+    parser.add_argument(
+        "--eval_logs",
+        action="store_true",
+        help="Download the eval logs of the TOFU and MUSE retain and finetuned models and save them in saves/eval",
+    )
+    parser.add_argument(
+        "--idk",
+        action="store_true",
+        help="Download the idk dataset from the HF hub and store it at data/idk.jsonl",
+    )
+    parser.add_argument(
+        "--wmdp",
+        action="store_true",
+        help="Download and unzip the WMDP corpora into data/wmdp",
+    )
+
+    args = parser.parse_args()
+
+    if args.eval_logs:
+        download_eval_data()
+    if args.idk:
+        download_idk_data()
+    if args.wmdp:
+        download_wmdp()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/evals/__init__.py b/src/evals/__init__.py
index f5b323c..5ab4f60 100644
--- a/src/evals/__init__.py
+++ b/src/evals/__init__.py
@@ -2,6 +2,7 @@
 from omegaconf import DictConfig
 from evals.tofu import TOFUEvaluator
 from evals.muse import MUSEEvaluator
+from evals.lm_eval import LMEvalEvaluator
 
 EVALUATOR_REGISTRY: Dict[str, Any] = {}
 
@@ -31,3 +32,4 @@ def get_evaluators(eval_cfgs: DictConfig, **kwargs):
 # Register Your benchmark evaluators
 _register_evaluator(TOFUEvaluator)
 _register_evaluator(MUSEEvaluator)
+_register_evaluator(LMEvalEvaluator)
diff --git a/src/evals/base.py b/src/evals/base.py
index 3beb68a..8811467 100644
--- a/src/evals/base.py
+++ b/src/evals/base.py
@@ -109,4 +109,4 @@ def evaluate(self, model, output_dir=None, overwrite=None, **kwargs):
         self.save_logs(logs, logs_file_path)
         self.save_logs(self.summarize(logs), summary_file_path)
 
-        return logs
+        return self.summarize(logs)
diff --git a/src/evals/lm_eval.py b/src/evals/lm_eval.py
new file mode 100644
index 0000000..43080e4
--- /dev/null
+++ b/src/evals/lm_eval.py
@@ -0,0 +1,115 @@
+import logging
+from omegaconf import OmegaConf
+
+from lm_eval.models.huggingface import HFLM
+from lm_eval.tasks import TaskManager
+from lm_eval import simple_evaluate
+
+from evals.base import Evaluator
+
+
+logger = 
logging.getLogger("evaluator") + + +class LMEvalEvaluator(Evaluator): + def __init__(self, eval_cfg, **kwargs): + self.name = "LMEval" + self.eval_cfg = eval_cfg + self.tasks = OmegaConf.to_container( + self.eval_cfg.tasks, resolve=True, throw_on_missing=True + ) + self.task_manager = TaskManager() + self.simple_evaluate_args = dict(kwargs.get("simple_evaluate_args", {})) + + def prepare_model(self, model, **kwargs): + """Prepare model for evaluation""" + model.eval() + return HFLM(model) + + def summarize(self, eval_results: dict, task_name: str) -> dict: + """ + Summarize evaluation metrics from lm_eval.simple_evaluate. + - If task_name is a group, return only aggregated group-level metrics. + - If it's a single task, return per-task metrics from 'results'. + - Always exclude 'alias' entries and strip ',none' suffixes. + """ + summary = {} + + def clean_metric_key(prefix: str, metric_name: str) -> str | None: + if metric_name == "alias": + return None + base = metric_name.split(",", 1)[0].strip() + return f"{prefix}/{base}" + + # Check if task is a group (e.g., 'mmlu') + if task_name in self.task_manager.all_groups: + group_metrics = eval_results.get("groups", {}).get(task_name, {}) + for metric_name, value in group_metrics.items(): + key = clean_metric_key(task_name, metric_name) + if key is None: + continue + try: + summary[key] = float(value) + except (TypeError, ValueError): + summary[key] = value + else: + task_metrics = eval_results.get("results", {}).get(task_name, {}) + for metric_name, value in task_metrics.items(): + key = clean_metric_key(task_name, metric_name) + if key is None: + continue + try: + summary[key] = float(value) + except (TypeError, ValueError): + summary[key] = value + + return summary + + def get_task_name(self, task): + if isinstance(task, str): + return task + elif isinstance(task, dict): + if "task" in task: + return task.get("task") + raise ValueError(f"Invalid task format: {task}") + + def evaluate(self, model, output_dir=None, overwrite=None, **kwargs): + # set flag to overwrite metrics + overwrite = self.eval_cfg.overwrite if overwrite is None else overwrite + + # Prepare model for evaluation + kwargs = {"tokenizer": kwargs.get("tokenizer", None)} + model = self.prepare_model(model, **kwargs) + + # Set output_dir and file to store results + output_dir = output_dir if output_dir else self.eval_cfg.output_dir + logs_file_path = self.get_logs_file_path(output_dir) + summary_file_path = self.get_logs_file_path(output_dir, suffix="SUMMARY") + + # Load existing results from file if any. 
+ logs = self.load_logs_from_file(logs_file_path) if not overwrite else {} + summary = self.load_logs_from_file(summary_file_path) if not overwrite else {} + + logger.info(f"***** Running {self.name} evaluation suite *****") + logger.info(f"Fine-grained evaluations will be saved to: {logs_file_path}") + logger.info( + f"Aggregated evaluations will be summarised in: {summary_file_path}" + ) + + for task in self.tasks: + task_name = self.get_task_name(task) + if not overwrite and task_name in logs and logs[task_name]: + logger.info(f"Skipping {task_name}, already evaluated.") + continue + _ = logs.pop(task_name, None) # overwriting existing evals if present + results = simple_evaluate( + model=model, + tasks=[task], + task_manager=self.task_manager, + **self.simple_evaluate_args, + ) + logs.update({task_name: results["samples"]}) + summary.update(self.summarize(results, task_name)) + self.save_logs(logs, logs_file_path) + self.save_logs(summary, summary_file_path) + return summary diff --git a/src/evals/metrics/__init__.py b/src/evals/metrics/__init__.py index 9441c8d..5afb042 100644 --- a/src/evals/metrics/__init__.py +++ b/src/evals/metrics/__init__.py @@ -6,7 +6,6 @@ probability_w_options, rouge, truth_ratio, - hm_aggregate, extraction_strength, exact_memorization, ) @@ -19,6 +18,10 @@ mia_zlib, mia_reference, ) +from evals.metrics.utility import ( + hm_aggregate, + classifier_prob, +) METRICS_REGISTRY: Dict[str, UnlearningMetric] = {} @@ -67,3 +70,6 @@ def get_metrics(metric_cfgs: DictConfig, **kwargs): _register_metric(mia_gradnorm) _register_metric(mia_zlib) _register_metric(mia_reference) + +# Register Utility metrics +_register_metric(classifier_prob) diff --git a/src/evals/metrics/memorization.py b/src/evals/metrics/memorization.py index 646eab7..d033c1e 100644 --- a/src/evals/metrics/memorization.py +++ b/src/evals/metrics/memorization.py @@ -1,10 +1,8 @@ import logging import torch import numpy as np -import scipy as sc from torch.utils.data import DataLoader - from evals.metrics.utils import ( aggregate_to_1D, evaluate_probability, @@ -164,12 +162,6 @@ def true_better(arr): return {"agg_value": forget_tr_avg, "value_by_index": value_by_index} -@unlearning_metric(name="hm_aggregate") -def hm_aggregate(model, **kwargs): - values = [result["agg_value"] for _, result in kwargs["pre_compute"].items()] - return {"agg_value": sc.stats.hmean(values)} - - @unlearning_metric(name="exact_memorization") def exact_memorization(model, **kwargs): data = kwargs["data"] diff --git a/src/evals/metrics/utility.py b/src/evals/metrics/utility.py new file mode 100644 index 0000000..7f47e9c --- /dev/null +++ b/src/evals/metrics/utility.py @@ -0,0 +1,76 @@ +import torch +import numpy as np +import scipy as sc +from tqdm import tqdm +import torch.nn.functional as F +from torch.utils.data import DataLoader +from transformers import AutoTokenizer, AutoModelForSequenceClassification + +from evals.metrics.utils import aggregate_to_1D +from evals.metrics.base import unlearning_metric + + +@unlearning_metric(name="hm_aggregate") +def hm_aggregate(model, **kwargs): + values = [result["agg_value"] for _, result in kwargs["pre_compute"].items()] + return {"agg_value": sc.stats.hmean(values)} + + +@unlearning_metric(name="classifier_prob") +def classifier_prob(model, **kwargs): + batch_size = kwargs.get("batch_size", 32) + max_length = kwargs.get("max_length", 512) + class_id = kwargs.get("class_id", 0) + text_key = kwargs.get("text_key", "generation") + classifier_model_args = kwargs["classifier_model_args"] + 
classifier_tokenization_args = kwargs["classifier_tokenization_args"] + device = kwargs.get("device", "cuda") + + tokenizer = AutoTokenizer.from_pretrained(**classifier_tokenization_args) + classifier = AutoModelForSequenceClassification.from_pretrained( + **classifier_model_args + ).to(device) + + data = kwargs["pre_compute"]["text"]["value_by_index"] + data_list = [ + {"text": entry[text_key], "index": int(key)} for key, entry in data.items() + ] + + # Create DataLoader + dataloader = DataLoader(data_list, batch_size=batch_size, shuffle=False) + + scores_by_index = {} + for batch in tqdm(dataloader): + batch_texts = batch["text"] + batch_indices = batch["index"].tolist() + + # Tokenize the batch of texts + inputs = tokenizer( + batch_texts, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_length, + return_attention_mask=True, + ) + inputs = {k: v.to(device) for k, v in inputs.items()} + + # Run the classifier + with torch.no_grad(): + outputs = classifier(**inputs) + # Convert logits to probabilities + scores = F.softmax(outputs.logits, dim=-1)[:, class_id].cpu().numpy().tolist() + + # Map predictions to labels + for idx, prob, text in zip(batch_indices, scores, batch_texts): + # Add the prediction to the original data + scores_by_index[idx] = {"score": prob, text_key: text} + class_scores = np.array( + [ + evals["score"] + for evals in scores_by_index.values() + if evals["score"] is not None + ] + ) + class_scores = aggregate_to_1D(class_scores) + return {"agg_value": np.mean(class_scores), "value_by_index": scores_by_index} diff --git a/src/train.py b/src/train.py index a9048e3..a2f81c8 100644 --- a/src/train.py +++ b/src/train.py @@ -3,7 +3,7 @@ from data import get_data, get_collators from model import get_model from trainer import load_trainer -from evals import get_evaluator +from evals import get_evaluators from trainer.utils import seed_everything @@ -34,17 +34,12 @@ def main(cfg: DictConfig): trainer_cfg = cfg.trainer assert trainer_cfg is not None, ValueError("Please set trainer") - # Get Evaluator - evaluator = None + # Get Evaluators + evaluators = None eval_cfgs = cfg.get("eval", None) if eval_cfgs: - assert len(eval_cfgs) <= 1, ValueError( - "Only one evaluation supported while training" - ) - eval_name, eval_cfg = next(iter(eval_cfgs.items())) - evaluator = get_evaluator( - eval_name, - eval_cfg, + evaluators = get_evaluators( + eval_cfgs=eval_cfgs, template_args=template_args, model=model, tokenizer=tokenizer, @@ -57,7 +52,7 @@ def main(cfg: DictConfig): eval_dataset=data.get("eval", None), tokenizer=tokenizer, data_collator=collator, - evaluator=evaluator, + evaluators=evaluators, template_args=template_args, ) diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py index 66774f2..27f44ee 100644 --- a/src/trainer/__init__.py +++ b/src/trainer/__init__.py @@ -46,7 +46,7 @@ def load_trainer( eval_dataset=None, tokenizer=None, data_collator=None, - evaluator=None, + evaluators=None, template_args=None, ): trainer_args = trainer_cfg.args @@ -67,7 +67,7 @@ def load_trainer( tokenizer=tokenizer, data_collator=data_collator, args=trainer_args, - evaluator=evaluator, + evaluators=evaluators, template_args=template_args, **method_args, ) diff --git a/src/trainer/base.py b/src/trainer/base.py index c9cfdce..05f36a2 100644 --- a/src/trainer/base.py +++ b/src/trainer/base.py @@ -13,8 +13,8 @@ class FinetuneTrainer(Trainer): - def __init__(self, evaluator=None, template_args=None, *args, **kwargs): - self.evaluator = evaluator + def __init__(self, 
evaluators=None, template_args=None, *args, **kwargs): + self.evaluators = evaluators self.template_args = template_args super().__init__(*args, **kwargs) @@ -26,7 +26,7 @@ def evaluate( trial: Dict[str, Any] = None, ) -> Dict[str, float]: # Run a custom evaluator and save results - if self.evaluator: + if self.evaluators: if self.accelerator.is_local_main_process: eval_metrics = {} if self.accelerator.num_processes == 1: @@ -36,14 +36,15 @@ def evaluate( ) output_dir = os.path.join(run_dir, checkpoint_folder, "evals") os.makedirs(output_dir, exist_ok=True) - eval_args = { - "output_dir": output_dir, - "template_args": self.template_args, - "model": self.model, - "tokenizer": self.tokenizer, - } - eval_metrics = self.evaluator.evaluate(**eval_args) - eval_metrics = self.evaluator.summarize(eval_metrics) + eval_metrics = {} + for _, evaluator in self.evaluators.items(): + eval_args = { + "output_dir": output_dir, + "template_args": self.template_args, + "model": self.model, + "tokenizer": self.tokenizer, + } + eval_metrics.update(evaluator.evaluate(**eval_args)) self.log(eval_metrics) else: logger.warning(