locuslab · Dornavineeth · May 22, 2025 · Apr 26, 2025 · Apr 27, 2025 · Apr 29, 2025
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@
 
 ## 📖 Overview
 
-We provide efficient and streamlined implementations of the TOFU, MUSE unlearning benchmarks while supporting 6 unlearning methods, 3+ datasets, 9+ evaluation metrics, and 6+ LLM architectures. Each of these can be easily extended to incorporate more variants.
+We provide efficient and streamlined implementations of the TOFU, MUSE and WMDP unlearning benchmarks while supporting 6 unlearning methods, 5+ datasets, 10+ evaluation metrics, and 7+ LLM architectures. Each of these can be easily extended to incorporate more variants.
 
 We invite the LLM unlearning community to collaborate by adding new benchmarks, unlearning methods, datasets and evaluation metrics here to expand OpenUnlearning's features, gain feedback from wider usage and drive progress in the field.
 
@@ -64,7 +64,7 @@ We provide several variants for each of the components in the unlearning pipelin
 | **Benchmarks**        | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/), [WMDP](https://www.wmdp.ai/) |
 | **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU |
 | **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) |
-| **Datasets**          | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) |
+| **Datasets**          | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits), WMDP-Bio, WMDP-Cyber |
 | **Model Families**    | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma, Zephyr |
 
 ---
@@ -209,13 +209,14 @@ If you use OpenUnlearning in your research, please cite OpenUnlearning and the b
   booktitle={First Conference on Language Modeling},
   year={2024}
 }
-@inproceedings{
-  shi2025muse,
-  title={{MUSE}: Machine Unlearning Six-Way Evaluation for Language Models},
+@article{shi2024muse,
+  title={{MUSE}: Machine Unlearning Six-Way Evaluation for Language Models},
   author={Weijia Shi and Jaechan Lee and Yangsibo Huang and Sadhika Malladi and Jieyu Zhao and Ari Holtzman and Daogao Liu and Luke Zettlemoyer and Noah A. Smith and Chiyuan Zhang},
-  booktitle={The Thirteenth International Conference on Learning Representations},
-  year={2025},
-  url={https://openreview.net/forum?id=TArmA033BU}
+  year={2024},
+  eprint={2407.06460},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2407.06460},
 }
 ```
 </details>

diff --git a/configs/eval/tofu.yaml b/configs/eval/tofu.yaml
@@ -26,4 +26,6 @@ metrics: {} # lists a mapping from each evaluation metric to its config
 overwrite: false
 forget_split: forget10
 holdout_split: holdout10
-retain_logs_path: null
+retain_logs_path: null
+question_key: "question" # Specifies which key to use during forget and retain evaluations (e.g., "question" or "paraphrased_question")
+batch_size: 32
diff --git a/configs/eval/tofu_metrics/exact_memorization.yaml b/configs/eval/tofu_metrics/exact_memorization.yaml
@@ -5,10 +5,11 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: exact_memorization
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 
 datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
diff --git a/configs/eval/tofu_metrics/extraction_strength.yaml b/configs/eval/tofu_metrics/extraction_strength.yaml
@@ -5,10 +5,11 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: extraction_strength
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 
 datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
diff --git a/configs/eval/tofu_metrics/forget_Q_A_PARA_Prob.yaml b/configs/eval/tofu_metrics/forget_Q_A_PARA_Prob.yaml
@@ -6,10 +6,11 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 
 datasets:
   TOFU_QA_forget_para:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}_perturbed
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
diff --git a/configs/eval/tofu_metrics/forget_Q_A_PARA_ROUGE.yaml b/configs/eval/tofu_metrics/forget_Q_A_PARA_ROUGE.yaml
@@ -8,13 +8,14 @@ defaults:
 
 handler: rouge
 rouge_type: rougeL_recall
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 
 datasets: # override as needed
   TOFU_QA_forget_para:
     args:
       hf_args:
         name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
       predict_with_generate: True
 collators:
   DataCollatorForSupervisedDataset: 

diff --git a/configs/eval/tofu_metrics/forget_Q_A_PERT_Prob.yaml b/configs/eval/tofu_metrics/forget_Q_A_PERT_Prob.yaml
@@ -5,10 +5,11 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 
 datasets:
   TOFU_QA_forget_pert:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}_perturbed
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
diff --git a/configs/eval/tofu_metrics/forget_Q_A_PERT_ROUGE.yaml b/configs/eval/tofu_metrics/forget_Q_A_PERT_ROUGE.yaml
@@ -7,13 +7,14 @@ defaults:
 
 handler: rouge
 rouge_type: rougeL_recall
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 
 datasets: # override as needed
   TOFU_QA_forget_pert:
     args:
       hf_args:
         name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
       predict_with_generate: True
 collators:
   DataCollatorForSupervisedDataset: 

diff --git a/configs/eval/tofu_metrics/forget_Q_A_Prob.yaml b/configs/eval/tofu_metrics/forget_Q_A_Prob.yaml
@@ -5,10 +5,11 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 
 datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
diff --git a/configs/eval/tofu_metrics/forget_Q_A_ROUGE.yaml b/configs/eval/tofu_metrics/forget_Q_A_ROUGE.yaml
@@ -8,13 +8,14 @@ defaults:
 
 handler: rouge
 rouge_type: rougeL_recall
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 
 datasets: # override as needed
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
       predict_with_generate: True
 collators:
   DataCollatorForSupervisedDataset: 

diff --git a/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml b/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml
@@ -8,7 +8,7 @@ pre_compute:
 
 handler: classifier_prob
 batch_size: 32
-max_length: 512
+max_length: 32
 class_id: 0
 text_key: generation
 device: cuda

diff --git a/configs/eval/tofu_metrics/mia_gradnorm.yaml b/configs/eval/tofu_metrics/mia_gradnorm.yaml
@@ -11,7 +11,8 @@ datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
   TOFU_QA_holdout:
     args:
       hf_args:

diff --git a/configs/eval/tofu_metrics/mia_loss.yaml b/configs/eval/tofu_metrics/mia_loss.yaml
@@ -2,14 +2,15 @@
 defaults:
   - ../../data/datasets@datasets: TOFU_MIA
   - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 handler: mia_loss
 
 datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
   TOFU_QA_holdout:
     args:
       hf_args:

diff --git a/configs/eval/tofu_metrics/mia_min_k.yaml b/configs/eval/tofu_metrics/mia_min_k.yaml
@@ -2,15 +2,16 @@
 defaults:
   - ../../data/datasets@datasets: TOFU_MIA
   - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 handler: mia_min_k
 k: 0.4
 
 datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
   TOFU_QA_holdout:
     args:
       hf_args:

diff --git a/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml b/configs/eval/tofu_metrics/mia_min_k_plus_plus.yaml
@@ -2,15 +2,16 @@
 defaults:
   - ../../data/datasets@datasets: TOFU_MIA
   - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 k: 0.4
 handler: mia_min_k_plus_plus
 
 datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
   TOFU_QA_holdout:
     args:
       hf_args:

diff --git a/configs/eval/tofu_metrics/mia_reference.yaml b/configs/eval/tofu_metrics/mia_reference.yaml
@@ -2,15 +2,16 @@
 defaults:
   - ../../data/datasets@datasets: TOFU_MIA
   - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 handler: mia_reference
 reference_model_path: ??? # modify appropriately for example open-unlearning/tofu_Llama-3.2-1B-Instruct_retain90
 
 datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
   TOFU_QA_holdout:
     args:
       hf_args:

diff --git a/configs/eval/tofu_metrics/mia_zlib.yaml b/configs/eval/tofu_metrics/mia_zlib.yaml
@@ -2,14 +2,15 @@
 defaults:
   - ../../data/datasets@datasets: TOFU_MIA
   - ../../collator@collators: DataCollatorForSupervisedDatasetwithIndex
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 handler: mia_zlib
 
 datasets:
   TOFU_QA_forget:
     args:
       hf_args:
-        name: ${eval.tofu.forget_split}
+        name: ${eval.tofu.forget_split}_perturbed
+      question_key: ${eval.tofu.question_key}
   TOFU_QA_holdout:
     args:
       hf_args:

diff --git a/configs/eval/tofu_metrics/ra_Q_A_PERT_Prob.yaml b/configs/eval/tofu_metrics/ra_Q_A_PERT_Prob.yaml
@@ -5,4 +5,4 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
diff --git a/configs/eval/tofu_metrics/ra_Q_A_Prob.yaml b/configs/eval/tofu_metrics/ra_Q_A_Prob.yaml
@@ -5,4 +5,4 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
diff --git a/configs/eval/tofu_metrics/ra_Q_A_ROUGE.yaml b/configs/eval/tofu_metrics/ra_Q_A_ROUGE.yaml
@@ -8,7 +8,7 @@ defaults:
 
 handler: rouge
 rouge_type: rougeL_recall
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 datasets: # override as needed
   TOFU_QA_ra:
     args:

diff --git a/configs/eval/tofu_metrics/retain_Q_A_PARA_Prob.yaml b/configs/eval/tofu_metrics/retain_Q_A_PARA_Prob.yaml
@@ -5,4 +5,9 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
+
+datasets:
+  TOFU_QA_retain_para:
+    args:
+      question_key: ${eval.tofu.question_key}
diff --git a/configs/eval/tofu_metrics/retain_Q_A_PERT_Prob.yaml b/configs/eval/tofu_metrics/retain_Q_A_PERT_Prob.yaml
@@ -5,4 +5,9 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
+
+datasets:
+  TOFU_QA_retain_pert:
+    args:
+      question_key: ${eval.tofu.question_key}
diff --git a/configs/eval/tofu_metrics/retain_Q_A_Prob.yaml b/configs/eval/tofu_metrics/retain_Q_A_Prob.yaml
@@ -5,4 +5,9 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
+
+datasets:
+  TOFU_QA_retain_eval:
+    args:
+      question_key: ${eval.tofu.question_key}
diff --git a/configs/eval/tofu_metrics/retain_Q_A_ROUGE.yaml b/configs/eval/tofu_metrics/retain_Q_A_ROUGE.yaml
@@ -8,10 +8,11 @@ defaults:
 
 handler: rouge
 rouge_type: rougeL_recall
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 datasets: # override as needed
   TOFU_QA_retain_eval:
     args:
+      question_key: ${eval.tofu.question_key}
       predict_with_generate: True
 collators:
   DataCollatorForSupervisedDataset: 

diff --git a/configs/eval/tofu_metrics/wf_Q_A_PERT_Prob.yaml b/configs/eval/tofu_metrics/wf_Q_A_PERT_Prob.yaml
@@ -5,4 +5,4 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
diff --git a/configs/eval/tofu_metrics/wf_Q_A_Prob.yaml b/configs/eval/tofu_metrics/wf_Q_A_Prob.yaml
@@ -5,4 +5,4 @@ defaults:
   # ^ get default dataset and generation config information
 
 handler: probability
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
diff --git a/configs/eval/tofu_metrics/wf_Q_A_ROUGE.yaml b/configs/eval/tofu_metrics/wf_Q_A_ROUGE.yaml
@@ -8,7 +8,7 @@ defaults:
 
 handler: rouge
 rouge_type: rougeL_recall
-batch_size: 32
+batch_size: ${eval.tofu.batch_size}
 datasets: # override as needed
   TOFU_QA_wf:
     args: