diff --git a/README.md b/README.md index ab03fac..48a7168 100644 --- a/README.md +++ b/README.md @@ -26,15 +26,20 @@ We invite the LLM unlearning community to collaborate by adding new benchmarks, ### 📢 Updates +#### [May 12, 2025] + +- **Another benchmark!** We now support running the [`WMDP`](https://wmdp.ai/) benchmark with its `Zephyr` task model. +- **More evaluations!** The [`lm-evaluation-harness`](https://github.com/EleutherAI/lm-evaluation-harness) toolkit has been integrated into OpenUnlearning, enabling WMDP evaluations and support for popular general LLM benchmarks, including MMLU, GSM8K, and others. + +
+<details>
+<summary>Older Updates</summary>

 #### [Apr 6, 2025]
-🚨🚨 **IMPORTANT:** 🚨🚨 Be sure to run `python setup_data.py` immediately after merging the latest version. This is required to refresh the downloaded eval log files and ensure they're compatible with the latest evaluation metrics.
- **More Metrics!** Added 6 Membership Inference Attacks (MIA) (LOSS, ZLib, Reference, GradNorm, MinK, and MinK++), along with Extraction Strength (ES) and Exact Memorization (EM) as additional evaluation metrics.
- **More TOFU Evaluations!** Now includes a holdout set and supports MIA attack-based evaluation. You can now compute MUSE's privleak on TOFU.
- **More Documentation!** [`docs/links.md`](docs/links.md) contains resources for each of the implemented features and other useful LLM unlearning resources.
-
-<details>
-<summary>Older Updates</summary>
+Be sure to run `python setup_data.py --eval` immediately after merging the latest version. This is required to refresh the downloaded eval log files and ensure they're compatible with the latest evaluation metrics.

 #### [Mar 27, 2025]
- **More Documentation: easy contributions and the leaderboard functionality**: We've updated the documentation to make contributing new unlearning methods and benchmarks much easier. Users can document additions better and also update a leaderboard with their results. See [this section](#-how-to-contribute) for details.
@@ -56,11 +61,11 @@ We provide several variants for each of the components in the unlearning pipelin

 | **Component** | **Available Options** |
|------------------------|----------------------|
-| **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/) |
+| **Benchmarks** | [TOFU](https://arxiv.org/abs/2401.06121), [MUSE](https://muse-bench.github.io/), [WMDP](https://www.wmdp.ai/) |
| **Unlearning Methods** | GradAscent, GradDiff, NPO, SimNPO, DPO, RMU |
-| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks |
+| **Evaluation Metrics** | Verbatim Probability, Verbatim ROUGE, Knowledge QA-ROUGE, Model Utility, Forget Quality, TruthRatio, Extraction Strength, Exact Memorization, 6 MIA attacks, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) |
| **Datasets** | MUSE-News (BBC), MUSE-Books (Harry Potter), TOFU (different splits) |
-| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma |
+| **Model Families** | TOFU: LLaMA-3.2, LLaMA-3.1, LLaMA-2; MUSE: LLaMA-2; Additional: Phi-3.5, Phi-1.5, Gemma, Zephyr |

 ---

@@ -89,13 +94,15 @@ We provide several variants for each of the components in the unlearning pipelin
 # Environment setup
 conda create -n unlearning python=3.11
 conda activate unlearning
-pip install .
+pip install .[lm_eval]
 pip install --no-build-isolation flash-attn==2.6.3

 # Data setup
-python setup_data.py # saves/eval now contains evaluation results of the uploaded models
-# Downloads log files with metric eval results (incl retain model logs) from the models
-# used in the supported benchmarks.
+python setup_data.py --eval # saves/eval now contains evaluation results of the uploaded models
+# This downloads log files with evaluation results (including retain model logs)
+# into `saves/eval`, used for evaluating unlearning across supported benchmarks.
+# Additional datasets (e.g., WMDP) are supported — run below for options:
+# python setup_data.py --help
```

---

@@ -202,14 +209,13 @@ If you use OpenUnlearning in your research, please cite OpenUnlearning and the b
 booktitle={First Conference on Language Modeling},
 year={2024}
}
-@article{shi2024muse,
-  title={MUSE: Machine Unlearning Six-Way Evaluation for Language Models},
+@inproceedings{
+ shi2025muse,
+ title={{MUSE}: Machine Unlearning Six-Way Evaluation for Language Models},
 author={Weijia Shi and Jaechan Lee and Yangsibo Huang and Sadhika Malladi and Jieyu Zhao and Ari Holtzman and Daogao Liu and Luke Zettlemoyer and Noah A. 
Smith and Chiyuan Zhang}, - year={2024}, - eprint={2407.06460}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2407.06460}, + booktitle={The Thirteenth International Conference on Learning Representations}, + year={2025}, + url={https://openreview.net/forum?id=TArmA033BU} } ```
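For reviewers who want to exercise the new WMDP path end to end, a minimal sketch follows. Only `setup_data.py --wmdp`, the `unlearn/wmdp/default` and `eval/wmdp/default` experiment configs, and the `data_split` override come from this PR; the entry points (`src/train.py --config-name=unlearn.yaml`, `src/eval.py --config-name=eval.yaml`), the `model.model_args.pretrained_model_name_or_path` override, and the `task_name`/checkpoint values are assumed from the repo's existing conventions and are purely illustrative.

```bash
# Fetch the password-protected WMDP corpora into data/wmdp (flag added in this PR)
python setup_data.py --wmdp

# Unlearn the cyber split from Zephyr-7B-beta with RMU; the experiment config schedules
# lm-eval (wmdp_cyber + mmlu) at each eval step. task_name is a placeholder.
python src/train.py --config-name=unlearn.yaml \
  experiment=unlearn/wmdp/default \
  data_split=cyber \
  task_name=zephyr_rmu_wmdp_cyber

# Re-score a saved checkpoint with the lm-eval harness only (path is a placeholder)
python src/eval.py --config-name=eval.yaml \
  experiment=eval/wmdp/default \
  model.model_args.pretrained_model_name_or_path=saves/unlearn/zephyr_rmu_wmdp_cyber \
  task_name=zephyr_rmu_wmdp_cyber_eval
```

Because the unlearning experiment config routes `wmdp_${data_split}` and `mmlu` through the new `LMEvalEvaluator` during training, forgetting and general capability are tracked together in the summary logs.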
diff --git a/configs/data/datasets/WMDP_forget.yaml b/configs/data/datasets/WMDP_forget.yaml new file mode 100644 index 0000000..4bd260f --- /dev/null +++ b/configs/data/datasets/WMDP_forget.yaml @@ -0,0 +1,9 @@ +WMDP_forget: + handler: PretrainingDataset + args: + hf_args: + path: "text" + data_files: "data/wmdp/wmdp-corpora/cyber-forget-corpus.jsonl" + split: "train" + text_key: "text" + max_length: 512 \ No newline at end of file diff --git a/configs/data/datasets/WMDP_retain.yaml b/configs/data/datasets/WMDP_retain.yaml new file mode 100644 index 0000000..53d87a7 --- /dev/null +++ b/configs/data/datasets/WMDP_retain.yaml @@ -0,0 +1,9 @@ +WMDP_retain: + handler: PretrainingDataset + args: + hf_args: + path: "text" + data_files: "data/wmdp/wmdp-corpora/cyber-retain-corpus.jsonl" + split: "train" + text_key: "text" + max_length: 512 \ No newline at end of file diff --git a/configs/eval/lm_eval.yaml b/configs/eval/lm_eval.yaml new file mode 100644 index 0000000..f67487d --- /dev/null +++ b/configs/eval/lm_eval.yaml @@ -0,0 +1,20 @@ +# @package eval.lm_eval +# NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/ + +handler: LMEvalEvaluator +output_dir: ${paths.output_dir} # set to default eval directory +overwrite: false + +# Define evaluation tasks here +tasks: + - mmlu + # - task: gsm8k + # dataset_path: gsm8k + # # define the entire task config. + # # ^ Example: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml + + +simple_evaluate_args: + batch_size: 16 + system_instruction: null + apply_chat_template: false \ No newline at end of file diff --git a/configs/eval/muse.yaml b/configs/eval/muse.yaml index ecdd98f..0b30777 100644 --- a/configs/eval/muse.yaml +++ b/configs/eval/muse.yaml @@ -15,6 +15,7 @@ defaults: # - mia_reference # - mia_zlib # - mia_gradnorm + # - forget_gibberish handler: MUSEEvaluator output_dir: ${paths.output_dir} # set to default eval directory diff --git a/configs/eval/muse_metrics/forget_gibberish.yaml b/configs/eval/muse_metrics/forget_gibberish.yaml new file mode 100644 index 0000000..a4163d0 --- /dev/null +++ b/configs/eval/muse_metrics/forget_gibberish.yaml @@ -0,0 +1,20 @@ +# @package eval.muse.metrics.forget_gibberish +defaults: + - .@pre_compute.forget_verbmem_ROUGE: forget_verbmem_ROUGE + +pre_compute: + forget_verbmem_ROUGE: + access_key: text + +handler: classifier_prob +batch_size: 32 +max_length: 512 +class_id: 0 +text_key: generation +device: cuda + +classifier_model_args: + pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" + +classifier_tokenization_args: + pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" diff --git a/configs/eval/tofu.yaml b/configs/eval/tofu.yaml index bbfea26..984f2c9 100644 --- a/configs/eval/tofu.yaml +++ b/configs/eval/tofu.yaml @@ -17,6 +17,7 @@ defaults: # include all defined metrics files # - mia_zlib # - mia_gradnorm # - mia_reference # set reference model path appropriately + # - forget_Q_A_gibberish handler: TOFUEvaluator output_dir: ${paths.output_dir} # set to default eval directory diff --git a/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml b/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml new file mode 100644 index 0000000..86c661f --- /dev/null +++ b/configs/eval/tofu_metrics/forget_Q_A_gibberish.yaml @@ -0,0 +1,20 @@ +# @package eval.tofu.metrics.forget_Q_A_gibberish +defaults: + - 
.@pre_compute.forget_Q_A_ROUGE: forget_Q_A_ROUGE + +pre_compute: + forget_Q_A_ROUGE: + access_key: text + +handler: classifier_prob +batch_size: 32 +max_length: 512 +class_id: 0 +text_key: generation +device: cuda + +classifier_model_args: + pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" + +classifier_tokenization_args: + pretrained_model_name_or_path: "madhurjindal/autonlp-Gibberish-Detector-492513457" diff --git a/configs/experiment/eval/wmdp/default.yaml b/configs/experiment/eval/wmdp/default.yaml new file mode 100644 index 0000000..983ad28 --- /dev/null +++ b/configs/experiment/eval/wmdp/default.yaml @@ -0,0 +1,15 @@ +# @package _global_ + +defaults: + - override /model: zephyr-7b-beta + - override /eval: lm_eval + +data_split: cyber + +eval: + lm_eval: + tasks: + - wmdp_${data_split} + - mmlu + +task_name: ??? \ No newline at end of file diff --git a/configs/experiment/unlearn/wmdp/default.yaml b/configs/experiment/unlearn/wmdp/default.yaml new file mode 100644 index 0000000..8b126ac --- /dev/null +++ b/configs/experiment/unlearn/wmdp/default.yaml @@ -0,0 +1,58 @@ +# @package _global_ + +defaults: + - override /model: zephyr-7b-beta + - override /trainer: RMU + - override /data: unlearn + - override /data/datasets@data.forget: WMDP_forget + - override /data/datasets@data.retain: WMDP_retain + - override /eval: lm_eval + +data_split: cyber + +data: + anchor: forget + forget: + WMDP_forget: + args: + hf_args: + data_files: data/wmdp/wmdp-corpora/${data_split}-forget-corpus.jsonl + retain: + WMDP_retain: + args: + hf_args: + data_files: data/wmdp/wmdp-corpora/${data_split}-retain-corpus.jsonl + +eval: + lm_eval: + tasks: + - wmdp_${data_split} + - mmlu + + +collator: + DataCollatorForSupervisedDataset: + args: + padding_side: left # Usually left but for mistral and zephyr its right (https://github.com/hongshi97/CAD/issues/2) + +trainer: + args: + per_device_train_batch_size: 1 + gradient_accumulation_steps: 16 + learning_rate: 5e-5 + eval_strategy: steps + eval_steps: 0.5 + max_steps: 80 + lr_scheduler_type: constant + + method_args: + # The params here are more dependent on model and dataset. Tune them carefully to work + gamma: 1.0 + steering_coeff: 2 + retain_loss_type: EMBED_DIFF + alpha: 1 + module_regex: model\.layers\.7 + trainable_params_regex: + - model\.layers\.(5|6|7)\.mlp\.down_proj\.weight # If you want to update only these weights (as done in https://github.com/centerforaisafety/wmdp/blob/bc5e1ba0367ea826caeeeaa50656336a1e87acfb/rmu/unlearn.py#L26) + +task_name: ??? \ No newline at end of file diff --git a/configs/model/zephyr-7b-beta.yaml b/configs/model/zephyr-7b-beta.yaml new file mode 100644 index 0000000..24f6cef --- /dev/null +++ b/configs/model/zephyr-7b-beta.yaml @@ -0,0 +1,15 @@ +model_args: + pretrained_model_name_or_path: "HuggingFaceH4/zephyr-7b-beta" + attn_implementation: 'flash_attention_2' + torch_dtype: bfloat16 +tokenizer_args: + pretrained_model_name_or_path: "HuggingFaceH4/zephyr-7b-beta" +template_args: + apply_chat_template: True + system_prompt: You are a helpful assistant. 
+ system_prompt_with_special_tokens: "<|system|>\nYou are a helpful assistant.\n" + user_start_tag: "<|user|>\n" + user_end_tag: "" + asst_start_tag: "<|assistant|>\n" + asst_end_tag: "" + date_string: 10 Apr 2025 \ No newline at end of file diff --git a/docs/evaluation.md b/docs/evaluation.md index 4e61130..d460c6e 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -240,3 +240,33 @@ metrics: {} # lists a mapping from each evaluation metric listed above to its co output_dir: ${paths.output_dir} # set to default eval directory forget_split: forget10 ``` + +## lm-evaluation-harness + +To evaluate model capabilities after unlearning, we support running [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) using our custom evaluator: [LMEvalEvaluator](../src/evals/lm_eval.py). +All evaluation tasks should be defined under the `tasks` in [lm_eval.yaml](../configs/eval/lm_eval.yaml) + +```yaml +# @package eval.lm_eval +# NOTE: the above line is not a comment, but sets the package for config. See https://hydra.cc/docs/upgrades/0.11_to_1.0/adding_a_package_directive/ + +handler: LMEvalEvaluator +output_dir: ${paths.output_dir} # set to default eval directory +overwrite: false + +# Define evaluation tasks here +tasks: + - mmlu + - wmdp_cyber + - task: gsm8k + dataset_path: gsm8k + # define the entire task config. + # ^ Example: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml + + + +simple_evaluate_args: + batch_size: 16 + system_instruction: null + apply_chat_template: false +``` diff --git a/docs/links.md b/docs/links.md index 9a651a1..c41f325 100644 --- a/docs/links.md +++ b/docs/links.md @@ -5,12 +5,14 @@ Links to research papers and resources corresponding to implemented features in --- ## 📌 Table of Contents -- [Implemented Methods](#implemented-methods) -- [Benchmarks](#benchmarks) -- [Evaluation Metrics](#evaluation-metrics) -- [Useful Links](#useful-links) - - [Survey Papers](#survey-papers) - - [Other GitHub Repositories](#other-github-repositories) +- [🔗 Links and References](#-links-and-references) + - [📌 Table of Contents](#-table-of-contents) + - [📗 Implemented Methods](#-implemented-methods) + - [📘 Benchmarks](#-benchmarks) + - [📙 Evaluation Metrics](#-evaluation-metrics) + - [🌐 Useful Links](#-useful-links) + - [📚 Surveys](#-surveys) + - [🐙 Other GitHub Repositories](#-other-github-repositories) --- @@ -32,6 +34,7 @@ Links to research papers and resources corresponding to implemented features in |-----------|----------| | TOFU | Paper [📄](https://arxiv.org/abs/2401.06121) | | MUSE | Paper [📄](https://arxiv.org/abs/2407.06460) | +| WMDP | Paper [📄](https://arxiv.org/abs/2403.03218) | --- @@ -45,6 +48,7 @@ Links to research papers and resources corresponding to implemented features in | Forget Quality, Truth Ratio, Model Utility | TOFU ([📄](https://arxiv.org/abs/2401.06121)) | | Extraction Strength (ES) | Carlini et al., 2021 ([📄](https://www.usenix.org/conference/usenixsecurity21/presentation/carlini-extracting)), used for unlearning in Wang et al., 2025 ([📄](https://openreview.net/pdf?id=wUtCieKuQU)) | | Exact Memorization (EM) | Tirumala et al., 2022 ([📄](https://proceedings.neurips.cc/paper_files/paper/2022/hash/fa0509f4dab6807e2cb465715bf2d249-Abstract-Conference.html)), used for unlearning in Wang et al., 2025 ([📄](https://openreview.net/pdf?id=wUtCieKuQU)) | +| lm-evaluation-harness | [💻](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) | --- diff --git a/setup.py b/setup.py index 
b02a348..6a5c99c 100644
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,9 @@
     packages=find_packages(),
     install_requires=requirements,  # Uses requirements.txt
     extras_require={
+        "lm-eval": [
+            "lm-eval==0.4.8",
+        ],  # Install using `pip install .[lm-eval]`
         "dev": [
             "pre-commit==4.0.1",
             "ruff==0.6.9",
diff --git a/setup_data.py b/setup_data.py
index 760679b..a137394 100644
--- a/setup_data.py
+++ b/setup_data.py
@@ -1,17 +1,64 @@
+import argparse
+import os
+import subprocess
 from huggingface_hub import snapshot_download
 
-# Setup retain model metrics
-snapshot_download(
-    repo_id="open-unlearning/eval",
-    allow_patterns="*.json",
-    repo_type="dataset",
-    local_dir="saves/eval",
-)
-
-# Setup data
-snapshot_download(
-    repo_id="open-unlearning/idk",
-    allow_patterns="*.jsonl",
-    repo_type="dataset",
-    local_dir="data",
-)
+
+def download_eval_data():
+    snapshot_download(
+        repo_id="open-unlearning/eval",
+        allow_patterns="*.json",
+        repo_type="dataset",
+        local_dir="saves/eval",
+    )
+
+
+def download_idk_data():
+    snapshot_download(
+        repo_id="open-unlearning/idk",
+        allow_patterns="*.jsonl",
+        repo_type="dataset",
+        local_dir="data",
+    )
+
+
+def download_wmdp():
+    url = "https://cais-wmdp.s3.us-west-1.amazonaws.com/wmdp-corpora.zip"
+    dest_dir = "data/wmdp"
+    zip_path = os.path.join(dest_dir, "wmdp-corpora.zip")
+
+    os.makedirs(dest_dir, exist_ok=True)
+    subprocess.run(["wget", url, "-O", zip_path], check=True)
+    subprocess.run(["unzip", "-P", "wmdpcorpora", zip_path, "-d", dest_dir], check=True)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Download and set up evaluation data.")
+    parser.add_argument(
+        "--eval_logs",
+        action="store_true",
+        help="Download the eval logs of the TOFU and MUSE retain and finetuned models and save them in saves/eval",
+    )
+    parser.add_argument(
+        "--idk",
+        action="store_true",
+        help="Download the idk dataset from the HF hub and store it at data/idk.jsonl",
+    )
+    parser.add_argument(
+        "--wmdp",
+        action="store_true",
+        help="Download and unzip the WMDP corpora into data/wmdp",
+    )
+
+    args = parser.parse_args()
+
+    if args.eval_logs:
+        download_eval_data()
+    if args.idk:
+        download_idk_data()
+    if args.wmdp:
+        download_wmdp()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/evals/__init__.py b/src/evals/__init__.py
index f5b323c..5ab4f60 100644
--- a/src/evals/__init__.py
+++ b/src/evals/__init__.py
@@ -2,6 +2,7 @@
 from omegaconf import DictConfig
 from evals.tofu import TOFUEvaluator
 from evals.muse import MUSEEvaluator
+from evals.lm_eval import LMEvalEvaluator
 
 EVALUATOR_REGISTRY: Dict[str, Any] = {}
 
@@ -31,3 +32,4 @@ def get_evaluators(eval_cfgs: DictConfig, **kwargs):
 # Register Your benchmark evaluators
 _register_evaluator(TOFUEvaluator)
 _register_evaluator(MUSEEvaluator)
+_register_evaluator(LMEvalEvaluator)
diff --git a/src/evals/base.py b/src/evals/base.py
index 3beb68a..8811467 100644
--- a/src/evals/base.py
+++ b/src/evals/base.py
@@ -109,4 +109,4 @@ def evaluate(self, model, output_dir=None, overwrite=None, **kwargs):
         self.save_logs(logs, logs_file_path)
         self.save_logs(self.summarize(logs), summary_file_path)
 
-        return logs
+        return self.summarize(logs)
diff --git a/src/evals/lm_eval.py b/src/evals/lm_eval.py
new file mode 100644
index 0000000..43080e4
--- /dev/null
+++ b/src/evals/lm_eval.py
@@ -0,0 +1,115 @@
+import logging
+from omegaconf import OmegaConf
+
+from lm_eval.models.huggingface import HFLM
+from lm_eval.tasks import TaskManager
+from lm_eval import simple_evaluate
+
+from evals.base import Evaluator
+
+
+logger = 
logging.getLogger("evaluator") + + +class LMEvalEvaluator(Evaluator): + def __init__(self, eval_cfg, **kwargs): + self.name = "LMEval" + self.eval_cfg = eval_cfg + self.tasks = OmegaConf.to_container( + self.eval_cfg.tasks, resolve=True, throw_on_missing=True + ) + self.task_manager = TaskManager() + self.simple_evaluate_args = dict(kwargs.get("simple_evaluate_args", {})) + + def prepare_model(self, model, **kwargs): + """Prepare model for evaluation""" + model.eval() + return HFLM(model) + + def summarize(self, eval_results: dict, task_name: str) -> dict: + """ + Summarize evaluation metrics from lm_eval.simple_evaluate. + - If task_name is a group, return only aggregated group-level metrics. + - If it's a single task, return per-task metrics from 'results'. + - Always exclude 'alias' entries and strip ',none' suffixes. + """ + summary = {} + + def clean_metric_key(prefix: str, metric_name: str) -> str | None: + if metric_name == "alias": + return None + base = metric_name.split(",", 1)[0].strip() + return f"{prefix}/{base}" + + # Check if task is a group (e.g., 'mmlu') + if task_name in self.task_manager.all_groups: + group_metrics = eval_results.get("groups", {}).get(task_name, {}) + for metric_name, value in group_metrics.items(): + key = clean_metric_key(task_name, metric_name) + if key is None: + continue + try: + summary[key] = float(value) + except (TypeError, ValueError): + summary[key] = value + else: + task_metrics = eval_results.get("results", {}).get(task_name, {}) + for metric_name, value in task_metrics.items(): + key = clean_metric_key(task_name, metric_name) + if key is None: + continue + try: + summary[key] = float(value) + except (TypeError, ValueError): + summary[key] = value + + return summary + + def get_task_name(self, task): + if isinstance(task, str): + return task + elif isinstance(task, dict): + if "task" in task: + return task.get("task") + raise ValueError(f"Invalid task format: {task}") + + def evaluate(self, model, output_dir=None, overwrite=None, **kwargs): + # set flag to overwrite metrics + overwrite = self.eval_cfg.overwrite if overwrite is None else overwrite + + # Prepare model for evaluation + kwargs = {"tokenizer": kwargs.get("tokenizer", None)} + model = self.prepare_model(model, **kwargs) + + # Set output_dir and file to store results + output_dir = output_dir if output_dir else self.eval_cfg.output_dir + logs_file_path = self.get_logs_file_path(output_dir) + summary_file_path = self.get_logs_file_path(output_dir, suffix="SUMMARY") + + # Load existing results from file if any. 
+ logs = self.load_logs_from_file(logs_file_path) if not overwrite else {} + summary = self.load_logs_from_file(summary_file_path) if not overwrite else {} + + logger.info(f"***** Running {self.name} evaluation suite *****") + logger.info(f"Fine-grained evaluations will be saved to: {logs_file_path}") + logger.info( + f"Aggregated evaluations will be summarised in: {summary_file_path}" + ) + + for task in self.tasks: + task_name = self.get_task_name(task) + if not overwrite and task_name in logs and logs[task_name]: + logger.info(f"Skipping {task_name}, already evaluated.") + continue + _ = logs.pop(task_name, None) # overwriting existing evals if present + results = simple_evaluate( + model=model, + tasks=[task], + task_manager=self.task_manager, + **self.simple_evaluate_args, + ) + logs.update({task_name: results["samples"]}) + summary.update(self.summarize(results, task_name)) + self.save_logs(logs, logs_file_path) + self.save_logs(summary, summary_file_path) + return summary diff --git a/src/evals/metrics/__init__.py b/src/evals/metrics/__init__.py index 9441c8d..5afb042 100644 --- a/src/evals/metrics/__init__.py +++ b/src/evals/metrics/__init__.py @@ -6,7 +6,6 @@ probability_w_options, rouge, truth_ratio, - hm_aggregate, extraction_strength, exact_memorization, ) @@ -19,6 +18,10 @@ mia_zlib, mia_reference, ) +from evals.metrics.utility import ( + hm_aggregate, + classifier_prob, +) METRICS_REGISTRY: Dict[str, UnlearningMetric] = {} @@ -67,3 +70,6 @@ def get_metrics(metric_cfgs: DictConfig, **kwargs): _register_metric(mia_gradnorm) _register_metric(mia_zlib) _register_metric(mia_reference) + +# Register Utility metrics +_register_metric(classifier_prob) diff --git a/src/evals/metrics/memorization.py b/src/evals/metrics/memorization.py index 646eab7..d033c1e 100644 --- a/src/evals/metrics/memorization.py +++ b/src/evals/metrics/memorization.py @@ -1,10 +1,8 @@ import logging import torch import numpy as np -import scipy as sc from torch.utils.data import DataLoader - from evals.metrics.utils import ( aggregate_to_1D, evaluate_probability, @@ -164,12 +162,6 @@ def true_better(arr): return {"agg_value": forget_tr_avg, "value_by_index": value_by_index} -@unlearning_metric(name="hm_aggregate") -def hm_aggregate(model, **kwargs): - values = [result["agg_value"] for _, result in kwargs["pre_compute"].items()] - return {"agg_value": sc.stats.hmean(values)} - - @unlearning_metric(name="exact_memorization") def exact_memorization(model, **kwargs): data = kwargs["data"] diff --git a/src/evals/metrics/utility.py b/src/evals/metrics/utility.py new file mode 100644 index 0000000..7f47e9c --- /dev/null +++ b/src/evals/metrics/utility.py @@ -0,0 +1,76 @@ +import torch +import numpy as np +import scipy as sc +from tqdm import tqdm +import torch.nn.functional as F +from torch.utils.data import DataLoader +from transformers import AutoTokenizer, AutoModelForSequenceClassification + +from evals.metrics.utils import aggregate_to_1D +from evals.metrics.base import unlearning_metric + + +@unlearning_metric(name="hm_aggregate") +def hm_aggregate(model, **kwargs): + values = [result["agg_value"] for _, result in kwargs["pre_compute"].items()] + return {"agg_value": sc.stats.hmean(values)} + + +@unlearning_metric(name="classifier_prob") +def classifier_prob(model, **kwargs): + batch_size = kwargs.get("batch_size", 32) + max_length = kwargs.get("max_length", 512) + class_id = kwargs.get("class_id", 0) + text_key = kwargs.get("text_key", "generation") + classifier_model_args = kwargs["classifier_model_args"] + 
classifier_tokenization_args = kwargs["classifier_tokenization_args"] + device = kwargs.get("device", "cuda") + + tokenizer = AutoTokenizer.from_pretrained(**classifier_tokenization_args) + classifier = AutoModelForSequenceClassification.from_pretrained( + **classifier_model_args + ).to(device) + + data = kwargs["pre_compute"]["text"]["value_by_index"] + data_list = [ + {"text": entry[text_key], "index": int(key)} for key, entry in data.items() + ] + + # Create DataLoader + dataloader = DataLoader(data_list, batch_size=batch_size, shuffle=False) + + scores_by_index = {} + for batch in tqdm(dataloader): + batch_texts = batch["text"] + batch_indices = batch["index"].tolist() + + # Tokenize the batch of texts + inputs = tokenizer( + batch_texts, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_length, + return_attention_mask=True, + ) + inputs = {k: v.to(device) for k, v in inputs.items()} + + # Run the classifier + with torch.no_grad(): + outputs = classifier(**inputs) + # Convert logits to probabilities + scores = F.softmax(outputs.logits, dim=-1)[:, class_id].cpu().numpy().tolist() + + # Map predictions to labels + for idx, prob, text in zip(batch_indices, scores, batch_texts): + # Add the prediction to the original data + scores_by_index[idx] = {"score": prob, text_key: text} + class_scores = np.array( + [ + evals["score"] + for evals in scores_by_index.values() + if evals["score"] is not None + ] + ) + class_scores = aggregate_to_1D(class_scores) + return {"agg_value": np.mean(class_scores), "value_by_index": scores_by_index} diff --git a/src/train.py b/src/train.py index a9048e3..a2f81c8 100644 --- a/src/train.py +++ b/src/train.py @@ -3,7 +3,7 @@ from data import get_data, get_collators from model import get_model from trainer import load_trainer -from evals import get_evaluator +from evals import get_evaluators from trainer.utils import seed_everything @@ -34,17 +34,12 @@ def main(cfg: DictConfig): trainer_cfg = cfg.trainer assert trainer_cfg is not None, ValueError("Please set trainer") - # Get Evaluator - evaluator = None + # Get Evaluators + evaluators = None eval_cfgs = cfg.get("eval", None) if eval_cfgs: - assert len(eval_cfgs) <= 1, ValueError( - "Only one evaluation supported while training" - ) - eval_name, eval_cfg = next(iter(eval_cfgs.items())) - evaluator = get_evaluator( - eval_name, - eval_cfg, + evaluators = get_evaluators( + eval_cfgs=eval_cfgs, template_args=template_args, model=model, tokenizer=tokenizer, @@ -57,7 +52,7 @@ def main(cfg: DictConfig): eval_dataset=data.get("eval", None), tokenizer=tokenizer, data_collator=collator, - evaluator=evaluator, + evaluators=evaluators, template_args=template_args, ) diff --git a/src/trainer/__init__.py b/src/trainer/__init__.py index 66774f2..27f44ee 100644 --- a/src/trainer/__init__.py +++ b/src/trainer/__init__.py @@ -46,7 +46,7 @@ def load_trainer( eval_dataset=None, tokenizer=None, data_collator=None, - evaluator=None, + evaluators=None, template_args=None, ): trainer_args = trainer_cfg.args @@ -67,7 +67,7 @@ def load_trainer( tokenizer=tokenizer, data_collator=data_collator, args=trainer_args, - evaluator=evaluator, + evaluators=evaluators, template_args=template_args, **method_args, ) diff --git a/src/trainer/base.py b/src/trainer/base.py index c9cfdce..05f36a2 100644 --- a/src/trainer/base.py +++ b/src/trainer/base.py @@ -13,8 +13,8 @@ class FinetuneTrainer(Trainer): - def __init__(self, evaluator=None, template_args=None, *args, **kwargs): - self.evaluator = evaluator + def __init__(self, 
evaluators=None, template_args=None, *args, **kwargs): + self.evaluators = evaluators self.template_args = template_args super().__init__(*args, **kwargs) @@ -26,7 +26,7 @@ def evaluate( trial: Dict[str, Any] = None, ) -> Dict[str, float]: # Run a custom evaluator and save results - if self.evaluator: + if self.evaluators: if self.accelerator.is_local_main_process: eval_metrics = {} if self.accelerator.num_processes == 1: @@ -36,14 +36,15 @@ def evaluate( ) output_dir = os.path.join(run_dir, checkpoint_folder, "evals") os.makedirs(output_dir, exist_ok=True) - eval_args = { - "output_dir": output_dir, - "template_args": self.template_args, - "model": self.model, - "tokenizer": self.tokenizer, - } - eval_metrics = self.evaluator.evaluate(**eval_args) - eval_metrics = self.evaluator.summarize(eval_metrics) + eval_metrics = {} + for _, evaluator in self.evaluators.items(): + eval_args = { + "output_dir": output_dir, + "template_args": self.template_args, + "model": self.model, + "tokenizer": self.tokenizer, + } + eval_metrics.update(evaluator.evaluate(**eval_args)) self.log(eval_metrics) else: logger.warning(