Weights and biases visualize #1217

Open · wants to merge 3 commits into base: main
131 changes: 131 additions & 0 deletions docs/source/wandb_manager.rst
@@ -0,0 +1,131 @@
WandB Manager
=============

Overview
--------
The WandBManager module provides integration with Weights & Biases (W&B) for experiment tracking, logging, and visualization.
This module enables efficient recording and analysis of experimental results and metrics within the garak framework.

Key Features
------------
- Experiment initialization and configuration
- Logging of attempt results
- Recording of evaluation metrics
- Generation of radar charts
- Efficient batch processing of logs

Configuration
-------------
The WandBManager is configured through YAML configuration files. The following example shows a basic setup:

Basic Configuration
^^^^^^^^^^^^^^^^^^^
Basic configuration for W&B integration::

system:
wandb_enabled: true
wandb_project: "my-garak-project"
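
The CLI wiring later in this PR gates on this flag and treats a missing key as "disabled". That check can be sketched against a stand-in config object (``SimpleNamespace`` here is only an illustration, not garak's actual config type):

```python
from types import SimpleNamespace

# Stand-in for garak's _config.system after the YAML above is loaded;
# SimpleNamespace is illustrative, not the real config object.
system = SimpleNamespace(wandb_enabled=True, wandb_project="my-garak-project")

# Mirrors the guard used in cli.py: a missing attribute means "disabled".
wandb_enabled = getattr(system, "wandb_enabled", False)
print(wandb_enabled)  # True

# A config without the key falls back to False rather than raising.
bare = SimpleNamespace()
print(getattr(bare, "wandb_enabled", False))  # False
```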

Classes
-------

WandBManager
^^^^^^^^^^^^

.. py:class:: WandBManager

   A class that manages integration with Weights & Biases (W&B).

.. py:method:: init_wandb(generator_name: str, probe_names: List[str], detector_names: List[str]) -> None

Initializes a W&B experiment.

:param generator_name: Name of the generator model
:param probe_names: List of probe names
:param detector_names: List of detector names
:return: None

.. py:method:: log_attempt(attempt: Attempt) -> None

Logs an attempt result.

:param attempt: The attempt object to log
:return: None

.. py:method:: flush_attempts() -> None

Sends pending attempt results to W&B in batch.

:return: None

.. py:method:: log_evaluation(evaluator: str, probe: str, detector: str, passed: int, total: int) -> None

Logs evaluation results.

:param evaluator: Name of the evaluator
:param probe: Name of the probe
:param detector: Name of the detector
:param passed: Number of passed attempts
:param total: Total number of attempts
:return: None

.. py:method:: flush_evaluations() -> None

Sends pending evaluation results to W&B in batch.

:return: None

.. py:method:: finish_wandb() -> None

Ends the W&B session.

:return: None
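
Taken together, these methods define a buffered lifecycle: initialize once, buffer results as they arrive, flush in batches, then finish. The stub below is a minimal sketch of that contract only — it is not garak's implementation, and ``FakeRun`` stands in for a real ``wandb`` run:

```python
from typing import List

class FakeRun:
    """Stands in for a wandb run; records what would have been sent."""
    def __init__(self):
        self.logged = []
    def log(self, payload):
        self.logged.append(payload)

class WandBManagerSketch:
    """Illustrative only: mirrors the documented init/log/flush/finish API."""
    def __init__(self):
        self.run = None
        self._attempts = []

    def init_wandb(self, generator_name: str, probe_names: List[str],
                   detector_names: List[str]) -> None:
        self.run = FakeRun()  # the real class would call wandb.init(...)

    def log_attempt(self, attempt) -> None:
        self._attempts.append(attempt)  # buffered in memory, not sent yet

    def flush_attempts(self) -> None:
        if self._attempts:
            self.run.log({"attempts": list(self._attempts)})  # one batched call
            self._attempts.clear()

    def finish_wandb(self) -> None:
        self.run = None  # the real class would call wandb.finish()

mgr = WandBManagerSketch()
mgr.init_wandb("gpt2", ["dan"], ["dan.DAN"])
mgr.log_attempt({"prompt": "hi", "passed": True})
mgr.log_attempt({"prompt": "yo", "passed": False})
run = mgr.run
mgr.flush_attempts()  # two buffered attempts produce a single log call
mgr.finish_wandb()
print(len(run.logged))  # 1
```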


Implementation Details
----------------------

The WandBManager class implements several key features:

1. **Batch Processing**
- Attempts and evaluations are stored in memory before being sent to W&B
- Reduces API calls and improves performance
- Controlled through flush methods

2. **Data Organization**
- Structures data in a format suitable for W&B visualization
- Creates tables for attempt results
- Generates summary metrics for evaluations

3. **Visualization Support**
- Supports radar chart generation for evaluation metrics
- Enables custom visualization through W&B's interface
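
The radar-chart support can be illustrated by the angle layout such a chart needs. This sketch only computes the polygon coordinates with numpy (a listed dependency) and does not touch W&B; the metric names and pass rates are made-up example values:

```python
import numpy as np

def radar_coordinates(scores: dict) -> tuple:
    """Compute (angles, values) for a closed radar polygon.

    scores maps metric name -> pass rate in [0, 1]; the first point is
    repeated at the end so the polygon closes when plotted.
    """
    labels = list(scores)
    values = list(scores.values())
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    return angles + angles[:1], values + values[:1]

angles, values = radar_coordinates(
    {"dan": 0.9, "encoding": 0.75, "promptinject": 0.6}
)
print(len(angles), len(values))  # 4 4  (3 metrics + closing point)
```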

Notes
-----
- W&B configuration (project name, API key) must be set up properly before use
- ``log_attempt`` and ``log_evaluation`` store data in memory
- Actual transmission to W&B occurs during ``flush_attempts`` and ``flush_evaluations``
- Regular flushing is recommended to manage memory usage
- The manager handles connection errors and retries automatically

Dependencies
------------
- wandb
- numpy
- pandas

See Also
--------
- `Weights & Biases Documentation <https://docs.wandb.ai/>`_
- :doc:`../attempt`
- :doc:`../evaluator`

Contributing
------------
When contributing to this module, please ensure:

1. All new features are properly documented
2. Tests are added for new functionality
3. Existing tests pass
4. Code follows the project's style guidelines
17 changes: 17 additions & 0 deletions garak/cli.py
@@ -411,6 +411,19 @@ def worker_count_validation(workers):

import garak.evaluators

# Initialize W&B if enabled
wandb_enabled = False
if hasattr(_config.system, "wandb_enabled"):
if _config.system.wandb_enabled:
from garak.resources.wandb_manager import wandb_manager

wandb_manager.init_wandb(
_config.plugins.model_name,
[p.strip() for p in _config.plugins.probe_spec.split(",")],
[d.strip() for d in _config.plugins.detector_spec.split(",")],
)
wandb_enabled = True

try:
has_config_file_or_json = False
# do a special thing for CLI probe options, generator options
@@ -636,5 +649,9 @@ def worker_count_validation(workers):
except (ValueError, GarakException) as e:
logging.exception(e)
print(e)
finally:
# Close W&B at the end
if wandb_enabled:
wandb_manager.finish_wandb()

_config.set_http_lib_agents(prior_user_agents)
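
The wiring above follows a lazy-enable / always-finish pattern: the manager is imported only when the flag is set, and ``finish_wandb`` runs in ``finally`` so the run is closed even when the scan raises. A minimal sketch of that shape, with a toy manager standing in for ``wandb_manager``:

```python
class _ToyManager:
    """Stands in for garak.resources.wandb_manager.wandb_manager."""
    def __init__(self):
        self.finished = False
    def init_wandb(self, generator, probes, detectors):
        self.active = True
    def finish_wandb(self):
        self.finished = True

wandb_manager = _ToyManager()

def run_scan(wandb_enabled: bool, fail: bool = False) -> bool:
    """Returns True iff finish_wandb ran, mirroring cli.py's finally block."""
    if wandb_enabled:
        wandb_manager.init_wandb("gpt2", ["probe"], ["detector"])
    try:
        if fail:
            raise ValueError("scan failed")
    except ValueError:
        pass  # cli.py logs and prints the exception here
    finally:
        if wandb_enabled:
            wandb_manager.finish_wandb()
    return wandb_manager.finished

print(run_scan(wandb_enabled=True, fail=True))  # True: closed despite the error
```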
25 changes: 25 additions & 0 deletions garak/evaluators/base.py
@@ -14,6 +14,7 @@
import garak.attempt
import garak.analyze.calibration
import garak.resources.theme
from garak.resources.wandb_manager import wandb_manager


class Evaluator:
@@ -32,6 +33,11 @@ def __init__(self):
self.probename = ""
if _config.system.show_z:
self.calibration = garak.analyze.calibration.Calibration()
        # treat a missing attribute as disabled, so later checks never raise
        self.wandb_enabled = getattr(_config.system, "wandb_enabled", False)

def test(self, test_value: float) -> bool:
"""Function for converting the detector result to a boolean, True=pass
@@ -115,13 +121,32 @@ def evaluate(self, attempts: Iterable[garak.attempt.Attempt]) -> None:
)
+ "\n" # generator,probe,prompt,trigger,result,detector,score,run id,attemptid,
)
if self.wandb_enabled:
wandb_manager.log_attempt(attempt)

if _config.system.narrow_output:
print_func = self.print_results_narrow
else:
print_func = self.print_results_wide
print_func(detector, all_passes, all_outputs)

try:
if self.wandb_enabled:
wandb_manager.flush_attempts()
generator_name = (
f"{_config.plugins.model_type} {_config.plugins.model_name}"
)
wandb_manager.generator_name = generator_name
wandb_manager.log_evaluation(
self.__class__.__name__,
self.probename,
detector,
sum(all_passes),
len(all_passes),
)
except Exception as e:
logging.debug(f"W&B evaluation logging failed: {e}")

_config.transient.reportfile.write(
json.dumps(
{