Add dataset-from-file command to extract datasets from benchmark reports #235

Open · wants to merge 2 commits into main
134 changes: 134 additions & 0 deletions docs/preprocess.md
@@ -0,0 +1,134 @@
# Preprocess Commands

GuideLLM provides preprocessing capabilities to transform and prepare data for benchmarking workflows. The preprocess module includes tools for creating datasets from existing benchmark results, enabling "apples-to-apples" comparisons and reusable benchmark datasets.

## Overview

The `guidellm preprocess` command provides utilities to:

- **Extract datasets from benchmark results**: Convert completed benchmark reports into reusable datasets with known prompt and output token counts for consistent comparisons


## Commands

### `dataset-from-file`

Extracts prompts and their corresponding output token counts from saved benchmark report files to create datasets for future benchmarking runs.

#### Purpose

When you run a benchmark with GuideLLM, you get detailed results about how a model performed with specific prompts. The `dataset-from-file` command allows you to extract those successful prompt-response pairs and convert them into a standardized dataset format. This enables:

1. **Consistent Comparisons**: Use the exact same prompts across different models or configurations
2. **Known Expectations**: Each prompt comes with its expected output token count
3. **Reproducible Benchmarks**: Eliminate variability from different prompts when comparing models

#### Syntax

```bash
guidellm preprocess dataset-from-file [OPTIONS] BENCHMARK_FILE
```

#### Arguments

- `BENCHMARK_FILE`: Path to the saved benchmark report file (JSON format)

#### Options

- `-o, --output-path PATH`: Output dataset file path (default: `dataset_from_benchmark.json`)
- `--show-stats`: Show dataset statistics after creation
- `--disable-console-outputs`: Disable console output for silent operation
- `--help`: Show help message and exit

#### Example Usage

##### Basic Usage

```bash
# Convert a benchmark report to a dataset
guidellm preprocess dataset-from-file benchmark-results.json

# Specify custom output path
guidellm preprocess dataset-from-file benchmark-results.json -o my_dataset.json

# Show statistics about the created dataset
guidellm preprocess dataset-from-file benchmark-results.json --show-stats
```

#### Input File Requirements

The input benchmark file must be a valid GuideLLM benchmark report containing:

- **Valid JSON format**: The file must be properly formatted
- **Benchmark report structure**: Must contain the expected benchmark report schema
- **Successful requests**: Must contain at least one successful request to extract data from
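
If any of these requirements is not met, the command exits with an error instead of producing a dataset. As a rough illustration (not code from this PR), a pre-flight check along these lines can confirm a report is usable, assuming the layout shown in the next section:

```python
import json
from pathlib import Path


def report_is_usable(report_path: Path) -> bool:
    """Check the three requirements above: parseable JSON, expected schema, >=1 successful request."""
    try:
        report = json.loads(report_path.read_text())
    except json.JSONDecodeError:
        return False  # not valid JSON
    benchmarks = report.get("benchmarks")
    if not isinstance(benchmarks, list):
        return False  # missing the expected benchmark report structure
    return any(
        benchmark.get("requests", {}).get("successful")
        for benchmark in benchmarks
    )
```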

##### Supported Input Formats

```json
{
  "benchmarks": [
    {
      "requests": {
        "successful": [
          {
            "prompt": "What is the capital of France?",
            "output_tokens": 5,
            "... other request fields ..."
          }
        ],
        "errored": [],
        "incomplete": []
      }
    }
  ]
}
```
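
Conceptually, the command walks each benchmark entry and keeps the prompt and output token count from every successful request, skipping the `errored` and `incomplete` buckets. The sketch below illustrates that traversal only; the actual implementation lives in `src/guidellm/preprocess/dataset_from_file.py` (not shown in this diff), and it also records a prompt token count taken from request fields elided above:

```python
import json
from pathlib import Path


def extract_pairs(report_path: Path) -> list[dict]:
    """Gather prompt/output-token pairs from every successful request in the report."""
    report = json.loads(Path(report_path).read_text())
    pairs = []
    for benchmark in report.get("benchmarks", []):
        for request in benchmark.get("requests", {}).get("successful", []):
            pairs.append(
                {
                    "prompt": request["prompt"],
                    "output_tokens_count": request["output_tokens"],
                    # prompt_tokens_count comes from request fields not shown
                    # in the abbreviated example above
                }
            )
    return pairs
```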

#### Output Format

The generated dataset follows this structure:

```json
{
  "version": "1.0",
  "description": "Dataset created from benchmark results for apples-to-apples comparisons",
  "data": [
    {
      "prompt": "What is the capital of France?",
      "output_tokens_count": 5,
      "prompt_tokens_count": 12
    },
    {
      "prompt": "Explain quantum computing in simple terms.",
      "output_tokens_count": 45,
      "prompt_tokens_count": 8
    }
  ]
}
```


Each data item contains:
- `prompt`: The original prompt text
- `output_tokens_count`: The number of tokens in the model's response
- `prompt_tokens_count`: The number of tokens in the original prompt
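
Because the output is plain JSON with the three fields listed above, it is straightforward to consume downstream. A minimal sketch of reading a generated dataset back (the default file name is assumed):

```python
import json
from pathlib import Path

# Load a dataset produced by `guidellm preprocess dataset-from-file`
# (default output name; pass -o to change it).
dataset = json.loads(Path("dataset_from_benchmark.json").read_text())

for item in dataset["data"]:
    print(
        f"{item['prompt_tokens_count']} prompt tokens -> "
        f"{item['output_tokens_count']} output tokens: {item['prompt'][:40]!r}"
    )
```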

#### Statistics Output

When using `--show-stats`, you'll see detailed information about the created dataset:

```
Dataset Statistics:
==================
Total items: 95
Prompt length statistics:
  Min: 8 characters
  Max: 245 characters
  Mean: 87.3 characters
Output tokens statistics:
  Min: 1 tokens
  Max: 512 tokens
  Mean: 124.8 tokens
```
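
The same figures can be recomputed later from the dataset file itself. A rough equivalent of the statistics pass, using only the fields documented in the output format above:

```python
import json
from pathlib import Path
from statistics import mean

dataset = json.loads(Path("dataset_from_benchmark.json").read_text())
prompt_lengths = [len(item["prompt"]) for item in dataset["data"]]
output_tokens = [item["output_tokens_count"] for item in dataset["data"]]

print(f"Total items: {len(dataset['data'])}")
print(f"Prompt length (chars): min={min(prompt_lengths)}, "
      f"max={max(prompt_lengths)}, mean={mean(prompt_lengths):.1f}")
print(f"Output tokens: min={min(output_tokens)}, "
      f"max={max(output_tokens)}, mean={mean(output_tokens):.1f}")
```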

58 changes: 58 additions & 0 deletions src/guidellm/__main__.py
@@ -15,6 +15,7 @@
from guidellm.benchmark.scenario import GenerativeTextScenario, get_builtin_scenarios
from guidellm.config import print_config
from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset
from guidellm.preprocess.dataset_from_file import create_dataset_from_file, DatasetCreationError
from guidellm.scheduler import StrategyType
from guidellm.utils import DefaultGroupHandler
from guidellm.utils import cli as cli_tools
@@ -491,6 +492,12 @@ def dataset(
    hub_dataset_id,
    random_seed,
):
    """
    Convert a dataset to have specific prompt and output token counts.

    This creates a filtered and processed dataset where prompts and outputs
    match specified token counts, useful for consistent benchmarking.
    """
    process_dataset(
        data=data,
        output_path=output_path,
@@ -508,5 +515,56 @@ def dataset(
    )


@preprocess.command("dataset-from-file", help="Create a dataset from a saved benchmark report file.")
@click.argument(
    "benchmark_file",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
)
@click.option(
    "-o",
    "--output-path",
    type=click.Path(file_okay=True, dir_okay=False, path_type=Path),
    default=Path("dataset_from_benchmark.json"),
    help="Output dataset file path.",
)
@click.option(
    "--show-stats",
    is_flag=True,
    help="Show dataset statistics after creation.",
)
@click.option(
    "--disable-console-outputs",
    is_flag=True,
    help="Set this flag to disable console output.",
)
def dataset_from_file(
    benchmark_file,
    output_path,
    show_stats,
    disable_console_outputs,
):
    """
    Create a dataset from a saved benchmark report file.

    This extracts prompts and their corresponding output token counts from
    benchmark results to create an 'apples-to-apples' comparison dataset.

    BENCHMARK_FILE: Path to the benchmark results JSON file.
    """
    try:
        create_dataset_from_file(
            benchmark_file=benchmark_file,
            output_path=Path(output_path),
            show_stats=show_stats,
            enable_console=not disable_console_outputs,
        )
    except DatasetCreationError as e:
        # Print a clean error message instead of a traceback
        if not disable_console_outputs:
            click.echo(f"Error: {e}", err=True)
        ctx = click.get_current_context()
        ctx.exit(1)


if __name__ == "__main__":
    cli()
3 changes: 2 additions & 1 deletion src/guidellm/preprocess/__init__.py
@@ -1,3 +1,4 @@
from .dataset import ShortPromptStrategy, process_dataset
from .dataset_from_file import create_dataset_from_file, DatasetCreationError

__all__ = ["ShortPromptStrategy", "process_dataset"]
__all__ = ["ShortPromptStrategy", "process_dataset", "create_dataset_from_file", "DatasetCreationError"]
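
With these exports in place, the conversion should also be reachable from Python without going through the CLI. The keyword arguments below mirror the call in `__main__.py`; the exact signature of `create_dataset_from_file` lives in `dataset_from_file.py`, which is not part of this diff, so treat this as a sketch:

```python
from pathlib import Path

from guidellm.preprocess import DatasetCreationError, create_dataset_from_file

try:
    create_dataset_from_file(
        benchmark_file=Path("benchmark-results.json"),
        output_path=Path("my_dataset.json"),
        show_stats=True,
        enable_console=True,
    )
except DatasetCreationError as err:
    print(f"Could not build dataset: {err}")
```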