
Commit eb930bf

Refactor and Optimize Backend, Scheduler, and Test Suites for Improved Performance and Maintainability (#27)
## Summary

Introduces significant refactors and optimizations across the backend, scheduler, and test suites to enhance performance, maintainability, and code clarity. Key changes include modularizing code, simplifying complex components, and expanding unit tests to ensure robust functionality.

## Details

- **Backend Enhancements:**
  - Refactored `base.py` and `openai.py` for better separation of concerns and modularity.
  - Introduced comprehensive logging and error-handling improvements.
  - Simplified configuration handling by merging `config/base.py` into `config.py`.
- **Scheduler Improvements:**
  - Added `base.py` and optimized `load_generator.py` to handle complex scheduling logic more efficiently.
  - Removed deprecated methods in `scheduler.py` and moved the necessary logic into new base classes.
- **Test Suite Overhaul:**
  - Expanded and reorganized unit tests, covering new and existing functionality.
  - Migrated and refactored integration tests to align with the updated codebase.
  - Introduced more thorough test coverage for backend and scheduler components, ensuring reliability.

## Test Plan

- **Automated Testing:**
  - All existing unit tests have been updated to reflect the changes.
  - New tests have been added to cover additional edge cases and new functionality.
- **Manual Testing:**
  - Verified that core functionalities of the backend and scheduler work as expected.
  - Ensured no regressions were introduced by the refactor.

## Unittest Coverage Report

```
Name                                         Stmts   Miss  Cover   Missing
---------------------------------------------------------------------------
src/guidellm/__init__.py                         3      0   100%
src/guidellm/backend/__init__.py                 3      0   100%
src/guidellm/backend/base.py                    76      8    89%   150-155, 183, 197, 219-221
src/guidellm/backend/openai.py                  59      7    88%   103, 162-164, 178-180
src/guidellm/config.py                          55      0   100%
src/guidellm/core/__init__.py                    6      0   100%
src/guidellm/core/distribution.py               92      0   100%
src/guidellm/core/report.py                      7      0   100%
src/guidellm/core/request.py                    10      0   100%
src/guidellm/core/result.py                    125      2    98%   104, 259
src/guidellm/core/serializable.py               64      1    98%   105
src/guidellm/executor/__init__.py                3      0   100%
src/guidellm/executor/base.py                   55      0   100%
src/guidellm/executor/profile_generator.py     113     16    86%   98, 149-151, 173-175, 207-209, 287-289, 295-297
src/guidellm/logger.py                          17      0   100%
src/guidellm/main.py                            55     55     0%   1-172
src/guidellm/request/__init__.py                 5      0   100%
src/guidellm/request/base.py                    67      0   100%
src/guidellm/request/emulated.py               130      3    98%   64, 79, 90
src/guidellm/request/file.py                    26      0   100%
src/guidellm/request/transformers.py            31      0   100%
src/guidellm/scheduler/__init__.py               3      0   100%
src/guidellm/scheduler/base.py                 138     24    83%   294-328, 334, 352-355
src/guidellm/scheduler/load_generator.py        74      5    93%   104-105, 139, 165, 175
src/guidellm/utils/__init__.py                   4      0   100%
src/guidellm/utils/injector.py                  20      0   100%
src/guidellm/utils/text.py                     201     22    89%   79-81, 86-88, 155-156, 166, 186, 190-191, 212-213, 245, 249, 304-305, 324, 344, 385, 443
src/guidellm/utils/transformers.py              53      0   100%
---------------------------------------------------------------------------
TOTAL                                         1495    143    90%
```
1 parent aee89e8 · commit eb930bf


79 files changed: +7429 additions, -2400 deletions

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions

@@ -18,6 +18,7 @@ repos:
         # main dependencies
         click,
         datasets,
+        ftfy,
         loguru,
         numpy,
         openai,
@@ -30,6 +31,7 @@ repos:
         # dev dependencies
         pytest,
         pydantic_settings,
+        requests-mock,

         # types
         types-click,

pyproject.toml

Lines changed: 21 additions & 9 deletions

@@ -27,6 +27,7 @@ urls = { homepage = "https://github.com/neuralmagic/guidellm" }
 dependencies = [
     "click",
     "datasets",
+    "ftfy>=6.0.0",
     "loguru",
     "numpy",
     "openai",
@@ -41,13 +42,16 @@ dependencies = [
 dev = [
     # general and configurations
     "pre-commit~=3.5.0",
+    "scipy~=1.10",
     "sphinx~=7.1.2",
     "tox~=4.16.0",

     # testing
     "pytest~=8.2.2",
+    "pytest-asyncio~=0.23.8",
     "pytest-cov~=5.0.0",
     "pytest-mock~=3.14.0",
+    "pytest-rerunfailures~=14.0",
     "requests-mock~=1.12.1",

     # code quality
@@ -83,7 +87,7 @@ profile = "black"
 files = ["src/guidellm", "tests"]
 python_version = '3.8'
 warn_redundant_casts = true
-warn_unused_ignores = true
+warn_unused_ignores = false
 show_error_codes = true
 namespace_packages = true
 exclude = ["venv", ".tox"]
@@ -92,22 +96,27 @@ exclude = ["venv", ".tox"]
 # Check: https://mypy.readthedocs.io/en/latest/config_file.html#import-discovery
 follow_imports = 'silent'

-[[tool.mypy.overrides]]
-module = ["transformers.*", "datasets.*"]
-ignore_missing_imports=true
-

 [tool.ruff]
 line-length = 88
+indent-width = 4
 exclude = ["build", "dist", "env", ".venv"]
-lint.ignore = [
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+
+[tool.ruff.lint]
+ignore = [
     "PLR0913",
     "TCH001",
     "COM812",
     "ISC001",
     "TCH002",
+    "PLW1514",  # allow Path.open without encoding
+
 ]
-lint.select = [
+select = [
     # Rules reference: https://docs.astral.sh/ruff/rules/

     # Code Style / Formatting
@@ -127,11 +136,11 @@ lint.select = [
     "Q",  # flake8-quotes: enforces consistent use of single or double quotes
     "TCH",  # flake8-type-checking: enforces type checking practices and standards
     "TID",  # flake8-tidy-imports: enforces tidy and well-organized imports
+    "RUF022",  # flake8-ruff: enforce sorting of __all__ in modules

     # Code Structure / Complexity
     "C4",  # flake8-comprehensions: improves readability and performance of list, set, and dict comprehensions
     "C90",  # mccabe: checks for overly complex code using cyclomatic complexity
-    "FBT",  # flake8-boolean-trap: prevents the use of boolean traps in function arguments and calls
     "ISC",  # flake8-implicit-str-concat: prevents implicit string concatenation
     "PIE",  # flake8-pie: identifies and corrects common code inefficiencies and mistakes
     "R",  # Refactor: suggests improvements to code structure and readability
@@ -164,7 +173,6 @@ lint.select = [
 "tests/**/*.py" = [
     "S101",  # asserts allowed in tests
     "ARG",  # Unused function args allowed in tests
-    "FBT",  # Booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize()
     "PLR2004",  # Magic value used in comparison
     "TCH002",  # No import only type checking in tests
     "SLF001",  # enable private member access in tests
@@ -173,8 +181,12 @@ lint.select = [
     "PT011",  # allow generic exceptions in tests
     "N806",  # allow uppercase variable names in tests
     "PGH003",  # allow general ignores in tests
+    "S106",  # allow hardcoded passwords in tests
 ]

+[tool.ruff.lint.isort]
+known-first-party = ["guidellm", "tests"]
+

 [tool.pytest.ini_options]
 addopts = '-s -vvv --cache-clear'
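
The new `pytest-asyncio` dev dependency lines up with the backend's move to an async `submit`/`make_request` API (see the `base.py` diff below). A minimal sketch of what an async test looks like under this setup; the test body and names here are hypothetical illustrations, not tests from this commit:

```python
import asyncio

import pytest


@pytest.mark.asyncio  # pytest-asyncio marker: run this coroutine as a test
async def test_async_roundtrip():
    # Stand-in coroutine for an async backend call such as Backend.submit().
    async def fake_submit() -> str:
        await asyncio.sleep(0)  # yield control, as a streaming backend would
        return "done"

    assert await fake_submit() == "done"
```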

src/guidellm/__init__.py

Lines changed: 4 additions & 1 deletion

@@ -3,4 +3,7 @@
 evaluating and benchmarking large language models (LLMs).
 """

-from .logger import configure_logger, logger  # noqa: F401
+from .config import settings
+from .logger import configure_logger, logger
+
+__all__ = ["configure_logger", "logger", "settings"]
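
The new `from .config import settings` import implies a single settings object exposed at the package root. A minimal sketch of the kind of pydantic-settings module this pattern suggests; the field name and env prefix below are hypothetical assumptions, not read from guidellm's `config.py`:

```python
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """App-wide settings loaded from environment variables (prefix assumed)."""

    model_config = SettingsConfigDict(env_prefix="GUIDELLM_")

    request_timeout: float = 30.0  # hypothetical field for illustration


# Single shared instance, importable as `from guidellm import settings`.
settings = Settings()
```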

src/guidellm/backend/base.py

Lines changed: 111 additions & 58 deletions

@@ -1,35 +1,39 @@
 import functools
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from enum import Enum
-from typing import Dict, Iterator, List, Optional, Type
+from typing import AsyncGenerator, Dict, List, Literal, Optional, Type

 from loguru import logger
+from pydantic import BaseModel

 from guidellm.core import TextGenerationRequest, TextGenerationResult

 __all__ = ["Backend", "BackendEngine", "GenerativeResponse"]


-class BackendEngine(str, Enum):
-    """
-    Determines the Engine of the LLM Backend.
-    All the implemented backends in the project have the engine.
-
-    NOTE: the `TEST` engine has to be used only for testing purposes.
-    """
+BackendEngine = Literal["test", "openai_server"]

-    TEST = "test"
-    OPENAI_SERVER = "openai_server"

-
-@dataclass
-class GenerativeResponse:
+class GenerativeResponse(BaseModel):
     """
-    A dataclass to represent a response from a generative AI backend.
+    A model representing a response from a generative AI backend.
+
+    :param type_: The type of response, either 'token_iter' for intermediate
+        token output or 'final' for the final result.
+    :type type_: Literal["token_iter", "final"]
+    :param add_token: The token to add to the output
+        (only applicable if type_ is 'token_iter').
+    :type add_token: Optional[str]
+    :param prompt: The original prompt sent to the backend.
+    :type prompt: Optional[str]
+    :param output: The final generated output (only applicable if type_ is 'final').
+    :type output: Optional[str]
+    :param prompt_token_count: The number of tokens in the prompt.
+    :type prompt_token_count: Optional[int]
+    :param output_token_count: The number of tokens in the output.
+    :type output_token_count: Optional[int]
     """

-    type_: str  # One of 'token_iter', 'final'
+    type_: Literal["token_iter", "final"]
     add_token: Optional[str] = None
     prompt: Optional[str] = None
     output: Optional[str] = None
@@ -39,7 +43,14 @@ class GenerativeResponse:

 class Backend(ABC):
     """
-    An abstract base class with template methods for generative AI backends.
+    Abstract base class for generative AI backends.
+
+    This class provides a common interface for creating and interacting with different
+    generative AI backends. Subclasses should implement the abstract methods to
+    define specific backend behavior.
+
+    :cvar _registry: A dictionary that maps BackendEngine types to backend classes.
+    :type _registry: Dict[BackendEngine, Type[Backend]]
     """

     _registry: Dict[BackendEngine, "Type[Backend]"] = {}
@@ -50,33 +61,38 @@ def register(cls, backend_type: BackendEngine):
         A decorator to register a backend class in the backend registry.

         :param backend_type: The type of backend to register.
-        :type backend_type: BackendType
+        :type backend_type: BackendEngine
+        :return: The decorated backend class.
+        :rtype: Type[Backend]
         """

         def inner_wrapper(wrapped_class: Type["Backend"]):
             cls._registry[backend_type] = wrapped_class
+            logger.info("Registered backend type: {}", backend_type)
             return wrapped_class

         return inner_wrapper

     @classmethod
     def create(cls, backend_type: BackendEngine, **kwargs) -> "Backend":
         """
-        Factory method to create a backend based on the backend type.
+        Factory method to create a backend instance based on the backend type.

         :param backend_type: The type of backend to create.
-        :type backend_type: BackendType
+        :type backend_type: BackendEngine
         :param kwargs: Additional arguments for backend initialization.
         :type kwargs: dict
         :return: An instance of a subclass of Backend.
         :rtype: Backend
+        :raises ValueError: If the backend type is not registered.
         """

-        logger.info(f"Creating backend of type {backend_type}")
+        logger.info("Creating backend of type {}", backend_type)

         if backend_type not in cls._registry:
-            logger.error(f"Unsupported backend type: {backend_type}")
-            raise ValueError(f"Unsupported backend type: {backend_type}")
+            err = ValueError(f"Unsupported backend type: {backend_type}")
+            logger.error("{}", err)
+            raise err

         return Backend._registry[backend_type](**kwargs)

@@ -87,82 +103,119 @@ def default_model(self) -> str:

         :return: The default model.
         :rtype: str
+        :raises ValueError: If no models are available.
         """
         return _cachable_default_model(self)

-    def submit(self, request: TextGenerationRequest) -> TextGenerationResult:
+    async def submit(self, request: TextGenerationRequest) -> TextGenerationResult:
         """
-        Submit a result request and populate the BenchmarkResult.
+        Submit a text generation request and return the result.

-        :param request: The result request to submit.
+        This method handles the request submission to the backend and processes
+        the response in a streaming fashion if applicable.
+
+        :param request: The request object containing the prompt
+            and other configurations.
         :type request: TextGenerationRequest
-        :return: The populated result result.
+        :return: The result of the text generation request.
         :rtype: TextGenerationResult
+        :raises ValueError: If no response is received from the backend.
         """

-        logger.info(f"Submitting request with prompt: {request.prompt}")
+        logger.debug("Submitting request with prompt: {}", request.prompt)

-        result = TextGenerationResult(
-            request=TextGenerationRequest(prompt=request.prompt),
-        )
+        result = TextGenerationResult(request=request)
         result.start(request.prompt)
+        received_final = False

-        for response in self.make_request(request):  # GenerativeResponse
-            if response.type_ == "token_iter" and response.add_token:
-                result.output_token(response.add_token)
+        async for response in self.make_request(request):
+            logger.debug("Received response: {}", response)
+            if response.type_ == "token_iter":
+                result.output_token(response.add_token if response.add_token else "")
             elif response.type_ == "final":
+                if received_final:
+                    err = ValueError(
+                        "Received multiple final responses from the backend."
+                    )
+                    logger.error(err)
+                    raise err
+
                 result.end(
+                    output=response.output,
                     prompt_token_count=response.prompt_token_count,
                     output_token_count=response.output_token_count,
                 )
+                received_final = True
+            else:
+                err = ValueError(
+                    f"Invalid response received from the backend of type: "
+                    f"{response.type_} for {response}"
+                )
+                logger.error(err)
+                raise err

-        logger.info(f"Request completed with output: {result.output}")
+        if not received_final:
+            err = ValueError("No final response received from the backend.")
+            logger.error(err)
+            raise err
+
+        logger.info("Request completed with output: {}", result.output)

         return result

     @abstractmethod
-    def make_request(
+    async def make_request(
         self,
         request: TextGenerationRequest,
-    ) -> Iterator[GenerativeResponse]:
+    ) -> AsyncGenerator[GenerativeResponse, None]:
         """
         Abstract method to make a request to the backend.

-        :param request: The result request to submit.
+        Subclasses must implement this method to define how requests are handled
+        by the backend.
+
+        :param request: The request object containing the prompt and
+            other configurations.
         :type request: TextGenerationRequest
-        :return: An iterator over the generative responses.
-        :rtype: Iterator[GenerativeResponse]
+        :yield: A generator yielding responses from the backend.
+        :rtype: AsyncGenerator[GenerativeResponse, None]
         """
-        raise NotImplementedError
+        yield None  # type: ignore  # noqa: PGH003

     @abstractmethod
     def available_models(self) -> List[str]:
         """
         Abstract method to get the available models for the backend.

+        Subclasses must implement this method to provide the list of models
+        supported by the backend.
+
         :return: A list of available models.
         :rtype: List[str]
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def model_tokenizer(self, model: str) -> Optional[str]:
-        """
-        Abstract method to get the tokenizer for a model.
-
-        :param model: The model to get the tokenizer for.
-        :type model: str
-        :return: The tokenizer for the model, or None if it cannot be created.
-        :rtype: Optional[str]
+        :raises NotImplementedError: If the method is not implemented by a subclass.
         """
         raise NotImplementedError


 @functools.lru_cache(maxsize=1)
 def _cachable_default_model(backend: Backend) -> str:
-    if models := backend.available_models():
-        logger.debug(f"Default model: {models[0]}")
+    """
+    Get the default model for a backend using LRU caching.
+
+    This function caches the default model to optimize repeated lookups.
+
+    :param backend: The backend instance for which to get the default model.
+    :type backend: Backend
+    :return: The default model.
+    :rtype: str
+    :raises ValueError: If no models are available.
+    """
+    logger.debug("Getting default model for backend: {}", backend)
+    models = backend.available_models()
+    if models:
+        logger.debug("Default model: {}", models[0])
         return models[0]

-    logger.error("No models available.")
-    raise ValueError("No models available.")
+    err = ValueError("No models available.")
+    logger.error(err)
+    raise err
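
Taken together, the registry decorator, the `Literal`-based `BackendEngine`, and the async `submit` loop compose as shown below. This is a minimal sketch of a custom backend built against the refactored interface; the `EchoBackend` class, its reuse of the `"test"` engine name, and the exact field usage are illustrative assumptions, not code from this commit:

```python
import asyncio
from typing import AsyncGenerator, List

from guidellm.backend.base import Backend, GenerativeResponse
from guidellm.core import TextGenerationRequest


@Backend.register("test")  # overrides any backend already registered as "test"
class EchoBackend(Backend):
    """Hypothetical backend that streams the prompt back token by token."""

    async def make_request(
        self, request: TextGenerationRequest
    ) -> AsyncGenerator[GenerativeResponse, None]:
        for token in request.prompt.split():
            # Intermediate tokens are accumulated by Backend.submit().
            yield GenerativeResponse(type_="token_iter", add_token=token + " ")
        # Exactly one final response; submit() raises ValueError otherwise.
        yield GenerativeResponse(
            type_="final", prompt=request.prompt, output=request.prompt
        )

    def available_models(self) -> List[str]:
        return ["echo-model"]


async def main() -> None:
    backend = Backend.create("test")  # resolved through the class registry
    result = await backend.submit(TextGenerationRequest(prompt="hello world"))
    print(result.output)


asyncio.run(main())
```

Note how the refactored `submit` enforces the streaming contract visible in the diff: every `token_iter` response is appended via `result.output_token`, a duplicate `final` response raises, and a missing `final` response raises as well.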
