Add llamafile as a provider #10203

Open

wants to merge 7 commits into base: main

158 changes: 158 additions & 0 deletions docs/my-website/docs/providers/llamafile.md
@@ -0,0 +1,158 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Llamafile

LiteLLM supports all models on Llamafile.

| Property | Details |
|---------------------------|--------------------------------------------------------------------------------------------------------------------------------------|
| Description | llamafile lets you distribute and run LLMs with a single file. [Docs](https://github.com/Mozilla-Ocho/llamafile/blob/main/README.md) |
| Provider Route on LiteLLM | `llamafile/` (for OpenAI compatible server) |
| Provider Doc | [llamafile ↗](https://github.com/Mozilla-Ocho/llamafile/blob/main/llama.cpp/server/README.md#api-endpoints) |
| Supported Endpoints | `/chat/completions`, `/embeddings`, `/completions` |


# Quick Start

## Usage - litellm.completion (calling OpenAI compatible endpoint)
llamafile provides an OpenAI-compatible endpoint for chat completions. Here's how to call it with LiteLLM.

To use LiteLLM to call llamafile, add the following to your completion call:

* `model="llamafile/<your-llamafile-model-name>"`
* `api_base="your-hosted-llamafile"`

```python
import litellm

messages = [{"role": "user", "content": "Hey, how's it going?"}]

response = litellm.completion(
    model="llamafile/mistralai/mistral-7b-instruct-v0.2",  # add llamafile/ prefix to route as llamafile provider
    messages=messages,
    api_base="http://localhost:8080/v1",
    temperature=0.2,
    max_tokens=80,
)

print(response)
```
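
Streaming works the same way as with any other OpenAI-compatible provider. A minimal sketch, using the same placeholder model name and `api_base` as above:

```python
import litellm

messages = [{"role": "user", "content": "Write a haiku about local LLMs."}]

response = litellm.completion(
    model="llamafile/mistralai/mistral-7b-instruct-v0.2",
    messages=messages,
    api_base="http://localhost:8080/v1",
    stream=True,
)

for chunk in response:
    # each chunk follows the OpenAI streaming delta format; content can be None
    print(chunk.choices[0].delta.content or "", end="")
```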


## Usage - LiteLLM Proxy Server (calling OpenAI compatible endpoint)

Here's how to call an OpenAI-Compatible Endpoint with the LiteLLM Proxy Server

1. Modify the config.yaml

```yaml
model_list:
  - model_name: my-model
    litellm_params:
      model: llamafile/mistralai/mistral-7b-instruct-v0.2 # add llamafile/ prefix to route as OpenAI provider
      api_base: http://localhost:8080/v1                  # add api base for OpenAI compatible provider
```

2. Start the proxy

```bash
$ litellm --config /path/to/config.yaml
```

3. Send a request to the LiteLLM Proxy Server

<Tabs>

<TabItem value="openai" label="OpenAI Python v1.0.0+">

```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",             # pass litellm proxy key, if you're using virtual keys
    base_url="http://0.0.0.0:4000" # litellm-proxy base url
)

response = client.chat.completions.create(
    model="my-model",
    messages=[
        {
            "role": "user",
            "content": "what llm are you"
        }
    ],
)

print(response)
```
</TabItem>

<TabItem value="curl" label="curl">

```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "model": "my-model",
    "messages": [
        {
            "role": "user",
            "content": "what llm are you"
        }
    ]
}'
```
</TabItem>

</Tabs>


## Embeddings

<Tabs>
<TabItem value="sdk" label="SDK">

```python
from litellm import embedding
import os

os.environ["LLAMAFILE_API_BASE"] = "http://localhost:8080/v1"

response = embedding(
    model="llamafile/sentence-transformers/all-MiniLM-L6-v2",
    input=["Hello world"],
)

print(response)
```
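
If you prefer not to set an environment variable, the `api_base` can also be passed directly on the call. A minimal sketch, same placeholder model and port as above:

```python
from litellm import embedding

response = embedding(
    model="llamafile/sentence-transformers/all-MiniLM-L6-v2",
    input=["Hello world"],
    api_base="http://localhost:8080/v1",  # an explicitly passed api_base takes precedence over LLAMAFILE_API_BASE
)

print(response)
```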

</TabItem>
<TabItem value="proxy" label="PROXY">

1. Set up config.yaml

```yaml
model_list:
  - model_name: my-model
    litellm_params:
      model: llamafile/sentence-transformers/all-MiniLM-L6-v2 # add llamafile/ prefix to route as OpenAI provider
      api_base: http://localhost:8080/v1                      # add api base for OpenAI compatible provider
```

2. Start the proxy

```bash
$ litellm --config /path/to/config.yaml

# RUNNING on http://0.0.0.0:4000
```

3. Test it!

```bash
curl -L -X POST 'http://0.0.0.0:4000/embeddings' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"input": ["hello world"], "model": "my-model"}'
```

[See OpenAI SDK/Langchain/etc. examples](../proxy/user_keys.md#embeddings)
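
For reference, the equivalent call with the OpenAI Python SDK pointed at the proxy might look like this (same port and key as configured above):

```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",             # litellm proxy virtual key, if you're using virtual keys
    base_url="http://0.0.0.0:4000" # litellm-proxy base url
)

response = client.embeddings.create(
    model="my-model",
    input=["hello world"],
)

print(response)
```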

</TabItem>
</Tabs>
17 changes: 15 additions & 2 deletions docs/my-website/docs/providers/openai_compatible.md
@@ -3,13 +3,26 @@ import TabItem from '@theme/TabItem';

# OpenAI-Compatible Endpoints

:::info

Selecting `openai` as the provider routes your request to an OpenAI-compatible endpoint using the upstream
[official OpenAI Python API library](https://github.com/openai/openai-python/blob/main/README.md).

This library **requires** an API key for all requests, either through the `api_key` parameter
or the `OPENAI_API_KEY` environment variable.

If you don’t want to provide a fake API key in each request, consider using a provider that directly matches your
OpenAI-compatible endpoint, such as [`hosted_vllm`](/docs/providers/vllm) or [`llamafile`](/docs/providers/llamafile).

:::
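
To illustrate the difference, here is a minimal sketch against a local OpenAI-compatible server (the model name and port are placeholders):

```python
import litellm

messages = [{"role": "user", "content": "Hello!"}]

# Via the generic `openai/` route: the OpenAI client requires a key,
# so a placeholder has to be supplied even if the server ignores it.
response = litellm.completion(
    model="openai/mistralai/mistral-7b-instruct-v0.2",
    api_base="http://localhost:8080/v1",
    api_key="fake-api-key",
    messages=messages,
)

# Via a provider that matches the endpoint (here `llamafile/`): no key is needed.
response = litellm.completion(
    model="llamafile/mistralai/mistral-7b-instruct-v0.2",
    api_base="http://localhost:8080/v1",
    messages=messages,
)
```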

To call models hosted behind an openai proxy, make 2 changes:

1. For `/chat/completions`: Put `openai/` in front of your model name, so litellm knows you're trying to call an openai `/chat/completions` endpoint.

2. For `/completions`: Put `text-completion-openai/` in front of your model name, so litellm knows you're trying to call an openai `/completions` endpoint. [NOT REQUIRED for `openai/` endpoints called via `/v1/completions` route].
1. For `/completions`: Put `text-completion-openai/` in front of your model name, so litellm knows you're trying to call an openai `/completions` endpoint. [NOT REQUIRED for `openai/` endpoints called via `/v1/completions` route].

2. **Do NOT** add anything additional to the base url e.g. `/v1/embedding`. LiteLLM uses the openai-client to make these calls, and that automatically adds the relevant endpoints.
1. **Do NOT** add anything additional to the base url e.g. `/v1/embedding`. LiteLLM uses the openai-client to make these calls, and that automatically adds the relevant endpoints.


## Usage - completion
1 change: 1 addition & 0 deletions docs/my-website/sidebars.js
@@ -236,6 +236,7 @@ const sidebars = {
"providers/fireworks_ai",
"providers/clarifai",
"providers/vllm",
"providers/llamafile",
"providers/infinity",
"providers/xinference",
"providers/cloudflare_workers",
2 changes: 1 addition & 1 deletion litellm/__init__.py
@@ -72,7 +72,6 @@
from litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager
import httpx
import dotenv
from enum import Enum

litellm_mode = os.getenv("LITELLM_MODE", "DEV") # "PRODUCTION", "DEV"
if litellm_mode == "DEV":
@@ -1013,6 +1012,7 @@ def add_known_models():
from .llms.azure.chat.gpt_transformation import AzureOpenAIConfig
from .llms.azure.completion.transformation import AzureOpenAITextConfig
from .llms.hosted_vllm.chat.transformation import HostedVLLMChatConfig
from .llms.llamafile.chat.transformation import LlamafileChatConfig
from .llms.litellm_proxy.chat.transformation import LiteLLMProxyChatConfig
from .llms.vllm.completion.transformation import VLLMConfig
from .llms.deepseek.chat.transformation import DeepSeekChatConfig
3 changes: 3 additions & 0 deletions litellm/constants.py
@@ -155,6 +155,7 @@
"custom",
"litellm_proxy",
"hosted_vllm",
"llamafile",
"lm_studio",
"galadriel",
]
@@ -244,6 +245,7 @@
"github",
"litellm_proxy",
"hosted_vllm",
"llamafile",
"lm_studio",
"galadriel",
]
@@ -252,6 +254,7 @@
"together_ai",
"fireworks_ai",
"hosted_vllm",
"llamafile",
]
)
_openai_like_providers: List = [
7 changes: 6 additions & 1 deletion litellm/litellm_core_utils/get_llm_provider_logic.py
@@ -101,7 +101,6 @@ def get_llm_provider( # noqa: PLR0915

    Return model, custom_llm_provider, dynamic_api_key, api_base
    """

    try:
        ## IF LITELLM PARAMS GIVEN ##
        if litellm_params is not None:
@@ -477,6 +476,12 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915
        ) = litellm.HostedVLLMChatConfig()._get_openai_compatible_provider_info(
            api_base, api_key
        )
    elif custom_llm_provider == "llamafile":
        # llamafile is OpenAI compatible.
        (api_base, dynamic_api_key) = litellm.LlamafileChatConfig()._get_openai_compatible_provider_info(
            api_base,
            api_key
        )
    elif custom_llm_provider == "lm_studio":
        # lm_studio is openai compatible, we just need to set this to custom_openai
        (
46 changes: 46 additions & 0 deletions litellm/llms/llamafile/chat/transformation.py
@@ -0,0 +1,46 @@
from typing import Optional, Tuple

from litellm.secret_managers.main import get_secret_str

from ...openai.chat.gpt_transformation import OpenAIGPTConfig


class LlamafileChatConfig(OpenAIGPTConfig):
    """LlamafileChatConfig is used to provide configuration for llamafile's chat API."""

    @staticmethod
    def _resolve_api_key(api_key: Optional[str] = None) -> str:
        """Attempt to ensure that the API key is set, preferring the user-provided key
        over the secret manager key (``LLAMAFILE_API_KEY``).

        If both are None, a fake API key is returned.
        """
        return api_key or get_secret_str("LLAMAFILE_API_KEY") or "fake-api-key"  # llamafile does not require an API key

    @staticmethod
    def _resolve_api_base(api_base: Optional[str] = None) -> Optional[str]:
        """Attempt to ensure that the API base is set, preferring the user-provided value
        over the secret manager value (``LLAMAFILE_API_BASE``).

        If both are None, a default llamafile server URL is returned.
        See: https://github.com/Mozilla-Ocho/llamafile/blob/bd1bbe9aabb1ee12dbdcafa8936db443c571eb9d/README.md#L61
        """
        return api_base or get_secret_str("LLAMAFILE_API_BASE") or "http://127.0.0.1:8080/v1"  # type: ignore

    def _get_openai_compatible_provider_info(
        self,
        api_base: Optional[str],
        api_key: Optional[str]
    ) -> Tuple[Optional[str], Optional[str]]:
        """Attempts to ensure that the API base and key are set, preferring user-provided values
        before falling back to secret manager values (``LLAMAFILE_API_BASE`` and ``LLAMAFILE_API_KEY``,
        respectively).

        If an API key cannot be resolved via either method, a fake key is returned. llamafile
        does not require an API key, but the underlying OpenAI library may expect one anyway.
        """
        api_base = LlamafileChatConfig._resolve_api_base(api_base)
        dynamic_api_key = LlamafileChatConfig._resolve_api_key(api_key)

        return api_base, dynamic_api_key
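
The resolution order described in the docstrings could be exercised like this. A minimal sketch; it assumes no external secret manager is configured, so `LLAMAFILE_API_BASE` is read straight from the environment, and the hostnames are placeholders:

```python
import os

from litellm.llms.llamafile.chat.transformation import LlamafileChatConfig

config = LlamafileChatConfig()

# Nothing provided and no env vars set: fall back to the documented defaults.
api_base, api_key = config._get_openai_compatible_provider_info(api_base=None, api_key=None)
print(api_base, api_key)  # http://127.0.0.1:8080/v1 fake-api-key

# An environment variable overrides the default...
os.environ["LLAMAFILE_API_BASE"] = "http://my-llamafile-host:9090/v1"
api_base, _ = config._get_openai_compatible_provider_info(api_base=None, api_key=None)
print(api_base)  # http://my-llamafile-host:9090/v1

# ...and an explicitly passed value overrides both.
api_base, _ = config._get_openai_compatible_provider_info(
    api_base="http://localhost:8080/v1", api_key=None
)
print(api_base)  # http://localhost:8080/v1
```
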
1 change: 1 addition & 0 deletions litellm/main.py
@@ -3581,6 +3581,7 @@ def embedding( # noqa: PLR0915
custom_llm_provider == "openai_like"
or custom_llm_provider == "jina_ai"
or custom_llm_provider == "hosted_vllm"
or custom_llm_provider == "llamafile"
or custom_llm_provider == "lm_studio"
):
api_base = (
1 change: 1 addition & 0 deletions litellm/types/utils.py
@@ -2093,6 +2093,7 @@ class LlmProviders(str, Enum):
CUSTOM = "custom"
LITELLM_PROXY = "litellm_proxy"
HOSTED_VLLM = "hosted_vllm"
LLAMAFILE = "llamafile"
LM_STUDIO = "lm_studio"
GALADRIEL = "galadriel"
INFINITY = "infinity"
2 changes: 2 additions & 0 deletions litellm/utils.py
@@ -6463,6 +6463,8 @@ def get_provider_chat_config( # noqa: PLR0915
        return litellm.AiohttpOpenAIChatConfig()
    elif litellm.LlmProviders.HOSTED_VLLM == provider:
        return litellm.HostedVLLMChatConfig()
    elif litellm.LlmProviders.LLAMAFILE == provider:
        return litellm.LlamafileChatConfig()
    elif litellm.LlmProviders.LM_STUDIO == provider:
        return litellm.LMStudioChatConfig()
    elif litellm.LlmProviders.GALADRIEL == provider: