Commit fbc7110

Fixed Server Issues and Improved The CLI and logging system
1 parent 19ba977 commit fbc7110

File tree

14 files changed: +221 -78 lines changed


CHANGELOG.md

Lines changed: 17 additions & 0 deletions
@@ -2,6 +2,23 @@
 
 All notable changes to LocalLab will be documented in this file.
 
+## [0.6.0] - 2024-05-02
+
+### Added
+
+- Added `do_sample` parameter to all generation endpoints in the API
+- Updated API documentation to include the `do_sample` parameter with description and examples
+- Added clear messages before and after model downloads for better user experience
+
+### Fixed
+
+- Fixed model downloading logs to display properly without interleaving
+- Implemented a custom progress bar system for Hugging Face downloads
+- Suppressed regular logs during model downloads to avoid interference with progress bars
+- Enhanced progress bar display with better formatting and descriptions
+- Fixed client error with `do_sample` parameter by adding it to all client methods
+- Updated client package version to 1.0.9 to reflect these fixes
+
 ## [0.5.9] - 2024-05-01
 
 ### Fixed
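The `do_sample` flag added in this release mirrors the option of the same name in Hugging Face `transformers`. The sketch below is illustrative only — it is not LocalLab code, and it assumes the server ultimately forwards these request options to the model's `generate()` call; it shows how the flag switches between sampling and greedy decoding.

```python
# Illustrative sketch: what do_sample typically controls once a request reaches
# a transformers model. The model name is a placeholder, not LocalLab's default.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Write a haiku about GPUs", return_tensors="pt")

# do_sample=True: temperature / top_p / top_k / repetition_penalty shape the output
sampled = model.generate(
    **inputs, max_new_tokens=40, do_sample=True,
    temperature=0.7, top_p=0.9, top_k=80, repetition_penalty=1.15,
)

# do_sample=False: greedy decoding, deterministic for a given prompt
greedy = model.generate(**inputs, max_new_tokens=40, do_sample=False)

print(tokenizer.decode(sampled[0], skip_special_tokens=True))
print(tokenizer.decode(greedy[0], skip_special_tokens=True))
```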

client/python_client/locallab_client/client.py

Lines changed: 10 additions & 5 deletions
@@ -235,7 +235,8 @@ async def generate(
         top_p: float = 0.9,
         timeout: float = 180.0, # Increased timeout for more complete responses (3 minutes)
         repetition_penalty: float = 1.15, # Added repetition penalty for better quality
-        top_k: int = 80 # Added top_k parameter for better quality
+        top_k: int = 80, # Added top_k parameter for better quality
+        do_sample: bool = True # Added do_sample parameter
     ) -> str:
         """Generate text using the model with improved error handling"""
         # Update activity timestamp
@@ -249,7 +250,8 @@ async def generate(
             "temperature": temperature,
             "top_p": top_p,
             "repetition_penalty": repetition_penalty,
-            "top_k": top_k
+            "top_k": top_k,
+            "do_sample": do_sample
         }
 
         if stream:
@@ -261,7 +263,8 @@ async def generate(
                 top_p=top_p,
                 timeout=timeout,
                 repetition_penalty=repetition_penalty,
-                top_k=top_k
+                top_k=top_k,
+                do_sample=do_sample
             )
 
         # Create a timeout for this specific request
@@ -307,7 +310,8 @@ async def stream_generate(
         timeout: float = 300.0, # Increased timeout for more complete responses (5 minutes)
         retry_count: int = 3, # Increased retry count for better reliability
         repetition_penalty: float = 1.15, # Increased repetition penalty for better quality
-        top_k: int = 80 # Added top_k parameter for better quality
+        top_k: int = 80, # Added top_k parameter for better quality
+        do_sample: bool = True # Added do_sample parameter
     ) -> AsyncGenerator[str, None]:
         """
         Stream text generation with token-level streaming and robust error handling.
@@ -341,7 +345,8 @@ async def stream_generate(
             "temperature": temperature,
             "top_p": top_p,
             "repetition_penalty": repetition_penalty,
-            "top_k": top_k
+            "top_k": top_k,
+            "do_sample": do_sample
         }
 
         # Create a timeout for this specific request
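For reference, a minimal sketch of calling the updated async client with the new parameter follows. The class name `LocalLabClient` and the local URL are assumptions for illustration, not confirmed by this diff; substitute whatever your installation actually exports.

```python
# Hypothetical usage sketch -- class name and server URL are assumptions.
import asyncio
from locallab_client import LocalLabClient  # exported name assumed

async def main():
    client = LocalLabClient("http://localhost:8000")
    # Sampling on (the default): temperature / top_p / top_k influence the output
    print(await client.generate("Name three uses for a Raspberry Pi", do_sample=True))
    # Sampling off: greedy decoding for a more deterministic answer
    print(await client.generate("What is 2 + 2?", do_sample=False))

asyncio.run(main())
```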

client/python_client/locallab_client/sync_client.py

Lines changed: 9 additions & 4 deletions
@@ -141,7 +141,8 @@ def generate(
         temperature: float = 0.7,
         top_p: float = 0.9,
         repetition_penalty: float = 1.15, # Increased repetition penalty for better quality
-        top_k: int = 80 # Added top_k parameter for better quality
+        top_k: int = 80, # Added top_k parameter for better quality
+        do_sample: bool = True # Added do_sample parameter
     ) -> Union[str, Generator[str, None, None]]:
         """
         Generate text using the model with improved quality settings.
@@ -171,7 +172,8 @@ def generate(
                 temperature=temperature,
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
-                top_k=top_k
+                top_k=top_k,
+                do_sample=do_sample
             )
 
         return self._run_coroutine(
@@ -184,6 +186,7 @@ def generate(
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
                 top_k=top_k,
+                do_sample=do_sample,
                 timeout=180.0 # Increased timeout for more complete responses (3 minutes)
             )
         )
@@ -197,7 +200,8 @@ def stream_generate(
         top_p: float = 0.9,
         timeout: float = 300.0, # Increased timeout for more complete responses (5 minutes)
         repetition_penalty: float = 1.15, # Increased repetition penalty for better quality
-        top_k: int = 80 # Added top_k parameter for better quality
+        top_k: int = 80, # Added top_k parameter for better quality
+        do_sample: bool = True # Added do_sample parameter
     ) -> Generator[str, None, None]:
         """
         Stream text generation with improved quality and reliability.
@@ -234,7 +238,8 @@ async def producer():
                 timeout=timeout,
                 retry_count=3, # Increased retry count for better reliability
                 repetition_penalty=repetition_penalty, # Pass the repetition penalty parameter
-                top_k=top_k # Pass the top_k parameter
+                top_k=top_k, # Pass the top_k parameter
+                do_sample=do_sample # Pass the do_sample parameter
             ):
                 await queue.put(chunk)
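The synchronous wrapper gains the same flag. A short hedged sketch follows; the `SyncLocalLabClient` name is assumed here and not confirmed by this diff.

```python
# Hypothetical usage sketch for the sync wrapper -- the class name is assumed.
from locallab_client import SyncLocalLabClient  # exported name assumed

client = SyncLocalLabClient("http://localhost:8000")
print(client.generate("Summarize what LocalLab does", do_sample=True))

# stream_generate() yields chunks as they arrive and now also accepts do_sample
for chunk in client.stream_generate("Write a two-line poem", do_sample=True):
    print(chunk, end="", flush=True)
```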

client/python_client/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "locallab-client"
-version = "1.0.8"
+version = "1.0.9"
 description = "Python client for LocalLab - A local LLM server"
 readme = "README.md"
 authors = [

client/python_client/setup.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 setup(
     name="locallab-client",
-    version="1.0.8",
+    version="1.0.9",
     author="Utkarsh",
     author_email="utkarshweb2023@gmail.com",
     description="Python client for LocalLab - A local LLM server",

docs/colab/README.md

Lines changed: 23 additions & 26 deletions
@@ -35,40 +35,37 @@ graph TD
 
 ### Part 1: Setting Up the Server
 
-1. **Get Required Tokens**
+1. **Get Required Tokens**
 
-```python
-# Get these ready:
-NGROK_TOKEN = "..." # from ngrok.com (for remote access)
-HF_TOKEN = "..." # from huggingface.co (optional)
-```
+```python
+# Get these ready:
+NGROK_TOKEN = "..." # from ngrok.com (for remote access)
+HF_TOKEN = "..." # from huggingface.co (optional)
+```
 
-2. **Install LocalLab Server Package**
+2. **Install LocalLab Server Package**
 
-```python
-!pip install locallab
-```
+```python
+!pip install locallab
+```
 
-3. **Configure Environment**
+3. **Configure Environment**
 
-```python
+#### Method 1: Using CLI (recommended)
 
-```
+```python
+!locallab config # Enable Ngrok and Hugging face and provide tokens.
+```
 
-# Method 1: Using CLI (recommended)
-
-!locallab config
-// Enable Ngrok and Hugging face and provide tokens.
-
-# OR Method 2: Using Environment Variables
+#### OR Method 2: Using Environment Variables
 
-import os
-os.environ["NGROK_AUTH_TOKEN"] = NGROK_TOKEN
-os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN # Optional
+```python
+import os
+os.environ["NGROK_AUTH_TOKEN"] = NGROK_TOKEN
+os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN # Optional
+```
 
-````
-
-4. **Start Server with Ngrok for Remote Access**
+4. **Start Server with Ngrok for Remote Access**
 
 ```python
 # Method 1: Using CLI (recommended)
@@ -81,7 +78,7 @@ start_server(use_ngrok=True)
 # The server will display a public URL like:
 # 🚀 Ngrok Public URL: https://abc123.ngrok.app
 # COPY THIS URL - you'll need it to connect!
-````
+```
 
 ### Part 2: Connecting with the Client
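Part 2 of the guide covers connecting a client to the URL printed above. A hypothetical Colab cell might look like the following; the `LocalLabClient` name and the ngrok URL are placeholders, and the top-level `await` relies on Colab/IPython notebook semantics.

```python
# Hypothetical Colab cell for Part 2 -- client class name and URL are placeholders.
from locallab_client import LocalLabClient  # exported name assumed

client = LocalLabClient("https://abc123.ngrok.app")  # paste the URL printed in Part 1
print(await client.generate("Hello from Colab!", do_sample=True))
```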

docs/guides/API.md

Lines changed: 14 additions & 6 deletions
@@ -36,7 +36,8 @@ Generate text using the loaded model.
   "temperature": "float",
   "top_p": "float",
   "top_k": "integer",
-  "repetition_penalty": "float"
+  "repetition_penalty": "float",
+  "do_sample": "boolean"
 }
 ```
 
@@ -49,6 +50,7 @@ Generate text using the loaded model.
 | `top_p` | 0.9 | Nucleus sampling parameter (higher = more diverse responses) |
 | `top_k` | 80 | Limits vocabulary to top K tokens (higher = more diverse vocabulary) |
 | `repetition_penalty` | 1.15 | Penalizes repetition (higher = less repetition) |
+| `do_sample` | true | Whether to use sampling; if false, uses greedy decoding |
 
 > **Note**: All parameters are optional. If not provided, the server will use the default values shown above.
 
@@ -86,7 +88,8 @@ curl -X POST "${BASE_URL}/generate" \
     "temperature": 0.7,
     "top_p": 0.9,
     "top_k": 80,
-    "repetition_penalty": 1.15
+    "repetition_penalty": 1.15,
+    "do_sample": true
   }'
 
 # Streaming generation
@@ -127,7 +130,8 @@ Chat completion endpoint similar to OpenAI's API.
   "temperature": "float",
   "top_p": "float",
   "top_k": "integer",
-  "repetition_penalty": "float"
+  "repetition_penalty": "float",
+  "do_sample": "boolean"
 }
 ```
 
@@ -181,7 +185,8 @@ curl -X POST "${BASE_URL}/chat" \
     "temperature": 0.7,
     "top_p": 0.9,
     "top_k": 80,
-    "repetition_penalty": 1.15
+    "repetition_penalty": 1.15,
+    "do_sample": true
   }'
 
 # Streaming chat
@@ -212,7 +217,8 @@ Generate text for multiple prompts in parallel.
   "temperature": "float",
   "top_p": "float",
   "top_k": "integer",
-  "repetition_penalty": "float"
+  "repetition_penalty": "float",
+  "do_sample": "boolean"
 }
 ```
 
@@ -259,7 +265,8 @@ curl -X POST "${BASE_URL}/generate/batch" \
     "temperature": 0.7,
     "top_p": 0.9,
     "top_k": 80,
-    "repetition_penalty": 1.15
+    "repetition_penalty": 1.15,
+    "do_sample": true
   }'
 ```
 
@@ -378,6 +385,7 @@ All generation endpoints have sensible defaults for the response quality paramet
 - `top_p`: 0.9
 - `top_k`: 80
 - `repetition_penalty`: 1.15
+- `do_sample`: true
 
 You can omit any or all of these parameters in your requests, and the server will use these defaults.
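To complement the curl examples, here is a minimal Python sketch of the same request. The base URL is a placeholder, and the response schema is not shown in this diff, so the result is printed as raw JSON.

```python
# Sketch of a raw REST call exercising the new do_sample field on /generate.
# Base URL is a placeholder; omitted fields fall back to the server defaults above.
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={
        "prompt": "List the planets of the solar system",
        "do_sample": False,  # greedy decoding; the sampling knobs are then ignored
    },
    timeout=180,
)
resp.raise_for_status()
print(resp.json())
```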

locallab/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 LocalLab - A lightweight AI inference server for running LLMs locally
 """
 
-__version__ = "0.5.9" # Updated to match setup.py
+__version__ = "0.6.1" # Updated to fix CLI config environment variable issue
 
 # Only import what's necessary initially, lazy-load the rest
 from .logger import get_logger

locallab/cli/interactive.py

Lines changed: 25 additions & 1 deletion
@@ -151,11 +151,23 @@ def prompt_for_config(use_ngrok: bool = None, port: int = None, ngrok_auth_token
             default=config.get("enable_flash_attention", ENABLE_FLASH_ATTENTION)
         )
 
-        config["enable_better_transformer"] = click.confirm(
+        config["enable_bettertransformer"] = click.confirm(
             "Enable better transformer?",
             default=config.get("enable_bettertransformer", ENABLE_BETTERTRANSFORMER)
         )
 
+        # Set environment variables for optimization settings
+        os.environ["LOCALLAB_ENABLE_QUANTIZATION"] = str(config["enable_quantization"]).lower()
+        os.environ["LOCALLAB_QUANTIZATION_TYPE"] = str(config["quantization_type"]) if config["enable_quantization"] else ""
+        os.environ["LOCALLAB_ENABLE_CPU_OFFLOADING"] = str(config["enable_cpu_offloading"]).lower()
+        os.environ["LOCALLAB_ENABLE_ATTENTION_SLICING"] = str(config["enable_attention_slicing"]).lower()
+        os.environ["LOCALLAB_ENABLE_FLASH_ATTENTION"] = str(config["enable_flash_attention"]).lower()
+        os.environ["LOCALLAB_ENABLE_BETTERTRANSFORMER"] = str(config["enable_bettertransformer"]).lower()
+
+        # Save the optimization settings to config file
+        from .config import save_config
+        save_config(config)
+
         click.echo("\n✅ Optimization settings updated!")
     else:
         # If user doesn't want to configure, use the current values or defaults
@@ -172,6 +184,18 @@ def prompt_for_config(use_ngrok: bool = None, port: int = None, ngrok_auth_token
         if 'enable_bettertransformer' not in config:
             config["enable_bettertransformer"] = ENABLE_BETTERTRANSFORMER
 
+        # Set environment variables for optimization settings
+        os.environ["LOCALLAB_ENABLE_QUANTIZATION"] = str(config["enable_quantization"]).lower()
+        os.environ["LOCALLAB_QUANTIZATION_TYPE"] = str(config["quantization_type"]) if config["enable_quantization"] else ""
+        os.environ["LOCALLAB_ENABLE_CPU_OFFLOADING"] = str(config["enable_cpu_offloading"]).lower()
+        os.environ["LOCALLAB_ENABLE_ATTENTION_SLICING"] = str(config["enable_attention_slicing"]).lower()
+        os.environ["LOCALLAB_ENABLE_FLASH_ATTENTION"] = str(config["enable_flash_attention"]).lower()
+        os.environ["LOCALLAB_ENABLE_BETTERTRANSFORMER"] = str(config["enable_bettertransformer"]).lower()
+
+        # Save the optimization settings to config file
+        from .config import save_config
+        save_config(config)
+
         click.echo("\nUsing current optimization settings.")
 
     # Advanced Settings
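For context on how these exported variables are meant to be consumed, here is a hedged sketch of the string-to-boolean round trip. This is not LocalLab's actual config loader, just an illustration of reading back the `LOCALLAB_*` values written above.

```python
# Hypothetical consumer of the LOCALLAB_* variables exported by the CLI.
import os

def _env_flag(name: str, default: bool = False) -> bool:
    """Interpret the 'true'/'false' strings written by the CLI as booleans."""
    return os.environ.get(name, str(default)).strip().lower() in ("1", "true", "yes")

enable_quantization = _env_flag("LOCALLAB_ENABLE_QUANTIZATION")
quantization_type = os.environ.get("LOCALLAB_QUANTIZATION_TYPE") or None  # empty string -> None
enable_bettertransformer = _env_flag("LOCALLAB_ENABLE_BETTERTRANSFORMER")

print(enable_quantization, quantization_type, enable_bettertransformer)
```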

locallab/logger/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -109,6 +109,18 @@ class SubduedColoredFormatter(logging.Formatter):
     """Formatter that adds subdued colors to regular logs and bright colors to important logs"""
 
     def format(self, record):
+        # Check if we're currently downloading a model
+        try:
+            from ..utils.progress import is_model_downloading
+            if is_model_downloading():
+                # During model download, only show critical logs
+                if record.levelno < logging.ERROR:
+                    # Skip non-critical logs during model download
+                    return ""
+        except (ImportError, AttributeError):
+            # If we can't import the function, continue as normal
+            pass
+
         # Check if this is an important message that should stand out
         is_important = False
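The `is_model_downloading` helper imported above lives in `locallab/utils/progress`, which is not part of this diff. One plausible shape for that interface is sketched below as an assumption, not as the actual module.

```python
# Assumed shape of locallab/utils/progress.py -- the real module is not shown here.
import threading

_lock = threading.Lock()
_downloading = False

def set_model_downloading(active: bool) -> None:
    """Flip the flag around a Hugging Face download so the formatter can mute logs."""
    global _downloading
    with _lock:
        _downloading = active

def is_model_downloading() -> bool:
    """Queried by SubduedColoredFormatter.format() to drop sub-ERROR records."""
    with _lock:
        return _downloading
```

With a contract like this, the formatter returns an empty string for records below ERROR while a download is in flight, which keeps the custom progress bars from being broken up by interleaved log lines.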
