Commit fbc7110

Fixed Server Issues and Improved The CLI and logging system
1 parent 19ba977 commit fbc7110

File tree

14 files changed: +221 -78 lines changed


CHANGELOG.md

Lines changed: 17 additions & 0 deletions
@@ -2,6 +2,23 @@
 
 All notable changes to LocalLab will be documented in this file.
 
+## [0.6.0] - 2024-05-02
+
+### Added
+
+- Added `do_sample` parameter to all generation endpoints in the API
+- Updated API documentation to include the `do_sample` parameter with description and examples
+- Added clear messages before and after model downloads for better user experience
+
+### Fixed
+
+- Fixed model downloading logs to display properly without interleaving
+- Implemented a custom progress bar system for Hugging Face downloads
+- Suppressed regular logs during model downloads to avoid interference with progress bars
+- Enhanced progress bar display with better formatting and descriptions
+- Fixed client error with `do_sample` parameter by adding it to all client methods
+- Updated client package version to 1.0.9 to reflect these fixes
+
 ## [0.5.9] - 2024-05-01
 
 ### Fixed
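The `do_sample` flag added in this release mirrors the option of the same name in Hugging Face `transformers`. The sketch below is illustrative only — it is not LocalLab code, and it assumes the server ultimately forwards these request options to the model's `generate()` call; it shows how the flag switches between sampling and greedy decoding.

```python
# Illustrative sketch: what do_sample typically controls once a request reaches
# a transformers model. The model name is a placeholder, not LocalLab's default.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("Write a haiku about GPUs", return_tensors="pt")

# do_sample=True: temperature / top_p / top_k / repetition_penalty shape the output
sampled = model.generate(
    **inputs, max_new_tokens=40, do_sample=True,
    temperature=0.7, top_p=0.9, top_k=80, repetition_penalty=1.15,
)

# do_sample=False: greedy decoding, deterministic for a given prompt
greedy = model.generate(**inputs, max_new_tokens=40, do_sample=False)

print(tokenizer.decode(sampled[0], skip_special_tokens=True))
print(tokenizer.decode(greedy[0], skip_special_tokens=True))
```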

client/python_client/locallab_client/client.py

Lines changed: 10 additions & 5 deletions
@@ -235,7 +235,8 @@ async def generate(
         top_p: float = 0.9,
         timeout: float = 180.0, # Increased timeout for more complete responses (3 minutes)
         repetition_penalty: float = 1.15, # Added repetition penalty for better quality
-        top_k: int = 80 # Added top_k parameter for better quality
+        top_k: int = 80, # Added top_k parameter for better quality
+        do_sample: bool = True # Added do_sample parameter
     ) -> str:
         """Generate text using the model with improved error handling"""
         # Update activity timestamp
@@ -249,7 +250,8 @@ async def generate(
             "temperature": temperature,
             "top_p": top_p,
             "repetition_penalty": repetition_penalty,
-            "top_k": top_k
+            "top_k": top_k,
+            "do_sample": do_sample
         }
 
         if stream:
@@ -261,7 +263,8 @@ async def generate(
                 top_p=top_p,
                 timeout=timeout,
                 repetition_penalty=repetition_penalty,
-                top_k=top_k
+                top_k=top_k,
+                do_sample=do_sample
             )
 
         # Create a timeout for this specific request
@@ -307,7 +310,8 @@ async def stream_generate(
         timeout: float = 300.0, # Increased timeout for more complete responses (5 minutes)
         retry_count: int = 3, # Increased retry count for better reliability
         repetition_penalty: float = 1.15, # Increased repetition penalty for better quality
-        top_k: int = 80 # Added top_k parameter for better quality
+        top_k: int = 80, # Added top_k parameter for better quality
+        do_sample: bool = True # Added do_sample parameter
     ) -> AsyncGenerator[str, None]:
         """
         Stream text generation with token-level streaming and robust error handling.
@@ -341,7 +345,8 @@ async def stream_generate(
             "temperature": temperature,
             "top_p": top_p,
             "repetition_penalty": repetition_penalty,
-            "top_k": top_k
+            "top_k": top_k,
+            "do_sample": do_sample
         }
 
         # Create a timeout for this specific request
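For reference, a minimal sketch of calling the updated async client with the new parameter follows. The class name `LocalLabClient` and the local URL are assumptions for illustration, not confirmed by this diff; substitute whatever your installation actually exports.

```python
# Hypothetical usage sketch -- class name and server URL are assumptions.
import asyncio
from locallab_client import LocalLabClient  # exported name assumed

async def main():
    client = LocalLabClient("http://localhost:8000")
    # Sampling on (the default): temperature / top_p / top_k influence the output
    print(await client.generate("Name three uses for a Raspberry Pi", do_sample=True))
    # Sampling off: greedy decoding for a more deterministic answer
    print(await client.generate("What is 2 + 2?", do_sample=False))

asyncio.run(main())
```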

client/python_client/locallab_client/sync_client.py

Lines changed: 9 additions & 4 deletions
@@ -141,7 +141,8 @@ def generate(
         temperature: float = 0.7,
         top_p: float = 0.9,
         repetition_penalty: float = 1.15, # Increased repetition penalty for better quality
-        top_k: int = 80 # Added top_k parameter for better quality
+        top_k: int = 80, # Added top_k parameter for better quality
+        do_sample: bool = True # Added do_sample parameter
     ) -> Union[str, Generator[str, None, None]]:
         """
         Generate text using the model with improved quality settings.
@@ -171,7 +172,8 @@ def generate(
                 temperature=temperature,
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
-                top_k=top_k
+                top_k=top_k,
+                do_sample=do_sample
             )
 
         return self._run_coroutine(
@@ -184,6 +186,7 @@ def generate(
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
                 top_k=top_k,
+                do_sample=do_sample,
                 timeout=180.0 # Increased timeout for more complete responses (3 minutes)
             )
         )
@@ -197,7 +200,8 @@ def stream_generate(
         top_p: float = 0.9,
         timeout: float = 300.0, # Increased timeout for more complete responses (5 minutes)
         repetition_penalty: float = 1.15, # Increased repetition penalty for better quality
-        top_k: int = 80 # Added top_k parameter for better quality
+        top_k: int = 80, # Added top_k parameter for better quality
+        do_sample: bool = True # Added do_sample parameter
     ) -> Generator[str, None, None]:
         """
         Stream text generation with improved quality and reliability.
@@ -234,7 +238,8 @@ async def producer():
                 timeout=timeout,
                 retry_count=3, # Increased retry count for better reliability
                 repetition_penalty=repetition_penalty, # Pass the repetition penalty parameter
-                top_k=top_k # Pass the top_k parameter
+                top_k=top_k, # Pass the top_k parameter
+                do_sample=do_sample # Pass the do_sample parameter
             ):
                 await queue.put(chunk)
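The synchronous wrapper gains the same flag. A short hedged sketch follows; the `SyncLocalLabClient` name is assumed here and not confirmed by this diff.

```python
# Hypothetical usage sketch for the sync wrapper -- the class name is assumed.
from locallab_client import SyncLocalLabClient  # exported name assumed

client = SyncLocalLabClient("http://localhost:8000")
print(client.generate("Summarize what LocalLab does", do_sample=True))

# stream_generate() yields chunks as they arrive and now also accepts do_sample
for chunk in client.stream_generate("Write a two-line poem", do_sample=True):
    print(chunk, end="", flush=True)
```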

client/python_client/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "locallab-client"
-version = "1.0.8"
+version = "1.0.9"
 description = "Python client for LocalLab - A local LLM server"
 readme = "README.md"
 authors = [

client/python_client/setup.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 setup(
     name="locallab-client",
-    version="1.0.8",
+    version="1.0.9",
     author="Utkarsh",
     author_email="utkarshweb2023@gmail.com",
     description="Python client for LocalLab - A local LLM server",

docs/colab/README.md

Lines changed: 23 additions & 26 deletions
@@ -35,40 +35,37 @@ graph TD
 
 ### Part 1: Setting Up the Server
 
-1. **Get Required Tokens**
+1. **Get Required Tokens**
 
-```python
-# Get these ready:
-NGROK_TOKEN = "..." # from ngrok.com (for remote access)
-HF_TOKEN = "..." # from huggingface.co (optional)
-```
+```python
+# Get these ready:
+NGROK_TOKEN = "..." # from ngrok.com (for remote access)
+HF_TOKEN = "..." # from huggingface.co (optional)
+```
 
-2. **Install LocalLab Server Package**
+2. **Install LocalLab Server Package**
 
-```python
-!pip install locallab
-```
+```python
+!pip install locallab
+```
 
-3. **Configure Environment**
+3. **Configure Environment**
 
-```python
+#### Method 1: Using CLI (recommended)
 
-```
+```python
+!locallab config # Enable Ngrok and Hugging face and provide tokens.
+```
 
-# Method 1: Using CLI (recommended)
-
-!locallab config
-// Enable Ngrok and Hugging face and provide tokens.
-
-# OR Method 2: Using Environment Variables
+#### OR Method 2: Using Environment Variables
 
-import os
-os.environ["NGROK_AUTH_TOKEN"] = NGROK_TOKEN
-os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN # Optional
+```python
+import os
+os.environ["NGROK_AUTH_TOKEN"] = NGROK_TOKEN
+os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN # Optional
+```
 
-````
-
-4. **Start Server with Ngrok for Remote Access**
+4. **Start Server with Ngrok for Remote Access**
 
 ```python
 # Method 1: Using CLI (recommended)
@@ -81,7 +78,7 @@ start_server(use_ngrok=True)
 # The server will display a public URL like:
 # 🚀 Ngrok Public URL: https://abc123.ngrok.app
 # COPY THIS URL - you'll need it to connect!
-````
+```
 
 ### Part 2: Connecting with the Client
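Part 2 of the guide covers connecting a client to the URL printed above. A hypothetical Colab cell might look like the following; the `LocalLabClient` name and the ngrok URL are placeholders, and the top-level `await` relies on Colab/IPython notebook semantics.

```python
# Hypothetical Colab cell for Part 2 -- client class name and URL are placeholders.
from locallab_client import LocalLabClient  # exported name assumed

client = LocalLabClient("https://abc123.ngrok.app")  # paste the URL printed in Part 1
print(await client.generate("Hello from Colab!", do_sample=True))
```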

docs/guides/API.md

Lines changed: 14 additions & 6 deletions
@@ -36,7 +36,8 @@ Generate text using the loaded model.
   "temperature": "float",
   "top_p": "float",
   "top_k": "integer",
-  "repetition_penalty": "float"
+  "repetition_penalty": "float",
+  "do_sample": "boolean"
 }
 ```
 
@@ -49,6 +50,7 @@ Generate text using the loaded model.
 | `top_p` | 0.9 | Nucleus sampling parameter (higher = more diverse responses) |
 | `top_k` | 80 | Limits vocabulary to top K tokens (higher = more diverse vocabulary) |
 | `repetition_penalty` | 1.15 | Penalizes repetition (higher = less repetition) |
+| `do_sample` | true | Whether to use sampling; if false, uses greedy decoding |
 
 > **Note**: All parameters are optional. If not provided, the server will use the default values shown above.
 
@@ -86,7 +88,8 @@ curl -X POST "${BASE_URL}/generate" \
     "temperature": 0.7,
     "top_p": 0.9,
     "top_k": 80,
-    "repetition_penalty": 1.15
+    "repetition_penalty": 1.15,
+    "do_sample": true
   }'
 
 # Streaming generation
@@ -127,7 +130,8 @@ Chat completion endpoint similar to OpenAI's API.
   "temperature": "float",
   "top_p": "float",
   "top_k": "integer",
-  "repetition_penalty": "float"
+  "repetition_penalty": "float",
+  "do_sample": "boolean"
 }
 ```
 
@@ -181,7 +185,8 @@ curl -X POST "${BASE_URL}/chat" \
     "temperature": 0.7,
     "top_p": 0.9,
     "top_k": 80,
-    "repetition_penalty": 1.15
+    "repetition_penalty": 1.15,
+    "do_sample": true
   }'
 
 # Streaming chat
@@ -212,7 +217,8 @@ Generate text for multiple prompts in parallel.
   "temperature": "float",
   "top_p": "float",
   "top_k": "integer",
-  "repetition_penalty": "float"
+  "repetition_penalty": "float",
+  "do_sample": "boolean"
 }
 ```
 
@@ -259,7 +265,8 @@ curl -X POST "${BASE_URL}/generate/batch" \
     "temperature": 0.7,
     "top_p": 0.9,
     "top_k": 80,
-    "repetition_penalty": 1.15
+    "repetition_penalty": 1.15,
+    "do_sample": true
   }'
 ```
 
@@ -378,6 +385,7 @@ All generation endpoints have sensible defaults for the response quality paramet
 - `top_p`: 0.9
 - `top_k`: 80
 - `repetition_penalty`: 1.15
+- `do_sample`: true
 
 You can omit any or all of these parameters in your requests, and the server will use these defaults.
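To complement the curl examples, here is a minimal Python sketch of the same request. The base URL is a placeholder, and the response schema is not shown in this diff, so the result is printed as raw JSON.

```python
# Sketch of a raw REST call exercising the new do_sample field on /generate.
# Base URL is a placeholder; omitted fields fall back to the server defaults above.
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={
        "prompt": "List the planets of the solar system",
        "do_sample": False,  # greedy decoding; the sampling knobs are then ignored
    },
    timeout=180,
)
resp.raise_for_status()
print(resp.json())
```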

locallab/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 LocalLab - A lightweight AI inference server for running LLMs locally
 """
 
-__version__ = "0.5.9" # Updated to match setup.py
+__version__ = "0.6.1" # Updated to fix CLI config environment variable issue
 
 # Only import what's necessary initially, lazy-load the rest
 from .logger import get_logger

locallab/cli/interactive.py

Lines changed: 25 additions & 1 deletion
@@ -151,11 +151,23 @@ def prompt_for_config(use_ngrok: bool = None, port: int = None, ngrok_auth_token
             default=config.get("enable_flash_attention", ENABLE_FLASH_ATTENTION)
         )
 
-        config["enable_better_transformer"] = click.confirm(
+        config["enable_bettertransformer"] = click.confirm(
             "Enable better transformer?",
             default=config.get("enable_bettertransformer", ENABLE_BETTERTRANSFORMER)
         )
 
+        # Set environment variables for optimization settings
+        os.environ["LOCALLAB_ENABLE_QUANTIZATION"] = str(config["enable_quantization"]).lower()
+        os.environ["LOCALLAB_QUANTIZATION_TYPE"] = str(config["quantization_type"]) if config["enable_quantization"] else ""
+        os.environ["LOCALLAB_ENABLE_CPU_OFFLOADING"] = str(config["enable_cpu_offloading"]).lower()
+        os.environ["LOCALLAB_ENABLE_ATTENTION_SLICING"] = str(config["enable_attention_slicing"]).lower()
+        os.environ["LOCALLAB_ENABLE_FLASH_ATTENTION"] = str(config["enable_flash_attention"]).lower()
+        os.environ["LOCALLAB_ENABLE_BETTERTRANSFORMER"] = str(config["enable_bettertransformer"]).lower()
+
+        # Save the optimization settings to config file
+        from .config import save_config
+        save_config(config)
+
         click.echo("\n✅ Optimization settings updated!")
     else:
         # If user doesn't want to configure, use the current values or defaults
@@ -172,6 +184,18 @@ def prompt_for_config(use_ngrok: bool = None, port: int = None, ngrok_auth_token
         if 'enable_bettertransformer' not in config:
             config["enable_bettertransformer"] = ENABLE_BETTERTRANSFORMER
 
+        # Set environment variables for optimization settings
+        os.environ["LOCALLAB_ENABLE_QUANTIZATION"] = str(config["enable_quantization"]).lower()
+        os.environ["LOCALLAB_QUANTIZATION_TYPE"] = str(config["quantization_type"]) if config["enable_quantization"] else ""
+        os.environ["LOCALLAB_ENABLE_CPU_OFFLOADING"] = str(config["enable_cpu_offloading"]).lower()
+        os.environ["LOCALLAB_ENABLE_ATTENTION_SLICING"] = str(config["enable_attention_slicing"]).lower()
+        os.environ["LOCALLAB_ENABLE_FLASH_ATTENTION"] = str(config["enable_flash_attention"]).lower()
+        os.environ["LOCALLAB_ENABLE_BETTERTRANSFORMER"] = str(config["enable_bettertransformer"]).lower()
+
+        # Save the optimization settings to config file
+        from .config import save_config
+        save_config(config)
+
         click.echo("\nUsing current optimization settings.")
 
     # Advanced Settings
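For context on how these exported variables are meant to be consumed, here is a hedged sketch of the string-to-boolean round trip. This is not LocalLab's actual config loader, just an illustration of reading back the `LOCALLAB_*` values written above.

```python
# Hypothetical consumer of the LOCALLAB_* variables exported by the CLI.
import os

def _env_flag(name: str, default: bool = False) -> bool:
    """Interpret the 'true'/'false' strings written by the CLI as booleans."""
    return os.environ.get(name, str(default)).strip().lower() in ("1", "true", "yes")

enable_quantization = _env_flag("LOCALLAB_ENABLE_QUANTIZATION")
quantization_type = os.environ.get("LOCALLAB_QUANTIZATION_TYPE") or None  # empty string -> None
enable_bettertransformer = _env_flag("LOCALLAB_ENABLE_BETTERTRANSFORMER")

print(enable_quantization, quantization_type, enable_bettertransformer)
```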

locallab/logger/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -109,6 +109,18 @@ class SubduedColoredFormatter(logging.Formatter):
     """Formatter that adds subdued colors to regular logs and bright colors to important logs"""
 
     def format(self, record):
+        # Check if we're currently downloading a model
+        try:
+            from ..utils.progress import is_model_downloading
+            if is_model_downloading():
+                # During model download, only show critical logs
+                if record.levelno < logging.ERROR:
+                    # Skip non-critical logs during model download
+                    return ""
+        except (ImportError, AttributeError):
+            # If we can't import the function, continue as normal
+            pass
+
         # Check if this is an important message that should stand out
         is_important = False
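The `is_model_downloading` helper imported above lives in `locallab/utils/progress`, which is not part of this diff. One plausible shape for that interface is sketched below as an assumption, not as the actual module.

```python
# Assumed shape of locallab/utils/progress.py -- the real module is not shown here.
import threading

_lock = threading.Lock()
_downloading = False

def set_model_downloading(active: bool) -> None:
    """Flip the flag around a Hugging Face download so the formatter can mute logs."""
    global _downloading
    with _lock:
        _downloading = active

def is_model_downloading() -> bool:
    """Queried by SubduedColoredFormatter.format() to drop sub-ERROR records."""
    with _lock:
        return _downloading
```

With a contract like this, the formatter returns an empty string for records below ERROR while a download is in flight, which keeps the custom progress bars from being broken up by interleaved log lines.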
