Commit 67ad047 (1 parent: 1ac5a0d)

Added context awareness in streaming and updated DOCS

File tree: 6 files changed, +55 −12 lines

CHANGELOG.md

Lines changed: 13 additions & 0 deletions

```diff
@@ -2,6 +2,19 @@
 
 All notable changes to LocalLab will be documented in this file.
 
+## [0.4.46] - 2024-03-14
+
+### Added
+
+- Added context awareness to streaming generation
+- Enhanced streaming response quality with context tracking
+- Improved streaming response coherence by maintaining conversation history
+- Updated documentation with streaming context examples
+
+### Fixed
+
+- Fixed streaming response formatting issues
+- Improved error handling in streaming generation
+- Enhanced token cleanup for better readability
+
 ## [0.4.45] - 2024-03-14
 
 ### Fixed
```
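For reference, a sketch of the request body the 0.4.46 client now builds for streaming generation; the field set mirrors the client.py diff further down, while the option values and model id are purely illustrative:

```python
# Sketch of the streaming request body assembled by the updated client
# (field set taken from the client.py diff below; values illustrative).
payload = {
    "prompt": "What happens next in the story?",
    "stream": True,
    "max_tokens": 1024,          # taken from options.max_length
    "temperature": 0.7,
    "top_p": 0.9,
    "model": "microsoft/phi-2",  # illustrative model id
    # New in this commit: the most recent context entries travel with
    # every streaming request.
    "context": [
        {"role": "user", "content": "Tell me a story about a robot"},
        {"role": "assistant", "content": "Once upon a time, a robot..."},
    ],
}
# None values are stripped before sending, mirroring the client code.
payload = {k: v for k, v in payload.items() if v is not None}
```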

client/python_client/locallab/client.py

Lines changed: 14 additions & 9 deletions

```diff
@@ -103,6 +103,7 @@ def __init__(self, config: Union[str, LocalLabConfig, Dict[str, Any]]):
         self.config = config
         self.session: Optional[aiohttp.ClientSession] = None
         self.ws: Optional[websockets.WebSocketClientProtocol] = None
+        self._stream_context = []  # Add context tracking
 
     async def __aenter__(self):
         await self.connect()
@@ -166,20 +167,21 @@ async def _request(self, method: str, path: str, **kwargs) -> Any:
                 await asyncio.sleep(2 ** attempt)
 
     async def stream_generate(self, prompt: str, options: Optional[Union[GenerateOptions, Dict]] = None) -> AsyncGenerator[str, None]:
-        """Stream generated text"""
+        """Stream generated text with context awareness"""
         if isinstance(options, dict):
             options = GenerateOptions(**options)
         if options is None:
             options = GenerateOptions()
 
-        # Format data consistently
+        # Format data with context
         data = {
             "prompt": prompt,
             "stream": True,
             "max_tokens": options.max_length,
             "temperature": options.temperature,
             "top_p": options.top_p,
-            "model": options.model_id
+            "model": options.model_id,
+            "context": self._stream_context[-5:] if self._stream_context else []  # Send last 5 exchanges
         }
         # Remove None values
         data = {k: v for k, v in data.items() if v is not None}
@@ -195,7 +197,7 @@ async def stream_generate(self, prompt: str, options: Optional[Union[GenerateOpt
                     yield f"Error: {error_msg}"
                     return
 
-                buffer = ""
+                current_response = ""
                 async for line in response.content:
                     if line:
                         try:
@@ -217,7 +219,6 @@ async def stream_generate(self, prompt: str, options: Optional[Union[GenerateOpt
                                 # Handle different response formats
                                 text = data.get("text", data.get("response", data.get("content", "")))
                             except json.JSONDecodeError:
-                                # If not JSON, use raw line
                                 text = line
 
                             if text:
@@ -235,20 +236,24 @@ async def stream_generate(self, prompt: str, options: Optional[Union[GenerateOpt
                                             .replace("user:", ""))
 
                                 # Add space between words if needed
-                                if (buffer and
+                                if (current_response and
                                     not text.startswith(" ") and
                                     not text.startswith("\n") and
-                                    not buffer.endswith(" ") and
-                                    not buffer.endswith("\n")):
+                                    not current_response.endswith(" ") and
+                                    not current_response.endswith("\n")):
                                     text = " " + text
 
-                                buffer += text
+                                current_response += text
                                 yield text
 
                         except Exception as e:
                             logger.error(f"Error processing stream chunk: {str(e)}")
                             yield f"\nError: Failed to process response - {str(e)}"
                             return
+
+                # Update context with the full exchange
+                self._stream_context.append({"role": "user", "content": prompt})
+                self._stream_context.append({"role": "assistant", "content": current_response})
 
         except Exception as e:
             logger.error(f"Stream connection error: {str(e)}")
```

client/python_client/setup.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -5,7 +5,7 @@
 
 setup(
     name="locallab-client",
-    version="1.0.2",
+    version="1.0.3",
     author="Utkarsh Tiwari",
     author_email="utkarshweb2023@gmail.com",
     description="Official Python client for LocalLab - A local LLM server",
```

docs/guides/examples.md

Lines changed: 25 additions & 0 deletions

````diff
@@ -133,6 +133,31 @@ async def stream_chat():
         print(token, end="", flush=True)
 ```
 
+### Stream Text Generation with Context
+
+The streaming generation now maintains context of the conversation for more coherent responses:
+
+```python
+async def stream_with_context():
+    client = LocalLabClient("http://localhost:8000")
+    try:
+        # First response
+        print("Q: Tell me a story about a robot")
+        async for token in client.stream_generate("Tell me a story about a robot"):
+            print(token, end="", flush=True)
+        print("\n")
+
+        # Follow-up question (will have context from previous response)
+        print("Q: What happens next in the story?")
+        async for token in client.stream_generate("What happens next in the story?"):
+            print(token, end="", flush=True)
+        print("\n")
+    finally:
+        await client.close()
+```
+
+The client maintains a context of recent exchanges, allowing for more coherent follow-up responses. The context is automatically managed and includes up to 5 previous exchanges.
+
 ## Batch Processing
 
 ### Process Multiple Prompts
````
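Note that the commit adds no public way to reset this history. Until one exists, a caller who wants a fresh conversation can clear the private list directly; a minimal sketch, assuming the `_stream_context` attribute shown in the client.py diff above:

```python
# _stream_context is a plain list on the client (a private attribute,
# so this may break in future releases); clearing it discards all
# accumulated conversation history.
client._stream_context.clear()
```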

locallab/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -2,7 +2,7 @@
 LocalLab - A lightweight AI inference server for running LLMs locally
 """
 
-__version__ = "0.4.45"
+__version__ = "0.4.46"
 
 # Only import what's necessary initially, lazy-load the rest
 from .logger import get_logger
```

setup.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -5,7 +5,7 @@
 
 setup(
     name="locallab",
-    version="0.4.45",
+    version="0.4.46",
     packages=find_packages(include=["locallab", "locallab.*"]),
     install_requires=[
         "fastapi>=0.95.0,<1.0.0",
```
