From 2d5dfc6aad6419bd2481cb192706b15274f51caa Mon Sep 17 00:00:00 2001 From: James Barney Date: Mon, 2 Jun 2025 13:53:23 -0400 Subject: [PATCH 1/7] adding azdo --- api/api.py | 366 ++++++++++++++++- api/azure_openai_client.py | 543 ++++++++++++++++++++++++++ api/azuredevops_client.py | 279 +++++++++++++ api/config.py | 22 +- api/config/embedder.json | 8 + api/config/generator.json | 28 +- api/data_pipeline.py | 263 ++++++++++--- api/rag.py | 43 +- api/requirements.txt | 3 +- api/websocket_wiki.py | 500 ++++++++++++++++++------ next.config.ts | 5 + src/app/[owner]/[repo]/page.tsx | 98 ++++- src/app/page.tsx | 100 ++++- src/components/ConfigurationModal.tsx | 14 +- src/messages/en.json | 2 +- src/messages/es.json | 4 +- src/messages/ja.json | 4 +- src/messages/kr.json | 4 +- src/messages/vi.json | 4 +- src/messages/zh.json | 4 +- 20 files changed, 2079 insertions(+), 215 deletions(-) create mode 100644 api/azure_openai_client.py create mode 100644 api/azuredevops_client.py diff --git a/api/api.py b/api/api.py index 1550f06f..1cba6a89 100644 --- a/api/api.py +++ b/api/api.py @@ -1,8 +1,9 @@ import os import logging +from urllib.parse import quote from fastapi import FastAPI, HTTPException, Query, Request, WebSocket from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, Response +from fastapi.responses import JSONResponse, Response, RedirectResponse from typing import List, Optional, Dict, Any, Literal import json from datetime import datetime @@ -28,14 +29,29 @@ description="API for streaming chat completions" ) -# Configure CORS -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], # Allows all origins - allow_credentials=True, - allow_methods=["*"], # Allows all methods - allow_headers=["*"], # Allows all headers -) +# Create a separate router for API endpoints to ensure they take precedence +api_router = FastAPI() + +# Configure CORS for both app and api_router +for app_instance in [app, api_router]: + app_instance.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Allows all origins + allow_credentials=True, + allow_methods=["*"], # Allows all methods + allow_headers=["*"], # Allows all headers + ) + +# Mount the API router at /api prefix +app.mount("/api", api_router) + +# Import and include debug routes +try: + from api.debug_azure import debug_router + app.include_router(debug_router) + logger.info("Debug routes registered successfully") +except Exception as e: + logger.error(f"Failed to register debug routes: {str(e)}") # Helper function to get adalflow root path def get_adalflow_default_root_path(): @@ -414,7 +430,7 @@ async def save_wiki_cache(data: WikiCacheRequest) -> bool: # --- Wiki Cache API Endpoints --- -@app.get("/api/wiki_cache", response_model=Optional[WikiCacheData]) +@api_router.get("/wiki_cache", response_model=Optional[WikiCacheData]) async def get_cached_wiki( owner: str = Query(..., description="Repository owner"), repo: str = Query(..., description="Repository name"), @@ -434,7 +450,7 @@ async def get_cached_wiki( logger.info(f"Wiki cache not found for {owner}/{repo} ({repo_type}), lang: {language}") return None -@app.post("/api/wiki_cache") +@api_router.post("/wiki_cache") async def store_wiki_cache(request_data: WikiCacheRequest): """ Stores generated wiki data (structure and pages) to the server-side cache. 
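Since the wiki-cache endpoints above moved onto the mounted api_router, their externally visible paths are unchanged: the mount contributes the /api prefix and the router declares only /wiki_cache. A minimal client-side sketch of the relocated GET route follows; the host/port and the owner/repo values are illustrative assumptions, not part of this patch.

import requests  # any HTTP client works

BASE = "http://localhost:8001"  # hypothetical dev-server address

# GET /api/wiki_cache -> the "/api" mount strips the prefix, api_router matches "/wiki_cache"
resp = requests.get(
    f"{BASE}/api/wiki_cache",
    params={
        "owner": "my-org",    # hypothetical repository owner
        "repo": "my-repo",    # hypothetical repository name
        "repo_type": "azure",
        "language": "en",
    },
)
# The endpoint returns the cached WikiCacheData on a hit, or null (None) on a miss.
print(resp.status_code, resp.json())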
@@ -446,7 +462,7 @@ async def store_wiki_cache(request_data: WikiCacheRequest): else: raise HTTPException(status_code=500, detail="Failed to save wiki cache") -@app.delete("/api/wiki_cache") +@api_router.delete("/wiki_cache") async def delete_wiki_cache( owner: str = Query(..., description="Repository owner"), repo: str = Query(..., description="Repository name"), @@ -493,8 +509,332 @@ async def root(): } } +# --- Simplified Route for Azure DevOps Repositories --- +@app.get("/{project}/{repository}") +async def simplified_repo_route( + project: str, + repository: str, + request: Request, + file_tree: str = Query(None), + readme: str = Query(None), + type: str = Query(None), + repo_url: str = Query(None) +): + """ + Simplified route for Azure DevOps repositories without the organization part. + This route receives the file tree and README content as query parameters. + """ + # Skip API routes - they should be handled by their specific endpoints + if project == "api": + raise HTTPException(status_code=404, detail="Not found - Use specific API endpoints") + + logger.info(f"Simplified route triggered for: {project}/{repository}") + logger.info(f"Query parameters: {request.query_params}") + + # If we have the file tree and README in the query parameters, return them directly + if file_tree and readme and type == 'azure': + logger.info("Using file tree and README from query parameters") + return { + "file_tree": file_tree, + "readme": readme + } + + # Otherwise, return a 404 + raise HTTPException(status_code=404, detail="Resource not found") + +# --- Catch-all Route for Azure DevOps Repositories --- +@app.get("/{organization}/{project}/{repository}", include_in_schema=False) +async def catch_all_repo_route( + organization: str, + project: str, + repository: str, + request: Request +): + # Skip well-known paths and other special paths + if organization.startswith('.well-known') or organization.startswith('_next') or organization.startswith('api'): + raise HTTPException(status_code=404, detail="Resource not found") + """ + Catch-all route for repository URLs with the format /{organization}/{project}/{repository}. + This is primarily used for Azure DevOps repositories but could be used for other repository types as well. 
+ """ + logger.info(f"Catch-all route triggered for: {organization}/{project}/{repository}") + + # Get all query parameters + query_params = dict(request.query_params) + logger.info(f"Query parameters: {query_params}") + + # Check if repo_url is provided + if 'repo_url' in query_params: + try: + # Get the repo URL from query parameters + repo_url = query_params.get('repo_url') + repo_type = query_params.get('type', 'github') + + # Handle double-encoded URLs + from urllib.parse import unquote + if '%25' in repo_url: # Double-encoded + repo_url = unquote(unquote(repo_url)) + else: + repo_url = unquote(repo_url) + + logger.info(f"Decoded repo_url: {repo_url}") + + # Get access token based on repository type + access_token = None + if repo_type == 'github': + access_token = os.environ.get("GITHUB_TOKEN") + elif repo_type == 'gitlab': + access_token = os.environ.get("GITLAB_TOKEN") + elif repo_type == 'bitbucket': + access_token = os.environ.get("BITBUCKET_TOKEN") + elif repo_type == 'azure': + access_token = os.environ.get("AZURE_DEVOPS_TOKEN") + + if not access_token and repo_type != 'github': # GitHub can work without a token for public repos + logger.warning(f"{repo_type.upper()}_TOKEN not found in environment variables") + + # Import the necessary modules for repository processing + from api.data_pipeline import download_repo, DatabaseManager + + # Process the repository + try: + # Create a database manager instance + db_manager = DatabaseManager() + + # Prepare the database for the repository + # This will download the repository, process the files, and create embeddings + logger.info(f"Preparing database for {repo_url}") + db_manager.prepare_database(repo_url, repo_type, access_token) + + # Get the repository structure to return to the frontend + # This is similar to what's done in the GitHub/GitLab endpoints + try: + # Get the repository path + repo_path = os.path.join(get_adalflow_default_root_path(), "repos", repository) + logger.info(f"Getting repository structure from: {repo_path}") + + # Use the existing get_local_repo_structure function + # We can call it directly since we're in the same file + result = await get_local_repo_structure(repo_path) + + # Check if the result is a JSONResponse (error) + if isinstance(result, JSONResponse): + logger.error(f"Error getting repository structure: {result.body}") + # Return basic repository information + return { + "repository": { + "name": repository, + "owner": organization, + "project": project, + "url": repo_url + }, + "type": repo_type, + "status": "processing" + } + + # For Azure DevOps repos, redirect to a simpler URL format without the organization + # This makes the frontend handling much more straightforward + from fastapi.responses import RedirectResponse + + # Get provider and model from query parameters if available + provider = request.query_params.get('provider', 'azure') + model = request.query_params.get('model', 'gpt-4o') + language = request.query_params.get('language', 'en') + comprehensive = request.query_params.get('comprehensive', 'true') + + # Construct the redirect URL with query parameters to pass the file tree and README + redirect_url = f"/{project}/{repository}?file_tree={quote(result['file_tree'])}&readme={quote(result['readme'])}&type=azure&repo_url={quote(repo_url)}&provider={provider}&model={model}&language={language}&comprehensive={comprehensive}" + + # Return a redirect response + return RedirectResponse(url=redirect_url, status_code=302) + except Exception as structure_error: + logger.error(f"Error getting 
repository structure: {str(structure_error)}") + # Return basic repository information even if structure retrieval fails + return { + "repository": { + "name": repository, + "owner": organization, + "project": project, + "url": repo_url + }, + "type": repo_type, + "status": "processing" + } + except Exception as process_error: + logger.error(f"Error processing repository: {str(process_error)}") + raise HTTPException(status_code=500, detail=f"Error processing repository: {str(process_error)}") + except Exception as e: + logger.error(f"Error in catch-all route: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error processing repository: {str(e)}") + + # If repo_url is not provided, return 404 + raise HTTPException(status_code=404, detail="Resource not found") + +# --- Azure DevOps Repository Structure Endpoint --- +@app.get("/api/azuredevops/structure") +async def get_azuredevops_structure( + repo_url: str = Query(..., description="URL of the Azure DevOps repository") +): + """ + Get repository structure (file tree and README) for an Azure DevOps repository. + This endpoint is specifically designed for Azure DevOps repositories. + + Args: + repo_url: URL of the Azure DevOps repository + + Returns: + Repository structure with file tree and README content + """ + try: + logger.info(f"Getting Azure DevOps repository structure for {repo_url}") + + # Parse the repository URL to extract organization, project, and repository name + # Format: https://dev.azure.com/{organization}/{project}/_git/{repository} + url_parts = repo_url.split('/') + if len(url_parts) < 6: + raise HTTPException(status_code=400, detail=f"Invalid Azure DevOps repository URL: {repo_url}") + + organization = url_parts[3] + project = url_parts[4] + repository = url_parts[-1] + + logger.info(f"Parsed Azure DevOps URL - Organization: {organization}, Project: {project}, Repository: {repository}") + + # Import the necessary modules for repository processing + from api.data_pipeline import download_repo, DatabaseManager + + # Create a database manager instance + db_manager = DatabaseManager() + + # Get access token if available + access_token = os.getenv('AZURE_DEVOPS_TOKEN') + + # Prepare the database for the repository + # This will download the repository, process the files, and create embeddings + logger.info(f"Preparing database for {repo_url}") + db_manager.prepare_database(repo_url, 'azure', access_token) + + # Get the repository structure + try: + # Get the repository path + repo_path = os.path.join(get_adalflow_default_root_path(), "repos", repository) + logger.info(f"Getting repository structure from: {repo_path}") + + # Use the existing get_local_repo_structure function + result = await get_local_repo_structure(repo_path) + + # Check if the result is a JSONResponse (error) + if isinstance(result, JSONResponse): + logger.error(f"Error getting repository structure: {result.body}") + # Return error response + raise HTTPException(status_code=500, detail="Error getting repository structure") + + # Return just the file tree and README directly + return { + "file_tree": result["file_tree"], + "readme": result["readme"] + } + except Exception as structure_error: + logger.error(f"Error getting repository structure: {str(structure_error)}") + raise HTTPException(status_code=500, detail=f"Error getting repository structure: {str(structure_error)}") + except Exception as e: + logger.error(f"Error in get_azuredevops_structure: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error getting Azure DevOps repository 
structure: {str(e)}") + +# --- Repository Structure Endpoint --- +@app.get("/api/repo/structure") +async def get_repo_structure( + type: str = Query(..., description="Repository type (e.g., github, gitlab, azure)"), + repo_url: str = Query(..., description="URL of the repository"), + owner: str = Query(..., description="Repository owner or organization"), + repo: str = Query(..., description="Repository name") +): + """ + Get repository structure (file tree and README) for a repository. + This is particularly useful for Azure DevOps repositories. + + Args: + type: Repository type (e.g., github, gitlab, azure) + repo_url: URL of the repository + owner: Repository owner or organization + repo: Repository name + + Returns: + Repository structure with file tree and README content + """ + try: + logger.info(f"Getting repository structure for {repo_url}") + + # Import the necessary modules for repository processing + from api.data_pipeline import download_repo, DatabaseManager + + # Create a database manager instance + db_manager = DatabaseManager() + + # Get access token if available + access_token = None + if type == 'azure': + access_token = os.getenv('AZURE_DEVOPS_TOKEN') + elif type == 'github': + access_token = os.getenv('GITHUB_TOKEN') + elif type == 'gitlab': + access_token = os.getenv('GITLAB_TOKEN') + elif type == 'bitbucket': + access_token = os.getenv('BITBUCKET_TOKEN') + + # Prepare the database for the repository + # This will download the repository, process the files, and create embeddings + logger.info(f"Preparing database for {repo_url}") + db_manager.prepare_database(repo_url, type, access_token) + + # Get the repository structure + try: + # Get the repository path + repo_path = os.path.join(get_adalflow_default_root_path(), "repos", repo) + logger.info(f"Getting repository structure from: {repo_path}") + + # Use the existing get_local_repo_structure function + result = await get_local_repo_structure(repo_path) + + # Check if the result is a JSONResponse (error) + if isinstance(result, JSONResponse): + logger.error(f"Error getting repository structure: {result.body}") + # Return basic repository information + return { + "repository": { + "name": repo, + "owner": owner, + "url": repo_url + }, + "type": type, + "status": "processing" + } + + # Return repository information for the frontend in the same format as the GitHub API response + # The frontend expects just file_tree and readme directly + return { + "file_tree": result["file_tree"], + "readme": result["readme"], + "status": "ready" + } + except Exception as structure_error: + logger.error(f"Error getting repository structure: {str(structure_error)}") + # Return basic repository information even if structure retrieval fails + return { + "repository": { + "name": repo, + "owner": owner, + "url": repo_url + }, + "type": type, + "status": "processing" + } + except Exception as e: + logger.error(f"Error in get_repo_structure: {str(e)}") + raise HTTPException(status_code=500, detail=f"Error getting repository structure: {str(e)}") + # --- Processed Projects Endpoint --- (New Endpoint) -@app.get("/api/processed_projects", response_model=List[ProcessedProjectEntry]) +@api_router.get("/processed_projects", response_model=List[ProcessedProjectEntry]) async def get_processed_projects(): """ Lists all processed projects found in the wiki cache directory. 
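Before the azure_openai_client.py diff below, a quick sketch of how a client would call the /api/azuredevops/structure endpoint added above. The server address and repository URL are illustrative assumptions; the PAT is read server-side from AZURE_DEVOPS_TOKEN, so the client sends no credentials.

import requests

BASE = "http://localhost:8001"  # hypothetical dev-server address
# Hypothetical repository; spaces in project names arrive URL-encoded.
repo_url = "https://dev.azure.com/my-org/My%20Project/_git/my-repo"

resp = requests.get(f"{BASE}/api/azuredevops/structure", params={"repo_url": repo_url})
resp.raise_for_status()
payload = resp.json()
# The endpoint returns the same shape the frontend expects from GitHub/GitLab:
# a "file_tree" string and the repository "readme" content.
print(payload["file_tree"][:200])
print(payload["readme"][:200])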
diff --git a/api/azure_openai_client.py b/api/azure_openai_client.py
new file mode 100644
index 00000000..b1906a84
--- /dev/null
+++ b/api/azure_openai_client.py
@@ -0,0 +1,543 @@
+"""Azure OpenAI ModelClient integration."""
+
+import os
+import base64
+from typing import (
+    Dict,
+    Sequence,
+    Optional,
+    List,
+    Any,
+    TypeVar,
+    Callable,
+    Generator,
+    Union,
+    Literal,
+)
+import re
+import logging
+import backoff
+from azure.core.credentials import AzureKeyCredential
+
+# Import OpenAI modules directly
+from openai import AzureOpenAI, AsyncAzureOpenAI, Stream
+from openai import (
+    APITimeoutError,
+    InternalServerError,
+    RateLimitError,
+    UnprocessableEntityError,
+    BadRequestError,
+)
+from openai.types import (
+    Completion,
+    CreateEmbeddingResponse,
+    Image,
+)
+from openai.types.chat import ChatCompletionChunk, ChatCompletion, ChatCompletionMessage
+from openai.types.chat.chat_completion import Choice
+
+from adalflow.core.model_client import ModelClient
+from adalflow.core.types import (
+    ModelType,
+    EmbedderOutput,
+    TokenLogProb,
+    CompletionUsage,
+    GeneratorOutput,
+)
+from adalflow.components.model_client.utils import parse_embedding_response
+
+# Import OpenAI client functions for reuse
+from api.openai_client import (
+    get_first_message_content,
+    estimate_token_count,
+    parse_stream_response,
+    handle_streaming_response,
+    get_all_messages_content,
+    get_probabilities,
+)
+
+log = logging.getLogger(__name__)
+T = TypeVar("T")
+
+
+class AzureOpenAIClient(ModelClient):
+    """A component wrapper for the Azure OpenAI API client.
+
+    Supports both embedding and chat completion APIs, including multimodal capabilities.
+
+    Users can:
+    1. Simplify use of ``Embedder`` and ``Generator`` components by passing `AzureOpenAIClient()` as the `model_client`.
+    2. Use this as a reference to create their own API client or extend this class by copying and modifying the code.
+
+    Note:
+        We recommend avoiding `response_format` to enforce output data type or `tools` and `tool_choice` in `model_kwargs` when calling the API.
+        OpenAI's internal formatting and added prompts are unknown. Instead:
+        - Use :ref:`OutputParser` for response parsing and formatting.
+
+    For multimodal inputs, provide images in `model_kwargs["images"]` as a path, URL, or list of them.
+    The model must support vision capabilities (e.g., `gpt-4o`, `gpt-4-vision`).
+
+    Args:
+        api_key (Optional[str], optional): Azure OpenAI API key. Defaults to `None`.
+        api_version (str, optional): Azure OpenAI API version. Defaults to `"2024-12-01-preview"`.
+        chat_completion_parser (Callable[[Completion], Any], optional): A function to parse the chat completion into a `str`. Defaults to `None`.
+            The default parser is `get_first_message_content`.
+        base_url (str): The API base URL to use when initializing the client.
+        env_api_key_name (str): The environment variable name for the API key. Defaults to `"AZURE_OPENAI_API_KEY"`.
+        env_base_url_name (str): The environment variable name for the base URL. Defaults to `"AZURE_OPENAI_ENDPOINT"`.
+
+    References:
+        - Azure OpenAI API Overview: https://learn.microsoft.com/en-us/azure/ai-services/openai/
+        - Embeddings Guide: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/understand-embeddings
+        - Chat Completion Models: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        api_version: str = "2024-12-01-preview",  # Updated to latest API version
+        chat_completion_parser: Callable[[Completion], Any] = None,
+        input_type: Literal["text", "messages"] = "text",
+        base_url: Optional[str] = None,
+        env_base_url_name: str = "AZURE_OPENAI_ENDPOINT",
+        env_api_key_name: str = "AZURE_OPENAI_API_KEY",
+        model_api_versions: Optional[Dict[str, str]] = None,
+    ):
+        """It is recommended to set the AZURE_OPENAI_API_KEY environment variable instead of passing it as an argument.
+
+        Args:
+            api_key (Optional[str], optional): Azure OpenAI API key. Defaults to None.
+            api_version (str, optional): Azure OpenAI API version. Defaults to "2024-12-01-preview".
+            input_type (Literal["text", "messages"], optional): Whether `call` receives raw text or a prepared message list. Defaults to "text".
+            base_url (str): The API base URL to use when initializing the client.
+            env_api_key_name (str): The environment variable name for the API key. Defaults to `"AZURE_OPENAI_API_KEY"`.
+            env_base_url_name (str): The environment variable name for the base URL. Defaults to `"AZURE_OPENAI_ENDPOINT"`.
+            model_api_versions (Optional[Dict[str, str]], optional): Per-deployment API version overrides. Defaults to None.
+        """
+        super().__init__()
+        self._api_key = api_key
+        self._api_version = api_version
+        self._env_api_key_name = env_api_key_name
+        self._env_base_url_name = env_base_url_name
+        self.base_url = base_url or os.getenv(self._env_base_url_name)
+
+        # Store model-specific API versions
+        self._model_api_versions = model_api_versions or {}
+
+        self.sync_client = self.init_sync_client()
+        self.async_client = None  # only initialize if the async call is called
+        self.chat_completion_parser = (
+            chat_completion_parser or get_first_message_content
+        )
+        self._input_type = input_type
+        self._api_kwargs = {}  # add api kwargs when the Azure OpenAI Client is called
+
+    def init_sync_client(self):
+        """Initialize the synchronous Azure OpenAI client."""
+        api_key = self._api_key or os.getenv(self._env_api_key_name)
+        if not api_key:
+            raise ValueError(
+                f"API key must be provided either as an argument or as an environment variable {self._env_api_key_name}"
+            )
+        if not self.base_url:
+            raise ValueError(
+                f"Base URL must be provided either as an argument or as an environment variable {self._env_base_url_name}"
+            )
+
+        # Use the Azure OpenAI client format compatible with the installed version
+        return AzureOpenAI(
+            api_key=api_key,
+            api_version=self._api_version,
+            azure_endpoint=self.base_url
+        )
+
+    def init_async_client(self):
+        """Initialize the asynchronous Azure OpenAI client."""
+        api_key = self._api_key or os.getenv(self._env_api_key_name)
+        if not api_key:
+            raise ValueError(
+                f"API key must be provided either as an argument or as an environment variable {self._env_api_key_name}"
+            )
+        if not self.base_url:
+            raise ValueError(
+                f"Base URL must be provided either as an argument or as an environment variable {self._env_base_url_name}"
+            )
+
+        # Use the Azure OpenAI client format compatible with the installed version
+        return AsyncAzureOpenAI(
+            api_key=api_key,
+            api_version=self._api_version,
+            azure_endpoint=self.base_url
+        )
+
+    def parse_chat_completion(
+        self,
+        completion: Union[ChatCompletion, Generator[ChatCompletionChunk, None, None]],
+    ):
+        """Parse the completion, and put it into the raw_response."""
+        if isinstance(completion, Generator):
+            # Handle
streaming response + text = "" + for chunk in completion: + content = parse_stream_response(chunk) + if content is not None: + text += content + return text + else: + # Handle non-streaming response + return self.chat_completion_parser(completion) + + def track_completion_usage( + self, + completion: Union[ChatCompletion, Generator[ChatCompletionChunk, None, None]], + ): + """Track the completion usage.""" + if isinstance(completion, Generator): + # For streaming responses, we can't get the usage directly + # We'll estimate it based on the response + text = "" + for chunk in completion: + content = parse_stream_response(chunk) + if content is not None: + text += content + return CompletionUsage( + prompt_tokens=0, # We don't know + completion_tokens=estimate_token_count(text), + total_tokens=0, # We don't know + ) + else: + # For non-streaming responses, we can get the usage directly + return CompletionUsage( + prompt_tokens=completion.usage.prompt_tokens, + completion_tokens=completion.usage.completion_tokens, + total_tokens=completion.usage.total_tokens, + ) + + def parse_embedding_response( + self, response: CreateEmbeddingResponse + ) -> EmbedderOutput: + """Parse the embedding response to a structure Adalflow components can understand. + + Should be called in ``Embedder``. + """ + return parse_embedding_response(response) + + def convert_inputs_to_api_kwargs( + self, + input: Optional[Any] = None, + model_kwargs: Dict = {}, + model_type: ModelType = ModelType.UNDEFINED, + ) -> Dict: + """ + Specify the API input type and output api_kwargs that will be used in _call and _acall methods. + Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format. + For multimodal inputs, images can be provided in model_kwargs["images"] as a string path, URL, or list of them. + The model specified in model_kwargs["model"] must support multimodal capabilities when using images. 
+ + Args: + input: The input text or messages to process + model_kwargs: Additional parameters including: + - images: Optional image source(s) as path, URL, or list of them + - detail: Image detail level ('auto', 'low', or 'high'), defaults to 'auto' + - model: The model to use (must support multimodal inputs if images are provided) + model_type: The type of model (EMBEDDER or LLM) + + Returns: + Dict: API-specific kwargs for the model call + """ + api_kwargs = model_kwargs.copy() + + # Handle different model types + if model_type == ModelType.EMBEDDER: + if isinstance(input, list): + api_kwargs["input"] = input + else: + api_kwargs["input"] = [input] + + # Azure OpenAI requires a deployment_id instead of model + if "model" in api_kwargs: + api_kwargs["deployment_id"] = api_kwargs.pop("model") + + return api_kwargs + + elif model_type == ModelType.LLM: + # Azure OpenAI requires a deployment_id instead of model + if "model" in api_kwargs: + api_kwargs["deployment_id"] = api_kwargs.pop("model") + + # Handle multimodal inputs (images) + images = api_kwargs.pop("images", None) + detail = api_kwargs.pop("detail", "auto") + + if self._input_type == "text" and input is not None: + # Convert text input to messages format + if images: + # For multimodal, we need to format with content list + content = [{"type": "text", "text": input}] + + # Process images + if isinstance(images, str): + images = [images] + + for img in images: + img_content = self._prepare_image_content(img, detail) + content.append(img_content) + + api_kwargs["messages"] = [{"role": "user", "content": content}] + else: + # For text-only, we can use simple format + api_kwargs["messages"] = [{"role": "user", "content": input}] + + elif self._input_type == "messages" and input is not None: + # Input is already in messages format + api_kwargs["messages"] = input + + return api_kwargs + + # For other model types, just pass through the kwargs + return api_kwargs + + def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): + """ + kwargs is the combined input and model_kwargs. Support streaming call. 
+ """ + self._api_kwargs = api_kwargs.copy() + + # Check if we need to use a model-specific API version + deployment_id = api_kwargs.get("deployment_id") + if deployment_id and deployment_id in self._model_api_versions: + # Create a new client with the model-specific API version + api_key = self._api_key or os.getenv(self._env_api_key_name) + model_specific_client = AzureOpenAI( + api_key=api_key, + api_version=self._model_api_versions[deployment_id], + azure_endpoint=self.base_url, + ) + log.info(f"Using model-specific API version {self._model_api_versions[deployment_id]} for model {deployment_id}") + client = model_specific_client + else: + # Use the default client + client = self.sync_client + + # Handle different model types + if model_type == ModelType.EMBEDDER: + # Prepare the embeddings API call + embedding_api_kwargs = api_kwargs.copy() + + # Handle model parameter for Azure OpenAI + # The installed version doesn't support deployment_id, so we need to use model + deployment_id = embedding_api_kwargs.pop('deployment_id', None) + if deployment_id and not embedding_api_kwargs.get('model'): + # Use deployment_id as the model name + embedding_api_kwargs['model'] = deployment_id + log.info(f"Using deployment_id {deployment_id} as model for embeddings") + + # Ensure we have a model parameter + model = embedding_api_kwargs.get('model') + if not model: + model = "text-embedding-ada-002" # Default model + embedding_api_kwargs['model'] = model + log.info(f"Using default model {model} for embeddings") + + # Remove dimensions parameter if it exists - not supported by text-embedding-ada-002 + if 'dimensions' in embedding_api_kwargs: + dimensions = embedding_api_kwargs.pop('dimensions') + log.info(f"Removed dimensions parameter ({dimensions}) as it's not supported by {model}") + + # Ensure we're using the correct API version for text-embedding-3 models + if model and ('text-embedding-3' in model): + # Create a client with the correct API version for text-embedding-3 models + api_key = self._api_key or os.getenv(self._env_api_key_name) + embedding_client = AzureOpenAI( + api_key=api_key, + api_version="2024-02-01", # API version compatible with text-embedding-3 models + azure_endpoint=self.base_url + ) + log.info(f"Using API version 2024-02-01 for {model} embeddings") + # Call the embeddings API with the specialized client + response = embedding_client.embeddings.create(**embedding_api_kwargs) + else: + # Use the standard client for other embedding models + response = client.embeddings.create(**embedding_api_kwargs) + + return self.parse_embedding_response(response) + + elif model_type == ModelType.LLM: + # Handle streaming if requested + stream = api_kwargs.pop("stream", False) + + if stream: + # Handle streaming response + response = client.chat.completions.create( + **api_kwargs, stream=True + ) + return self.parse_chat_completion(response) + else: + # Handle non-streaming response + response = client.chat.completions.create(**api_kwargs) + return self.parse_chat_completion(response) + + # For other model types, raise an error + raise ValueError(f"Unsupported model type: {model_type}") + + async def acall( + self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED + ): + """ + kwargs is the combined input and model_kwargs + """ + if self.async_client is None: + self.async_client = self.init_async_client() + + self._api_kwargs = api_kwargs.copy() + + # Check if we need to use a model-specific API version + deployment_id = api_kwargs.get("deployment_id") + if deployment_id and 
deployment_id in self._model_api_versions: + # Create a new client with the model-specific API version + api_key = self._api_key or os.getenv(self._env_api_key_name) + model_specific_client = AsyncAzureOpenAI( + api_key=api_key, + api_version=self._model_api_versions[deployment_id], + azure_endpoint=self.base_url, + ) + log.info(f"Using model-specific API version {self._model_api_versions[deployment_id]} for model {deployment_id}") + client = model_specific_client + else: + # Use the default client + client = self.async_client + + # Handle different model types + if model_type == ModelType.EMBEDDER: + # Prepare the embeddings API call + embedding_api_kwargs = api_kwargs.copy() + + # Handle model parameter for Azure OpenAI + # The installed version doesn't support deployment_id, so we need to use model + deployment_id = embedding_api_kwargs.pop('deployment_id', None) + if deployment_id and not embedding_api_kwargs.get('model'): + # Use deployment_id as the model name + embedding_api_kwargs['model'] = deployment_id + log.info(f"Using deployment_id {deployment_id} as model for embeddings") + + # Ensure we have a model parameter + model = embedding_api_kwargs.get('model') + if not model: + model = "text-embedding-ada-002" # Default model + embedding_api_kwargs['model'] = model + log.info(f"Using default model {model} for embeddings") + + # Remove dimensions parameter if it exists - not supported by text-embedding-ada-002 + if 'dimensions' in embedding_api_kwargs: + dimensions = embedding_api_kwargs.pop('dimensions') + log.info(f"Removed dimensions parameter ({dimensions}) as it's not supported by {model}") + + # Ensure we're using the correct API version for text-embedding-3 models + if model and ('text-embedding-3' in model): + # Create a client with the correct API version for text-embedding-3 models + api_key = self._api_key or os.getenv(self._env_api_key_name) + embedding_client = AsyncAzureOpenAI( + api_key=api_key, + api_version="2024-02-01", # API version compatible with text-embedding-3 models + azure_endpoint=self.base_url + ) + log.info(f"Using API version 2024-02-01 for {model} embeddings") + # Call the embeddings API with the specialized client + response = await embedding_client.embeddings.create(**embedding_api_kwargs) + else: + # Use the standard client for other embedding models + response = await client.embeddings.create(**embedding_api_kwargs) + + return self.parse_embedding_response(response) + + elif model_type == ModelType.LLM: + # Handle streaming if requested + stream = api_kwargs.pop("stream", False) + + if stream: + # Handle streaming response + response = await client.chat.completions.create( + **api_kwargs, stream=True + ) + # For streaming, return the raw response so it can be iterated over + # This allows the caller to handle the streaming directly + log.info("Returning raw streaming response") + return response + else: + # Handle non-streaming response + response = await client.chat.completions.create(**api_kwargs) + return self.parse_chat_completion(response) + + # For other model types, raise an error + raise ValueError(f"Unsupported model type: {model_type}") + + @classmethod + def from_dict(cls: type[T], data: Dict[str, Any]) -> T: + """Create a client from a dictionary.""" + return cls(**data) + + def to_dict(self) -> Dict[str, Any]: + """Convert the component to a dictionary.""" + return { + "api_key": self._api_key, + "api_version": self._api_version, + "base_url": self.base_url, + "env_base_url_name": self._env_base_url_name, + "env_api_key_name": 
self._env_api_key_name, + "input_type": self._input_type, + "model_api_versions": self._model_api_versions, + } + + def _encode_image(self, image_path: str) -> str: + """Encode image to base64 string. + + Args: + image_path: Path to image file. + + Returns: + Base64 encoded image string. + + Raises: + ValueError: If the file cannot be read or doesn't exist. + """ + try: + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + except Exception as e: + raise ValueError(f"Error encoding image: {str(e)}") + + def _prepare_image_content( + self, image_source: Union[str, Dict[str, Any]], detail: str = "auto" + ) -> Dict[str, Any]: + """Prepare image content for API request. + + Args: + image_source: Either a path to local image or a URL. + detail: Image detail level ('auto', 'low', or 'high'). + + Returns: + Formatted image content for API request. + """ + # If image_source is already a formatted dictionary, return it + if isinstance(image_source, dict) and "type" in image_source: + return image_source + + # Check if the source is a URL or a local file path + is_url = image_source.startswith(("http://", "https://")) + + # Format the image content + if is_url: + return { + "type": "image_url", + "image_url": { + "url": image_source, + "detail": detail + } + } + else: + # Local file path, encode it to base64 + return { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{self._encode_image(image_source)}", + "detail": detail + } + } diff --git a/api/azuredevops_client.py b/api/azuredevops_client.py new file mode 100644 index 00000000..a2c4d67a --- /dev/null +++ b/api/azuredevops_client.py @@ -0,0 +1,279 @@ +import os +import logging +import json +import base64 +import subprocess +from urllib.parse import urlparse, urlunparse, quote + +# Configure logging +logger = logging.getLogger(__name__) + +def get_azuredevops_file_content(repo_url: str, file_path: str, access_token: str = None) -> str: + """ + Retrieves the content of a file from an Azure DevOps repository using the Azure DevOps REST API. 
+ + Args: + repo_url (str): The URL of the Azure DevOps repository + (e.g., "https://dev.azure.com/organization/project/_git/repo") + file_path (str): The path to the file within the repository (e.g., "src/main.py") + access_token (str, optional): Personal access token for Azure DevOps + + Returns: + str: The content of the file as a string + + Raises: + ValueError: If the file cannot be fetched or if the URL is not a valid Azure DevOps URL + """ + logger.info(f"Fetching file content from Azure DevOps: {repo_url}, file: {file_path}") + + try: + # Extract organization, project, and repo name from Azure DevOps URL + if not (repo_url.startswith("https://dev.azure.com/") or repo_url.startswith("http://dev.azure.com/")): + logger.error(f"Invalid Azure DevOps URL format: {repo_url}") + raise ValueError(f"Not a valid Azure DevOps repository URL: {repo_url}") + + # Log the original URL for debugging + logger.info(f"Processing Azure DevOps URL: {repo_url}") + + # Parse the URL using urlparse to handle URL encoding properly + parsed_url = urlparse(repo_url) + logger.info(f"Parsed URL - scheme: {parsed_url.scheme}, netloc: {parsed_url.netloc}, path: {parsed_url.path}") + + path_parts = parsed_url.path.strip('/').split('/') + logger.info(f"Path parts: {path_parts}") + + # Find the organization (first part of the path) + if not path_parts or len(path_parts) < 1: + logger.error("Organization not found in URL path parts") + raise ValueError("Organization not found in URL") + organization = path_parts[0] + logger.info(f"Extracted organization: {organization}") + + # Find the _git part to locate the repository name + try: + git_index = path_parts.index('_git') + logger.info(f"Found _git at index {git_index}") + except ValueError: + logger.error("Could not find '_git' in the URL path") + raise ValueError("Could not find '_git' in the URL path") + + # The repository is the part after _git + if git_index + 1 >= len(path_parts): + logger.error("Repository name not found in URL (no part after _git)") + raise ValueError("Repository name not found in URL") + repository = path_parts[git_index + 1] + logger.info(f"Extracted repository: {repository}") + + # The project is everything between the organization and _git + # For projects with spaces, this will be properly encoded in the URL + if git_index <= 1: + logger.error("Project name not found in URL (git_index <= 1)") + raise ValueError("Project name not found in URL") + + # Use the project name as it appears in the URL (might contain URL encoding) + project = path_parts[1] + logger.info(f"Extracted project name: {project}") + + # For URLs with spaces in project names, we need to preserve the URL encoding + # Use the original parsed path to construct the API URL + project_path = parsed_url.path.split('/_git/')[0] + logger.info(f"Project path from URL: {project_path}") + + organization_path = f"/{organization}" + logger.info(f"Organization path: {organization_path}") + + project_relative_path = project_path[len(organization_path):].lstrip('/') + logger.info(f"Project relative path: {project_relative_path}") + + # Use Azure DevOps REST API to get file content + # The API endpoint for getting file content is: + # https://dev.azure.com/{organization}/{project}/_apis/git/repositories/{repository}/items?path={path}&api-version=7.1 + + # Encode the file path properly for the URL + encoded_file_path = quote(file_path) + logger.info(f"Encoded file path: {encoded_file_path}") + + # Construct the API URL with detailed logging - properly handle spaces in project path + # We 
need to re-encode the project_relative_path for the API URL while preserving the spaces + # This is tricky because we need to encode spaces as %20 but not re-encode already encoded characters + from urllib.parse import quote + + # First, ensure project_relative_path has spaces (not %20) + if '%20' in project_relative_path: + project_relative_path = project_relative_path.replace('%20', ' ') + logger.info(f"Normalized project path: {project_relative_path}") + + # Then encode it properly for the URL + encoded_project_path = quote(project_relative_path) + logger.info(f"Encoded project path: {encoded_project_path}") + + # Construct the final API URL + api_url = f"https://dev.azure.com/{organization}/{encoded_project_path}/_apis/git/repositories/{repository}/items?path={encoded_file_path}&api-version=7.1&includeContent=true" + logger.info(f"Constructed API URL: {api_url}") + + # Add verbose curl output for debugging + curl_cmd = ["curl", "-v", "-s"] + logger.info("Using verbose curl for detailed request/response information") + + # Prepare curl command with authentication if token is provided + if access_token: + # Azure DevOps uses Basic Auth with PAT as the password and empty username + auth_string = f":{access_token}" + encoded_auth = base64.b64encode(auth_string.encode()).decode() + curl_cmd.extend(["-H", f"Authorization: Basic {encoded_auth}"]) + logger.info("Added authentication header to request") + else: + logger.warning("No access token provided for Azure DevOps API request") + + curl_cmd.append(api_url) + + logger.info(f"Executing curl command to fetch file content from Azure DevOps API") + logger.info(f"Full API URL: {api_url}") + + # Execute the curl command with detailed output + result = subprocess.run( + curl_cmd, + check=False, # Don't raise exception on non-zero exit code, we'll handle errors manually + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + # Log the curl command exit code and stderr for debugging + logger.info(f"Curl command exit code: {result.returncode}") + if result.stderr: + logger.info(f"Curl stderr output: {result.stderr}") + + # For Azure DevOps, the API returns the raw file content directly (not base64 encoded) + content = result.stdout + + # Check if we got an error response (usually in JSON format) + if content.startswith('{'): + try: + error_data = json.loads(content) + logger.info(f"Received JSON response: {json.dumps(error_data, indent=2)}") + + if "message" in error_data: + error_message = error_data['message'] + logger.error(f"Azure DevOps API error message: {error_message}") + raise ValueError(f"Azure DevOps API error: {error_message}") + + if "value" in error_data and isinstance(error_data["value"], dict) and "content" in error_data["value"]: + # This is a successful response with content in the value field + logger.info("Successfully retrieved file content in JSON format") + return error_data["value"]["content"] + except json.JSONDecodeError as e: + # If it's not valid JSON but starts with '{', it might still be file content + logger.warning(f"Response starts with '{{' but is not valid JSON: {e}") + pass + + # Check for empty content + if not content.strip(): + logger.error("Received empty response from Azure DevOps API") + raise ValueError("Received empty response from Azure DevOps API") + + # If we get here, assume the content is the raw file content + logger.info(f"Successfully retrieved file content, size: {len(content)} bytes") + return content + + except subprocess.CalledProcessError as e: + error_msg = e.stderr + # 
Sanitize error message to remove any tokens + if access_token and access_token in error_msg: + error_msg = error_msg.replace(access_token, "[REDACTED]") + + logger.error(f"Subprocess error: {e.returncode}, Error message: {error_msg}") + raise ValueError(f"Error fetching file content from Azure DevOps: {error_msg}") + + except Exception as e: + logger.error(f"Unexpected error in get_azuredevops_file_content: {str(e)}") + raise ValueError(f"Unexpected error accessing Azure DevOps: {str(e)}") + +def clone_azuredevops_repo(repo_url: str, local_path: str, access_token: str = None) -> str: + """ + Clones an Azure DevOps repository to a local path. + Handles repositories with spaces in project names. + + Args: + repo_url (str): The URL of the Azure DevOps repository + local_path (str): The local directory where the repository will be cloned + access_token (str, optional): Personal access token for Azure DevOps + + Returns: + str: The output message from the git command + """ + try: + # Check if Git is installed + logger.info(f"Preparing to clone Azure DevOps repository to {local_path}") + subprocess.run( + ["git", "--version"], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # Check if repository already exists + if os.path.exists(local_path) and os.listdir(local_path): + # Directory exists and is not empty + logger.warning(f"Repository already exists at {local_path}. Using existing repository.") + return f"Using existing repository at {local_path}" + + # Ensure the local path exists + os.makedirs(local_path, exist_ok=True) + + # Prepare the clone URL with access token if provided + clone_url = repo_url + + # Handle spaces in project names for Azure DevOps URLs + if " " in repo_url or "%20" in repo_url: + logger.info("Azure DevOps URL contains spaces or encoded spaces, handling specially") + parsed = urlparse(repo_url) + + # Extract components + path = parsed.path + + # Handle spaces in path + if " " in path or "%20" in path: + # Normalize path to have spaces (not %20) + if "%20" in path: + path = path.replace("%20", " ") + + # Then encode it properly for git + from urllib.parse import quote + encoded_path = quote(path) + logger.info(f"Original path: {path}") + logger.info(f"Encoded path for git: {encoded_path}") + + # Reconstruct the URL + clone_url = f"{parsed.scheme}://{parsed.netloc}{encoded_path}" + logger.info(f"Reconstructed URL for git: {clone_url}") + + # Add authentication if token is provided + if access_token: + parsed = urlparse(clone_url) + # Format: https://{username}:{token}@dev.azure.com/... 
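+            # Hypothetical example: a PAT of "abc123" turns
+            #   https://dev.azure.com/org/project/_git/repo into
+            #   https://:abc123@dev.azure.com/org/project/_git/repo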
+ # For Azure DevOps, we use an empty username with the PAT as the password + clone_url = urlunparse((parsed.scheme, f":{access_token}@{parsed.netloc}", parsed.path, '', '', '')) + logger.info("Using access token for authentication") + + # Clone the repository + logger.info(f"Cloning repository from {repo_url} to {local_path}") + # We use repo_url in the log to avoid exposing the token in logs + result = subprocess.run( + ["git", "clone", clone_url, local_path], + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + logger.info("Repository cloned successfully") + return result.stdout.decode("utf-8") + + except subprocess.CalledProcessError as e: + error_msg = e.stderr.decode('utf-8') + # Sanitize error message to remove any tokens + if access_token and access_token in error_msg: + error_msg = error_msg.replace(access_token, "***TOKEN***") + raise ValueError(f"Error during cloning: {error_msg}") + except Exception as e: + raise ValueError(f"An unexpected error occurred: {str(e)}") diff --git a/api/config.py b/api/config.py index db3baba3..6a7309ec 100644 --- a/api/config.py +++ b/api/config.py @@ -8,12 +8,15 @@ from api.openai_client import OpenAIClient from api.openrouter_client import OpenRouterClient +from api.azure_openai_client import AzureOpenAIClient from adalflow import GoogleGenAIClient, OllamaClient # Get API keys from environment variables OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY') OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY') +AZURE_OPENAI_API_KEY = os.environ.get('AZURE_OPENAI_API_KEY') +AZURE_OPENAI_ENDPOINT = os.environ.get('AZURE_OPENAI_ENDPOINT') or os.environ.get('AZURE_OPENAI_API_BASE') # Set keys in environment (in case they're needed elsewhere in the code) if OPENAI_API_KEY: @@ -22,6 +25,14 @@ os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY if OPENROUTER_API_KEY: os.environ["OPENROUTER_API_KEY"] = OPENROUTER_API_KEY +if AZURE_OPENAI_API_KEY: + os.environ["AZURE_OPENAI_API_KEY"] = AZURE_OPENAI_API_KEY + +# Support both old and new Azure OpenAI endpoint environment variables +if AZURE_OPENAI_ENDPOINT: + os.environ["AZURE_OPENAI_ENDPOINT"] = AZURE_OPENAI_ENDPOINT + # Also set the old variable for backward compatibility + os.environ["AZURE_OPENAI_API_BASE"] = AZURE_OPENAI_ENDPOINT # Get configuration directory from environment variable, or use default if not set CONFIG_DIR = os.environ.get('DEEPWIKI_CONFIG_DIR', None) @@ -31,7 +42,8 @@ "GoogleGenAIClient": GoogleGenAIClient, "OpenAIClient": OpenAIClient, "OpenRouterClient": OpenRouterClient, - "OllamaClient": OllamaClient + "OllamaClient": OllamaClient, + "AzureOpenAIClient": AzureOpenAIClient } # Load JSON configuration file @@ -212,6 +224,14 @@ def get_model_config(provider="google", model=None): result["model_kwargs"] = {"model": model, **model_params["options"]} else: result["model_kwargs"] = {"model": model} + elif provider == "azure": + # Azure OpenAI uses deployment_id instead of model + # The model name is used as the deployment_id + result["model_kwargs"] = {"model": model, **model_params} + + # Pass model-specific API versions if available + if "model_api_versions" in provider_config: + result["model_client_kwargs"] = {"model_api_versions": provider_config["model_api_versions"]} else: # Standard structure for other providers result["model_kwargs"] = {"model": model, **model_params} diff --git a/api/config/embedder.json b/api/config/embedder.json index df8ade90..8fb8b7a5 100644 --- a/api/config/embedder.json +++ 
b/api/config/embedder.json @@ -1,5 +1,13 @@ { "embedder": { + "client_class": "AzureOpenAIClient", + "batch_size": 500, + "model_kwargs": { + "model": "text-embedding-ada-002", + "encoding_format": "float" + } + }, + "embedder_openai": { "client_class": "OpenAIClient", "batch_size": 500, "model_kwargs": { diff --git a/api/config/generator.json b/api/config/generator.json index b1e4ac7e..e5bdccc3 100644 --- a/api/config/generator.json +++ b/api/config/generator.json @@ -1,6 +1,32 @@ { - "default_provider": "google", + "default_provider": "azure", "providers": { + "azure": { + "default_model": "gpt-4o", + "client_class": "AzureOpenAIClient", + "supportsCustomModel": true, + "model_api_versions": { + "gpt-4.1": "2025-01-01-preview" + }, + "models": { + "gpt-4o": { + "temperature": 0.7, + "top_p": 0.8 + }, + "gpt-4": { + "temperature": 0.7, + "top_p": 0.8 + }, + "gpt-35-turbo": { + "temperature": 0.7, + "top_p": 0.8 + }, + "gpt-4.1": { + "temperature": 0.7, + "top_p": 0.8 + } + } + }, "google": { "default_model": "gemini-2.0-flash", "supportsCustomModel": true, diff --git a/api/data_pipeline.py b/api/data_pipeline.py index 853aa2bf..d8b4974e 100644 --- a/api/data_pipeline.py +++ b/api/data_pipeline.py @@ -9,15 +9,64 @@ import base64 import re import glob +import xattr from adalflow.utils import get_adalflow_default_root_path from adalflow.core.db import LocalDB from api.config import configs, DEFAULT_EXCLUDED_DIRS, DEFAULT_EXCLUDED_FILES from api.ollama_patch import OllamaDocumentProcessor from urllib.parse import urlparse, urlunparse, quote +import hashlib +from typing import Tuple, Dict, Any # Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) logger = logging.getLogger(__name__) +def get_file_attributes(file_path: str) -> Dict[str, Any]: + """ + Extracts extended file attributes (xattr) from a local file. 
+ + Args: + file_path (str): Path to the file to extract attributes from + + Returns: + Dict[str, Any]: Dictionary of attribute names and their values + """ + attributes = {} + try: + # Get list of all attribute names for the file + attr_names = xattr.listxattr(file_path) + + # For each attribute, get its value and add to the dictionary + for attr_name in attr_names: + try: + # Get the attribute value + attr_value = xattr.getxattr(file_path, attr_name) + + # Try to decode as UTF-8 string if possible + try: + attr_value = attr_value.decode('utf-8') + except (UnicodeDecodeError, AttributeError): + # If not decodable as UTF-8, use as is + pass + + # Add to attributes dictionary + attributes[attr_name] = attr_value + except (OSError, IOError) as e: + logger.warning(f"Error reading attribute {attr_name} from {file_path}: {e}") + + logger.debug(f"Extracted {len(attributes)} attributes from {file_path}") + except (OSError, IOError) as e: + logger.warning(f"Error listing attributes for {file_path}: {e}") + + return attributes + +# No need for a separate get_repo_structure function +# We're using the existing get_local_repo_structure function in api.py + # Maximum token limit for OpenAI embedding models MAX_EMBEDDING_TOKENS = 8192 @@ -45,36 +94,35 @@ def count_tokens(text: str, local_ollama: bool = False) -> int: # Rough approximation: 4 characters per token return len(text) // 4 -def download_repo(repo_url: str, local_path: str, type: str = "github", access_token: str = None) -> str: +def download_repo(repo_url: str, type: str = "github", access_token: str = None) -> str: """ - Downloads a Git repository (GitHub, GitLab, or Bitbucket) to a specified local path. + Downloads a repository to a local directory. Args: - repo_url (str): The URL of the Git repository to clone. - local_path (str): The local directory where the repository will be cloned. - access_token (str, optional): Access token for private repositories. + repo_url (str): URL of the repository to download + type (str): Type of repository (github, gitlab, bitbucket, azure) + access_token (str, optional): Personal access token for private repositories Returns: - str: The output message from the `git` command. + str: Path to the local directory containing the repository """ try: - # Check if Git is installed - logger.info(f"Preparing to clone repository to {local_path}") - subprocess.run( - ["git", "--version"], - check=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - # Check if repository already exists - if os.path.exists(local_path) and os.listdir(local_path): - # Directory exists and is not empty - logger.warning(f"Repository already exists at {local_path}. 
Using existing repository.") - return f"Using existing repository at {local_path}" - - # Ensure the local path exists + # Log the repository URL and type for debugging + logger.info(f"Downloading repository from {repo_url} of type {type}") + + # Create a unique directory name based on the repo URL + repo_hash = hashlib.md5(repo_url.encode()).hexdigest() + local_path = os.path.join(get_adalflow_default_root_path(), "repos", repo_hash) + logger.info(f"Generated local path: {local_path} (hash: {repo_hash})") + + # Check if the repository already exists locally + if os.path.exists(local_path): + logger.info(f"Repository already exists at {local_path}") + return local_path + + # Create the directory if it doesn't exist os.makedirs(local_path, exist_ok=True) + logger.info(f"Created directory for repository at {local_path}") # Prepare the clone URL with access token if provided clone_url = repo_url @@ -90,6 +138,22 @@ def download_repo(repo_url: str, local_path: str, type: str = "github", access_t elif type == "bitbucket": # Format: https://{token}@bitbucket.org/owner/repo.git clone_url = urlunparse((parsed.scheme, f"{access_token}@{parsed.netloc}", parsed.path, '', '', '')) + elif type == "azure": + # Format for Azure DevOps: https://organization@dev.azure.com/organization/project/_git/repo.git + # For Azure DevOps, we need to ensure the URL is in the correct format for git clone + # Extract the organization name from the path + path_parts = parsed.path.strip('/').split('/') + organization = path_parts[0] if path_parts else "" + + # The path should end with .git + path = parsed.path + + # Use the organization name as the username with the PAT as password + clone_url = urlunparse((parsed.scheme, f"{organization}:{access_token}@{parsed.netloc}", path, '', '', '')) + + # Log the URL format (without exposing the token) + sanitized_url = urlunparse((parsed.scheme, f"{organization}:***TOKEN***@{parsed.netloc}", path, '', '', '')) + logger.info(f"Azure DevOps clone URL format (sanitized): {sanitized_url}") logger.info("Using access token for authentication") # Clone the repository @@ -103,7 +167,8 @@ def download_repo(repo_url: str, local_path: str, type: str = "github", access_t ) logger.info("Repository cloned successfully") - return result.stdout.decode("utf-8") + # Return the local path instead of the command output + return local_path except subprocess.CalledProcessError as e: error_msg = e.stderr.decode('utf-8') @@ -140,7 +205,7 @@ def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: Lis documents = [] # File extensions to look for, prioritizing code files code_extensions = [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs", - ".jsx", ".tsx", ".html", ".css", ".php", ".swift", ".cs"] + ".jsx", ".tsx", ".html", ".css", ".php", ".swift", ".cs", ".tf", ".tfvars"] doc_extensions = [".md", ".txt", ".rst", ".json", ".yaml", ".yml"] # Determine filtering mode: inclusion or exclusion @@ -286,17 +351,28 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List if token_count > MAX_EMBEDDING_TOKENS * 10: logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit") continue + + # Extract file attributes + file_attrs = get_file_attributes(file_path) + + # Prepare metadata + metadata = { + "file_path": relative_path, + "type": ext[1:], + "is_code": True, + "is_implementation": is_implementation, + "title": relative_path, + "token_count": token_count, + } + + # Add file attributes to metadata + if file_attrs: + 
metadata["file_attributes"] = file_attrs + logger.info(f"Added {len(file_attrs)} file attributes to {relative_path}") doc = Document( text=content, - meta_data={ - "file_path": relative_path, - "type": ext[1:], - "is_code": True, - "is_implementation": is_implementation, - "title": relative_path, - "token_count": token_count, - }, + meta_data=metadata, ) documents.append(doc) except Exception as e: @@ -320,17 +396,28 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List if token_count > MAX_EMBEDDING_TOKENS: logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit") continue + + # Extract file attributes + file_attrs = get_file_attributes(file_path) + + # Prepare metadata + metadata = { + "file_path": relative_path, + "type": ext[1:], + "is_code": False, + "is_implementation": False, + "title": relative_path, + "token_count": token_count, + } + + # Add file attributes to metadata + if file_attrs: + metadata["file_attributes"] = file_attrs + logger.info(f"Added {len(file_attrs)} file attributes to {relative_path}") doc = Document( text=content, - meta_data={ - "file_path": relative_path, - "type": ext[1:], - "is_code": False, - "is_implementation": False, - "title": relative_path, - "token_count": token_count, - }, + meta_data=metadata, ) documents.append(doc) except Exception as e: @@ -608,13 +695,14 @@ def get_bitbucket_file_content(repo_url: str, file_path: str, access_token: str raise ValueError(f"Failed to get file content: {str(e)}") -def get_file_content(repo_url: str, file_path: str, type: str = "github", access_token: str = None) -> str: +def get_file_content(repo_url: str, file_path: str, type: str = "github", access_token: str = None): """ - Retrieves the content of a file from a Git repository (GitHub or GitLab). + Retrieves the content of a file from a Git repository (GitHub, GitLab, Bitbucket, or Azure DevOps). Args: repo_url (str): The URL of the repository file_path (str): The path to the file within the repository + type (str): The type of repository (github, gitlab, bitbucket, azure) access_token (str, optional): Access token for private repositories Returns: @@ -623,14 +711,58 @@ def get_file_content(repo_url: str, file_path: str, type: str = "github", access Raises: ValueError: If the file cannot be fetched or if the URL is not valid """ - if type == "github": - return get_github_file_content(repo_url, file_path, access_token) - elif type == "gitlab": - return get_gitlab_file_content(repo_url, file_path, access_token) - elif type == "bitbucket": - return get_bitbucket_file_content(repo_url, file_path, access_token) - else: - raise ValueError("Unsupported repository URL. 
Only GitHub and GitLab are supported.") + try: + if type == "github": + return get_github_file_content(repo_url, file_path, access_token) + elif type == "gitlab": + return get_gitlab_file_content(repo_url, file_path, access_token) + elif type == "bitbucket": + return get_bitbucket_file_content(repo_url, file_path, access_token) + elif type == "azure": + from api.azuredevops_client import get_azuredevops_file_content + # For Azure DevOps, we need to decode the URL since it might contain encoded spaces + decoded_repo_url = repo_url + logger.info(f"Original Azure DevOps URL: {repo_url}") + + try: + # If the URL is already decoded, this won't change it + # If it's encoded (like with %20 for spaces), this will decode it + from urllib.parse import unquote + + # Check if the URL is double-encoded (contains %25 which is the encoded form of %) + if "%25" in repo_url: + # First decode to convert %25 to % + temp_url = unquote(repo_url) + logger.info(f"First decode step: {temp_url}") + + # Then decode again to convert % encoded characters + decoded_repo_url = unquote(temp_url) + logger.info(f"Double-decoded Azure DevOps URL: {decoded_repo_url}") + else: + decoded_repo_url = unquote(repo_url) + logger.info(f"Decoded Azure DevOps URL: {decoded_repo_url}") + + # Ensure the URL contains dev.azure.com + if "dev.azure.com" not in decoded_repo_url: + logger.error(f"URL does not appear to be an Azure DevOps URL: {decoded_repo_url}") + raise ValueError(f"Invalid Azure DevOps URL: {decoded_repo_url}") + + # Verify the URL format is correct for Azure DevOps + if "_git" not in decoded_repo_url: + logger.error(f"Azure DevOps URL missing '_git' segment: {decoded_repo_url}") + raise ValueError(f"Invalid Azure DevOps URL format, missing '_git' segment: {decoded_repo_url}") + except Exception as e: + logger.error(f"Failed to process Azure DevOps URL: {str(e)}") + raise ValueError(f"Failed to process Azure DevOps URL: {str(e)}") + + logger.info(f"Processing Azure DevOps repository with URL: {decoded_repo_url}") + logger.info(f"Fetching file: {file_path} from Azure DevOps repository") + return get_azuredevops_file_content(decoded_repo_url, file_path, access_token) + else: + raise ValueError(f"Unsupported repository type: {type}") + except Exception as e: + raise ValueError(f"Error getting file content: {str(e)}") + class DatabaseManager: """ @@ -703,6 +835,14 @@ def _create_repo(self, repo_url_or_path: str, type: str = "github", access_token elif type == "bitbucket": # Bitbucket URL format: https://bitbucket.org/owner/repo repo_name = repo_url_or_path.split("/")[-1].replace(".git", "") + elif type == "azure": + # Azure DevOps URL format: https://dev.azure.com/organization/project/_git/repo + # Find the part after _git/ in the URL + if "_git/" in repo_url_or_path: + repo_name = repo_url_or_path.split("_git/")[-1].replace(".git", "") + else: + # Fallback to the last part of the URL + repo_name = repo_url_or_path.split("/")[-1].replace(".git", "") else: # Generic handling for other Git URLs repo_name = repo_url_or_path.split("/")[-1].replace(".git", "") @@ -712,7 +852,28 @@ def _create_repo(self, repo_url_or_path: str, type: str = "github", access_token # Check if the repository directory already exists and is not empty if not (os.path.exists(save_repo_dir) and os.listdir(save_repo_dir)): # Only download if the repository doesn't exist or is empty - download_repo(repo_url_or_path, save_repo_dir, type, access_token) + try: + # Use the download_repo function to get a local path + repo_path = download_repo(repo_url_or_path, 
type, access_token)
+
+                    # If the repo_path is different from save_repo_dir, copy the contents
+                    if repo_path != save_repo_dir and os.path.exists(repo_path):
+                        import shutil
+                        # Create save_repo_dir if it doesn't exist
+                        os.makedirs(save_repo_dir, exist_ok=True)
+
+                        # Copy contents from repo_path to save_repo_dir
+                        for item in os.listdir(repo_path):
+                            src = os.path.join(repo_path, item)
+                            dst = os.path.join(save_repo_dir, item)
+                            if os.path.isdir(src):
+                                shutil.copytree(src, dst, dirs_exist_ok=True)
+                            else:
+                                shutil.copy2(src, dst)
+                        logger.info(f"Copied repository from {repo_path} to {save_repo_dir}")
+                except Exception as e:
+                    logger.error(f"Error downloading repository: {str(e)}")
+                    raise ValueError(f"Failed to download repository: {str(e)}")
             else:
                 logger.info(f"Repository already exists at {save_repo_dir}. Using existing repository.")
         else:  # local path
diff --git a/api/rag.py b/api/rag.py
index e0603a5b..58ce7ba0 100644
--- a/api/rag.py
+++ b/api/rag.py
@@ -420,13 +420,19 @@ def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_
         try:
             retreive_embedder = self.query_embedder if self.local_ollama else self.embedder
+
+            # Extract top_k from config to pass it as a direct parameter
+            retriever_config = configs["retriever"]
+            top_k = retriever_config.get("top_k", 20)  # Default to 20 if not specified
+
+            # Initialize the retriever with the correct parameters
             self.retriever = FAISSRetriever(
-                **configs["retriever"],
                 embedder=retreive_embedder,
                 documents=self.transformed_docs,
                 document_map_func=lambda doc: doc.vector,
+                top_k=top_k  # Pass top_k as a direct parameter
             )
-            logger.info("FAISS retriever created successfully")
+            logger.info(f"FAISS retriever created successfully with top_k={top_k}")
         except Exception as e:
             logger.error(f"Error creating FAISS retriever: {str(e)}")
             # Try to provide more specific error information
@@ -462,13 +468,34 @@ def call(self, query: str, language: str = "en") -> Tuple[List]:
             Tuple of (RAGAnswer, retrieved_documents)
         """
         try:
+            # Add debug logging
+            logger.info(f"Calling retriever with query: {query[:50]}...")
+            logger.info(f"Retriever type: {type(self.retriever).__name__}")
+
+            # Call the retriever
             retrieved_documents = self.retriever(query)
-
-            # Fill in the documents
-            retrieved_documents[0].documents = [
-                self.transformed_docs[doc_index]
-                for doc_index in retrieved_documents[0].doc_indices
-            ]
+            logger.info(f"Retrieved documents type: {type(retrieved_documents)}")
+
+            if isinstance(retrieved_documents, list) and len(retrieved_documents) > 0:
+                logger.info(f"First result type: {type(retrieved_documents[0]).__name__}")
+                logger.info(f"Doc indices available: {hasattr(retrieved_documents[0], 'doc_indices')}")
+
+                # Fill in the documents
+                if hasattr(retrieved_documents[0], 'doc_indices'):
+                    retrieved_documents[0].documents = [
+                        self.transformed_docs[doc_index]
+                        for doc_index in retrieved_documents[0].doc_indices
+                    ]
+                else:
+                    logger.error("Retrieved documents don't have doc_indices attribute")
+                    # Try to handle this case gracefully
+                    if hasattr(retrieved_documents[0], 'documents') and not retrieved_documents[0].documents:
+                        # If documents is empty, try to populate it with the top documents
+                        top_k = getattr(self.retriever, 'top_k', 20)  # Default to 20 if not specified
+                        retrieved_documents[0].documents = self.transformed_docs[:top_k]
+                        logger.info(f"Populated documents with top {top_k} documents as fallback")
+            else:
+                logger.error(f"Unexpected retriever result format: {retrieved_documents}")

             return retrieved_documents

diff --git 
a/api/requirements.txt b/api/requirements.txt index 18f60c50..16466ed3 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -13,4 +13,5 @@ python-dotenv>=1.0.0 openai>=1.76.2 ollama>=0.4.8 aiohttp>=3.8.4 -websockets>=11.0.3 \ No newline at end of file +websockets>=11.0.3 +xattr>=1.1.4 \ No newline at end of file diff --git a/api/websocket_wiki.py b/api/websocket_wiki.py index 11278b5e..2ab59644 100644 --- a/api/websocket_wiki.py +++ b/api/websocket_wiki.py @@ -3,7 +3,7 @@ from typing import List, Optional, Dict, Any from urllib.parse import unquote -import google.generativeai as genai +# Import model clients from adalflow.components.model_client.ollama_client import OllamaClient from adalflow.core.types import ModelType from fastapi import WebSocket, WebSocketDisconnect, HTTPException @@ -13,8 +13,16 @@ from api.data_pipeline import count_tokens, get_file_content from api.openai_client import OpenAIClient from api.openrouter_client import OpenRouterClient +from api.azure_openai_client import AzureOpenAIClient from api.rag import RAG +# Optional import for Google Generative AI +try: + import google.generativeai as genai + GOOGLE_AI_AVAILABLE = True +except ImportError: + GOOGLE_AI_AVAILABLE = False + # Configure logging logging.basicConfig( level=logging.INFO, @@ -24,9 +32,14 @@ # Get API keys from environment variables google_api_key = os.environ.get('GOOGLE_API_KEY') +azure_openai_api_key = os.environ.get('AZURE_OPENAI_API_KEY') +azure_openai_endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT') or os.environ.get('AZURE_OPENAI_API_BASE') + +# Check if Azure OpenAI is configured +AZURE_OPENAI_AVAILABLE = bool(azure_openai_api_key and azure_openai_endpoint) -# Configure Google Generative AI -if google_api_key: +# Configure Google Generative AI if available +if GOOGLE_AI_AVAILABLE and google_api_key: genai.configure(api_key=google_api_key) else: logger.warning("GOOGLE_API_KEY not found in environment variables") @@ -47,7 +60,7 @@ class ChatCompletionRequest(BaseModel): type: Optional[str] = Field("github", description="Type of repository (e.g., 'github', 'gitlab', 'bitbucket')") # model parameters - provider: str = Field("google", description="Model provider (google, openai, openrouter, ollama)") + provider: str = Field("azure", description="Model provider (azure, openai, openrouter, ollama, google)") model: Optional[str] = Field(None, description="Model name for the specified provider") language: Optional[str] = Field("en", description="Language for content generation (e.g., 'en', 'ja', 'zh', 'es', 'kr', 'vi')") @@ -81,7 +94,13 @@ async def handle_websocket_chat(websocket: WebSocket): # Create a new RAG instance for this request try: - request_rag = RAG(provider=request.provider, model=request.model) + # Set a default provider if empty + provider = request.provider + if not provider or provider.strip() == "": + provider = "google" # Default to google if provider is empty + logger.info(f"Empty provider detected, defaulting to: {provider}") + + request_rag = RAG(provider=provider, model=request.model) # Extract custom file filter parameters if provided excluded_dirs = None @@ -201,9 +220,22 @@ async def handle_websocket_chat(websocket: WebSocket): # Try to perform RAG retrieval try: # This will use the actual RAG implementation + logger.info("About to call request_rag with query") retrieved_documents = request_rag(rag_query, language=request.language) - - if retrieved_documents and retrieved_documents[0].documents: + logger.info(f"RAG call successful, result type: 
{type(retrieved_documents)}") + + # Debug the retrieved documents structure + if isinstance(retrieved_documents, tuple): + logger.info(f"Retrieved documents is a tuple of length {len(retrieved_documents)}") + for i, item in enumerate(retrieved_documents): + logger.info(f"Item {i} type: {type(item).__name__}") + elif isinstance(retrieved_documents, list): + logger.info(f"Retrieved documents is a list of length {len(retrieved_documents)}") + for i, item in enumerate(retrieved_documents): + logger.info(f"Item {i} type: {type(item).__name__}") + + # Check if we have documents + if retrieved_documents and hasattr(retrieved_documents[0], 'documents'): # Format context for the prompt in a more structured way documents = retrieved_documents[0].documents logger.info(f"Retrieved {len(documents)} documents") @@ -266,25 +298,25 @@ async def handle_websocket_chat(websocket: WebSocket): if is_first_iteration: system_prompt = f""" -You are an expert code analyst examining the {repo_type} repository: {repo_url} ({repo_name}). -You are conducting a multi-turn Deep Research process to thoroughly investigate the specific topic in the user's query. -Your goal is to provide detailed, focused information EXCLUSIVELY about this topic. -IMPORTANT:You MUST respond in {language_name} language. +You are an expert RFP (Request for Proposal) analyst specializing in insurance company documentation. You're examining the repository: {repo_url} ({repo_name}). +You are conducting a multi-turn Deep Research process to thoroughly investigate the specific RFP topic in the user's query. +Your goal is to provide detailed, focused information EXCLUSIVELY about this insurance RFP topic to help onboard new RFP experts and provide clear documentation for AI RFP answerers. +IMPORTANT: You MUST respond in {language_name} language. - This is the first iteration of a multi-turn research process focused EXCLUSIVELY on the user's query - Start your response with "## Research Plan" -- Outline your approach to investigating this specific topic -- If the topic is about a specific file or feature (like "Dockerfile"), focus ONLY on that file or feature -- Clearly state the specific topic you're researching to maintain focus throughout all iterations -- Identify the key aspects you'll need to research -- Provide initial findings based on the information available +- Outline your approach to investigating this specific insurance RFP topic +- If the topic is about a specific insurance product or line of business (like "Vision PPO"), focus ONLY on that product or line +- Clearly state the specific RFP topic you're researching to maintain focus throughout all iterations +- Identify the key aspects you'll need to research (e.g., coverage details, network information, pricing structures, etc.) 
+- Provide initial findings based on the information available in the RFP documentation - End with "## Next Steps" indicating what you'll investigate in the next iteration - Do NOT provide a final conclusion yet - this is just the beginning of the research -- Do NOT include general repository information unless directly relevant to the query -- Focus EXCLUSIVELY on the specific topic being researched - do not drift to related topics -- Your research MUST directly address the original question +- Do NOT include general repository information unless directly relevant to the RFP query +- Focus EXCLUSIVELY on the specific insurance RFP topic being researched - do not drift to related topics +- Your research MUST directly address the original question about insurance RFP documentation - NEVER respond with just "Continue the research" as an answer - always provide substantive research findings - Remember that this topic will be maintained across all research iterations @@ -292,14 +324,15 @@ async def handle_websocket_chat(websocket: WebSocket): """ elif is_final_iteration: system_prompt = f""" -You are an expert code analyst examining the {repo_type} repository: {repo_url} ({repo_name}). -You are in the final iteration of a Deep Research process focused EXCLUSIVELY on the latest user query. -Your goal is to synthesize all previous findings and provide a comprehensive conclusion that directly addresses this specific topic and ONLY this topic. -IMPORTANT:You MUST respond in {language_name} language. +You are an expert RFP (Request for Proposal) analyst specializing in insurance company documentation. You're examining the repository: {repo_url} ({repo_name}). +You are in the final iteration of a Deep Research process focused EXCLUSIVELY on the latest user query about insurance RFP documentation. +Your goal is to synthesize all previous findings and provide a comprehensive conclusion that directly addresses this specific insurance RFP topic and ONLY this topic, helping onboard new RFP experts and providing clear documentation for AI RFP answerers. +IMPORTANT: You MUST respond in {language_name} language. 
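The handler builds three `system_prompt` variants inline: a first-iteration research plan, intermediate research updates, and a final conclusion, switched on the same `is_first_iteration` and `is_final_iteration` flags visible in these hunks. A minimal sketch of that dispatch follows; the prompt constants and the iteration cap here are assumed placeholder names, not code from this patch:

```python
# Hypothetical sketch of the iteration-based prompt dispatch around these
# hunks. The constants and the max_iterations value are assumptions; the
# real handler builds the prompts inline as f-strings.
FIRST_ITERATION_PROMPT = "..."   # the "## Research Plan" variant
UPDATE_ITERATION_PROMPT = "..."  # the "## Research Update {n}" variant
FINAL_ITERATION_PROMPT = "..."   # the "## Final Conclusion" variant

def select_system_prompt(research_iteration: int, max_iterations: int = 4) -> str:
    is_first_iteration = research_iteration == 1
    is_final_iteration = research_iteration >= max_iterations
    if is_first_iteration:
        return FIRST_ITERATION_PROMPT
    if is_final_iteration:
        return FINAL_ITERATION_PROMPT
    return UPDATE_ITERATION_PROMPT
```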
@@ -307,67 +340,71 @@ async def handle_websocket_chat(websocket: WebSocket): - CAREFULLY review the entire conversation history to understand all previous findings - Synthesize ALL findings from previous iterations into a comprehensive conclusion - Start with "## Final Conclusion" -- Your conclusion MUST directly address the original question -- Stay STRICTLY focused on the specific topic - do not drift to related topics -- Include specific code references and implementation details related to the topic -- Highlight the most important discoveries and insights about this specific functionality +- Your conclusion MUST directly address the original question about insurance RFP documentation +- Stay STRICTLY focused on the specific insurance RFP topic - do not drift to related topics +- Include specific RFP document references and important details related to the topic +- Highlight the most important discoveries and insights about this specific insurance product or line of business - Provide a complete and definitive answer to the original question -- Do NOT include general repository information unless directly relevant to the query -- Focus exclusively on the specific topic being researched +- Do NOT include general repository information unless directly relevant to the RFP query +- Focus exclusively on the specific insurance RFP topic being researched - NEVER respond with "Continue the research" as an answer - always provide a complete conclusion -- If the topic is about a specific file or feature (like "Dockerfile"), focus ONLY on that file or feature +- If the topic is about a specific insurance product or line of business (like "Vision PPO"), focus ONLY on that product or line - Ensure your conclusion builds on and references key findings from previous iterations +- Include any relevant compliance or regulatory information if applicable """ else: system_prompt = f""" -You are an expert code analyst examining the {repo_type} repository: {repo_url} ({repo_name}). -You are currently in iteration {research_iteration} of a Deep Research process focused EXCLUSIVELY on the latest user query. -Your goal is to build upon previous research iterations and go deeper into this specific topic without deviating from it. -IMPORTANT:You MUST respond in {language_name} language. +You are an expert RFP (Request for Proposal) analyst specializing in insurance company documentation. You're examining the repository: {repo_url} ({repo_name}). +You are currently in iteration {research_iteration} of a Deep Research process focused EXCLUSIVELY on the latest user query about insurance RFP documentation. +Your goal is to build upon previous research iterations and go deeper into this specific insurance RFP topic without deviating from it, helping onboard new RFP experts and providing clear documentation for AI RFP answerers. +IMPORTANT: You MUST respond in {language_name} language. 
- CAREFULLY review the conversation history to understand what has been researched so far - Your response MUST build on previous research iterations - do not repeat information already covered -- Identify gaps or areas that need further exploration related to this specific topic +- Identify gaps or areas that need further exploration related to this specific insurance RFP topic - Focus on one specific aspect that needs deeper investigation in this iteration - Start your response with "## Research Update {research_iteration}" - Clearly explain what you're investigating in this iteration - Provide new insights that weren't covered in previous iterations - If this is iteration 3, prepare for a final conclusion in the next iteration -- Do NOT include general repository information unless directly relevant to the query -- Focus EXCLUSIVELY on the specific topic being researched - do not drift to related topics -- If the topic is about a specific file or feature (like "Dockerfile"), focus ONLY on that file or feature +- Do NOT include general repository information unless directly relevant to the RFP query +- Focus EXCLUSIVELY on the specific insurance RFP topic being researched - do not drift to related topics +- If the topic is about a specific insurance product or line of business (like "Vision PPO"), focus ONLY on that product or line - NEVER respond with just "Continue the research" as an answer - always provide substantive research findings -- Your research MUST directly address the original question +- Your research MUST directly address the original question about insurance RFP documentation - Maintain continuity with previous research iterations - this is a continuous investigation +- Consider any relevant compliance, regulatory, or competitive information if applicable """ else: system_prompt = f""" -You are an expert code analyst examining the {repo_type} repository: {repo_url} ({repo_name}). -You provide direct, concise, and accurate information about code repositories. +You are an expert RFP (Request for Proposal) analyst specializing in insurance company documentation. You're examining the repository: {repo_url} ({repo_name}). +You provide direct, concise, and accurate information about insurance RFP documentation to help onboard new RFP experts and provide clear documentation for AI RFP answerers. You NEVER start responses with markdown headers or code fences. -IMPORTANT:You MUST respond in {language_name} language. +IMPORTANT: You MUST respond in {language_name} language. - Answer the user's question directly without ANY preamble or filler phrases -- DO NOT include any rationale, explanation, or extra comments. +- DO NOT include any rationale, explanation, or extra comments - DO NOT start with preambles like "Okay, here's a breakdown" or "Here's an explanation" - DO NOT start with markdown headers like "## Analysis of..." or any file path references - DO NOT start with ```markdown code fences @@ -377,25 +414,27 @@ async def handle_websocket_chat(websocket: WebSocket): ```markdown -## Analysis of `adalflow/adalflow/datasets/gsm8k.py` +## Analysis of Vision PPO Documentation -This file contains... +This document contains... 
``` -- Format your response with proper markdown including headings, lists, and code blocks WITHIN your answer -- For code analysis, organize your response with clear sections +- Format your response with proper markdown including headings, lists, and tables WITHIN your answer +- For RFP analysis, organize your response with clear sections by product, coverage, or question type - Think step by step and structure your answer logically - Start with the most relevant information that directly addresses the user's query -- Be precise and technical when discussing code +- Be precise and use appropriate insurance industry terminology +- Include relevant compliance or regulatory information when applicable - Your response language should be in the same language as the user's query """ # Fetch file content if provided @@ -479,6 +518,68 @@ async def handle_websocket_chat(websocket: WebSocket): model_kwargs=model_kwargs, model_type=ModelType.LLM ) + elif request.provider == "azure": + logger.info(f"Using Azure OpenAI protocol with model: {request.model}") + + # Check if Azure OpenAI credentials are set + if not AZURE_OPENAI_AVAILABLE: + logger.warning("Azure OpenAI credentials not found in environment variables, but continuing with request") + # We'll handle this below by falling back to other providers + + # Initialize Azure OpenAI client + model = AzureOpenAIClient() + + # Format the prompt as messages for Azure OpenAI + # First create the system message with context + system_content = system_prompt + + # Create the user message with the query + user_content = query + + # Format messages for Azure OpenAI + messages = [ + {"role": "system", "content": system_content}, + ] + + # Add conversation history if available + if conversation_history: + messages.append({"role": "user", "content": f"Previous conversation: {conversation_history}"}) + + # Add context if available + if context_text.strip(): + messages.append({"role": "user", "content": f"Context: {context_text}"}) + + # Add file content if available + if request.filePath and file_content: + messages.append({"role": "user", "content": f"File content ({request.filePath}): {file_content}"}) + + # Add the actual query + messages.append({"role": "user", "content": user_content}) + + logger.info(f"Formatted {len(messages)} messages for Azure OpenAI") + + # Set up model kwargs + model_kwargs = { + "model": request.model or "gpt-4", # Default to GPT-4 if not specified + "stream": True, + "temperature": model_config.get("temperature", 0.7), + "top_p": model_config.get("top_p", 0.8) + } + + # For Azure OpenAI, we need to ensure the api_kwargs include both 'messages' and 'model' + # The convert_inputs_to_api_kwargs method may not be handling this correctly + api_kwargs = { + "messages": messages, + "model": request.model or "gpt-4", # Ensure model is included + "stream": True, + "temperature": model_config.get("temperature", 0.7), + "top_p": model_config.get("top_p", 0.8) + } + + # Log the API kwargs for debugging + logger.info(f"Azure OpenAI API kwargs: {api_kwargs.keys()}") + + # No need to use convert_inputs_to_api_kwargs as we're manually constructing the kwargs elif request.provider == "openai": logger.info(f"Using Openai protocol with model: {request.model}") @@ -502,19 +603,107 @@ async def handle_websocket_chat(websocket: WebSocket): model_type=ModelType.LLM ) else: - # Initialize Google Generative AI model - model = genai.GenerativeModel( - model_name=model_config["model"], - generation_config={ - "temperature": model_config["temperature"], - 
"top_p": model_config["top_p"], - "top_k": model_config["top_k"] + # Fall back to Google Generative AI if available + if GOOGLE_AI_AVAILABLE and google_api_key: + # Initialize Google Generative AI model + logger.info("Using Google Generative AI for model generation") + # Create safe generation config with defaults + generation_config = { + "temperature": 0.7, + "top_p": 0.8, + "top_k": 40 } - ) + + # Update with available parameters from model_config + if "temperature" in model_config: + generation_config["temperature"] = model_config["temperature"] + if "top_p" in model_config: + generation_config["top_p"] = model_config["top_p"] + + # Initialize the model with the safe configuration + model = genai.GenerativeModel( + model_name=model_config["model"], + generation_config=generation_config + ) + else: + # Fall back to OpenAI if neither Azure nor Google is available + logger.info("Falling back to OpenAI for model generation") + model = OpenAIClient() + model_kwargs = { + "model": request.model or "gpt-3.5-turbo", + "stream": True, + "temperature": model_config.get("temperature", 0.7), + "top_p": model_config.get("top_p", 0.8) + } + + api_kwargs = model.convert_inputs_to_api_kwargs( + input=prompt, + model_kwargs=model_kwargs, + model_type=ModelType.LLM + ) # Process the response based on the provider try: - if request.provider == "ollama": + if request.provider == "azure": + # Get the response and handle it properly using the previously created api_kwargs + logger.info("Making Azure OpenAI API call") + response = await model.acall(api_kwargs=api_kwargs, model_type=ModelType.LLM) + + # The response is now the raw AsyncStream object from the OpenAI library + logger.info("Processing Azure OpenAI streaming response") + + try: + # Iterate over the stream chunks + async for chunk in response: + # Log the chunk type + logger.debug(f"Received chunk type: {type(chunk).__name__}") + + # Debug the chunk structure + chunk_dict = {attr: getattr(chunk, attr) for attr in dir(chunk) if not attr.startswith('_') and not callable(getattr(chunk, attr))} + logger.debug(f"Chunk attributes: {list(chunk_dict.keys())}") + + # Skip chunks with no delta content + if not hasattr(chunk, 'choices') or not chunk.choices: + logger.debug("Skipping chunk with no choices") + continue + + # Log choices structure + logger.debug(f"Choices length: {len(chunk.choices)}") + + # Process each choice in the chunk + for i, choice in enumerate(chunk.choices): + choice_dict = {attr: getattr(choice, attr) for attr in dir(choice) if not attr.startswith('_') and not callable(getattr(choice, attr))} + logger.debug(f"Choice {i} attributes: {list(choice_dict.keys())}") + + # Extract content from delta if available + if hasattr(choice, 'delta'): + delta_dict = {attr: getattr(choice.delta, attr) for attr in dir(choice.delta) if not attr.startswith('_') and not callable(getattr(choice.delta, attr))} + logger.debug(f"Delta attributes: {list(delta_dict.keys())}") + + # Get content if available + if hasattr(choice.delta, 'content') and choice.delta.content is not None: + content = choice.delta.content + logger.debug(f"Sending content: {content[:20]}..." 
if len(content) > 20 else f"Sending content: {content}") + await websocket.send_text(content) + + logger.info("Azure OpenAI streaming response completed successfully") + except Exception as e: + logger.error(f"Error processing Azure OpenAI streaming response: {str(e)}") + + # Try to get the response directly if streaming failed + try: + # If response is a completed response rather than a stream + if hasattr(response, 'choices') and len(response.choices) > 0: + if hasattr(response.choices[0], 'message') and hasattr(response.choices[0].message, 'content'): + content = response.choices[0].message.content + if content: + await websocket.send_text(content) + except Exception as recovery_error: + logger.error(f"Failed to recover response content: {str(recovery_error)}") + + # Explicitly close the WebSocket connection after the response is complete + await websocket.close() + elif request.provider == "ollama": # Get the response and handle it properly using the previously created api_kwargs response = await model.acall(api_kwargs=api_kwargs, model_type=ModelType.LLM) # Handle streaming response from Ollama @@ -603,75 +792,150 @@ async def handle_websocket_chat(websocket: WebSocket): model_kwargs=model_kwargs, model_type=ModelType.LLM ) - + # Get the response using the simplified prompt fallback_response = await model.acall(api_kwargs=fallback_api_kwargs, model_type=ModelType.LLM) - - # Handle streaming fallback_response from Ollama + + # Handle streaming fallback_response async for chunk in fallback_response: text = getattr(chunk, 'response', None) or getattr(chunk, 'text', None) or str(chunk) if text and not text.startswith('model=') and not text.startswith('created_at='): text = text.replace('', '').replace('', '') await websocket.send_text(text) - elif request.provider == "openrouter": + elif request.provider == "azure" and AZURE_OPENAI_AVAILABLE: + # Initialize Azure OpenAI client for fallback + logger.info("Making fallback Azure OpenAI API call") + fallback_model = AzureOpenAIClient() + + # Format the simplified prompt as messages for Azure OpenAI + fallback_messages = [ + {"role": "system", "content": system_prompt}, + ] + + # Add conversation history if available + if conversation_history: + fallback_messages.append({"role": "user", "content": f"Previous conversation: {conversation_history}"}) + + # Add file content if available + if request.filePath and file_content: + fallback_messages.append({"role": "user", "content": f"File content ({request.filePath}): {file_content}"}) + + # Add the note about answering without retrieval augmentation + fallback_messages.append({"role": "user", "content": "Answering without retrieval augmentation due to input size constraints."}) + + # Add the actual query + fallback_messages.append({"role": "user", "content": query}) + + logger.info(f"Formatted {len(fallback_messages)} fallback messages for Azure OpenAI") + + # For Azure OpenAI, we need to ensure the api_kwargs include both 'messages' and 'model' + # The convert_inputs_to_api_kwargs method may not be handling this correctly + fallback_api_kwargs = { + "messages": fallback_messages, + "model": request.model or "gpt-4", # Ensure model is included + "stream": True, + "temperature": 0.7, + "top_p": 0.8 + } + + # Log the API kwargs for debugging + logger.info(f"Azure OpenAI fallback API kwargs: {fallback_api_kwargs.keys()}") + + # Get the response using the simplified prompt + fallback_response = await fallback_model.acall(api_kwargs=fallback_api_kwargs, model_type=ModelType.LLM) + + # The response is 
now the raw AsyncStream object from the OpenAI library + logger.info("Processing Azure OpenAI fallback streaming response") + try: - # Create new api_kwargs with the simplified prompt - fallback_api_kwargs = model.convert_inputs_to_api_kwargs( - input=simplified_prompt, - model_kwargs=model_kwargs, - model_type=ModelType.LLM - ) - - # Get the response using the simplified prompt - logger.info("Making fallback OpenRouter API call") - fallback_response = await model.acall(api_kwargs=fallback_api_kwargs, model_type=ModelType.LLM) - - # Handle streaming fallback_response from OpenRouter + # Iterate over the stream chunks async for chunk in fallback_response: - await websocket.send_text(chunk) - except Exception as e_fallback: - logger.error(f"Error with OpenRouter API fallback: {str(e_fallback)}") - error_msg = f"\nError with OpenRouter API fallback: {str(e_fallback)}\n\nPlease check that you have set the OPENROUTER_API_KEY environment variable with a valid API key." - await websocket.send_text(error_msg) - elif request.provider == "openai": + # Log the chunk type + logger.info(f"Received fallback chunk type: {type(chunk).__name__}") + + # Debug the chunk structure + chunk_dict = {attr: getattr(chunk, attr) for attr in dir(chunk) if not attr.startswith('_') and not callable(getattr(chunk, attr))} + logger.info(f"Fallback chunk attributes: {list(chunk_dict.keys())}") + + # Skip chunks with no delta content + if not hasattr(chunk, 'choices') or not chunk.choices: + logger.info("Skipping fallback chunk with no choices") + continue + + # Log choices structure + logger.info(f"Fallback choices length: {len(chunk.choices)}") + + # Process each choice in the chunk + for i, choice in enumerate(chunk.choices): + choice_dict = {attr: getattr(choice, attr) for attr in dir(choice) if not attr.startswith('_') and not callable(getattr(choice, attr))} + logger.info(f"Fallback choice {i} attributes: {list(choice_dict.keys())}") + + # Extract content from delta if available + if hasattr(choice, 'delta'): + delta_dict = {attr: getattr(choice.delta, attr) for attr in dir(choice.delta) if not attr.startswith('_') and not callable(getattr(choice.delta, attr))} + logger.info(f"Fallback delta attributes: {list(delta_dict.keys())}") + + # Get content if available + if hasattr(choice.delta, 'content') and choice.delta.content is not None: + content = choice.delta.content + logger.info(f"Sending fallback content: {content[:20]}..." 
if len(content) > 20 else f"Sending fallback content: {content}") + await websocket.send_text(content) + + logger.info("Azure OpenAI fallback streaming response completed successfully") + except Exception as e: + logger.error(f"Error processing Azure OpenAI fallback streaming response: {str(e)}") + + # Try to get the response directly if streaming failed + try: + # If response is a completed response rather than a stream + if hasattr(fallback_response, 'choices') and len(fallback_response.choices) > 0: + if hasattr(fallback_response.choices[0], 'message') and hasattr(fallback_response.choices[0].message, 'content'): + content = fallback_response.choices[0].message.content + if content: + await websocket.send_text(content) + except Exception as recovery_error: + logger.error(f"Failed to recover fallback response content: {str(recovery_error)}") + elif GOOGLE_AI_AVAILABLE and google_api_key: + # Initialize Google Generative AI model as fallback + logger.info("Making fallback Google Generative AI call") try: - # Create new api_kwargs with the simplified prompt - fallback_api_kwargs = model.convert_inputs_to_api_kwargs( - input=simplified_prompt, - model_kwargs=model_kwargs, - model_type=ModelType.LLM + # Get model config + model_config = get_model_config(request.provider, request.model) + + # Create safe generation config with defaults + generation_config = { + "temperature": 0.7, + "top_p": 0.8, + "top_k": 40 + } + + # Update with available parameters if they exist + if isinstance(model_config, dict): + if "temperature" in model_config: + generation_config["temperature"] = model_config["temperature"] + if "top_p" in model_config: + generation_config["top_p"] = model_config["top_p"] + + # Initialize the model with the safe configuration + fallback_model = genai.GenerativeModel( + model_name=model_config.get("model", "gemini-pro"), + generation_config=generation_config ) - - # Get the response using the simplified prompt - logger.info("Making fallback Openai API call") - fallback_response = await model.acall(api_kwargs=fallback_api_kwargs, model_type=ModelType.LLM) - - # Handle streaming fallback_response from Openai - async for chunk in fallback_response: - text = chunk if isinstance(chunk, str) else getattr(chunk, 'text', str(chunk)) - await websocket.send_text(text) - except Exception as e_fallback: - logger.error(f"Error with Openai API fallback: {str(e_fallback)}") - error_msg = f"\nError with Openai API fallback: {str(e_fallback)}\n\nPlease check that you have set the OPENAI_API_KEY environment variable with a valid API key." + + # Get streaming response using simplified prompt + fallback_response = fallback_model.generate_content(simplified_prompt, stream=True) + # Stream the fallback response + for chunk in fallback_response: + if hasattr(chunk, 'text'): + await websocket.send_text(chunk.text) + except Exception as e_google: + logger.error(f"Error with Google Generative AI fallback: {str(e_google)}") + error_msg = f"\nAll fallback options failed. 
Please try again with a shorter query or check your API configurations.\nLast error: {str(e_google)}" await websocket.send_text(error_msg) else: - # Initialize Google Generative AI model - model_config = get_model_config(request.provider, request.model) - fallback_model = genai.GenerativeModel( - model_name=model_config["model"], - generation_config={ - "temperature": model_config["model_kwargs"].get("temperature", 0.7), - "top_p": model_config["model_kwargs"].get("top_p", 0.8), - "top_k": model_config["model_kwargs"].get("top_k", 40) - } - ) - - # Get streaming response using simplified prompt - fallback_response = fallback_model.generate_content(simplified_prompt, stream=True) - # Stream the fallback response - for chunk in fallback_response: - if hasattr(chunk, 'text'): - await websocket.send_text(chunk.text) + # No fallback options available + error_msg = "\nNo fallback options available. Please check your API configurations and try again with a shorter query." + await websocket.send_text(error_msg) except Exception as e2: logger.error(f"Error in fallback streaming response: {str(e2)}") await websocket.send_text(f"\nI apologize, but your request is too large for me to process. Please try a shorter query or break it into smaller parts.") diff --git a/next.config.ts b/next.config.ts index 77b5cc65..b0622123 100644 --- a/next.config.ts +++ b/next.config.ts @@ -23,6 +23,11 @@ const nextConfig: NextConfig = { source: '/local_repo/structure', destination: `${TARGET_SERVER_BASE_URL}/local_repo/structure`, }, + { + // Azure DevOps repository path pattern + source: '/:organization/:project/:repository', + destination: `${TARGET_SERVER_BASE_URL}/:organization/:project/:repository`, + }, ]; }, }; diff --git a/src/app/[owner]/[repo]/page.tsx b/src/app/[owner]/[repo]/page.tsx index 05dfd340..70e005fb 100644 --- a/src/app/[owner]/[repo]/page.tsx +++ b/src/app/[owner]/[repo]/page.tsx @@ -545,7 +545,23 @@ Remember: // Determine the wiki structure from repository data const determineWikiStructure = useCallback(async (fileTree: string, readme: string, owner: string, repo: string) => { + console.log('determineWikiStructure called with:', { + fileTreeLength: fileTree?.length || 0, + readmeLength: readme?.length || 0, + owner, + repo, + repoType: repoInfo?.type || 'unknown' + }); + + if (!fileTree) { + console.error('No file tree data provided'); + setError('No file tree data available. Please try again.'); + setIsLoading(false); + return; + } + if (!owner || !repo) { + console.error('Invalid repository information. Owner and repo name are required.'); setError('Invalid repository information. 
Owner and repo name are required.');
       setIsLoading(false);
       return;
@@ -556,22 +572,29 @@ Remember:
       console.log('Wiki structure determination already in progress, skipping duplicate call');
       return;
     }
+
+    // For Azure DevOps repositories, ensure we're using the correct format
+    let repoUrlForRequest = '';
+    if (repoInfo.type === 'azure') {
+      repoUrlForRequest = repoInfo.repoUrl || `https://dev.azure.com/${owner}/_git/${repo}`;
+      console.log('Using Azure DevOps URL for request:', repoUrlForRequest);
+    } else {
+      repoUrlForRequest = getRepoUrl(repoInfo);
+    }

     try {
       setStructureRequestInProgress(true);
       setLoadingMessage(messages.loading?.determiningStructure || 'Determining wiki structure...');

-      // Get repository URL
-      const repoUrl = getRepoUrl(repoInfo);
-
       // Prepare request body
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
       const requestBody: Record<string, any> = {
-        repo_url: repoUrl,
+        repo_url: repoUrlForRequest,
         type: repoInfo.type,
+        provider: 'azure', // Default provider to ensure it's never empty
         messages: [{
           role: 'user',
-content: `Analyze this GitHub repository ${owner}/${repo} and create a wiki structure for it.
+content: `Analyze this ${repoInfo.type === 'azure' ? 'Azure DevOps' : repoInfo.type === 'gitlab' ? 'GitLab' : repoInfo.type === 'bitbucket' ? 'Bitbucket' : 'GitHub'} repository ${owner}/${repo} and create a wiki structure for it.

 1. The complete file tree of the project:
@@ -697,10 +720,12 @@ IMPORTANT:
     let responseText = '';

     try {
+      console.log('Starting WebSocket connection for wiki structure generation');
       // Create WebSocket URL from the server base URL
       const serverBaseUrl = process.env.NEXT_PUBLIC_SERVER_BASE_URL || 'http://localhost:8001';
       const wsBaseUrl = serverBaseUrl.replace(/^http/, 'ws');
       const wsUrl = `${wsBaseUrl}/ws/chat`;
+      console.log('WebSocket URL:', wsUrl);

       // Create a new WebSocket connection
       const ws = new WebSocket(wsUrl);
@@ -711,6 +736,7 @@ IMPORTANT:
         ws.onopen = () => {
           console.log('WebSocket connection established for wiki structure');
           // Send the request as JSON
+          console.log('Sending request body via WebSocket:', JSON.stringify(requestBody).substring(0, 200) + '...');
           ws.send(JSON.stringify(requestBody));
           resolve();
         };
@@ -722,6 +748,7 @@ IMPORTANT:

       // If the connection doesn't open within 5 seconds, fall back to HTTP
       const timeout = setTimeout(() => {
+        console.warn('WebSocket connection timeout after 5 seconds');
         reject(new Error('WebSocket connection timeout'));
       }, 5000);

@@ -730,6 +757,7 @@ IMPORTANT:
         clearTimeout(timeout);
         console.log('WebSocket connection established for wiki structure');
         // Send the request as JSON
+        console.log('Sending request body via WebSocket:', JSON.stringify(requestBody).substring(0, 200) + '...');
         ws.send(JSON.stringify(requestBody));
         resolve();
       };
@@ -1189,7 +1217,48 @@ IMPORTANT:
           throw err;
         }
       }
-      else if (repoInfo.type === 'bitbucket') {
+      else if (repoInfo.type === 'azure') {
+        // Azure DevOps repositories use a simplified approach
+        try {
+          // Check if we have the file tree and README in the URL query parameters
+          const fileTreeParam = searchParams.get('file_tree');
+          const readmeParam = searchParams.get('readme');
+
+          if (fileTreeParam && readmeParam) {
+            // Use the file tree and README from the URL parameters
+            fileTreeData = decodeURIComponent(fileTreeParam);
+            readmeContent = decodeURIComponent(readmeParam);
+            console.log('Using file tree and README from URL parameters');
+          } else {
+            // For Azure DevOps, we need to make a request to the catch-all route
+            // The backend will handle cloning the 
repository and redirecting to a simplified URL + console.log('Fetching Azure DevOps repository structure from backend'); + + // Construct the URL with the repository information + const azureRepoUrl = repoInfo.repoUrl || ''; + + // Extract organization, project, and repository from the URL + // Format: https://dev.azure.com/{organization}/{project}/_git/{repository} + const urlParts = azureRepoUrl.split('/'); + const organization = urlParts[3]; + const project = urlParts[4]; + const repository = urlParts[urlParts.length - 1]; + + // Create the request URL using the catch-all route format + const requestUrl = `/${organization}/${project}/${repository}?type=azure&repo_url=${encodeURIComponent(azureRepoUrl)}`; + + console.log('Making request to catch-all route:', requestUrl); + + // For Azure DevOps, we'll navigate to the catch-all route + // The backend will redirect to a simplified URL with the file tree and README as query parameters + window.location.href = requestUrl; + return; // Stop execution here as we're redirecting + } + } catch (err) { + console.error('Error fetching Azure DevOps repository structure:', err); + throw err; + } + } else if (repoInfo.type === 'bitbucket') { // Bitbucket API approach const repoPath = extractUrlPath(repoInfo.repoUrl ?? '') ?? `${owner}/${repo}`; const encodedRepoPath = encodeURIComponent(repoPath); @@ -1269,7 +1338,18 @@ IMPORTANT: } // Now determine the wiki structure - await determineWikiStructure(fileTreeData, readmeContent, owner, repo); + console.log('Determining wiki structure with data:', { fileTreeData: fileTreeData.substring(0, 100) + '...', readmeContent: readmeContent.substring(0, 100) + '...', owner, repo }); + + // For Azure DevOps repositories, we need to handle the owner differently + // The owner should be just the organization, not the full path + if (repoInfo.type === 'azure') { + // Extract just the organization name for Azure DevOps + const azureOwner = owner.split('/')[0]; + console.log('Using modified owner for Azure DevOps:', azureOwner); + await determineWikiStructure(fileTreeData, readmeContent, azureOwner, repo); + } else { + await determineWikiStructure(fileTreeData, readmeContent, owner, repo); + } } catch (error) { console.error('Error fetching repository structure:', error); @@ -1795,6 +1875,10 @@ IMPORTANT: ) : repoInfo.type === 'gitlab' ? ( + ) : repoInfo.type === 'azure' ? 
( + + + ) : ( )} diff --git a/src/app/page.tsx b/src/app/page.tsx index 17e97132..b051b5fb 100644 --- a/src/app/page.tsx +++ b/src/app/page.tsx @@ -87,7 +87,7 @@ export default function Home() { const [excludedFiles, setExcludedFiles] = useState(''); const [includedDirs, setIncludedDirs] = useState(''); const [includedFiles, setIncludedFiles] = useState(''); - const [selectedPlatform, setSelectedPlatform] = useState<'github' | 'gitlab' | 'bitbucket'>('github'); + const [selectedPlatform, setSelectedPlatform] = useState<'github' | 'gitlab' | 'bitbucket' | 'azure'>('github'); const [accessToken, setAccessToken] = useState(''); const [error, setError] = useState(null); const [isSubmitting, setIsSubmitting] = useState(false); @@ -114,6 +114,7 @@ export default function Home() { // Handle Windows absolute paths (e.g., C:\path\to\folder) const windowsPathRegex = /^[a-zA-Z]:\\(?:[^\\/:*?"<>|\r\n]+\\)*[^\\/:*?"<>|\r\n]*$/; const customGitRegex = /^(?:https?:\/\/)?([^\/]+)\/(.+?)\/([^\/]+)(?:\.git)?\/?$/; + const azureDevOpsRegex = /^(?:https?:\/\/)?dev\.azure\.com\/([^\/]+)\/([^\/]+)\/_git\/([^\/]+)(?:\.git)?\/?$/; if (windowsPathRegex.test(input)) { type = 'local'; @@ -128,6 +129,70 @@ export default function Home() { repo = input.split('/').filter(Boolean).pop() || 'local-repo'; owner = 'local'; } + // Handle Azure DevOps URLs + else if (input.includes('dev.azure.com') || input.includes('azure.com')) { + type = 'azure'; + console.log('Detected Azure DevOps URL:', input); + + // First, ensure the URL is properly decoded (it might be double-encoded) + let decodedInput = input; + try { + // If the URL is already decoded, this won't change it + // If it's encoded (like with %20 for spaces), this will decode it + if (input.includes('%')) { + decodedInput = decodeURIComponent(input); + console.log('Decoded Azure DevOps URL:', decodedInput); + } + } catch (error) { + console.warn('Error decoding URL:', error); + } + + try { + // Parse the URL properly to handle spaces in project names + const url = new URL(decodedInput); + console.log('Parsed URL:', url.toString()); + console.log('URL pathname:', url.pathname); + + const pathParts = url.pathname.split('/'); + console.log('Path parts:', pathParts); + + // Find the organization (first part after domain) + const organization = pathParts[1]; // First part after the initial slash + console.log('Organization:', organization); + + // Check if _git is in the path + if (url.pathname.includes('_git')) { + // Find the repository (part after _git) + const gitIndex = pathParts.indexOf('_git'); + if (gitIndex !== -1 && gitIndex + 1 < pathParts.length) { + repo = pathParts[gitIndex + 1]; + console.log('Repository:', repo); + + // For the owner, we'll use the organization and the encoded project path + // This preserves spaces and special characters in project names + const projectPath = url.pathname.split('/_git/')[0].substring(organization.length + 2); + owner = `${organization}/${projectPath}`; + console.log('Owner:', owner); + } else { + console.error('Could not find repository name after _git'); + return null; + } + } else { + console.error('URL does not contain _git segment:', url.pathname); + // Try to extract organization and project from the URL anyway + if (pathParts.length >= 3) { + owner = `${pathParts[1]}/${pathParts[2]}`; + repo = pathParts[pathParts.length - 1]; + console.log('Fallback - Owner:', owner, 'Repo:', repo); + } else { + return null; + } + } + } catch (error) { + console.error('Could not parse Azure DevOps repository from URL', error); + 
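// Parsing failed, so return null and let the caller treat the input as unsupported
+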
return null; + } + } else if (customGitRegex.test(input)) { type = 'web'; fullPath = extractUrlPath(input)?.replace(/\.git$/, ''); @@ -204,7 +269,38 @@ export default function Home() { params.append('token', accessToken); } // Always include the type parameter - params.append('type', (type == 'local' ? type : selectedPlatform) || 'github'); + // For Azure DevOps URLs, ensure we use 'azure' as the type and handle URL encoding properly + if (type === 'azure') { + params.append('type', 'azure'); + + // For Azure DevOps URLs, we need to ensure the URL is properly encoded + // This is especially important for URLs with spaces in project names + if (repositoryInput.includes('dev.azure.com')) { + console.log('Processing Azure DevOps URL for API request:', repositoryInput); + + // Ensure the URL is properly encoded for the API + // First decode it to handle any double-encoding + let normalizedUrl = repositoryInput; + try { + if (normalizedUrl.includes('%')) { + normalizedUrl = decodeURIComponent(normalizedUrl); + console.log('Decoded Azure DevOps URL:', normalizedUrl); + } + } catch (e) { + console.warn('Error decoding URL:', e); + } + + // Then encode it properly + const encodedUrl = encodeURIComponent(normalizedUrl); + console.log('Encoded Azure DevOps URL for API:', encodedUrl); + params.append('repo_url', encodedUrl); + } else { + params.append('repo_url', encodeURIComponent(repositoryInput)); + } + } else { + params.append('type', (type == 'local' ? type : selectedPlatform) || 'github'); + params.append('repo_url', encodeURIComponent(repositoryInput)); + } // Add local path if it exists if (localPath) { params.append('local_path', encodeURIComponent(localPath)); diff --git a/src/components/ConfigurationModal.tsx b/src/components/ConfigurationModal.tsx index 105ca605..401e860f 100644 --- a/src/components/ConfigurationModal.tsx +++ b/src/components/ConfigurationModal.tsx @@ -29,8 +29,8 @@ interface ConfigurationModalProps { setCustomModel: (value: string) => void; // Platform selection - selectedPlatform: 'github' | 'gitlab' | 'bitbucket'; - setSelectedPlatform: (value: 'github' | 'gitlab' | 'bitbucket') => void; + selectedPlatform: 'github' | 'gitlab' | 'bitbucket' | 'azure'; + setSelectedPlatform: (value: 'github' | 'gitlab' | 'bitbucket' | 'azure') => void; // Access token accessToken: string; @@ -267,6 +267,16 @@ export default function ConfigurationModal({ > Bitbucket + diff --git a/src/messages/en.json b/src/messages/en.json index f0cfd009..fce53774 100644 --- a/src/messages/en.json +++ b/src/messages/en.json @@ -20,7 +20,7 @@ "home": { "welcome": "Welcome to DeepWiki-Open", "welcomeTagline": "AI-powered documentation for your code repositories", - "description": "Generate comprehensive documentation from GitHub, GitLab, or Bitbucket repositories with just a few clicks.", + "description": "Generate comprehensive documentation from GitHub, GitLab, Azure DevOps, or Bitbucket repositories with just a few clicks.", "quickStart": "Quick Start", "enterRepoUrl": "Enter a repository URL in one of these formats:", "advancedVisualization": "Advanced Visualization with Mermaid Diagrams", diff --git a/src/messages/es.json b/src/messages/es.json index 50c0d743..a0741100 100644 --- a/src/messages/es.json +++ b/src/messages/es.json @@ -20,7 +20,7 @@ "home": { "welcome": "Bienvenido a DeepWiki", "welcomeTagline": "Documentación impulsada por IA para repositorios de código", - "description": "Genera documentación completa de repositorios GitHub, GitLab o Bitbucket con solo unos clics.", + "description": 
"Genera documentación completa de repositorios GitHub, GitLab, Azure DevOps, o Bitbucket con solo unos clics.", "quickStart": "Inicio Rápido", "enterRepoUrl": "Ingresa una URL de repositorio en uno de estos formatos:", "advancedVisualization": "Visualización Avanzada con Diagramas Mermaid", @@ -31,7 +31,7 @@ "form": { "repository": "Repositorio", "configureWiki": "Configurar Wiki", - "repoPlaceholder": "propietario/repositorio o URL de GitHub/GitLab/Bitbucket", + "repoPlaceholder": "propietario/repositorio o URL de GitHub/GitLab/Azure DevOps/Bitbucket", "wikiLanguage": "Idioma del Wiki", "modelOptions": "Opciones de Modelo", "modelProvider": "Proveedor de Modelo", diff --git a/src/messages/ja.json b/src/messages/ja.json index 17c20403..fd320660 100644 --- a/src/messages/ja.json +++ b/src/messages/ja.json @@ -20,7 +20,7 @@ "home": { "welcome": "DeepWikiへようこそ", "welcomeTagline": "コードリポジトリのためのAI駆動ドキュメンテーション", - "description": "GitHub、GitLab、またはBitbucketリポジトリから包括的なドキュメントを数クリックで生成します。", + "description": "GitHub、GitLab、Azure DevOps、またはBitbucketリポジトリから包括的なドキュメントを数クリックで生成します。", "quickStart": "クイックスタート", "enterRepoUrl": "以下のいずれかの形式でリポジトリURLを入力してください:", "advancedVisualization": "Mermaidダイアグラムによる高度な可視化", @@ -31,7 +31,7 @@ "form": { "repository": "リポジトリ", "configureWiki": "Wiki設定", - "repoPlaceholder": "所有者/リポジトリまたはGitHub/GitLab/BitbucketのURL", + "repoPlaceholder": "所有者/リポジトリまたはGitHub/GitLab/Azure DevOps/BitbucketのURL", "wikiLanguage": "Wiki言語", "modelOptions": "モデルオプション", "modelProvider": "モデルプロバイダー", diff --git a/src/messages/kr.json b/src/messages/kr.json index cc210b63..22b87067 100644 --- a/src/messages/kr.json +++ b/src/messages/kr.json @@ -20,7 +20,7 @@ "home": { "welcome": "DeepWiki-Open에 오신 것을 환영합니다", "welcomeTagline": "코드 저장소를 위한 AI 기반 문서화", - "description": "GitHub, GitLab 또는 Bitbucket 저장소에서 클릭 한 번으로 종합 문서를 생성하세요.", + "description": "GitHub, GitLab, Azure Devops, 또는 Bitbucket 저장소에서 클릭 한 번으로 종합 문서를 생성하세요.", "quickStart": "빠른 시작", "enterRepoUrl": "다음 형식 중 하나로 저장소 URL을 입력하세요:", "advancedVisualization": "Mermaid 다이어그램을 활용한 고급 시각화", @@ -31,7 +31,7 @@ "form": { "repository": "저장소", "configureWiki": "위키 구성", - "repoPlaceholder": "owner/repo 또는 GitHub/GitLab/Bitbucket URL", + "repoPlaceholder": "owner/repo 또는 GitHub/GitLab/Azure Devops/Bitbucket URL", "wikiLanguage": "위키 언어", "modelOptions": "모델 옵션", "modelProvider": "모델 제공자", diff --git a/src/messages/vi.json b/src/messages/vi.json index d4820b31..0cdbfa20 100644 --- a/src/messages/vi.json +++ b/src/messages/vi.json @@ -20,7 +20,7 @@ "home": { "welcome": "Chào mừng đến với DeepWiki-Open", "welcomeTagline": "Tài liệu hỗ trợ bởi AI cho các repository của bạn", - "description": "Tạo tài liệu từ các repository GitHub, GitLab, hoặc Bitbucket chỉ với vài cú nhấp chuột.", + "description": "Tạo tài liệu từ các repository GitHub, GitLab, Azure Devops, hoặc Bitbucket chỉ với vài cú nhấp chuột.", "quickStart": "Bắt đầu nhanh", "enterRepoUrl": "Nhập URL repository", "advancedVisualization": "Tùy chỉnh sơ đồ trực quan với Mermaid", @@ -31,7 +31,7 @@ "form": { "repository": "Repository", "configureWiki": "Cấu hình Wiki", - "repoPlaceholder": "owner/repo hoặc URL GitHub/GitLab/Bitbucket", + "repoPlaceholder": "owner/repo hoặc URL GitHub/GitLab/Azure Devops/Bitbucket", "wikiLanguage": "Ngôn ngữ Wiki", "modelOptions": "Tùy chọn mô hình", "modelProvider": "Nhà cung cấp mô hình", diff --git a/src/messages/zh.json b/src/messages/zh.json index bb6b2946..f7ee87da 100644 --- a/src/messages/zh.json +++ b/src/messages/zh.json @@ -20,7 +20,7 @@ "home": { "welcome": 
"欢迎使用DeepWiki", "welcomeTagline": "为代码仓库提供AI驱动的文档", - "description": "只需一次点击,即可从GitHub、GitLab或Bitbucket仓库生成全面的文档。", + "description": "只需一次点击,即可从GitHub、GitLab、Azure DevOps或Bitbucket仓库生成全面的文档。", "quickStart": "快速开始", "enterRepoUrl": "请以下列格式之一输入仓库URL:", "advancedVisualization": "使用Mermaid图表进行高级可视化", @@ -31,7 +31,7 @@ "form": { "repository": "仓库", "configureWiki": "配置Wiki", - "repoPlaceholder": "所有者/仓库或GitHub/GitLab/Bitbucket URL", + "repoPlaceholder": "所有者/仓库或GitHub/GitLab/Azure DevOps/Bitbucket URL", "wikiLanguage": "Wiki语言", "modelOptions": "模型选项", "modelProvider": "模型提供商", From d203c9dca86c0be9480f57cc167a302bcd39ffce Mon Sep 17 00:00:00 2001 From: James Barney Date: Mon, 2 Jun 2025 14:07:27 -0400 Subject: [PATCH 2/7] Update generator.json --- api/config/generator.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/config/generator.json b/api/config/generator.json index 5eb8686e..2e8ae2cb 100644 --- a/api/config/generator.json +++ b/api/config/generator.json @@ -1,5 +1,5 @@ { - "default_provider": "azure", + "default_provider": "google", "providers": { "azure": { "default_model": "gpt-4o", @@ -171,4 +171,4 @@ } } } -} \ No newline at end of file +} From 0bf52505377ba24052ae3fb96ca859c8740a5422 Mon Sep 17 00:00:00 2001 From: James Barney Date: Tue, 3 Jun 2025 10:50:56 -0700 Subject: [PATCH 3/7] conflicts --- src/app/[owner]/[repo]/page.tsx | 17 ++--------------- src/components/TokenInput.tsx | 2 +- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/app/[owner]/[repo]/page.tsx b/src/app/[owner]/[repo]/page.tsx index 4e961b08..ab7b4bf2 100644 --- a/src/app/[owner]/[repo]/page.tsx +++ b/src/app/[owner]/[repo]/page.tsx @@ -588,14 +588,6 @@ Remember: setStructureRequestInProgress(true); setLoadingMessage(messages.loading?.determiningStructure || 'Determining wiki structure...'); -<<<<<<< HEAD - // Prepare request body - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const requestBody: Record = { - repo_url: repoUrlForRequest, - type: repoInfo.type, - provider: 'azure', // Default provider to ensure it's never empty -======= // Get repository URL const repoUrl = getRepoUrl(effectiveRepoInfo); @@ -604,7 +596,6 @@ Remember: const requestBody: Record = { repo_url: repoUrl, type: effectiveRepoInfo.type, ->>>>>>> ca14883d9efb9334ecf06018c88115f9e24bf125 messages: [{ role: 'user', content: `Analyze this ${repoInfo.type === 'azure' ? 'Azure DevOps' : repoInfo.type === 'gitlab' ? 'GitLab' : repoInfo.type === 'bitbucket' ? 'Bitbucket' : 'GitHub'} repository ${owner}/${repo} and create a wiki structure for it. @@ -1230,8 +1221,7 @@ IMPORTANT: throw err; } } -<<<<<<< HEAD - else if (repoInfo.type === 'azure') { + else if (effectiveRepoInfo.type === 'azure') { // Azure DevOps repositories use a simplified approach try { // Check if we have the file tree and README in the URL query parameters @@ -1272,10 +1262,7 @@ IMPORTANT: console.error('Error fetching Azure DevOps repository structure:', err); throw err; } - } else if (repoInfo.type === 'bitbucket') { -======= - else if (effectiveRepoInfo.type === 'bitbucket') { ->>>>>>> ca14883d9efb9334ecf06018c88115f9e24bf125 + } else if (effectiveRepoInfo.type === 'bitbucket') { // Bitbucket API approach const repoPath = extractUrlPath(effectiveRepoInfo.repoUrl ?? '') ?? 
`${owner}/${repo}`; const encodedRepoPath = encodeURIComponent(repoPath); diff --git a/src/components/TokenInput.tsx b/src/components/TokenInput.tsx index 14fadcd8..7c617261 100644 --- a/src/components/TokenInput.tsx +++ b/src/components/TokenInput.tsx @@ -4,7 +4,7 @@ import React from 'react'; import { useLanguage } from '@/contexts/LanguageContext'; interface TokenInputProps { - selectedPlatform: 'github' | 'gitlab' | 'bitbucket'; + selectedPlatform: 'github' | 'gitlab' | 'bitbucket' | 'azure'; setSelectedPlatform: (value: 'github' | 'gitlab' | 'bitbucket') => void; accessToken: string; setAccessToken: (value: string) => void; From 0db95c9424ef80649cbbfac42824b84dbba3fbeb Mon Sep 17 00:00:00 2001 From: James Barney Date: Tue, 3 Jun 2025 10:53:03 -0700 Subject: [PATCH 4/7] azure example --- api/config/azure_example.json | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 api/config/azure_example.json diff --git a/api/config/azure_example.json b/api/config/azure_example.json new file mode 100644 index 00000000..e126aa9e --- /dev/null +++ b/api/config/azure_example.json @@ -0,0 +1,13 @@ +{ + "embedder_azure": { + "client_class": "AzureOpenAIClient", + "batch_size": 500, + "model_kwargs": { + "model": "text-embedding-ada-002", + "encoding_format": "float", + "azure_endpoint": "https://your-azure-endpoint.openai.azure.com", + "api_key": "your-azure-api-key", + "api_version": "2023-05-15" + } + } +} From f3cbd0fc65a4673edd19638024365a12db6d3677 Mon Sep 17 00:00:00 2001 From: James Barney Date: Tue, 3 Jun 2025 10:54:03 -0700 Subject: [PATCH 5/7] embedder --- api/config/embedder.json | 8 -------- 1 file changed, 8 deletions(-) diff --git a/api/config/embedder.json b/api/config/embedder.json index 8fb8b7a5..df8ade90 100644 --- a/api/config/embedder.json +++ b/api/config/embedder.json @@ -1,13 +1,5 @@ { "embedder": { - "client_class": "AzureOpenAIClient", - "batch_size": 500, - "model_kwargs": { - "model": "text-embedding-ada-002", - "encoding_format": "float" - } - }, - "embedder_openai": { "client_class": "OpenAIClient", "batch_size": 500, "model_kwargs": { From 0e20f74b12710efa149876d0e888fadf9eaa8fb3 Mon Sep 17 00:00:00 2001 From: James Barney Date: Tue, 3 Jun 2025 11:28:04 -0700 Subject: [PATCH 6/7] reverting some prompts --- api/api.py | 5 ++- api/websocket_wiki.py | 98 ++++++++++++++++++++----------------------- 2 files changed, 49 insertions(+), 54 deletions(-) diff --git a/api/api.py b/api/api.py index f751eb22..da9e4dfb 100644 --- a/api/api.py +++ b/api/api.py @@ -10,6 +10,8 @@ from pydantic import BaseModel, Field import google.generativeai as genai import asyncio +from api.data_pipeline import download_repo, DatabaseManager +from urllib.parse import unquote # Get a logger for this module logger = logging.getLogger(__name__) @@ -586,7 +588,6 @@ async def catch_all_repo_route( repo_type = query_params.get('type', 'github') # Handle double-encoded URLs - from urllib.parse import unquote if '%25' in repo_url: # Double-encoded repo_url = unquote(unquote(repo_url)) else: @@ -609,7 +610,7 @@ async def catch_all_repo_route( logger.warning(f"{repo_type.upper()}_TOKEN not found in environment variables") # Import the necessary modules for repository processing - from api.data_pipeline import download_repo, DatabaseManager + # Process the repository try: diff --git a/api/websocket_wiki.py b/api/websocket_wiki.py index 2ab59644..34599cc0 100644 --- a/api/websocket_wiki.py +++ b/api/websocket_wiki.py @@ -298,25 +298,25 @@ async def handle_websocket_chat(websocket: WebSocket): if 
is_first_iteration: system_prompt = f""" -You are an expert RFP (Request for Proposal) analyst specializing in insurance company documentation. You're examining the repository: {repo_url} ({repo_name}). -You are conducting a multi-turn Deep Research process to thoroughly investigate the specific RFP topic in the user's query. -Your goal is to provide detailed, focused information EXCLUSIVELY about this insurance RFP topic to help onboard new RFP experts and provide clear documentation for AI RFP answerers. -IMPORTANT: You MUST respond in {language_name} language. +You are an expert code analyst examining the {repo_type} repository: {repo_url} ({repo_name}). +You are conducting a multi-turn Deep Research process to thoroughly investigate the specific topic in the user's query. +Your goal is to provide detailed, focused information EXCLUSIVELY about this topic. +IMPORTANT:You MUST respond in {language_name} language. - This is the first iteration of a multi-turn research process focused EXCLUSIVELY on the user's query - Start your response with "## Research Plan" -- Outline your approach to investigating this specific insurance RFP topic -- If the topic is about a specific insurance product or line of business (like "Vision PPO"), focus ONLY on that product or line -- Clearly state the specific RFP topic you're researching to maintain focus throughout all iterations -- Identify the key aspects you'll need to research (e.g., coverage details, network information, pricing structures, etc.) -- Provide initial findings based on the information available in the RFP documentation +- Outline your approach to investigating this specific topic +- If the topic is about a specific file or feature (like "Dockerfile"), focus ONLY on that file or feature +- Clearly state the specific topic you're researching to maintain focus throughout all iterations +- Identify the key aspects you'll need to research +- Provide initial findings based on the information available - End with "## Next Steps" indicating what you'll investigate in the next iteration - Do NOT provide a final conclusion yet - this is just the beginning of the research -- Do NOT include general repository information unless directly relevant to the RFP query -- Focus EXCLUSIVELY on the specific insurance RFP topic being researched - do not drift to related topics -- Your research MUST directly address the original question about insurance RFP documentation +- Do NOT include general repository information unless directly relevant to the query +- Focus EXCLUSIVELY on the specific topic being researched - do not drift to related topics +- Your research MUST directly address the original question - NEVER respond with just "Continue the research" as an answer - always provide substantive research findings - Remember that this topic will be maintained across all research iterations @@ -324,15 +324,14 @@ async def handle_websocket_chat(websocket: WebSocket): """ elif is_final_iteration: system_prompt = f""" -You are an expert RFP (Request for Proposal) analyst specializing in insurance company documentation. You're examining the repository: {repo_url} ({repo_name}). -You are in the final iteration of a Deep Research process focused EXCLUSIVELY on the latest user query about insurance RFP documentation. 
-Your goal is to synthesize all previous findings and provide a comprehensive conclusion that directly addresses this specific insurance RFP topic and ONLY this topic, helping onboard new RFP experts and providing clear documentation for AI RFP answerers.
-IMPORTANT: You MUST respond in {language_name} language.
+You are an expert code analyst examining the {repo_type} repository: {repo_url} ({repo_name}).
+You are in the final iteration of a Deep Research process focused EXCLUSIVELY on the latest user query.
+Your goal is to synthesize all previous findings and provide a comprehensive conclusion that directly addresses this specific topic and ONLY this topic.
+IMPORTANT: You MUST respond in {language_name} language.
@@ -340,71 +339,67 @@ async def handle_websocket_chat(websocket: WebSocket):
 - CAREFULLY review the entire conversation history to understand all previous findings
 - Synthesize ALL findings from previous iterations into a comprehensive conclusion
 - Start with "## Final Conclusion"
-- Your conclusion MUST directly address the original question about insurance RFP documentation
-- Stay STRICTLY focused on the specific insurance RFP topic - do not drift to related topics
-- Include specific RFP document references and important details related to the topic
-- Highlight the most important discoveries and insights about this specific insurance product or line of business
+- Your conclusion MUST directly address the original question
+- Stay STRICTLY focused on the specific topic - do not drift to related topics
+- Include specific code references and implementation details related to the topic
+- Highlight the most important discoveries and insights about this specific functionality
 - Provide a complete and definitive answer to the original question
-- Do NOT include general repository information unless directly relevant to the RFP query
-- Focus exclusively on the specific insurance RFP topic being researched
+- Do NOT include general repository information unless directly relevant to the query
+- Focus exclusively on the specific topic being researched
 - NEVER respond with "Continue the research" as an answer - always provide a complete conclusion
-- If the topic is about a specific insurance product or line of business (like "Vision PPO"), focus ONLY on that product or line
+- If the topic is about a specific file or feature (like "Dockerfile"), focus ONLY on that file or feature
 - Ensure your conclusion builds on and references key findings from previous iterations
-- Include any relevant compliance or regulatory information if applicable
 """
         else:
             system_prompt = f"""
-You are an expert RFP (Request for Proposal) analyst specializing in insurance company documentation. You're examining the repository: {repo_url} ({repo_name}).
-You are currently in iteration {research_iteration} of a Deep Research process focused EXCLUSIVELY on the latest user query about insurance RFP documentation.
+You are an expert code analyst examining the {repo_type} repository: {repo_url} ({repo_name}).
+You are currently in iteration {research_iteration} of a Deep Research process focused EXCLUSIVELY on the latest user query.
+Your goal is to build upon previous research iterations and go deeper into this specific topic without deviating from it.
+IMPORTANT: You MUST respond in {language_name} language.
 
 - CAREFULLY review the conversation history to understand what has been researched so far
 - Your response MUST build on previous research iterations - do not repeat information already covered
-- Identify gaps or areas that need further exploration related to this specific insurance RFP topic
+- Identify gaps or areas that need further exploration related to this specific topic
 - Focus on one specific aspect that needs deeper investigation in this iteration
 - Start your response with "## Research Update {research_iteration}"
 - Clearly explain what you're investigating in this iteration
 - Provide new insights that weren't covered in previous iterations
 - If this is iteration 3, prepare for a final conclusion in the next iteration
-- Do NOT include general repository information unless directly relevant to the RFP query
-- Focus EXCLUSIVELY on the specific insurance RFP topic being researched - do not drift to related topics
-- If the topic is about a specific insurance product or line of business (like "Vision PPO"), focus ONLY on that product or line
+- Do NOT include general repository information unless directly relevant to the query
+- Focus EXCLUSIVELY on the specific topic being researched - do not drift to related topics
+- If the topic is about a specific file or feature (like "Dockerfile"), focus ONLY on that file or feature
 - NEVER respond with just "Continue the research" as an answer - always provide substantive research findings
-- Your research MUST directly address the original question about insurance RFP documentation
+- Your research MUST directly address the original question
 - Maintain continuity with previous research iterations - this is a continuous investigation
-- Consider any relevant compliance, regulatory, or competitive information if applicable
 """
     else:
         system_prompt = f"""
-You are an expert RFP (Request for Proposal) analyst specializing in insurance company documentation. You're examining the repository: {repo_url} ({repo_name}).
-You provide direct, concise, and accurate information about insurance RFP documentation to help onboard new RFP experts and provide clear documentation for AI RFP answerers.
+You are an expert code analyst examining the {repo_type} repository: {repo_url} ({repo_name}).
+You provide direct, concise, and accurate information about code repositories.
 You NEVER start responses with markdown headers or code fences.
 IMPORTANT: You MUST respond in {language_name} language.
 
 - Answer the user's question directly without ANY preamble or filler phrases
 - DO NOT include any rationale, explanation, or extra comments.
 - DO NOT start with preambles like "Okay, here's a breakdown" or "Here's an explanation"
 - DO NOT start with markdown headers like "## Analysis of..." or any file path references
 - DO NOT start with ```markdown code fences
@@ -414,27 +409,26 @@ async def handle_websocket_chat(websocket: WebSocket):
 ```markdown
-## Analysis of Vision PPO Documentation
+## Analysis of `adalflow/adalflow/datasets/gsm8k.py`
 
-This document contains...
+This file contains...
 ```
-- Format your response with proper markdown including headings, lists, and tables WITHIN your answer
-- For RFP analysis, organize your response with clear sections by product, coverage, or question type
+
+- Be precise and technical when discussing code
+- Format your response with proper markdown including headings, lists, and code blocks WITHIN your answer
+- For code analysis, organize your response with clear sections
 - Think step by step and structure your answer logically
 - Start with the most relevant information that directly addresses the user's query
-- Be precise and use appropriate insurance industry terminology
-- Include relevant compliance or regulatory information when applicable
 - Your response language should be in the same language as the user's query
 """
 
         # Fetch file content if provided

From ee99dd714e42b0848dec1f440b78757d05a9f845 Mon Sep 17 00:00:00 2001
From: James Barney
Date: Fri, 6 Jun 2025 15:35:07 -0400
Subject: [PATCH 7/7] top k

---
 api/config.py | 17 +++++++++++++++++
 api/rag.py    | 17 ++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/api/config.py b/api/config.py
index c1caf723..0d1f278c 100644
--- a/api/config.py
+++ b/api/config.py
@@ -143,6 +143,17 @@ def load_generator_config():
 # Load embedder configuration
 def load_embedder_config():
     embedder_config = load_json_config("embedder.json")
+
+    # Debug logging to see what's loaded
+    logger.info(f"Loaded embedder config: {embedder_config}")
+    if "retriever" in embedder_config:
+        logger.info(f"Retriever config found: {embedder_config['retriever']}")
+        if "top_k" in embedder_config["retriever"]:
+            logger.info(f"top_k value found: {embedder_config['retriever']['top_k']}")
+        else:
+            logger.warning("top_k not found in retriever config")
+    else:
+        logger.warning("retriever key not found in embedder config")
 
     # Process client classes
     for key in ["embedder", "embedder_ollama"]:
@@ -246,6 +257,12 @@ def load_repo_config():
     if key in embedder_config:
         configs[key] = embedder_config[key]
 
+# Ensure retriever configuration has a top_k value
+if "retriever" not in configs:
+    configs["retriever"] = {}
+if "top_k" not in configs.get("retriever", {}):
+    configs["retriever"]["top_k"] = 20  # Default value
+
 # Update repository configuration
 if repo_config:
     for key in ["file_filters", "repository"]:
diff --git a/api/rag.py b/api/rag.py
index a9ed3d71..decef4cc 100644
--- a/api/rag.py
+++ b/api/rag.py
@@ -420,8 +420,23 @@ def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_
         try:
             # Use the appropriate embedder for retrieval
             retrieve_embedder = self.query_embedder if self.is_ollama_embedder else self.embedder
+
+            # Debug logging for configs
+            logger.info(f"Available configs keys: {list(configs.keys())}")
+            if "retriever" in configs:
+                logger.info(f"Retriever config: {configs['retriever']}")
+                if "top_k" in configs["retriever"]:
+                    logger.info(f"Found top_k in configs: {configs['retriever']['top_k']}")
+                else:
+                    logger.warning("top_k not found in retriever config")
+            else:
+                logger.warning("retriever key not found in configs")
+
+            # Get top_k from configs or use a default value
+            top_k = configs.get("retriever", {}).get("top_k", 20)
+            logger.info(f"Using top_k value: {top_k}")
+
             self.retriever = FAISSRetriever(
-                **configs["retriever"],
                 embedder=retrieve_embedder,
                 documents=self.transformed_docs,
                 document_map_func=lambda doc: doc.vector,