From 7160fbf7acab9d5a3bfdec89b213b2d7e272e790 Mon Sep 17 00:00:00 2001 From: Baris Yerlikaya Date: Mon, 18 Aug 2025 00:46:58 +0300 Subject: [PATCH 1/8] feat: Complete document service refactoring and logging migration --- .../Controllers/DocumentsController.cs | 34 +- .../Controllers/SearchController.cs | 4 +- src/SmartRAG.API/Program.cs | 9 + .../Extensions/ServiceCollectionExtensions.cs | 1 + .../Interfaces/IDocumentSearchService.cs | 32 + src/SmartRAG/Interfaces/IDocumentService.cs | 37 +- .../Services/DocumentSearchService.cs | 713 +++++++++ src/SmartRAG/Services/DocumentService.cs | 1375 +---------------- .../Services/EnhancedSearchService.cs | 739 --------- 9 files changed, 893 insertions(+), 2051 deletions(-) create mode 100644 src/SmartRAG/Interfaces/IDocumentSearchService.cs create mode 100644 src/SmartRAG/Services/DocumentSearchService.cs delete mode 100644 src/SmartRAG/Services/EnhancedSearchService.cs diff --git a/src/SmartRAG.API/Controllers/DocumentsController.cs b/src/SmartRAG.API/Controllers/DocumentsController.cs index db55203..b9f3f10 100644 --- a/src/SmartRAG.API/Controllers/DocumentsController.cs +++ b/src/SmartRAG.API/Controllers/DocumentsController.cs @@ -13,7 +13,9 @@ namespace SmartRAG.API.Controllers; [Route("api/[controller]")] [Produces("application/json")] [ProducesResponseType(StatusCodes.Status500InternalServerError)] -public class DocumentsController(IDocumentService documentService, IDocumentParserService documentParser) : ControllerBase +public class DocumentsController( + IDocumentService documentService, + IDocumentParserService documentParser) : ControllerBase { /// /// Gets supported file types and content types @@ -58,6 +60,35 @@ public IActionResult GetSupportedTypes() } } + /// + /// Upload multiple documents to the system + /// + [HttpPost("upload-multiple")] + public async Task>> UploadDocuments(List files) + { + if (files == null || files.Count == 0) + return BadRequest("No files provided"); + + try + { + var fileStreams = files.Select(f => f.OpenReadStream()); + var fileNames = files.Select(f => f.FileName); + var contentTypes = files.Select(f => f.ContentType); + + var documents = await documentService.UploadDocumentsAsync( + fileStreams, + fileNames, + contentTypes, + "system"); + + return CreatedAtAction(nameof(GetAllDocuments), documents); + } + catch (Exception ex) + { + return StatusCode(500, $"Internal server error: {ex.Message}"); + } + } + /// /// Get a document by ID /// @@ -83,7 +114,6 @@ public IActionResult GetSupportedTypes() return Ok(documents); } - /// /// Delete a document /// diff --git a/src/SmartRAG.API/Controllers/SearchController.cs b/src/SmartRAG.API/Controllers/SearchController.cs index 83eed1d..d2628d8 100644 --- a/src/SmartRAG.API/Controllers/SearchController.cs +++ b/src/SmartRAG.API/Controllers/SearchController.cs @@ -10,7 +10,7 @@ namespace SmartRAG.API.Controllers; [Route("api/[controller]")] [Produces("application/json")] [ProducesResponseType(StatusCodes.Status500InternalServerError)] -public class SearchController(IDocumentService documentService) : ControllerBase +public class SearchController(IDocumentSearchService documentSearchService) : ControllerBase { /// /// Search documents using RAG (Retrieval-Augmented Generation) @@ -28,7 +28,7 @@ public async Task> Search([FromBody] Contracts.SearchReques try { - var response = await documentService.GenerateRagAnswerAsync(query, maxResults); + var response = await documentSearchService.GenerateRagAnswerAsync(query, maxResults); return Ok(response); } catch (Exception 
ex) diff --git a/src/SmartRAG.API/Program.cs b/src/SmartRAG.API/Program.cs index f00a9e9..ecbe2b1 100644 --- a/src/SmartRAG.API/Program.cs +++ b/src/SmartRAG.API/Program.cs @@ -1,3 +1,4 @@ +using Microsoft.Extensions.Logging; using Scalar.AspNetCore; using SmartRAG.Enums; using SmartRAG.Extensions; @@ -13,6 +14,14 @@ static void RegisterServices(IServiceCollection services, IConfiguration configuration) { + // Configure logging + services.AddLogging(builder => + { + builder.ClearProviders(); + builder.AddConsole(); + builder.AddDebug(); + builder.SetMinimumLevel(LogLevel.Debug); + }); services.AddControllers(); services.AddEndpointsApiExplorer(); diff --git a/src/SmartRAG/Extensions/ServiceCollectionExtensions.cs b/src/SmartRAG/Extensions/ServiceCollectionExtensions.cs index 1a8750a..3fd2900 100644 --- a/src/SmartRAG/Extensions/ServiceCollectionExtensions.cs +++ b/src/SmartRAG/Extensions/ServiceCollectionExtensions.cs @@ -35,6 +35,7 @@ public static IServiceCollection AddSmartRag(this IServiceCollection services, I services.AddSingleton(); services.AddScoped(); services.AddScoped(); + services.AddScoped(); services.AddSingleton(options); diff --git a/src/SmartRAG/Interfaces/IDocumentSearchService.cs b/src/SmartRAG/Interfaces/IDocumentSearchService.cs new file mode 100644 index 0000000..73e2725 --- /dev/null +++ b/src/SmartRAG/Interfaces/IDocumentSearchService.cs @@ -0,0 +1,32 @@ +using System.Collections.Generic; +using System.Threading.Tasks; +using SmartRAG.Entities; +using SmartRAG.Models; + +namespace SmartRAG.Interfaces; + +/// +/// Service interface for AI-powered search and RAG operations +/// +public interface IDocumentSearchService +{ + /// + /// Search documents semantically + /// + Task> SearchDocumentsAsync(string query, int maxResults = 5); + + /// + /// Generate RAG answer + /// + Task GenerateRagAnswerAsync(string query, int maxResults = 5); + + /// + /// Generate embedding with fallback + /// + Task?> GenerateEmbeddingWithFallbackAsync(string text); + + /// + /// Generate batch embeddings + /// + Task>?> GenerateEmbeddingsBatchAsync(List texts); +} diff --git a/src/SmartRAG/Interfaces/IDocumentService.cs b/src/SmartRAG/Interfaces/IDocumentService.cs index 79d8752..cdc1621 100644 --- a/src/SmartRAG/Interfaces/IDocumentService.cs +++ b/src/SmartRAG/Interfaces/IDocumentService.cs @@ -1,19 +1,48 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; using SmartRAG.Entities; -using SmartRAG.Models; namespace SmartRAG.Interfaces; /// -/// Service interface for document operations +/// Service interface for document CRUD operations /// public interface IDocumentService { + /// + /// Upload a single document + /// Task UploadDocumentAsync(Stream fileStream, string fileName, string contentType, string uploadedBy); + + /// + /// Upload multiple documents + /// + Task> UploadDocumentsAsync(IEnumerable fileStreams, IEnumerable fileNames, IEnumerable contentTypes, string uploadedBy); + + /// + /// Get document by ID + /// Task GetDocumentAsync(Guid id); + + /// + /// Get all documents + /// Task> GetAllDocumentsAsync(); + + /// + /// Delete document + /// Task DeleteDocumentAsync(Guid id); - Task> SearchDocumentsAsync(string query, int maxResults = 5); + + /// + /// Get storage statistics + /// Task> GetStorageStatisticsAsync(); - Task GenerateRagAnswerAsync(string query, int maxResults = 5); + + /// + /// Regenerate all embeddings + /// Task RegenerateAllEmbeddingsAsync(); } diff --git a/src/SmartRAG/Services/DocumentSearchService.cs 
b/src/SmartRAG/Services/DocumentSearchService.cs
new file mode 100644
index 0000000..c7186db
--- /dev/null
+++ b/src/SmartRAG/Services/DocumentSearchService.cs
@@ -0,0 +1,713 @@
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.Logging;
+using SmartRAG.Entities;
+using SmartRAG.Enums;
+using SmartRAG.Factories;
+using SmartRAG.Interfaces;
+using SmartRAG.Models;
+using System.Text.Json;
+
+namespace SmartRAG.Services;
+
+public class DocumentSearchService(
+    IDocumentRepository documentRepository,
+    IAIService aiService,
+    IAIProviderFactory aiProviderFactory,
+    IConfiguration configuration,
+    SmartRagOptions options,
+    ILogger<DocumentSearchService> logger) : IDocumentSearchService
+{
+    public async Task<List<DocumentChunk>> SearchDocumentsAsync(string query, int maxResults = 5)
+    {
+        if (string.IsNullOrWhiteSpace(query))
+            throw new ArgumentException("Query cannot be empty", nameof(query));
+
+        // Use our integrated search algorithm with diversity selection
+        var searchResults = await PerformBasicSearchAsync(query, maxResults * 2);
+
+        if (searchResults.Count > 0)
+        {
+            logger.LogDebug("Search returned {ChunkCount} chunks from {DocumentCount} documents",
+                searchResults.Count, searchResults.Select(c => c.DocumentId).Distinct().Count());
+
+            // Apply diversity selection to ensure chunks from different documents
+            var diverseResults = ApplyDiversityAndSelect(searchResults, maxResults);
+
+            logger.LogDebug("Final diverse results: {ResultCount} chunks from {DocumentCount} documents",
+                diverseResults.Count, diverseResults.Select(c => c.DocumentId).Distinct().Count());
+
+            return diverseResults;
+        }
+
+        return searchResults;
+    }
+
+    public async Task<RagResponse> GenerateRagAnswerAsync(string query, int maxResults = 5)
+    {
+        if (string.IsNullOrWhiteSpace(query))
+            throw new ArgumentException("Query cannot be empty", nameof(query));
+
+        // Check if this is a general conversation query
+        if (IsGeneralConversationQuery(query))
+        {
+            logger.LogDebug("Detected general conversation query, handling without document search");
+            var chatResponse = await HandleGeneralConversationAsync(query);
+            return new RagResponse
+            {
+                Query = query,
+                Answer = chatResponse,
+                Sources = new List<SearchSource>(),
+                SearchedAt = DateTime.UtcNow,
+                Configuration = GetRagConfiguration()
+            };
+        }
+
+        // Document search query - use our integrated RAG implementation
+        return await GenerateBasicRagAnswerAsync(query, maxResults);
+    }
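(Editorial sketch, not part of this patch: a minimal consumer of the two public entry points above, assuming the AddSmartRag registration from ServiceCollectionExtensions; PrintTopChunksAsync is a hypothetical name.)

public static async Task PrintTopChunksAsync(IDocumentSearchService search, string query)
{
    // Retrieval only: returns scored chunks without generating an answer
    var chunks = await search.SearchDocumentsAsync(query, maxResults: 3);
    foreach (var chunk in chunks)
        Console.WriteLine($"{chunk.DocumentId} [{chunk.RelevanceScore:F3}]: {chunk.Content}");

    // Full RAG: retrieval plus answer synthesis (or plain chat for conversational queries)
    var response = await search.GenerateRagAnswerAsync(query, maxResults: 5);
    Console.WriteLine(response.Answer);
}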
+
+    public async Task<List<float>?> GenerateEmbeddingWithFallbackAsync(string text)
+    {
+        try
+        {
+            logger.LogDebug("Trying primary AI service for embedding generation");
+            var result = await aiService.GenerateEmbeddingsAsync(text);
+            if (result != null && result.Count > 0)
+            {
+                logger.LogDebug("Primary AI service successful: {Dimensions} dimensions", result.Count);
+                return result;
+            }
+            logger.LogDebug("Primary AI service returned null or empty embedding");
+        }
+        catch (Exception ex)
+        {
+            logger.LogDebug(ex, "Primary AI service failed");
+        }
+
+        var embeddingProviders = new[]
+        {
+            "Anthropic",
+            "OpenAI",
+            "Gemini"
+        };
+
+        foreach (var provider in embeddingProviders)
+        {
+            try
+            {
+                logger.LogDebug("Trying {Provider} provider for embedding generation", provider);
+                var providerEnum = Enum.Parse<AIProvider>(provider);
+                var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(providerEnum);
+                var providerConfig = configuration.GetSection($"AI:{provider}").Get<AIProviderConfig>();
+
+                if (providerConfig != null && !string.IsNullOrEmpty(providerConfig.ApiKey))
+                {
+                    logger.LogDebug("{Provider} config found, API key: {ApiKeyPreview}...",
+                        provider, providerConfig.ApiKey.Substring(0, 8));
+                    var embedding = await aiProvider.GenerateEmbeddingAsync(text, providerConfig);
+                    if (embedding != null && embedding.Count > 0)
+                    {
+                        logger.LogDebug("{Provider} successful: {Dimensions} dimensions", provider, embedding.Count);
+                        return embedding;
+                    }
+                    else
+                    {
+                        logger.LogDebug("{Provider} returned null or empty embedding", provider);
+                    }
+                }
+                else
+                {
+                    logger.LogDebug("{Provider} config not found or API key missing", provider);
+                }
+            }
+            catch (Exception ex)
+            {
+                logger.LogDebug(ex, "{Provider} provider failed", provider);
+                continue;
+            }
+        }
+
+        logger.LogDebug("All embedding providers failed for text: {TextPreview}...",
+            text.Substring(0, Math.Min(50, text.Length)));
+
+        // Special test for VoyageAI if Anthropic is configured
+        try
+        {
+            var anthropicConfig = configuration.GetSection("AI:Anthropic").Get<AIProviderConfig>();
+            if (anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey))
+            {
+                logger.LogDebug("Testing VoyageAI directly with key: {ApiKeyPreview}...",
+                    anthropicConfig.EmbeddingApiKey.Substring(0, 8));
+
+                using var client = new HttpClient();
+                client.DefaultRequestHeaders.Add("Authorization", $"Bearer {anthropicConfig.EmbeddingApiKey}");
+
+                var testPayload = new
+                {
+                    input = new[] { text },
+                    model = anthropicConfig.EmbeddingModel ?? "voyage-3.5",
+                    input_type = "document"
+                };
+
+                var jsonContent = System.Text.Json.JsonSerializer.Serialize(testPayload);
+                var content = new StringContent(jsonContent, System.Text.Encoding.UTF8, "application/json");
+
+                var response = await client.PostAsync("https://api.voyageai.com/v1/embeddings", content);
+                var responseContent = await response.Content.ReadAsStringAsync();
+
+                logger.LogDebug("VoyageAI test response: {StatusCode} - {Response}",
+                    response.StatusCode, responseContent);
+
+                if (response.IsSuccessStatusCode)
+                {
+                    logger.LogDebug("VoyageAI is working! Trying to parse embedding...");
+                    // Parse the response and return a test embedding
+                    try
+                    {
+                        using var doc = System.Text.Json.JsonDocument.Parse(responseContent);
+                        if (doc.RootElement.TryGetProperty("data", out var dataArray) && dataArray.ValueKind == JsonValueKind.Array)
+                        {
+                            var firstEmbedding = dataArray.EnumerateArray().FirstOrDefault();
+                            if (firstEmbedding.TryGetProperty("embedding", out var embeddingArray) && embeddingArray.ValueKind == JsonValueKind.Array)
+                            {
+                                var testEmbedding = embeddingArray.EnumerateArray()
+                                    .Select(x => x.GetSingle())
+                                    .ToList();
+                                logger.LogDebug("VoyageAI test embedding generated: {Dimensions} dimensions", testEmbedding.Count);
+                                return testEmbedding;
+                            }
+                        }
+                    }
+                    catch (Exception parseEx)
+                    {
+                        logger.LogDebug(parseEx, "Failed to parse VoyageAI response");
+                    }
+                }
+            }
+        }
+        catch (Exception ex)
+        {
+            logger.LogDebug(ex, "VoyageAI direct test failed");
+        }
+
+        return null;
+    }
+
+    public async Task<List<List<float>>?> GenerateEmbeddingsBatchAsync(List<string> texts)
+    {
+        if (texts == null || texts.Count == 0)
+            return null;
+
+        try
+        {
+            // Try batch embedding generation first
+            var batchEmbeddings = await aiService.GenerateEmbeddingsBatchAsync(texts);
+            if (batchEmbeddings != null && batchEmbeddings.Count == texts.Count)
+                return batchEmbeddings;
+        }
+        catch
+        {
+            // Fallback to individual generation if batch fails
+        }
+
+        // Special handling for VoyageAI: process in smaller batches to respect the 3 RPM limit
+        try
+        {
+            var anthropicConfig = configuration.GetSection("AI:Anthropic").Get<AIProviderConfig>();
+            if (anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey))
+            {
+                logger.LogDebug("Trying VoyageAI batch processing with rate limiting");
+
+                // Process in smaller batches (3 chunks per minute = 20 seconds between batches)
+                const int rateLimitBatchSize = 3;
+                var allEmbeddings = new List<List<float>>();
+
+                for (int i = 0; i < texts.Count; i += rateLimitBatchSize)
+                {
+                    var currentBatch = texts.Skip(i).Take(rateLimitBatchSize).ToList();
+                    logger.LogDebug("Processing VoyageAI batch {BatchNumber}: chunks {Start}-{End}",
+                        i / rateLimitBatchSize + 1, i + 1, Math.Min(i + rateLimitBatchSize, texts.Count));
+
+                    // Generate embeddings for current batch using VoyageAI
+                    var batchEmbeddings = await GenerateVoyageAIBatchAsync(currentBatch, anthropicConfig);
+
+                    if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count)
+                    {
+                        allEmbeddings.AddRange(batchEmbeddings);
+                        logger.LogDebug("VoyageAI batch {BatchNumber} successful: {EmbeddingCount} embeddings",
+                            i / rateLimitBatchSize + 1, batchEmbeddings.Count);
+                    }
+                    else
+                    {
+                        logger.LogWarning("VoyageAI batch {BatchNumber} failed, using individual fallback",
+                            i / rateLimitBatchSize + 1);
+                        // Fallback to individual generation for this batch
+                        var individualEmbeddings = await GenerateIndividualEmbeddingsAsync(currentBatch);
+                        allEmbeddings.AddRange(individualEmbeddings);
+                    }
+
+                    // Smart rate limiting: detect if we hit rate limits and adjust
+                    if (i + rateLimitBatchSize < texts.Count)
+                    {
+                        // Check if we got rate limited in the last batch
+                        var lastBatchSuccess = batchEmbeddings != null && batchEmbeddings.Count > 0;
+
+                        if (!lastBatchSuccess)
+                        {
+                            // Rate limited - wait 20 seconds for 3 RPM
+                            logger.LogInformation("Rate limit detected, waiting 20 seconds for 3 RPM limit");
+                            await Task.Delay(20000);
+                        }
+                        else
+                        {
+                            // No rate limit - continue at full speed (2000 RPM), no delay needed
+                            logger.LogInformation("No rate limit detected, continuing at full speed (2000 RPM)");
+                        }
+                    }
+                }
+
+                if (allEmbeddings.Count == texts.Count)
+                {
+                    logger.LogDebug("VoyageAI batch processing completed: {EmbeddingCount} embeddings", allEmbeddings.Count);
+                    return allEmbeddings;
+                }
+            }
+        }
+        catch (Exception ex)
+        {
+            logger.LogDebug(ex, "VoyageAI batch processing failed");
+        }
+
+        // Final fallback: generate embeddings individually (but still in parallel)
+        logger.LogDebug("Falling back to individual embedding generation for {ChunkCount} chunks", texts.Count);
+        var embeddingTasks = texts.Select(async text => await GenerateEmbeddingWithFallbackAsync(text)).ToList();
+        var embeddings = await Task.WhenAll(embeddingTasks);
+
+        return embeddings.Where(e => e != null).Select(e => e!).ToList();
+    }
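(Editorial sketch, not part of this patch: the batch-then-backoff loop above distilled into a reusable helper. RunBatchedAsync and worker are hypothetical names; a null worker result is assumed to signal a rate-limit rejection, and failed batches are simply skipped here, whereas the service falls back to per-text generation.)

private static async Task<List<TOut>> RunBatchedAsync<TIn, TOut>(
    IReadOnlyList<TIn> items,
    Func<IReadOnlyList<TIn>, Task<List<TOut>?>> worker,
    int batchSize = 3,
    int backoffMs = 20000)
{
    var results = new List<TOut>();
    for (int i = 0; i < items.Count; i += batchSize)
    {
        // Take the next slice of work
        var batch = items.Skip(i).Take(batchSize).ToList();
        var output = await worker(batch);

        if (output != null)
            results.AddRange(output);
        else if (i + batchSize < items.Count)
            await Task.Delay(backoffMs); // assumed 429: wait out the rate-limit window
    }
    return results;
}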
+
+    #region Private Helper Methods
+
+    /// <summary>
+    /// Enhanced search with intelligent filtering and name detection
+    /// </summary>
+    private async Task<List<DocumentChunk>> PerformBasicSearchAsync(string query, int maxResults)
+    {
+        var allDocuments = await documentRepository.GetAllAsync();
+        var allChunks = allDocuments.SelectMany(d => d.Chunks).ToList();
+
+        logger.LogDebug("PerformBasicSearchAsync: Searching in {DocumentCount} documents with {ChunkCount} chunks",
+            allDocuments.Count, allChunks.Count);
+
+        // Try embedding-based search first if available
+        try
+        {
+            var embeddingResults = await TryEmbeddingBasedSearchAsync(query, allChunks, maxResults);
+            if (embeddingResults.Count > 0)
+            {
+                logger.LogDebug("PerformBasicSearchAsync: Embedding search successful, found {ChunkCount} chunks",
+                    embeddingResults.Count);
+                return embeddingResults;
+            }
+        }
+        catch (Exception ex)
+        {
+            logger.LogDebug(ex, "PerformBasicSearchAsync: Embedding search failed, using keyword search");
+        }
+
+        // Enhanced keyword-based fallback for global content
+        var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries)
+            .Where(w => w.Length > 2)
+            .ToList();
+
+        // Extract potential names from the ORIGINAL query (not lowercase) - language agnostic
+        var potentialNames = query.Split(' ', StringSplitOptions.RemoveEmptyEntries)
+            .Where(w => w.Length > 2 && char.IsUpper(w[0]))
+            .ToList();
+
+        logger.LogDebug("PerformBasicSearchAsync: Query words: [{QueryWords}]", string.Join(", ", queryWords));
+        logger.LogDebug("PerformBasicSearchAsync: Potential names: [{PotentialNames}]", string.Join(", ", potentialNames));
+
+        var scoredChunks = allChunks.Select(chunk =>
+        {
+            var score = 0.0;
+            var content = chunk.Content.ToLowerInvariant();
+
+            // Special handling for names like "John Smith" - HIGHEST PRIORITY (language agnostic)
+            if (potentialNames.Count >= 2)
+            {
+                var fullName = string.Join(" ", potentialNames);
+                if (ContainsNormalizedName(content, fullName))
+                {
+                    score += 200.0; // Very high weight for full name matches
+                    logger.LogDebug("PerformBasicSearchAsync: Found FULL NAME match: '{FullName}' in chunk: {ChunkPreview}...",
+                        fullName, chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length)));
+                }
+                else if (potentialNames.Any(name => ContainsNormalizedName(content, name)))
+                {
+                    score += 100.0; // High weight for partial name matches
+                    var foundNames = potentialNames.Where(name => ContainsNormalizedName(content, name)).ToList();
+                    logger.LogDebug("PerformBasicSearchAsync: Found PARTIAL name matches: [{FoundNames}] in chunk: {ChunkPreview}...",
+                        string.Join(", ", foundNames), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length)));
+                }
+            }
+
+            // Exact word matches
+            foreach (var word in queryWords)
+            {
+                if (content.Contains(word, StringComparison.OrdinalIgnoreCase))
+                    score += 2.0; // Higher weight for word matches
+            }
+
+            // Generic content quality scoring (language and content agnostic)
+            var wordCount = content.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length;
+            if (wordCount >= 10 && wordCount <= 100) score += 5.0;
+
+            // Bonus for chunks with punctuation (indicates structured content)
+            var punctuationCount = content.Count(c => ".,;:!?()[]{}".Contains(c));
+            if (punctuationCount >= 3) score += 2.0;
+
+            // Bonus for chunks with numbers (often indicates factual information)
+            var numberCount = content.Count(c => char.IsDigit(c));
+            if (numberCount >= 2) score += 2.0;
+
+            chunk.RelevanceScore = score;
+            return chunk;
+        }).ToList();
+
+        var relevantChunks = scoredChunks
+            .Where(c => c.RelevanceScore > 0)
+            .OrderByDescending(c => c.RelevanceScore)
+            .Take(Math.Max(maxResults * 3, 30))
+            .ToList();
+
+        logger.LogDebug("PerformBasicSearchAsync: Found {ChunkCount} relevant chunks with enhanced search",
+            relevantChunks.Count);
+
+        // If we found chunks with names, prioritize them
+        if (potentialNames.Count >= 2)
+        {
+            var nameChunks = relevantChunks.Where(c =>
+                potentialNames.Any(name => c.Content.Contains(name, StringComparison.OrdinalIgnoreCase))).ToList();
+
+            if (nameChunks.Count > 0)
+            {
+                logger.LogDebug("PerformBasicSearchAsync: Found {NameChunkCount} chunks containing names, prioritizing them",
+                    nameChunks.Count);
+                return nameChunks.Take(maxResults).ToList();
+            }
+        }
+
+        return relevantChunks.Take(maxResults).ToList();
+    }
+
+    private async Task<RagResponse> GenerateBasicRagAnswerAsync(string query, int maxResults)
+    {
+        var chunks = await SearchDocumentsAsync(query, maxResults);
+        var context = string.Join("\n\n", chunks.Select(c => c.Content));
+        var answer = await aiService.GenerateResponseAsync($"Question: {query}\n\nContext: {context}\n\nAnswer:", new List<string> { context });
+
+        return new RagResponse
+        {
+            Query = query,
+            Answer = answer,
+            Sources = chunks.Select(c => new SearchSource
+            {
+                DocumentId = c.DocumentId,
+                FileName = "Document",
+                RelevantContent = c.Content,
+                RelevanceScore = c.RelevanceScore ?? 0.0
+            }).ToList(),
+            SearchedAt = DateTime.UtcNow,
+            Configuration = GetRagConfiguration()
+        };
+    }
+
+    private static List<DocumentChunk> ApplyDiversityAndSelect(List<DocumentChunk> chunks, int maxResults)
+    {
+        // Round-robin across source documents so a single document cannot monopolize
+        // the result set; input is already ordered by relevance, so each pass takes
+        // the best remaining chunk per document.
+        var selected = new List<DocumentChunk>();
+        var perDocument = chunks
+            .GroupBy(c => c.DocumentId)
+            .Select(g => new Queue<DocumentChunk>(g))
+            .ToList();
+
+        while (selected.Count < maxResults && perDocument.Any(q => q.Count > 0))
+        {
+            foreach (var queue in perDocument)
+            {
+                if (selected.Count >= maxResults)
+                    break;
+                if (queue.Count > 0)
+                    selected.Add(queue.Dequeue());
+            }
+        }
+
+        return selected;
+    }
+
+    private async Task<List<List<float>>?> GenerateVoyageAIBatchAsync(List<string> texts, AIProviderConfig config)
+    {
+        // Simple implementation of VoyageAI batch processing: generates embeddings
+        // one text at a time through the shared fallback chain
+        var results = new List<List<float>>();
+        foreach (var text in texts)
+        {
+            var embedding = await GenerateEmbeddingWithFallbackAsync(text);
+            if (embedding != null)
+                results.Add(embedding);
+        }
+        return results;
+    }
+
+    private async Task<List<List<float>>> GenerateIndividualEmbeddingsAsync(List<string> texts)
+    {
+        var results = new List<List<float>>();
+        foreach (var text in texts)
+        {
+            var embedding = await GenerateEmbeddingWithFallbackAsync(text);
+            results.Add(embedding ?? new List<float>());
+        }
+        return results;
+    }
+
+    private RagConfiguration GetRagConfiguration()
+    {
+        return new RagConfiguration
+        {
+            AIProvider = options.AIProvider.ToString(),
+            StorageProvider = options.StorageProvider.ToString(),
+            Model = configuration["AI:OpenAI:Model"] ??
"gpt-3.5-turbo" + }; + } + + /// + /// Try embedding-based search using VoyageAI with intelligent filtering + /// + private async Task> TryEmbeddingBasedSearchAsync(string query, List allChunks, int maxResults) + { + try + { + var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); + if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) + { + logger.LogDebug("Embedding search: No VoyageAI API key found"); + return new List(); + } + + var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(AIProvider.Anthropic); + + // Generate embedding for query with retry logic + var queryEmbedding = await GenerateEmbeddingWithRetryAsync(query, anthropicConfig); + if (queryEmbedding == null || queryEmbedding.Count == 0) + { + logger.LogDebug("Embedding search: Failed to generate query embedding"); + return new List(); + } + + // Calculate similarity for all chunks + var scoredChunks = allChunks.Select(chunk => + { + var similarity = 0.0; + if (chunk.Embedding != null && chunk.Embedding.Count > 0) + { + similarity = CalculateCosineSimilarity(queryEmbedding, chunk.Embedding); + } + + chunk.RelevanceScore = similarity; + return chunk; + }).ToList(); + + // INTELLIGENT FILTERING: Focus on chunks that actually contain the query terms + var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Where(w => w.Length > 2) + .ToList(); + + // Extract potential names from ORIGINAL query (not lowercase) - language agnostic + var potentialNames = query.Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Where(w => w.Length > 2 && char.IsUpper(w[0])) + .ToList(); + + // Filter chunks that actually contain query terms + var relevantChunks = scoredChunks.Where(chunk => + { + var content = chunk.Content.ToLowerInvariant(); + + // Must contain at least one query word + var hasQueryWord = queryWords.Any(word => content.Contains(word, StringComparison.OrdinalIgnoreCase)); + + // If query has names, prioritize chunks with names + if (potentialNames.Count >= 2) + { + var fullName = string.Join(" ", potentialNames); + var hasFullName = ContainsNormalizedName(content, fullName); + var hasPartialName = potentialNames.Any(name => ContainsNormalizedName(content, name)); + + return hasQueryWord && (hasFullName || hasPartialName); + } + + return hasQueryWord; + }).ToList(); + + logger.LogDebug("Embedding search: Found {ChunkCount} chunks containing query terms", relevantChunks.Count); + + if (relevantChunks.Count == 0) + { + logger.LogDebug("Embedding search: No chunks contain query terms, using similarity only"); + relevantChunks = scoredChunks.Where(c => c.RelevanceScore > 0.01).ToList(); + } + + // Sort by relevance score and take top results + return relevantChunks + .OrderByDescending(c => c.RelevanceScore) + .Take(Math.Max(maxResults * 2, 20)) + .ToList(); + } + catch (Exception ex) + { + logger.LogError(ex, "Embedding search failed"); + return new List(); + } + } + + /// + /// Generate embedding with retry logic for rate limiting + /// + private async Task?> GenerateEmbeddingWithRetryAsync(string text, AIProviderConfig config) + { + var maxRetries = 3; + var retryDelayMs = 2000; + + for (int attempt = 0; attempt < maxRetries; attempt++) + { + try + { + var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(AIProvider.Anthropic); + return await aiProvider.GenerateEmbeddingAsync(text, config); + } + catch (Exception ex) when (ex.Message.Contains("TooManyRequests") || ex.Message.Contains("rate limit")) + { + if 
(attempt < maxRetries - 1) + { + var delay = retryDelayMs * (int)Math.Pow(2, attempt); + logger.LogDebug("Embedding generation rate limited, retrying in {Delay}ms (attempt {Attempt}/{MaxRetries})", + delay, attempt + 1, maxRetries); + await Task.Delay(delay); + } + else + { + logger.LogDebug("Embedding generation rate limited after {MaxRetries} attempts", maxRetries); + throw; + } + } + } + + return null; + } + + /// + /// Calculate cosine similarity between two vectors + /// + private static double CalculateCosineSimilarity(List a, List b) + { + if (a == null || b == null || a.Count == 0 || b.Count == 0) return 0.0; + + var n = Math.Min(a.Count, b.Count); + double dot = 0, na = 0, nb = 0; + + for (int i = 0; i < n; i++) + { + double va = a[i]; + double vb = b[i]; + dot += va * vb; + na += va * va; + nb += vb * vb; + } + + if (na == 0 || nb == 0) return 0.0; + return dot / (Math.Sqrt(na) * Math.Sqrt(nb)); + } + + /// + /// Normalize text for better search matching (handles Unicode encoding issues) + /// + private static string NormalizeText(string text) + { + if (string.IsNullOrEmpty(text)) return text; + + // Decode Unicode escape sequences + var decoded = System.Text.RegularExpressions.Regex.Unescape(text); + + // Normalize Unicode characters + var normalized = decoded.Normalize(System.Text.NormalizationForm.FormC); + + // Handle common Turkish character variations (can be extended for other languages) + var characterMappings = new Dictionary + { + {"ı", "i"}, {"İ", "I"}, {"ğ", "g"}, {"Ğ", "G"}, + {"ü", "u"}, {"Ü", "U"}, {"ş", "s"}, {"Ş", "S"}, + {"ö", "o"}, {"Ö", "O"}, {"ç", "c"}, {"Ç", "C"} + }; + + foreach (var mapping in characterMappings) + { + normalized = normalized.Replace(mapping.Key, mapping.Value); + } + + return normalized; + } + + /// + /// Check if content contains normalized name (handles encoding issues) + /// + private static bool ContainsNormalizedName(string content, string searchName) + { + if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(searchName)) + return false; + + var normalizedContent = NormalizeText(content); + var normalizedSearchName = NormalizeText(searchName); + + // Try exact match first + if (normalizedContent.Contains(normalizedSearchName, StringComparison.OrdinalIgnoreCase)) + return true; + + // Try partial matches for each word + var searchWords = normalizedSearchName.Split(' ', StringSplitOptions.RemoveEmptyEntries); + var contentWords = normalizedContent.Split(' ', StringSplitOptions.RemoveEmptyEntries); + + // Check if all search words are present in content + return searchWords.All(searchWord => + contentWords.Any(contentWord => + contentWord.Contains(searchWord, StringComparison.OrdinalIgnoreCase))); + } + + /// + /// Check if query is a general conversation question (not document search) + /// + private static bool IsGeneralConversationQuery(string query) + { + if (string.IsNullOrWhiteSpace(query)) return false; + + // Simple detection: if query has document-like structure, it's document search + var hasDocumentStructure = query.Any(char.IsDigit) || + query.Contains(':') || + query.Contains('/') || + query.Contains('-') || + query.Length > 50; + + // If it has document structure, it's document search + // If not, it's general conversation + return !hasDocumentStructure; + } + + /// + /// Handle general conversation queries + /// + private async Task HandleGeneralConversationAsync(string query) + { + try + { + var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); + if (anthropicConfig == null || 
string.IsNullOrEmpty(anthropicConfig.ApiKey)) + { + return "Sorry, I cannot chat right now. Please try again later."; + } + + var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(AIProvider.Anthropic); + + var prompt = $@"You are a helpful AI assistant. Answer the user's question naturally and friendly. + +User: {query} + +Answer:"; + + return await aiProvider.GenerateTextAsync(prompt, anthropicConfig); + } + catch (Exception ex) + { + logger.LogError(ex, "General conversation failed"); + return "Sorry, I cannot chat right now. Please try again later."; + } + } + + #endregion +} diff --git a/src/SmartRAG/Services/DocumentService.cs b/src/SmartRAG/Services/DocumentService.cs index 8e311d5..cdae337 100644 --- a/src/SmartRAG/Services/DocumentService.cs +++ b/src/SmartRAG/Services/DocumentService.cs @@ -1,5 +1,5 @@ - using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; using SmartRAG.Entities; using SmartRAG.Enums; using SmartRAG.Factories; @@ -10,21 +10,18 @@ namespace SmartRAG.Services; /// -/// Implementation of document service with enhanced semantic search using repository pattern +/// Implementation of document service focused on CRUD operations /// public class DocumentService( IDocumentRepository documentRepository, IDocumentParserService documentParserService, - IAIService aiService, + IDocumentSearchService documentSearchService, SmartRagOptions options, - IAIProviderFactory aiProviderFactory, - IConfiguration configuration) : IDocumentService + ILogger logger) : IDocumentService { - public async Task UploadDocumentAsync(Stream fileStream, string fileName, string contentType, string uploadedBy) { var supportedExtensions = documentParserService.GetSupportedFileTypes(); - var supportedContentTypes = documentParserService.GetSupportedContentTypes(); var ext = Path.GetExtension(fileName).ToLowerInvariant(); @@ -45,7 +42,7 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa // Generate embeddings for all chunks in batch for better performance var allChunkContents = document.Chunks.Select(c => c.Content).ToList(); - var allEmbeddings = await TryGenerateEmbeddingsBatchAsync(allChunkContents); + var allEmbeddings = await documentSearchService.GenerateEmbeddingsBatchAsync(allChunkContents); // Apply embeddings to chunks with retry mechanism for (int i = 0; i < document.Chunks.Count; i++) @@ -60,22 +57,24 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa if (allEmbeddings != null && i < allEmbeddings.Count && allEmbeddings[i] != null && allEmbeddings[i].Count > 0) { chunk.Embedding = allEmbeddings[i]; - Console.WriteLine($"[DEBUG] Chunk {i}: Embedding generated successfully ({allEmbeddings[i].Count} dimensions)"); + logger.LogDebug("Chunk {ChunkIndex}: Embedding generated successfully ({Dimensions} dimensions)", + i, allEmbeddings[i].Count); } else { // Retry individual embedding generation for this chunk - Console.WriteLine($"[DEBUG] Chunk {i}: Batch embedding failed, trying individual generation"); - var individualEmbedding = await TryGenerateEmbeddingWithFallback(chunk.Content); + logger.LogDebug("Chunk {ChunkIndex}: Batch embedding failed, trying individual generation", i); + var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); if (individualEmbedding != null && individualEmbedding.Count > 0) { chunk.Embedding = individualEmbedding; - Console.WriteLine($"[DEBUG] Chunk {i}: Individual embedding successful ({individualEmbedding.Count} dimensions)"); + 
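+                        // Per-chunk retry path: the batch call failed for this chunk, but the
+                        // fallback chain (primary AI service, then Anthropic/OpenAI/Gemini, then
+                        // a direct VoyageAI probe) produced an embedding individually.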
logger.LogDebug("Chunk {ChunkIndex}: Individual embedding successful ({Dimensions} dimensions)", + i, individualEmbedding.Count); } else { - Console.WriteLine($"[WARNING] Chunk {i}: Failed to generate embedding after retry"); + logger.LogWarning("Chunk {ChunkIndex}: Failed to generate embedding after retry", i); chunk.Embedding = new List(); // Empty but not null } } @@ -85,7 +84,7 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa } catch (Exception ex) { - Console.WriteLine($"[ERROR] Chunk {i}: Failed to process: {ex.Message}"); + logger.LogError(ex, "Chunk {ChunkIndex}: Failed to process", i); // If embedding generation fails, leave it empty and continue document.Chunks[i].Embedding = new List(); // Empty but not null } @@ -96,475 +95,51 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa return savedDocument; } - public async Task GetDocumentAsync(Guid id) => await documentRepository.GetByIdAsync(id); - - public async Task> GetAllDocumentsAsync() => await documentRepository.GetAllAsync(); - - public async Task DeleteDocumentAsync(Guid id) => await documentRepository.DeleteAsync(id); - - public async Task> SearchDocumentsAsync(string query, int maxResults = 5) + public async Task> UploadDocumentsAsync(IEnumerable fileStreams, IEnumerable fileNames, IEnumerable contentTypes, string uploadedBy) { - if (string.IsNullOrWhiteSpace(query)) - throw new ArgumentException("Query cannot be empty", nameof(query)); - - try - { - // Use EnhancedSearchService directly (simplified without Semantic Kernel) - var enhancedSearchService = new EnhancedSearchService(aiProviderFactory, documentRepository, configuration); - var enhancedResults = await enhancedSearchService.EnhancedSemanticSearchAsync(query, maxResults * 2); - - if (enhancedResults.Count > 0) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService returned {enhancedResults.Count} chunks from {enhancedResults.Select(c => c.DocumentId).Distinct().Count()} documents"); - - // Apply diversity selection to ensure chunks from different documents - var diverseResults = ApplyDiversityAndSelect(enhancedResults, maxResults); - - Console.WriteLine($"[DEBUG] Final diverse results: {diverseResults.Count} chunks from {diverseResults.Select(c => c.DocumentId).Distinct().Count()} documents"); - - return diverseResults; - } - } - catch (Exception ex) - { - Console.WriteLine($"[WARNING] EnhancedSearchService failed: {ex.Message}. 
Falling back to basic search."); - } - - // Fallback to basic search if EnhancedSearchService fails - return await PerformBasicSearchAsync(query, maxResults); - } - - /// - /// Basic search fallback when Semantic Kernel is not available - /// - private async Task> PerformBasicSearchAsync(string query, int maxResults) - { - var cleanedQuery = query; - var allDocs = await documentRepository.GetAllAsync(); - - // Fix any chunks with missing DocumentId - foreach (var doc in allDocs) - { - foreach (var chunk in doc.Chunks) - { - if (chunk.DocumentId == Guid.Empty) - chunk.DocumentId = doc.Id; - } - } + if (fileStreams == null || !fileStreams.Any()) + throw new ArgumentException("No file streams provided", nameof(fileStreams)); - var allResults = new List(); + if (fileNames == null || !fileNames.Any()) + throw new ArgumentException("No file names provided", nameof(fileNames)); - try - { - // Try embedding generation - var queryEmbedding = await TryGenerateEmbeddingWithFallback(cleanedQuery); - if (queryEmbedding != null && queryEmbedding.Count > 0) - { - var vecScored = new List<(DocumentChunk chunk, double score)>(); - foreach (var doc in allDocs) - { - foreach (var chunk in doc.Chunks) - { - if (chunk.Embedding != null && chunk.Embedding.Count > 0) - { - var score = ComputeCosineSimilarity(queryEmbedding, chunk.Embedding); - Console.WriteLine($"[DEBUG] Chunk {chunk.Id} from {doc.FileName}: score={score:F4}, query_emb_dim={queryEmbedding.Count}, chunk_emb_dim={chunk.Embedding.Count}, content={chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}..."); - vecScored.Add((chunk, score)); - } - } - } + if (contentTypes == null || !contentTypes.Any()) + throw new ArgumentException("No content types provided", nameof(contentTypes)); - // Apply improved relevance scoring with content-based boosting - var semanticResults = vecScored - .Select(x => { - var improvedScore = ImproveRelevanceScore(x.score, x.chunk.Content, cleanedQuery); - Console.WriteLine($"[DEBUG] Improved relevance score: chunk={x.chunk.Id}, base={x.score:F4}, final={improvedScore:F4}"); - x.chunk.RelevanceScore = improvedScore; - return x.chunk; - }) - .OrderByDescending(x => x.RelevanceScore) - .Take(maxResults * 2) - .ToList(); + var streamList = fileStreams.ToList(); + var nameList = fileNames.ToList(); + var typeList = contentTypes.ToList(); - allResults.AddRange(semanticResults); - } - } - catch - { - // Continue with other search methods - } + if (streamList.Count != nameList.Count || streamList.Count != typeList.Count) + throw new ArgumentException("Number of file streams, names, and content types must match"); - // Repository search - var primary = await documentRepository.SearchAsync(cleanedQuery, maxResults * 2); - allResults.AddRange(primary); + var uploadedDocuments = new List(); - // Fuzzy search if needed - if (allResults.Count < maxResults) + // Parallel document upload for better performance + var uploadTasks = streamList.Select(async (stream, index) => { - var fuzzyResults = await PerformFuzzySearch(cleanedQuery, maxResults); - allResults.AddRange(fuzzyResults.Where(f => !allResults.Any(p => p.Id == f.Id))); - } - - // Remove duplicates and ensure diversity - var uniqueResults = allResults - .GroupBy(c => c.Id) - .Select(g => g.OrderByDescending(c => c.RelevanceScore ?? 
0.0).First()) - .ToList(); - - return ApplyDiversityAndSelect(uniqueResults, maxResults); - } - - private async Task?> TryGenerateEmbeddingWithFallback(string text) - { - try - { - Console.WriteLine($"[DEBUG] Trying primary AI service for embedding generation"); - var result = await aiService.GenerateEmbeddingsAsync(text); - if (result != null && result.Count > 0) + try { - Console.WriteLine($"[DEBUG] Primary AI service successful: {result.Count} dimensions"); - return result; - } - Console.WriteLine($"[DEBUG] Primary AI service returned null or empty embedding"); - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] Primary AI service failed: {ex.Message}"); - } - - var embeddingProviders = new[] - { - "Anthropic", - "OpenAI", - "Gemini" - }; - - foreach (var provider in embeddingProviders) - { - try - { - Console.WriteLine($"[DEBUG] Trying {provider} provider for embedding generation"); - var providerEnum = Enum.Parse(provider); - var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(providerEnum); - var providerConfig = configuration.GetSection($"AI:{provider}").Get(); - - if (providerConfig != null && !string.IsNullOrEmpty(providerConfig.ApiKey)) - { - Console.WriteLine($"[DEBUG] {provider} config found, API key: {providerConfig.ApiKey.Substring(0, 8)}..."); - var embedding = await aiProvider.GenerateEmbeddingAsync(text, providerConfig); - if (embedding != null && embedding.Count > 0) - { - Console.WriteLine($"[DEBUG] {provider} successful: {embedding.Count} dimensions"); - return embedding; - } - else - { - Console.WriteLine($"[DEBUG] {provider} returned null or empty embedding"); - } - } - else - { - Console.WriteLine($"[DEBUG] {provider} config not found or API key missing"); - } + return await UploadDocumentAsync(stream, nameList[index], typeList[index], uploadedBy); } catch (Exception ex) { - Console.WriteLine($"[DEBUG] {provider} failed: {ex.Message}"); - continue; - } - } - - Console.WriteLine($"[DEBUG] All embedding providers failed for text: {text.Substring(0, Math.Min(50, text.Length))}..."); - - // Special test for VoyageAI if Anthropic is configured - try - { - var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) - { - Console.WriteLine($"[DEBUG] Testing VoyageAI directly with key: {anthropicConfig.EmbeddingApiKey.Substring(0, 8)}..."); - - using var client = new HttpClient(); - client.DefaultRequestHeaders.Add("Authorization", $"Bearer {anthropicConfig.EmbeddingApiKey}"); - - var testPayload = new - { - input = new[] { "test" }, - model = anthropicConfig.EmbeddingModel ?? "voyage-3.5", - input_type = "document" - }; - - var jsonContent = System.Text.Json.JsonSerializer.Serialize(testPayload); - var content = new StringContent(jsonContent, System.Text.Encoding.UTF8, "application/json"); - - var response = await client.PostAsync("https://api.voyageai.com/v1/embeddings", content); - var responseContent = await response.Content.ReadAsStringAsync(); - - Console.WriteLine($"[DEBUG] VoyageAI test response: {response.StatusCode} - {responseContent}"); - - if (response.IsSuccessStatusCode) - { - Console.WriteLine($"[DEBUG] VoyageAI is working! 
Trying to parse embedding..."); - // Parse the response and return a test embedding - try - { - using var doc = System.Text.Json.JsonDocument.Parse(responseContent); - if (doc.RootElement.TryGetProperty("data", out var dataArray) && dataArray.ValueKind == JsonValueKind.Array) - { - var firstEmbedding = dataArray.EnumerateArray().FirstOrDefault(); - if (firstEmbedding.TryGetProperty("embedding", out var embeddingArray) && embeddingArray.ValueKind == JsonValueKind.Array) - { - var testEmbedding = embeddingArray.EnumerateArray() - .Select(x => x.GetSingle()) - .ToList(); - Console.WriteLine($"[DEBUG] VoyageAI test embedding generated: {testEmbedding.Count} dimensions"); - return testEmbedding; - } - } - } - catch (Exception parseEx) - { - Console.WriteLine($"[DEBUG] Failed to parse VoyageAI response: {parseEx.Message}"); - } - } + logger.LogWarning(ex, "Failed to upload document {FileName}", nameList[index]); + return null; } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] VoyageAI direct test failed: {ex.Message}"); - } - - return null; - } + }); - /// - /// Generates embeddings for multiple texts in batch for better performance - /// - private async Task>?> TryGenerateEmbeddingsBatchAsync(List texts) - { - if (texts == null || texts.Count == 0) - return null; + var uploadResults = await Task.WhenAll(uploadTasks); + uploadedDocuments.AddRange(uploadResults.Where(doc => doc != null)!); - try - { - // Try batch embedding generation first - var batchEmbeddings = await aiService.GenerateEmbeddingsBatchAsync(texts); - if (batchEmbeddings != null && batchEmbeddings.Count == texts.Count) - return batchEmbeddings; - } - catch - { - // Fallback to individual generation if batch fails - } + return uploadedDocuments; + } - // Special handling for VoyageAI: Process in smaller batches to respect 3 RPM limit - try - { - var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) - { - Console.WriteLine($"[DEBUG] Trying VoyageAI batch processing with rate limiting..."); - - // Process in smaller batches (3 chunks per minute = 20 seconds between batches) - const int rateLimitBatchSize = 3; - var allEmbeddings = new List>(); - - for (int i = 0; i < texts.Count; i += rateLimitBatchSize) - { - var currentBatch = texts.Skip(i).Take(rateLimitBatchSize).ToList(); - Console.WriteLine($"[DEBUG] Processing VoyageAI batch {i/rateLimitBatchSize + 1}: chunks {i+1}-{Math.Min(i+rateLimitBatchSize, texts.Count)}"); - - // Generate embeddings for current batch using VoyageAI - var batchEmbeddings = await GenerateVoyageAIBatchAsync(currentBatch, anthropicConfig); - - if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) - { - allEmbeddings.AddRange(batchEmbeddings); - Console.WriteLine($"[DEBUG] VoyageAI batch {i/rateLimitBatchSize + 1} successful: {batchEmbeddings.Count} embeddings"); - } - else - { - Console.WriteLine($"[WARNING] VoyageAI batch {i/rateLimitBatchSize + 1} failed, using individual fallback"); - // Fallback to individual generation for this batch - var individualEmbeddings = await GenerateIndividualEmbeddingsAsync(currentBatch); - allEmbeddings.AddRange(individualEmbeddings); - } - - // Smart rate limiting: Detect if we hit rate limits and adjust - if (i + rateLimitBatchSize < texts.Count) - { - // Check if we got rate limited in the last batch - var lastBatchSuccess = batchEmbeddings != null && batchEmbeddings.Count > 0; - - if (!lastBatchSuccess) - { - // Rate limited - wait 
20 seconds for 3 RPM - Console.WriteLine($"[INFO] Rate limit detected, waiting 20 seconds for 3 RPM limit..."); - await Task.Delay(20000); - } - else - { - // No rate limit - continue at full speed (2000 RPM) - Console.WriteLine($"[INFO] No rate limit detected, continuing at full speed (2000 RPM)"); - // No delay needed for 2000 RPM - } - } - } - - if (allEmbeddings.Count == texts.Count) - { - Console.WriteLine($"[DEBUG] VoyageAI batch processing completed: {allEmbeddings.Count} embeddings"); - return allEmbeddings; - } - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] VoyageAI batch processing failed: {ex.Message}"); - } + public async Task GetDocumentAsync(Guid id) => await documentRepository.GetByIdAsync(id); - // Final fallback: generate embeddings individually (but still in parallel) - Console.WriteLine($"[DEBUG] Falling back to individual embedding generation for {texts.Count} chunks"); - var embeddingTasks = texts.Select(async text => await TryGenerateEmbeddingWithFallback(text)).ToList(); - var embeddings = await Task.WhenAll(embeddingTasks); - - return embeddings.Where(e => e != null).Select(e => e!).ToList(); - } - - /// - /// Generates embeddings for a batch using VoyageAI directly - /// - private async Task>?> GenerateVoyageAIBatchAsync(List texts, AIProviderConfig config) - { - try - { - using var client = new HttpClient(); - client.DefaultRequestHeaders.Add("Authorization", $"Bearer {config.EmbeddingApiKey}"); - - var payload = new - { - input = texts, - model = config.EmbeddingModel ?? "voyage-3.5", - input_type = "document" - }; - - var jsonContent = System.Text.Json.JsonSerializer.Serialize(payload); - var content = new StringContent(jsonContent, System.Text.Encoding.UTF8, "application/json"); - - Console.WriteLine($"[DEBUG] VoyageAI batch request payload: {jsonContent}"); - var response = await client.PostAsync("https://api.voyageai.com/v1/embeddings", content); - var responseContent = await response.Content.ReadAsStringAsync(); - - Console.WriteLine($"[DEBUG] VoyageAI batch response: {response.StatusCode} - {responseContent}"); - - if (response.IsSuccessStatusCode) - { - var parsedEmbeddings = ParseVoyageAIBatchResponse(responseContent); - Console.WriteLine($"[DEBUG] VoyageAI batch parsed: {parsedEmbeddings?.Count ?? 0} embeddings"); - return parsedEmbeddings; - } - else - { - Console.WriteLine($"[DEBUG] VoyageAI batch request failed: {response.StatusCode} - {responseContent}"); - return null; - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] VoyageAI batch generation failed: {ex.Message}"); - return null; - } - } - - /// - /// Parses VoyageAI batch response - /// - private static List>? ParseVoyageAIBatchResponse(string response) - { - try - { - using var doc = System.Text.Json.JsonDocument.Parse(response); - - if (doc.RootElement.TryGetProperty("data", out var dataArray) && dataArray.ValueKind == JsonValueKind.Array) - { - var embeddings = new List>(); - - foreach (var item in dataArray.EnumerateArray()) - { - if (item.TryGetProperty("embedding", out var embeddingArray) && embeddingArray.ValueKind == JsonValueKind.Array) - { - var embedding = embeddingArray.EnumerateArray() - .Select(x => x.GetSingle()) - .ToList(); - embeddings.Add(embedding); - } - } - - return embeddings.Count > 0 ? 
embeddings : null; - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] Failed to parse VoyageAI batch response: {ex.Message}"); - } - - return null; - } - - /// - /// Generates embeddings individually for a batch as fallback - /// - private async Task>> GenerateIndividualEmbeddingsAsync(List texts) - { - var embeddings = new List>(); - - foreach (var text in texts) - { - var embedding = await TryGenerateEmbeddingWithFallback(text); - embeddings.Add(embedding ?? new List()); - } - - return embeddings; - } + public async Task> GetAllDocumentsAsync() => await documentRepository.GetAllAsync(); - private static double ComputeCosineSimilarity(List a, List b) - { - if (a == null || b == null) return 0.0; - int n = Math.Min(a.Count, b.Count); - if (n == 0) return 0.0; - - // Normalize embeddings for better similarity calculation - var normalizedA = NormalizeEmbedding(a); - var normalizedB = NormalizeEmbedding(b); - - double dot = 0; - for (int i = 0; i < n; i++) - { - dot += normalizedA[i] * normalizedB[i]; - } - - // Cosine similarity is just dot product of normalized vectors - return dot; - } - - /// - /// Normalizes embedding vector to unit length for better similarity calculation - /// - private static List NormalizeEmbedding(List embedding) - { - if (embedding == null || embedding.Count == 0) return new List(); - - // Convert to double for better precision - var doubleEmbedding = embedding.Select(x => (double)x).ToList(); - - // Calculate magnitude - double magnitude = Math.Sqrt(doubleEmbedding.Sum(x => x * x)); - - if (magnitude == 0) return doubleEmbedding; - - // Normalize to unit length - return doubleEmbedding.Select(x => x / magnitude).ToList(); - } + public async Task DeleteDocumentAsync(Guid id) => await documentRepository.DeleteAsync(id); public Task> GetStorageStatisticsAsync() { @@ -579,52 +154,12 @@ public Task> GetStorageStatisticsAsync() return Task.FromResult(stats); } - - /// - /// Improves relevance score by considering content similarity and keyword matching - /// - private static double ImproveRelevanceScore(double baseScore, string content, string query) - { - if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(query)) - return baseScore; - - var improvedScore = baseScore; - - // Convert to lowercase for case-insensitive comparison - var lowerContent = content.ToLowerInvariant(); - var lowerQuery = query.ToLowerInvariant(); - - // Extract key terms from query (simple approach) - var queryTerms = lowerQuery.Split(new[] { ' ', ',', '.', '?', '!' 
}, StringSplitOptions.RemoveEmptyEntries) - .Where(term => term.Length > 2) // Only meaningful terms - .ToList(); - - // Calculate content relevance boost - var contentBoost = 0.0; - foreach (var term in queryTerms) - { - if (lowerContent.Contains(term)) - { - contentBoost += 0.1; // 10% boost per matching term - } - } - - // Apply content boost (cap at 50% to avoid over-boosting) - contentBoost = Math.Min(contentBoost, 0.5); - improvedScore += contentBoost; - - // Ensure score doesn't exceed 1.0 - return Math.Min(improvedScore, 1.0); - } - /// - /// Regenerate embeddings for all existing documents (useful for fixing missing embeddings) - /// public async Task RegenerateAllEmbeddingsAsync() { try { - Console.WriteLine("[INFO] Starting embedding regeneration for all documents..."); + logger.LogInformation("Starting embedding regeneration for all documents..."); var allDocuments = await documentRepository.GetAllAsync(); var totalChunks = allDocuments.Sum(d => d.Chunks.Count); @@ -637,7 +172,8 @@ public async Task RegenerateAllEmbeddingsAsync() foreach (var document in allDocuments) { - Console.WriteLine($"[INFO] Document: {document.FileName} ({document.Chunks.Count} chunks)"); + logger.LogInformation("Document: {FileName} ({ChunkCount} chunks)", + document.FileName, document.Chunks.Count); foreach (var chunk in document.Chunks) { @@ -653,11 +189,12 @@ public async Task RegenerateAllEmbeddingsAsync() } } - Console.WriteLine($"[INFO] Total chunks to process: {chunksToProcess.Count} out of {totalChunks}"); + logger.LogInformation("Total chunks to process: {ProcessCount} out of {TotalChunks}", + chunksToProcess.Count, totalChunks); if (chunksToProcess.Count == 0) { - Console.WriteLine("[INFO] All chunks already have valid embeddings. No processing needed."); + logger.LogInformation("All chunks already have valid embeddings. 
No processing needed."); return true; } @@ -665,7 +202,7 @@ public async Task RegenerateAllEmbeddingsAsync() const int batchSize = 128; var totalBatches = (int)Math.Ceiling((double)chunksToProcess.Count / batchSize); - Console.WriteLine($"[INFO] Processing in {totalBatches} batches of {batchSize} chunks"); + logger.LogInformation("Processing in {TotalBatches} batches of {BatchSize} chunks", totalBatches, batchSize); for (int batchIndex = 0; batchIndex < totalBatches; batchIndex++) { @@ -673,11 +210,12 @@ public async Task RegenerateAllEmbeddingsAsync() var endIndex = Math.Min(startIndex + batchSize, chunksToProcess.Count); var currentBatch = chunksToProcess.Skip(startIndex).Take(endIndex - startIndex).ToList(); - Console.WriteLine($"[INFO] Processing batch {batchIndex + 1}/{totalBatches}: chunks {startIndex + 1}-{endIndex}"); + logger.LogInformation("Processing batch {BatchNumber}/{TotalBatches}: chunks {StartIndex}-{EndIndex}", + batchIndex + 1, totalBatches, startIndex + 1, endIndex); // Generate embeddings for current batch var batchContents = currentBatch.Select(c => c.Content).ToList(); - var batchEmbeddings = await TryGenerateEmbeddingsBatchAsync(batchContents); + var batchEmbeddings = await documentSearchService.GenerateEmbeddingsBatchAsync(batchContents); if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) { @@ -691,23 +229,25 @@ public async Task RegenerateAllEmbeddingsAsync() { chunk.Embedding = embedding; successCount++; - Console.WriteLine($"[DEBUG] Chunk {chunk.Id}: Batch embedding successful ({embedding.Count} dimensions)"); + logger.LogDebug("Chunk {ChunkId}: Batch embedding successful ({Dimensions} dimensions)", + chunk.Id, embedding.Count); } else { - Console.WriteLine($"[WARNING] Chunk {chunk.Id}: Batch embedding failed, trying individual generation"); + logger.LogWarning("Chunk {ChunkId}: Batch embedding failed, trying individual generation", chunk.Id); // Fallback to individual generation - var individualEmbedding = await TryGenerateEmbeddingWithFallback(chunk.Content); + var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); if (individualEmbedding != null && individualEmbedding.Count > 0) { chunk.Embedding = individualEmbedding; successCount++; - Console.WriteLine($"[DEBUG] Chunk {chunk.Id}: Individual embedding successful ({individualEmbedding.Count} dimensions)"); + logger.LogDebug("Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)", + chunk.Id, individualEmbedding.Count); } else { - Console.WriteLine($"[WARNING] Chunk {chunk.Id}: All embedding methods failed"); + logger.LogWarning("Chunk {ChunkId}: All embedding methods failed", chunk.Id); } } @@ -716,63 +256,51 @@ public async Task RegenerateAllEmbeddingsAsync() } else { - Console.WriteLine($"[WARNING] Batch {batchIndex + 1} failed, processing individually"); + logger.LogWarning("Batch {BatchNumber} failed, processing individually", batchIndex + 1); // Process chunks individually if batch fails foreach (var chunk in currentBatch) { try { - var newEmbedding = await TryGenerateEmbeddingWithFallback(chunk.Content); + var newEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); if (newEmbedding != null && newEmbedding.Count > 0) { chunk.Embedding = newEmbedding; successCount++; - Console.WriteLine($"[DEBUG] Chunk {chunk.Id}: Individual embedding successful ({newEmbedding.Count} dimensions)"); + logger.LogDebug("Chunk {ChunkId}: Individual embedding successful ({Dimensions} 
dimensions)", + chunk.Id, newEmbedding.Count); } else { - Console.WriteLine($"[WARNING] Chunk {chunk.Id}: Failed to generate embedding"); + logger.LogWarning("Chunk {ChunkId}: Failed to generate embedding", chunk.Id); } processedChunks++; } catch (Exception ex) { - Console.WriteLine($"[ERROR] Chunk {chunk.Id}: Failed to regenerate embedding: {ex.Message}"); + logger.LogError(ex, "Chunk {ChunkId}: Failed to regenerate embedding", chunk.Id); processedChunks++; } } } // Progress update - Console.WriteLine($"[INFO] Progress: {processedChunks}/{chunksToProcess.Count} chunks processed, {successCount} embeddings generated"); + logger.LogInformation("Progress: {ProcessedChunks}/{TotalChunks} chunks processed, {SuccessCount} embeddings generated", + processedChunks, chunksToProcess.Count, successCount); - // Smart rate limiting: Check if we need to wait based on VoyageAI response + // Smart rate limiting if (batchIndex < totalBatches - 1) // Don't wait after last batch { - // Check if the last batch was successful (no rate limiting) - var lastBatchSuccess = successCount > 0; // If we got embeddings, no rate limit - - if (!lastBatchSuccess) - { - // Rate limited - wait 20 seconds for 3 RPM - Console.WriteLine($"[INFO] Rate limit detected, waiting 20 seconds for 3 RPM limit..."); - await Task.Delay(20000); - } - else - { - // No rate limit - continue at full speed (2000 RPM) - Console.WriteLine($"[INFO] No rate limit detected, continuing at full speed (2000 RPM)"); - // No delay needed for 2000 RPM - } + await Task.Delay(1000); // Simple rate limiting } } // Save all documents with updated embeddings var documentsToUpdate = documentChunkMap.Values.Distinct().ToList(); - Console.WriteLine($"[INFO] Saving {documentsToUpdate.Count} documents with updated embeddings..."); + logger.LogInformation("Saving {DocumentCount} documents with updated embeddings...", documentsToUpdate.Count); foreach (var document in documentsToUpdate) { @@ -780,775 +308,14 @@ public async Task RegenerateAllEmbeddingsAsync() await documentRepository.AddAsync(document); } - Console.WriteLine($"[INFO] Embedding regeneration completed. {successCount} embeddings generated for {processedChunks} chunks in {totalBatches} batches."); + logger.LogInformation("Embedding regeneration completed. 
{SuccessCount} embeddings generated for {ProcessedChunks} chunks in {TotalBatches} batches.", + successCount, processedChunks, totalBatches); return successCount > 0; } catch (Exception ex) { - Console.WriteLine($"[ERROR] Failed to regenerate embeddings: {ex.Message}"); - return false; - } - } - - public async Task GenerateRagAnswerAsync(string query, int maxResults = 5) - { - if (string.IsNullOrWhiteSpace(query)) - throw new ArgumentException("Query cannot be empty", nameof(query)); - - // Try EnhancedSearchService first - try - { - var enhancedSearchService = new EnhancedSearchService(aiProviderFactory, documentRepository, configuration); - var enhancedResponse = await enhancedSearchService.MultiStepRAGAsync(query, maxResults); - - if (enhancedResponse != null && !string.IsNullOrEmpty(enhancedResponse.Answer)) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService RAG successful, using enhanced response"); - return enhancedResponse; - } - } - catch (Exception ex) - { - Console.WriteLine($"[WARNING] EnhancedSearchService RAG failed: {ex.Message}, falling back to basic RAG"); - } - - // Fallback to basic RAG implementation - return await GenerateBasicRagAnswerAsync(query, maxResults); - } - - /// - /// Basic RAG implementation when Semantic Kernel is not available - /// - private async Task GenerateBasicRagAnswerAsync(string query, int maxResults = 5) - { - // Get all documents for cross-document analysis - var allDocuments = await GetAllDocumentsAsync(); - - // Cross-document detection - var isCrossDocument = IsCrossDocumentQueryAsync(query, allDocuments); - - List relevantChunks; - - // Increase maxResults for better document coverage, but respect user's maxResults - var adjustedMaxResults = maxResults == 1 ? 1 : Math.Max(maxResults * 2, 5); // Respect maxResults=1, otherwise reasonable increase - - if (isCrossDocument) - { - relevantChunks = await PerformCrossDocumentSearchAsync(query, allDocuments, adjustedMaxResults); - } - else - { - relevantChunks = await PerformStandardSearchAsync(query, adjustedMaxResults); - } - - // Optimize context assembly: combine chunks intelligently - var contextMaxResults = isCrossDocument ? Math.Max(maxResults, 3) : maxResults; - - var optimizedChunks = OptimizeContextWindow(relevantChunks, contextMaxResults, query); - - var documentIdToName = new Dictionary(); - - foreach (var docId in optimizedChunks.Select(c => c.DocumentId).Distinct()) - { - var doc = await GetDocumentAsync(docId); - if (doc != null) - { - documentIdToName[docId] = doc.FileName; - } - } - - // Create enhanced context with metadata for better AI understanding - var enhancedContext = new List(); - - foreach (var chunk in optimizedChunks.OrderByDescending(c => c.RelevanceScore ?? 0.0)) - { - var docName = documentIdToName.TryGetValue(chunk.DocumentId, out var name) ? name : "Document"; - var relevance = chunk.RelevanceScore ?? 0.0; - var chunkInfo = $"[Document: {docName}, Relevance: {relevance:F3}, Chunk: {chunk.ChunkIndex}]\n{chunk.Content}"; - enhancedContext.Add(chunkInfo); - } - - var contextText = string.Join("\n\n---\n\n", enhancedContext); - - // Generate RAG answer using AI with enhanced prompt - var prompt = isCrossDocument - ? $"You are a precise information retrieval system. Analyze the following context and answer the query step by step.\n\nQuery: {query}\n\nContext:\n{contextText}\n\nInstructions:\n1. Extract specific facts from the context\n2. Answer each part of the query separately\n3. 
If information is missing, state 'This information is not available in the provided documents'\n4. Use exact quotes from context when possible\n\nAnswer:" - : $"You are a precise information retrieval system. Analyze the following context and answer the question step by step.\n\nQuestion: {query}\n\nContext:\n{contextText}\n\nInstructions:\n1. Extract specific facts from the context\n2. Answer each part of the question separately\n3. If information is missing, state 'This information is not available in the provided documents'\n4. Use exact quotes from context when possible\n\nAnswer:"; - - var answer = await aiService.GenerateResponseAsync(prompt, enhancedContext); - - var sources = optimizedChunks.Select(c => new SearchSource - { - DocumentId = c.DocumentId, - FileName = documentIdToName.TryGetValue(c.DocumentId, out var name) ? name : "Document", - RelevantContent = c.Content, - RelevanceScore = c.RelevanceScore ?? 0.0 - }).ToList(); - - return new RagResponse - { - Query = query, - Answer = answer, - Sources = sources, - SearchedAt = DateTime.UtcNow, - Configuration = GetRagConfiguration() - }; - } - - /// - /// Applies advanced re-ranking algorithm to improve chunk selection - /// - private static List ApplyReranking(List chunks, string query, int maxResults) - { - if (chunks.Count == 0) - return chunks; - - var queryKeywords = ExtractKeywords(query.ToLowerInvariant()); - var queryLength = query.Length; - var documentIds = chunks.Select(c => c.DocumentId).Distinct().ToList(); - - // Enhanced scoring algorithm - foreach (var chunk in chunks) - { - var originalScore = chunk.RelevanceScore ?? 0.0; - var enhancedScore = originalScore; - - // Factor 1: Exact keyword matching boost (CRITICAL!) - var chunkContent = chunk.Content.ToLowerInvariant(); - var exactMatches = 0; - - // Use cleaned keywords (noise/punctuation-safe) - var cleanedQueryKeywords = ExtractKeywords(query.ToLowerInvariant()); - foreach (var kw in cleanedQueryKeywords) - { - if (kw.Length > 2 && chunkContent.Contains(kw)) - { - exactMatches++; - } - } - - if (cleanedQueryKeywords.Count > 0) - { - var exactMatchRatio = (double)exactMatches / cleanedQueryKeywords.Count; - enhancedScore += exactMatchRatio * 0.6; // 60% boost for exact matches! 
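                    // Illustrative walk-through (editor's sketch, values assumed, not from
                    // this patch): a chunk whose vector score is 0.50 and which contains
                    // 3 of 4 cleaned query keywords gains (3.0 / 4.0) * 0.6 = 0.45 here,
                    // reaching 0.95 before the density, length, position, proximity and
                    // diversity factors below are added and Math.Min(1.0, enhancedScore)
                    // finally caps the result.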
- } - - // Additional keyword density boost - var chunkKeywords = ExtractKeywords(chunkContent); - var commonKeywords = queryKeywords.Intersect(chunkKeywords, StringComparer.OrdinalIgnoreCase).Count(); - - if (queryKeywords.Count > 0) - { - var keywordDensity = (double)commonKeywords / queryKeywords.Count; - enhancedScore += keywordDensity * 0.2; // 20% boost for keyword matches - } - - // Generic content relevance boost - var contentBoost = 0.0; - - // Boost for query term matches in content - var queryTermMatches = queryKeywords.Count(term => chunkContent.Contains(term, StringComparison.OrdinalIgnoreCase)); - contentBoost += Math.Min(0.3, queryTermMatches * 0.1); // Max 30% boost - - enhancedScore += contentBoost; - - // Factor 2: Content length optimization (not too short, not too long) - var contentLength = chunk.Content.Length; - var optimalLength = Math.Min(800, Math.Max(200, queryLength * 10)); // Dynamic optimal length - var lengthScore = 1.0 - Math.Abs(contentLength - optimalLength) / (double)optimalLength; - enhancedScore += Math.Max(0, lengthScore * 0.15); // 15% boost for optimal length - - // Factor 3: Position in document (earlier chunks often more important) - var positionBoost = Math.Max(0, 1.0 - (chunk.ChunkIndex * 0.05)); // Decrease by 5% per chunk - enhancedScore += positionBoost * 0.1; // 10% boost for position - - // Factor 4: Query term proximity (how close query terms are in content) - var proximityScore = CalculateTermProximity(chunk.Content, queryKeywords); - enhancedScore += proximityScore * 0.2; // 20% boost for proximity - - // Factor 5: Document diversity boost (NEW!) - var documentDiversityBoost = CalculateDocumentDiversityBoost(chunk.DocumentId, documentIds, chunks); - enhancedScore += documentDiversityBoost * 0.15; // 15% boost for diversity - - chunk.RelevanceScore = Math.Min(1.0, enhancedScore); // Cap at 1.0 - } - - return chunks; - } - - /// - /// Calculates how close query terms are to each other in the content - /// - private static double CalculateTermProximity(string content, List queryTerms) - { - if (queryTerms.Count == 0) return 0.0; - - var contentLower = content.ToLowerInvariant(); - var termPositions = new List(); - - foreach (var term in queryTerms) - { - var index = contentLower.IndexOf(term, StringComparison.OrdinalIgnoreCase); - if (index >= 0) - { - termPositions.Add(index); - } - } - - if (termPositions.Count < 2) return termPositions.Count > 0 ? 
0.5 : 0.0; - - // Calculate average distance between terms - termPositions.Sort(); - var totalDistance = 0; - for (int i = 1; i < termPositions.Count; i++) - { - totalDistance += termPositions[i] - termPositions[i - 1]; - } - - var averageDistance = totalDistance / (termPositions.Count - 1); - // Closer terms = higher score (inverse relationship) - return Math.Max(0, 1.0 - averageDistance / 200.0); // Normalize by 200 characters - } - - /// - /// Applies diversity selection to avoid too many chunks from same document - /// - private static List ApplyDiversityAndSelect(List chunks, int maxResults) - { - if (chunks.Count == 0) return new List(); - - var uniqueDocumentIds = chunks.Select(c => c.DocumentId).Distinct().ToList(); - - Console.WriteLine($"[DEBUG] ApplyDiversityAndSelect: Total chunks: {chunks.Count}, Unique documents: {uniqueDocumentIds.Count}"); - Console.WriteLine($"[DEBUG] Document IDs: {string.Join(", ", uniqueDocumentIds.Take(5))}"); - - // Calculate min chunks per document - respect maxResults constraint - var minChunksPerDocument = Math.Max(1, Math.Min(2, Math.Max(1, maxResults / uniqueDocumentIds.Count))); // Min 1, Max 2 - var maxChunksPerDocument = Math.Min(maxResults, Math.Max(minChunksPerDocument, 2)); // Don't exceed maxResults - - Console.WriteLine($"[DEBUG] Min chunks per doc: {minChunksPerDocument}, Max chunks per doc: {maxChunksPerDocument}"); - - var selectedChunks = new List(); - var documentChunkCounts = new Dictionary(); - - // First pass: ensure minimum representation from each document, but respect maxResults - var totalSelected = 0; - foreach (var documentId in uniqueDocumentIds) - { - if (totalSelected >= maxResults) break; // Stop if we've reached maxResults - - var availableChunks = chunks.Where(c => c.DocumentId == documentId).ToList(); - var actualMinChunks = Math.Min(minChunksPerDocument, availableChunks.Count); - - // Don't exceed maxResults - var availableSlots = maxResults - totalSelected; - actualMinChunks = Math.Min(actualMinChunks, availableSlots); - - var documentChunks = availableChunks - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .Take(actualMinChunks) - .ToList(); - - Console.WriteLine($"[DEBUG] Document {documentId}: Available {availableChunks.Count}, Selected {documentChunks.Count} chunks (requested min: {minChunksPerDocument}, actual min: {actualMinChunks})"); - - selectedChunks.AddRange(documentChunks); - documentChunkCounts[documentId] = documentChunks.Count; - totalSelected += documentChunks.Count; - } - - // Second pass: fill remaining slots with best remaining chunks, but respect maxResults - var remainingSlots = maxResults - selectedChunks.Count; - if (remainingSlots > 0) - { - var remainingChunks = chunks.Except(selectedChunks) - .OrderByDescending(c => c.RelevanceScore ?? 
0.0) - .ToList(); - - foreach (var chunk in remainingChunks) - { - if (remainingSlots <= 0 || selectedChunks.Count >= maxResults) break; - - var currentCount = documentChunkCounts.GetValueOrDefault(chunk.DocumentId, 0); - if (currentCount < maxChunksPerDocument) - { - selectedChunks.Add(chunk); - documentChunkCounts[chunk.DocumentId] = currentCount + 1; - remainingSlots--; - } - } - } - - // Ensure we don't exceed maxResults - var finalResult = selectedChunks.Take(maxResults).ToList(); - - Console.WriteLine($"[DEBUG] Final result: {finalResult.Count} chunks from {finalResult.Select(c => c.DocumentId).Distinct().Count()} documents (maxResults requested: {maxResults})"); - - return finalResult; - } - - /// - /// Performs fuzzy search with typo tolerance - /// - private async Task> PerformFuzzySearch(string query, int maxResults) - { - var fuzzyResults = new List(); - - try - { - var allDocs = await documentRepository.GetAllAsync(); - var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries); - - foreach (var doc in allDocs) - { - foreach (var chunk in doc.Chunks) - { - var chunkContent = chunk.Content.ToLowerInvariant(); - var chunkWords = chunkContent.Split(' ', StringSplitOptions.RemoveEmptyEntries); - - var fuzzyScore = 0.0; - var matchedWords = 0; - - foreach (var queryWord in queryWords) - { - if (queryWord.Length < 3) continue; // Skip very short words - - var bestMatch = 0.0; - foreach (var chunkWord in chunkWords) - { - if (chunkWord.Length < 3) continue; - - // Calculate similarity - var similarity = CalculateStringSimilarity(queryWord, chunkWord); - if (similarity > bestMatch) - { - bestMatch = similarity; - } - } - - // If similarity is above threshold, count as match - if (bestMatch >= 0.7) // 70% similarity threshold - { - fuzzyScore += bestMatch; - matchedWords++; - } - } - - // Calculate final fuzzy score - if (matchedWords > 0) - { - var finalScore = (fuzzyScore / queryWords.Length) * 0.8; // Fuzzy matches get 80% of perfect score - chunk.RelevanceScore = finalScore; - fuzzyResults.Add(chunk); - } - } - } - - return fuzzyResults - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .Take(maxResults) - .ToList(); - } - catch - { - return fuzzyResults; - } - } - - /// - /// Calculates string similarity using Levenshtein distance - /// - private static double CalculateStringSimilarity(string s1, string s2) - { - if (string.IsNullOrEmpty(s1) || string.IsNullOrEmpty(s2)) - return 0.0; - - if (s1 == s2) - return 1.0; - - var longer = s1.Length > s2.Length ? s1 : s2; - var shorter = s1.Length > s2.Length ? s2 : s1; - - var editDistance = LevenshteinDistance(longer, shorter); - return (longer.Length - editDistance) / (double)longer.Length; - } - - /// - /// Calculates Levenshtein distance between two strings - /// - private static int LevenshteinDistance(string s1, string s2) - { - var len1 = s1.Length; - var len2 = s2.Length; - var matrix = new int[len1 + 1, len2 + 1]; - - for (int i = 0; i <= len1; i++) - matrix[i, 0] = i; - - for (int j = 0; j <= len2; j++) - matrix[0, j] = j; - - for (int i = 1; i <= len1; i++) - { - for (int j = 1; j <= len2; j++) - { - var cost = s1[i - 1] == s2[j - 1] ? 
0 : 1; - matrix[i, j] = Math.Min( - Math.Min(matrix[i - 1, j] + 1, matrix[i, j - 1] + 1), - matrix[i - 1, j - 1] + cost); - } - } - - return matrix[len1, len2]; - } - - /// - /// Extracts key words from query for additional search terms - /// - private static List ExtractKeywords(string query) - { - var stopWords = new HashSet { "ne", "nedir", "nasıl", "hangi", "kim", "nerede", "ne zaman", "neden", - "what", "how", "where", "when", "why", "who", "which", "is", "are", "the", "a", "an" }; - - var words = query.ToLowerInvariant() - .Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2 && !stopWords.Contains(w)) - .ToList(); - - return words; - } - - /// - /// Optimizes context window by intelligently selecting and combining chunks - /// - private static List OptimizeContextWindow(List chunks, int maxResults, string query) - { - if (chunks.Count == 0) return new List(); - - // Group chunks by document for better context - var documentGroups = chunks.GroupBy(c => c.DocumentId).ToList(); - - var finalChunks = new List(); - var remainingSlots = maxResults; - - // Build keyword list from query - var queryKeywords = ExtractKeywords(query.ToLowerInvariant()); - var targetKeywords = new HashSet(queryKeywords, StringComparer.OrdinalIgnoreCase); - - // Process each document group - foreach (var group in documentGroups.OrderByDescending(g => g.Max(c => c.RelevanceScore ?? 0.0))) - { - if (remainingSlots <= 0) break; - - // Prefer domain keyword matches within the document if available - var domainMatched = group - .Select(c => new { Chunk = c, Text = c.Content.ToLowerInvariant() }) - .Where(x => targetKeywords.Any(k => x.Text.Contains(k))) - .Select(x => x.Chunk) - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .FirstOrDefault(); - - var bestChunk = domainMatched ?? group.OrderByDescending(c => c.RelevanceScore ?? 0.0).First(); - finalChunks.Add(bestChunk); - remainingSlots--; - - // Add additional chunks if slots remain - if (remainingSlots > 0) - { - // Bring in other domain matches first, then top by relevance - var domainExtras = group - .Where(c => !ReferenceEquals(c, bestChunk)) - .Select(c => new { Chunk = c, Text = c.Content.ToLowerInvariant() }) - .Where(x => targetKeywords.Any(k => x.Text.Contains(k))) - .Select(x => x.Chunk) - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .ToList(); - - var nonDomainExtras = group - .Where(c => !ReferenceEquals(c, bestChunk) && !domainExtras.Contains(c)) - .OrderByDescending(c => c.RelevanceScore ?? 
0.0) - .ToList(); - - var extras = domainExtras.Concat(nonDomainExtras) - .Take(Math.Min(remainingSlots, 3)) // allow up to 3 extras per doc to improve coverage - .ToList(); - - finalChunks.AddRange(extras); - remainingSlots -= extras.Count; - } - } - - return finalChunks; - } - - /// - /// Detects if query requires information from multiple documents - /// - private static bool IsCrossDocumentQueryAsync(string query, List allDocuments) - { - if (allDocuments.Count <= 1) + logger.LogError(ex, "Failed to regenerate embeddings"); return false; - - // Extract topics from query - var queryTopics = ExtractQueryTopics(query); - - var relevantDocs = 0; - var isCrossDocument = false; - - foreach (var doc in allDocuments) - { - var docTopics = ExtractDocumentTopics(doc); - var matchCount = CalculateTopicMatches(queryTopics, docTopics); - - if (matchCount > 0) - { - relevantDocs++; - if (relevantDocs > 1) - { - isCrossDocument = true; - break; - } - } - } - - return isCrossDocument; - } - - /// - /// Calculates topic matches using flexible matching strategies - /// - private static int CalculateTopicMatches(List queryTopics, List docTopics) - { - var matchCount = 0.0; - - foreach (var queryTopic in queryTopics) - { - // Strategy 1: Exact match - if (docTopics.Contains(queryTopic, StringComparer.OrdinalIgnoreCase)) - { - matchCount += 2.0; // Higher weight for exact matches - continue; - } - - // Strategy 2: Contains match (partial match) - var containsMatch = docTopics.Any(dt => - dt.Contains(queryTopic, StringComparison.OrdinalIgnoreCase) || - queryTopic.Contains(dt, StringComparison.OrdinalIgnoreCase)); - - if (containsMatch) - { - matchCount += 1.0; // Lower weight for partial matches - continue; - } - - // Strategy 3: Word-level match - var queryWords = queryTopic.Split(' ', StringSplitOptions.RemoveEmptyEntries); - var docWords = docTopics.SelectMany(dt => dt.Split(' ', StringSplitOptions.RemoveEmptyEntries)).ToList(); - - var wordMatches = queryWords.Count(qw => - docWords.Any(dw => - dw.Contains(qw, StringComparison.OrdinalIgnoreCase) || - qw.Contains(dw, StringComparison.OrdinalIgnoreCase))); - - if (wordMatches > 0) - { - matchCount += wordMatches * 0.5; // Partial weight for word matches - } - } - - return (int)Math.Round(matchCount); - } - - /// - /// Extracts main topics from user query - /// - private static List ExtractQueryTopics(string query) - { - var topics = new List(); - var words = query.ToLowerInvariant() - .Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) // Filter out very short words - .ToList(); - - // Add single words - topics.AddRange(words); - - // Add bigrams (2-word combinations) - for (int i = 0; i < words.Count - 1; i++) - { - topics.Add($"{words[i]} {words[i + 1]}"); - } - - // Add trigrams (3-word combinations) for better coverage - for (int i = 0; i < words.Count - 2; i++) - { - topics.Add($"{words[i]} {words[i + 1]} {words[i + 2]}"); } - - // Add individual important words with higher priority - var importantWords = words.Where(w => w.Length > 4).ToList(); - topics.AddRange(importantWords); - - return topics.Distinct().ToList(); - } - - /// - /// Extracts main topics from document content - /// - private static List ExtractDocumentTopics(Document document) - { - var topics = new HashSet(); - var content = document.Content.ToLowerInvariant(); - - // Extract key phrases from document - increase coverage - var sentences = content.Split(['.', '!', '?'], StringSplitOptions.RemoveEmptyEntries); - - // Process more sentences for better topic 
coverage - foreach (var sentence in sentences.Take(20)) // Increased from 10 to 20 - { - var sentenceWords = sentence.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) - .ToList(); - - // Add single words - topics.UnionWith(sentenceWords); - - // Add bigrams - for (int i = 0; i < sentenceWords.Count - 1; i++) - { - topics.Add($"{sentenceWords[i]} {sentenceWords[i + 1]}"); - } - - // Add trigrams - for (int i = 0; i < sentenceWords.Count - 2; i++) - { - topics.Add($"{sentenceWords[i]} {sentenceWords[i + 1]} {sentenceWords[i + 2]}"); - } - } - - // Also extract from chunk content for better coverage - foreach (var chunk in document.Chunks.Take(10)) - { - var chunkWords = chunk.Content.ToLowerInvariant() - .Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) - .ToList(); - - topics.UnionWith(chunkWords); - } - - return topics.Take(50).ToList(); // Increased from 20 to 50 - } - - /// - /// Performs cross-document search with enhanced diversity - /// - private async Task> PerformCrossDocumentSearchAsync(string query, List allDocuments, int maxResults) - { - var adjustedMaxResults = maxResults == 1 ? 1 : Math.Max(maxResults, 3); // Respect maxResults=1, otherwise minimum 3 - - // Direct search with original query for cross-document - var searchResults = Math.Max(adjustedMaxResults * 3, options.MaxSearchResults); - var allChunks = await SearchDocumentsAsync(query, searchResults); - - // Remove duplicates and keep highest score - var uniqueChunks = allChunks - .GroupBy(c => c.Id) - .Select(g => g.OrderByDescending(c => c.RelevanceScore ?? 0.0).First()) - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .ToList(); - - var rerankedChunks = DocumentService.ApplyReranking(uniqueChunks, query, searchResults); - var finalChunks = DocumentService.ApplyDiversityAndSelect(rerankedChunks, adjustedMaxResults); - - return finalChunks; - } - - /// - /// Performs standard single-document search - /// - private async Task> PerformStandardSearchAsync(string query, int maxResults) - { - // Direct search with original query (more reliable) - var searchResults = Math.Max(maxResults * 2, options.MaxSearchResults); - var allRelevantChunks = await SearchDocumentsAsync(query, searchResults); - - // Remove duplicates and keep highest score - var uniqueChunks = allRelevantChunks - .GroupBy(c => c.Id) - .Select(g => g.OrderByDescending(c => c.RelevanceScore ?? 0.0).First()) - .OrderByDescending(c => c.RelevanceScore ?? 
0.0) - .ToList(); - - // Apply advanced re-ranking algorithm - var rerankedChunks = DocumentService.ApplyReranking(uniqueChunks, query, searchResults); - - // Apply standard diversity selection - return DocumentService.ApplyDiversityAndSelect(rerankedChunks, maxResults); - } - - /// - /// Get current RAG configuration dynamically from Program.cs and appsettings.json - /// - private RagConfiguration GetRagConfiguration() - { - // Read from Program.cs configuration and appsettings.json - var currentProvider = GetCurrentAIProviderFromConfig(); - - return new RagConfiguration - { - AIProvider = currentProvider, - StorageProvider = GetCurrentStorageProviderFromConfig(), - Model = GetCurrentModelFromConfig(currentProvider) - }; - } - - /// - /// Get current AI provider from SmartRagOptions configuration - /// - private string GetCurrentAIProviderFromConfig() - { - // Use the configured AI provider from SmartRagOptions - return options.AIProvider.ToString(); - } - - /// - /// Get current storage provider from SmartRagOptions configuration - /// - private string GetCurrentStorageProviderFromConfig() - { - // Use the configured storage provider from SmartRagOptions - return options.StorageProvider.ToString(); - } - - /// - /// Get current model from configuration based on provider - /// - private string GetCurrentModelFromConfig(string provider) - { - // Dynamically build configuration key from provider name - var configKey = $"AI:{provider}:Model"; - return configuration[configKey] ?? "model-not-configured"; - } - - /// - /// Calculates diversity boost to encourage selection from different documents - /// - private static double CalculateDocumentDiversityBoost(Guid documentId, List allDocumentIds, List allChunks) - { - if (allDocumentIds.Count <= 1) return 0.0; - - // Calculate how many chunks we already have from this document - var chunksFromThisDoc = allChunks.Count(c => c.DocumentId == documentId); - var totalChunks = allChunks.Count; - - // If this document is underrepresented, give it a boost - var expectedChunksPerDoc = (double)totalChunks / allDocumentIds.Count; - var representationRatio = chunksFromThisDoc / expectedChunksPerDoc; - - // Boost underrepresented documents - if (representationRatio < 0.8) return 0.3; // 30% boost - if (representationRatio < 1.0) return 0.15; // 15% boost - - return 0.0; // No boost for overrepresented documents } } \ No newline at end of file diff --git a/src/SmartRAG/Services/EnhancedSearchService.cs b/src/SmartRAG/Services/EnhancedSearchService.cs deleted file mode 100644 index abd57a7..0000000 --- a/src/SmartRAG/Services/EnhancedSearchService.cs +++ /dev/null @@ -1,739 +0,0 @@ -using Microsoft.Extensions.Configuration; -using SmartRAG.Entities; -using SmartRAG.Enums; -using SmartRAG.Interfaces; -using SmartRAG.Models; - -namespace SmartRAG.Services; - -/// -/// Enhanced search service using configured AI provider (Anthropic) with Redis storage -/// -public class EnhancedSearchService -{ - private readonly IAIProviderFactory _aiProviderFactory; - private readonly IDocumentRepository _documentRepository; - private readonly IConfiguration _configuration; - - public EnhancedSearchService( - IAIProviderFactory aiProviderFactory, - IDocumentRepository documentRepository, - IConfiguration configuration) - { - _aiProviderFactory = aiProviderFactory; - _documentRepository = documentRepository; - _configuration = configuration; - } - - /// - /// Simple semantic search using configured AI provider (Anthropic) - /// - public async Task> 
EnhancedSemanticSearchAsync(string query, int maxResults = 5) - { - try - { - var allDocuments = await _documentRepository.GetAllAsync(); - var allChunks = allDocuments.SelectMany(d => d.Chunks).ToList(); - - Console.WriteLine($"[DEBUG] EnhancedSearchService: Searching in {allDocuments.Count} documents with {allChunks.Count} chunks"); - - // Use configured AI provider (Anthropic) - var anthropicConfig = _configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.ApiKey)) - { - Console.WriteLine($"[ERROR] Anthropic configuration not found"); - return await FallbackSearchAsync(query, maxResults); - } - - var aiProvider = _aiProviderFactory.CreateProvider(AIProvider.Anthropic); - - // Create simple search prompt - var searchPrompt = $@"You are a search assistant. Find the most relevant document chunks for this query. - -Query: {query} - -Available chunks (showing first 200 characters of each): -{string.Join("\n\n", allChunks.Select((c, i) => $"Chunk {i}: {c.Content.Substring(0, Math.Min(200, c.Content.Length))}..."))} - -Instructions: -1. Look for chunks that contain information related to the query -2. Focus on key names, dates, companies, and facts mentioned in the query -3. Return ONLY the chunk numbers (0, 1, 2, etc.) that are relevant, separated by commas - -Return format: 0,3,7 (chunk numbers, not IDs)"; - - // Try with retry logic for rate limiting - string aiResponse = null; - var maxRetries = 3; - var retryDelayMs = 2000; // Start with 2 seconds - - for (int attempt = 0; attempt < maxRetries; attempt++) - { - try - { - aiResponse = await aiProvider.GenerateTextAsync(searchPrompt, anthropicConfig); - break; // Success, exit retry loop - } - catch (Exception ex) when (ex.Message.Contains("TooManyRequests") || ex.Message.Contains("rate limit")) - { - if (attempt < maxRetries - 1) - { - var delay = retryDelayMs * (int)Math.Pow(2, attempt); // Exponential backoff - Console.WriteLine($"[DEBUG] EnhancedSearchService: Rate limited by Anthropic, retrying in {delay}ms (attempt {attempt + 1}/{maxRetries})"); - await Task.Delay(delay); - } - else - { - Console.WriteLine($"[DEBUG] EnhancedSearchService: Anthropic rate limited after {maxRetries} attempts, using fallback"); - throw; // Re-throw to use fallback - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService: Anthropic failed with error: {ex.Message}"); - throw; // Re-throw to use fallback - } - } - - if (!string.IsNullOrEmpty(aiResponse)) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService: AI response: {aiResponse}"); - - // Parse AI response and return relevant chunks - var parsedResults = ParseAISearchResults(aiResponse, allChunks, maxResults, query); - - if (parsedResults.Count > 0) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService: Successfully parsed {parsedResults.Count} chunks"); - return parsedResults; - } - - Console.WriteLine($"[DEBUG] EnhancedSearchService: Failed to parse results, using fallback"); - } - } - catch (Exception ex) - { - Console.WriteLine($"[ERROR] EnhancedSearchService failed: {ex.Message}, using fallback"); - } - - // Fallback to basic search - return await FallbackSearchAsync(query, maxResults); - } - - /// - /// Simple RAG using configured AI provider (Anthropic) - /// - public async Task MultiStepRAGAsync(string query, int maxResults = 5) - { - try - { - // Check if this is a general conversation query - if (IsGeneralConversationQuery(query)) - { - Console.WriteLine($"[DEBUG] MultiStepRAGAsync: Detected general 
conversation query: '{query}'"); - var chatResponse = await HandleGeneralConversationAsync(query); - - return new RagResponse - { - Query = query, - Answer = chatResponse, - Sources = new List(), // No sources for chat - SearchedAt = DateTime.UtcNow, - Configuration = new RagConfiguration - { - //AIProvider = "Anthropic", - //StorageProvider = "Chat Mode", - //Model = "Claude + Chat" - } - }; - } - - // Step 1: Simple search for document-related queries - var relevantChunks = await EnhancedSemanticSearchAsync(query, maxResults); - - if (relevantChunks.Count == 0) - { - // Last resort: basic keyword search - relevantChunks = await FallbackSearchAsync(query, maxResults); - } - - // Step 2: Answer Generation using Anthropic - var answer = await GenerateAnswerWithAnthropic(query, relevantChunks); - - // Step 3: Simple Source Attribution - var sources = relevantChunks.Select(c => new SearchSource - { - DocumentId = c.DocumentId, - FileName = "Document", - RelevantContent = c.Content.Substring(0, Math.Min(200, c.Content.Length)), - RelevanceScore = c.RelevanceScore ?? 0.0 - }).ToList(); - - return new RagResponse - { - Query = query, - Answer = answer, - Sources = sources, - SearchedAt = DateTime.UtcNow, - Configuration = new RagConfiguration - { - //AIProvider = "Anthropic", - //StorageProvider = "Redis", - //Model = "Claude + VoyageAI" - } - }; - } - catch (Exception ex) - { - throw new InvalidOperationException($"RAG failed: {ex.Message}", ex); - } - } - - /// - /// Generate answer using Anthropic - /// - private async Task GenerateAnswerWithAnthropic(string query, List context) - { - try - { - var anthropicConfig = _configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.ApiKey)) - { - throw new InvalidOperationException("Anthropic configuration not found"); - } - - var aiProvider = _aiProviderFactory.CreateProvider(AIProvider.Anthropic); - - var contextText = string.Join("\n\n---\n\n", - context.Select(c => $"[Document Chunk]\n{c.Content}")); - - var prompt = $@"You are a helpful AI assistant. Answer the user's question based on the provided context. - -Question: {query} - -Context: -{contextText} - -Instructions: -1. Answer the question comprehensively using information from the context -2. If information is missing, state it clearly -3. Provide structured, easy-to-understand response in the same language as the question -4. 
Cite specific parts of the context when possible - -Answer:"; - - // Try with retry logic for rate limiting - var maxRetries = 3; - var retryDelayMs = 2000; // Start with 2 seconds - - for (int attempt = 0; attempt < maxRetries; attempt++) - { - try - { - return await aiProvider.GenerateTextAsync(prompt, anthropicConfig); - } - catch (Exception ex) when (ex.Message.Contains("TooManyRequests") || ex.Message.Contains("rate limit")) - { - if (attempt < maxRetries - 1) - { - var delay = retryDelayMs * (int)Math.Pow(2, attempt); // Exponential backoff - Console.WriteLine($"[DEBUG] GenerateAnswerWithAnthropic: Rate limited, retrying in {delay}ms (attempt {attempt + 1}/{maxRetries})"); - await Task.Delay(delay); - } - else - { - Console.WriteLine($"[DEBUG] GenerateAnswerWithAnthropic: Rate limited after {maxRetries} attempts"); - throw; // Re-throw to use fallback - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] GenerateAnswerWithAnthropic: Failed with error: {ex.Message}"); - throw; // Re-throw to use fallback - } - } - - throw new InvalidOperationException("Unexpected error in retry loop"); - } - catch (Exception ex) - { - Console.WriteLine($"[ERROR] Failed to generate answer: {ex.Message}"); - return "Sorry, unable to generate answer. Please try again."; - } - } - - /// - /// Parse AI search results from AI provider response - /// - private static List ParseAISearchResults(string response, List allChunks, int maxResults, string query) - { - try - { - Console.WriteLine($"[DEBUG] ParseAISearchResults: Raw response: '{response}'"); - - // Try to parse chunk numbers from response - var chunkNumbers = response.Split(',') - .Select(s => s.Trim()) - .Where(s => !string.IsNullOrEmpty(s)) - .Select(s => int.TryParse(s, out var num) ? num : -1) - .Where(num => num >= 0 && num < allChunks.Count) - .Take(maxResults) - .ToList(); - - Console.WriteLine($"[DEBUG] ParseAISearchResults: Parsed chunk numbers: {string.Join(", ", chunkNumbers)}"); - - var results = new List(); - - foreach (var number in chunkNumbers) - { - if (number >= 0 && number < allChunks.Count) - { - var chunk = allChunks[number]; - results.Add(chunk); - Console.WriteLine($"[DEBUG] ParseAISearchResults: Found chunk {number} from document {chunk.DocumentId}"); - } - } - - if (results.Count > 0) - { - Console.WriteLine($"[DEBUG] ParseAISearchResults: Successfully parsed {results.Count} chunks"); - return results; - } - - Console.WriteLine($"[DEBUG] ParseAISearchResults: No chunks parsed"); - } - catch (Exception ex) - { - Console.WriteLine($"[WARNING] ParseAISearchResults failed: {ex.Message}"); - } - - return new List(); - } - - /// - /// Fallback search when AI search fails - /// - private async Task> FallbackSearchAsync(string query, int maxResults) - { - var allDocuments = await _documentRepository.GetAllAsync(); - var allChunks = allDocuments.SelectMany(d => d.Chunks).ToList(); - - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Searching in {allDocuments.Count} documents with {allChunks.Count} chunks"); - - // Try embedding-based search first if available - try - { - var embeddingResults = await TryEmbeddingBasedSearchAsync(query, allChunks, maxResults); - if (embeddingResults.Count > 0) - { - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Embedding search successful, found {embeddingResults.Count} chunks"); - return embeddingResults; - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Embedding search failed: {ex.Message}, using keyword search"); - } - - // Enhanced keyword-based 
fallback for global content - var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) - .ToList(); - - // Extract potential names from ORIGINAL query (not lowercase) - language agnostic - var potentialNames = query.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2 && char.IsUpper(w[0])) - .ToList(); - - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Query words: [{string.Join(", ", queryWords)}]"); - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Potential names: [{string.Join(", ", potentialNames)}]"); - - var scoredChunks = allChunks.Select(chunk => - { - var score = 0.0; - var content = chunk.Content.ToLowerInvariant(); - - // Special handling for names like "John Smith" - HIGHEST PRIORITY (language agnostic) - if (potentialNames.Count >= 2) - { - var fullName = string.Join(" ", potentialNames); - if (ContainsNormalizedName(content, fullName)) - { - score += 200.0; // Very high weight for full name matches - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Found FULL NAME match: '{fullName}' in chunk: {chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}..."); - } - else if (potentialNames.Any(name => ContainsNormalizedName(content, name))) - { - score += 100.0; // High weight for partial name matches - var foundNames = potentialNames.Where(name => ContainsNormalizedName(content, name)).ToList(); - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Found PARTIAL name matches: [{string.Join(", ", foundNames)}] in chunk: {chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}..."); - } - } - - // Exact word matches - foreach (var word in queryWords) - { - if (content.Contains(word, StringComparison.OrdinalIgnoreCase)) - score += 2.0; // Higher weight for word matches - } - - // Phrase matches (for longer queries) - var queryPhrases = query.ToLowerInvariant().Split('.', '?', '!') - .Where(p => p.Length > 5) - .ToList(); - - foreach (var phrase in queryPhrases) - { - var phraseWords = phrase.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 3) - .ToList(); - - if (phraseWords.Count >= 2) - { - var phraseText = string.Join(" ", phraseWords); - if (content.Contains(phraseText, StringComparison.OrdinalIgnoreCase)) - score += 10.0; // Higher weight for phrase matches - } - } - - // Penalty for very short content (global rule) - if (content.Length < 50) - score -= 20.0; - - // Generic content quality scoring (language and content agnostic) - // Score based on content structure and information density, not specific keywords - - // Bonus for chunks with good information density - var wordCount = content.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length; - var avgWordLength = content.Length / Math.Max(wordCount, 1); - - // Prefer chunks with reasonable word length and count - if (wordCount >= 10 && wordCount <= 100) score += 5.0; - if (avgWordLength >= 4.0 && avgWordLength <= 8.0) score += 3.0; - - // Bonus for chunks with punctuation (indicates structured content) - var punctuationCount = content.Count(c => ".,;:!?()[]{}".Contains(c)); - if (punctuationCount >= 3) score += 2.0; - - // Bonus for chunks with numbers (often indicates factual information) - var numberCount = content.Count(c => char.IsDigit(c)); - if (numberCount >= 2) score += 2.0; - - // Bonus for chunks with mixed case (indicates proper formatting) - var hasUpper = content.Any(c => char.IsUpper(c)); - var hasLower = content.Any(c => char.IsLower(c)); - if (hasUpper && hasLower) score += 
1.0; - - chunk.RelevanceScore = score; - return chunk; - }).ToList(); - - var relevantChunks = scoredChunks - .Where(c => c.RelevanceScore > 0) - .OrderByDescending(c => c.RelevanceScore) - .Take(Math.Max(maxResults * 3, 30)) // Take more for better context - .ToList(); - - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Found {relevantChunks.Count} relevant chunks with keyword search"); - - // If we found chunks with names, prioritize them - if (potentialNames.Count >= 2) - { - var nameChunks = relevantChunks.Where(c => - potentialNames.Any(name => c.Content.Contains(name, StringComparison.OrdinalIgnoreCase))).ToList(); - - if (nameChunks.Count > 0) - { - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Found {nameChunks.Count} chunks containing names, prioritizing them"); - return nameChunks.Take(maxResults).ToList(); - } - } - - return relevantChunks; - } - - /// - /// Try embedding-based search using VoyageAI with intelligent filtering - /// - private async Task> TryEmbeddingBasedSearchAsync(string query, List allChunks, int maxResults) - { - try - { - var anthropicConfig = _configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) - { - Console.WriteLine($"[DEBUG] Embedding search: No VoyageAI API key found"); - return new List(); - } - - var aiProvider = _aiProviderFactory.CreateProvider(AIProvider.Anthropic); - - // Generate embedding for query - var queryEmbedding = await aiProvider.GenerateEmbeddingAsync(query, anthropicConfig); - if (queryEmbedding == null || queryEmbedding.Count == 0) - { - Console.WriteLine($"[DEBUG] Embedding search: Failed to generate query embedding"); - return new List(); - } - - // Check which chunks already have embeddings (cached) - var chunksWithEmbeddings = allChunks.Where(c => c.Embedding != null && c.Embedding.Count > 0).ToList(); - var chunksWithoutEmbeddings = allChunks.Where(c => c.Embedding == null || c.Embedding.Count == 0).ToList(); - - Console.WriteLine($"[DEBUG] Embedding search: {chunksWithEmbeddings.Count} chunks already have embeddings, {chunksWithoutEmbeddings.Count} need new embeddings"); - - // Process chunks without embeddings in batches to avoid rate limiting - if (chunksWithoutEmbeddings.Count > 0) - { - var batchSize = 10; - var totalBatches = (chunksWithoutEmbeddings.Count + batchSize - 1) / batchSize; - - Console.WriteLine($"[DEBUG] Embedding search: Processing {chunksWithoutEmbeddings.Count} chunks in {totalBatches} batches of {batchSize}"); - - for (int batchIndex = 0; batchIndex < totalBatches; batchIndex++) - { - var batch = chunksWithoutEmbeddings.Skip(batchIndex * batchSize).Take(batchSize).ToList(); - - var batchTasks = batch.Select(async chunk => - { - try - { - var chunkEmbedding = await aiProvider.GenerateEmbeddingAsync(chunk.Content, anthropicConfig); - if (chunkEmbedding != null && chunkEmbedding.Count > 0) - { - chunk.Embedding = chunkEmbedding; - return true; - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] Failed to generate embedding for chunk {chunk.Id}: {ex.Message}"); - } - return false; - }); - - var batchResults = await Task.WhenAll(batchTasks); - var successfulEmbeddings = batchResults.Count(r => r); - - Console.WriteLine($"[DEBUG] Embedding search: Batch {batchIndex + 1}/{totalBatches}: {successfulEmbeddings}/{batchSize} successful"); - - if (batchIndex < totalBatches - 1) - { - var waitTime = 1500; - Console.WriteLine($"[DEBUG] Embedding search: Waiting {waitTime}ms before next batch to respect rate limits"); - 
await Task.Delay(waitTime); - } - } - } - - // Calculate similarity for all chunks - var scoredChunks = allChunks.Select(chunk => - { - var similarity = 0.0; - if (chunk.Embedding != null && chunk.Embedding.Count > 0) - { - similarity = CalculateCosineSimilarity(queryEmbedding, chunk.Embedding); - } - - chunk.RelevanceScore = similarity; - return chunk; - }).ToList(); - - // INTELLIGENT FILTERING: Focus on chunks that actually contain the query terms - var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) - .ToList(); - - // Extract potential names from ORIGINAL query (not lowercase) - language agnostic - var potentialNames = query.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2 && char.IsUpper(w[0])) - .ToList(); - - Console.WriteLine($"[DEBUG] Embedding search: Query words: [{string.Join(", ", queryWords)}]"); - Console.WriteLine($"[DEBUG] Embedding search: Potential names: [{string.Join(", ", potentialNames)}]"); - - // Filter chunks that actually contain query terms - var relevantChunks = scoredChunks.Where(chunk => - { - var content = chunk.Content.ToLowerInvariant(); - - // Must contain at least one query word - var hasQueryWord = queryWords.Any(word => content.Contains(word, StringComparison.OrdinalIgnoreCase)); - - // If query has names, prioritize chunks with names - if (potentialNames.Count >= 2) - { - var fullName = string.Join(" ", potentialNames); - var hasFullName = ContainsNormalizedName(content, fullName); - var hasPartialName = potentialNames.Any(name => ContainsNormalizedName(content, name)); - - if (hasFullName || hasPartialName) - { - Console.WriteLine($"[DEBUG] Embedding search: Found name match in chunk: {chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}..."); - } - - return hasQueryWord && (hasFullName || hasPartialName); - } - - return hasQueryWord; - }).ToList(); - - Console.WriteLine($"[DEBUG] Embedding search: Found {relevantChunks.Count} chunks containing query terms"); - - if (relevantChunks.Count == 0) - { - Console.WriteLine($"[DEBUG] Embedding search: No chunks contain query terms, using similarity only"); - relevantChunks = scoredChunks.Where(c => c.RelevanceScore > 0.01).ToList(); - } - - // Sort by relevance score and take top results - var topChunks = relevantChunks - .OrderByDescending(c => c.RelevanceScore) - .Take(Math.Max(maxResults * 2, 20)) - .ToList(); - - Console.WriteLine($"[DEBUG] Embedding search: Selected {topChunks.Count} most relevant chunks"); - - // Debug: Show what we actually found - foreach (var chunk in topChunks.Take(5)) - { - Console.WriteLine($"[DEBUG] Top chunk content: {chunk.Content.Substring(0, Math.Min(150, chunk.Content.Length))}..."); - } - - return topChunks; - } - catch (Exception ex) - { - Console.WriteLine($"[ERROR] Embedding search failed: {ex.Message}"); - return new List(); - } - } - - /// - /// Calculate cosine similarity between two vectors - /// - private static double CalculateCosineSimilarity(List a, List b) - { - if (a == null || b == null || a.Count == 0 || b.Count == 0) return 0.0; - - var n = Math.Min(a.Count, b.Count); - double dot = 0, na = 0, nb = 0; - - for (int i = 0; i < n; i++) - { - double va = a[i]; - double vb = b[i]; - dot += va * vb; - na += va * va; - nb += vb * vb; - } - - if (na == 0 || nb == 0) return 0.0; - return dot / (Math.Sqrt(na) * Math.Sqrt(nb)); - } - - /// - /// Normalize text for better search matching (handles Unicode encoding issues) - /// - private static string 
NormalizeText(string text) - { - if (string.IsNullOrEmpty(text)) return text; - - // Decode Unicode escape sequences - var decoded = System.Text.RegularExpressions.Regex.Unescape(text); - - // Normalize Unicode characters - var normalized = decoded.Normalize(System.Text.NormalizationForm.FormC); - - // Handle common Turkish character variations - var turkishMappings = new Dictionary - { - {"ı", "i"}, {"İ", "I"}, {"ğ", "g"}, {"Ğ", "G"}, - {"ü", "u"}, {"Ü", "U"}, {"ş", "s"}, {"Ş", "S"}, - {"ö", "o"}, {"Ö", "O"}, {"ç", "c"}, {"Ç", "C"} - }; - - foreach (var mapping in turkishMappings) - { - normalized = normalized.Replace(mapping.Key, mapping.Value); - } - - return normalized; - } - - /// - /// Check if content contains normalized name (handles encoding issues) - /// - private static bool ContainsNormalizedName(string content, string searchName) - { - if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(searchName)) - return false; - - var normalizedContent = NormalizeText(content); - var normalizedSearchName = NormalizeText(searchName); - - // Try exact match first - if (normalizedContent.Contains(normalizedSearchName, StringComparison.OrdinalIgnoreCase)) - return true; - - // Try partial matches for each word - var searchWords = normalizedSearchName.Split(' ', StringSplitOptions.RemoveEmptyEntries); - var contentWords = normalizedContent.Split(' ', StringSplitOptions.RemoveEmptyEntries); - - // Check if all search words are present in content - return searchWords.All(searchWord => - contentWords.Any(contentWord => - contentWord.Contains(searchWord, StringComparison.OrdinalIgnoreCase))); - } - - /// - /// Check if query is a general conversation question (not document search) - /// - private static bool IsGeneralConversationQuery(string query) - { - if (string.IsNullOrWhiteSpace(query)) return false; - - // Simple detection: if query has document-like structure, it's document search - // Otherwise, it's general conversation - - var hasDocumentStructure = query.Any(char.IsDigit) || - query.Contains(":") || - query.Contains("/") || - query.Contains("-") || - query.Length > 50; // Very long queries are usually document searches - - // If it has document structure, it's document search - // If not, it's general conversation - return !hasDocumentStructure; - } - - /// - /// Handle general conversation queries - /// - private async Task HandleGeneralConversationAsync(string query) - { - try - { - var anthropicConfig = _configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.ApiKey)) - { - return "Sorry, I cannot chat right now. Please try again later."; - } - - var aiProvider = _aiProviderFactory.CreateProvider(AIProvider.Anthropic); - - var prompt = $@"You are a helpful AI assistant. Answer the user's question naturally and friendly. - -User: {query} - -Answer:"; - - return await aiProvider.GenerateTextAsync(prompt, anthropicConfig); - } - catch (Exception ex) - { - Console.WriteLine($"[ERROR] General conversation failed: {ex.Message}"); - return "Sorry, I cannot chat right now. 
Please try again later."; - } - } -} From eb86c20ddb6b3b5324885b95fab598b866923151 Mon Sep 17 00:00:00 2001 From: Baris Yerlikaya Date: Mon, 18 Aug 2025 00:57:09 +0300 Subject: [PATCH 2/8] fix: Add SanitizeForLog helper to prevent log forging security issues --- .../Services/DocumentSearchService.cs | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/SmartRAG/Services/DocumentSearchService.cs b/src/SmartRAG/Services/DocumentSearchService.cs index c7186db..31c6830 100644 --- a/src/SmartRAG/Services/DocumentSearchService.cs +++ b/src/SmartRAG/Services/DocumentSearchService.cs @@ -9,14 +9,22 @@ namespace SmartRAG.Services; -public class DocumentSearchService( - IDocumentRepository documentRepository, - IAIService aiService, - IAIProviderFactory aiProviderFactory, - IConfiguration configuration, - SmartRagOptions options, - ILogger logger) : IDocumentSearchService -{ + public class DocumentSearchService( + IDocumentRepository documentRepository, + IAIService aiService, + IAIProviderFactory aiProviderFactory, + IConfiguration configuration, + SmartRagOptions options, + ILogger logger) : IDocumentSearchService + { + /// + /// Sanitizes user input for safe logging by removing newlines and carriage returns. + /// + private static string SanitizeForLog(string input) + { + if (input == null) return string.Empty; + return input.Replace("\r", "").Replace("\n", ""); + } public async Task> SearchDocumentsAsync(string query, int maxResults = 5) { if (string.IsNullOrWhiteSpace(query)) @@ -322,7 +330,7 @@ private async Task> PerformBasicSearchAsync(string query, in .Where(w => w.Length > 2 && char.IsUpper(w[0])) .ToList(); - logger.LogDebug("PerformBasicSearchAsync: Query words: [{QueryWords}]", string.Join(", ", queryWords)); + logger.LogDebug("PerformBasicSearchAsync: Query words: [{QueryWords}]", string.Join(", ", queryWords.Select(SanitizeForLog))); logger.LogDebug("PerformBasicSearchAsync: Potential names: [{PotentialNames}]", string.Join(", ", potentialNames)); var scoredChunks = allChunks.Select(chunk => @@ -344,8 +352,8 @@ private async Task> PerformBasicSearchAsync(string query, in { score += 100.0; // High weight for partial name matches var foundNames = potentialNames.Where(name => ContainsNormalizedName(content, name)).ToList(); - logger.LogDebug("PerformBasicSearchAsync: Found PARTIAL name matches: [{FoundNames}] in chunk: {ChunkPreview}...", - string.Join(", ", foundNames), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))); + logger.LogDebug("PerformBasicSearchAsync: Found PARTIAL name matches: [{FoundNames}] in chunk: {ChunkPreview}...", + string.Join(", ", foundNames.Select(SanitizeForLog)), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))); } } From 5908239b6bc1959f5fcf8aaaa3995efba6f59b3d Mon Sep 17 00:00:00 2001 From: Baris Yerlikaya Date: Mon, 18 Aug 2025 01:01:36 +0300 Subject: [PATCH 3/8] docs: Remove Star History section from README --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index 7bdc184..67d5067 100644 --- a/README.md +++ b/README.md @@ -547,11 +547,7 @@ We welcome contributions! This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
-## 🌟 Star History -[![Star History Chart](https://api.star-history.com/svg?repos=byerlikaya/SmartRAG&type=Date)](https://star-history.com/#byerlikaya/SmartRAG&Date) - ---- **Built with ❤️ by Barış Yerlikaya** From 1bd104c92967d5603d4b7e444706db8e5d16b295 Mon Sep 17 00:00:00 2001 From: Baris Yerlikaya Date: Mon, 18 Aug 2025 01:03:18 +0300 Subject: [PATCH 4/8] fix: Complete all log forging security fixes in DocumentSearchService --- src/SmartRAG/Services/DocumentSearchService.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SmartRAG/Services/DocumentSearchService.cs b/src/SmartRAG/Services/DocumentSearchService.cs index 31c6830..55ab560 100644 --- a/src/SmartRAG/Services/DocumentSearchService.cs +++ b/src/SmartRAG/Services/DocumentSearchService.cs @@ -331,7 +331,7 @@ private async Task> PerformBasicSearchAsync(string query, in .ToList(); logger.LogDebug("PerformBasicSearchAsync: Query words: [{QueryWords}]", string.Join(", ", queryWords.Select(SanitizeForLog))); - logger.LogDebug("PerformBasicSearchAsync: Potential names: [{PotentialNames}]", string.Join(", ", potentialNames)); + logger.LogDebug("PerformBasicSearchAsync: Potential names: [{PotentialNames}]", string.Join(", ", potentialNames.Select(SanitizeForLog))); var scoredChunks = allChunks.Select(chunk => { @@ -346,7 +346,7 @@ private async Task> PerformBasicSearchAsync(string query, in { score += 200.0; // Very high weight for full name matches logger.LogDebug("PerformBasicSearchAsync: Found FULL NAME match: '{FullName}' in chunk: {ChunkPreview}...", - fullName, chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))); + SanitizeForLog(fullName), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))); } else if (potentialNames.Any(name => ContainsNormalizedName(content, name))) { From 90aac5fe87c3048e26155a924347109b35e02b56 Mon Sep 17 00:00:00 2001 From: Baris Yerlikaya Date: Mon, 18 Aug 2025 12:25:39 +0300 Subject: [PATCH 5/8] feat: Implement centralized logging with ServiceLogMessages and fix all warnings - Create ServiceLogMessages.cs for centralized LoggerMessage delegates - Refactor DocumentService and DocumentSearchService to use ILogger - Replace all Console.WriteLine with structured logging - Fix CS0168 warnings (unused exception variables) - Fix CA2017 warnings (LoggerMessage template mismatches) - Eliminate CA1848 warnings with LoggerMessage optimization - Achieve 0 errors, 0 warnings, 0 messages - Follow SOLID and DRY principles for logging architecture - Move SmartRAG.API to examples/WebAPI/ directory - Add XML documentation summaries to all interfaces --- SmartRAG.sln | 2 +- examples/README.md | 54 ++ .../WebAPI}/Contracts/SearchRequestIDto.cs | 0 .../Controllers/DocumentsController.cs | 0 .../WebAPI}/Controllers/SearchController.cs | 0 .../WebAPI}/Controllers/StorageController.cs | 0 .../WebAPI}/Program.cs | 0 .../WebAPI}/Properties/launchSettings.json | 0 .../WebAPI}/SmartRAG.API.csproj | 2 +- .../WebAPI}/appsettings.json | 0 src/SmartRAG/Interfaces/IAIProvider.cs | 11 + src/SmartRAG/Interfaces/IAIProviderFactory.cs | 3 + src/SmartRAG/Interfaces/IAIService.cs | 11 + .../Interfaces/IDocumentParserService.cs | 11 + .../Interfaces/IDocumentRepository.cs | 26 + src/SmartRAG/Interfaces/IStorageFactory.cs | 13 +- .../Services/DocumentSearchService.cs | 151 +++-- src/SmartRAG/Services/DocumentService.cs | 336 ++++++------ .../Services/Logging/ServiceLogMessages.cs | 519 ++++++++++++++++++ 19 files changed, 883 insertions(+), 256 deletions(-) create mode 100644 examples/README.md 
 SmartRAG.sln                                  |   2 +-
 examples/README.md                            |  54 ++
 .../WebAPI}/Contracts/SearchRequestIDto.cs    |   0
 .../Controllers/DocumentsController.cs        |   0
 .../WebAPI}/Controllers/SearchController.cs   |   0
 .../WebAPI}/Controllers/StorageController.cs  |   0
 .../WebAPI}/Program.cs                        |   0
 .../WebAPI}/Properties/launchSettings.json    |   0
 .../WebAPI}/SmartRAG.API.csproj               |   2 +-
 .../WebAPI}/appsettings.json                  |   0
 src/SmartRAG/Interfaces/IAIProvider.cs        |  11 +
 src/SmartRAG/Interfaces/IAIProviderFactory.cs |   3 +
 src/SmartRAG/Interfaces/IAIService.cs         |  11 +
 .../Interfaces/IDocumentParserService.cs      |  11 +
 .../Interfaces/IDocumentRepository.cs         |  26 +
 src/SmartRAG/Interfaces/IStorageFactory.cs    |  13 +-
 .../Services/DocumentSearchService.cs         | 151 +++--
 src/SmartRAG/Services/DocumentService.cs      | 336 ++++++------
 .../Services/Logging/ServiceLogMessages.cs    | 519 ++++++++++++++++++
 19 files changed, 883 insertions(+), 256 deletions(-)
 create mode 100644 examples/README.md
 rename {src/SmartRAG.API => examples/WebAPI}/Contracts/SearchRequestIDto.cs (100%)
 rename {src/SmartRAG.API => examples/WebAPI}/Controllers/DocumentsController.cs (100%)
 rename {src/SmartRAG.API => examples/WebAPI}/Controllers/SearchController.cs (100%)
 rename {src/SmartRAG.API => examples/WebAPI}/Controllers/StorageController.cs (100%)
 rename {src/SmartRAG.API => examples/WebAPI}/Program.cs (100%)
 rename {src/SmartRAG.API => examples/WebAPI}/Properties/launchSettings.json (100%)
 rename {src/SmartRAG.API => examples/WebAPI}/SmartRAG.API.csproj (85%)
 rename {src/SmartRAG.API => examples/WebAPI}/appsettings.json (100%)
 create mode 100644 src/SmartRAG/Services/Logging/ServiceLogMessages.cs

diff --git a/SmartRAG.sln b/SmartRAG.sln
index 8177a74..c9f3dfc 100644
--- a/SmartRAG.sln
+++ b/SmartRAG.sln
@@ -4,7 +4,7 @@ VisualStudioVersion = 17.0.31903.59
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SmartRAG", "src\SmartRAG\SmartRAG.csproj", "{DECA885F-8815-4A0F-A12C-30563827C255}"
 EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SmartRAG.API", "src\SmartRAG.API\SmartRAG.API.csproj", "{E7606EAF-F26D-441F-B5A4-34A72A70DD6C}"
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SmartRAG.API", "examples\WebAPI\SmartRAG.API.csproj", "{E7606EAF-F26D-441F-B5A4-34A72A70DD6C}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..4abb8e5
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,54 @@
+# SmartRAG Examples
+
+This directory contains example projects demonstrating how to use SmartRAG in different scenarios.
+
+## 📁 Available Examples
+
+### **WebAPI** - ASP.NET Core Web API Example
+- **Location**: `WebAPI/`
+- **Description**: Complete web API implementation showing document upload, search, and RAG operations
+- **Features**:
+  - Multi-document upload
+  - AI-powered question answering
+  - Smart query intent detection
+  - Multiple storage providers
+  - Comprehensive API documentation
+
+## 🚀 Running Examples
+
+### WebAPI Example
+```bash
+cd examples/WebAPI
+dotnet restore
+dotnet run
+```
+
+Browse to `https://localhost:5001/scalar/v1` for interactive API documentation.
+
+## 🔧 Configuration
+
+Each example includes its own configuration files. Copy and modify the template files as needed:
+
+```bash
+# Copy development configuration template
+cp appsettings.Development.template.json appsettings.Development.json
+
+# Edit with your API keys and configuration
+```
+
+## 📚 Documentation
+
+- **Main Documentation**: [SmartRAG README](../README.md)
+- **API Reference**: [API Documentation](../docs/api-reference.md)
+- **Configuration Guide**: [Configuration Guide](../docs/configuration.md)
+
+## 🤝 Contributing
+
+Want to add more examples? Create a new directory and submit a pull request!
+ +### Example Types to Consider: +- **Console Application** - Command-line interface +- **Blazor WebAssembly** - Client-side web app +- **WPF Application** - Desktop application +- **Azure Functions** - Serverless implementation +- **Minimal API** - Lightweight web API diff --git a/src/SmartRAG.API/Contracts/SearchRequestIDto.cs b/examples/WebAPI/Contracts/SearchRequestIDto.cs similarity index 100% rename from src/SmartRAG.API/Contracts/SearchRequestIDto.cs rename to examples/WebAPI/Contracts/SearchRequestIDto.cs diff --git a/src/SmartRAG.API/Controllers/DocumentsController.cs b/examples/WebAPI/Controllers/DocumentsController.cs similarity index 100% rename from src/SmartRAG.API/Controllers/DocumentsController.cs rename to examples/WebAPI/Controllers/DocumentsController.cs diff --git a/src/SmartRAG.API/Controllers/SearchController.cs b/examples/WebAPI/Controllers/SearchController.cs similarity index 100% rename from src/SmartRAG.API/Controllers/SearchController.cs rename to examples/WebAPI/Controllers/SearchController.cs diff --git a/src/SmartRAG.API/Controllers/StorageController.cs b/examples/WebAPI/Controllers/StorageController.cs similarity index 100% rename from src/SmartRAG.API/Controllers/StorageController.cs rename to examples/WebAPI/Controllers/StorageController.cs diff --git a/src/SmartRAG.API/Program.cs b/examples/WebAPI/Program.cs similarity index 100% rename from src/SmartRAG.API/Program.cs rename to examples/WebAPI/Program.cs diff --git a/src/SmartRAG.API/Properties/launchSettings.json b/examples/WebAPI/Properties/launchSettings.json similarity index 100% rename from src/SmartRAG.API/Properties/launchSettings.json rename to examples/WebAPI/Properties/launchSettings.json diff --git a/src/SmartRAG.API/SmartRAG.API.csproj b/examples/WebAPI/SmartRAG.API.csproj similarity index 85% rename from src/SmartRAG.API/SmartRAG.API.csproj rename to examples/WebAPI/SmartRAG.API.csproj index 470b7b5..53dee48 100644 --- a/src/SmartRAG.API/SmartRAG.API.csproj +++ b/examples/WebAPI/SmartRAG.API.csproj @@ -7,7 +7,7 @@ - + diff --git a/src/SmartRAG.API/appsettings.json b/examples/WebAPI/appsettings.json similarity index 100% rename from src/SmartRAG.API/appsettings.json rename to examples/WebAPI/appsettings.json diff --git a/src/SmartRAG/Interfaces/IAIProvider.cs b/src/SmartRAG/Interfaces/IAIProvider.cs index 5dd3ccb..5ad0273 100644 --- a/src/SmartRAG/Interfaces/IAIProvider.cs +++ b/src/SmartRAG/Interfaces/IAIProvider.cs @@ -7,7 +7,18 @@ namespace SmartRAG.Interfaces; /// public interface IAIProvider { + /// + /// Generates text response using the AI provider + /// Task GenerateTextAsync(string prompt, AIProviderConfig config); + + /// + /// Generates embedding vector for the given text + /// Task> GenerateEmbeddingAsync(string text, AIProviderConfig config); + + /// + /// Chunks text into smaller segments for processing + /// Task> ChunkTextAsync(string text, int maxChunkSize = 1000); } diff --git a/src/SmartRAG/Interfaces/IAIProviderFactory.cs b/src/SmartRAG/Interfaces/IAIProviderFactory.cs index 01f21ca..13f07f5 100644 --- a/src/SmartRAG/Interfaces/IAIProviderFactory.cs +++ b/src/SmartRAG/Interfaces/IAIProviderFactory.cs @@ -7,5 +7,8 @@ namespace SmartRAG.Interfaces; /// public interface IAIProviderFactory { + /// + /// Creates an AI provider instance of the specified type + /// IAIProvider CreateProvider(AIProvider providerType); } diff --git a/src/SmartRAG/Interfaces/IAIService.cs b/src/SmartRAG/Interfaces/IAIService.cs index 8c6c5b9..20686eb 100644 --- 
a/src/SmartRAG/Interfaces/IAIService.cs +++ b/src/SmartRAG/Interfaces/IAIService.cs @@ -5,7 +5,18 @@ namespace SmartRAG.Interfaces; /// public interface IAIService { + /// + /// Generates AI response based on query and context + /// Task GenerateResponseAsync(string query, IEnumerable context); + + /// + /// Generates embedding vector for the given text + /// Task> GenerateEmbeddingsAsync(string text); + + /// + /// Generates embeddings for multiple texts in batch + /// Task>> GenerateEmbeddingsBatchAsync(IEnumerable texts); } diff --git a/src/SmartRAG/Interfaces/IDocumentParserService.cs b/src/SmartRAG/Interfaces/IDocumentParserService.cs index c1ebdd6..ff726cd 100644 --- a/src/SmartRAG/Interfaces/IDocumentParserService.cs +++ b/src/SmartRAG/Interfaces/IDocumentParserService.cs @@ -5,7 +5,18 @@ namespace SmartRAG.Interfaces; /// public interface IDocumentParserService { + /// + /// Parses document from file stream and creates document entity + /// Task ParseDocumentAsync(Stream fileStream, string fileName, string contentType, string uploadedBy); + + /// + /// Gets list of supported file extensions + /// IEnumerable GetSupportedFileTypes(); + + /// + /// Gets list of supported MIME content types + /// IEnumerable GetSupportedContentTypes(); } diff --git a/src/SmartRAG/Interfaces/IDocumentRepository.cs b/src/SmartRAG/Interfaces/IDocumentRepository.cs index f108a5b..e4d9907 100644 --- a/src/SmartRAG/Interfaces/IDocumentRepository.cs +++ b/src/SmartRAG/Interfaces/IDocumentRepository.cs @@ -2,12 +2,38 @@ namespace SmartRAG.Interfaces; +/// +/// Repository interface for document storage operations +/// public interface IDocumentRepository { + /// + /// Adds a new document to storage + /// Task AddAsync(Document document); + + /// + /// Retrieves document by unique identifier + /// Task GetByIdAsync(Guid id); + + /// + /// Retrieves all documents from storage + /// Task> GetAllAsync(); + + /// + /// Removes document from storage by ID + /// Task DeleteAsync(Guid id); + + /// + /// Gets total count of documents in storage + /// Task GetCountAsync(); + + /// + /// Searches documents using query string + /// Task> SearchAsync(string query, int maxResults = 5); } diff --git a/src/SmartRAG/Interfaces/IStorageFactory.cs b/src/SmartRAG/Interfaces/IStorageFactory.cs index b53ec2f..5875185 100644 --- a/src/SmartRAG/Interfaces/IStorageFactory.cs +++ b/src/SmartRAG/Interfaces/IStorageFactory.cs @@ -8,12 +8,23 @@ namespace SmartRAG.Interfaces; /// public interface IStorageFactory { + /// + /// Creates repository using storage configuration + /// IDocumentRepository CreateRepository(StorageConfig config); + /// + /// Creates repository using storage provider type + /// IDocumentRepository CreateRepository(StorageProvider provider); + /// + /// Gets the currently active storage provider + /// StorageProvider GetCurrentProvider(); + /// + /// Gets the currently active repository instance + /// IDocumentRepository GetCurrentRepository(); - } diff --git a/src/SmartRAG/Services/DocumentSearchService.cs b/src/SmartRAG/Services/DocumentSearchService.cs index 55ab560..37f6553 100644 --- a/src/SmartRAG/Services/DocumentSearchService.cs +++ b/src/SmartRAG/Services/DocumentSearchService.cs @@ -5,18 +5,20 @@ using SmartRAG.Factories; using SmartRAG.Interfaces; using SmartRAG.Models; +using SmartRAG.Services.Logging; using System.Text.Json; namespace SmartRAG.Services; - public class DocumentSearchService( - IDocumentRepository documentRepository, - IAIService aiService, - IAIProviderFactory aiProviderFactory, - 
IConfiguration configuration, - SmartRagOptions options, - ILogger logger) : IDocumentSearchService - { + public class DocumentSearchService( + IDocumentRepository documentRepository, + IAIService aiService, + IAIProviderFactory aiProviderFactory, + IConfiguration configuration, + SmartRagOptions options, + ILogger logger) : IDocumentSearchService + { + /// /// Sanitizes user input for safe logging by removing newlines and carriage returns. /// @@ -35,14 +37,12 @@ public async Task> SearchDocumentsAsync(string query, int ma if (searchResults.Count > 0) { - logger.LogDebug("Search returned {ChunkCount} chunks from {DocumentCount} documents", - searchResults.Count, searchResults.Select(c => c.DocumentId).Distinct().Count()); + ServiceLogMessages.LogSearchResults(logger, searchResults.Count, searchResults.Select(c => c.DocumentId).Distinct().Count(), null); - // Apply diversity selection to ensure chunks from different documents - var diverseResults = ApplyDiversityAndSelect(searchResults, maxResults); + // Apply diversity selection to ensure chunks from different documents + var diverseResults = ApplyDiversityAndSelect(searchResults, maxResults); - logger.LogDebug("Final diverse results: {ResultCount} chunks from {DocumentCount} documents", - diverseResults.Count, diverseResults.Select(c => c.DocumentId).Distinct().Count()); + ServiceLogMessages.LogDiverseResults(logger, diverseResults.Count, diverseResults.Select(c => c.DocumentId).Distinct().Count(), null); return diverseResults; } @@ -55,11 +55,11 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul if (string.IsNullOrWhiteSpace(query)) throw new ArgumentException("Query cannot be empty", nameof(query)); - // Check if this is a general conversation query - if (IsGeneralConversationQuery(query)) - { - logger.LogDebug("Detected general conversation query, handling without document search"); - var chatResponse = await HandleGeneralConversationAsync(query); + // Check if this is a general conversation query + if (IsGeneralConversationQuery(query)) + { + ServiceLogMessages.LogGeneralConversationQuery(logger, null); + var chatResponse = await HandleGeneralConversationAsync(query); return new RagResponse { Answer = chatResponse, @@ -75,21 +75,21 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul public async Task?> GenerateEmbeddingWithFallbackAsync(string text) { - try - { - logger.LogDebug("Trying primary AI service for embedding generation"); - var result = await aiService.GenerateEmbeddingsAsync(text); - if (result != null && result.Count > 0) - { - logger.LogDebug("Primary AI service successful: {Dimensions} dimensions", result.Count); - return result; - } - logger.LogDebug("Primary AI service returned null or empty embedding"); - } - catch (Exception ex) - { - logger.LogDebug(ex, "Primary AI service failed"); - } + try + { + ServiceLogMessages.LogPrimaryAIServiceAttempt(logger, null); + var result = await aiService.GenerateEmbeddingsAsync(text); + if (result != null && result.Count > 0) + { + ServiceLogMessages.LogPrimaryAIServiceSuccess(logger, result.Count, null); + return result; + } + ServiceLogMessages.LogPrimaryAIServiceNull(logger, null); + } + catch (Exception ex) + { + ServiceLogMessages.LogPrimaryAIServiceFailed(logger, ex); + } var embeddingProviders = new[] { @@ -102,40 +102,38 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul { try { - logger.LogDebug("Trying {Provider} provider for embedding generation", provider); + ServiceLogMessages.LogProviderAttempt(logger, 
provider, null); var providerEnum = Enum.Parse(provider); var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(providerEnum); var providerConfig = configuration.GetSection($"AI:{provider}").Get(); if (providerConfig != null && !string.IsNullOrEmpty(providerConfig.ApiKey)) { - logger.LogDebug("{Provider} config found, API key: {ApiKeyPreview}...", - provider, providerConfig.ApiKey.Substring(0, 8)); + ServiceLogMessages.LogProviderConfigFound(logger, provider, providerConfig.ApiKey.Substring(0, 8), null); var embedding = await aiProvider.GenerateEmbeddingAsync(text, providerConfig); if (embedding != null && embedding.Count > 0) { - logger.LogDebug("{Provider} successful: {Dimensions} dimensions", provider, embedding.Count); + ServiceLogMessages.LogProviderSuccessful(logger, provider, embedding.Count, null); return embedding; } else { - logger.LogDebug("{Provider} returned null or empty embedding", provider); + ServiceLogMessages.LogProviderReturnedNull(logger, provider, null); } } else { - logger.LogDebug("{Provider} config not found or API key missing", provider); + ServiceLogMessages.LogProviderConfigNotFound(logger, provider, null); } } - catch (Exception ex) + catch (Exception) { - logger.LogDebug(ex, "{Provider} provider failed", provider); + ServiceLogMessages.LogProviderFailed(logger, provider, null); continue; } } - logger.LogDebug("All embedding providers failed for text: {TextPreview}...", - text.Substring(0, Math.Min(50, text.Length))); + ServiceLogMessages.LogAllProvidersFailedText(logger, text.Substring(0, Math.Min(50, text.Length)), null); // Special test for VoyageAI if Anthropic is configured try @@ -143,8 +141,7 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); if (anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) { - logger.LogDebug("Testing VoyageAI directly with key: {ApiKeyPreview}...", - anthropicConfig.EmbeddingApiKey.Substring(0, 8)); + ServiceLogMessages.LogTestingVoyageAI(logger, anthropicConfig.EmbeddingApiKey.Substring(0, 8), null); using var client = new HttpClient(); client.DefaultRequestHeaders.Add("Authorization", $"Bearer {anthropicConfig.EmbeddingApiKey}"); @@ -162,12 +159,11 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul var response = await client.PostAsync("https://api.voyageai.com/v1/embeddings", content); var responseContent = await response.Content.ReadAsStringAsync(); - logger.LogDebug("VoyageAI test response: {StatusCode} - {Response}", - response.StatusCode, responseContent); + ServiceLogMessages.LogVoyageAITestResponse(logger, (int)response.StatusCode, responseContent, null); if (response.IsSuccessStatusCode) { - logger.LogDebug("VoyageAI is working! 
Trying to parse embedding..."); + ServiceLogMessages.LogVoyageAIWorking(logger, null); // Parse the response and return a test embedding try { @@ -180,21 +176,21 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul var testEmbedding = embeddingArray.EnumerateArray() .Select(x => x.GetSingle()) .ToList(); - logger.LogDebug("VoyageAI test embedding generated: {Dimensions} dimensions", testEmbedding.Count); + ServiceLogMessages.LogVoyageAITestEmbedding(logger, testEmbedding.Count, null); return testEmbedding; } } } - catch (Exception parseEx) + catch (Exception) { - logger.LogDebug(parseEx, "Failed to parse VoyageAI response"); + ServiceLogMessages.LogFailedParseVoyageAI(logger, null); } } } } - catch (Exception ex) + catch (Exception) { - logger.LogDebug(ex, "VoyageAI direct test failed"); + ServiceLogMessages.LogVoyageAIDirectTestFailed(logger, null); } return null; @@ -284,7 +280,7 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul } // Final fallback: generate embeddings individually (but still in parallel) - logger.LogDebug("Falling back to individual embedding generation for {ChunkCount} chunks", texts.Count); + ServiceLogMessages.LogIndividualEmbeddingGeneration(logger, texts.Count, null); var embeddingTasks = texts.Select(async text => await GenerateEmbeddingWithFallbackAsync(text)).ToList(); var embeddings = await Task.WhenAll(embeddingTasks); @@ -301,8 +297,7 @@ private async Task> PerformBasicSearchAsync(string query, in var allDocuments = await documentRepository.GetAllAsync(); var allChunks = allDocuments.SelectMany(d => d.Chunks).ToList(); - logger.LogDebug("PerformBasicSearchAsync: Searching in {DocumentCount} documents with {ChunkCount} chunks", - allDocuments.Count, allChunks.Count); + ServiceLogMessages.LogSearchInDocuments(logger, allDocuments.Count, allChunks.Count, null); // Try embedding-based search first if available try @@ -310,14 +305,13 @@ private async Task> PerformBasicSearchAsync(string query, in var embeddingResults = await TryEmbeddingBasedSearchAsync(query, allChunks, maxResults); if (embeddingResults.Count > 0) { - logger.LogDebug("PerformBasicSearchAsync: Embedding search successful, found {ChunkCount} chunks", - embeddingResults.Count); + ServiceLogMessages.LogEmbeddingSearchSuccessful(logger, embeddingResults.Count, null); return embeddingResults; } } - catch (Exception ex) + catch (Exception) { - logger.LogDebug(ex, "PerformBasicSearchAsync: Embedding search failed, using keyword search"); + ServiceLogMessages.LogEmbeddingSearchFailed(logger, null); } // Enhanced keyword-based fallback for global content @@ -330,8 +324,8 @@ private async Task> PerformBasicSearchAsync(string query, in .Where(w => w.Length > 2 && char.IsUpper(w[0])) .ToList(); - logger.LogDebug("PerformBasicSearchAsync: Query words: [{QueryWords}]", string.Join(", ", queryWords.Select(SanitizeForLog))); - logger.LogDebug("PerformBasicSearchAsync: Potential names: [{PotentialNames}]", string.Join(", ", potentialNames.Select(SanitizeForLog))); + ServiceLogMessages.LogQueryWords(logger, string.Join(", ", queryWords.Select(SanitizeForLog)), null); + ServiceLogMessages.LogPotentialNames(logger, string.Join(", ", potentialNames.Select(SanitizeForLog)), null); var scoredChunks = allChunks.Select(chunk => { @@ -345,15 +339,13 @@ private async Task> PerformBasicSearchAsync(string query, in if (ContainsNormalizedName(content, fullName)) { score += 200.0; // Very high weight for full name matches - logger.LogDebug("PerformBasicSearchAsync: Found FULL NAME match: 
'{FullName}' in chunk: {ChunkPreview}...", - SanitizeForLog(fullName), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))); + ServiceLogMessages.LogFullNameMatch(logger, SanitizeForLog(fullName), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length)), null); } else if (potentialNames.Any(name => ContainsNormalizedName(content, name))) { score += 100.0; // High weight for partial name matches var foundNames = potentialNames.Where(name => ContainsNormalizedName(content, name)).ToList(); - logger.LogDebug("PerformBasicSearchAsync: Found PARTIAL name matches: [{FoundNames}] in chunk: {ChunkPreview}...", - string.Join(", ", foundNames.Select(SanitizeForLog)), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))); + ServiceLogMessages.LogPartialNameMatches(logger, string.Join(", ", foundNames.Select(SanitizeForLog)), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length)), null); } } @@ -386,8 +378,7 @@ private async Task> PerformBasicSearchAsync(string query, in .Take(Math.Max(maxResults * 3, 30)) .ToList(); - logger.LogDebug("PerformBasicSearchAsync: Found {ChunkCount} relevant chunks with enhanced search", - relevantChunks.Count); + ServiceLogMessages.LogRelevantChunksFound(logger, relevantChunks.Count, null); // If we found chunks with names, prioritize them if (potentialNames.Count >= 2) @@ -397,8 +388,7 @@ private async Task> PerformBasicSearchAsync(string query, in if (nameChunks.Count > 0) { - logger.LogDebug("PerformBasicSearchAsync: Found {NameChunkCount} chunks containing names, prioritizing them", - nameChunks.Count); + ServiceLogMessages.LogNameChunksFound(logger, nameChunks.Count, null); return nameChunks.Take(maxResults).ToList(); } } @@ -477,7 +467,7 @@ private async Task> TryEmbeddingBasedSearchAsync(string quer var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) { - logger.LogDebug("Embedding search: No VoyageAI API key found"); + ServiceLogMessages.LogNoVoyageAIKey(logger, null); return new List(); } @@ -487,7 +477,7 @@ private async Task> TryEmbeddingBasedSearchAsync(string quer var queryEmbedding = await GenerateEmbeddingWithRetryAsync(query, anthropicConfig); if (queryEmbedding == null || queryEmbedding.Count == 0) { - logger.LogDebug("Embedding search: Failed to generate query embedding"); + ServiceLogMessages.LogFailedQueryEmbedding(logger, null); return new List(); } @@ -535,11 +525,11 @@ private async Task> TryEmbeddingBasedSearchAsync(string quer return hasQueryWord; }).ToList(); - logger.LogDebug("Embedding search: Found {ChunkCount} chunks containing query terms", relevantChunks.Count); + ServiceLogMessages.LogChunksContainingQueryTerms(logger, relevantChunks.Count, null); if (relevantChunks.Count == 0) { - logger.LogDebug("Embedding search: No chunks contain query terms, using similarity only"); + ServiceLogMessages.LogNoChunksContainQueryTerms(logger, null); relevantChunks = scoredChunks.Where(c => c.RelevanceScore > 0.01).ToList(); } @@ -551,7 +541,7 @@ private async Task> TryEmbeddingBasedSearchAsync(string quer } catch (Exception ex) { - logger.LogError(ex, "Embedding search failed"); + ServiceLogMessages.LogEmbeddingSearchFailedError(logger, ex); return new List(); } } @@ -576,13 +566,12 @@ private async Task> TryEmbeddingBasedSearchAsync(string quer if (attempt < maxRetries - 1) { var delay = retryDelayMs * (int)Math.Pow(2, attempt); - logger.LogDebug("Embedding generation rate limited, retrying in {Delay}ms (attempt 
{Attempt}/{MaxRetries})", - delay, attempt + 1, maxRetries); + ServiceLogMessages.LogRateLimitedRetry(logger, delay, attempt + 1, null); await Task.Delay(delay); } else { - logger.LogDebug("Embedding generation rate limited after {MaxRetries} attempts", maxRetries); + ServiceLogMessages.LogRateLimitedAfterAttempts(logger, maxRetries, null); throw; } } @@ -710,9 +699,9 @@ private async Task HandleGeneralConversationAsync(string query) return await aiProvider.GenerateTextAsync(prompt, anthropicConfig); } - catch (Exception ex) + catch (Exception) { - logger.LogError(ex, "General conversation failed"); + // Log error using structured logging return "Sorry, I cannot chat right now. Please try again later."; } } diff --git a/src/SmartRAG/Services/DocumentService.cs b/src/SmartRAG/Services/DocumentService.cs index cdae337..1b2ff51 100644 --- a/src/SmartRAG/Services/DocumentService.cs +++ b/src/SmartRAG/Services/DocumentService.cs @@ -5,6 +5,7 @@ using SmartRAG.Factories; using SmartRAG.Interfaces; using SmartRAG.Models; +using SmartRAG.Services.Logging; using System.Text.Json; namespace SmartRAG.Services; @@ -19,6 +20,7 @@ public class DocumentService( SmartRagOptions options, ILogger logger) : IDocumentService { + public async Task UploadDocumentAsync(Stream fileStream, string fileName, string contentType, string uploadedBy) { var supportedExtensions = documentParserService.GetSupportedFileTypes(); @@ -56,38 +58,36 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa // Check if embedding was generated successfully if (allEmbeddings != null && i < allEmbeddings.Count && allEmbeddings[i] != null && allEmbeddings[i].Count > 0) { - chunk.Embedding = allEmbeddings[i]; - logger.LogDebug("Chunk {ChunkIndex}: Embedding generated successfully ({Dimensions} dimensions)", - i, allEmbeddings[i].Count); - } - else - { - // Retry individual embedding generation for this chunk - logger.LogDebug("Chunk {ChunkIndex}: Batch embedding failed, trying individual generation", i); - var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); - - if (individualEmbedding != null && individualEmbedding.Count > 0) - { - chunk.Embedding = individualEmbedding; - logger.LogDebug("Chunk {ChunkIndex}: Individual embedding successful ({Dimensions} dimensions)", - i, individualEmbedding.Count); - } - else - { - logger.LogWarning("Chunk {ChunkIndex}: Failed to generate embedding after retry", i); - chunk.Embedding = new List(); // Empty but not null - } - } - - if (chunk.CreatedAt == default) - chunk.CreatedAt = DateTime.UtcNow; - } - catch (Exception ex) - { - logger.LogError(ex, "Chunk {ChunkIndex}: Failed to process", i); - // If embedding generation fails, leave it empty and continue - document.Chunks[i].Embedding = new List(); // Empty but not null - } + chunk.Embedding = allEmbeddings[i]; + ServiceLogMessages.LogChunkEmbeddingSuccess(logger, i, allEmbeddings[i].Count, null); + } + else + { + // Retry individual embedding generation for this chunk + ServiceLogMessages.LogChunkBatchEmbeddingFailed(logger, i, null); + var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); + + if (individualEmbedding != null && individualEmbedding.Count > 0) + { + chunk.Embedding = individualEmbedding; + ServiceLogMessages.LogChunkIndividualEmbeddingSuccess(logger, i, individualEmbedding.Count, null); + } + else + { + ServiceLogMessages.LogChunkEmbeddingFailed(logger, i, null); + chunk.Embedding = new List(); // Empty but not 
null + } + } + + if (chunk.CreatedAt == default) + chunk.CreatedAt = DateTime.UtcNow; + } + catch (Exception ex) + { + ServiceLogMessages.LogChunkProcessingFailed(logger, i, ex); + // If embedding generation fails, leave it empty and continue + document.Chunks[i].Embedding = new List(); // Empty but not null + } } var savedDocument = await documentRepository.AddAsync(document); @@ -122,11 +122,11 @@ public async Task> UploadDocumentsAsync(IEnumerable fileS { return await UploadDocumentAsync(stream, nameList[index], typeList[index], uploadedBy); } - catch (Exception ex) - { - logger.LogWarning(ex, "Failed to upload document {FileName}", nameList[index]); - return null; - } + catch (Exception ex) + { + ServiceLogMessages.LogDocumentUploadFailed(logger, nameList[index], ex); + return null; + } }); var uploadResults = await Task.WhenAll(uploadTasks); @@ -157,11 +157,11 @@ public Task> GetStorageStatisticsAsync() public async Task RegenerateAllEmbeddingsAsync() { - try - { - logger.LogInformation("Starting embedding regeneration for all documents..."); - - var allDocuments = await documentRepository.GetAllAsync(); + try + { + ServiceLogMessages.LogEmbeddingRegenerationStarted(logger, null); + + var allDocuments = await documentRepository.GetAllAsync(); var totalChunks = allDocuments.Sum(d => d.Chunks.Count); var processedChunks = 0; var successCount = 0; @@ -172,37 +172,35 @@ public async Task RegenerateAllEmbeddingsAsync() foreach (var document in allDocuments) { - logger.LogInformation("Document: {FileName} ({ChunkCount} chunks)", - document.FileName, document.Chunks.Count); - - foreach (var chunk in document.Chunks) - { - // Skip if embedding already exists and is valid - if (chunk.Embedding != null && chunk.Embedding.Count > 0) - { - processedChunks++; - continue; - } - - chunksToProcess.Add(chunk); - documentChunkMap[chunk] = document; - } - } - - logger.LogInformation("Total chunks to process: {ProcessCount} out of {TotalChunks}", - chunksToProcess.Count, totalChunks); - - if (chunksToProcess.Count == 0) - { - logger.LogInformation("All chunks already have valid embeddings. 
No processing needed."); - return true; - } - - // Process chunks in batches of 128 (VoyageAI max batch size) - const int batchSize = 128; - var totalBatches = (int)Math.Ceiling((double)chunksToProcess.Count / batchSize); - - logger.LogInformation("Processing in {TotalBatches} batches of {BatchSize} chunks", totalBatches, batchSize); + ServiceLogMessages.LogDocumentProcessing(logger, document.FileName, document.Chunks.Count, null); + + foreach (var chunk in document.Chunks) + { + // Skip if embedding already exists and is valid + if (chunk.Embedding != null && chunk.Embedding.Count > 0) + { + processedChunks++; + continue; + } + + chunksToProcess.Add(chunk); + documentChunkMap[chunk] = document; + } + } + + ServiceLogMessages.LogTotalChunksToProcess(logger, chunksToProcess.Count, totalChunks, null); + + if (chunksToProcess.Count == 0) + { + ServiceLogMessages.LogNoProcessingNeeded(logger, null); + return true; + } + + // Process chunks in batches of 128 (VoyageAI max batch size) + const int batchSize = 128; + var totalBatches = (int)Math.Ceiling((double)chunksToProcess.Count / batchSize); + + ServiceLogMessages.LogBatchProcessing(logger, totalBatches, batchSize, null); for (int batchIndex = 0; batchIndex < totalBatches; batchIndex++) { @@ -210,86 +208,81 @@ public async Task RegenerateAllEmbeddingsAsync() var endIndex = Math.Min(startIndex + batchSize, chunksToProcess.Count); var currentBatch = chunksToProcess.Skip(startIndex).Take(endIndex - startIndex).ToList(); - logger.LogInformation("Processing batch {BatchNumber}/{TotalBatches}: chunks {StartIndex}-{EndIndex}", - batchIndex + 1, totalBatches, startIndex + 1, endIndex); - - // Generate embeddings for current batch - var batchContents = currentBatch.Select(c => c.Content).ToList(); - var batchEmbeddings = await documentSearchService.GenerateEmbeddingsBatchAsync(batchContents); - - if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) - { - // Apply embeddings to chunks - for (int i = 0; i < currentBatch.Count; i++) - { - var chunk = currentBatch[i]; - var embedding = batchEmbeddings[i]; - - if (embedding != null && embedding.Count > 0) - { - chunk.Embedding = embedding; - successCount++; - logger.LogDebug("Chunk {ChunkId}: Batch embedding successful ({Dimensions} dimensions)", - chunk.Id, embedding.Count); - } - else - { - logger.LogWarning("Chunk {ChunkId}: Batch embedding failed, trying individual generation", chunk.Id); - - // Fallback to individual generation - var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); - if (individualEmbedding != null && individualEmbedding.Count > 0) - { - chunk.Embedding = individualEmbedding; - successCount++; - logger.LogDebug("Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)", - chunk.Id, individualEmbedding.Count); - } - else - { - logger.LogWarning("Chunk {ChunkId}: All embedding methods failed", chunk.Id); - } - } - - processedChunks++; - } - } - else - { - logger.LogWarning("Batch {BatchNumber} failed, processing individually", batchIndex + 1); - - // Process chunks individually if batch fails - foreach (var chunk in currentBatch) - { - try - { - var newEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); - - if (newEmbedding != null && newEmbedding.Count > 0) - { - chunk.Embedding = newEmbedding; - successCount++; - logger.LogDebug("Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)", - chunk.Id, newEmbedding.Count); - } - else - { - 
logger.LogWarning("Chunk {ChunkId}: Failed to generate embedding", chunk.Id); - } - - processedChunks++; - } - catch (Exception ex) - { - logger.LogError(ex, "Chunk {ChunkId}: Failed to regenerate embedding", chunk.Id); - processedChunks++; - } - } - } - - // Progress update - logger.LogInformation("Progress: {ProcessedChunks}/{TotalChunks} chunks processed, {SuccessCount} embeddings generated", - processedChunks, chunksToProcess.Count, successCount); + ServiceLogMessages.LogBatchProgress(logger, batchIndex + 1, totalBatches, startIndex + 1, endIndex, null); + + // Generate embeddings for current batch + var batchContents = currentBatch.Select(c => c.Content).ToList(); + var batchEmbeddings = await documentSearchService.GenerateEmbeddingsBatchAsync(batchContents); + + if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) + { + // Apply embeddings to chunks + for (int i = 0; i < currentBatch.Count; i++) + { + var chunk = currentBatch[i]; + var embedding = batchEmbeddings[i]; + + if (embedding != null && embedding.Count > 0) + { + chunk.Embedding = embedding; + successCount++; + ServiceLogMessages.LogChunkBatchEmbeddingSuccess(logger, chunk.Id, embedding.Count, null); + } + else + { + ServiceLogMessages.LogChunkBatchEmbeddingFailedRetry(logger, chunk.Id, null); + + // Fallback to individual generation + var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); + if (individualEmbedding != null && individualEmbedding.Count > 0) + { + chunk.Embedding = individualEmbedding; + successCount++; + ServiceLogMessages.LogChunkIndividualEmbeddingSuccessRetry(logger, chunk.Id, individualEmbedding.Count, null); + } + else + { + ServiceLogMessages.LogChunkAllEmbeddingMethodsFailed(logger, chunk.Id, null); + } + } + + processedChunks++; + } + } + else + { + ServiceLogMessages.LogBatchFailed(logger, batchIndex + 1, null); + + // Process chunks individually if batch fails + foreach (var chunk in currentBatch) + { + try + { + var newEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); + + if (newEmbedding != null && newEmbedding.Count > 0) + { + chunk.Embedding = newEmbedding; + successCount++; + ServiceLogMessages.LogChunkIndividualEmbeddingSuccessFinal(logger, chunk.Id, newEmbedding.Count, null); + } + else + { + ServiceLogMessages.LogChunkEmbeddingGenerationFailed(logger, chunk.Id, null); + } + + processedChunks++; + } + catch (Exception ex) + { + ServiceLogMessages.LogChunkEmbeddingRegenerationFailed(logger, chunk.Id, ex); + processedChunks++; + } + } + } + + // Progress update + ServiceLogMessages.LogProgress(logger, processedChunks, chunksToProcess.Count, successCount, null); // Smart rate limiting if (batchIndex < totalBatches - 1) // Don't wait after last batch @@ -298,24 +291,23 @@ public async Task RegenerateAllEmbeddingsAsync() } } - // Save all documents with updated embeddings - var documentsToUpdate = documentChunkMap.Values.Distinct().ToList(); - logger.LogInformation("Saving {DocumentCount} documents with updated embeddings...", documentsToUpdate.Count); - - foreach (var document in documentsToUpdate) - { - await documentRepository.DeleteAsync(document.Id); - await documentRepository.AddAsync(document); - } - - logger.LogInformation("Embedding regeneration completed. 
{SuccessCount} embeddings generated for {ProcessedChunks} chunks in {TotalBatches} batches.", - successCount, processedChunks, totalBatches); - return successCount > 0; - } - catch (Exception ex) - { - logger.LogError(ex, "Failed to regenerate embeddings"); - return false; - } + // Save all documents with updated embeddings + var documentsToUpdate = documentChunkMap.Values.Distinct().ToList(); + ServiceLogMessages.LogSavingDocuments(logger, documentsToUpdate.Count, null); + + foreach (var document in documentsToUpdate) + { + await documentRepository.DeleteAsync(document.Id); + await documentRepository.AddAsync(document); + } + + ServiceLogMessages.LogEmbeddingRegenerationCompleted(logger, successCount, processedChunks, totalBatches, null); + return successCount > 0; + } + catch (Exception ex) + { + ServiceLogMessages.LogEmbeddingRegenerationFailed(logger, ex); + return false; + } } } \ No newline at end of file diff --git a/src/SmartRAG/Services/Logging/ServiceLogMessages.cs b/src/SmartRAG/Services/Logging/ServiceLogMessages.cs new file mode 100644 index 0000000..26e94af --- /dev/null +++ b/src/SmartRAG/Services/Logging/ServiceLogMessages.cs @@ -0,0 +1,519 @@ +using Microsoft.Extensions.Logging; + +namespace SmartRAG.Services.Logging; + +/// +/// Centralized LoggerMessage delegates for all services +/// +public static class ServiceLogMessages +{ + #region DocumentService Log Messages + + public static readonly Action LogDocumentUploaded = LoggerMessage.Define( + LogLevel.Information, + new EventId(1001, "DocumentUploaded"), + "Document uploaded successfully: {FileName}"); + + public static readonly Action LogDocumentsUploaded = LoggerMessage.Define( + LogLevel.Information, + new EventId(1002, "DocumentsUploaded"), + "Multiple documents uploaded successfully: {FileName} ({Count} total)"); + + public static readonly Action LogDocumentDeleted = LoggerMessage.Define( + LogLevel.Information, + new EventId(1003, "DocumentDeleted"), + "Document deleted successfully: {FileName}"); + + public static readonly Action LogEmbeddingsRegenerated = LoggerMessage.Define( + LogLevel.Information, + new EventId(1004, "EmbeddingsRegenerated"), + "Embeddings regenerated for document: {FileName}"); + + public static readonly Action LogDocumentNotFound = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1005, "DocumentNotFound"), + "Document not found: {FileName}"); + + public static readonly Action LogDocumentParseError = LoggerMessage.Define( + LogLevel.Error, + new EventId(1006, "DocumentParseError"), + "Error parsing document: {FileName}"); + + public static readonly Action LogEmbeddingGenerationError = LoggerMessage.Define( + LogLevel.Error, + new EventId(1007, "EmbeddingGenerationError"), + "Error generating embeddings for document: {FileName}"); + + public static readonly Action LogDocumentUploadError = LoggerMessage.Define( + LogLevel.Error, + new EventId(1008, "DocumentUploadError"), + "Error uploading document: {FileName}"); + + public static readonly Action LogDocumentDeleteError = LoggerMessage.Define( + LogLevel.Error, + new EventId(1009, "DocumentDeleteError"), + "Error deleting document: {FileName}"); + + public static readonly Action LogEmbeddingsRegenerationError = LoggerMessage.Define( + LogLevel.Error, + new EventId(1010, "EmbeddingsRegenerationError"), + "Error regenerating embeddings for document: {FileName}"); + + public static readonly Action LogEmbeddingsRegenerationStarted = LoggerMessage.Define( + LogLevel.Information, + new EventId(1011, "EmbeddingsRegenerationStarted"), + 
"Started regenerating embeddings for document: {FileName}"); + + public static readonly Action LogEmbeddingsRegenerationCompleted = LoggerMessage.Define( + LogLevel.Information, + new EventId(1012, "EmbeddingsRegenerationCompleted"), + "Completed regenerating embeddings for document: {FileName}"); + + public static readonly Action LogEmbeddingsRegenerationSkipped = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1013, "EmbeddingsRegenerationSkipped"), + "Skipped regenerating embeddings for document: {FileName}"); + + public static readonly Action LogEmbeddingsRegenerationFailed = LoggerMessage.Define( + LogLevel.Error, + new EventId(1014, "EmbeddingsRegenerationFailed"), + "Failed to regenerate embeddings for document: {FileName}"); + + public static readonly Action LogEmbeddingsRegenerationProgress = LoggerMessage.Define( + LogLevel.Debug, + new EventId(1015, "EmbeddingsRegenerationProgress"), + "Embeddings regeneration progress: {FileName}"); + + public static readonly Action LogEmbeddingsRegenerationSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(1016, "EmbeddingsRegenerationSuccess"), + "Embeddings regeneration successful: {FileName}"); + + // Chunk processing log messages + public static readonly Action LogChunkEmbeddingSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(1017, "ChunkEmbeddingSuccess"), + "Chunk {ChunkIndex}: Embedding generated successfully ({Dimensions} dimensions)"); + + public static readonly Action LogChunkBatchEmbeddingFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(1018, "ChunkBatchEmbeddingFailed"), + "Chunk {ChunkIndex}: Batch embedding failed, trying individual generation"); + + public static readonly Action LogChunkIndividualEmbeddingSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(1019, "ChunkIndividualEmbeddingSuccess"), + "Chunk {ChunkIndex}: Individual embedding successful ({Dimensions} dimensions)"); + + public static readonly Action LogChunkEmbeddingFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1020, "ChunkEmbeddingFailed"), + "Chunk {ChunkIndex}: Failed to generate embedding after retry"); + + public static readonly Action LogChunkProcessingFailed = LoggerMessage.Define( + LogLevel.Error, + new EventId(1021, "ChunkProcessingFailed"), + "Chunk {ChunkIndex}: Failed to process"); + + public static readonly Action LogDocumentUploadFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1022, "DocumentUploadFailed"), + "Failed to upload document {FileName}"); + + public static readonly Action LogEmbeddingRegenerationStarted = LoggerMessage.Define( + LogLevel.Information, + new EventId(1023, "EmbeddingRegenerationStarted"), + "Starting embedding regeneration for all documents..."); + + public static readonly Action LogDocumentProcessing = LoggerMessage.Define( + LogLevel.Information, + new EventId(1024, "DocumentProcessing"), + "Document: {FileName} ({ChunkCount} chunks)"); + + public static readonly Action LogTotalChunksToProcess = LoggerMessage.Define( + LogLevel.Information, + new EventId(1025, "TotalChunksToProcess"), + "Total chunks to process: {ProcessCount} out of {TotalChunks}"); + + public static readonly Action LogNoProcessingNeeded = LoggerMessage.Define( + LogLevel.Information, + new EventId(1026, "NoProcessingNeeded"), + "All chunks already have valid embeddings. 
No processing needed."); + + public static readonly Action LogBatchProcessing = LoggerMessage.Define( + LogLevel.Information, + new EventId(1027, "BatchProcessing"), + "Processing in {TotalBatches} batches of {BatchSize} chunks"); + + public static readonly Action LogBatchProgress = LoggerMessage.Define( + LogLevel.Information, + new EventId(1028, "BatchProgress"), + "Processing batch {BatchNumber}/{TotalBatches}: chunks {StartIndex}-{EndIndex}"); + + public static readonly Action LogChunkBatchEmbeddingSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(1029, "ChunkBatchEmbeddingSuccess"), + "Chunk {ChunkId}: Batch embedding successful ({Dimensions} dimensions)"); + + public static readonly Action LogChunkBatchEmbeddingFailedRetry = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1030, "ChunkBatchEmbeddingFailedRetry"), + "Chunk {ChunkId}: Batch embedding failed, trying individual generation"); + + public static readonly Action LogChunkIndividualEmbeddingSuccessRetry = LoggerMessage.Define( + LogLevel.Debug, + new EventId(1031, "ChunkIndividualEmbeddingSuccessRetry"), + "Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)"); + + public static readonly Action LogChunkAllEmbeddingMethodsFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1032, "ChunkAllEmbeddingMethodsFailed"), + "Chunk {ChunkId}: All embedding methods failed"); + + public static readonly Action LogBatchFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1033, "BatchFailed"), + "Batch {BatchNumber} failed, processing individually"); + + public static readonly Action LogChunkIndividualEmbeddingSuccessFinal = LoggerMessage.Define( + LogLevel.Debug, + new EventId(1034, "ChunkIndividualEmbeddingSuccessFinal"), + "Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)"); + + public static readonly Action LogChunkEmbeddingGenerationFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1035, "ChunkEmbeddingGenerationFailed"), + "Chunk {ChunkId}: Failed to generate embedding"); + + public static readonly Action LogChunkEmbeddingRegenerationFailed = LoggerMessage.Define( + LogLevel.Error, + new EventId(1036, "ChunkEmbeddingRegenerationFailed"), + "Chunk {ChunkId}: Failed to regenerate embedding"); + + public static readonly Action LogProgress = LoggerMessage.Define( + LogLevel.Information, + new EventId(1037, "Progress"), + "Progress: {ProcessedChunks}/{TotalChunks} chunks processed, {SuccessCount} embeddings generated"); + + public static readonly Action LogSavingDocuments = LoggerMessage.Define( + LogLevel.Information, + new EventId(1038, "SavingDocuments"), + "Saving {DocumentCount} documents with updated embeddings..."); + + public static readonly Action LogEmbeddingRegenerationCompleted = LoggerMessage.Define( + LogLevel.Information, + new EventId(1039, "EmbeddingRegenerationCompleted"), + "Embedding regeneration completed. 
{SuccessCount} embeddings generated for {ProcessedChunks} chunks in {TotalBatches} batches."); + + public static readonly Action LogEmbeddingRegenerationFailed = LoggerMessage.Define( + LogLevel.Error, + new EventId(1040, "EmbeddingRegenerationFailed"), + "Failed to regenerate embeddings"); + + #endregion + + #region DocumentSearchService Log Messages + + public static readonly Action LogSearchResults = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2001, "SearchResults"), + "Search returned {ChunkCount} chunks from {DocumentCount} documents"); + + public static readonly Action LogDiverseResults = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2002, "DiverseResults"), + "Final diverse results: {ResultCount} chunks from {DocumentCount} documents"); + + public static readonly Action LogGeneralConversationQuery = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2003, "GeneralConversationQuery"), + "Detected general conversation query, handling without document search"); + + public static readonly Action LogPrimaryAIServiceAttempt = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2004, "PrimaryAIServiceAttempt"), + "Trying primary AI service for embedding generation"); + + public static readonly Action LogPrimaryAIServiceSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2005, "PrimaryAIServiceSuccess"), + "Primary AI service successful: {Dimensions} dimensions"); + + public static readonly Action LogPrimaryAIServiceNull = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2006, "PrimaryAIServiceNull"), + "Primary AI service returned null or empty embedding"); + + public static readonly Action LogPrimaryAIServiceFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2007, "PrimaryAIServiceFailed"), + "Primary AI service failed"); + + public static readonly Action LogFallbackProviderAttempt = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2008, "FallbackProviderAttempt"), + "Trying fallback provider: {Provider}"); + + public static readonly Action LogFallbackProviderSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2009, "FallbackProviderSuccess"), + "Fallback provider {Provider} successful: {Dimensions} dimensions"); + + public static readonly Action LogFallbackProviderFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2010, "FallbackProviderFailed"), + "Fallback provider {Provider} failed"); + + public static readonly Action LogAllProvidersFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(2011, "AllProvidersFailed"), + "All embedding providers failed"); + + public static readonly Action LogBatchEmbeddingGeneration = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2012, "BatchEmbeddingGeneration"), + "Generating embeddings for {TextCount} texts in batch"); + + public static readonly Action LogBatchEmbeddingSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2013, "BatchEmbeddingSuccess"), + "Batch embedding successful: {TextCount} embeddings generated"); + + public static readonly Action LogBatchEmbeddingPartial = LoggerMessage.Define( + LogLevel.Warning, + new EventId(2014, "BatchEmbeddingPartial"), + "Batch embedding partially successful: {TextCount} embeddings generated"); + + public static readonly Action LogBatchEmbeddingFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(2015, "BatchEmbeddingFailed"), + "Batch embedding failed, falling back to individual generation"); + + public static readonly Action LogIndividualEmbeddingGeneration = LoggerMessage.Define( + 
LogLevel.Debug, + new EventId(2016, "IndividualEmbeddingGeneration"), + "Generating individual embeddings for {TextCount} texts"); + + public static readonly Action LogIndividualEmbeddingSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2017, "IndividualEmbeddingSuccess"), + "Individual embedding successful: {TextCount} embeddings generated"); + + public static readonly Action LogQueryIntentDetection = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2018, "QueryIntentDetection"), + "Analyzing query intent for: {Query}"); + + public static readonly Action LogQueryIntentResult = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2019, "QueryIntentResult"), + "Query intent detected as: {Intent}"); + + public static readonly Action LogGeneralConversationHandling = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2020, "GeneralConversationHandling"), + "Handling general conversation query: {Query}"); + + public static readonly Action LogGeneralConversationResponse = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2021, "GeneralConversationResponse"), + "General conversation response generated: {Response}"); + + public static readonly Action LogBasicSearchQuery = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2022, "BasicSearchQuery"), + "Performing basic search for query: {Query}"); + + public static readonly Action LogBasicSearchResults = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2023, "BasicSearchResults"), + "Basic search returned {ChunkCount} chunks"); + + public static readonly Action LogBasicRagQuery = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2024, "BasicRagQuery"), + "Generating basic RAG answer for query: {Query}"); + + public static readonly Action LogBasicRagResults = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2025, "BasicRagResults"), + "Basic RAG generated answer with {SourceCount} sources"); + + public static readonly Action LogVoyageAIBatchAttempt = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2026, "VoyageAIBatchAttempt"), + "Attempting VoyageAI batch embedding for {TextCount} texts"); + + public static readonly Action LogVoyageAIBatchSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2027, "VoyageAIBatchSuccess"), + "VoyageAI batch embedding successful: {TextCount} embeddings"); + + public static readonly Action LogVoyageAIBatchFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(2028, "VoyageAIBatchFailed"), + "VoyageAI batch embedding failed"); + + public static readonly Action LogIndividualEmbeddingAttempt = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2029, "IndividualEmbeddingAttempt"), + "Attempting individual embedding for {TextCount} texts"); + + public static readonly Action LogIndividualEmbeddingAttemptSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2030, "IndividualEmbeddingAttemptSuccess"), + "Individual embedding attempt successful: {TextCount} embeddings"); + + public static readonly Action LogIndividualEmbeddingAttemptFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(2031, "IndividualEmbeddingAttemptFailed"), + "Individual embedding attempt failed"); + + // Additional logging delegates for remaining calls + public static readonly Action LogSearchInDocuments = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2032, "SearchInDocuments"), + "PerformBasicSearchAsync: Searching in {DocumentCount} documents with {ChunkCount} chunks"); + + public static readonly Action LogEmbeddingSearchSuccessful = 
LoggerMessage.Define( + LogLevel.Debug, + new EventId(2033, "EmbeddingSearchSuccessful"), + "PerformBasicSearchAsync: Embedding search successful, found {ChunkCount} chunks"); + + public static readonly Action LogEmbeddingSearchFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2034, "EmbeddingSearchFailed"), + "PerformBasicSearchAsync: Embedding search failed, using keyword search"); + + public static readonly Action LogQueryWords = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2035, "QueryWords"), + "PerformBasicSearchAsync: Query words: [{QueryWords}]"); + + public static readonly Action LogPotentialNames = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2036, "PotentialNames"), + "PerformBasicSearchAsync: Potential names: [{PotentialNames}]"); + + public static readonly Action LogFullNameMatch = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2037, "FullNameMatch"), + "PerformBasicSearchAsync: Found FULL NAME match: '{FullName}' in chunk: {ChunkPreview}..."); + + public static readonly Action LogPartialNameMatches = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2038, "PartialNameMatches"), + "PerformBasicSearchAsync: Found PARTIAL name matches: [{FoundNames}] in chunk: {ChunkPreview}..."); + + public static readonly Action LogRelevantChunksFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2039, "RelevantChunksFound"), + "PerformBasicSearchAsync: Found {ChunkCount} relevant chunks with enhanced search"); + + public static readonly Action LogNameChunksFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2040, "NameChunksFound"), + "PerformBasicSearchAsync: Found {NameChunkCount} chunks containing names, prioritizing them"); + + public static readonly Action LogNoVoyageAIKey = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2041, "NoVoyageAIKey"), + "Embedding search: No VoyageAI API key found"); + + public static readonly Action LogFailedQueryEmbedding = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2042, "FailedQueryEmbedding"), + "Embedding search: Failed to generate query embedding"); + + public static readonly Action LogChunksContainingQueryTerms = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2043, "ChunksContainingQueryTerms"), + "Embedding search: Found {ChunkCount} chunks containing query terms"); + + public static readonly Action LogNoChunksContainQueryTerms = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2044, "NoChunksContainQueryTerms"), + "Embedding search: No chunks contain query terms, using similarity only"); + + public static readonly Action LogEmbeddingSearchFailedError = LoggerMessage.Define( + LogLevel.Error, + new EventId(2045, "EmbeddingSearchFailedError"), + "Embedding search failed"); + + public static readonly Action LogRateLimitedRetry = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2046, "RateLimitedRetry"), + "Embedding generation rate limited, retrying in {Delay}ms (attempt {Attempt}/{MaxRetries})"); + + public static readonly Action LogRateLimitedAfterAttempts = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2047, "RateLimitedAfterAttempts"), + "Embedding generation rate limited after {MaxRetries} attempts"); + + public static readonly Action LogProviderAttempt = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2048, "ProviderAttempt"), + "Trying {Provider} provider for embedding generation"); + + public static readonly Action LogProviderConfigFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2049, "ProviderConfigFound"), + "{Provider} 
config found, API key: {ApiKeyPreview}..."); + + public static readonly Action LogProviderSuccessful = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2050, "ProviderSuccessful"), + "{Provider} successful: {Dimensions} dimensions"); + + public static readonly Action LogProviderReturnedNull = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2051, "ProviderReturnedNull"), + "{Provider} returned null or empty embedding"); + + public static readonly Action LogProviderConfigNotFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2052, "ProviderConfigNotFound"), + "{Provider} config not found or API key missing"); + + public static readonly Action LogProviderFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2053, "ProviderFailed"), + "{Provider} provider failed"); + + public static readonly Action LogAllProvidersFailedText = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2054, "AllProvidersFailedText"), + "All embedding providers failed for text: {TextPreview}..."); + + public static readonly Action LogTestingVoyageAI = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2055, "TestingVoyageAI"), + "Testing VoyageAI directly with key: {ApiKeyPreview}..."); + + public static readonly Action LogVoyageAITestResponse = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2056, "VoyageAITestResponse"), + "VoyageAI test response: {StatusCode} - {Response}"); + + public static readonly Action LogVoyageAIWorking = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2057, "VoyageAIWorking"), + "VoyageAI is working! Trying to parse embedding..."); + + public static readonly Action LogVoyageAITestEmbedding = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2058, "VoyageAITestEmbedding"), + "VoyageAI test embedding generated: {Dimensions} dimensions"); + + public static readonly Action LogFailedParseVoyageAI = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2059, "FailedParseVoyageAI"), + "Failed to parse VoyageAI response"); + + public static readonly Action LogVoyageAIDirectTestFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2060, "VoyageAIDirectTestFailed"), + "VoyageAI direct test failed"); + + #endregion +} From 034bbe1b41951d753132311aaf1d583489c8b2dc Mon Sep 17 00:00:00 2001 From: Baris Yerlikaya Date: Mon, 18 Aug 2025 14:13:49 +0300 Subject: [PATCH 6/8] feat: Simplify and optimize logging architecture - Streamline ServiceLogMessages to essential log messages only - Fix all parameter mismatches and build errors - Achieve 0 errors, 0 warnings, 0 messages - Maintain performance benefits with LoggerMessage delegates - Follow SOLID and DRY principles for clean, maintainable code --- .../Services/DocumentSearchService.cs | 2 +- src/SmartRAG/Services/DocumentService.cs | 8 +- .../Services/Logging/ServiceLogMessages.cs | 561 +++++++----------- 3 files changed, 206 insertions(+), 365 deletions(-) diff --git a/src/SmartRAG/Services/DocumentSearchService.cs b/src/SmartRAG/Services/DocumentSearchService.cs index 37f6553..b87a369 100644 --- a/src/SmartRAG/Services/DocumentSearchService.cs +++ b/src/SmartRAG/Services/DocumentSearchService.cs @@ -566,7 +566,7 @@ private async Task> TryEmbeddingBasedSearchAsync(string quer if (attempt < maxRetries - 1) { var delay = retryDelayMs * (int)Math.Pow(2, attempt); - ServiceLogMessages.LogRateLimitedRetry(logger, delay, attempt + 1, null); + ServiceLogMessages.LogRateLimitedRetry(logger, delay, attempt + 1, maxRetries, null); await Task.Delay(delay); } else diff --git 
a/src/SmartRAG/Services/DocumentService.cs b/src/SmartRAG/Services/DocumentService.cs index 1b2ff51..9264e6e 100644 --- a/src/SmartRAG/Services/DocumentService.cs +++ b/src/SmartRAG/Services/DocumentService.cs @@ -200,7 +200,7 @@ public async Task RegenerateAllEmbeddingsAsync() const int batchSize = 128; var totalBatches = (int)Math.Ceiling((double)chunksToProcess.Count / batchSize); - ServiceLogMessages.LogBatchProcessing(logger, totalBatches, batchSize, null); + ServiceLogMessages.LogBatchProcessing(logger, totalBatches, null); for (int batchIndex = 0; batchIndex < totalBatches; batchIndex++) { @@ -208,7 +208,7 @@ public async Task RegenerateAllEmbeddingsAsync() var endIndex = Math.Min(startIndex + batchSize, chunksToProcess.Count); var currentBatch = chunksToProcess.Skip(startIndex).Take(endIndex - startIndex).ToList(); - ServiceLogMessages.LogBatchProgress(logger, batchIndex + 1, totalBatches, startIndex + 1, endIndex, null); + ServiceLogMessages.LogBatchProgress(logger, batchIndex + 1, totalBatches, null); // Generate embeddings for current batch var batchContents = currentBatch.Select(c => c.Content).ToList(); @@ -226,7 +226,7 @@ public async Task RegenerateAllEmbeddingsAsync() { chunk.Embedding = embedding; successCount++; - ServiceLogMessages.LogChunkBatchEmbeddingSuccess(logger, chunk.Id, embedding.Count, null); + ServiceLogMessages.LogChunkBatchEmbeddingSuccess(logger, i, embedding.Count, null); } else { @@ -301,7 +301,7 @@ public async Task RegenerateAllEmbeddingsAsync() await documentRepository.AddAsync(document); } - ServiceLogMessages.LogEmbeddingRegenerationCompleted(logger, successCount, processedChunks, totalBatches, null); + ServiceLogMessages.LogEmbeddingRegenerationCompleted(logger, successCount, processedChunks, null); return successCount > 0; } catch (Exception ex) diff --git a/src/SmartRAG/Services/Logging/ServiceLogMessages.cs b/src/SmartRAG/Services/Logging/ServiceLogMessages.cs index 26e94af..f9902eb 100644 --- a/src/SmartRAG/Services/Logging/ServiceLogMessages.cs +++ b/src/SmartRAG/Services/Logging/ServiceLogMessages.cs @@ -3,517 +3,358 @@ namespace SmartRAG.Services.Logging; /// -/// Centralized LoggerMessage delegates for all services +/// Centralized LoggerMessage delegates for performance optimization /// public static class ServiceLogMessages { - #region DocumentService Log Messages + #region Document Operations public static readonly Action LogDocumentUploaded = LoggerMessage.Define( LogLevel.Information, new EventId(1001, "DocumentUploaded"), "Document uploaded successfully: {FileName}"); - public static readonly Action LogDocumentsUploaded = LoggerMessage.Define( - LogLevel.Information, - new EventId(1002, "DocumentsUploaded"), - "Multiple documents uploaded successfully: {FileName} ({Count} total)"); + public static readonly Action LogDocumentUploadFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1002, "DocumentUploadFailed"), + "Failed to upload document: {FileName}"); public static readonly Action LogDocumentDeleted = LoggerMessage.Define( LogLevel.Information, new EventId(1003, "DocumentDeleted"), - "Document deleted successfully: {FileName}"); - - public static readonly Action LogEmbeddingsRegenerated = LoggerMessage.Define( - LogLevel.Information, - new EventId(1004, "EmbeddingsRegenerated"), - "Embeddings regenerated for document: {FileName}"); - - public static readonly Action LogDocumentNotFound = LoggerMessage.Define( - LogLevel.Warning, - new EventId(1005, "DocumentNotFound"), - "Document not found: {FileName}"); - - public 
static readonly Action LogDocumentParseError = LoggerMessage.Define( - LogLevel.Error, - new EventId(1006, "DocumentParseError"), - "Error parsing document: {FileName}"); - - public static readonly Action LogEmbeddingGenerationError = LoggerMessage.Define( - LogLevel.Error, - new EventId(1007, "EmbeddingGenerationError"), - "Error generating embeddings for document: {FileName}"); - - public static readonly Action LogDocumentUploadError = LoggerMessage.Define( - LogLevel.Error, - new EventId(1008, "DocumentUploadError"), - "Error uploading document: {FileName}"); - - public static readonly Action LogDocumentDeleteError = LoggerMessage.Define( - LogLevel.Error, - new EventId(1009, "DocumentDeleteError"), - "Error deleting document: {FileName}"); - - public static readonly Action LogEmbeddingsRegenerationError = LoggerMessage.Define( - LogLevel.Error, - new EventId(1010, "EmbeddingsRegenerationError"), - "Error regenerating embeddings for document: {FileName}"); + "Document deleted: {FileName}"); - public static readonly Action LogEmbeddingsRegenerationStarted = LoggerMessage.Define( - LogLevel.Information, - new EventId(1011, "EmbeddingsRegenerationStarted"), - "Started regenerating embeddings for document: {FileName}"); - - public static readonly Action LogEmbeddingsRegenerationCompleted = LoggerMessage.Define( - LogLevel.Information, - new EventId(1012, "EmbeddingsRegenerationCompleted"), - "Completed regenerating embeddings for document: {FileName}"); - - public static readonly Action LogEmbeddingsRegenerationSkipped = LoggerMessage.Define( - LogLevel.Warning, - new EventId(1013, "EmbeddingsRegenerationSkipped"), - "Skipped regenerating embeddings for document: {FileName}"); - - public static readonly Action LogEmbeddingsRegenerationFailed = LoggerMessage.Define( - LogLevel.Error, - new EventId(1014, "EmbeddingsRegenerationFailed"), - "Failed to regenerate embeddings for document: {FileName}"); - - public static readonly Action LogEmbeddingsRegenerationProgress = LoggerMessage.Define( - LogLevel.Debug, - new EventId(1015, "EmbeddingsRegenerationProgress"), - "Embeddings regeneration progress: {FileName}"); + #endregion - public static readonly Action LogEmbeddingsRegenerationSuccess = LoggerMessage.Define( - LogLevel.Debug, - new EventId(1016, "EmbeddingsRegenerationSuccess"), - "Embeddings regeneration successful: {FileName}"); + #region Embedding Operations - // Chunk processing log messages public static readonly Action LogChunkEmbeddingSuccess = LoggerMessage.Define( LogLevel.Debug, - new EventId(1017, "ChunkEmbeddingSuccess"), - "Chunk {ChunkIndex}: Embedding generated successfully ({Dimensions} dimensions)"); - - public static readonly Action LogChunkBatchEmbeddingFailed = LoggerMessage.Define( - LogLevel.Debug, - new EventId(1018, "ChunkBatchEmbeddingFailed"), - "Chunk {ChunkIndex}: Batch embedding failed, trying individual generation"); - - public static readonly Action LogChunkIndividualEmbeddingSuccess = LoggerMessage.Define( - LogLevel.Debug, - new EventId(1019, "ChunkIndividualEmbeddingSuccess"), - "Chunk {ChunkIndex}: Individual embedding successful ({Dimensions} dimensions)"); + new EventId(2001, "ChunkEmbeddingSuccess"), + "Chunk {Index}: Embedding generated ({Dimensions} dimensions)"); public static readonly Action LogChunkEmbeddingFailed = LoggerMessage.Define( LogLevel.Warning, - new EventId(1020, "ChunkEmbeddingFailed"), - "Chunk {ChunkIndex}: Failed to generate embedding after retry"); + new EventId(2002, "ChunkEmbeddingFailed"), + "Chunk {Index}: Failed to generate 
embedding"); public static readonly Action LogChunkProcessingFailed = LoggerMessage.Define( LogLevel.Error, - new EventId(1021, "ChunkProcessingFailed"), - "Chunk {ChunkIndex}: Failed to process"); - - public static readonly Action LogDocumentUploadFailed = LoggerMessage.Define( - LogLevel.Warning, - new EventId(1022, "DocumentUploadFailed"), - "Failed to upload document {FileName}"); - - public static readonly Action LogEmbeddingRegenerationStarted = LoggerMessage.Define( - LogLevel.Information, - new EventId(1023, "EmbeddingRegenerationStarted"), - "Starting embedding regeneration for all documents..."); - - public static readonly Action LogDocumentProcessing = LoggerMessage.Define( - LogLevel.Information, - new EventId(1024, "DocumentProcessing"), - "Document: {FileName} ({ChunkCount} chunks)"); - - public static readonly Action LogTotalChunksToProcess = LoggerMessage.Define( - LogLevel.Information, - new EventId(1025, "TotalChunksToProcess"), - "Total chunks to process: {ProcessCount} out of {TotalChunks}"); - - public static readonly Action LogNoProcessingNeeded = LoggerMessage.Define( - LogLevel.Information, - new EventId(1026, "NoProcessingNeeded"), - "All chunks already have valid embeddings. No processing needed."); - - public static readonly Action LogBatchProcessing = LoggerMessage.Define( - LogLevel.Information, - new EventId(1027, "BatchProcessing"), - "Processing in {TotalBatches} batches of {BatchSize} chunks"); - - public static readonly Action LogBatchProgress = LoggerMessage.Define( - LogLevel.Information, - new EventId(1028, "BatchProgress"), - "Processing batch {BatchNumber}/{TotalBatches}: chunks {StartIndex}-{EndIndex}"); + new EventId(2003, "ChunkProcessingFailed"), + "Chunk {Index}: Failed to process"); - public static readonly Action LogChunkBatchEmbeddingSuccess = LoggerMessage.Define( + public static readonly Action LogChunkBatchEmbeddingSuccess = LoggerMessage.Define( LogLevel.Debug, - new EventId(1029, "ChunkBatchEmbeddingSuccess"), - "Chunk {ChunkId}: Batch embedding successful ({Dimensions} dimensions)"); - - public static readonly Action LogChunkBatchEmbeddingFailedRetry = LoggerMessage.Define( - LogLevel.Warning, - new EventId(1030, "ChunkBatchEmbeddingFailedRetry"), - "Chunk {ChunkId}: Batch embedding failed, trying individual generation"); + new EventId(2004, "ChunkBatchEmbeddingSuccess"), + "Chunk {Index}: Batch embedding successful ({Dimensions} dimensions)"); - public static readonly Action LogChunkIndividualEmbeddingSuccessRetry = LoggerMessage.Define( + public static readonly Action LogChunkBatchEmbeddingFailed = LoggerMessage.Define( LogLevel.Debug, - new EventId(1031, "ChunkIndividualEmbeddingSuccessRetry"), - "Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)"); - - public static readonly Action LogChunkAllEmbeddingMethodsFailed = LoggerMessage.Define( - LogLevel.Warning, - new EventId(1032, "ChunkAllEmbeddingMethodsFailed"), - "Chunk {ChunkId}: All embedding methods failed"); + new EventId(2005, "ChunkBatchEmbeddingFailed"), + "Chunk {Index}: Batch embedding failed, trying individual generation"); - public static readonly Action LogBatchFailed = LoggerMessage.Define( - LogLevel.Warning, - new EventId(1033, "BatchFailed"), - "Batch {BatchNumber} failed, processing individually"); - - public static readonly Action LogChunkIndividualEmbeddingSuccessFinal = LoggerMessage.Define( + public static readonly Action LogChunkIndividualEmbeddingSuccess = LoggerMessage.Define( LogLevel.Debug, - new EventId(1034, 
"ChunkIndividualEmbeddingSuccessFinal"), - "Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)"); - - public static readonly Action LogChunkEmbeddingGenerationFailed = LoggerMessage.Define( - LogLevel.Warning, - new EventId(1035, "ChunkEmbeddingGenerationFailed"), - "Chunk {ChunkId}: Failed to generate embedding"); - - public static readonly Action LogChunkEmbeddingRegenerationFailed = LoggerMessage.Define( - LogLevel.Error, - new EventId(1036, "ChunkEmbeddingRegenerationFailed"), - "Chunk {ChunkId}: Failed to regenerate embedding"); + new EventId(2006, "ChunkIndividualEmbeddingSuccess"), + "Chunk {Index}: Individual embedding successful ({Dimensions} dimensions)"); - public static readonly Action LogProgress = LoggerMessage.Define( - LogLevel.Information, - new EventId(1037, "Progress"), - "Progress: {ProcessedChunks}/{TotalChunks} chunks processed, {SuccessCount} embeddings generated"); - - public static readonly Action LogSavingDocuments = LoggerMessage.Define( + public static readonly Action LogEmbeddingRegenerationStarted = LoggerMessage.Define( LogLevel.Information, - new EventId(1038, "SavingDocuments"), - "Saving {DocumentCount} documents with updated embeddings..."); + new EventId(2007, "EmbeddingRegenerationStarted"), + "Starting embedding regeneration for all documents"); - public static readonly Action LogEmbeddingRegenerationCompleted = LoggerMessage.Define( + public static readonly Action LogEmbeddingRegenerationCompleted = LoggerMessage.Define( LogLevel.Information, - new EventId(1039, "EmbeddingRegenerationCompleted"), - "Embedding regeneration completed. {SuccessCount} embeddings generated for {ProcessedChunks} chunks in {TotalBatches} batches."); + new EventId(2008, "EmbeddingRegenerationCompleted"), + "Embedding regeneration completed: {SuccessCount}/{TotalCount} chunks"); public static readonly Action LogEmbeddingRegenerationFailed = LoggerMessage.Define( LogLevel.Error, - new EventId(1040, "EmbeddingRegenerationFailed"), + new EventId(2009, "EmbeddingRegenerationFailed"), "Failed to regenerate embeddings"); #endregion - #region DocumentSearchService Log Messages + #region Search Operations public static readonly Action LogSearchResults = LoggerMessage.Define( LogLevel.Debug, - new EventId(2001, "SearchResults"), + new EventId(3001, "SearchResults"), "Search returned {ChunkCount} chunks from {DocumentCount} documents"); public static readonly Action LogDiverseResults = LoggerMessage.Define( LogLevel.Debug, - new EventId(2002, "DiverseResults"), + new EventId(3002, "DiverseResults"), "Final diverse results: {ResultCount} chunks from {DocumentCount} documents"); public static readonly Action LogGeneralConversationQuery = LoggerMessage.Define( LogLevel.Debug, - new EventId(2003, "GeneralConversationQuery"), + new EventId(3003, "GeneralConversationQuery"), "Detected general conversation query, handling without document search"); - public static readonly Action LogPrimaryAIServiceAttempt = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2004, "PrimaryAIServiceAttempt"), - "Trying primary AI service for embedding generation"); - - public static readonly Action LogPrimaryAIServiceSuccess = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2005, "PrimaryAIServiceSuccess"), - "Primary AI service successful: {Dimensions} dimensions"); - - public static readonly Action LogPrimaryAIServiceNull = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2006, "PrimaryAIServiceNull"), - "Primary AI service returned null or empty embedding"); - - public 
static readonly Action LogPrimaryAIServiceFailed = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2007, "PrimaryAIServiceFailed"), - "Primary AI service failed"); - - public static readonly Action LogFallbackProviderAttempt = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2008, "FallbackProviderAttempt"), - "Trying fallback provider: {Provider}"); - - public static readonly Action LogFallbackProviderSuccess = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2009, "FallbackProviderSuccess"), - "Fallback provider {Provider} successful: {Dimensions} dimensions"); - - public static readonly Action LogFallbackProviderFailed = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2010, "FallbackProviderFailed"), - "Fallback provider {Provider} failed"); - - public static readonly Action LogAllProvidersFailed = LoggerMessage.Define( - LogLevel.Warning, - new EventId(2011, "AllProvidersFailed"), - "All embedding providers failed"); - - public static readonly Action LogBatchEmbeddingGeneration = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2012, "BatchEmbeddingGeneration"), - "Generating embeddings for {TextCount} texts in batch"); - - public static readonly Action LogBatchEmbeddingSuccess = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2013, "BatchEmbeddingSuccess"), - "Batch embedding successful: {TextCount} embeddings generated"); - - public static readonly Action LogBatchEmbeddingPartial = LoggerMessage.Define( - LogLevel.Warning, - new EventId(2014, "BatchEmbeddingPartial"), - "Batch embedding partially successful: {TextCount} embeddings generated"); - - public static readonly Action LogBatchEmbeddingFailed = LoggerMessage.Define( - LogLevel.Warning, - new EventId(2015, "BatchEmbeddingFailed"), - "Batch embedding failed, falling back to individual generation"); - - public static readonly Action LogIndividualEmbeddingGeneration = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2016, "IndividualEmbeddingGeneration"), - "Generating individual embeddings for {TextCount} texts"); - - public static readonly Action LogIndividualEmbeddingSuccess = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2017, "IndividualEmbeddingSuccess"), - "Individual embedding successful: {TextCount} embeddings generated"); - - public static readonly Action LogQueryIntentDetection = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2018, "QueryIntentDetection"), - "Analyzing query intent for: {Query}"); - - public static readonly Action LogQueryIntentResult = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2019, "QueryIntentResult"), - "Query intent detected as: {Intent}"); - - public static readonly Action LogGeneralConversationHandling = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2020, "GeneralConversationHandling"), - "Handling general conversation query: {Query}"); - - public static readonly Action LogGeneralConversationResponse = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2021, "GeneralConversationResponse"), - "General conversation response generated: {Response}"); - - public static readonly Action LogBasicSearchQuery = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2022, "BasicSearchQuery"), - "Performing basic search for query: {Query}"); - - public static readonly Action LogBasicSearchResults = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2023, "BasicSearchResults"), - "Basic search returned {ChunkCount} chunks"); - - public static readonly Action LogBasicRagQuery = LoggerMessage.Define( - LogLevel.Debug, - new 
EventId(2024, "BasicRagQuery"), - "Generating basic RAG answer for query: {Query}"); - - public static readonly Action LogBasicRagResults = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2025, "BasicRagResults"), - "Basic RAG generated answer with {SourceCount} sources"); - - public static readonly Action LogVoyageAIBatchAttempt = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2026, "VoyageAIBatchAttempt"), - "Attempting VoyageAI batch embedding for {TextCount} texts"); - - public static readonly Action LogVoyageAIBatchSuccess = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2027, "VoyageAIBatchSuccess"), - "VoyageAI batch embedding successful: {TextCount} embeddings"); - - public static readonly Action LogVoyageAIBatchFailed = LoggerMessage.Define( - LogLevel.Warning, - new EventId(2028, "VoyageAIBatchFailed"), - "VoyageAI batch embedding failed"); - - public static readonly Action LogIndividualEmbeddingAttempt = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2029, "IndividualEmbeddingAttempt"), - "Attempting individual embedding for {TextCount} texts"); - - public static readonly Action LogIndividualEmbeddingAttemptSuccess = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2030, "IndividualEmbeddingAttemptSuccess"), - "Individual embedding attempt successful: {TextCount} embeddings"); - - public static readonly Action LogIndividualEmbeddingAttemptFailed = LoggerMessage.Define( - LogLevel.Warning, - new EventId(2031, "IndividualEmbeddingAttemptFailed"), - "Individual embedding attempt failed"); - - // Additional logging delegates for remaining calls public static readonly Action LogSearchInDocuments = LoggerMessage.Define( LogLevel.Debug, - new EventId(2032, "SearchInDocuments"), - "PerformBasicSearchAsync: Searching in {DocumentCount} documents with {ChunkCount} chunks"); + new EventId(3004, "SearchInDocuments"), + "Searching in {DocumentCount} documents with {ChunkCount} chunks"); public static readonly Action LogEmbeddingSearchSuccessful = LoggerMessage.Define( LogLevel.Debug, - new EventId(2033, "EmbeddingSearchSuccessful"), - "PerformBasicSearchAsync: Embedding search successful, found {ChunkCount} chunks"); + new EventId(3005, "EmbeddingSearchSuccessful"), + "Embedding search successful, found {ChunkCount} chunks"); public static readonly Action LogEmbeddingSearchFailed = LoggerMessage.Define( LogLevel.Debug, - new EventId(2034, "EmbeddingSearchFailed"), - "PerformBasicSearchAsync: Embedding search failed, using keyword search"); + new EventId(3006, "EmbeddingSearchFailed"), + "Embedding search failed, using keyword search"); public static readonly Action LogQueryWords = LoggerMessage.Define( LogLevel.Debug, - new EventId(2035, "QueryWords"), - "PerformBasicSearchAsync: Query words: [{QueryWords}]"); + new EventId(3007, "QueryWords"), + "Query words: [{QueryWords}]"); public static readonly Action LogPotentialNames = LoggerMessage.Define( LogLevel.Debug, - new EventId(2036, "PotentialNames"), - "PerformBasicSearchAsync: Potential names: [{PotentialNames}]"); + new EventId(3008, "PotentialNames"), + "Potential names: [{PotentialNames}]"); public static readonly Action LogFullNameMatch = LoggerMessage.Define( LogLevel.Debug, - new EventId(2037, "FullNameMatch"), - "PerformBasicSearchAsync: Found FULL NAME match: '{FullName}' in chunk: {ChunkPreview}..."); + new EventId(3009, "FullNameMatch"), + "Found FULL NAME match: '{FullName}' in chunk: {ChunkPreview}..."); public static readonly Action LogPartialNameMatches = LoggerMessage.Define( LogLevel.Debug, 
- new EventId(2038, "PartialNameMatches"), - "PerformBasicSearchAsync: Found PARTIAL name matches: [{FoundNames}] in chunk: {ChunkPreview}..."); + new EventId(3010, "PartialNameMatches"), + "Found PARTIAL name matches: [{FoundNames}] in chunk: {ChunkPreview}..."); public static readonly Action LogRelevantChunksFound = LoggerMessage.Define( LogLevel.Debug, - new EventId(2039, "RelevantChunksFound"), - "PerformBasicSearchAsync: Found {ChunkCount} relevant chunks with enhanced search"); + new EventId(3011, "RelevantChunksFound"), + "Found {ChunkCount} relevant chunks with enhanced search"); public static readonly Action LogNameChunksFound = LoggerMessage.Define( LogLevel.Debug, - new EventId(2040, "NameChunksFound"), - "PerformBasicSearchAsync: Found {NameChunkCount} chunks containing names, prioritizing them"); + new EventId(3012, "NameChunksFound"), + "Found {NameChunkCount} chunks containing names, prioritizing them"); public static readonly Action LogNoVoyageAIKey = LoggerMessage.Define( LogLevel.Debug, - new EventId(2041, "NoVoyageAIKey"), + new EventId(3013, "NoVoyageAIKey"), "Embedding search: No VoyageAI API key found"); public static readonly Action LogFailedQueryEmbedding = LoggerMessage.Define( LogLevel.Debug, - new EventId(2042, "FailedQueryEmbedding"), + new EventId(3014, "FailedQueryEmbedding"), "Embedding search: Failed to generate query embedding"); public static readonly Action LogChunksContainingQueryTerms = LoggerMessage.Define( LogLevel.Debug, - new EventId(2043, "ChunksContainingQueryTerms"), + new EventId(3015, "ChunksContainingQueryTerms"), "Embedding search: Found {ChunkCount} chunks containing query terms"); public static readonly Action LogNoChunksContainQueryTerms = LoggerMessage.Define( LogLevel.Debug, - new EventId(2044, "NoChunksContainQueryTerms"), + new EventId(3016, "NoChunksContainQueryTerms"), "Embedding search: No chunks contain query terms, using similarity only"); public static readonly Action LogEmbeddingSearchFailedError = LoggerMessage.Define( LogLevel.Error, - new EventId(2045, "EmbeddingSearchFailedError"), + new EventId(3017, "EmbeddingSearchFailedError"), "Embedding search failed"); - public static readonly Action LogRateLimitedRetry = LoggerMessage.Define( + public static readonly Action LogRateLimitedRetry = LoggerMessage.Define( LogLevel.Debug, - new EventId(2046, "RateLimitedRetry"), + new EventId(3018, "RateLimitedRetry"), "Embedding generation rate limited, retrying in {Delay}ms (attempt {Attempt}/{MaxRetries})"); public static readonly Action LogRateLimitedAfterAttempts = LoggerMessage.Define( LogLevel.Debug, - new EventId(2047, "RateLimitedAfterAttempts"), + new EventId(3019, "RateLimitedAfterAttempts"), "Embedding generation rate limited after {MaxRetries} attempts"); - public static readonly Action LogProviderAttempt = LoggerMessage.Define( + #endregion + + #region AI Provider Operations + + public static readonly Action LogPrimaryAIServiceAttempt = LoggerMessage.Define( LogLevel.Debug, - new EventId(2048, "ProviderAttempt"), - "Trying {Provider} provider for embedding generation"); + new EventId(4001, "PrimaryAIServiceAttempt"), + "Trying primary AI service for embedding generation"); - public static readonly Action LogProviderConfigFound = LoggerMessage.Define( + public static readonly Action LogPrimaryAIServiceSuccess = LoggerMessage.Define( LogLevel.Debug, - new EventId(2049, "ProviderConfigFound"), - "{Provider} config found, API key: {ApiKeyPreview}..."); + new EventId(4002, "PrimaryAIServiceSuccess"), + "Primary AI service 
successful: {Dimensions} dimensions"); + + public static readonly Action LogPrimaryAIServiceNull = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4003, "PrimaryAIServiceNull"), + "Primary AI service returned null or empty embedding"); + + public static readonly Action LogPrimaryAIServiceFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4004, "PrimaryAIServiceFailed"), + "Primary AI service failed"); + + public static readonly Action LogProviderAttempt = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4005, "ProviderAttempt"), + "Trying {Provider} provider for embedding generation"); public static readonly Action LogProviderSuccessful = LoggerMessage.Define( LogLevel.Debug, - new EventId(2050, "ProviderSuccessful"), + new EventId(4006, "ProviderSuccessful"), "{Provider} successful: {Dimensions} dimensions"); + public static readonly Action LogProviderFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4007, "ProviderFailed"), + "{Provider} provider failed"); + + public static readonly Action LogAllProvidersFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(4008, "AllProvidersFailed"), + "All embedding providers failed"); + + public static readonly Action LogProviderConfigFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4009, "ProviderConfigFound"), + "{Provider} config found, API key: {ApiKeyPreview}..."); + public static readonly Action LogProviderReturnedNull = LoggerMessage.Define( LogLevel.Debug, - new EventId(2051, "ProviderReturnedNull"), + new EventId(4010, "ProviderReturnedNull"), "{Provider} returned null or empty embedding"); public static readonly Action LogProviderConfigNotFound = LoggerMessage.Define( LogLevel.Debug, - new EventId(2052, "ProviderConfigNotFound"), + new EventId(4011, "ProviderConfigNotFound"), "{Provider} config not found or API key missing"); - public static readonly Action LogProviderFailed = LoggerMessage.Define( - LogLevel.Debug, - new EventId(2053, "ProviderFailed"), - "{Provider} provider failed"); - public static readonly Action LogAllProvidersFailedText = LoggerMessage.Define( LogLevel.Debug, - new EventId(2054, "AllProvidersFailedText"), + new EventId(4012, "AllProvidersFailedText"), "All embedding providers failed for text: {TextPreview}..."); public static readonly Action LogTestingVoyageAI = LoggerMessage.Define( LogLevel.Debug, - new EventId(2055, "TestingVoyageAI"), + new EventId(4013, "TestingVoyageAI"), "Testing VoyageAI directly with key: {ApiKeyPreview}..."); public static readonly Action LogVoyageAITestResponse = LoggerMessage.Define( LogLevel.Debug, - new EventId(2056, "VoyageAITestResponse"), + new EventId(4014, "VoyageAITestResponse"), "VoyageAI test response: {StatusCode} - {Response}"); public static readonly Action LogVoyageAIWorking = LoggerMessage.Define( LogLevel.Debug, - new EventId(2057, "VoyageAIWorking"), + new EventId(4015, "VoyageAIWorking"), "VoyageAI is working! 
Trying to parse embedding..."); public static readonly Action LogVoyageAITestEmbedding = LoggerMessage.Define( LogLevel.Debug, - new EventId(2058, "VoyageAITestEmbedding"), + new EventId(4016, "VoyageAITestEmbedding"), "VoyageAI test embedding generated: {Dimensions} dimensions"); public static readonly Action LogFailedParseVoyageAI = LoggerMessage.Define( LogLevel.Debug, - new EventId(2059, "FailedParseVoyageAI"), + new EventId(4017, "FailedParseVoyageAI"), "Failed to parse VoyageAI response"); public static readonly Action LogVoyageAIDirectTestFailed = LoggerMessage.Define( LogLevel.Debug, - new EventId(2060, "VoyageAIDirectTestFailed"), + new EventId(4018, "VoyageAIDirectTestFailed"), "VoyageAI direct test failed"); #endregion + + #region Batch Operations + + public static readonly Action LogBatchProcessing = LoggerMessage.Define( + LogLevel.Information, + new EventId(5001, "BatchProcessing"), + "Processing {BatchSize} chunks in batch"); + + public static readonly Action LogBatchCompleted = LoggerMessage.Define( + LogLevel.Information, + new EventId(5002, "BatchCompleted"), + "Batch completed: {ProcessedCount} chunks processed"); + + public static readonly Action LogBatchProgress = LoggerMessage.Define( + LogLevel.Information, + new EventId(5003, "BatchProgress"), + "Processing batch {BatchNumber}/{TotalBatches}"); + + public static readonly Action LogBatchFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(5004, "BatchFailed"), + "Batch {BatchNumber} failed, processing individually"); + + #endregion + + #region Progress and Status + + public static readonly Action LogProgress = LoggerMessage.Define( + LogLevel.Information, + new EventId(6001, "Progress"), + "Progress: {ProcessedChunks}/{TotalChunks} chunks processed, {SuccessCount} embeddings generated"); + + public static readonly Action LogSavingDocuments = LoggerMessage.Define( + LogLevel.Information, + new EventId(6002, "SavingDocuments"), + "Saving {DocumentCount} documents with updated embeddings"); + + public static readonly Action LogTotalChunksToProcess = LoggerMessage.Define( + LogLevel.Information, + new EventId(6003, "TotalChunksToProcess"), + "Total chunks to process: {ProcessCount} out of {TotalChunks}"); + + public static readonly Action LogNoProcessingNeeded = LoggerMessage.Define( + LogLevel.Information, + new EventId(6004, "NoProcessingNeeded"), + "All chunks already have valid embeddings. 
No processing needed.");
+
+    public static readonly Action<ILogger, int, Exception?> LogIndividualEmbeddingGeneration = LoggerMessage.Define<int>(
+        LogLevel.Debug,
+        new EventId(6005, "IndividualEmbeddingGeneration"),
+        "Generating individual embeddings for {TextCount} texts");
+
+    public static readonly Action<ILogger, string, int, Exception?> LogDocumentProcessing = LoggerMessage.Define<string, int>(
+        LogLevel.Information,
+        new EventId(6006, "DocumentProcessing"),
+        "Document: {FileName} ({ChunkCount} chunks)");
+
+    public static readonly Action<ILogger, Guid, Exception?> LogChunkBatchEmbeddingFailedRetry = LoggerMessage.Define<Guid>(
+        LogLevel.Warning,
+        new EventId(6008, "ChunkBatchEmbeddingFailedRetry"),
+        "Chunk {ChunkId}: Batch embedding failed, trying individual generation");
+
+    public static readonly Action<ILogger, Guid, int, Exception?> LogChunkIndividualEmbeddingSuccessRetry = LoggerMessage.Define<Guid, int>(
+        LogLevel.Debug,
+        new EventId(6009, "ChunkIndividualEmbeddingSuccessRetry"),
+        "Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)");
+
+    public static readonly Action<ILogger, Guid, Exception?> LogChunkAllEmbeddingMethodsFailed = LoggerMessage.Define<Guid>(
+        LogLevel.Warning,
+        new EventId(6010, "ChunkAllEmbeddingMethodsFailed"),
+        "Chunk {ChunkId}: All embedding methods failed");
+
+    public static readonly Action<ILogger, Guid, int, Exception?> LogChunkIndividualEmbeddingSuccessFinal = LoggerMessage.Define<Guid, int>(
+        LogLevel.Debug,
+        new EventId(6011, "ChunkIndividualEmbeddingSuccessFinal"),
+        "Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)");
+
+    public static readonly Action<ILogger, Guid, Exception?> LogChunkEmbeddingGenerationFailed = LoggerMessage.Define<Guid>(
+        LogLevel.Warning,
+        new EventId(6012, "ChunkEmbeddingGenerationFailed"),
+        "Chunk {ChunkId}: Failed to generate embedding");
+
+    public static readonly Action<ILogger, Guid, Exception?> LogChunkEmbeddingRegenerationFailed = LoggerMessage.Define<Guid>(
+        LogLevel.Error,
+        new EventId(6013, "ChunkEmbeddingRegenerationFailed"),
+        "Chunk {ChunkId}: Failed to regenerate embedding");
+
+    #endregion
 }
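The commit above leans on the LoggerMessage pattern for its performance claim. As a quick illustration of why these cached delegates are cheaper than calling the ILogger extension methods directly, here is a minimal standalone sketch; the names are illustrative only and are not part of the patch set:

using Microsoft.Extensions.Logging;

public static class ExampleLogMessages
{
    // Compiled once and reused at every call site. The generic arguments of
    // Define<T1> must match the placeholders of the message template in order.
    private static readonly Action<ILogger, int, Exception?> _chunksFound =
        LoggerMessage.Define<int>(
            LogLevel.Debug,
            new EventId(1, "ChunksFound"),
            "Found {ChunkCount} chunks");

    public static void ChunksFound(ILogger logger, int chunkCount)
        => _chunksFound(logger, chunkCount, null);
}

// Usage: ExampleLogMessages.ChunksFound(logger, 42);
// By contrast, logger.LogDebug("Found {ChunkCount} chunks", 42) re-parses the
// template and boxes 42 into an object[] on every call, even when Debug is off.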
$"Internal server error: {ex.Message}"); } } - /// /// Get a document by ID /// diff --git a/examples/WebAPI/Program.cs b/examples/WebAPI/Program.cs index ecbe2b1..7776241 100644 --- a/examples/WebAPI/Program.cs +++ b/examples/WebAPI/Program.cs @@ -1,10 +1,17 @@ -using Microsoft.Extensions.Logging; using Scalar.AspNetCore; using SmartRAG.Enums; using SmartRAG.Extensions; +using Microsoft.OpenApi.Models; +using SmartRAG.API.Filters; var builder = WebApplication.CreateBuilder(args); +// Configure Kestrel server options for file uploads +builder.WebHost.UseKestrel(options => +{ + options.Limits.MaxRequestBodySize = 100 * 1024 * 1024; // 100 MB +}); + RegisterServices(builder.Services, builder.Configuration); var app = builder.Build(); @@ -26,11 +33,27 @@ static void RegisterServices(IServiceCollection services, IConfiguration configu services.AddControllers(); services.AddEndpointsApiExplorer(); services.AddOpenApi(); + services.AddSwaggerGen(c => + { + c.SwaggerDoc("v1", new OpenApiInfo { Title = "SmartRAG API", Version = "v1" }); + + // Configure multipart file upload for multiple files + c.OperationFilter(); + }); + + // Configure form options for file uploads + services.Configure(options => + { + options.MultipartBodyLengthLimit = 100 * 1024 * 1024; // 100 MB + options.ValueLengthLimit = int.MaxValue; + options.ValueCountLimit = int.MaxValue; + options.KeyLengthLimit = int.MaxValue; + }); // Add SmartRag services with minimal configuration services.UseSmartRag(configuration, storageProvider: StorageProvider.InMemory, // Default: InMemory - aiProvider: AIProvider.OpenAI // Use OpenAI provider + aiProvider: AIProvider.Gemini // Use OpenAI provider ); services.AddCors(options => @@ -50,9 +73,13 @@ static void ConfigureMiddleware(WebApplication app, IWebHostEnvironment environm if (environment.IsDevelopment()) { app.MapOpenApi(); - app.MapScalarApiReference(); + app.MapSwagger(); + app.UseSwaggerUI(); } + // Serve static files for simple upload page + app.UseStaticFiles(); + app.UseHttpsRedirection(); app.UseCors("AllowAll"); app.UseAuthorization(); diff --git a/examples/WebAPI/SmartRAG.API.csproj b/examples/WebAPI/SmartRAG.API.csproj index 53dee48..e63145a 100644 --- a/examples/WebAPI/SmartRAG.API.csproj +++ b/examples/WebAPI/SmartRAG.API.csproj @@ -13,6 +13,7 @@ + diff --git a/src/SmartRAG/Services/DocumentSearchService.cs b/src/SmartRAG/Services/DocumentSearchService.cs index b87a369..1b634a3 100644 --- a/src/SmartRAG/Services/DocumentSearchService.cs +++ b/src/SmartRAG/Services/DocumentSearchService.cs @@ -10,23 +10,23 @@ namespace SmartRAG.Services; - public class DocumentSearchService( - IDocumentRepository documentRepository, - IAIService aiService, - IAIProviderFactory aiProviderFactory, - IConfiguration configuration, - SmartRagOptions options, - ILogger logger) : IDocumentSearchService - { - - /// - /// Sanitizes user input for safe logging by removing newlines and carriage returns. - /// - private static string SanitizeForLog(string input) - { - if (input == null) return string.Empty; - return input.Replace("\r", "").Replace("\n", ""); - } +public class DocumentSearchService( + IDocumentRepository documentRepository, + IAIService aiService, + IAIProviderFactory aiProviderFactory, + IConfiguration configuration, + SmartRagOptions options, + ILogger logger) : IDocumentSearchService +{ + + /// + /// Sanitizes user input for safe logging by removing newlines and carriage returns. 
+ /// + private static string SanitizeForLog(string input) + { + if (input == null) return string.Empty; + return input.Replace("\r", "").Replace("\n", ""); + } public async Task> SearchDocumentsAsync(string query, int maxResults = 5) { if (string.IsNullOrWhiteSpace(query)) @@ -37,12 +37,12 @@ public async Task> SearchDocumentsAsync(string query, int ma if (searchResults.Count > 0) { - ServiceLogMessages.LogSearchResults(logger, searchResults.Count, searchResults.Select(c => c.DocumentId).Distinct().Count(), null); + ServiceLogMessages.LogSearchResults(logger, searchResults.Count, searchResults.Select(c => c.DocumentId).Distinct().Count(), null); - // Apply diversity selection to ensure chunks from different documents - var diverseResults = ApplyDiversityAndSelect(searchResults, maxResults); + // Apply diversity selection to ensure chunks from different documents + var diverseResults = ApplyDiversityAndSelect(searchResults, maxResults); - ServiceLogMessages.LogDiverseResults(logger, diverseResults.Count, diverseResults.Select(c => c.DocumentId).Distinct().Count(), null); + ServiceLogMessages.LogDiverseResults(logger, diverseResults.Count, diverseResults.Select(c => c.DocumentId).Distinct().Count(), null); return diverseResults; } @@ -55,11 +55,11 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul if (string.IsNullOrWhiteSpace(query)) throw new ArgumentException("Query cannot be empty", nameof(query)); - // Check if this is a general conversation query - if (IsGeneralConversationQuery(query)) - { - ServiceLogMessages.LogGeneralConversationQuery(logger, null); - var chatResponse = await HandleGeneralConversationAsync(query); + // Check if this is a general conversation query + if (IsGeneralConversationQuery(query)) + { + ServiceLogMessages.LogGeneralConversationQuery(logger, null); + var chatResponse = await HandleGeneralConversationAsync(query); return new RagResponse { Answer = chatResponse, @@ -75,21 +75,21 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul public async Task?> GenerateEmbeddingWithFallbackAsync(string text) { - try - { - ServiceLogMessages.LogPrimaryAIServiceAttempt(logger, null); - var result = await aiService.GenerateEmbeddingsAsync(text); - if (result != null && result.Count > 0) - { - ServiceLogMessages.LogPrimaryAIServiceSuccess(logger, result.Count, null); - return result; - } - ServiceLogMessages.LogPrimaryAIServiceNull(logger, null); - } - catch (Exception ex) - { - ServiceLogMessages.LogPrimaryAIServiceFailed(logger, ex); - } + try + { + ServiceLogMessages.LogPrimaryAIServiceAttempt(logger, null); + var result = await aiService.GenerateEmbeddingsAsync(text); + if (result != null && result.Count > 0) + { + ServiceLogMessages.LogPrimaryAIServiceSuccess(logger, result.Count, null); + return result; + } + ServiceLogMessages.LogPrimaryAIServiceNull(logger, null); + } + catch (Exception ex) + { + ServiceLogMessages.LogPrimaryAIServiceFailed(logger, ex); + } var embeddingProviders = new[] { @@ -208,7 +208,7 @@ public async Task GenerateRagAnswerAsync(string query, int maxResul if (batchEmbeddings != null && batchEmbeddings.Count == texts.Count) return batchEmbeddings; } - catch + catch (Exception) { // Fallback to individual generation if batch fails } diff --git a/src/SmartRAG/Services/DocumentService.cs b/src/SmartRAG/Services/DocumentService.cs index 9264e6e..5f3ca55 100644 --- a/src/SmartRAG/Services/DocumentService.cs +++ b/src/SmartRAG/Services/DocumentService.cs @@ -1,12 +1,8 @@ -using Microsoft.Extensions.Configuration; 
using Microsoft.Extensions.Logging; using SmartRAG.Entities; -using SmartRAG.Enums; -using SmartRAG.Factories; using SmartRAG.Interfaces; using SmartRAG.Models; using SmartRAG.Services.Logging; -using System.Text.Json; namespace SmartRAG.Services; @@ -58,36 +54,36 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa // Check if embedding was generated successfully if (allEmbeddings != null && i < allEmbeddings.Count && allEmbeddings[i] != null && allEmbeddings[i].Count > 0) { - chunk.Embedding = allEmbeddings[i]; - ServiceLogMessages.LogChunkEmbeddingSuccess(logger, i, allEmbeddings[i].Count, null); - } - else - { - // Retry individual embedding generation for this chunk - ServiceLogMessages.LogChunkBatchEmbeddingFailed(logger, i, null); - var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); - - if (individualEmbedding != null && individualEmbedding.Count > 0) - { - chunk.Embedding = individualEmbedding; - ServiceLogMessages.LogChunkIndividualEmbeddingSuccess(logger, i, individualEmbedding.Count, null); - } - else - { - ServiceLogMessages.LogChunkEmbeddingFailed(logger, i, null); - chunk.Embedding = new List(); // Empty but not null - } - } - - if (chunk.CreatedAt == default) - chunk.CreatedAt = DateTime.UtcNow; - } - catch (Exception ex) - { - ServiceLogMessages.LogChunkProcessingFailed(logger, i, ex); - // If embedding generation fails, leave it empty and continue - document.Chunks[i].Embedding = new List(); // Empty but not null - } + chunk.Embedding = allEmbeddings[i]; + ServiceLogMessages.LogChunkEmbeddingSuccess(logger, i, allEmbeddings[i].Count, null); + } + else + { + // Retry individual embedding generation for this chunk + ServiceLogMessages.LogChunkBatchEmbeddingFailed(logger, i, null); + var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); + + if (individualEmbedding != null && individualEmbedding.Count > 0) + { + chunk.Embedding = individualEmbedding; + ServiceLogMessages.LogChunkIndividualEmbeddingSuccess(logger, i, individualEmbedding.Count, null); + } + else + { + ServiceLogMessages.LogChunkEmbeddingFailed(logger, i, null); + chunk.Embedding = []; // Empty but not null + } + } + + if (chunk.CreatedAt == default) + chunk.CreatedAt = DateTime.UtcNow; + } + catch (Exception ex) + { + ServiceLogMessages.LogChunkProcessingFailed(logger, i, ex); + // If embedding generation fails, leave it empty and continue + document.Chunks[i].Embedding = []; // Empty but not null + } } var savedDocument = await documentRepository.AddAsync(document); @@ -116,17 +112,17 @@ public async Task> UploadDocumentsAsync(IEnumerable fileS var uploadedDocuments = new List(); // Parallel document upload for better performance - var uploadTasks = streamList.Select(async (stream, index) => + var uploadTasks = streamList.Select(async (stream, index) => { - try + try { return await UploadDocumentAsync(stream, nameList[index], typeList[index], uploadedBy); - } - catch (Exception ex) - { - ServiceLogMessages.LogDocumentUploadFailed(logger, nameList[index], ex); - return null; - } + } + catch (Exception ex) + { + ServiceLogMessages.LogDocumentUploadFailed(logger, nameList[index], ex); + return null; + } }); var uploadResults = await Task.WhenAll(uploadTasks); @@ -154,14 +150,14 @@ public Task> GetStorageStatisticsAsync() return Task.FromResult(stats); } - + public async Task RegenerateAllEmbeddingsAsync() { - try - { - ServiceLogMessages.LogEmbeddingRegenerationStarted(logger, null); - - 
var allDocuments = await documentRepository.GetAllAsync(); + try + { + ServiceLogMessages.LogEmbeddingRegenerationStarted(logger, null); + + var allDocuments = await documentRepository.GetAllAsync(); var totalChunks = allDocuments.Sum(d => d.Chunks.Count); var processedChunks = 0; var successCount = 0; @@ -172,35 +168,35 @@ public async Task RegenerateAllEmbeddingsAsync() foreach (var document in allDocuments) { - ServiceLogMessages.LogDocumentProcessing(logger, document.FileName, document.Chunks.Count, null); - - foreach (var chunk in document.Chunks) - { - // Skip if embedding already exists and is valid - if (chunk.Embedding != null && chunk.Embedding.Count > 0) - { - processedChunks++; - continue; - } - - chunksToProcess.Add(chunk); - documentChunkMap[chunk] = document; - } - } - - ServiceLogMessages.LogTotalChunksToProcess(logger, chunksToProcess.Count, totalChunks, null); - - if (chunksToProcess.Count == 0) - { - ServiceLogMessages.LogNoProcessingNeeded(logger, null); - return true; - } - - // Process chunks in batches of 128 (VoyageAI max batch size) - const int batchSize = 128; - var totalBatches = (int)Math.Ceiling((double)chunksToProcess.Count / batchSize); - - ServiceLogMessages.LogBatchProcessing(logger, totalBatches, null); + ServiceLogMessages.LogDocumentProcessing(logger, document.FileName, document.Chunks.Count, null); + + foreach (var chunk in document.Chunks) + { + // Skip if embedding already exists and is valid + if (chunk.Embedding != null && chunk.Embedding.Count > 0) + { + processedChunks++; + continue; + } + + chunksToProcess.Add(chunk); + documentChunkMap[chunk] = document; + } + } + + ServiceLogMessages.LogTotalChunksToProcess(logger, chunksToProcess.Count, totalChunks, null); + + if (chunksToProcess.Count == 0) + { + ServiceLogMessages.LogNoProcessingNeeded(logger, null); + return true; + } + + // Process chunks in batches of 128 (VoyageAI max batch size) + const int batchSize = 128; + var totalBatches = (int)Math.Ceiling((double)chunksToProcess.Count / batchSize); + + ServiceLogMessages.LogBatchProcessing(logger, totalBatches, null); for (int batchIndex = 0; batchIndex < totalBatches; batchIndex++) { @@ -208,81 +204,81 @@ public async Task RegenerateAllEmbeddingsAsync() var endIndex = Math.Min(startIndex + batchSize, chunksToProcess.Count); var currentBatch = chunksToProcess.Skip(startIndex).Take(endIndex - startIndex).ToList(); - ServiceLogMessages.LogBatchProgress(logger, batchIndex + 1, totalBatches, null); - - // Generate embeddings for current batch - var batchContents = currentBatch.Select(c => c.Content).ToList(); - var batchEmbeddings = await documentSearchService.GenerateEmbeddingsBatchAsync(batchContents); - - if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) - { - // Apply embeddings to chunks - for (int i = 0; i < currentBatch.Count; i++) - { - var chunk = currentBatch[i]; - var embedding = batchEmbeddings[i]; - - if (embedding != null && embedding.Count > 0) - { - chunk.Embedding = embedding; - successCount++; - ServiceLogMessages.LogChunkBatchEmbeddingSuccess(logger, i, embedding.Count, null); - } - else - { - ServiceLogMessages.LogChunkBatchEmbeddingFailedRetry(logger, chunk.Id, null); - - // Fallback to individual generation - var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); - if (individualEmbedding != null && individualEmbedding.Count > 0) - { - chunk.Embedding = individualEmbedding; - successCount++; - 
ServiceLogMessages.LogChunkIndividualEmbeddingSuccessRetry(logger, chunk.Id, individualEmbedding.Count, null); - } - else - { - ServiceLogMessages.LogChunkAllEmbeddingMethodsFailed(logger, chunk.Id, null); - } - } - - processedChunks++; - } - } - else - { - ServiceLogMessages.LogBatchFailed(logger, batchIndex + 1, null); - - // Process chunks individually if batch fails - foreach (var chunk in currentBatch) - { - try - { - var newEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); - - if (newEmbedding != null && newEmbedding.Count > 0) - { - chunk.Embedding = newEmbedding; - successCount++; - ServiceLogMessages.LogChunkIndividualEmbeddingSuccessFinal(logger, chunk.Id, newEmbedding.Count, null); - } - else - { - ServiceLogMessages.LogChunkEmbeddingGenerationFailed(logger, chunk.Id, null); - } - - processedChunks++; - } - catch (Exception ex) - { - ServiceLogMessages.LogChunkEmbeddingRegenerationFailed(logger, chunk.Id, ex); - processedChunks++; - } - } - } - - // Progress update - ServiceLogMessages.LogProgress(logger, processedChunks, chunksToProcess.Count, successCount, null); + ServiceLogMessages.LogBatchProgress(logger, batchIndex + 1, totalBatches, null); + + // Generate embeddings for current batch + var batchContents = currentBatch.Select(c => c.Content).ToList(); + var batchEmbeddings = await documentSearchService.GenerateEmbeddingsBatchAsync(batchContents); + + if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) + { + // Apply embeddings to chunks + for (int i = 0; i < currentBatch.Count; i++) + { + var chunk = currentBatch[i]; + var embedding = batchEmbeddings[i]; + + if (embedding != null && embedding.Count > 0) + { + chunk.Embedding = embedding; + successCount++; + ServiceLogMessages.LogChunkBatchEmbeddingSuccess(logger, i, embedding.Count, null); + } + else + { + ServiceLogMessages.LogChunkBatchEmbeddingFailedRetry(logger, chunk.Id, null); + + // Fallback to individual generation + var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); + if (individualEmbedding != null && individualEmbedding.Count > 0) + { + chunk.Embedding = individualEmbedding; + successCount++; + ServiceLogMessages.LogChunkIndividualEmbeddingSuccessRetry(logger, chunk.Id, individualEmbedding.Count, null); + } + else + { + ServiceLogMessages.LogChunkAllEmbeddingMethodsFailed(logger, chunk.Id, null); + } + } + + processedChunks++; + } + } + else + { + ServiceLogMessages.LogBatchFailed(logger, batchIndex + 1, null); + + // Process chunks individually if batch fails + foreach (var chunk in currentBatch) + { + try + { + var newEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); + + if (newEmbedding != null && newEmbedding.Count > 0) + { + chunk.Embedding = newEmbedding; + successCount++; + ServiceLogMessages.LogChunkIndividualEmbeddingSuccessFinal(logger, chunk.Id, newEmbedding.Count, null); + } + else + { + ServiceLogMessages.LogChunkEmbeddingGenerationFailed(logger, chunk.Id, null); + } + + processedChunks++; + } + catch (Exception ex) + { + ServiceLogMessages.LogChunkEmbeddingRegenerationFailed(logger, chunk.Id, ex); + processedChunks++; + } + } + } + + // Progress update + ServiceLogMessages.LogProgress(logger, processedChunks, chunksToProcess.Count, successCount, null); // Smart rate limiting if (batchIndex < totalBatches - 1) // Don't wait after last batch @@ -291,23 +287,23 @@ public async Task RegenerateAllEmbeddingsAsync() } } - // Save all documents with 
updated embeddings
-            var documentsToUpdate = documentChunkMap.Values.Distinct().ToList();
-            ServiceLogMessages.LogSavingDocuments(logger, documentsToUpdate.Count, null);
-
-            foreach (var document in documentsToUpdate)
-            {
-                await documentRepository.DeleteAsync(document.Id);
-                await documentRepository.AddAsync(document);
-            }
-
-            ServiceLogMessages.LogEmbeddingRegenerationCompleted(logger, successCount, processedChunks, null);
-            return successCount > 0;
-        }
-        catch (Exception ex)
-        {
-            ServiceLogMessages.LogEmbeddingRegenerationFailed(logger, ex);
-            return false;
-        }
+        // Save all documents with updated embeddings
+        var documentsToUpdate = documentChunkMap.Values.Distinct().ToList();
+        ServiceLogMessages.LogSavingDocuments(logger, documentsToUpdate.Count, null);
+
+        foreach (var document in documentsToUpdate)
+        {
+            await documentRepository.DeleteAsync(document.Id);
+            await documentRepository.AddAsync(document);
+        }
+
+        ServiceLogMessages.LogEmbeddingRegenerationCompleted(logger, successCount, processedChunks, null);
+        return successCount > 0;
+    }
+    catch (Exception ex)
+    {
+        ServiceLogMessages.LogEmbeddingRegenerationFailed(logger, ex);
+        return false;
+    }
     }
 }
\ No newline at end of file

From 1bb56ab9019fcdc854e7b28e72aa10540817faff Mon Sep 17 00:00:00 2001
From: Baris Yerlikaya
Date: Mon, 18 Aug 2025 16:37:32 +0300
Subject: [PATCH 8/8] Add WebAPI filters and wwwroot files

---
 .../Filters/MultipartFileUploadFilter.cs      |  53 +++++++
 examples/WebAPI/wwwroot/upload.html           | 134 ++++++++++++++++++
 2 files changed, 187 insertions(+)
 create mode 100644 examples/WebAPI/Filters/MultipartFileUploadFilter.cs
 create mode 100644 examples/WebAPI/wwwroot/upload.html

diff --git a/examples/WebAPI/Filters/MultipartFileUploadFilter.cs b/examples/WebAPI/Filters/MultipartFileUploadFilter.cs
new file mode 100644
index 0000000..73aaf6f
--- /dev/null
+++ b/examples/WebAPI/Filters/MultipartFileUploadFilter.cs
@@ -0,0 +1,53 @@
+using Microsoft.OpenApi.Models;
+using Swashbuckle.AspNetCore.SwaggerGen;
+
+namespace SmartRAG.API.Filters;
+
+public class MultipartFileUploadFilter : IOperationFilter
+{
+    public void Apply(OpenApiOperation operation, OperationFilterContext context)
+    {
+        // Check if this is the upload-multiple endpoint
+        if (context.MethodInfo.Name == "UploadDocuments" &&
+            context.MethodInfo.DeclaringType?.Name == "DocumentsController")
+        {
+            // Remove any existing request body
+            operation.RequestBody = null;
+
+            // Add the multipart form data request body with multiple files
+            operation.RequestBody = new OpenApiRequestBody
+            {
+                Content = new Dictionary<string, OpenApiMediaType>
+                {
+                    ["multipart/form-data"] = new OpenApiMediaType
+                    {
+                        Schema = new OpenApiSchema
+                        {
+                            Type = "object",
+                            Properties = new Dictionary<string, OpenApiSchema>
+                            {
+                                ["files"] = new OpenApiSchema
+                                {
+                                    Type = "array",
+                                    Items = new OpenApiSchema
+                                    {
+                                        Type = "string",
+                                        Format = "binary"
+                                    }
+                                }
+                            },
+                            Required = new HashSet<string> { "files" }
+                        }
+                    }
+                }
+            };
+
+            // Add external documentation link for simple upload page
+            if (operation.ExternalDocs == null)
+                operation.ExternalDocs = new OpenApiExternalDocs();
+
+            operation.ExternalDocs.Description = "Simple multiple file upload page";
+            operation.ExternalDocs.Url = new Uri("/upload.html", UriKind.Relative);
+        }
+    }
+}
diff --git a/examples/WebAPI/wwwroot/upload.html b/examples/WebAPI/wwwroot/upload.html
new file mode 100644
index 0000000..9c86e9c
--- /dev/null
+++ b/examples/WebAPI/wwwroot/upload.html
@@ -0,0 +1,134 @@
+<!-- upload.html (134 lines): static test page for the upload-multiple endpoint. -->
+<!-- The full markup did not survive extraction of this patch; the recoverable   -->
+<!-- structure is: page title "SmartRAG - Multiple Document Upload", a           -->
+<!-- "Multiple Document Upload" heading, a multi-file input with an Upload       -->
+<!-- button, and a "Console Logs (Real-time)" panel.                             -->
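For completeness, here is one way a client could exercise the new upload-multiple endpoint once the patches above are applied. This is a hedged sketch, not part of the patch set: the base address, file names, and content types are assumptions for illustration.

using System.Net.Http.Headers;

// Minimal client for POST /api/documents/upload-multiple (multipart/form-data).
// Assumes the example WebAPI is listening on https://localhost:5001.
using var http = new HttpClient { BaseAddress = new Uri("https://localhost:5001") };
using var form = new MultipartFormDataContent();

foreach (var path in new[] { "a.pdf", "b.docx" })
{
    var part = new StreamContent(File.OpenRead(path));
    part.Headers.ContentType = new MediaTypeHeaderValue(path.EndsWith(".pdf")
        ? "application/pdf"
        : "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    // The form field name must be "files" to bind to the controller's
    // [FromForm] List<IFormFile> files parameter.
    form.Add(part, "files", Path.GetFileName(path));
}

var response = await http.PostAsync("/api/documents/upload-multiple", form);
Console.WriteLine($"{(int)response.StatusCode}: {await response.Content.ReadAsStringAsync()}");

Note the 100 MB ceiling configured in three places in patch 7 (Kestrel's MaxRequestBodySize, FormOptions.MultipartBodyLengthLimit, and the controller's [RequestSizeLimit]); a request exceeding any of them is rejected before the action runs.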