diff --git a/README.md b/README.md index 7bdc184..67d5067 100644 --- a/README.md +++ b/README.md @@ -547,11 +547,7 @@ We welcome contributions! This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. -## 🌟 Star History -[![Star History Chart](https://api.star-history.com/svg?repos=byerlikaya/SmartRAG&type=Date)](https://star-history.com/#byerlikaya/SmartRAG&Date) - ---- **Built with ❤️ by Barış Yerlikaya** diff --git a/SmartRAG.sln b/SmartRAG.sln index 8177a74..c9f3dfc 100644 --- a/SmartRAG.sln +++ b/SmartRAG.sln @@ -4,7 +4,7 @@ VisualStudioVersion = 17.0.31903.59 MinimumVisualStudioVersion = 10.0.40219.1 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SmartRAG", "src\SmartRAG\SmartRAG.csproj", "{DECA885F-8815-4A0F-A12C-30563827C255}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SmartRAG.API", "src\SmartRAG.API\SmartRAG.API.csproj", "{E7606EAF-F26D-441F-B5A4-34A72A70DD6C}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SmartRAG.API", "examples\WebAPI\SmartRAG.API.csproj", "{E7606EAF-F26D-441F-B5A4-34A72A70DD6C}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..4abb8e5 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,54 @@ +# SmartRAG Examples + +This directory contains example projects demonstrating how to use SmartRAG in different scenarios. 
+ +## 📁 Available Examples + +### **WebAPI** - ASP.NET Core Web API Example +- **Location**: `WebAPI/` +- **Description**: Complete web API implementation showing document upload, search, and RAG operations +- **Features**: + - Multi-document upload + - AI-powered question answering + - Smart query intent detection + - Multiple storage providers + - Comprehensive API documentation + +## 🚀 Running Examples + +### WebAPI Example +```bash +cd examples/WebAPI +dotnet restore +dotnet run +``` + +Browse to `https://localhost:5001/swagger` for interactive API documentation. + +## 🔧 Configuration + +Each example includes its own configuration files. Copy and modify the template files as needed: + +```bash +# Copy development configuration template +cp appsettings.Development.template.json appsettings.Development.json + +# Edit with your API keys and configuration +``` + +## 📚 Documentation + +- **Main Documentation**: [SmartRAG README](../README.md) +- **API Reference**: [API Documentation](../docs/api-reference.md) +- **Configuration Guide**: [Configuration Guide](../docs/configuration.md) + +## 🤝 Contributing + +Want to add more examples? Create a new directory and submit a pull request! 
+ +### Example Types to Consider: +- **Console Application** - Command-line interface +- **Blazor WebAssembly** - Client-side web app +- **WPF Application** - Desktop application +- **Azure Functions** - Serverless implementation +- **Minimal API** - Lightweight web API diff --git a/src/SmartRAG.API/Contracts/SearchRequestIDto.cs b/examples/WebAPI/Contracts/SearchRequestIDto.cs similarity index 100% rename from src/SmartRAG.API/Contracts/SearchRequestIDto.cs rename to examples/WebAPI/Contracts/SearchRequestIDto.cs diff --git a/src/SmartRAG.API/Controllers/DocumentsController.cs b/examples/WebAPI/Controllers/DocumentsController.cs similarity index 75% rename from src/SmartRAG.API/Controllers/DocumentsController.cs rename to examples/WebAPI/Controllers/DocumentsController.cs index db55203..a9cf03d 100644 --- a/src/SmartRAG.API/Controllers/DocumentsController.cs +++ b/examples/WebAPI/Controllers/DocumentsController.cs @@ -13,7 +13,9 @@ namespace SmartRAG.API.Controllers; [Route("api/[controller]")] [Produces("application/json")] [ProducesResponseType(StatusCodes.Status500InternalServerError)] -public class DocumentsController(IDocumentService documentService, IDocumentParserService documentParser) : ControllerBase +public class DocumentsController( + IDocumentService documentService, + IDocumentParserService documentParser) : ControllerBase { /// /// Gets supported file types and content types @@ -58,6 +60,37 @@ public IActionResult GetSupportedTypes() } } + /// + /// Upload multiple documents to the system + /// + [HttpPost("upload-multiple")] + [Consumes("multipart/form-data")] + [RequestSizeLimit(100 * 1024 * 1024)] // 100 MB + [RequestFormLimits(MultipartBodyLengthLimit = 100 * 1024 * 1024)] + public async Task>> UploadDocuments([FromForm] List files) + { + if (files == null || files.Count == 0) + return BadRequest("No files provided"); + + try + { + var fileStreams = files.Select(f => f.OpenReadStream()); + var fileNames = files.Select(f => f.FileName); + var 
contentTypes = files.Select(f => f.ContentType); + + var documents = await documentService.UploadDocumentsAsync( + fileStreams, + fileNames, + contentTypes, + "system"); + + return CreatedAtAction(nameof(GetAllDocuments), documents); + } + catch (Exception ex) + { + return StatusCode(500, $"Internal server error: {ex.Message}"); + } + } /// /// Get a document by ID /// @@ -83,7 +116,6 @@ public IActionResult GetSupportedTypes() return Ok(documents); } - /// /// Delete a document /// diff --git a/src/SmartRAG.API/Controllers/SearchController.cs b/examples/WebAPI/Controllers/SearchController.cs similarity index 84% rename from src/SmartRAG.API/Controllers/SearchController.cs rename to examples/WebAPI/Controllers/SearchController.cs index 83eed1d..d2628d8 100644 --- a/src/SmartRAG.API/Controllers/SearchController.cs +++ b/examples/WebAPI/Controllers/SearchController.cs @@ -10,7 +10,7 @@ namespace SmartRAG.API.Controllers; [Route("api/[controller]")] [Produces("application/json")] [ProducesResponseType(StatusCodes.Status500InternalServerError)] -public class SearchController(IDocumentService documentService) : ControllerBase +public class SearchController(IDocumentSearchService documentSearchService) : ControllerBase { /// /// Search documents using RAG (Retrieval-Augmented Generation) @@ -28,7 +28,7 @@ public async Task> Search([FromBody] Contracts.SearchReques try { - var response = await documentService.GenerateRagAnswerAsync(query, maxResults); + var response = await documentSearchService.GenerateRagAnswerAsync(query, maxResults); return Ok(response); } catch (Exception ex) diff --git a/src/SmartRAG.API/Controllers/StorageController.cs b/examples/WebAPI/Controllers/StorageController.cs similarity index 100% rename from src/SmartRAG.API/Controllers/StorageController.cs rename to examples/WebAPI/Controllers/StorageController.cs diff --git a/examples/WebAPI/Filters/MultipartFileUploadFilter.cs b/examples/WebAPI/Filters/MultipartFileUploadFilter.cs new file mode 100644 
index 0000000..73aaf6f --- /dev/null +++ b/examples/WebAPI/Filters/MultipartFileUploadFilter.cs @@ -0,0 +1,53 @@ +using Microsoft.OpenApi.Models; +using Swashbuckle.AspNetCore.SwaggerGen; + +namespace SmartRAG.API.Filters; + +public class MultipartFileUploadFilter : IOperationFilter +{ + public void Apply(OpenApiOperation operation, OperationFilterContext context) + { + // Check if this is the upload-multiple endpoint + if (context.MethodInfo.Name == "UploadDocuments" && + context.MethodInfo.DeclaringType?.Name == "DocumentsController") + { + // Remove any existing request body + operation.RequestBody = null; + + // Add the multipart form data request body with multiple files + operation.RequestBody = new OpenApiRequestBody + { + Content = new Dictionary + { + ["multipart/form-data"] = new OpenApiMediaType + { + Schema = new OpenApiSchema + { + Type = "object", + Properties = new Dictionary + { + ["files"] = new OpenApiSchema + { + Type = "array", + Items = new OpenApiSchema + { + Type = "string", + Format = "binary" + } + } + }, + Required = new HashSet { "files" } + } + } + } + }; + + // Add external documentation link for simple upload page + if (operation.ExternalDocs == null) + operation.ExternalDocs = new OpenApiExternalDocs(); + + operation.ExternalDocs.Description = "Simple multiple file upload page"; + operation.ExternalDocs.Url = new Uri("/upload.html", UriKind.Relative); + } + } +} diff --git a/examples/WebAPI/Program.cs b/examples/WebAPI/Program.cs new file mode 100644 index 0000000..7776241 --- /dev/null +++ b/examples/WebAPI/Program.cs @@ -0,0 +1,87 @@ +using Scalar.AspNetCore; +using SmartRAG.Enums; +using SmartRAG.Extensions; +using Microsoft.OpenApi.Models; +using SmartRAG.API.Filters; + +var builder = WebApplication.CreateBuilder(args); + +// Configure Kestrel server options for file uploads +builder.WebHost.UseKestrel(options => +{ + options.Limits.MaxRequestBodySize = 100 * 1024 * 1024; // 100 MB +}); + +RegisterServices(builder.Services, 
builder.Configuration); + +var app = builder.Build(); +ConfigureMiddleware(app, builder.Environment); + +app.Run(); + +static void RegisterServices(IServiceCollection services, IConfiguration configuration) +{ + // Configure logging + services.AddLogging(builder => + { + builder.ClearProviders(); + builder.AddConsole(); + builder.AddDebug(); + builder.SetMinimumLevel(LogLevel.Debug); + }); + + services.AddControllers(); + services.AddEndpointsApiExplorer(); + services.AddOpenApi(); + services.AddSwaggerGen(c => + { + c.SwaggerDoc("v1", new OpenApiInfo { Title = "SmartRAG API", Version = "v1" }); + + // Configure multipart file upload for multiple files + c.OperationFilter(); + }); + + // Configure form options for file uploads + services.Configure(options => + { + options.MultipartBodyLengthLimit = 100 * 1024 * 1024; // 100 MB + options.ValueLengthLimit = int.MaxValue; + options.ValueCountLimit = int.MaxValue; + options.KeyLengthLimit = int.MaxValue; + }); + + // Add SmartRag services with minimal configuration + services.UseSmartRag(configuration, + storageProvider: StorageProvider.InMemory, // Default: InMemory + aiProvider: AIProvider.Gemini // Use Gemini provider + ); + + services.AddCors(options => + { + options.AddPolicy("AllowAll", policy => + { + policy.AllowAnyOrigin() + .AllowAnyMethod() + .AllowAnyHeader(); + }); + }); +} + +static void ConfigureMiddleware(WebApplication app, IWebHostEnvironment environment) +{ + + if (environment.IsDevelopment()) + { + app.MapOpenApi(); + app.MapSwagger(); + app.UseSwaggerUI(); + } + + // Serve static files for simple upload page + app.UseStaticFiles(); + + app.UseHttpsRedirection(); + app.UseCors("AllowAll"); + app.UseAuthorization(); + app.MapControllers(); +} \ No newline at end of file diff --git a/src/SmartRAG.API/Properties/launchSettings.json b/examples/WebAPI/Properties/launchSettings.json similarity index 100% rename from src/SmartRAG.API/Properties/launchSettings.json rename to 
examples/WebAPI/Properties/launchSettings.json diff --git a/src/SmartRAG.API/SmartRAG.API.csproj b/examples/WebAPI/SmartRAG.API.csproj similarity index 74% rename from src/SmartRAG.API/SmartRAG.API.csproj rename to examples/WebAPI/SmartRAG.API.csproj index 470b7b5..e63145a 100644 --- a/src/SmartRAG.API/SmartRAG.API.csproj +++ b/examples/WebAPI/SmartRAG.API.csproj @@ -7,12 +7,13 @@ - + + diff --git a/src/SmartRAG.API/appsettings.json b/examples/WebAPI/appsettings.json similarity index 100% rename from src/SmartRAG.API/appsettings.json rename to examples/WebAPI/appsettings.json diff --git a/examples/WebAPI/wwwroot/upload.html b/examples/WebAPI/wwwroot/upload.html new file mode 100644 index 0000000..9c86e9c --- /dev/null +++ b/examples/WebAPI/wwwroot/upload.html @@ -0,0 +1,134 @@ + + + + + + SmartRAG - Multiple Document Upload + + + +
+
Multiple Document Upload
+
+ + + +
+ + +
+
+ Console Logs (Real-time) + +
+
+
+
+
+ + + diff --git a/src/SmartRAG.API/Program.cs b/src/SmartRAG.API/Program.cs deleted file mode 100644 index f00a9e9..0000000 --- a/src/SmartRAG.API/Program.cs +++ /dev/null @@ -1,51 +0,0 @@ -using Scalar.AspNetCore; -using SmartRAG.Enums; -using SmartRAG.Extensions; - -var builder = WebApplication.CreateBuilder(args); - -RegisterServices(builder.Services, builder.Configuration); - -var app = builder.Build(); -ConfigureMiddleware(app, builder.Environment); - -app.Run(); - -static void RegisterServices(IServiceCollection services, IConfiguration configuration) -{ - - services.AddControllers(); - services.AddEndpointsApiExplorer(); - services.AddOpenApi(); - - // Add SmartRag services with minimal configuration - services.UseSmartRag(configuration, - storageProvider: StorageProvider.InMemory, // Default: InMemory - aiProvider: AIProvider.OpenAI // Use OpenAI provider - ); - - services.AddCors(options => - { - options.AddPolicy("AllowAll", policy => - { - policy.AllowAnyOrigin() - .AllowAnyMethod() - .AllowAnyHeader(); - }); - }); -} - -static void ConfigureMiddleware(WebApplication app, IWebHostEnvironment environment) -{ - - if (environment.IsDevelopment()) - { - app.MapOpenApi(); - app.MapScalarApiReference(); - } - - app.UseHttpsRedirection(); - app.UseCors("AllowAll"); - app.UseAuthorization(); - app.MapControllers(); -} \ No newline at end of file diff --git a/src/SmartRAG/Extensions/ServiceCollectionExtensions.cs b/src/SmartRAG/Extensions/ServiceCollectionExtensions.cs index 1a8750a..3fd2900 100644 --- a/src/SmartRAG/Extensions/ServiceCollectionExtensions.cs +++ b/src/SmartRAG/Extensions/ServiceCollectionExtensions.cs @@ -35,6 +35,7 @@ public static IServiceCollection AddSmartRag(this IServiceCollection services, I services.AddSingleton(); services.AddScoped(); services.AddScoped(); + services.AddScoped(); services.AddSingleton(options); diff --git a/src/SmartRAG/Interfaces/IAIProvider.cs b/src/SmartRAG/Interfaces/IAIProvider.cs index 5dd3ccb..5ad0273 100644 --- 
a/src/SmartRAG/Interfaces/IAIProvider.cs +++ b/src/SmartRAG/Interfaces/IAIProvider.cs @@ -7,7 +7,18 @@ namespace SmartRAG.Interfaces; ///
public interface IAIProvider { + /// + /// Generates text response using the AI provider + /// Task GenerateTextAsync(string prompt, AIProviderConfig config); + + /// + /// Generates embedding vector for the given text + /// Task> GenerateEmbeddingAsync(string text, AIProviderConfig config); + + /// + /// Chunks text into smaller segments for processing + /// Task> ChunkTextAsync(string text, int maxChunkSize = 1000); } diff --git a/src/SmartRAG/Interfaces/IAIProviderFactory.cs b/src/SmartRAG/Interfaces/IAIProviderFactory.cs index 01f21ca..13f07f5 100644 --- a/src/SmartRAG/Interfaces/IAIProviderFactory.cs +++ b/src/SmartRAG/Interfaces/IAIProviderFactory.cs @@ -7,5 +7,8 @@ namespace SmartRAG.Interfaces; ///
public interface IAIProviderFactory { + /// + /// Creates an AI provider instance of the specified type + /// IAIProvider CreateProvider(AIProvider providerType); } diff --git a/src/SmartRAG/Interfaces/IAIService.cs b/src/SmartRAG/Interfaces/IAIService.cs index 8c6c5b9..20686eb 100644 --- a/src/SmartRAG/Interfaces/IAIService.cs +++ b/src/SmartRAG/Interfaces/IAIService.cs @@ -5,7 +5,18 @@ namespace SmartRAG.Interfaces; /// public interface IAIService { + /// + /// Generates AI response based on query and context + /// Task GenerateResponseAsync(string query, IEnumerable context); + + /// + /// Generates embedding vector for the given text + /// Task> GenerateEmbeddingsAsync(string text); + + /// + /// Generates embeddings for multiple texts in batch + /// Task>> GenerateEmbeddingsBatchAsync(IEnumerable texts); } diff --git a/src/SmartRAG/Interfaces/IDocumentParserService.cs b/src/SmartRAG/Interfaces/IDocumentParserService.cs index c1ebdd6..ff726cd 100644 --- a/src/SmartRAG/Interfaces/IDocumentParserService.cs +++ b/src/SmartRAG/Interfaces/IDocumentParserService.cs @@ -5,7 +5,18 @@ namespace SmartRAG.Interfaces; /// public interface IDocumentParserService { + /// + /// Parses document from file stream and creates document entity + /// Task ParseDocumentAsync(Stream fileStream, string fileName, string contentType, string uploadedBy); + + /// + /// Gets list of supported file extensions + /// IEnumerable GetSupportedFileTypes(); + + /// + /// Gets list of supported MIME content types + /// IEnumerable GetSupportedContentTypes(); } diff --git a/src/SmartRAG/Interfaces/IDocumentRepository.cs b/src/SmartRAG/Interfaces/IDocumentRepository.cs index f108a5b..e4d9907 100644 --- a/src/SmartRAG/Interfaces/IDocumentRepository.cs +++ b/src/SmartRAG/Interfaces/IDocumentRepository.cs @@ -2,12 +2,38 @@ namespace SmartRAG.Interfaces; +/// +/// Repository interface for document storage operations +/// public interface IDocumentRepository { + /// + /// Adds a new document to storage + 
/// Task AddAsync(Document document); + + /// + /// Retrieves document by unique identifier + /// Task GetByIdAsync(Guid id); + + /// + /// Retrieves all documents from storage + /// Task> GetAllAsync(); + + /// + /// Removes document from storage by ID + /// Task DeleteAsync(Guid id); + + /// + /// Gets total count of documents in storage + /// Task GetCountAsync(); + + /// + /// Searches documents using query string + /// Task> SearchAsync(string query, int maxResults = 5); } diff --git a/src/SmartRAG/Interfaces/IDocumentSearchService.cs b/src/SmartRAG/Interfaces/IDocumentSearchService.cs new file mode 100644 index 0000000..73e2725 --- /dev/null +++ b/src/SmartRAG/Interfaces/IDocumentSearchService.cs @@ -0,0 +1,32 @@ +using System.Collections.Generic; +using System.Threading.Tasks; +using SmartRAG.Entities; +using SmartRAG.Models; + +namespace SmartRAG.Interfaces; + +/// +/// Service interface for AI-powered search and RAG operations +/// +public interface IDocumentSearchService +{ + /// + /// Search documents semantically + /// + Task> SearchDocumentsAsync(string query, int maxResults = 5); + + /// + /// Generate RAG answer + /// + Task GenerateRagAnswerAsync(string query, int maxResults = 5); + + /// + /// Generate embedding with fallback + /// + Task?> GenerateEmbeddingWithFallbackAsync(string text); + + /// + /// Generate batch embeddings + /// + Task>?> GenerateEmbeddingsBatchAsync(List texts); +} diff --git a/src/SmartRAG/Interfaces/IDocumentService.cs b/src/SmartRAG/Interfaces/IDocumentService.cs index 79d8752..cdc1621 100644 --- a/src/SmartRAG/Interfaces/IDocumentService.cs +++ b/src/SmartRAG/Interfaces/IDocumentService.cs @@ -1,19 +1,48 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; using SmartRAG.Entities; -using SmartRAG.Models; namespace SmartRAG.Interfaces; /// -/// Service interface for document operations +/// Service interface for document CRUD operations /// public interface IDocumentService 
{ + /// + /// Upload a single document + /// Task UploadDocumentAsync(Stream fileStream, string fileName, string contentType, string uploadedBy); + + /// + /// Upload multiple documents + /// + Task> UploadDocumentsAsync(IEnumerable fileStreams, IEnumerable fileNames, IEnumerable contentTypes, string uploadedBy); + + /// + /// Get document by ID + /// Task GetDocumentAsync(Guid id); + + /// + /// Get all documents + /// Task> GetAllDocumentsAsync(); + + /// + /// Delete document + /// Task DeleteDocumentAsync(Guid id); - Task> SearchDocumentsAsync(string query, int maxResults = 5); + + /// + /// Get storage statistics + /// Task> GetStorageStatisticsAsync(); - Task GenerateRagAnswerAsync(string query, int maxResults = 5); + + /// + /// Regenerate all embeddings + /// Task RegenerateAllEmbeddingsAsync(); } diff --git a/src/SmartRAG/Interfaces/IStorageFactory.cs b/src/SmartRAG/Interfaces/IStorageFactory.cs index b53ec2f..5875185 100644 --- a/src/SmartRAG/Interfaces/IStorageFactory.cs +++ b/src/SmartRAG/Interfaces/IStorageFactory.cs @@ -8,12 +8,23 @@ namespace SmartRAG.Interfaces; /// public interface IStorageFactory { + /// + /// Creates repository using storage configuration + /// IDocumentRepository CreateRepository(StorageConfig config); + /// + /// Creates repository using storage provider type + /// IDocumentRepository CreateRepository(StorageProvider provider); + /// + /// Gets the currently active storage provider + /// StorageProvider GetCurrentProvider(); + /// + /// Gets the currently active repository instance + /// IDocumentRepository GetCurrentRepository(); - } diff --git a/src/SmartRAG/Services/DocumentSearchService.cs b/src/SmartRAG/Services/DocumentSearchService.cs new file mode 100644 index 0000000..1b634a3 --- /dev/null +++ b/src/SmartRAG/Services/DocumentSearchService.cs @@ -0,0 +1,710 @@ +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; +using SmartRAG.Entities; +using SmartRAG.Enums; +using SmartRAG.Factories; +using 
SmartRAG.Interfaces; +using SmartRAG.Models; +using SmartRAG.Services.Logging; +using System.Text.Json; + +namespace SmartRAG.Services; + +public class DocumentSearchService( + IDocumentRepository documentRepository, + IAIService aiService, + IAIProviderFactory aiProviderFactory, + IConfiguration configuration, + SmartRagOptions options, + ILogger logger) : IDocumentSearchService +{ + + /// + /// Sanitizes user input for safe logging by removing newlines and carriage returns. + /// + private static string SanitizeForLog(string input) + { + if (input == null) return string.Empty; + return input.Replace("\r", "").Replace("\n", ""); + } + public async Task> SearchDocumentsAsync(string query, int maxResults = 5) + { + if (string.IsNullOrWhiteSpace(query)) + throw new ArgumentException("Query cannot be empty", nameof(query)); + + // Use our integrated search algorithm with diversity selection + var searchResults = await PerformBasicSearchAsync(query, maxResults * 2); + + if (searchResults.Count > 0) + { + ServiceLogMessages.LogSearchResults(logger, searchResults.Count, searchResults.Select(c => c.DocumentId).Distinct().Count(), null); + + // Apply diversity selection to ensure chunks from different documents + var diverseResults = ApplyDiversityAndSelect(searchResults, maxResults); + + ServiceLogMessages.LogDiverseResults(logger, diverseResults.Count, diverseResults.Select(c => c.DocumentId).Distinct().Count(), null); + + return diverseResults; + } + + return searchResults; + } + + public async Task GenerateRagAnswerAsync(string query, int maxResults = 5) + { + if (string.IsNullOrWhiteSpace(query)) + throw new ArgumentException("Query cannot be empty", nameof(query)); + + // Check if this is a general conversation query + if (IsGeneralConversationQuery(query)) + { + ServiceLogMessages.LogGeneralConversationQuery(logger, null); + var chatResponse = await HandleGeneralConversationAsync(query); + return new RagResponse + { + Answer = chatResponse, + Sources = new List(), + 
SearchedAt = DateTime.UtcNow, + Configuration = GetRagConfiguration() + }; + } + + // Document search query - use our integrated RAG implementation + return await GenerateBasicRagAnswerAsync(query, maxResults); + } + + public async Task?> GenerateEmbeddingWithFallbackAsync(string text) + { + try + { + ServiceLogMessages.LogPrimaryAIServiceAttempt(logger, null); + var result = await aiService.GenerateEmbeddingsAsync(text); + if (result != null && result.Count > 0) + { + ServiceLogMessages.LogPrimaryAIServiceSuccess(logger, result.Count, null); + return result; + } + ServiceLogMessages.LogPrimaryAIServiceNull(logger, null); + } + catch (Exception ex) + { + ServiceLogMessages.LogPrimaryAIServiceFailed(logger, ex); + } + + var embeddingProviders = new[] + { + "Anthropic", + "OpenAI", + "Gemini" + }; + + foreach (var provider in embeddingProviders) + { + try + { + ServiceLogMessages.LogProviderAttempt(logger, provider, null); + var providerEnum = Enum.Parse(provider); + var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(providerEnum); + var providerConfig = configuration.GetSection($"AI:{provider}").Get(); + + if (providerConfig != null && !string.IsNullOrEmpty(providerConfig.ApiKey)) + { + ServiceLogMessages.LogProviderConfigFound(logger, provider, providerConfig.ApiKey.Substring(0, 8), null); + var embedding = await aiProvider.GenerateEmbeddingAsync(text, providerConfig); + if (embedding != null && embedding.Count > 0) + { + ServiceLogMessages.LogProviderSuccessful(logger, provider, embedding.Count, null); + return embedding; + } + else + { + ServiceLogMessages.LogProviderReturnedNull(logger, provider, null); + } + } + else + { + ServiceLogMessages.LogProviderConfigNotFound(logger, provider, null); + } + } + catch (Exception) + { + ServiceLogMessages.LogProviderFailed(logger, provider, null); + continue; + } + } + + ServiceLogMessages.LogAllProvidersFailedText(logger, text.Substring(0, Math.Min(50, text.Length)), null); + + // Special test for 
VoyageAI if Anthropic is configured + try + { + var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); + if (anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) + { + ServiceLogMessages.LogTestingVoyageAI(logger, anthropicConfig.EmbeddingApiKey.Substring(0, 8), null); + + using var client = new HttpClient(); + client.DefaultRequestHeaders.Add("Authorization", $"Bearer {anthropicConfig.EmbeddingApiKey}"); + + var testPayload = new + { + input = new[] { text }, + model = anthropicConfig.EmbeddingModel ?? "voyage-3.5", + input_type = "document" + }; + + var jsonContent = System.Text.Json.JsonSerializer.Serialize(testPayload); + var content = new StringContent(jsonContent, System.Text.Encoding.UTF8, "application/json"); + + var response = await client.PostAsync("https://api.voyageai.com/v1/embeddings", content); + var responseContent = await response.Content.ReadAsStringAsync(); + + ServiceLogMessages.LogVoyageAITestResponse(logger, (int)response.StatusCode, responseContent, null); + + if (response.IsSuccessStatusCode) + { + ServiceLogMessages.LogVoyageAIWorking(logger, null); + // Parse the response and return a test embedding + try + { + using var doc = System.Text.Json.JsonDocument.Parse(responseContent); + if (doc.RootElement.TryGetProperty("data", out var dataArray) && dataArray.ValueKind == JsonValueKind.Array) + { + var firstEmbedding = dataArray.EnumerateArray().FirstOrDefault(); + if (firstEmbedding.TryGetProperty("embedding", out var embeddingArray) && embeddingArray.ValueKind == JsonValueKind.Array) + { + var testEmbedding = embeddingArray.EnumerateArray() + .Select(x => x.GetSingle()) + .ToList(); + ServiceLogMessages.LogVoyageAITestEmbedding(logger, testEmbedding.Count, null); + return testEmbedding; + } + } + } + catch (Exception) + { + ServiceLogMessages.LogFailedParseVoyageAI(logger, null); + } + } + } + } + catch (Exception) + { + ServiceLogMessages.LogVoyageAIDirectTestFailed(logger, null); + } + + return 
null; + } + + public async Task>?> GenerateEmbeddingsBatchAsync(List texts) + { + if (texts == null || texts.Count == 0) + return null; + + try + { + // Try batch embedding generation first + var batchEmbeddings = await aiService.GenerateEmbeddingsBatchAsync(texts); + if (batchEmbeddings != null && batchEmbeddings.Count == texts.Count) + return batchEmbeddings; + } + catch (Exception) + { + // Fallback to individual generation if batch fails + } + + // Special handling for VoyageAI: Process in smaller batches to respect 3 RPM limit + try + { + var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); + if (anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) + { + Console.WriteLine($"[DEBUG] Trying VoyageAI batch processing with rate limiting..."); + + // Process in smaller batches (3 chunks per minute = 20 seconds between batches) + const int rateLimitBatchSize = 3; + var allEmbeddings = new List>(); + + for (int i = 0; i < texts.Count; i += rateLimitBatchSize) + { + var currentBatch = texts.Skip(i).Take(rateLimitBatchSize).ToList(); + Console.WriteLine($"[DEBUG] Processing VoyageAI batch {i / rateLimitBatchSize + 1}: chunks {i + 1}-{Math.Min(i + rateLimitBatchSize, texts.Count)}"); + + // Generate embeddings for current batch using VoyageAI + var batchEmbeddings = await GenerateVoyageAIBatchAsync(currentBatch, anthropicConfig); + + if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) + { + allEmbeddings.AddRange(batchEmbeddings); + Console.WriteLine($"[DEBUG] VoyageAI batch {i / rateLimitBatchSize + 1} successful: {batchEmbeddings.Count} embeddings"); + } + else + { + Console.WriteLine($"[WARNING] VoyageAI batch {i / rateLimitBatchSize + 1} failed, using individual fallback"); + // Fallback to individual generation for this batch + var individualEmbeddings = await GenerateIndividualEmbeddingsAsync(currentBatch); + allEmbeddings.AddRange(individualEmbeddings); + } + + // Smart rate limiting: Detect 
if we hit rate limits and adjust + if (i + rateLimitBatchSize < texts.Count) + { + // Check if we got rate limited in the last batch + var lastBatchSuccess = batchEmbeddings != null && batchEmbeddings.Count > 0; + + if (!lastBatchSuccess) + { + // Rate limited - wait 20 seconds for 3 RPM + Console.WriteLine($"[INFO] Rate limit detected, waiting 20 seconds for 3 RPM limit..."); + await Task.Delay(20000); + } + else + { + // No rate limit - continue at full speed (2000 RPM) + Console.WriteLine($"[INFO] No rate limit detected, continuing at full speed (2000 RPM)"); + // No delay needed for 2000 RPM + } + } + } + + if (allEmbeddings.Count == texts.Count) + { + Console.WriteLine($"[DEBUG] VoyageAI batch processing completed: {allEmbeddings.Count} embeddings"); + return allEmbeddings; + } + } + } + catch (Exception ex) + { + Console.WriteLine($"[DEBUG] VoyageAI batch processing failed: {ex.Message}"); + } + + // Final fallback: generate embeddings individually (but still in parallel) + ServiceLogMessages.LogIndividualEmbeddingGeneration(logger, texts.Count, null); + var embeddingTasks = texts.Select(async text => await GenerateEmbeddingWithFallbackAsync(text)).ToList(); + var embeddings = await Task.WhenAll(embeddingTasks); + + return embeddings.Where(e => e != null).Select(e => e!).ToList(); + } + + #region Private Helper Methods + + /// + /// Enhanced search with intelligent filtering and name detection + /// + private async Task> PerformBasicSearchAsync(string query, int maxResults) + { + var allDocuments = await documentRepository.GetAllAsync(); + var allChunks = allDocuments.SelectMany(d => d.Chunks).ToList(); + + ServiceLogMessages.LogSearchInDocuments(logger, allDocuments.Count, allChunks.Count, null); + + // Try embedding-based search first if available + try + { + var embeddingResults = await TryEmbeddingBasedSearchAsync(query, allChunks, maxResults); + if (embeddingResults.Count > 0) + { + ServiceLogMessages.LogEmbeddingSearchSuccessful(logger, 
embeddingResults.Count, null); + return embeddingResults; + } + } + catch (Exception) + { + ServiceLogMessages.LogEmbeddingSearchFailed(logger, null); + } + + // Enhanced keyword-based fallback for global content + var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Where(w => w.Length > 2) + .ToList(); + + // Extract potential names from ORIGINAL query (not lowercase) - language agnostic + var potentialNames = query.Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Where(w => w.Length > 2 && char.IsUpper(w[0])) + .ToList(); + + ServiceLogMessages.LogQueryWords(logger, string.Join(", ", queryWords.Select(SanitizeForLog)), null); + ServiceLogMessages.LogPotentialNames(logger, string.Join(", ", potentialNames.Select(SanitizeForLog)), null); + + var scoredChunks = allChunks.Select(chunk => + { + var score = 0.0; + var content = chunk.Content.ToLowerInvariant(); + + // Special handling for names like "John Smith" - HIGHEST PRIORITY (language agnostic) + if (potentialNames.Count >= 2) + { + var fullName = string.Join(" ", potentialNames); + if (ContainsNormalizedName(content, fullName)) + { + score += 200.0; // Very high weight for full name matches + ServiceLogMessages.LogFullNameMatch(logger, SanitizeForLog(fullName), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length)), null); + } + else if (potentialNames.Any(name => ContainsNormalizedName(content, name))) + { + score += 100.0; // High weight for partial name matches + var foundNames = potentialNames.Where(name => ContainsNormalizedName(content, name)).ToList(); + ServiceLogMessages.LogPartialNameMatches(logger, string.Join(", ", foundNames.Select(SanitizeForLog)), chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length)), null); + } + } + + // Exact word matches + foreach (var word in queryWords) + { + if (content.Contains(word, StringComparison.OrdinalIgnoreCase)) + score += 2.0; // Higher weight for word matches + } + + // Generic content quality 
scoring (language and content agnostic) + var wordCount = content.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length; + if (wordCount >= 10 && wordCount <= 100) score += 5.0; + + // Bonus for chunks with punctuation (indicates structured content) + var punctuationCount = content.Count(c => ".,;:!?()[]{}".Contains(c)); + if (punctuationCount >= 3) score += 2.0; + + // Bonus for chunks with numbers (often indicates factual information) + var numberCount = content.Count(c => char.IsDigit(c)); + if (numberCount >= 2) score += 2.0; + + chunk.RelevanceScore = score; + return chunk; + }).ToList(); + + var relevantChunks = scoredChunks + .Where(c => c.RelevanceScore > 0) + .OrderByDescending(c => c.RelevanceScore) + .Take(Math.Max(maxResults * 3, 30)) + .ToList(); + + ServiceLogMessages.LogRelevantChunksFound(logger, relevantChunks.Count, null); + + // If we found chunks with names, prioritize them + if (potentialNames.Count >= 2) + { + var nameChunks = relevantChunks.Where(c => + potentialNames.Any(name => c.Content.Contains(name, StringComparison.OrdinalIgnoreCase))).ToList(); + + if (nameChunks.Count > 0) + { + ServiceLogMessages.LogNameChunksFound(logger, nameChunks.Count, null); + return nameChunks.Take(maxResults).ToList(); + } + } + + return relevantChunks.Take(maxResults).ToList(); + } + + private async Task GenerateBasicRagAnswerAsync(string query, int maxResults) + { + var chunks = await SearchDocumentsAsync(query, maxResults); + var context = string.Join("\n\n", chunks.Select(c => c.Content)); + var answer = await aiService.GenerateResponseAsync($"Question: {query}\n\nContext: {context}\n\nAnswer:", new List { context }); + + return new RagResponse + { + Query = query, + Answer = answer, + Sources = chunks.Select(c => new SearchSource + { + DocumentId = c.DocumentId, + FileName = "Document", + RelevantContent = c.Content, + RelevanceScore = c.RelevanceScore ?? 
0.0 + }).ToList(), + SearchedAt = DateTime.UtcNow, + Configuration = GetRagConfiguration() + }; + } + + private static List ApplyDiversityAndSelect(List chunks, int maxResults) + { + return chunks.Take(maxResults).ToList(); + } + + private async Task>?> GenerateVoyageAIBatchAsync(List texts, AIProviderConfig config) + { + // VoyageAI batch işlemi için basit implementasyon + var results = new List>(); + foreach (var text in texts) + { + var embedding = await GenerateEmbeddingWithFallbackAsync(text); + if (embedding != null) + results.Add(embedding); + } + return results; + } + + private async Task>> GenerateIndividualEmbeddingsAsync(List texts) + { + var results = new List>(); + foreach (var text in texts) + { + var embedding = await GenerateEmbeddingWithFallbackAsync(text); + results.Add(embedding ?? new List()); + } + return results; + } + + private RagConfiguration GetRagConfiguration() + { + return new RagConfiguration + { + AIProvider = options.AIProvider.ToString(), + StorageProvider = options.StorageProvider.ToString(), + Model = configuration["AI:OpenAI:Model"] ?? 
"gpt-3.5-turbo" + }; + } + + /// + /// Try embedding-based search using VoyageAI with intelligent filtering + /// + private async Task> TryEmbeddingBasedSearchAsync(string query, List allChunks, int maxResults) + { + try + { + var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); + if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) + { + ServiceLogMessages.LogNoVoyageAIKey(logger, null); + return new List(); + } + + var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(AIProvider.Anthropic); + + // Generate embedding for query with retry logic + var queryEmbedding = await GenerateEmbeddingWithRetryAsync(query, anthropicConfig); + if (queryEmbedding == null || queryEmbedding.Count == 0) + { + ServiceLogMessages.LogFailedQueryEmbedding(logger, null); + return new List(); + } + + // Calculate similarity for all chunks + var scoredChunks = allChunks.Select(chunk => + { + var similarity = 0.0; + if (chunk.Embedding != null && chunk.Embedding.Count > 0) + { + similarity = CalculateCosineSimilarity(queryEmbedding, chunk.Embedding); + } + + chunk.RelevanceScore = similarity; + return chunk; + }).ToList(); + + // INTELLIGENT FILTERING: Focus on chunks that actually contain the query terms + var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Where(w => w.Length > 2) + .ToList(); + + // Extract potential names from ORIGINAL query (not lowercase) - language agnostic + var potentialNames = query.Split(' ', StringSplitOptions.RemoveEmptyEntries) + .Where(w => w.Length > 2 && char.IsUpper(w[0])) + .ToList(); + + // Filter chunks that actually contain query terms + var relevantChunks = scoredChunks.Where(chunk => + { + var content = chunk.Content.ToLowerInvariant(); + + // Must contain at least one query word + var hasQueryWord = queryWords.Any(word => content.Contains(word, StringComparison.OrdinalIgnoreCase)); + + // If query has names, prioritize chunks with names + 
if (potentialNames.Count >= 2) + { + var fullName = string.Join(" ", potentialNames); + var hasFullName = ContainsNormalizedName(content, fullName); + var hasPartialName = potentialNames.Any(name => ContainsNormalizedName(content, name)); + + return hasQueryWord && (hasFullName || hasPartialName); + } + + return hasQueryWord; + }).ToList(); + + ServiceLogMessages.LogChunksContainingQueryTerms(logger, relevantChunks.Count, null); + + if (relevantChunks.Count == 0) + { + ServiceLogMessages.LogNoChunksContainQueryTerms(logger, null); + relevantChunks = scoredChunks.Where(c => c.RelevanceScore > 0.01).ToList(); + } + + // Sort by relevance score and take top results + return relevantChunks + .OrderByDescending(c => c.RelevanceScore) + .Take(Math.Max(maxResults * 2, 20)) + .ToList(); + } + catch (Exception ex) + { + ServiceLogMessages.LogEmbeddingSearchFailedError(logger, ex); + return new List(); + } + } + + /// + /// Generate embedding with retry logic for rate limiting + /// + private async Task?> GenerateEmbeddingWithRetryAsync(string text, AIProviderConfig config) + { + var maxRetries = 3; + var retryDelayMs = 2000; + + for (int attempt = 0; attempt < maxRetries; attempt++) + { + try + { + var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(AIProvider.Anthropic); + return await aiProvider.GenerateEmbeddingAsync(text, config); + } + catch (Exception ex) when (ex.Message.Contains("TooManyRequests") || ex.Message.Contains("rate limit")) + { + if (attempt < maxRetries - 1) + { + var delay = retryDelayMs * (int)Math.Pow(2, attempt); + ServiceLogMessages.LogRateLimitedRetry(logger, delay, attempt + 1, maxRetries, null); + await Task.Delay(delay); + } + else + { + ServiceLogMessages.LogRateLimitedAfterAttempts(logger, maxRetries, null); + throw; + } + } + } + + return null; + } + + /// + /// Calculate cosine similarity between two vectors + /// + private static double CalculateCosineSimilarity(List a, List b) + { + if (a == null || b == null || a.Count 
== 0 || b.Count == 0) return 0.0; + + var n = Math.Min(a.Count, b.Count); + double dot = 0, na = 0, nb = 0; + + for (int i = 0; i < n; i++) + { + double va = a[i]; + double vb = b[i]; + dot += va * vb; + na += va * va; + nb += vb * vb; + } + + if (na == 0 || nb == 0) return 0.0; + return dot / (Math.Sqrt(na) * Math.Sqrt(nb)); + } + + /// + /// Normalize text for better search matching (handles Unicode encoding issues) + /// + private static string NormalizeText(string text) + { + if (string.IsNullOrEmpty(text)) return text; + + // Decode Unicode escape sequences + var decoded = System.Text.RegularExpressions.Regex.Unescape(text); + + // Normalize Unicode characters + var normalized = decoded.Normalize(System.Text.NormalizationForm.FormC); + + // Handle common Turkish character variations (can be extended for other languages) + var characterMappings = new Dictionary + { + {"ı", "i"}, {"İ", "I"}, {"ğ", "g"}, {"Ğ", "G"}, + {"ü", "u"}, {"Ü", "U"}, {"ş", "s"}, {"Ş", "S"}, + {"ö", "o"}, {"Ö", "O"}, {"ç", "c"}, {"Ç", "C"} + }; + + foreach (var mapping in characterMappings) + { + normalized = normalized.Replace(mapping.Key, mapping.Value); + } + + return normalized; + } + + /// + /// Check if content contains normalized name (handles encoding issues) + /// + private static bool ContainsNormalizedName(string content, string searchName) + { + if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(searchName)) + return false; + + var normalizedContent = NormalizeText(content); + var normalizedSearchName = NormalizeText(searchName); + + // Try exact match first + if (normalizedContent.Contains(normalizedSearchName, StringComparison.OrdinalIgnoreCase)) + return true; + + // Try partial matches for each word + var searchWords = normalizedSearchName.Split(' ', StringSplitOptions.RemoveEmptyEntries); + var contentWords = normalizedContent.Split(' ', StringSplitOptions.RemoveEmptyEntries); + + // Check if all search words are present in content + return 
searchWords.All(searchWord => + contentWords.Any(contentWord => + contentWord.Contains(searchWord, StringComparison.OrdinalIgnoreCase))); + } + + /// + /// Check if query is a general conversation question (not document search) + /// + private static bool IsGeneralConversationQuery(string query) + { + if (string.IsNullOrWhiteSpace(query)) return false; + + // Simple detection: if query has document-like structure, it's document search + var hasDocumentStructure = query.Any(char.IsDigit) || + query.Contains(':') || + query.Contains('/') || + query.Contains('-') || + query.Length > 50; + + // If it has document structure, it's document search + // If not, it's general conversation + return !hasDocumentStructure; + } + + /// + /// Handle general conversation queries + /// + private async Task HandleGeneralConversationAsync(string query) + { + try + { + var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); + if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.ApiKey)) + { + return "Sorry, I cannot chat right now. Please try again later."; + } + + var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(AIProvider.Anthropic); + + var prompt = $@"You are a helpful AI assistant. Answer the user's question naturally and friendly. + +User: {query} + +Answer:"; + + return await aiProvider.GenerateTextAsync(prompt, anthropicConfig); + } + catch (Exception) + { + // Log error using structured logging + return "Sorry, I cannot chat right now. 
Please try again later."; + } + } + + #endregion +} diff --git a/src/SmartRAG/Services/DocumentService.cs b/src/SmartRAG/Services/DocumentService.cs index 8e311d5..5f3ca55 100644 --- a/src/SmartRAG/Services/DocumentService.cs +++ b/src/SmartRAG/Services/DocumentService.cs @@ -1,30 +1,25 @@ - -using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.Logging; using SmartRAG.Entities; -using SmartRAG.Enums; -using SmartRAG.Factories; using SmartRAG.Interfaces; using SmartRAG.Models; -using System.Text.Json; +using SmartRAG.Services.Logging; namespace SmartRAG.Services; /// -/// Implementation of document service with enhanced semantic search using repository pattern +/// Implementation of document service focused on CRUD operations /// public class DocumentService( IDocumentRepository documentRepository, IDocumentParserService documentParserService, - IAIService aiService, + IDocumentSearchService documentSearchService, SmartRagOptions options, - IAIProviderFactory aiProviderFactory, - IConfiguration configuration) : IDocumentService + ILogger logger) : IDocumentService { public async Task UploadDocumentAsync(Stream fileStream, string fileName, string contentType, string uploadedBy) { var supportedExtensions = documentParserService.GetSupportedFileTypes(); - var supportedContentTypes = documentParserService.GetSupportedContentTypes(); var ext = Path.GetExtension(fileName).ToLowerInvariant(); @@ -45,7 +40,7 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa // Generate embeddings for all chunks in batch for better performance var allChunkContents = document.Chunks.Select(c => c.Content).ToList(); - var allEmbeddings = await TryGenerateEmbeddingsBatchAsync(allChunkContents); + var allEmbeddings = await documentSearchService.GenerateEmbeddingsBatchAsync(allChunkContents); // Apply embeddings to chunks with retry mechanism for (int i = 0; i < document.Chunks.Count; i++) @@ -60,23 +55,23 @@ public async Task UploadDocumentAsync(Stream 
fileStream, string fileNa if (allEmbeddings != null && i < allEmbeddings.Count && allEmbeddings[i] != null && allEmbeddings[i].Count > 0) { chunk.Embedding = allEmbeddings[i]; - Console.WriteLine($"[DEBUG] Chunk {i}: Embedding generated successfully ({allEmbeddings[i].Count} dimensions)"); + ServiceLogMessages.LogChunkEmbeddingSuccess(logger, i, allEmbeddings[i].Count, null); } else { // Retry individual embedding generation for this chunk - Console.WriteLine($"[DEBUG] Chunk {i}: Batch embedding failed, trying individual generation"); - var individualEmbedding = await TryGenerateEmbeddingWithFallback(chunk.Content); + ServiceLogMessages.LogChunkBatchEmbeddingFailed(logger, i, null); + var individualEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); if (individualEmbedding != null && individualEmbedding.Count > 0) { chunk.Embedding = individualEmbedding; - Console.WriteLine($"[DEBUG] Chunk {i}: Individual embedding successful ({individualEmbedding.Count} dimensions)"); + ServiceLogMessages.LogChunkIndividualEmbeddingSuccess(logger, i, individualEmbedding.Count, null); } else { - Console.WriteLine($"[WARNING] Chunk {i}: Failed to generate embedding after retry"); - chunk.Embedding = new List(); // Empty but not null + ServiceLogMessages.LogChunkEmbeddingFailed(logger, i, null); + chunk.Embedding = []; // Empty but not null } } @@ -85,9 +80,9 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa } catch (Exception ex) { - Console.WriteLine($"[ERROR] Chunk {i}: Failed to process: {ex.Message}"); + ServiceLogMessages.LogChunkProcessingFailed(logger, i, ex); // If embedding generation fails, leave it empty and continue - document.Chunks[i].Embedding = new List(); // Empty but not null + document.Chunks[i].Embedding = []; // Empty but not null } } @@ -96,475 +91,51 @@ public async Task UploadDocumentAsync(Stream fileStream, string fileNa return savedDocument; } - public async Task GetDocumentAsync(Guid id) => 
await documentRepository.GetByIdAsync(id); - - public async Task> GetAllDocumentsAsync() => await documentRepository.GetAllAsync(); - - public async Task DeleteDocumentAsync(Guid id) => await documentRepository.DeleteAsync(id); - - public async Task> SearchDocumentsAsync(string query, int maxResults = 5) + public async Task> UploadDocumentsAsync(IEnumerable fileStreams, IEnumerable fileNames, IEnumerable contentTypes, string uploadedBy) { - if (string.IsNullOrWhiteSpace(query)) - throw new ArgumentException("Query cannot be empty", nameof(query)); - - try - { - // Use EnhancedSearchService directly (simplified without Semantic Kernel) - var enhancedSearchService = new EnhancedSearchService(aiProviderFactory, documentRepository, configuration); - var enhancedResults = await enhancedSearchService.EnhancedSemanticSearchAsync(query, maxResults * 2); - - if (enhancedResults.Count > 0) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService returned {enhancedResults.Count} chunks from {enhancedResults.Select(c => c.DocumentId).Distinct().Count()} documents"); - - // Apply diversity selection to ensure chunks from different documents - var diverseResults = ApplyDiversityAndSelect(enhancedResults, maxResults); - - Console.WriteLine($"[DEBUG] Final diverse results: {diverseResults.Count} chunks from {diverseResults.Select(c => c.DocumentId).Distinct().Count()} documents"); - - return diverseResults; - } - } - catch (Exception ex) - { - Console.WriteLine($"[WARNING] EnhancedSearchService failed: {ex.Message}. 
Falling back to basic search."); - } - - // Fallback to basic search if EnhancedSearchService fails - return await PerformBasicSearchAsync(query, maxResults); - } - - /// - /// Basic search fallback when Semantic Kernel is not available - /// - private async Task> PerformBasicSearchAsync(string query, int maxResults) - { - var cleanedQuery = query; - var allDocs = await documentRepository.GetAllAsync(); - - // Fix any chunks with missing DocumentId - foreach (var doc in allDocs) - { - foreach (var chunk in doc.Chunks) - { - if (chunk.DocumentId == Guid.Empty) - chunk.DocumentId = doc.Id; - } - } - - var allResults = new List(); + if (fileStreams == null || !fileStreams.Any()) + throw new ArgumentException("No file streams provided", nameof(fileStreams)); - try - { - // Try embedding generation - var queryEmbedding = await TryGenerateEmbeddingWithFallback(cleanedQuery); - if (queryEmbedding != null && queryEmbedding.Count > 0) - { - var vecScored = new List<(DocumentChunk chunk, double score)>(); - foreach (var doc in allDocs) - { - foreach (var chunk in doc.Chunks) - { - if (chunk.Embedding != null && chunk.Embedding.Count > 0) - { - var score = ComputeCosineSimilarity(queryEmbedding, chunk.Embedding); - Console.WriteLine($"[DEBUG] Chunk {chunk.Id} from {doc.FileName}: score={score:F4}, query_emb_dim={queryEmbedding.Count}, chunk_emb_dim={chunk.Embedding.Count}, content={chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}..."); - vecScored.Add((chunk, score)); - } - } - } - - // Apply improved relevance scoring with content-based boosting - var semanticResults = vecScored - .Select(x => { - var improvedScore = ImproveRelevanceScore(x.score, x.chunk.Content, cleanedQuery); - Console.WriteLine($"[DEBUG] Improved relevance score: chunk={x.chunk.Id}, base={x.score:F4}, final={improvedScore:F4}"); - x.chunk.RelevanceScore = improvedScore; - return x.chunk; - }) - .OrderByDescending(x => x.RelevanceScore) - .Take(maxResults * 2) - .ToList(); + if (fileNames 
== null || !fileNames.Any()) + throw new ArgumentException("No file names provided", nameof(fileNames)); - allResults.AddRange(semanticResults); - } - } - catch - { - // Continue with other search methods - } + if (contentTypes == null || !contentTypes.Any()) + throw new ArgumentException("No content types provided", nameof(contentTypes)); - // Repository search - var primary = await documentRepository.SearchAsync(cleanedQuery, maxResults * 2); - allResults.AddRange(primary); + var streamList = fileStreams.ToList(); + var nameList = fileNames.ToList(); + var typeList = contentTypes.ToList(); - // Fuzzy search if needed - if (allResults.Count < maxResults) - { - var fuzzyResults = await PerformFuzzySearch(cleanedQuery, maxResults); - allResults.AddRange(fuzzyResults.Where(f => !allResults.Any(p => p.Id == f.Id))); - } + if (streamList.Count != nameList.Count || streamList.Count != typeList.Count) + throw new ArgumentException("Number of file streams, names, and content types must match"); - // Remove duplicates and ensure diversity - var uniqueResults = allResults - .GroupBy(c => c.Id) - .Select(g => g.OrderByDescending(c => c.RelevanceScore ?? 
0.0).First()) - .ToList(); - - return ApplyDiversityAndSelect(uniqueResults, maxResults); - } - - private async Task?> TryGenerateEmbeddingWithFallback(string text) - { - try - { - Console.WriteLine($"[DEBUG] Trying primary AI service for embedding generation"); - var result = await aiService.GenerateEmbeddingsAsync(text); - if (result != null && result.Count > 0) - { - Console.WriteLine($"[DEBUG] Primary AI service successful: {result.Count} dimensions"); - return result; - } - Console.WriteLine($"[DEBUG] Primary AI service returned null or empty embedding"); - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] Primary AI service failed: {ex.Message}"); - } - - var embeddingProviders = new[] - { - "Anthropic", - "OpenAI", - "Gemini" - }; + var uploadedDocuments = new List(); - foreach (var provider in embeddingProviders) + // Parallel document upload for better performance + var uploadTasks = streamList.Select(async (stream, index) => { try { - Console.WriteLine($"[DEBUG] Trying {provider} provider for embedding generation"); - var providerEnum = Enum.Parse(provider); - var aiProvider = ((AIProviderFactory)aiProviderFactory).CreateProvider(providerEnum); - var providerConfig = configuration.GetSection($"AI:{provider}").Get(); - - if (providerConfig != null && !string.IsNullOrEmpty(providerConfig.ApiKey)) - { - Console.WriteLine($"[DEBUG] {provider} config found, API key: {providerConfig.ApiKey.Substring(0, 8)}..."); - var embedding = await aiProvider.GenerateEmbeddingAsync(text, providerConfig); - if (embedding != null && embedding.Count > 0) - { - Console.WriteLine($"[DEBUG] {provider} successful: {embedding.Count} dimensions"); - return embedding; - } - else - { - Console.WriteLine($"[DEBUG] {provider} returned null or empty embedding"); - } - } - else - { - Console.WriteLine($"[DEBUG] {provider} config not found or API key missing"); - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] {provider} failed: {ex.Message}"); - continue; - } - } - 
- Console.WriteLine($"[DEBUG] All embedding providers failed for text: {text.Substring(0, Math.Min(50, text.Length))}..."); - - // Special test for VoyageAI if Anthropic is configured - try - { - var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) - { - Console.WriteLine($"[DEBUG] Testing VoyageAI directly with key: {anthropicConfig.EmbeddingApiKey.Substring(0, 8)}..."); - - using var client = new HttpClient(); - client.DefaultRequestHeaders.Add("Authorization", $"Bearer {anthropicConfig.EmbeddingApiKey}"); - - var testPayload = new - { - input = new[] { "test" }, - model = anthropicConfig.EmbeddingModel ?? "voyage-3.5", - input_type = "document" - }; - - var jsonContent = System.Text.Json.JsonSerializer.Serialize(testPayload); - var content = new StringContent(jsonContent, System.Text.Encoding.UTF8, "application/json"); - - var response = await client.PostAsync("https://api.voyageai.com/v1/embeddings", content); - var responseContent = await response.Content.ReadAsStringAsync(); - - Console.WriteLine($"[DEBUG] VoyageAI test response: {response.StatusCode} - {responseContent}"); - - if (response.IsSuccessStatusCode) - { - Console.WriteLine($"[DEBUG] VoyageAI is working! 
Trying to parse embedding..."); - // Parse the response and return a test embedding - try - { - using var doc = System.Text.Json.JsonDocument.Parse(responseContent); - if (doc.RootElement.TryGetProperty("data", out var dataArray) && dataArray.ValueKind == JsonValueKind.Array) - { - var firstEmbedding = dataArray.EnumerateArray().FirstOrDefault(); - if (firstEmbedding.TryGetProperty("embedding", out var embeddingArray) && embeddingArray.ValueKind == JsonValueKind.Array) - { - var testEmbedding = embeddingArray.EnumerateArray() - .Select(x => x.GetSingle()) - .ToList(); - Console.WriteLine($"[DEBUG] VoyageAI test embedding generated: {testEmbedding.Count} dimensions"); - return testEmbedding; - } - } - } - catch (Exception parseEx) - { - Console.WriteLine($"[DEBUG] Failed to parse VoyageAI response: {parseEx.Message}"); - } - } - } + return await UploadDocumentAsync(stream, nameList[index], typeList[index], uploadedBy); } catch (Exception ex) { - Console.WriteLine($"[DEBUG] VoyageAI direct test failed: {ex.Message}"); - } - + ServiceLogMessages.LogDocumentUploadFailed(logger, nameList[index], ex); return null; } + }); - /// - /// Generates embeddings for multiple texts in batch for better performance - /// - private async Task>?> TryGenerateEmbeddingsBatchAsync(List texts) - { - if (texts == null || texts.Count == 0) - return null; + var uploadResults = await Task.WhenAll(uploadTasks); + uploadedDocuments.AddRange(uploadResults.Where(doc => doc != null)!); - try - { - // Try batch embedding generation first - var batchEmbeddings = await aiService.GenerateEmbeddingsBatchAsync(texts); - if (batchEmbeddings != null && batchEmbeddings.Count == texts.Count) - return batchEmbeddings; - } - catch - { - // Fallback to individual generation if batch fails - } + return uploadedDocuments; + } - // Special handling for VoyageAI: Process in smaller batches to respect 3 RPM limit - try - { - var anthropicConfig = configuration.GetSection("AI:Anthropic").Get(); - if 
(anthropicConfig != null && !string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) - { - Console.WriteLine($"[DEBUG] Trying VoyageAI batch processing with rate limiting..."); - - // Process in smaller batches (3 chunks per minute = 20 seconds between batches) - const int rateLimitBatchSize = 3; - var allEmbeddings = new List>(); - - for (int i = 0; i < texts.Count; i += rateLimitBatchSize) - { - var currentBatch = texts.Skip(i).Take(rateLimitBatchSize).ToList(); - Console.WriteLine($"[DEBUG] Processing VoyageAI batch {i/rateLimitBatchSize + 1}: chunks {i+1}-{Math.Min(i+rateLimitBatchSize, texts.Count)}"); - - // Generate embeddings for current batch using VoyageAI - var batchEmbeddings = await GenerateVoyageAIBatchAsync(currentBatch, anthropicConfig); - - if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) - { - allEmbeddings.AddRange(batchEmbeddings); - Console.WriteLine($"[DEBUG] VoyageAI batch {i/rateLimitBatchSize + 1} successful: {batchEmbeddings.Count} embeddings"); - } - else - { - Console.WriteLine($"[WARNING] VoyageAI batch {i/rateLimitBatchSize + 1} failed, using individual fallback"); - // Fallback to individual generation for this batch - var individualEmbeddings = await GenerateIndividualEmbeddingsAsync(currentBatch); - allEmbeddings.AddRange(individualEmbeddings); - } - - // Smart rate limiting: Detect if we hit rate limits and adjust - if (i + rateLimitBatchSize < texts.Count) - { - // Check if we got rate limited in the last batch - var lastBatchSuccess = batchEmbeddings != null && batchEmbeddings.Count > 0; - - if (!lastBatchSuccess) - { - // Rate limited - wait 20 seconds for 3 RPM - Console.WriteLine($"[INFO] Rate limit detected, waiting 20 seconds for 3 RPM limit..."); - await Task.Delay(20000); - } - else - { - // No rate limit - continue at full speed (2000 RPM) - Console.WriteLine($"[INFO] No rate limit detected, continuing at full speed (2000 RPM)"); - // No delay needed for 2000 RPM - } - } - } - - if 
(allEmbeddings.Count == texts.Count) - { - Console.WriteLine($"[DEBUG] VoyageAI batch processing completed: {allEmbeddings.Count} embeddings"); - return allEmbeddings; - } - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] VoyageAI batch processing failed: {ex.Message}"); - } + public async Task GetDocumentAsync(Guid id) => await documentRepository.GetByIdAsync(id); - // Final fallback: generate embeddings individually (but still in parallel) - Console.WriteLine($"[DEBUG] Falling back to individual embedding generation for {texts.Count} chunks"); - var embeddingTasks = texts.Select(async text => await TryGenerateEmbeddingWithFallback(text)).ToList(); - var embeddings = await Task.WhenAll(embeddingTasks); - - return embeddings.Where(e => e != null).Select(e => e!).ToList(); - } - - /// - /// Generates embeddings for a batch using VoyageAI directly - /// - private async Task>?> GenerateVoyageAIBatchAsync(List texts, AIProviderConfig config) - { - try - { - using var client = new HttpClient(); - client.DefaultRequestHeaders.Add("Authorization", $"Bearer {config.EmbeddingApiKey}"); - - var payload = new - { - input = texts, - model = config.EmbeddingModel ?? "voyage-3.5", - input_type = "document" - }; - - var jsonContent = System.Text.Json.JsonSerializer.Serialize(payload); - var content = new StringContent(jsonContent, System.Text.Encoding.UTF8, "application/json"); - - Console.WriteLine($"[DEBUG] VoyageAI batch request payload: {jsonContent}"); - var response = await client.PostAsync("https://api.voyageai.com/v1/embeddings", content); - var responseContent = await response.Content.ReadAsStringAsync(); - - Console.WriteLine($"[DEBUG] VoyageAI batch response: {response.StatusCode} - {responseContent}"); - - if (response.IsSuccessStatusCode) - { - var parsedEmbeddings = ParseVoyageAIBatchResponse(responseContent); - Console.WriteLine($"[DEBUG] VoyageAI batch parsed: {parsedEmbeddings?.Count ?? 
0} embeddings"); - return parsedEmbeddings; - } - else - { - Console.WriteLine($"[DEBUG] VoyageAI batch request failed: {response.StatusCode} - {responseContent}"); - return null; - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] VoyageAI batch generation failed: {ex.Message}"); - return null; - } - } - - /// - /// Parses VoyageAI batch response - /// - private static List>? ParseVoyageAIBatchResponse(string response) - { - try - { - using var doc = System.Text.Json.JsonDocument.Parse(response); - - if (doc.RootElement.TryGetProperty("data", out var dataArray) && dataArray.ValueKind == JsonValueKind.Array) - { - var embeddings = new List>(); - - foreach (var item in dataArray.EnumerateArray()) - { - if (item.TryGetProperty("embedding", out var embeddingArray) && embeddingArray.ValueKind == JsonValueKind.Array) - { - var embedding = embeddingArray.EnumerateArray() - .Select(x => x.GetSingle()) - .ToList(); - embeddings.Add(embedding); - } - } - - return embeddings.Count > 0 ? embeddings : null; - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] Failed to parse VoyageAI batch response: {ex.Message}"); - } - - return null; - } - - /// - /// Generates embeddings individually for a batch as fallback - /// - private async Task>> GenerateIndividualEmbeddingsAsync(List texts) - { - var embeddings = new List>(); - - foreach (var text in texts) - { - var embedding = await TryGenerateEmbeddingWithFallback(text); - embeddings.Add(embedding ?? 
new List()); - } - - return embeddings; - } + public async Task> GetAllDocumentsAsync() => await documentRepository.GetAllAsync(); - private static double ComputeCosineSimilarity(List a, List b) - { - if (a == null || b == null) return 0.0; - int n = Math.Min(a.Count, b.Count); - if (n == 0) return 0.0; - - // Normalize embeddings for better similarity calculation - var normalizedA = NormalizeEmbedding(a); - var normalizedB = NormalizeEmbedding(b); - - double dot = 0; - for (int i = 0; i < n; i++) - { - dot += normalizedA[i] * normalizedB[i]; - } - - // Cosine similarity is just dot product of normalized vectors - return dot; - } - - /// - /// Normalizes embedding vector to unit length for better similarity calculation - /// - private static List NormalizeEmbedding(List embedding) - { - if (embedding == null || embedding.Count == 0) return new List(); - - // Convert to double for better precision - var doubleEmbedding = embedding.Select(x => (double)x).ToList(); - - // Calculate magnitude - double magnitude = Math.Sqrt(doubleEmbedding.Sum(x => x * x)); - - if (magnitude == 0) return doubleEmbedding; - - // Normalize to unit length - return doubleEmbedding.Select(x => x / magnitude).ToList(); - } + public async Task DeleteDocumentAsync(Guid id) => await documentRepository.DeleteAsync(id); public Task> GetStorageStatisticsAsync() { @@ -580,51 +151,11 @@ public Task> GetStorageStatisticsAsync() return Task.FromResult(stats); } - /// - /// Improves relevance score by considering content similarity and keyword matching - /// - private static double ImproveRelevanceScore(double baseScore, string content, string query) - { - if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(query)) - return baseScore; - - var improvedScore = baseScore; - - // Convert to lowercase for case-insensitive comparison - var lowerContent = content.ToLowerInvariant(); - var lowerQuery = query.ToLowerInvariant(); - - // Extract key terms from query (simple approach) - var queryTerms = 
lowerQuery.Split(new[] { ' ', ',', '.', '?', '!' }, StringSplitOptions.RemoveEmptyEntries) - .Where(term => term.Length > 2) // Only meaningful terms - .ToList(); - - // Calculate content relevance boost - var contentBoost = 0.0; - foreach (var term in queryTerms) - { - if (lowerContent.Contains(term)) - { - contentBoost += 0.1; // 10% boost per matching term - } - } - - // Apply content boost (cap at 50% to avoid over-boosting) - contentBoost = Math.Min(contentBoost, 0.5); - improvedScore += contentBoost; - - // Ensure score doesn't exceed 1.0 - return Math.Min(improvedScore, 1.0); - } - - /// - /// Regenerate embeddings for all existing documents (useful for fixing missing embeddings) - /// public async Task RegenerateAllEmbeddingsAsync() { try { - Console.WriteLine("[INFO] Starting embedding regeneration for all documents..."); + ServiceLogMessages.LogEmbeddingRegenerationStarted(logger, null); var allDocuments = await documentRepository.GetAllAsync(); var totalChunks = allDocuments.Sum(d => d.Chunks.Count); @@ -637,7 +168,7 @@ public async Task RegenerateAllEmbeddingsAsync() foreach (var document in allDocuments) { - Console.WriteLine($"[INFO] Document: {document.FileName} ({document.Chunks.Count} chunks)"); + ServiceLogMessages.LogDocumentProcessing(logger, document.FileName, document.Chunks.Count, null); foreach (var chunk in document.Chunks) { @@ -653,11 +184,11 @@ public async Task RegenerateAllEmbeddingsAsync() } } - Console.WriteLine($"[INFO] Total chunks to process: {chunksToProcess.Count} out of {totalChunks}"); + ServiceLogMessages.LogTotalChunksToProcess(logger, chunksToProcess.Count, totalChunks, null); if (chunksToProcess.Count == 0) { - Console.WriteLine("[INFO] All chunks already have valid embeddings. 
No processing needed."); + ServiceLogMessages.LogNoProcessingNeeded(logger, null); return true; } @@ -665,7 +196,7 @@ public async Task RegenerateAllEmbeddingsAsync() const int batchSize = 128; var totalBatches = (int)Math.Ceiling((double)chunksToProcess.Count / batchSize); - Console.WriteLine($"[INFO] Processing in {totalBatches} batches of {batchSize} chunks"); + ServiceLogMessages.LogBatchProcessing(logger, totalBatches, null); for (int batchIndex = 0; batchIndex < totalBatches; batchIndex++) { @@ -673,11 +204,11 @@ public async Task RegenerateAllEmbeddingsAsync() var endIndex = Math.Min(startIndex + batchSize, chunksToProcess.Count); var currentBatch = chunksToProcess.Skip(startIndex).Take(endIndex - startIndex).ToList(); - Console.WriteLine($"[INFO] Processing batch {batchIndex + 1}/{totalBatches}: chunks {startIndex + 1}-{endIndex}"); + ServiceLogMessages.LogBatchProgress(logger, batchIndex + 1, totalBatches, null); // Generate embeddings for current batch var batchContents = currentBatch.Select(c => c.Content).ToList(); - var batchEmbeddings = await TryGenerateEmbeddingsBatchAsync(batchContents); + var batchEmbeddings = await documentSearchService.GenerateEmbeddingsBatchAsync(batchContents); if (batchEmbeddings != null && batchEmbeddings.Count == currentBatch.Count) { @@ -691,23 +222,23 @@ public async Task RegenerateAllEmbeddingsAsync() { chunk.Embedding = embedding; successCount++; - Console.WriteLine($"[DEBUG] Chunk {chunk.Id}: Batch embedding successful ({embedding.Count} dimensions)"); + ServiceLogMessages.LogChunkBatchEmbeddingSuccess(logger, i, embedding.Count, null); } else { - Console.WriteLine($"[WARNING] Chunk {chunk.Id}: Batch embedding failed, trying individual generation"); + ServiceLogMessages.LogChunkBatchEmbeddingFailedRetry(logger, chunk.Id, null); // Fallback to individual generation - var individualEmbedding = await TryGenerateEmbeddingWithFallback(chunk.Content); + var individualEmbedding = await 
documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); if (individualEmbedding != null && individualEmbedding.Count > 0) { chunk.Embedding = individualEmbedding; successCount++; - Console.WriteLine($"[DEBUG] Chunk {chunk.Id}: Individual embedding successful ({individualEmbedding.Count} dimensions)"); + ServiceLogMessages.LogChunkIndividualEmbeddingSuccessRetry(logger, chunk.Id, individualEmbedding.Count, null); } else { - Console.WriteLine($"[WARNING] Chunk {chunk.Id}: All embedding methods failed"); + ServiceLogMessages.LogChunkAllEmbeddingMethodsFailed(logger, chunk.Id, null); } } @@ -716,63 +247,49 @@ public async Task RegenerateAllEmbeddingsAsync() } else { - Console.WriteLine($"[WARNING] Batch {batchIndex + 1} failed, processing individually"); + ServiceLogMessages.LogBatchFailed(logger, batchIndex + 1, null); // Process chunks individually if batch fails foreach (var chunk in currentBatch) { try { - var newEmbedding = await TryGenerateEmbeddingWithFallback(chunk.Content); + var newEmbedding = await documentSearchService.GenerateEmbeddingWithFallbackAsync(chunk.Content); if (newEmbedding != null && newEmbedding.Count > 0) { chunk.Embedding = newEmbedding; successCount++; - Console.WriteLine($"[DEBUG] Chunk {chunk.Id}: Individual embedding successful ({newEmbedding.Count} dimensions)"); + ServiceLogMessages.LogChunkIndividualEmbeddingSuccessFinal(logger, chunk.Id, newEmbedding.Count, null); } else { - Console.WriteLine($"[WARNING] Chunk {chunk.Id}: Failed to generate embedding"); + ServiceLogMessages.LogChunkEmbeddingGenerationFailed(logger, chunk.Id, null); } processedChunks++; } catch (Exception ex) { - Console.WriteLine($"[ERROR] Chunk {chunk.Id}: Failed to regenerate embedding: {ex.Message}"); + ServiceLogMessages.LogChunkEmbeddingRegenerationFailed(logger, chunk.Id, ex); processedChunks++; } } } // Progress update - Console.WriteLine($"[INFO] Progress: {processedChunks}/{chunksToProcess.Count} chunks processed, {successCount} embeddings 
generated"); + ServiceLogMessages.LogProgress(logger, processedChunks, chunksToProcess.Count, successCount, null); - // Smart rate limiting: Check if we need to wait based on VoyageAI response + // Smart rate limiting if (batchIndex < totalBatches - 1) // Don't wait after last batch { - // Check if the last batch was successful (no rate limiting) - var lastBatchSuccess = successCount > 0; // If we got embeddings, no rate limit - - if (!lastBatchSuccess) - { - // Rate limited - wait 20 seconds for 3 RPM - Console.WriteLine($"[INFO] Rate limit detected, waiting 20 seconds for 3 RPM limit..."); - await Task.Delay(20000); - } - else - { - // No rate limit - continue at full speed (2000 RPM) - Console.WriteLine($"[INFO] No rate limit detected, continuing at full speed (2000 RPM)"); - // No delay needed for 2000 RPM - } + await Task.Delay(1000); // Simple rate limiting } } // Save all documents with updated embeddings var documentsToUpdate = documentChunkMap.Values.Distinct().ToList(); - Console.WriteLine($"[INFO] Saving {documentsToUpdate.Count} documents with updated embeddings..."); + ServiceLogMessages.LogSavingDocuments(logger, documentsToUpdate.Count, null); foreach (var document in documentsToUpdate) { @@ -780,775 +297,13 @@ public async Task RegenerateAllEmbeddingsAsync() await documentRepository.AddAsync(document); } - Console.WriteLine($"[INFO] Embedding regeneration completed. 
{successCount} embeddings generated for {processedChunks} chunks in {totalBatches} batches."); + ServiceLogMessages.LogEmbeddingRegenerationCompleted(logger, successCount, processedChunks, null); return successCount > 0; } catch (Exception ex) { - Console.WriteLine($"[ERROR] Failed to regenerate embeddings: {ex.Message}"); - return false; - } - } - - public async Task GenerateRagAnswerAsync(string query, int maxResults = 5) - { - if (string.IsNullOrWhiteSpace(query)) - throw new ArgumentException("Query cannot be empty", nameof(query)); - - // Try EnhancedSearchService first - try - { - var enhancedSearchService = new EnhancedSearchService(aiProviderFactory, documentRepository, configuration); - var enhancedResponse = await enhancedSearchService.MultiStepRAGAsync(query, maxResults); - - if (enhancedResponse != null && !string.IsNullOrEmpty(enhancedResponse.Answer)) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService RAG successful, using enhanced response"); - return enhancedResponse; - } - } - catch (Exception ex) - { - Console.WriteLine($"[WARNING] EnhancedSearchService RAG failed: {ex.Message}, falling back to basic RAG"); - } - - // Fallback to basic RAG implementation - return await GenerateBasicRagAnswerAsync(query, maxResults); - } - - /// - /// Basic RAG implementation when Semantic Kernel is not available - /// - private async Task GenerateBasicRagAnswerAsync(string query, int maxResults = 5) - { - // Get all documents for cross-document analysis - var allDocuments = await GetAllDocumentsAsync(); - - // Cross-document detection - var isCrossDocument = IsCrossDocumentQueryAsync(query, allDocuments); - - List relevantChunks; - - // Increase maxResults for better document coverage, but respect user's maxResults - var adjustedMaxResults = maxResults == 1 ? 
1 : Math.Max(maxResults * 2, 5); // Respect maxResults=1, otherwise reasonable increase - - if (isCrossDocument) - { - relevantChunks = await PerformCrossDocumentSearchAsync(query, allDocuments, adjustedMaxResults); - } - else - { - relevantChunks = await PerformStandardSearchAsync(query, adjustedMaxResults); - } - - // Optimize context assembly: combine chunks intelligently - var contextMaxResults = isCrossDocument ? Math.Max(maxResults, 3) : maxResults; - - var optimizedChunks = OptimizeContextWindow(relevantChunks, contextMaxResults, query); - - var documentIdToName = new Dictionary(); - - foreach (var docId in optimizedChunks.Select(c => c.DocumentId).Distinct()) - { - var doc = await GetDocumentAsync(docId); - if (doc != null) - { - documentIdToName[docId] = doc.FileName; - } - } - - // Create enhanced context with metadata for better AI understanding - var enhancedContext = new List(); - - foreach (var chunk in optimizedChunks.OrderByDescending(c => c.RelevanceScore ?? 0.0)) - { - var docName = documentIdToName.TryGetValue(chunk.DocumentId, out var name) ? name : "Document"; - var relevance = chunk.RelevanceScore ?? 0.0; - var chunkInfo = $"[Document: {docName}, Relevance: {relevance:F3}, Chunk: {chunk.ChunkIndex}]\n{chunk.Content}"; - enhancedContext.Add(chunkInfo); - } - - var contextText = string.Join("\n\n---\n\n", enhancedContext); - - // Generate RAG answer using AI with enhanced prompt - var prompt = isCrossDocument - ? $"You are a precise information retrieval system. Analyze the following context and answer the query step by step.\n\nQuery: {query}\n\nContext:\n{contextText}\n\nInstructions:\n1. Extract specific facts from the context\n2. Answer each part of the query separately\n3. If information is missing, state 'This information is not available in the provided documents'\n4. Use exact quotes from context when possible\n\nAnswer:" - : $"You are a precise information retrieval system. 
Analyze the following context and answer the question step by step.\n\nQuestion: {query}\n\nContext:\n{contextText}\n\nInstructions:\n1. Extract specific facts from the context\n2. Answer each part of the question separately\n3. If information is missing, state 'This information is not available in the provided documents'\n4. Use exact quotes from context when possible\n\nAnswer:"; - - var answer = await aiService.GenerateResponseAsync(prompt, enhancedContext); - - var sources = optimizedChunks.Select(c => new SearchSource - { - DocumentId = c.DocumentId, - FileName = documentIdToName.TryGetValue(c.DocumentId, out var name) ? name : "Document", - RelevantContent = c.Content, - RelevanceScore = c.RelevanceScore ?? 0.0 - }).ToList(); - - return new RagResponse - { - Query = query, - Answer = answer, - Sources = sources, - SearchedAt = DateTime.UtcNow, - Configuration = GetRagConfiguration() - }; - } - - /// - /// Applies advanced re-ranking algorithm to improve chunk selection - /// - private static List ApplyReranking(List chunks, string query, int maxResults) - { - if (chunks.Count == 0) - return chunks; - - var queryKeywords = ExtractKeywords(query.ToLowerInvariant()); - var queryLength = query.Length; - var documentIds = chunks.Select(c => c.DocumentId).Distinct().ToList(); - - // Enhanced scoring algorithm - foreach (var chunk in chunks) - { - var originalScore = chunk.RelevanceScore ?? 0.0; - var enhancedScore = originalScore; - - // Factor 1: Exact keyword matching boost (CRITICAL!) 
- var chunkContent = chunk.Content.ToLowerInvariant(); - var exactMatches = 0; - - // Use cleaned keywords (noise/punctuation-safe) - var cleanedQueryKeywords = ExtractKeywords(query.ToLowerInvariant()); - foreach (var kw in cleanedQueryKeywords) - { - if (kw.Length > 2 && chunkContent.Contains(kw)) - { - exactMatches++; - } - } - - if (cleanedQueryKeywords.Count > 0) - { - var exactMatchRatio = (double)exactMatches / cleanedQueryKeywords.Count; - enhancedScore += exactMatchRatio * 0.6; // 60% boost for exact matches! - } - - // Additional keyword density boost - var chunkKeywords = ExtractKeywords(chunkContent); - var commonKeywords = queryKeywords.Intersect(chunkKeywords, StringComparer.OrdinalIgnoreCase).Count(); - - if (queryKeywords.Count > 0) - { - var keywordDensity = (double)commonKeywords / queryKeywords.Count; - enhancedScore += keywordDensity * 0.2; // 20% boost for keyword matches - } - - // Generic content relevance boost - var contentBoost = 0.0; - - // Boost for query term matches in content - var queryTermMatches = queryKeywords.Count(term => chunkContent.Contains(term, StringComparison.OrdinalIgnoreCase)); - contentBoost += Math.Min(0.3, queryTermMatches * 0.1); // Max 30% boost - - enhancedScore += contentBoost; - - // Factor 2: Content length optimization (not too short, not too long) - var contentLength = chunk.Content.Length; - var optimalLength = Math.Min(800, Math.Max(200, queryLength * 10)); // Dynamic optimal length - var lengthScore = 1.0 - Math.Abs(contentLength - optimalLength) / (double)optimalLength; - enhancedScore += Math.Max(0, lengthScore * 0.15); // 15% boost for optimal length - - // Factor 3: Position in document (earlier chunks often more important) - var positionBoost = Math.Max(0, 1.0 - (chunk.ChunkIndex * 0.05)); // Decrease by 5% per chunk - enhancedScore += positionBoost * 0.1; // 10% boost for position - - // Factor 4: Query term proximity (how close query terms are in content) - var proximityScore = 
CalculateTermProximity(chunk.Content, queryKeywords); - enhancedScore += proximityScore * 0.2; // 20% boost for proximity - - // Factor 5: Document diversity boost (NEW!) - var documentDiversityBoost = CalculateDocumentDiversityBoost(chunk.DocumentId, documentIds, chunks); - enhancedScore += documentDiversityBoost * 0.15; // 15% boost for diversity - - chunk.RelevanceScore = Math.Min(1.0, enhancedScore); // Cap at 1.0 - } - - return chunks; - } - - /// - /// Calculates how close query terms are to each other in the content - /// - private static double CalculateTermProximity(string content, List queryTerms) - { - if (queryTerms.Count == 0) return 0.0; - - var contentLower = content.ToLowerInvariant(); - var termPositions = new List(); - - foreach (var term in queryTerms) - { - var index = contentLower.IndexOf(term, StringComparison.OrdinalIgnoreCase); - if (index >= 0) - { - termPositions.Add(index); - } - } - - if (termPositions.Count < 2) return termPositions.Count > 0 ? 0.5 : 0.0; - - // Calculate average distance between terms - termPositions.Sort(); - var totalDistance = 0; - for (int i = 1; i < termPositions.Count; i++) - { - totalDistance += termPositions[i] - termPositions[i - 1]; - } - - var averageDistance = totalDistance / (termPositions.Count - 1); - // Closer terms = higher score (inverse relationship) - return Math.Max(0, 1.0 - averageDistance / 200.0); // Normalize by 200 characters - } - - /// - /// Applies diversity selection to avoid too many chunks from same document - /// - private static List ApplyDiversityAndSelect(List chunks, int maxResults) - { - if (chunks.Count == 0) return new List(); - - var uniqueDocumentIds = chunks.Select(c => c.DocumentId).Distinct().ToList(); - - Console.WriteLine($"[DEBUG] ApplyDiversityAndSelect: Total chunks: {chunks.Count}, Unique documents: {uniqueDocumentIds.Count}"); - Console.WriteLine($"[DEBUG] Document IDs: {string.Join(", ", uniqueDocumentIds.Take(5))}"); - - // Calculate min chunks per document - 
respect maxResults constraint - var minChunksPerDocument = Math.Max(1, Math.Min(2, Math.Max(1, maxResults / uniqueDocumentIds.Count))); // Min 1, Max 2 - var maxChunksPerDocument = Math.Min(maxResults, Math.Max(minChunksPerDocument, 2)); // Don't exceed maxResults - - Console.WriteLine($"[DEBUG] Min chunks per doc: {minChunksPerDocument}, Max chunks per doc: {maxChunksPerDocument}"); - - var selectedChunks = new List(); - var documentChunkCounts = new Dictionary(); - - // First pass: ensure minimum representation from each document, but respect maxResults - var totalSelected = 0; - foreach (var documentId in uniqueDocumentIds) - { - if (totalSelected >= maxResults) break; // Stop if we've reached maxResults - - var availableChunks = chunks.Where(c => c.DocumentId == documentId).ToList(); - var actualMinChunks = Math.Min(minChunksPerDocument, availableChunks.Count); - - // Don't exceed maxResults - var availableSlots = maxResults - totalSelected; - actualMinChunks = Math.Min(actualMinChunks, availableSlots); - - var documentChunks = availableChunks - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .Take(actualMinChunks) - .ToList(); - - Console.WriteLine($"[DEBUG] Document {documentId}: Available {availableChunks.Count}, Selected {documentChunks.Count} chunks (requested min: {minChunksPerDocument}, actual min: {actualMinChunks})"); - - selectedChunks.AddRange(documentChunks); - documentChunkCounts[documentId] = documentChunks.Count; - totalSelected += documentChunks.Count; - } - - // Second pass: fill remaining slots with best remaining chunks, but respect maxResults - var remainingSlots = maxResults - selectedChunks.Count; - if (remainingSlots > 0) - { - var remainingChunks = chunks.Except(selectedChunks) - .OrderByDescending(c => c.RelevanceScore ?? 
0.0) - .ToList(); - - foreach (var chunk in remainingChunks) - { - if (remainingSlots <= 0 || selectedChunks.Count >= maxResults) break; - - var currentCount = documentChunkCounts.GetValueOrDefault(chunk.DocumentId, 0); - if (currentCount < maxChunksPerDocument) - { - selectedChunks.Add(chunk); - documentChunkCounts[chunk.DocumentId] = currentCount + 1; - remainingSlots--; - } - } - } - - // Ensure we don't exceed maxResults - var finalResult = selectedChunks.Take(maxResults).ToList(); - - Console.WriteLine($"[DEBUG] Final result: {finalResult.Count} chunks from {finalResult.Select(c => c.DocumentId).Distinct().Count()} documents (maxResults requested: {maxResults})"); - - return finalResult; - } - - /// - /// Performs fuzzy search with typo tolerance - /// - private async Task> PerformFuzzySearch(string query, int maxResults) - { - var fuzzyResults = new List(); - - try - { - var allDocs = await documentRepository.GetAllAsync(); - var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries); - - foreach (var doc in allDocs) - { - foreach (var chunk in doc.Chunks) - { - var chunkContent = chunk.Content.ToLowerInvariant(); - var chunkWords = chunkContent.Split(' ', StringSplitOptions.RemoveEmptyEntries); - - var fuzzyScore = 0.0; - var matchedWords = 0; - - foreach (var queryWord in queryWords) - { - if (queryWord.Length < 3) continue; // Skip very short words - - var bestMatch = 0.0; - foreach (var chunkWord in chunkWords) - { - if (chunkWord.Length < 3) continue; - - // Calculate similarity - var similarity = CalculateStringSimilarity(queryWord, chunkWord); - if (similarity > bestMatch) - { - bestMatch = similarity; - } - } - - // If similarity is above threshold, count as match - if (bestMatch >= 0.7) // 70% similarity threshold - { - fuzzyScore += bestMatch; - matchedWords++; - } - } - - // Calculate final fuzzy score - if (matchedWords > 0) - { - var finalScore = (fuzzyScore / queryWords.Length) * 0.8; // Fuzzy matches get 80% of 
perfect score - chunk.RelevanceScore = finalScore; - fuzzyResults.Add(chunk); - } - } - } - - return fuzzyResults - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .Take(maxResults) - .ToList(); - } - catch - { - return fuzzyResults; - } - } - - /// - /// Calculates string similarity using Levenshtein distance - /// - private static double CalculateStringSimilarity(string s1, string s2) - { - if (string.IsNullOrEmpty(s1) || string.IsNullOrEmpty(s2)) - return 0.0; - - if (s1 == s2) - return 1.0; - - var longer = s1.Length > s2.Length ? s1 : s2; - var shorter = s1.Length > s2.Length ? s2 : s1; - - var editDistance = LevenshteinDistance(longer, shorter); - return (longer.Length - editDistance) / (double)longer.Length; - } - - /// - /// Calculates Levenshtein distance between two strings - /// - private static int LevenshteinDistance(string s1, string s2) - { - var len1 = s1.Length; - var len2 = s2.Length; - var matrix = new int[len1 + 1, len2 + 1]; - - for (int i = 0; i <= len1; i++) - matrix[i, 0] = i; - - for (int j = 0; j <= len2; j++) - matrix[0, j] = j; - - for (int i = 1; i <= len1; i++) - { - for (int j = 1; j <= len2; j++) - { - var cost = s1[i - 1] == s2[j - 1] ? 
0 : 1; - matrix[i, j] = Math.Min( - Math.Min(matrix[i - 1, j] + 1, matrix[i, j - 1] + 1), - matrix[i - 1, j - 1] + cost); - } - } - - return matrix[len1, len2]; - } - - /// - /// Extracts key words from query for additional search terms - /// - private static List ExtractKeywords(string query) - { - var stopWords = new HashSet { "ne", "nedir", "nasıl", "hangi", "kim", "nerede", "ne zaman", "neden", - "what", "how", "where", "when", "why", "who", "which", "is", "are", "the", "a", "an" }; - - var words = query.ToLowerInvariant() - .Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2 && !stopWords.Contains(w)) - .ToList(); - - return words; - } - - /// - /// Optimizes context window by intelligently selecting and combining chunks - /// - private static List OptimizeContextWindow(List chunks, int maxResults, string query) - { - if (chunks.Count == 0) return new List(); - - // Group chunks by document for better context - var documentGroups = chunks.GroupBy(c => c.DocumentId).ToList(); - - var finalChunks = new List(); - var remainingSlots = maxResults; - - // Build keyword list from query - var queryKeywords = ExtractKeywords(query.ToLowerInvariant()); - var targetKeywords = new HashSet(queryKeywords, StringComparer.OrdinalIgnoreCase); - - // Process each document group - foreach (var group in documentGroups.OrderByDescending(g => g.Max(c => c.RelevanceScore ?? 0.0))) - { - if (remainingSlots <= 0) break; - - // Prefer domain keyword matches within the document if available - var domainMatched = group - .Select(c => new { Chunk = c, Text = c.Content.ToLowerInvariant() }) - .Where(x => targetKeywords.Any(k => x.Text.Contains(k))) - .Select(x => x.Chunk) - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .FirstOrDefault(); - - var bestChunk = domainMatched ?? group.OrderByDescending(c => c.RelevanceScore ?? 
0.0).First(); - finalChunks.Add(bestChunk); - remainingSlots--; - - // Add additional chunks if slots remain - if (remainingSlots > 0) - { - // Bring in other domain matches first, then top by relevance - var domainExtras = group - .Where(c => !ReferenceEquals(c, bestChunk)) - .Select(c => new { Chunk = c, Text = c.Content.ToLowerInvariant() }) - .Where(x => targetKeywords.Any(k => x.Text.Contains(k))) - .Select(x => x.Chunk) - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .ToList(); - - var nonDomainExtras = group - .Where(c => !ReferenceEquals(c, bestChunk) && !domainExtras.Contains(c)) - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .ToList(); - - var extras = domainExtras.Concat(nonDomainExtras) - .Take(Math.Min(remainingSlots, 3)) // allow up to 3 extras per doc to improve coverage - .ToList(); - - finalChunks.AddRange(extras); - remainingSlots -= extras.Count; - } - } - - return finalChunks; - } - - /// - /// Detects if query requires information from multiple documents - /// - private static bool IsCrossDocumentQueryAsync(string query, List allDocuments) - { - if (allDocuments.Count <= 1) + ServiceLogMessages.LogEmbeddingRegenerationFailed(logger, ex); return false; - - // Extract topics from query - var queryTopics = ExtractQueryTopics(query); - - var relevantDocs = 0; - var isCrossDocument = false; - - foreach (var doc in allDocuments) - { - var docTopics = ExtractDocumentTopics(doc); - var matchCount = CalculateTopicMatches(queryTopics, docTopics); - - if (matchCount > 0) - { - relevantDocs++; - if (relevantDocs > 1) - { - isCrossDocument = true; - break; - } - } } - - return isCrossDocument; - } - - /// - /// Calculates topic matches using flexible matching strategies - /// - private static int CalculateTopicMatches(List queryTopics, List docTopics) - { - var matchCount = 0.0; - - foreach (var queryTopic in queryTopics) - { - // Strategy 1: Exact match - if (docTopics.Contains(queryTopic, StringComparer.OrdinalIgnoreCase)) - { - matchCount += 
2.0; // Higher weight for exact matches - continue; - } - - // Strategy 2: Contains match (partial match) - var containsMatch = docTopics.Any(dt => - dt.Contains(queryTopic, StringComparison.OrdinalIgnoreCase) || - queryTopic.Contains(dt, StringComparison.OrdinalIgnoreCase)); - - if (containsMatch) - { - matchCount += 1.0; // Lower weight for partial matches - continue; - } - - // Strategy 3: Word-level match - var queryWords = queryTopic.Split(' ', StringSplitOptions.RemoveEmptyEntries); - var docWords = docTopics.SelectMany(dt => dt.Split(' ', StringSplitOptions.RemoveEmptyEntries)).ToList(); - - var wordMatches = queryWords.Count(qw => - docWords.Any(dw => - dw.Contains(qw, StringComparison.OrdinalIgnoreCase) || - qw.Contains(dw, StringComparison.OrdinalIgnoreCase))); - - if (wordMatches > 0) - { - matchCount += wordMatches * 0.5; // Partial weight for word matches - } - } - - return (int)Math.Round(matchCount); - } - - /// - /// Extracts main topics from user query - /// - private static List ExtractQueryTopics(string query) - { - var topics = new List(); - var words = query.ToLowerInvariant() - .Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) // Filter out very short words - .ToList(); - - // Add single words - topics.AddRange(words); - - // Add bigrams (2-word combinations) - for (int i = 0; i < words.Count - 1; i++) - { - topics.Add($"{words[i]} {words[i + 1]}"); - } - - // Add trigrams (3-word combinations) for better coverage - for (int i = 0; i < words.Count - 2; i++) - { - topics.Add($"{words[i]} {words[i + 1]} {words[i + 2]}"); - } - - // Add individual important words with higher priority - var importantWords = words.Where(w => w.Length > 4).ToList(); - topics.AddRange(importantWords); - - return topics.Distinct().ToList(); - } - - /// - /// Extracts main topics from document content - /// - private static List ExtractDocumentTopics(Document document) - { - var topics = new HashSet(); - var content = 
document.Content.ToLowerInvariant(); - - // Extract key phrases from document - increase coverage - var sentences = content.Split(['.', '!', '?'], StringSplitOptions.RemoveEmptyEntries); - - // Process more sentences for better topic coverage - foreach (var sentence in sentences.Take(20)) // Increased from 10 to 20 - { - var sentenceWords = sentence.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) - .ToList(); - - // Add single words - topics.UnionWith(sentenceWords); - - // Add bigrams - for (int i = 0; i < sentenceWords.Count - 1; i++) - { - topics.Add($"{sentenceWords[i]} {sentenceWords[i + 1]}"); - } - - // Add trigrams - for (int i = 0; i < sentenceWords.Count - 2; i++) - { - topics.Add($"{sentenceWords[i]} {sentenceWords[i + 1]} {sentenceWords[i + 2]}"); - } - } - - // Also extract from chunk content for better coverage - foreach (var chunk in document.Chunks.Take(10)) - { - var chunkWords = chunk.Content.ToLowerInvariant() - .Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) - .ToList(); - - topics.UnionWith(chunkWords); - } - - return topics.Take(50).ToList(); // Increased from 20 to 50 - } - - /// - /// Performs cross-document search with enhanced diversity - /// - private async Task> PerformCrossDocumentSearchAsync(string query, List allDocuments, int maxResults) - { - var adjustedMaxResults = maxResults == 1 ? 1 : Math.Max(maxResults, 3); // Respect maxResults=1, otherwise minimum 3 - - // Direct search with original query for cross-document - var searchResults = Math.Max(adjustedMaxResults * 3, options.MaxSearchResults); - var allChunks = await SearchDocumentsAsync(query, searchResults); - - // Remove duplicates and keep highest score - var uniqueChunks = allChunks - .GroupBy(c => c.Id) - .Select(g => g.OrderByDescending(c => c.RelevanceScore ?? 0.0).First()) - .OrderByDescending(c => c.RelevanceScore ?? 
0.0) - .ToList(); - - var rerankedChunks = DocumentService.ApplyReranking(uniqueChunks, query, searchResults); - var finalChunks = DocumentService.ApplyDiversityAndSelect(rerankedChunks, adjustedMaxResults); - - return finalChunks; - } - - /// - /// Performs standard single-document search - /// - private async Task> PerformStandardSearchAsync(string query, int maxResults) - { - // Direct search with original query (more reliable) - var searchResults = Math.Max(maxResults * 2, options.MaxSearchResults); - var allRelevantChunks = await SearchDocumentsAsync(query, searchResults); - - // Remove duplicates and keep highest score - var uniqueChunks = allRelevantChunks - .GroupBy(c => c.Id) - .Select(g => g.OrderByDescending(c => c.RelevanceScore ?? 0.0).First()) - .OrderByDescending(c => c.RelevanceScore ?? 0.0) - .ToList(); - - // Apply advanced re-ranking algorithm - var rerankedChunks = DocumentService.ApplyReranking(uniqueChunks, query, searchResults); - - // Apply standard diversity selection - return DocumentService.ApplyDiversityAndSelect(rerankedChunks, maxResults); - } - - /// - /// Get current RAG configuration dynamically from Program.cs and appsettings.json - /// - private RagConfiguration GetRagConfiguration() - { - // Read from Program.cs configuration and appsettings.json - var currentProvider = GetCurrentAIProviderFromConfig(); - - return new RagConfiguration - { - AIProvider = currentProvider, - StorageProvider = GetCurrentStorageProviderFromConfig(), - Model = GetCurrentModelFromConfig(currentProvider) - }; - } - - /// - /// Get current AI provider from SmartRagOptions configuration - /// - private string GetCurrentAIProviderFromConfig() - { - // Use the configured AI provider from SmartRagOptions - return options.AIProvider.ToString(); - } - - /// - /// Get current storage provider from SmartRagOptions configuration - /// - private string GetCurrentStorageProviderFromConfig() - { - // Use the configured storage provider from SmartRagOptions - return 
options.StorageProvider.ToString(); - } - - /// - /// Get current model from configuration based on provider - /// - private string GetCurrentModelFromConfig(string provider) - { - // Dynamically build configuration key from provider name - var configKey = $"AI:{provider}:Model"; - return configuration[configKey] ?? "model-not-configured"; - } - - /// - /// Calculates diversity boost to encourage selection from different documents - /// - private static double CalculateDocumentDiversityBoost(Guid documentId, List allDocumentIds, List allChunks) - { - if (allDocumentIds.Count <= 1) return 0.0; - - // Calculate how many chunks we already have from this document - var chunksFromThisDoc = allChunks.Count(c => c.DocumentId == documentId); - var totalChunks = allChunks.Count; - - // If this document is underrepresented, give it a boost - var expectedChunksPerDoc = (double)totalChunks / allDocumentIds.Count; - var representationRatio = chunksFromThisDoc / expectedChunksPerDoc; - - // Boost underrepresented documents - if (representationRatio < 0.8) return 0.3; // 30% boost - if (representationRatio < 1.0) return 0.15; // 15% boost - - return 0.0; // No boost for overrepresented documents } } \ No newline at end of file diff --git a/src/SmartRAG/Services/EnhancedSearchService.cs b/src/SmartRAG/Services/EnhancedSearchService.cs deleted file mode 100644 index abd57a7..0000000 --- a/src/SmartRAG/Services/EnhancedSearchService.cs +++ /dev/null @@ -1,739 +0,0 @@ -using Microsoft.Extensions.Configuration; -using SmartRAG.Entities; -using SmartRAG.Enums; -using SmartRAG.Interfaces; -using SmartRAG.Models; - -namespace SmartRAG.Services; - -/// -/// Enhanced search service using configured AI provider (Anthropic) with Redis storage -/// -public class EnhancedSearchService -{ - private readonly IAIProviderFactory _aiProviderFactory; - private readonly IDocumentRepository _documentRepository; - private readonly IConfiguration _configuration; - - public EnhancedSearchService( - 
IAIProviderFactory aiProviderFactory, - IDocumentRepository documentRepository, - IConfiguration configuration) - { - _aiProviderFactory = aiProviderFactory; - _documentRepository = documentRepository; - _configuration = configuration; - } - - /// - /// Simple semantic search using configured AI provider (Anthropic) - /// - public async Task> EnhancedSemanticSearchAsync(string query, int maxResults = 5) - { - try - { - var allDocuments = await _documentRepository.GetAllAsync(); - var allChunks = allDocuments.SelectMany(d => d.Chunks).ToList(); - - Console.WriteLine($"[DEBUG] EnhancedSearchService: Searching in {allDocuments.Count} documents with {allChunks.Count} chunks"); - - // Use configured AI provider (Anthropic) - var anthropicConfig = _configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.ApiKey)) - { - Console.WriteLine($"[ERROR] Anthropic configuration not found"); - return await FallbackSearchAsync(query, maxResults); - } - - var aiProvider = _aiProviderFactory.CreateProvider(AIProvider.Anthropic); - - // Create simple search prompt - var searchPrompt = $@"You are a search assistant. Find the most relevant document chunks for this query. - -Query: {query} - -Available chunks (showing first 200 characters of each): -{string.Join("\n\n", allChunks.Select((c, i) => $"Chunk {i}: {c.Content.Substring(0, Math.Min(200, c.Content.Length))}..."))} - -Instructions: -1. Look for chunks that contain information related to the query -2. Focus on key names, dates, companies, and facts mentioned in the query -3. Return ONLY the chunk numbers (0, 1, 2, etc.) 
that are relevant, separated by commas - -Return format: 0,3,7 (chunk numbers, not IDs)"; - - // Try with retry logic for rate limiting - string aiResponse = null; - var maxRetries = 3; - var retryDelayMs = 2000; // Start with 2 seconds - - for (int attempt = 0; attempt < maxRetries; attempt++) - { - try - { - aiResponse = await aiProvider.GenerateTextAsync(searchPrompt, anthropicConfig); - break; // Success, exit retry loop - } - catch (Exception ex) when (ex.Message.Contains("TooManyRequests") || ex.Message.Contains("rate limit")) - { - if (attempt < maxRetries - 1) - { - var delay = retryDelayMs * (int)Math.Pow(2, attempt); // Exponential backoff - Console.WriteLine($"[DEBUG] EnhancedSearchService: Rate limited by Anthropic, retrying in {delay}ms (attempt {attempt + 1}/{maxRetries})"); - await Task.Delay(delay); - } - else - { - Console.WriteLine($"[DEBUG] EnhancedSearchService: Anthropic rate limited after {maxRetries} attempts, using fallback"); - throw; // Re-throw to use fallback - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService: Anthropic failed with error: {ex.Message}"); - throw; // Re-throw to use fallback - } - } - - if (!string.IsNullOrEmpty(aiResponse)) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService: AI response: {aiResponse}"); - - // Parse AI response and return relevant chunks - var parsedResults = ParseAISearchResults(aiResponse, allChunks, maxResults, query); - - if (parsedResults.Count > 0) - { - Console.WriteLine($"[DEBUG] EnhancedSearchService: Successfully parsed {parsedResults.Count} chunks"); - return parsedResults; - } - - Console.WriteLine($"[DEBUG] EnhancedSearchService: Failed to parse results, using fallback"); - } - } - catch (Exception ex) - { - Console.WriteLine($"[ERROR] EnhancedSearchService failed: {ex.Message}, using fallback"); - } - - // Fallback to basic search - return await FallbackSearchAsync(query, maxResults); - } - - /// - /// Simple RAG using configured AI provider 
(Anthropic) - /// - public async Task MultiStepRAGAsync(string query, int maxResults = 5) - { - try - { - // Check if this is a general conversation query - if (IsGeneralConversationQuery(query)) - { - Console.WriteLine($"[DEBUG] MultiStepRAGAsync: Detected general conversation query: '{query}'"); - var chatResponse = await HandleGeneralConversationAsync(query); - - return new RagResponse - { - Query = query, - Answer = chatResponse, - Sources = new List(), // No sources for chat - SearchedAt = DateTime.UtcNow, - Configuration = new RagConfiguration - { - //AIProvider = "Anthropic", - //StorageProvider = "Chat Mode", - //Model = "Claude + Chat" - } - }; - } - - // Step 1: Simple search for document-related queries - var relevantChunks = await EnhancedSemanticSearchAsync(query, maxResults); - - if (relevantChunks.Count == 0) - { - // Last resort: basic keyword search - relevantChunks = await FallbackSearchAsync(query, maxResults); - } - - // Step 2: Answer Generation using Anthropic - var answer = await GenerateAnswerWithAnthropic(query, relevantChunks); - - // Step 3: Simple Source Attribution - var sources = relevantChunks.Select(c => new SearchSource - { - DocumentId = c.DocumentId, - FileName = "Document", - RelevantContent = c.Content.Substring(0, Math.Min(200, c.Content.Length)), - RelevanceScore = c.RelevanceScore ?? 
0.0 - }).ToList(); - - return new RagResponse - { - Query = query, - Answer = answer, - Sources = sources, - SearchedAt = DateTime.UtcNow, - Configuration = new RagConfiguration - { - //AIProvider = "Anthropic", - //StorageProvider = "Redis", - //Model = "Claude + VoyageAI" - } - }; - } - catch (Exception ex) - { - throw new InvalidOperationException($"RAG failed: {ex.Message}", ex); - } - } - - /// - /// Generate answer using Anthropic - /// - private async Task GenerateAnswerWithAnthropic(string query, List context) - { - try - { - var anthropicConfig = _configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.ApiKey)) - { - throw new InvalidOperationException("Anthropic configuration not found"); - } - - var aiProvider = _aiProviderFactory.CreateProvider(AIProvider.Anthropic); - - var contextText = string.Join("\n\n---\n\n", - context.Select(c => $"[Document Chunk]\n{c.Content}")); - - var prompt = $@"You are a helpful AI assistant. Answer the user's question based on the provided context. - -Question: {query} - -Context: -{contextText} - -Instructions: -1. Answer the question comprehensively using information from the context -2. If information is missing, state it clearly -3. Provide structured, easy-to-understand response in the same language as the question -4. 
Cite specific parts of the context when possible - -Answer:"; - - // Try with retry logic for rate limiting - var maxRetries = 3; - var retryDelayMs = 2000; // Start with 2 seconds - - for (int attempt = 0; attempt < maxRetries; attempt++) - { - try - { - return await aiProvider.GenerateTextAsync(prompt, anthropicConfig); - } - catch (Exception ex) when (ex.Message.Contains("TooManyRequests") || ex.Message.Contains("rate limit")) - { - if (attempt < maxRetries - 1) - { - var delay = retryDelayMs * (int)Math.Pow(2, attempt); // Exponential backoff - Console.WriteLine($"[DEBUG] GenerateAnswerWithAnthropic: Rate limited, retrying in {delay}ms (attempt {attempt + 1}/{maxRetries})"); - await Task.Delay(delay); - } - else - { - Console.WriteLine($"[DEBUG] GenerateAnswerWithAnthropic: Rate limited after {maxRetries} attempts"); - throw; // Re-throw to use fallback - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] GenerateAnswerWithAnthropic: Failed with error: {ex.Message}"); - throw; // Re-throw to use fallback - } - } - - throw new InvalidOperationException("Unexpected error in retry loop"); - } - catch (Exception ex) - { - Console.WriteLine($"[ERROR] Failed to generate answer: {ex.Message}"); - return "Sorry, unable to generate answer. Please try again."; - } - } - - /// - /// Parse AI search results from AI provider response - /// - private static List ParseAISearchResults(string response, List allChunks, int maxResults, string query) - { - try - { - Console.WriteLine($"[DEBUG] ParseAISearchResults: Raw response: '{response}'"); - - // Try to parse chunk numbers from response - var chunkNumbers = response.Split(',') - .Select(s => s.Trim()) - .Where(s => !string.IsNullOrEmpty(s)) - .Select(s => int.TryParse(s, out var num) ? 
num : -1) - .Where(num => num >= 0 && num < allChunks.Count) - .Take(maxResults) - .ToList(); - - Console.WriteLine($"[DEBUG] ParseAISearchResults: Parsed chunk numbers: {string.Join(", ", chunkNumbers)}"); - - var results = new List(); - - foreach (var number in chunkNumbers) - { - if (number >= 0 && number < allChunks.Count) - { - var chunk = allChunks[number]; - results.Add(chunk); - Console.WriteLine($"[DEBUG] ParseAISearchResults: Found chunk {number} from document {chunk.DocumentId}"); - } - } - - if (results.Count > 0) - { - Console.WriteLine($"[DEBUG] ParseAISearchResults: Successfully parsed {results.Count} chunks"); - return results; - } - - Console.WriteLine($"[DEBUG] ParseAISearchResults: No chunks parsed"); - } - catch (Exception ex) - { - Console.WriteLine($"[WARNING] ParseAISearchResults failed: {ex.Message}"); - } - - return new List(); - } - - /// - /// Fallback search when AI search fails - /// - private async Task> FallbackSearchAsync(string query, int maxResults) - { - var allDocuments = await _documentRepository.GetAllAsync(); - var allChunks = allDocuments.SelectMany(d => d.Chunks).ToList(); - - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Searching in {allDocuments.Count} documents with {allChunks.Count} chunks"); - - // Try embedding-based search first if available - try - { - var embeddingResults = await TryEmbeddingBasedSearchAsync(query, allChunks, maxResults); - if (embeddingResults.Count > 0) - { - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Embedding search successful, found {embeddingResults.Count} chunks"); - return embeddingResults; - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Embedding search failed: {ex.Message}, using keyword search"); - } - - // Enhanced keyword-based fallback for global content - var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) - .ToList(); - - // Extract potential names from ORIGINAL query 
(not lowercase) - language agnostic - var potentialNames = query.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2 && char.IsUpper(w[0])) - .ToList(); - - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Query words: [{string.Join(", ", queryWords)}]"); - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Potential names: [{string.Join(", ", potentialNames)}]"); - - var scoredChunks = allChunks.Select(chunk => - { - var score = 0.0; - var content = chunk.Content.ToLowerInvariant(); - - // Special handling for names like "John Smith" - HIGHEST PRIORITY (language agnostic) - if (potentialNames.Count >= 2) - { - var fullName = string.Join(" ", potentialNames); - if (ContainsNormalizedName(content, fullName)) - { - score += 200.0; // Very high weight for full name matches - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Found FULL NAME match: '{fullName}' in chunk: {chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}..."); - } - else if (potentialNames.Any(name => ContainsNormalizedName(content, name))) - { - score += 100.0; // High weight for partial name matches - var foundNames = potentialNames.Where(name => ContainsNormalizedName(content, name)).ToList(); - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Found PARTIAL name matches: [{string.Join(", ", foundNames)}] in chunk: {chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}..."); - } - } - - // Exact word matches - foreach (var word in queryWords) - { - if (content.Contains(word, StringComparison.OrdinalIgnoreCase)) - score += 2.0; // Higher weight for word matches - } - - // Phrase matches (for longer queries) - var queryPhrases = query.ToLowerInvariant().Split('.', '?', '!') - .Where(p => p.Length > 5) - .ToList(); - - foreach (var phrase in queryPhrases) - { - var phraseWords = phrase.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 3) - .ToList(); - - if (phraseWords.Count >= 2) - { - var phraseText = string.Join(" ", 
phraseWords); - if (content.Contains(phraseText, StringComparison.OrdinalIgnoreCase)) - score += 10.0; // Higher weight for phrase matches - } - } - - // Penalty for very short content (global rule) - if (content.Length < 50) - score -= 20.0; - - // Generic content quality scoring (language and content agnostic) - // Score based on content structure and information density, not specific keywords - - // Bonus for chunks with good information density - var wordCount = content.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length; - var avgWordLength = content.Length / Math.Max(wordCount, 1); - - // Prefer chunks with reasonable word length and count - if (wordCount >= 10 && wordCount <= 100) score += 5.0; - if (avgWordLength >= 4.0 && avgWordLength <= 8.0) score += 3.0; - - // Bonus for chunks with punctuation (indicates structured content) - var punctuationCount = content.Count(c => ".,;:!?()[]{}".Contains(c)); - if (punctuationCount >= 3) score += 2.0; - - // Bonus for chunks with numbers (often indicates factual information) - var numberCount = content.Count(c => char.IsDigit(c)); - if (numberCount >= 2) score += 2.0; - - // Bonus for chunks with mixed case (indicates proper formatting) - var hasUpper = content.Any(c => char.IsUpper(c)); - var hasLower = content.Any(c => char.IsLower(c)); - if (hasUpper && hasLower) score += 1.0; - - chunk.RelevanceScore = score; - return chunk; - }).ToList(); - - var relevantChunks = scoredChunks - .Where(c => c.RelevanceScore > 0) - .OrderByDescending(c => c.RelevanceScore) - .Take(Math.Max(maxResults * 3, 30)) // Take more for better context - .ToList(); - - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Found {relevantChunks.Count} relevant chunks with keyword search"); - - // If we found chunks with names, prioritize them - if (potentialNames.Count >= 2) - { - var nameChunks = relevantChunks.Where(c => - potentialNames.Any(name => c.Content.Contains(name, StringComparison.OrdinalIgnoreCase))).ToList(); - - if 
(nameChunks.Count > 0) - { - Console.WriteLine($"[DEBUG] FallbackSearchAsync: Found {nameChunks.Count} chunks containing names, prioritizing them"); - return nameChunks.Take(maxResults).ToList(); - } - } - - return relevantChunks; - } - - /// - /// Try embedding-based search using VoyageAI with intelligent filtering - /// - private async Task> TryEmbeddingBasedSearchAsync(string query, List allChunks, int maxResults) - { - try - { - var anthropicConfig = _configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.EmbeddingApiKey)) - { - Console.WriteLine($"[DEBUG] Embedding search: No VoyageAI API key found"); - return new List(); - } - - var aiProvider = _aiProviderFactory.CreateProvider(AIProvider.Anthropic); - - // Generate embedding for query - var queryEmbedding = await aiProvider.GenerateEmbeddingAsync(query, anthropicConfig); - if (queryEmbedding == null || queryEmbedding.Count == 0) - { - Console.WriteLine($"[DEBUG] Embedding search: Failed to generate query embedding"); - return new List(); - } - - // Check which chunks already have embeddings (cached) - var chunksWithEmbeddings = allChunks.Where(c => c.Embedding != null && c.Embedding.Count > 0).ToList(); - var chunksWithoutEmbeddings = allChunks.Where(c => c.Embedding == null || c.Embedding.Count == 0).ToList(); - - Console.WriteLine($"[DEBUG] Embedding search: {chunksWithEmbeddings.Count} chunks already have embeddings, {chunksWithoutEmbeddings.Count} need new embeddings"); - - // Process chunks without embeddings in batches to avoid rate limiting - if (chunksWithoutEmbeddings.Count > 0) - { - var batchSize = 10; - var totalBatches = (chunksWithoutEmbeddings.Count + batchSize - 1) / batchSize; - - Console.WriteLine($"[DEBUG] Embedding search: Processing {chunksWithoutEmbeddings.Count} chunks in {totalBatches} batches of {batchSize}"); - - for (int batchIndex = 0; batchIndex < totalBatches; batchIndex++) - { - var batch = 
chunksWithoutEmbeddings.Skip(batchIndex * batchSize).Take(batchSize).ToList(); - - var batchTasks = batch.Select(async chunk => - { - try - { - var chunkEmbedding = await aiProvider.GenerateEmbeddingAsync(chunk.Content, anthropicConfig); - if (chunkEmbedding != null && chunkEmbedding.Count > 0) - { - chunk.Embedding = chunkEmbedding; - return true; - } - } - catch (Exception ex) - { - Console.WriteLine($"[DEBUG] Failed to generate embedding for chunk {chunk.Id}: {ex.Message}"); - } - return false; - }); - - var batchResults = await Task.WhenAll(batchTasks); - var successfulEmbeddings = batchResults.Count(r => r); - - Console.WriteLine($"[DEBUG] Embedding search: Batch {batchIndex + 1}/{totalBatches}: {successfulEmbeddings}/{batchSize} successful"); - - if (batchIndex < totalBatches - 1) - { - var waitTime = 1500; - Console.WriteLine($"[DEBUG] Embedding search: Waiting {waitTime}ms before next batch to respect rate limits"); - await Task.Delay(waitTime); - } - } - } - - // Calculate similarity for all chunks - var scoredChunks = allChunks.Select(chunk => - { - var similarity = 0.0; - if (chunk.Embedding != null && chunk.Embedding.Count > 0) - { - similarity = CalculateCosineSimilarity(queryEmbedding, chunk.Embedding); - } - - chunk.RelevanceScore = similarity; - return chunk; - }).ToList(); - - // INTELLIGENT FILTERING: Focus on chunks that actually contain the query terms - var queryWords = query.ToLowerInvariant().Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2) - .ToList(); - - // Extract potential names from ORIGINAL query (not lowercase) - language agnostic - var potentialNames = query.Split(' ', StringSplitOptions.RemoveEmptyEntries) - .Where(w => w.Length > 2 && char.IsUpper(w[0])) - .ToList(); - - Console.WriteLine($"[DEBUG] Embedding search: Query words: [{string.Join(", ", queryWords)}]"); - Console.WriteLine($"[DEBUG] Embedding search: Potential names: [{string.Join(", ", potentialNames)}]"); - - // Filter chunks that actually 
contain query terms - var relevantChunks = scoredChunks.Where(chunk => - { - var content = chunk.Content.ToLowerInvariant(); - - // Must contain at least one query word - var hasQueryWord = queryWords.Any(word => content.Contains(word, StringComparison.OrdinalIgnoreCase)); - - // If query has names, prioritize chunks with names - if (potentialNames.Count >= 2) - { - var fullName = string.Join(" ", potentialNames); - var hasFullName = ContainsNormalizedName(content, fullName); - var hasPartialName = potentialNames.Any(name => ContainsNormalizedName(content, name)); - - if (hasFullName || hasPartialName) - { - Console.WriteLine($"[DEBUG] Embedding search: Found name match in chunk: {chunk.Content.Substring(0, Math.Min(100, chunk.Content.Length))}..."); - } - - return hasQueryWord && (hasFullName || hasPartialName); - } - - return hasQueryWord; - }).ToList(); - - Console.WriteLine($"[DEBUG] Embedding search: Found {relevantChunks.Count} chunks containing query terms"); - - if (relevantChunks.Count == 0) - { - Console.WriteLine($"[DEBUG] Embedding search: No chunks contain query terms, using similarity only"); - relevantChunks = scoredChunks.Where(c => c.RelevanceScore > 0.01).ToList(); - } - - // Sort by relevance score and take top results - var topChunks = relevantChunks - .OrderByDescending(c => c.RelevanceScore) - .Take(Math.Max(maxResults * 2, 20)) - .ToList(); - - Console.WriteLine($"[DEBUG] Embedding search: Selected {topChunks.Count} most relevant chunks"); - - // Debug: Show what we actually found - foreach (var chunk in topChunks.Take(5)) - { - Console.WriteLine($"[DEBUG] Top chunk content: {chunk.Content.Substring(0, Math.Min(150, chunk.Content.Length))}..."); - } - - return topChunks; - } - catch (Exception ex) - { - Console.WriteLine($"[ERROR] Embedding search failed: {ex.Message}"); - return new List(); - } - } - - /// - /// Calculate cosine similarity between two vectors - /// - private static double CalculateCosineSimilarity(List a, List b) - { - if (a 
== null || b == null || a.Count == 0 || b.Count == 0) return 0.0; - - var n = Math.Min(a.Count, b.Count); - double dot = 0, na = 0, nb = 0; - - for (int i = 0; i < n; i++) - { - double va = a[i]; - double vb = b[i]; - dot += va * vb; - na += va * va; - nb += vb * vb; - } - - if (na == 0 || nb == 0) return 0.0; - return dot / (Math.Sqrt(na) * Math.Sqrt(nb)); - } - - /// - /// Normalize text for better search matching (handles Unicode encoding issues) - /// - private static string NormalizeText(string text) - { - if (string.IsNullOrEmpty(text)) return text; - - // Decode Unicode escape sequences - var decoded = System.Text.RegularExpressions.Regex.Unescape(text); - - // Normalize Unicode characters - var normalized = decoded.Normalize(System.Text.NormalizationForm.FormC); - - // Handle common Turkish character variations - var turkishMappings = new Dictionary - { - {"ı", "i"}, {"İ", "I"}, {"ğ", "g"}, {"Ğ", "G"}, - {"ü", "u"}, {"Ü", "U"}, {"ş", "s"}, {"Ş", "S"}, - {"ö", "o"}, {"Ö", "O"}, {"ç", "c"}, {"Ç", "C"} - }; - - foreach (var mapping in turkishMappings) - { - normalized = normalized.Replace(mapping.Key, mapping.Value); - } - - return normalized; - } - - /// - /// Check if content contains normalized name (handles encoding issues) - /// - private static bool ContainsNormalizedName(string content, string searchName) - { - if (string.IsNullOrEmpty(content) || string.IsNullOrEmpty(searchName)) - return false; - - var normalizedContent = NormalizeText(content); - var normalizedSearchName = NormalizeText(searchName); - - // Try exact match first - if (normalizedContent.Contains(normalizedSearchName, StringComparison.OrdinalIgnoreCase)) - return true; - - // Try partial matches for each word - var searchWords = normalizedSearchName.Split(' ', StringSplitOptions.RemoveEmptyEntries); - var contentWords = normalizedContent.Split(' ', StringSplitOptions.RemoveEmptyEntries); - - // Check if all search words are present in content - return searchWords.All(searchWord => - 
contentWords.Any(contentWord => - contentWord.Contains(searchWord, StringComparison.OrdinalIgnoreCase))); - } - - /// - /// Check if query is a general conversation question (not document search) - /// - private static bool IsGeneralConversationQuery(string query) - { - if (string.IsNullOrWhiteSpace(query)) return false; - - // Simple detection: if query has document-like structure, it's document search - // Otherwise, it's general conversation - - var hasDocumentStructure = query.Any(char.IsDigit) || - query.Contains(":") || - query.Contains("/") || - query.Contains("-") || - query.Length > 50; // Very long queries are usually document searches - - // If it has document structure, it's document search - // If not, it's general conversation - return !hasDocumentStructure; - } - - /// - /// Handle general conversation queries - /// - private async Task HandleGeneralConversationAsync(string query) - { - try - { - var anthropicConfig = _configuration.GetSection("AI:Anthropic").Get(); - if (anthropicConfig == null || string.IsNullOrEmpty(anthropicConfig.ApiKey)) - { - return "Sorry, I cannot chat right now. Please try again later."; - } - - var aiProvider = _aiProviderFactory.CreateProvider(AIProvider.Anthropic); - - var prompt = $@"You are a helpful AI assistant. Answer the user's question naturally and friendly. - -User: {query} - -Answer:"; - - return await aiProvider.GenerateTextAsync(prompt, anthropicConfig); - } - catch (Exception ex) - { - Console.WriteLine($"[ERROR] General conversation failed: {ex.Message}"); - return "Sorry, I cannot chat right now. 
Please try again later."; - } - } -} diff --git a/src/SmartRAG/Services/Logging/ServiceLogMessages.cs b/src/SmartRAG/Services/Logging/ServiceLogMessages.cs new file mode 100644 index 0000000..f9902eb --- /dev/null +++ b/src/SmartRAG/Services/Logging/ServiceLogMessages.cs @@ -0,0 +1,360 @@ +using Microsoft.Extensions.Logging; + +namespace SmartRAG.Services.Logging; + +/// +/// Centralized LoggerMessage delegates for performance optimization +/// +public static class ServiceLogMessages +{ + #region Document Operations + + public static readonly Action LogDocumentUploaded = LoggerMessage.Define( + LogLevel.Information, + new EventId(1001, "DocumentUploaded"), + "Document uploaded successfully: {FileName}"); + + public static readonly Action LogDocumentUploadFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(1002, "DocumentUploadFailed"), + "Failed to upload document: {FileName}"); + + public static readonly Action LogDocumentDeleted = LoggerMessage.Define( + LogLevel.Information, + new EventId(1003, "DocumentDeleted"), + "Document deleted: {FileName}"); + + #endregion + + #region Embedding Operations + + public static readonly Action LogChunkEmbeddingSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2001, "ChunkEmbeddingSuccess"), + "Chunk {Index}: Embedding generated ({Dimensions} dimensions)"); + + public static readonly Action LogChunkEmbeddingFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(2002, "ChunkEmbeddingFailed"), + "Chunk {Index}: Failed to generate embedding"); + + public static readonly Action LogChunkProcessingFailed = LoggerMessage.Define( + LogLevel.Error, + new EventId(2003, "ChunkProcessingFailed"), + "Chunk {Index}: Failed to process"); + + public static readonly Action LogChunkBatchEmbeddingSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2004, "ChunkBatchEmbeddingSuccess"), + "Chunk {Index}: Batch embedding successful ({Dimensions} dimensions)"); + + public static readonly Action 
LogChunkBatchEmbeddingFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2005, "ChunkBatchEmbeddingFailed"), + "Chunk {Index}: Batch embedding failed, trying individual generation"); + + public static readonly Action LogChunkIndividualEmbeddingSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(2006, "ChunkIndividualEmbeddingSuccess"), + "Chunk {Index}: Individual embedding successful ({Dimensions} dimensions)"); + + public static readonly Action LogEmbeddingRegenerationStarted = LoggerMessage.Define( + LogLevel.Information, + new EventId(2007, "EmbeddingRegenerationStarted"), + "Starting embedding regeneration for all documents"); + + public static readonly Action LogEmbeddingRegenerationCompleted = LoggerMessage.Define( + LogLevel.Information, + new EventId(2008, "EmbeddingRegenerationCompleted"), + "Embedding regeneration completed: {SuccessCount}/{TotalCount} chunks"); + + public static readonly Action LogEmbeddingRegenerationFailed = LoggerMessage.Define( + LogLevel.Error, + new EventId(2009, "EmbeddingRegenerationFailed"), + "Failed to regenerate embeddings"); + + #endregion + + #region Search Operations + + public static readonly Action LogSearchResults = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3001, "SearchResults"), + "Search returned {ChunkCount} chunks from {DocumentCount} documents"); + + public static readonly Action LogDiverseResults = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3002, "DiverseResults"), + "Final diverse results: {ResultCount} chunks from {DocumentCount} documents"); + + public static readonly Action LogGeneralConversationQuery = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3003, "GeneralConversationQuery"), + "Detected general conversation query, handling without document search"); + + public static readonly Action LogSearchInDocuments = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3004, "SearchInDocuments"), + "Searching in {DocumentCount} documents with {ChunkCount} 
chunks"); + + public static readonly Action LogEmbeddingSearchSuccessful = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3005, "EmbeddingSearchSuccessful"), + "Embedding search successful, found {ChunkCount} chunks"); + + public static readonly Action LogEmbeddingSearchFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3006, "EmbeddingSearchFailed"), + "Embedding search failed, using keyword search"); + + public static readonly Action LogQueryWords = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3007, "QueryWords"), + "Query words: [{QueryWords}]"); + + public static readonly Action LogPotentialNames = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3008, "PotentialNames"), + "Potential names: [{PotentialNames}]"); + + public static readonly Action LogFullNameMatch = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3009, "FullNameMatch"), + "Found FULL NAME match: '{FullName}' in chunk: {ChunkPreview}..."); + + public static readonly Action LogPartialNameMatches = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3010, "PartialNameMatches"), + "Found PARTIAL name matches: [{FoundNames}] in chunk: {ChunkPreview}..."); + + public static readonly Action LogRelevantChunksFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3011, "RelevantChunksFound"), + "Found {ChunkCount} relevant chunks with enhanced search"); + + public static readonly Action LogNameChunksFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3012, "NameChunksFound"), + "Found {NameChunkCount} chunks containing names, prioritizing them"); + + public static readonly Action LogNoVoyageAIKey = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3013, "NoVoyageAIKey"), + "Embedding search: No VoyageAI API key found"); + + public static readonly Action LogFailedQueryEmbedding = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3014, "FailedQueryEmbedding"), + "Embedding search: Failed to generate query embedding"); + + public static 
readonly Action LogChunksContainingQueryTerms = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3015, "ChunksContainingQueryTerms"), + "Embedding search: Found {ChunkCount} chunks containing query terms"); + + public static readonly Action LogNoChunksContainQueryTerms = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3016, "NoChunksContainQueryTerms"), + "Embedding search: No chunks contain query terms, using similarity only"); + + public static readonly Action LogEmbeddingSearchFailedError = LoggerMessage.Define( + LogLevel.Error, + new EventId(3017, "EmbeddingSearchFailedError"), + "Embedding search failed"); + + public static readonly Action LogRateLimitedRetry = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3018, "RateLimitedRetry"), + "Embedding generation rate limited, retrying in {Delay}ms (attempt {Attempt}/{MaxRetries})"); + + public static readonly Action LogRateLimitedAfterAttempts = LoggerMessage.Define( + LogLevel.Debug, + new EventId(3019, "RateLimitedAfterAttempts"), + "Embedding generation rate limited after {MaxRetries} attempts"); + + #endregion + + #region AI Provider Operations + + public static readonly Action LogPrimaryAIServiceAttempt = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4001, "PrimaryAIServiceAttempt"), + "Trying primary AI service for embedding generation"); + + public static readonly Action LogPrimaryAIServiceSuccess = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4002, "PrimaryAIServiceSuccess"), + "Primary AI service successful: {Dimensions} dimensions"); + + public static readonly Action LogPrimaryAIServiceNull = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4003, "PrimaryAIServiceNull"), + "Primary AI service returned null or empty embedding"); + + public static readonly Action LogPrimaryAIServiceFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4004, "PrimaryAIServiceFailed"), + "Primary AI service failed"); + + public static readonly Action LogProviderAttempt = 
LoggerMessage.Define( + LogLevel.Debug, + new EventId(4005, "ProviderAttempt"), + "Trying {Provider} provider for embedding generation"); + + public static readonly Action LogProviderSuccessful = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4006, "ProviderSuccessful"), + "{Provider} successful: {Dimensions} dimensions"); + + public static readonly Action LogProviderFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4007, "ProviderFailed"), + "{Provider} provider failed"); + + public static readonly Action LogAllProvidersFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(4008, "AllProvidersFailed"), + "All embedding providers failed"); + + public static readonly Action LogProviderConfigFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4009, "ProviderConfigFound"), + "{Provider} config found, API key: {ApiKeyPreview}..."); + + public static readonly Action LogProviderReturnedNull = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4010, "ProviderReturnedNull"), + "{Provider} returned null or empty embedding"); + + public static readonly Action LogProviderConfigNotFound = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4011, "ProviderConfigNotFound"), + "{Provider} config not found or API key missing"); + + public static readonly Action LogAllProvidersFailedText = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4012, "AllProvidersFailedText"), + "All embedding providers failed for text: {TextPreview}..."); + + public static readonly Action LogTestingVoyageAI = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4013, "TestingVoyageAI"), + "Testing VoyageAI directly with key: {ApiKeyPreview}..."); + + public static readonly Action LogVoyageAITestResponse = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4014, "VoyageAITestResponse"), + "VoyageAI test response: {StatusCode} - {Response}"); + + public static readonly Action LogVoyageAIWorking = LoggerMessage.Define( + LogLevel.Debug, + new 
EventId(4015, "VoyageAIWorking"), + "VoyageAI is working! Trying to parse embedding..."); + + public static readonly Action LogVoyageAITestEmbedding = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4016, "VoyageAITestEmbedding"), + "VoyageAI test embedding generated: {Dimensions} dimensions"); + + public static readonly Action LogFailedParseVoyageAI = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4017, "FailedParseVoyageAI"), + "Failed to parse VoyageAI response"); + + public static readonly Action LogVoyageAIDirectTestFailed = LoggerMessage.Define( + LogLevel.Debug, + new EventId(4018, "VoyageAIDirectTestFailed"), + "VoyageAI direct test failed"); + + #endregion + + #region Batch Operations + + public static readonly Action LogBatchProcessing = LoggerMessage.Define( + LogLevel.Information, + new EventId(5001, "BatchProcessing"), + "Processing {BatchSize} chunks in batch"); + + public static readonly Action LogBatchCompleted = LoggerMessage.Define( + LogLevel.Information, + new EventId(5002, "BatchCompleted"), + "Batch completed: {ProcessedCount} chunks processed"); + + public static readonly Action LogBatchProgress = LoggerMessage.Define( + LogLevel.Information, + new EventId(5003, "BatchProgress"), + "Processing batch {BatchNumber}/{TotalBatches}"); + + public static readonly Action LogBatchFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(5004, "BatchFailed"), + "Batch {BatchNumber} failed, processing individually"); + + #endregion + + #region Progress and Status + + public static readonly Action LogProgress = LoggerMessage.Define( + LogLevel.Information, + new EventId(6001, "Progress"), + "Progress: {ProcessedChunks}/{TotalChunks} chunks processed, {SuccessCount} embeddings generated"); + + public static readonly Action LogSavingDocuments = LoggerMessage.Define( + LogLevel.Information, + new EventId(6002, "SavingDocuments"), + "Saving {DocumentCount} documents with updated embeddings"); + + public static readonly Action 
LogTotalChunksToProcess = LoggerMessage.Define( + LogLevel.Information, + new EventId(6003, "TotalChunksToProcess"), + "Total chunks to process: {ProcessCount} out of {TotalChunks}"); + + public static readonly Action LogNoProcessingNeeded = LoggerMessage.Define( + LogLevel.Information, + new EventId(6004, "NoProcessingNeeded"), + "All chunks already have valid embeddings. No processing needed."); + + public static readonly Action LogIndividualEmbeddingGeneration = LoggerMessage.Define( + LogLevel.Debug, + new EventId(6005, "IndividualEmbeddingGeneration"), + "Generating individual embeddings for {TextCount} texts"); + + public static readonly Action LogDocumentProcessing = LoggerMessage.Define( + LogLevel.Information, + new EventId(6006, "DocumentProcessing"), + "Document: {FileName} ({ChunkCount} chunks)"); + + + + public static readonly Action LogChunkBatchEmbeddingFailedRetry = LoggerMessage.Define( + LogLevel.Warning, + new EventId(6008, "ChunkBatchEmbeddingFailedRetry"), + "Chunk {ChunkId}: Batch embedding failed, trying individual generation"); + + public static readonly Action LogChunkIndividualEmbeddingSuccessRetry = LoggerMessage.Define( + LogLevel.Debug, + new EventId(6009, "ChunkIndividualEmbeddingSuccessRetry"), + "Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)"); + + public static readonly Action LogChunkAllEmbeddingMethodsFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(6010, "ChunkAllEmbeddingMethodsFailed"), + "Chunk {ChunkId}: All embedding methods failed"); + + public static readonly Action LogChunkIndividualEmbeddingSuccessFinal = LoggerMessage.Define( + LogLevel.Debug, + new EventId(6011, "ChunkIndividualEmbeddingSuccessFinal"), + "Chunk {ChunkId}: Individual embedding successful ({Dimensions} dimensions)"); + + public static readonly Action LogChunkEmbeddingGenerationFailed = LoggerMessage.Define( + LogLevel.Warning, + new EventId(6012, "ChunkEmbeddingGenerationFailed"), + "Chunk {ChunkId}: Failed to 
generate embedding"); + + public static readonly Action LogChunkEmbeddingRegenerationFailed = LoggerMessage.Define( + LogLevel.Error, + new EventId(6013, "ChunkEmbeddingRegenerationFailed"), + "Chunk {ChunkId}: Failed to regenerate embedding"); + + #endregion +}