Skip to content

Commit 185025b

Browse files
committed
2 parents 94be1fb + 7b7ed53 commit 185025b

File tree

5 files changed

+231
-14
lines changed

5 files changed

+231
-14
lines changed

README.md

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ SmartRAG is a **production-ready** .NET 9.0 library that provides a complete **R
1818
- 🧠 **Advanced RAG Pipeline**: Smart chunking, semantic retrieval, AI-powered answer generation
1919
- **Lightning Fast**: Optimized vector search with context-aware answer synthesis
2020
- 🔌 **Plug & Play**: Single-line integration with dependency injection
21-
- 📄 **Multi-Format**: PDF, Word, text files with intelligent parsing
21+
- 📄 **Multi-Format**: PDF, Word, Excel, text files with intelligent parsing
2222
- 🎯 **Enhanced Semantic Search**: Advanced hybrid scoring with 80% semantic + 20% keyword relevance
2323
- 🔍 **Smart Document Chunking**: Word boundary validation and optimal break points for context preservation
2424
- **Enterprise Grade**: Zero Warnings Policy, SOLID principles, comprehensive logging, XML documentation
@@ -38,7 +38,7 @@ SmartRAG is a **production-ready** .NET 9.0 library that provides a complete **R
3838
- **Language-Agnostic Design**: No hardcoded language patterns - works globally with any language
3939
- **Multiple Storage Options**: From in-memory to enterprise vector databases
4040
- **AI Provider Flexibility**: Switch between providers without code changes
41-
- **Document Intelligence**: Advanced parsing for PDF, Word, and text formats
41+
- **Document Intelligence**: Advanced parsing for PDF, Word, Excel, and text formats
4242
- **Configuration-First**: Environment-based configuration with sensible defaults
4343
- **Dependency Injection**: Full DI container integration
4444
- **Enhanced Semantic Search**: Advanced hybrid scoring combining semantic similarity and keyword relevance
@@ -134,6 +134,42 @@ dotnet add package SmartRAG
134134
<PackageReference Include="SmartRAG" Version="1.1.0" />
135135
```
136136

137+
## 📄 Supported Document Formats
138+
139+
SmartRAG supports a wide range of document formats with intelligent parsing and text extraction:
140+
141+
### **📊 Excel Files (.xlsx, .xls)**
142+
- **Advanced Parsing**: Extracts text from all worksheets and cells
143+
- **Structured Data**: Preserves table structure with tab-separated values
144+
- **Worksheet Names**: Includes worksheet names for context
145+
- **Cell Content**: Extracts all non-empty cell values
146+
- **Format Preservation**: Maintains data organization for better context
147+
148+
### **📝 Word Documents (.docx, .doc)**
149+
- **Rich Text Extraction**: Preserves formatting and structure
150+
- **Table Support**: Extracts content from tables and lists
151+
- **Paragraph Handling**: Maintains paragraph breaks and flow
152+
- **Metadata Preservation**: Keeps document structure intact
153+
154+
### **📋 PDF Documents (.pdf)**
155+
- **Multi-Page Support**: Processes all pages with text extraction
156+
- **Layout Preservation**: Maintains document structure and flow
157+
- **Text Quality**: High-quality text extraction for analysis
158+
- **Page Separation**: Clear page boundaries for context
159+
160+
### **📄 Text Files (.txt, .md, .json, .xml, .csv, .html, .htm)**
161+
- **Universal Support**: Handles all text-based formats
162+
- **Encoding Detection**: Automatic detection of UTF-8 and other text encodings
163+
- **Structure Preservation**: Maintains original formatting
164+
- **Fast Processing**: Optimized for text-based content
165+
166+
### **🔍 Content Type Support**
167+
SmartRAG automatically detects file types using both file extensions and MIME content types:
168+
- **Excel**: `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`, `application/vnd.ms-excel`
169+
- **Word**: `application/vnd.openxmlformats-officedocument.wordprocessingml.document`, `application/msword`
170+
- **PDF**: `application/pdf`
171+
- **Text**: `text/*`, `application/json`, `application/xml`, `application/csv`
172+
137173
## 🚀 Quick Start
138174

139175
### 1. **Development Setup**

src/SmartRAG/Providers/AnthropicProvider.cs

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,15 @@ public override async Task<List<List<float>>> GenerateEmbeddingsBatchAsync(IEnum
144144
if (inputList.Count == 0)
145145
return [];
146146

147+
// Filter out empty or null strings to prevent Voyage AI API errors
148+
var validInputs = inputList.Where(text => !string.IsNullOrWhiteSpace(text)).ToList();
149+
150+
if (validInputs.Count == 0)
151+
{
152+
ProviderLogMessages.LogAnthropicEmbeddingValidationError(Logger, "All input texts are empty or null", null);
153+
return Enumerable.Range(0, inputList.Count).Select(_ => new List<float>()).ToList();
154+
}
155+
147156
var voyageApiKey = config.EmbeddingApiKey ?? config.ApiKey;
148157
var voyageModel = config.EmbeddingModel ?? DefaultVoyageModel;
149158

@@ -156,7 +165,7 @@ public override async Task<List<List<float>>> GenerateEmbeddingsBatchAsync(IEnum
156165

157166
var payload = new
158167
{
159-
input = inputList.ToArray(),
168+
input = validInputs.ToArray(),
160169
model = voyageModel,
161170
input_type = VoyageInputType
162171
};
@@ -173,7 +182,8 @@ public override async Task<List<List<float>>> GenerateEmbeddingsBatchAsync(IEnum
173182

174183
try
175184
{
176-
return ParseVoyageBatchEmbeddingResponse(response, inputList.Count);
185+
var validEmbeddings = ParseVoyageBatchEmbeddingResponse(response, validInputs.Count);
186+
return MapEmbeddingsToOriginalInputs(validEmbeddings, inputList, validInputs);
177187
}
178188
catch (Exception ex)
179189
{
@@ -295,5 +305,39 @@ private static List<List<float>> ParseVoyageBatchEmbeddingResponse(string respon
295305
.ToList();
296306
}
297307

308+
/// <summary>
/// Re-aligns the embeddings computed for the non-blank inputs with the full original input list.
/// </summary>
/// <param name="validEmbeddings">Embeddings for the non-blank inputs, in the order those inputs were sent.</param>
/// <param name="originalInputs">Every input text as originally supplied, blank entries included.</param>
/// <param name="validInputs">The non-blank subset of the inputs; this method does not read it.</param>
/// <returns>
/// One entry per original input, in the original order. Blank inputs, and non-blank inputs for which
/// no embedding is left, are represented by an empty list.
/// </returns>
private static List<List<float>> MapEmbeddingsToOriginalInputs(List<List<float>> validEmbeddings, List<string> originalInputs, List<string> validInputs)
{
    var mapped = new List<List<float>>();
    var cursor = 0;

    foreach (var text in originalInputs)
    {
        // A slot gets a real embedding only when its text is non-blank and an embedding is still
        // available; the cursor advances only in that case so later inputs stay aligned.
        var hasEmbedding = !string.IsNullOrWhiteSpace(text) && cursor < validEmbeddings.Count;

        mapped.Add(hasEmbedding ? validEmbeddings[cursor++] : new List<float>());
    }

    return mapped;
}
341+
298342
#endregion
299343
}

src/SmartRAG/Providers/BaseAIProvider.cs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,9 @@ protected static HttpClient CreateHttpClientWithoutAuth(Dictionary<string, strin
210210
if (success)
211211
return (true, response, string.Empty);
212212

213-
// Handle rate limiting
214-
if (error.Contains("429") || error.Contains("TooManyRequests"))
213+
// Handle rate limiting and server overload
214+
if (error.Contains("429") || error.Contains("TooManyRequests") ||
215+
error.Contains("529") || error.Contains("Overloaded"))
215216
{
216217
attempt++;
217218
if (attempt < maxRetries)
@@ -299,11 +300,12 @@ private static void AddAdditionalHeaders(HttpClient client, Dictionary<string, s
299300
}
300301

301302
/// <summary>
302-
/// Calculates retry delay for rate limiting
303+
/// Calculates retry delay for rate limiting and server overload
303304
/// </summary>
304305
private static int CalculateRetryDelay(int attempt)
305306
{
306-
return MinRetryDelayMs;
307+
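// Linear backoff, not exponential: the delay is MinRetryDelayMs multiplied by the attempt number. NOTE(review): the earlier wording said the delay starts from 60 seconds; confirm against MinRetryDelayMs, which is defined outside this hunk.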
308+
return MinRetryDelayMs * attempt;
307309
}
308310

309311
/// <summary>

src/SmartRAG/Services/DocumentParserService.cs

Lines changed: 140 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
using SmartRAG.Services;
1313
using System.Text;
1414
using System.Text.RegularExpressions;
15+
using OfficeOpenXml;
1516

1617
namespace SmartRAG.Services;
1718

@@ -22,11 +23,19 @@ public class DocumentParserService(
2223
IOptions<SmartRagOptions> options,
2324
ILogger<DocumentParserService> logger) : IDocumentParserService
2425
{
26+
/// <summary>
/// Type initializer that registers the EPPlus license a single time for the whole application.
/// </summary>
/// <remarks>
/// NOTE(review): this registers an EPPlus 8+ non-commercial organization license under the name
/// "SmartRAG" for every process that loads this type; confirm that this matches how consumers of
/// the library are licensed to use EPPlus.
/// </remarks>
static DocumentParserService() =>
    ExcelPackage.License.SetNonCommercialOrganization("SmartRAG");
2534
#region Constants
2635

2736
// Content validation constants
28-
private const int MinContentLength = 10;
29-
private const double MinMeaningfulTextRatio = 0.3;
37+
private const int MinContentLength = 5; // Reduced for Excel files
38+
private const double MinMeaningfulTextRatio = 0.1; // Reduced for Excel files
3039

3140
// Chunk boundary search constants
3241
private const int DefaultDynamicSearchRange = 500;
@@ -36,6 +45,7 @@ public class DocumentParserService(
3645
// File extension constants
3746
private static readonly string[] WordExtensions = [".docx", ".doc"];
3847
private static readonly string[] PdfExtensions = [".pdf"];
48+
private static readonly string[] ExcelExtensions = [".xlsx", ".xls"];
3949
private static readonly string[] TextExtensions = [".txt", ".md", ".json", ".xml", ".csv", ".html", ".htm"];
4050

4151
// Content type constants
@@ -45,6 +55,12 @@ public class DocumentParserService(
4555
"application/vnd.ms-word"
4656
];
4757

58+
private static readonly string[] ExcelContentTypes = [
59+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
60+
"application/vnd.ms-excel",
61+
"application/vnd.ms-excel.sheet.macroEnabled.12"
62+
];
63+
4864
private static readonly string[] TextContentTypes = ["text/", "application/json", "application/xml", "application/csv"];
4965

5066
// Sentence ending constants
@@ -91,12 +107,12 @@ public class DocumentParserService(
91107
/// <summary>
92108
/// Gets supported file types
93109
/// </summary>
94-
public IEnumerable<string> GetSupportedFileTypes() => TextExtensions.Concat(WordExtensions).Concat(PdfExtensions);
110+
public IEnumerable<string> GetSupportedFileTypes() => TextExtensions.Concat(WordExtensions).Concat(PdfExtensions).Concat(ExcelExtensions);
95111

96112
/// <summary>
97113
/// Gets supported content types
98114
/// </summary>
99-
public IEnumerable<string> GetSupportedContentTypes() => TextContentTypes.Concat(WordContentTypes).Append("application/pdf");
115+
public IEnumerable<string> GetSupportedContentTypes() => TextContentTypes.Concat(WordContentTypes).Append("application/pdf").Concat(ExcelContentTypes);
100116

101117
#endregion
102118

@@ -153,6 +169,15 @@ private static bool IsPdfDocument(string fileName, string contentType)
153169
contentType == "application/pdf";
154170
}
155171

172+
/// <summary>
/// Checks if file is an Excel document, judged by file extension or by MIME content type.
/// </summary>
/// <param name="fileName">File name whose ending is compared against the known Excel extensions.</param>
/// <param name="contentType">MIME content type reported for the file; may be null or empty.</param>
/// <returns>True when either the extension or the content type identifies an Excel workbook.</returns>
private static bool IsExcelDocument(string fileName, string contentType)
{
    if (ExcelExtensions.Any(ext => fileName.EndsWith(ext, StringComparison.OrdinalIgnoreCase)))
    {
        return true;
    }

    // A missing content type simply means "not identified by MIME type"; it must not throw,
    // because the extension check above is an equally valid way to recognise the file.
    if (string.IsNullOrEmpty(contentType))
    {
        return false;
    }

    // Media type names are case-insensitive, so compare without regard to case. Contains (rather
    // than equality) is kept so values carrying parameters such as "; charset=..." still match.
    return ExcelContentTypes.Any(ct => contentType.Contains(ct, StringComparison.OrdinalIgnoreCase));
}
180+
156181
/// <summary>
157182
/// Checks if file is text-based
158183
/// </summary>
@@ -175,6 +200,10 @@ private static async Task<string> ExtractTextAsync(Stream fileStream, string fil
175200
{
176201
return await ParsePdfDocumentAsync(fileStream);
177202
}
203+
else if (IsExcelDocument(fileName, contentType))
204+
{
205+
return await ParseExcelDocumentAsync(fileStream);
206+
}
178207
else if (IsTextBasedFile(fileName, contentType))
179208
{
180209
return await ParseTextDocumentAsync(fileStream);
@@ -213,6 +242,104 @@ private static async Task<string> ParseWordDocumentAsync(Stream fileStream)
213242
}
214243
}
215244

245+
/// <summary>
/// Parses Excel document and extracts text content
/// </summary>
/// <param name="fileStream">Stream holding the workbook; it is copied to memory before EPPlus opens it.</param>
/// <returns>
/// The cleaned text: a "Worksheet: name" heading per sheet followed by its non-blank rows with
/// tab-separated cells. A fixed diagnostic message is returned instead when the workbook has no
/// worksheets, yields no text after cleaning, or cannot be parsed.
/// </returns>
/// <remarks>
/// NOTE(review): EPPlus is documented as reading Open XML workbooks; confirm what happens for legacy
/// binary .xls input, which this service also routes here.
/// </remarks>
private static async Task<string> ParseExcelDocumentAsync(Stream fileStream)
{
    try
    {
        // Dispose the in-memory copy once parsing is finished
        using var memoryStream = await CreateMemoryStreamCopy(fileStream);

        // EPPlus license already set in static constructor
        using var package = new ExcelPackage(memoryStream);

        // Check if workbook has any worksheets
        if (package.Workbook.Worksheets.Count == 0)
        {
            return "Excel file contains no worksheets";
        }

        var textBuilder = new StringBuilder();

        foreach (var worksheet in package.Workbook.Worksheets)
        {
            AppendExcelWorksheetText(textBuilder, worksheet);
        }

        var cleanedContent = CleanContent(textBuilder.ToString());

        // If content is still empty after cleaning, return a fallback message
        return string.IsNullOrWhiteSpace(cleanedContent)
            ? "Excel file processed but no text content extracted"
            : cleanedContent;
    }
    catch (Exception ex)
    {
        // Return error message instead of empty string for debugging.
        // NOTE(review): the message is returned as if it were document text, so the caller cannot
        // tell a parse failure from real content; confirm this is the intended contract.
        return $"Error parsing Excel file: {ex.Message}";
    }
}

/// <summary>
/// Appends one worksheet's heading and its non-blank rows (tab-separated cells) to the output.
/// </summary>
private static void AppendExcelWorksheetText(StringBuilder textBuilder, ExcelWorksheet worksheet)
{
    var dimension = worksheet.Dimension;

    // Dimension is null when the worksheet has no cells at all
    if (dimension is null)
    {
        textBuilder.Append("Worksheet: ").Append(worksheet.Name).AppendLine(" (empty)");
        return;
    }

    textBuilder.Append("Worksheet: ").AppendLine(worksheet.Name);

    // Walk the used range by its actual bounds. Dimension.Rows / Dimension.Columns are the SIZE of
    // the used range, so looping 1..Rows skips trailing rows/columns whenever the data does not
    // start at cell A1.
    var firstRow = dimension.Start.Row;
    var lastRow = dimension.End.Row;
    var firstCol = dimension.Start.Column;
    var lastCol = dimension.End.Column;

    var hasData = false;
    var rowBuilder = new StringBuilder();

    for (int row = firstRow; row <= lastRow; row++)
    {
        rowBuilder.Clear();
        var rowHasData = false;

        for (int col = firstCol; col <= lastCol; col++)
        {
            var cellText = worksheet.Cells[row, col].Value?.ToString();

            if (string.IsNullOrWhiteSpace(cellText))
            {
                // Null or blank cell gets a space so later cells keep their column position
                rowBuilder.Append(' ');
            }
            else
            {
                rowBuilder.Append(cellText);
                rowHasData = true;
            }

            if (col < lastCol)
            {
                rowBuilder.Append('\t');
            }
        }

        // Rows made up only of blank cells are left out of the output
        if (rowHasData)
        {
            textBuilder.AppendLine(rowBuilder.ToString());
            hasData = true;
        }
    }

    if (!hasData)
    {
        textBuilder.AppendLine("Worksheet contains no data");
    }

    // Blank line between worksheets
    textBuilder.AppendLine();
}
342+
216343
/// <summary>
217344
/// Creates a memory stream copy for processing
218345
/// </summary>
@@ -369,7 +496,15 @@ private static bool IsContentValid(string content)
369496
return false;
370497
}
371498

499+
// For Excel files, be more lenient with content validation
372500
var meaningfulTextRatio = content.Count(c => char.IsLetterOrDigit(c)) / (double)content.Length;
501+
502+
// If content contains worksheet markers, it's likely valid Excel content
503+
if (content.Contains("Worksheet:") || content.Contains("Excel file"))
504+
{
505+
return true;
506+
}
507+
373508
return meaningfulTextRatio >= MinMeaningfulTextRatio;
374509
}
375510

@@ -747,3 +882,4 @@ private static int CalculateNextStartPosition(string content, int currentStart,
747882

748883
#endregion
749884
}
885+

src/SmartRAG/Services/DocumentService.cs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
using SmartRAG.Entities;
44
using SmartRAG.Interfaces;
55
using SmartRAG.Models;
6-
using SmartRAG.Services;
76
using System.Globalization;
87
using System.Text;
98

@@ -30,7 +29,7 @@ public class DocumentService(
3029
private const string NoFileNamesMessage = "No file names provided";
3130
private const string NoContentTypesMessage = "No content types provided";
3231
private const string MismatchedCountsMessage = "Number of file streams, names, and content types must match";
33-
32+
3433
// CompositeFormat for repeated formatting (CA1863)
3534
private static readonly CompositeFormat UnsupportedFileTypeFormat = CompositeFormat.Parse("Unsupported file type: {0}. Supported types: {1}");
3635
private static readonly CompositeFormat UnsupportedContentTypeFormat = CompositeFormat.Parse("Unsupported content type: {0}. Supported types: {1}");

0 commit comments

Comments
 (0)