1212using SmartRAG . Services ;
1313using System . Text ;
1414using System . Text . RegularExpressions ;
15+ using OfficeOpenXml ;
1516
1617namespace SmartRAG . Services ;
1718
@@ -22,11 +23,19 @@ public class DocumentParserService(
2223 IOptions < SmartRagOptions > options ,
2324 ILogger < DocumentParserService > logger ) : IDocumentParserService
2425{
/// <summary>
/// Type initializer: registers the EPPlus (version 8+) non-commercial organization
/// license under the name "SmartRAG". Being a static constructor, it runs a single
/// time for the type rather than once per service instance.
/// </summary>
static DocumentParserService() =>
    ExcelPackage.License.SetNonCommercialOrganization("SmartRAG");
2534 #region Constants
2635
2736 // Content validation constants
28- private const int MinContentLength = 10 ;
29- private const double MinMeaningfulTextRatio = 0.3 ;
37+ private const int MinContentLength = 5 ; // Reduced for Excel files
38+ private const double MinMeaningfulTextRatio = 0.1 ; // Reduced for Excel files
3039
3140 // Chunk boundary search constants
3241 private const int DefaultDynamicSearchRange = 500 ;
@@ -36,6 +45,7 @@ public class DocumentParserService(
3645 // File extension constants
3746 private static readonly string [ ] WordExtensions = [ ".docx" , ".doc" ] ;
3847 private static readonly string [ ] PdfExtensions = [ ".pdf" ] ;
48+ private static readonly string [ ] ExcelExtensions = [ ".xlsx" , ".xls" ] ;
3949 private static readonly string [ ] TextExtensions = [ ".txt" , ".md" , ".json" , ".xml" , ".csv" , ".html" , ".htm" ] ;
4050
4151 // Content type constants
@@ -45,6 +55,12 @@ public class DocumentParserService(
4555 "application/vnd.ms-word"
4656 ] ;
4757
58+ private static readonly string [ ] ExcelContentTypes = [
59+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ,
60+ "application/vnd.ms-excel" ,
61+ "application/vnd.ms-excel.sheet.macroEnabled.12"
62+ ] ;
63+
4864 private static readonly string [ ] TextContentTypes = [ "text/" , "application/json" , "application/xml" , "application/csv" ] ;
4965
5066 // Sentence ending constants
@@ -91,12 +107,12 @@ public class DocumentParserService(
/// <summary>
/// Lists every file extension this parser accepts: text, Word, PDF and Excel, in that order.
/// </summary>
public IEnumerable<string> GetSupportedFileTypes() =>
    [.. TextExtensions, .. WordExtensions, .. PdfExtensions, .. ExcelExtensions];
95111
/// <summary>
/// Lists every content type this parser accepts: the text and Word entries,
/// then "application/pdf", then the Excel entries.
/// </summary>
public IEnumerable<string> GetSupportedContentTypes() =>
    [.. TextContentTypes, .. WordContentTypes, "application/pdf", .. ExcelContentTypes];
100116
101117 #endregion
102118
@@ -153,6 +169,15 @@ private static bool IsPdfDocument(string fileName, string contentType)
153169 contentType == "application/pdf" ;
154170 }
155171
/// <summary>
/// Checks whether a file is an Excel document, judged by its file name extension
/// or by its content type.
/// </summary>
/// <param name="fileName">File name whose ending is compared, ignoring case, against <c>ExcelExtensions</c>.</param>
/// <param name="contentType">Content type searched, ignoring case, for any entry of <c>ExcelContentTypes</c>.</param>
/// <returns><c>true</c> when either check matches; otherwise <c>false</c>.</returns>
private static bool IsExcelDocument(string fileName, string contentType)
{
    if (ExcelExtensions.Any(ext => fileName.EndsWith(ext, StringComparison.OrdinalIgnoreCase)))
    {
        return true;
    }

    // Media type names are case-insensitive, so a differently cased value such as
    // "Application/VND.MS-EXCEL" has to match too. Substring matching is kept so that
    // values carrying extra parameters after the type still qualify.
    return ExcelContentTypes.Any(ct => contentType.IndexOf(ct, StringComparison.OrdinalIgnoreCase) >= 0);
}
180+
156181 /// <summary>
157182 /// Checks if file is text-based
158183 /// </summary>
@@ -175,6 +200,10 @@ private static async Task<string> ExtractTextAsync(Stream fileStream, string fil
175200 {
176201 return await ParsePdfDocumentAsync ( fileStream ) ;
177202 }
203+ else if ( IsExcelDocument ( fileName , contentType ) )
204+ {
205+ return await ParseExcelDocumentAsync ( fileStream ) ;
206+ }
178207 else if ( IsTextBasedFile ( fileName , contentType ) )
179208 {
180209 return await ParseTextDocumentAsync ( fileStream ) ;
@@ -213,6 +242,104 @@ private static async Task<string> ParseWordDocumentAsync(Stream fileStream)
213242 }
214243 }
215244
/// <summary>
/// Parses an Excel workbook and extracts its cell values as tab-separated text.
/// </summary>
/// <param name="fileStream">Stream holding the workbook; it is copied into memory before parsing.</param>
/// <returns>
/// One "Worksheet: name" section per sheet followed by its non-empty rows, passed through
/// <c>CleanContent</c>. A human-readable message is returned instead when the workbook has
/// no worksheets, when nothing is left after cleaning, or when parsing fails.
/// </returns>
/// <remarks>
/// Failures are deliberately reported through the returned string rather than thrown, so the
/// message stays visible for debugging. NOTE(review): that message is handed back as if it
/// were document content — confirm callers do not index it.
/// </remarks>
private static async Task<string> ParseExcelDocumentAsync(Stream fileStream)
{
    try
    {
        // Dispose the in-memory copy once the package is done with it.
        using var memoryStream = await CreateMemoryStreamCopy(fileStream);

        // EPPlus license already set in static constructor
        using var package = new ExcelPackage(memoryStream);

        var worksheets = package.Workbook.Worksheets;
        if (worksheets.Count == 0)
        {
            return "Excel file contains no worksheets";
        }

        var textBuilder = new StringBuilder();
        foreach (var worksheet in worksheets)
        {
            AppendWorksheetText(worksheet, textBuilder);
        }

        var cleanedContent = CleanContent(textBuilder.ToString());

        // If content is still empty after cleaning, return a fallback message
        return string.IsNullOrWhiteSpace(cleanedContent)
            ? "Excel file processed but no text content extracted"
            : cleanedContent;
    }
    catch (Exception ex)
    {
        // Return error message instead of empty string for debugging
        return $"Error parsing Excel file: {ex.Message}";
    }
}

/// <summary>
/// Appends one worksheet to <paramref name="textBuilder"/>: a "Worksheet: name" heading, then
/// every row that has at least one non-blank cell, with cells separated by tabs.
/// </summary>
/// <param name="worksheet">Worksheet to read.</param>
/// <param name="textBuilder">Builder receiving the extracted text.</param>
private static void AppendWorksheetText(ExcelWorksheet worksheet, StringBuilder textBuilder)
{
    var dimension = worksheet.Dimension;
    if (dimension is null)
    {
        // No used range at all: emit a single marker line and no trailing blank line.
        textBuilder.Append("Worksheet: ").Append(worksheet.Name).AppendLine(" (empty)");
        return;
    }

    textBuilder.Append("Worksheet: ").AppendLine(worksheet.Name);

    // Dimension describes the used range, which does not have to begin at A1. Walk its real
    // Start..End bounds; counting from 1 up to Rows/Columns drops trailing data on offset sheets.
    var firstRow = dimension.Start.Row;
    var lastRow = dimension.End.Row;
    var firstCol = dimension.Start.Column;
    var lastCol = dimension.End.Column;

    var rowBuilder = new StringBuilder();
    var hasData = false;

    for (var row = firstRow; row <= lastRow; row++)
    {
        rowBuilder.Clear();
        var rowHasData = false;

        for (var col = firstCol; col <= lastCol; col++)
        {
            // Format numbers and dates culture-independently so the extracted text does not
            // vary with the host's regional settings.
            var cellText = Convert.ToString(
                worksheet.Cells[row, col].Value,
                System.Globalization.CultureInfo.InvariantCulture);

            if (string.IsNullOrWhiteSpace(cellText))
            {
                // Null or blank cells become a single space so the column positions line up.
                rowBuilder.Append(' ');
            }
            else
            {
                rowBuilder.Append(cellText);
                rowHasData = true;
            }

            if (col < lastCol)
            {
                rowBuilder.Append('\t');
            }
        }

        // Rows made up only of blank cells are skipped entirely.
        if (rowHasData)
        {
            textBuilder.AppendLine(rowBuilder.ToString());
            hasData = true;
        }
    }

    if (!hasData)
    {
        textBuilder.AppendLine("Worksheet contains no data");
    }

    textBuilder.AppendLine();
}
342+
216343 /// <summary>
217344 /// Creates a memory stream copy for processing
218345 /// </summary>
@@ -369,7 +496,15 @@ private static bool IsContentValid(string content)
369496 return false ;
370497 }
371498
499+ // For Excel files, be more lenient with content validation
372500 var meaningfulTextRatio = content . Count ( c => char . IsLetterOrDigit ( c ) ) / ( double ) content . Length ;
501+
502+ // If content contains worksheet markers, it's likely valid Excel content
503+ if ( content . Contains ( "Worksheet:" ) || content . Contains ( "Excel file" ) )
504+ {
505+ return true ;
506+ }
507+
373508 return meaningfulTextRatio >= MinMeaningfulTextRatio ;
374509 }
375510
@@ -747,3 +882,4 @@ private static int CalculateNextStartPosition(string content, int currentStart,
747882
748883 #endregion
749884}
885+
0 commit comments