@@ -81,26 +81,32 @@ def ollama_embed(
 def chunk_text(text: str, size: int = CHUNK_CHARS) -> List[str]:
     """Split text into chunks of specified size"""
     text = re.sub(r"\s+", " ", text)
-    return [text[i : i + size] for i in range(0, len(text), size)]
+    return [text[i : i + size] for i in range(0, len(text), size)]
 
 
 def scan_docs(root: Path) -> List[Tuple[str, str]]:
     """Scan directory for supported documents and read their content"""
-    # First, count only supported files (exclude database files)
-    console.print("🔍 [dim]Counting files...[/]")
-    all_files = [p for p in root.rglob("*") if p.is_file()]
-
-    # Filter out database files and only include supported extensions
-    files: List[Path] = []
-    for p in all_files:
-        # Skip database files
-        if p.name in ["faiss_index.bin", "doc_store.json"]:
-            continue
-        # Only include supported file types
-        if p.suffix.lower() in READERS:
-            files.append(p)
+    # Find supported files directly (much faster!)
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        console=console,
+    ) as prog:
+        count_task = prog.add_task("🔍 Counting files...")
+
+        files: List[Path] = []
+
+        # Only glob for supported file extensions
+        for ext in READERS.keys():
+            pattern = f"**/*{ext}"
+            for p in root.rglob(pattern):
+                if p.is_file() and p.name not in ["faiss_index.bin", "doc_store.json"]:
+                    files.append(p)
+
+        prog.update(count_task, completed=True)
 
     total_files = len(files)
+    console.print(f"📊 [bold green]Found {total_files} supported documents[/]")
 
     docs: List[Tuple[str, str]] = []
     files_processed = 0
@@ -141,7 +147,7 @@ def scan_docs(root: Path) -> List[Tuple[str, str]]:
         prog.update(
             task,
             current_file="✅ Complete!",
-            description=f"📄 Found {len(docs)} documents",
+            description="📄 Document scanning complete",
         )
     return docs
 
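For reference, the scan_docs change above swaps a full directory walk for per-extension globs wrapped in a Rich spinner. The following is a minimal standalone sketch of that pattern, not the author's code: READERS here is a stand-in mapping (the real table maps extensions to reader callables), and only the skipped database filenames are taken from the diff.

from pathlib import Path
from typing import Dict, List

from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn

console = Console()

# Stand-in for the real READERS table (extension -> reader callable).
READERS: Dict[str, object] = {".txt": None, ".md": None, ".pdf": None}
DB_FILES = {"faiss_index.bin", "doc_store.json"}


def find_supported_files(root: Path) -> List[Path]:
    """Glob only for supported extensions instead of walking every file."""
    files: List[Path] = []
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    ) as prog:
        prog.add_task("🔍 Counting files...")
        for ext in READERS:
            for p in root.rglob(f"**/*{ext}"):
                if p.is_file() and p.name not in DB_FILES:
                    files.append(p)
    return files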
@@ -208,7 +214,7 @@ def build_or_load(
         task = prog.add_task("🔮 Generating embeddings", total=len(chunks))
         for i in range(0, len(chunks), batch_size):
             current_batch_size = min(batch_size, len(chunks) - i)
-            vecs = ollama_embed(chunks[i : i + current_batch_size], embed_model, url)
+            vecs = ollama_embed(chunks[i : i + current_batch_size], embed_model, url)
             index.add(vecs)  # type: ignore
             prog.advance(task, current_batch_size)
 
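The final hunk only re-wraps the embedding call, but for context the surrounding loop batches chunks, embeds each batch, and appends the resulting vectors to a FAISS index. A rough sketch of that flow follows, assuming the embedding call returns a float32 array of shape (batch, dim); embed_batch below is a placeholder for ollama_embed(batch, embed_model, url), and the dimension and batch size are illustrative.

import numpy as np
import faiss  # pip install faiss-cpu


def embed_batch(texts: list) -> np.ndarray:
    """Placeholder for ollama_embed(texts, embed_model, url)."""
    return np.random.rand(len(texts), 768).astype("float32")


chunks = [f"chunk {i}" for i in range(1000)]
batch_size = 32
index = faiss.IndexFlatL2(768)  # dimension must match the embedding size

for i in range(0, len(chunks), batch_size):
    batch = chunks[i : i + batch_size]
    vecs = embed_batch(batch)  # real code: ollama_embed(batch, embed_model, url)
    index.add(vecs)  # FAISS expects a contiguous float32 (n, d) array

print(index.ntotal)  # 1000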