Skip to content

Commit cb0d407

Browse files
committed
md-auto update
1 parent 8a50960 commit cb0d407

File tree

9 files changed

+456
-6
lines changed

9 files changed

+456
-6
lines changed

README.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# ~auto~md~
22

3-
~auto~md~ is a Python tool that converts various file types and GitHub repositories into Markdown documents (.md) optimized for large language models (LLMs)
3+
### Python tool that converts various file types and GitHub repositories into Markdown documents (.md) optimized for quick RAG/indexing via large language models (LLMs)
44

5-
![screen](auto-md-gui.png)
5+
![screen](auto-md-gui-screen.png)
66

77
## Features
88

@@ -15,7 +15,7 @@
1515

1616
| Category | Extensions |
1717
|----------|------------|
18-
| Text | .txt, .text, .log, .log.1, .log.2 |
18+
| Text | .txt, .text, .log |
1919
| Markdown | .md, .markdown, .mdown, .mkdn, .mkd, .mdwn, .mdtxt, .mdtext |
2020
| Web | .html, .htm, .xhtml, .shtml, .css, .scss, .sass, .less |
2121
| Programming | .py, .pyw, .js, .jsx, .ts, .tsx, .java, .c, .cpp, .cs, .go, .rb, .php, .swift, .kt |
@@ -28,7 +28,7 @@
2828

2929
1. Install Python 3.7 or newer
3030

31-
2. Download this project:
31+
2. Download this project (or clone the repository as usual):
3232
- Click the green "Code" button above
3333
- Choose "Download ZIP"
3434
- Extract the ZIP file
@@ -67,7 +67,9 @@ Let's say you have the following files in a folder called "my_project":
6767

6868
### Output
6969

70-
After processing with Auto MD, you would get a single Markdown file (`output.md`) that looks like this:
70+
After processing with Auto MD, you would get a single Markdown file (`output.md`) that looks like the example below.
71+
72+
This single .md file contains all the content from your input files, with a table of contents at the top for easy navigation and for referencing/indexing by LLMs.
7173

7274
```markdown
7375
# Auto MD Output
@@ -119,4 +121,3 @@ After processing with Auto MD, you would get a single Markdown file (`output.md`
119121
(Content of styles.css)
120122
```
121123

122-
This single .md file contains all the content from your input files, with a table of contents at the top for easy navigation and referencing / indexing via LLM models.

auto-md-gui-screen.png

14.8 KB
Loading

auto-md/file_processor.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
import os
2+
import re
3+
import zipfile
4+
import shutil
5+
import subprocess
6+
import logging
7+
from pathlib import Path
8+
from typing import List, Optional, Dict
9+
from markdown_formatter import format_as_markdown, generate_toc
10+
11+
# File suffixes (lower-case, with leading dot) that are treated as plain text.
# Membership is tested against ``Path.suffix.lower()`` elsewhere in this module.
#
# Compiled / binary formats (.pyc, .pyo, .pyd, .luac, .class, .jar, .rdata,
# .rds, .rda, .sqlite) are intentionally NOT listed: decoding them as UTF-8
# with errors ignored only produces garbage in the generated Markdown.
TEXT_EXTENSIONS = {
    # Plain text and Markdown
    '.txt', '.md', '.markdown', '.mdown', '.mkdn', '.mkd', '.mdwn', '.mdtxt', '.mdtext', '.text',
    # Web markup and stylesheets
    '.html', '.htm', '.xhtml', '.shtml',
    '.css', '.scss', '.sass', '.less',
    # Python
    '.py', '.pyw',
    # JavaScript / TypeScript
    '.js', '.jsx', '.ts', '.tsx',
    # Data and configuration
    '.yaml', '.yml',
    '.json', '.jsonl', '.json5',
    '.xml', '.xsl', '.xslt', '.svg',
    '.csv', '.tsv',
    '.rst', '.rest',
    '.ini', '.cfg', '.conf', '.config',
    # NOTE(review): Path.suffix only yields the last dotted component, so
    # '.log.1' / '.log.2' can never match a suffix lookup; kept for
    # compatibility with any caller that matches on full name endings.
    '.log', '.log.1', '.log.2',
    # Shell and batch scripts
    '.bat', '.cmd', '.sh', '.bash', '.zsh', '.fish',
    # SQL scripts
    '.sql', '.mysql', '.pgsql',
    # PHP
    '.php', '.phtml', '.php3', '.php4', '.php5', '.phps',
    # Ruby
    '.rb', '.rbw', '.rake', '.gemspec',
    # Lua
    '.lua',
    # Perl
    '.pl', '.pm', '.t', '.pod',
    # Go
    '.go', '.gop',
    # Java
    '.java',
    # .NET
    '.cs', '.csx', '.vb',
    # C / C++
    '.c', '.h', '.cpp', '.hpp', '.cc', '.hh', '.cxx', '.hxx',
    # Swift / Kotlin
    '.swift', '.kt', '.kts',
    # R
    '.r',
    # Objective-C / MATLAB
    '.m', '.mm',
    # TeX
    '.tex', '.ltx', '.latex', '.bib',
    # Assembly
    '.asm', '.s',
    # Fortran
    '.f', '.for', '.f90', '.f95', '.f03', '.f08',
    # Scala
    '.scala', '.sc',
    # Clojure
    '.clj', '.cljs', '.cljc', '.edn',
    # Dart
    '.dart',
    # Groovy
    '.groovy', '.gvy', '.gy', '.gsh',
    # PowerShell
    '.ps1', '.psm1', '.psd1',
    # Elm
    '.elm',
    # Erlang
    '.erl', '.hrl',
    # Elixir
    '.ex', '.exs',
    # Haskell
    '.hs', '.lhs',
    # OCaml
    '.ml', '.mli',
    # Rust
    '.rs',
    # Vim
    '.vim', '.vimrc',
    # Container build files
    '.dockerfile', '.containerfile',
    # NOTE(review): dot-files such as ".gitignore" have an empty Path.suffix,
    # so these entries only match names like "foo.gitignore" — confirm intent.
    '.gitignore', '.gitattributes', '.gitmodules',
    '.toml', '.editorconfig',
}
56+
57+
58+
# Compiled once at import time; both patterns are applied to every file.
_WHITESPACE_RUN = re.compile(r'\s+')
_NON_ASCII_RUN = re.compile(r'[^\x00-\x7F]+')


def clean_text(text: str) -> str:
    """Normalize *text* to single-spaced ASCII.

    Steps:
      1. Collapse every run of whitespace (including newlines and Unicode
         spaces such as U+00A0) into one ASCII space.
      2. Drop every run of non-ASCII characters.
      3. Collapse whitespace again, because removing a non-ASCII run that was
         surrounded by spaces would otherwise leave a double space behind
         (e.g. ``"a é b"`` -> ``"a  b"``).
      4. Strip leading and trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing remains.
    """
    single_spaced = _WHITESPACE_RUN.sub(' ', text)
    ascii_only = _NON_ASCII_RUN.sub('', single_spaced)
    return _WHITESPACE_RUN.sub(' ', ascii_only).strip()
61+
62+
63+
def process_file(file_path: str, output_dir: str, single_file: bool, all_files: List[str], include_metadata: bool,
                 include_toc: bool, toc_entries: Dict[str, str]) -> Optional[str]:
    """Render one text file as Markdown.

    The file is read as UTF-8 (undecodable bytes are ignored), passed through
    ``clean_text`` and then handed to ``format_as_markdown`` together with a
    title derived from the file stem (underscores and hyphens become spaces).

    Args:
        file_path: Path of the file to read.
        output_dir: Directory that receives ``<title>.md`` when
            ``single_file`` is false.
        single_file: When true, nothing is written here; the caller is
            expected to combine the returned text.
        all_files: Forwarded unchanged to ``format_as_markdown``.
        include_metadata: Forwarded unchanged to ``format_as_markdown``.
        include_toc: Forwarded unchanged to ``format_as_markdown``.
        toc_entries: Forwarded unchanged to ``format_as_markdown``.

    Returns:
        The Markdown text, or ``None`` if the file is empty/whitespace-only
        or any exception occurred (the exception is logged, not raised).
    """
    logging.info(f"Processing file: {file_path}")
    try:
        source = Path(file_path)
        raw = source.read_text(encoding='utf-8', errors='ignore')

        if not raw.strip():
            logging.warning(f"File is empty: {file_path}")
            return None

        # '_' and '-' in the stem are both turned into spaces for the title.
        title = source.stem.translate(str.maketrans('_-', '  '))
        rendered = format_as_markdown(clean_text(raw), title, file_path, all_files, include_metadata,
                                      include_toc, toc_entries)

        if single_file:
            return rendered

        # NOTE(review): files sharing a stem (e.g. main.py / main.js) map to
        # the same "<title>.md" and the later one overwrites the earlier.
        destination = Path(output_dir) / f"{title}.md"
        destination.write_text(rendered, encoding='utf-8')
        logging.info(f"Saved markdown to: {destination}")
        return rendered
    except Exception as exc:
        logging.error(f"Error processing file {file_path}: {exc}")
        return None
89+
90+
91+
def process_folder(folder_path: str, output_dir: str, single_file: bool, combined_content: List[str],
                   all_files: List[str], include_metadata: bool, include_toc: bool, toc_entries: Dict[str, str]):
    """Process every supported file under *folder_path*, recursively.

    Files whose lower-cased suffix is in ``TEXT_EXTENSIONS`` are recorded in
    ``all_files`` and converted with ``process_file``; non-empty results are
    appended to ``combined_content``. ``.zip`` files are extracted next to
    themselves into ``temp_<zip name>``, processed recursively, and the
    extraction directory is removed afterwards.

    Args:
        folder_path: Root directory to walk.
        output_dir: Forwarded to ``process_file``.
        single_file: Forwarded to ``process_file``.
        combined_content: Output list, mutated in place with Markdown text.
        all_files: Output list, mutated in place with processed file paths.
        include_metadata: Forwarded to ``process_file``.
        include_toc: Forwarded to ``process_file``.
        toc_entries: Forwarded to ``process_file``.
    """
    logging.info(f"Processing folder: {folder_path}")
    for path in Path(folder_path).rglob('*'):
        if not path.is_file():
            continue

        suffix = path.suffix.lower()
        if suffix in TEXT_EXTENSIONS:
            all_files.append(str(path))
            result = process_file(str(path), output_dir, single_file, all_files, include_metadata, include_toc,
                                  toc_entries)
            if result:
                combined_content.append(result)
        elif suffix == '.zip':
            # NOTE(review): a pre-existing directory with this exact name
            # would be extracted into and then deleted below.
            temp_extract_to = path.parent / f"temp_{path.name}"
            try:
                extract_zip(str(path), str(temp_extract_to))
                process_folder(str(temp_extract_to), output_dir, single_file, combined_content, all_files,
                               include_metadata, include_toc, toc_entries)
            finally:
                # Always clean up, even if processing the archive raised, so
                # no temp directory is left behind inside the input folder.
                shutil.rmtree(temp_extract_to, ignore_errors=True)
109+
110+
111+
def extract_zip(zip_path: str, extract_to: str):
    """Unpack the archive at *zip_path* into the directory *extract_to*.

    Best-effort: any exception raised while opening or extracting the
    archive is logged as an error and swallowed, so the caller cannot tell
    from the return value (always ``None``) whether extraction succeeded.

    Args:
        zip_path: Path of the ``.zip`` file to read.
        extract_to: Destination directory passed to ``ZipFile.extractall``.
    """
    logging.info(f"Extracting zip file: {zip_path}")
    try:
        # Read mode is ZipFile's default.
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(path=extract_to)
            logging.info(f"Extracted to: {extract_to}")
    except Exception as exc:
        logging.error(f"Error extracting zip file {zip_path}: {exc}")
120+
121+
122+
def clone_git_repo(repo_url: str, temp_folder: str, depth: Optional[int] = None):
    """Clone *repo_url* into *temp_folder* by running ``git clone``.

    Best-effort: failures are logged and swallowed, and the function always
    returns ``None``. This covers both a failing ``git`` process
    (``CalledProcessError``) and ``git`` not being runnable at all, e.g. not
    installed or not on ``PATH`` (``OSError`` / ``FileNotFoundError``).

    Args:
        repo_url: Repository URL handed to ``git clone``.
        temp_folder: Destination directory for the clone.
        depth: If given, passed as ``--depth`` to create a shallow clone.
    """
    logging.info(f"Cloning GitHub repository: {repo_url}")

    cmd = ["git", "clone"]
    if depth is not None:
        cmd += ["--depth", str(depth)]
    # "--" ends option parsing so a URL or folder that starts with "-" can
    # never be interpreted by git as a command-line option.
    cmd += ["--", repo_url, temp_folder]

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logging.error(f"Error cloning GitHub repository {repo_url}: {e}")
        logging.error(e.stderr)
    except OSError as e:
        # Raised when the git executable itself cannot be started.
        logging.error(f"Error cloning GitHub repository {repo_url}: {e}")
    else:
        logging.info(result.stdout)
        logging.info(f"Cloned to: {temp_folder}")
136+
137+
138+
def process_input(input_paths: List[str], output_path: str, temp_folder: str, single_file: bool,
                  repo_depth: Optional[int], include_metadata: bool, include_toc: bool) -> str:
    """Process each input item: directories, text files, zip files and GitHub repos.

    Each entry of *input_paths* is dispatched as follows (first match wins):

    * starts with ``https://github.com`` -> cloned under *temp_folder*,
      processed as a folder, then the clone is removed;
    * existing directory -> processed recursively;
    * suffix in ``TEXT_EXTENSIONS`` -> processed as a single file;
    * ``.zip`` suffix -> extracted under *temp_folder*, processed as a
      folder, then the extraction directory is removed;
    * anything else -> skipped with a warning.

    The GitHub check runs first because repository names often end in a
    text-file suffix (``.../three.js``) and would otherwise be mistaken for
    local files.

    Args:
        input_paths: Local paths and/or GitHub URLs.
        output_path: Output ``.md`` file when *single_file* is true,
            otherwise the output directory.
        temp_folder: Parent directory for temporary clones/extractions.
        single_file: Combine everything into one Markdown file.
        repo_depth: Optional ``--depth`` for ``git clone``.
        include_metadata: Forwarded to the per-file processing.
        include_toc: Forwarded to the per-file processing; in single-file
            mode also prepends the output of ``generate_toc``.

    Returns:
        The output directory as a string (the parent of *output_path* in
        single-file mode).
    """
    combined_content: List[str] = []
    all_files: List[str] = []
    toc_entries: Dict[str, str] = {}
    output_dir = Path(output_path).parent if single_file else Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    for item_path in input_paths:
        path = Path(item_path)
        suffix = path.suffix.lower()

        if item_path.startswith("https://github.com"):
            # Strip only a trailing ".git"; str.replace would also mangle
            # names that merely contain it (e.g. "my.github.io").
            repo_name = path.name
            if repo_name.endswith('.git'):
                repo_name = repo_name[:-len('.git')]
            repo_temp_folder = Path(temp_folder) / repo_name
            try:
                clone_git_repo(item_path, str(repo_temp_folder), depth=repo_depth)
                process_folder(str(repo_temp_folder), str(output_dir), single_file, combined_content, all_files,
                               include_metadata, include_toc, toc_entries)
            finally:
                # Remove the clone even if processing raised.
                shutil.rmtree(repo_temp_folder, ignore_errors=True)
        elif path.is_dir():
            process_folder(str(path), str(output_dir), single_file, combined_content, all_files, include_metadata,
                           include_toc, toc_entries)
        elif suffix in TEXT_EXTENSIONS:
            all_files.append(str(path))
            result = process_file(str(path), str(output_dir), single_file, all_files, include_metadata, include_toc,
                                  toc_entries)
            if result:
                combined_content.append(result)
        elif suffix == '.zip':
            extract_to = Path(temp_folder) / path.stem
            try:
                extract_zip(str(path), str(extract_to))
                process_folder(str(extract_to), str(output_dir), single_file, combined_content, all_files,
                               include_metadata, include_toc, toc_entries)
            finally:
                # Remove the extracted tree even if processing raised.
                shutil.rmtree(extract_to, ignore_errors=True)
        else:
            # Previously such inputs were dropped without any trace.
            logging.warning(f"Skipping unsupported input: {item_path}")

    if single_file and combined_content:
        output_file = Path(output_path)
        content = "\n---\n\n".join(combined_content)
        if include_toc:
            toc = generate_toc(toc_entries)
            content = toc + "\n---\n\n" + content
        output_file.write_text(content, encoding='utf-8')
        logging.info(f"Combined content saved to: {output_file}")
    elif single_file:
        logging.warning("No content was processed. Output file not created.")
    else:
        logging.info(f"Individual Markdown files saved in: {output_dir}")

    # output_dir already is Path(output_path).parent in single-file mode.
    return str(output_dir)

0 commit comments

Comments
 (0)