Skip to content

Commit a8ff1b9

Browse files
committed
Initial public release
0 parents  commit a8ff1b9

24 files changed

+945
-0
lines changed

.gitignore

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Node.js
2+
**/node_modules/
3+
4+
# Virtual Environment
5+
.venv/
6+
venv/
7+
ENV/
8+
9+
# Python
10+
__pycache__/
11+
*.py[cod]
12+
*$py.class
13+
*.so
14+
.Python
15+
*.egg-info/
16+
dist/
17+
build/
18+
19+
# Redis
20+
dump.rdb
21+
22+
# Tests
23+
.coverage
24+
htmlcov/
25+
.pytest_cache/
26+
.tox/
27+
coverage.xml
28+
*.cover

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2024 David Legrand
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# site2md
2+
3+
Convert any website to Markdown or structured JSON. An ideal solution to provide content to LLMs(.txt). This project uses FastAPI and Trafilatura. It serves a simple API with optional KV (Materia, Redis, Valkey) caching and rate limiting.
4+
5+
## Usage
6+
7+
See the [example](https://github.com/davlgd/site2md/tree/main/example) directory to get started.
8+
9+
## Development & Tests
10+
11+
Install the optional development dependencies (for example `pip install -e ".[dev]"`) and run `pytest`. If no local KV server is running, the corresponding tests are skipped.
12+
13+
## License
14+
15+
This project is licensed under the terms of the MIT license. See the [LICENSE](https://github.com/davlgd/site2md/blob/main/LICENSE) file.

example/api-only/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
site2md>=0.1

example/api-only/server.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# API-only example: start the site2md application with uvicorn.
import uvicorn
from site2md import Settings, create_app


def main():
    """Create the site2md app from explicit settings and serve it.

    ``static_dir=None`` is passed explicitly; presumably this turns off the
    bundled static front-end (confirm against ``Settings``).
    """
    api_settings = Settings(static_dir=None)
    application = create_app(api_settings)
    # Bind address and port come from the settings object itself.
    uvicorn.run(
        application,
        host=api_settings.host,
        port=api_settings.port,
    )


if __name__ == "__main__":
    main()

example/simple/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
site2md>=0.1

example/simple/server.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Simple example: start the site2md application with uvicorn.
import uvicorn
from site2md import Settings, create_app


def main():
    """Build the settings, create the app and run it on the configured host/port."""
    # NOTE(review): static_dir is None here even though this example ships a
    # static/index.html page; confirm whether that page is meant to be served.
    options = {"static_dir": None}
    settings = Settings(**options)

    app = create_app(settings)
    uvicorn.run(app, host=settings.host, port=settings.port)


if __name__ == "__main__":
    main()

example/simple/static/index.html

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6+
<title>Markdown Converter</title>
7+
<style>
8+
body {
9+
background-color: #0a1428;
10+
color: #fff;
11+
display: grid;
12+
font-family: system-ui, sans-serif;
13+
min-height: 100vh;
14+
margin: 0;
15+
place-items: center;
16+
}
17+
18+
button {
19+
background: #2d4c7c;
20+
border: none;
21+
border-radius: 4px;
22+
color: white;
23+
cursor: pointer;
24+
font-size: 1.1rem;
25+
padding: 0.75rem;
26+
width: 100%;
27+
}
28+
29+
button:hover {
30+
background: #3a5d94;
31+
}
32+
33+
h1 {
34+
font-size: 1.5rem;
35+
margin-bottom: 1.5rem;
36+
margin-top: 0;
37+
text-align: center;
38+
}
39+
40+
input[type="url"] {
41+
background: rgba(255, 255, 255, 0.1);
42+
border: 1px solid rgba(255, 255, 255, 0.2);
43+
border-radius: 4px;
44+
box-sizing: border-box;
45+
color: #fff;
46+
font-size: 1.1rem;
47+
padding: 0.5rem;
48+
width: 100%;
49+
}
50+
51+
input[type="url"]:focus {
52+
border-color: rgba(255, 255, 255, 0.5);
53+
outline: none;
54+
}
55+
56+
label {
57+
color:rgb(203, 203, 203);
58+
display: block;
59+
margin-bottom: 1rem;
60+
}
61+
62+
.checkbox-wrapper {
63+
align-items: center;
64+
display: flex;
65+
gap: 0.5rem;
66+
margin: 1rem 0;
67+
}
68+
69+
.checkbox-wrapper label {
70+
margin: 0;
71+
}
72+
73+
.container {
74+
background: rgba(255, 255, 255, 0.05);
75+
border: 1px solid rgba(255, 255, 255, 0.1);
76+
border-radius: 10px;
77+
max-width: 400px;
78+
padding: 1.5rem 2rem;
79+
width: 100%;
80+
}
81+
82+
.form-group {
83+
margin-bottom: 1rem;
84+
}
85+
</style>
86+
</head>
87+
<body>
88+
<div class="container">
89+
<h1>Markdown Converter</h1>
90+
<form id="urlForm">
91+
<div class="form-group">
92+
<label for="urlInput">Enter URL</label>
93+
<input type="url" id="urlInput" name="urlInput" value="https://example.com" required>
94+
</div>
95+
96+
<div class="checkbox-wrapper">
97+
<input type="checkbox" id="jsonFormat" name="jsonFormat">
98+
<label for="jsonFormat">JSON Format</label>
99+
</div>
100+
101+
<button type="submit">Convert</button>
102+
</form>
103+
</div>
104+
<script>
105+
// Submit handler: open the conversion endpoint ("/<target URL>") in a new tab
// instead of performing a regular form submission.
document.getElementById('urlForm').addEventListener('submit', function (event) {
    event.preventDefault();

    const rawUrl = document.getElementById('urlInput').value;
    const wantsJson = document.getElementById('jsonFormat').checked;

    // Drop any "#fragment": browsers never send it to the server, and anything
    // appended after it (such as the format parameter) would be lost with it.
    const hashIndex = rawUrl.indexOf('#');
    let finalUrl = hashIndex === -1 ? rawUrl : rawUrl.slice(0, hashIndex);

    if (wantsJson) {
        // Use "&" when the target URL already carries a query string; a second
        // "?" would fold "format=json" into the previous parameter's value.
        const separator = finalUrl.includes('?') ? '&' : '?';
        finalUrl += `${separator}format=json`;
    }

    window.open(`/${finalUrl}`, '_blank');
});
115+
</script>
116+
</body>
117+
</html>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
site2md>=0.1
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# KV-cache example: run site2md with a remote key-value store as cache backend.
import os

import uvicorn
from site2md import Settings, create_app
from site2md.cache import KVCache
from site2md.config import kvConfig


def main():
    """Configure a KV cache, build the app around it and serve it with uvicorn."""
    cache = KVCache(
        config=kvConfig(
            host="materiakv.eu-fr-1.services.clever-cloud.com",
            port=6379,
            tls=True,
            db=0,
            # Read from the environment; os.getenv returns None if KV_TOKEN is unset.
            password=os.getenv("KV_TOKEN"),
            ttl=3600,
            socket_timeout=1,
            socket_connect_timeout=1,
        )
    )

    settings = Settings(
        static_dir=None,
        max_content_size=2_000_000,
        cache_backend=cache,
        rate_limiter=None,
    )
    uvicorn.run(create_app(settings), host=settings.host, port=settings.port)


if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
[build-system]
2+
requires = ["hatchling"]
3+
build-backend = "hatchling.build"
4+
5+
[project]
6+
name = "site2md"
7+
version = "0.1.3"
8+
authors = [
9+
{ name = "David Legrand", email = "1110600+davlgd@users.noreply.github.com" },
10+
]
11+
description = "Host an API to convert websites to markdown with optional features"
12+
readme = "README.md"
13+
license = { file = "LICENSE" }
14+
requires-python = ">=3.9"
15+
classifiers = [
16+
"Programming Language :: Python :: 3",
17+
"License :: OSI Approved :: MIT License",
18+
"Operating System :: OS Independent",
19+
"Framework :: FastAPI",
20+
"Natural Language :: English",
21+
"Topic :: Text Processing :: Markup :: Markdown"
22+
]
23+
24+
dependencies = [
25+
"aiofiles>=24.1",
26+
"fastapi>=0.115",
27+
"pydantic>=2.10",
28+
"python-multipart>=0.0.20",
29+
"redis>=5.2",
30+
"requests>=2.32",
31+
"trafilatura>=2.0",
32+
"uvicorn>=0.34"
33+
]
34+
35+
[project.optional-dependencies]
36+
dev = [
37+
"httpx>=0.28.0",
38+
"pytest>=8.3.0",
39+
"pytest-asyncio>=0.25.0",
40+
"pytest-cov>=6.0.0"
41+
]
42+
43+
[project.urls]
44+
Homepage = "https://github.com/davlgd/site2md"
45+
Issues = "https://github.com/davlgd/site2md/issues"

pytest.ini

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[pytest]
2+
testpaths = tests
3+
python_files = test_*.py
4+
addopts = -v --cov=site2md --cov-report=term-missing

src/site2md/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Package entry point: re-export the public names from the submodules.
from .api import create_app
from .config import Settings
from .types import CacheBackend, RateLimiter

__version__ = "0.1.3"

# Names exposed by `from site2md import *`.
__all__ = [
    "create_app",
    "Settings",
    "CacheBackend",
    "RateLimiter",
]

0 commit comments

Comments
 (0)