Skip to content

Commit b124e48

Browse files
committed
Refactor generation of llms.txt
1 parent cf81358 commit b124e48

File tree

4 files changed

+122
-55
lines changed

4 files changed

+122
-55
lines changed

src/mkdocs_llmstxt/_internal/config.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,10 @@
66
from mkdocs.config.base import Config as BaseConfig
77

88

9-
class _FileConfig(BaseConfig):
10-
"""Sub-config for each Markdown file."""
11-
12-
output = mkconf.Type(str)
13-
inputs = mkconf.ListOfItems(mkconf.Type(str))
14-
15-
169
class _PluginConfig(BaseConfig):
1710
"""Configuration options for the plugin."""
1811

1912
autoclean = mkconf.Type(bool, default=True)
2013
preprocess = mkconf.Optional(mkconf.File(exists=True))
21-
files = mkconf.ListOfItems(mkconf.SubConfig(_FileConfig))
14+
markdown_description = mkconf.Optional(mkconf.Type(str))
15+
sections = mkconf.DictOfItems(mkconf.ListOfItems(mkconf.Type(str)))

src/mkdocs_llmstxt/_internal/debug.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,10 @@ def _get_debug_info() -> _Environment:
7979
"""
8080
py_name, py_version = _interpreter_name_version()
8181
packages = ["mkdocs-llmstxt"]
82-
variables = ["PYTHONPATH", *[var for var in os.environ if var.startswith("MKDOCS_LLMSTXT")]]
82+
variables = [
83+
"PYTHONPATH",
84+
*[var for var in os.environ if var.startswith("MKDOCS_LLMSTXT")],
85+
]
8386
return _Environment(
8487
interpreter_name=py_name,
8588
interpreter_version=py_version,
@@ -94,7 +97,9 @@ def _print_debug_info() -> None:
9497
"""Print debug/environment information."""
9598
info = _get_debug_info()
9699
print(f"- __System__: {info.platform}")
97-
print(f"- __Python__: {info.interpreter_name} {info.interpreter_version} ({info.interpreter_path})")
100+
print(
101+
f"- __Python__: {info.interpreter_name} {info.interpreter_version} ({info.interpreter_path})"
102+
)
98103
print("- __Environment variables__:")
99104
for var in info.variables:
100105
print(f" - `{var.name}`: `{var.value}`")

src/mkdocs_llmstxt/_internal/plugin.py

Lines changed: 110 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@
33
from __future__ import annotations
44

55
import fnmatch
6+
from urllib.parse import urljoin
67
from collections import defaultdict
78
from itertools import chain
89
from pathlib import Path
9-
from typing import TYPE_CHECKING
10+
from typing import TYPE_CHECKING, NamedTuple, cast
1011

1112
import mdformat
1213
from bs4 import BeautifulSoup as Soup
1314
from bs4 import Tag
1415
from markdownify import ATX, MarkdownConverter
16+
from mkdocs.structure.pages import Page
1517
from mkdocs.config.defaults import MkDocsConfig
16-
from mkdocs.exceptions import PluginError
1718
from mkdocs.plugins import BasePlugin
1819

1920
from mkdocs_llmstxt._internal.config import _PluginConfig
@@ -31,6 +32,13 @@
3132
_logger = _get_logger(__name__)
3233

3334

35+
class MDPageInfo(NamedTuple):
    """Markdown rendering of one page selected for `llms.txt`.

    Instances are created in `on_page_content` and consumed in `on_post_build`,
    which unpacks them positionally: the field order below is part of the contract.
    """

    # Page title; falls back to the page's source URI when the page has no title.
    title: str
    # Absolute destination path of the page with its suffix replaced by `.md`.
    path_md: Path
    # `site_url` joined with the page's destination URI (suffix replaced by `.md`).
    md_url: str
    # Markdown text returned by `generate_page_markdown` for the page.
    content: str
40+
41+
3442
class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
3543
"""The MkDocs plugin to generate an `llms.txt` file.
3644
@@ -47,6 +55,7 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
4755
"""The global MkDocs configuration."""
4856

4957
def __init__(self) -> None:
58+
self.md_pages: defaultdict[str, list[MDPageInfo]] = defaultdict(list)
5059
self.html_pages: dict[str, dict[str, str]] = defaultdict(dict)
5160
"""Dictionary to store the HTML contents of pages."""
5261

@@ -72,6 +81,10 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
7281
Returns:
7382
The same, untouched config.
7483
"""
84+
if config.site_url is None:
85+
raise ValueError(
86+
"'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin"
87+
)
7588
self.mkdocs_config = config
7689
return config
7790

@@ -88,64 +101,117 @@ def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None: # no
88101
Returns:
89102
Modified collection or none.
90103
"""
91-
for file in self.config.files:
92-
file["inputs"] = self._expand_inputs(file["inputs"], page_uris=list(files.src_uris.keys()))
104+
page_uris = list(files.src_uris)
105+
106+
for section_name, file_list in list(self.config.sections.items()):
107+
self.config.sections[section_name] = self._expand_inputs(
108+
file_list, page_uris=page_uris
109+
)
110+
93111
return files
94112

95113
def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None: # noqa: ARG002
96-
"""Record pages contents.
114+
"""Convert page content into a Markdown file and save the result to be processed in the `on_post_build` hook.
97115
98116
Hook for the [`on_page_content` event](https://www.mkdocs.org/user-guide/plugins/#on_page_content).
99-
In this hook we simply record the HTML of the pages into a dictionary whose keys are the pages' URIs.
100117
101118
Parameters:
102119
html: The rendered HTML.
103120
page: The page object.
104121
"""
105-
for file in self.config.files:
106-
if page.file.src_uri in file["inputs"]:
107-
_logger.debug(f"Adding page {page.file.src_uri} to page {file['output']}")
108-
self.html_pages[file["output"]][page.file.src_uri] = html
122+
for section_name, file_list in self.config.sections.items():
123+
if page.file.src_uri in file_list:
124+
path_md = Path(page.file.abs_dest_path).with_suffix(".md")
125+
page_md = generate_page_markdown(
126+
html, self.config.autoclean, self.config.preprocess
127+
)
128+
129+
md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()
130+
if md_url in (".", "./"):
131+
md_url = ""
132+
md_url = urljoin(
133+
# Guaranteed to exist as we require 'site_url' to be configured:
134+
cast(str, self.mkdocs_config.site_url),
135+
md_url,
136+
)
137+
138+
self.md_pages[section_name].append(
139+
MDPageInfo(
140+
title=cast(
141+
str,
142+
page.title if page.title is not None else page.file.src_uri,
143+
),
144+
path_md=path_md,
145+
md_url=md_url,
146+
content=page_md,
147+
)
148+
)
149+
109150
return html
110151

111-
def on_post_build(self, config: MkDocsConfig, **kwargs: Any) -> None: # noqa: ARG002
112-
"""Combine all recorded pages contents and convert it to a Markdown file with BeautifulSoup and Markdownify.
152+
def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None: # noqa: ARG002
153+
"""Create the final `llms.txt` file and the MD files for all selected pages.
113154
114155
Hook for the [`on_post_build` event](https://www.mkdocs.org/user-guide/plugins/#on_post_build).
115-
In this hook we concatenate all previously recorded HTML, and convert it to Markdown using Markdownify.
116156
117157
Parameters:
118158
config: MkDocs configuration.
119159
"""
120160

121-
def language_callback(tag: Tag) -> str:
122-
for css_class in chain(tag.get("class") or (), (tag.parent.get("class") or ()) if tag.parent else ()):
123-
if css_class.startswith("language-"):
124-
return css_class[9:]
125-
return ""
126-
127-
converter = MarkdownConverter(
128-
bullets="-",
129-
code_language_callback=language_callback,
130-
escape_underscores=False,
131-
heading_style=ATX,
132-
)
133-
134-
for file in self.config.files:
135-
try:
136-
html = "\n\n".join(self.html_pages[file["output"]][input_page] for input_page in file["inputs"])
137-
except KeyError as error:
138-
raise PluginError(str(error)) from error
139-
140-
soup = Soup(html, "html.parser")
141-
if self.config.autoclean:
142-
autoclean(soup)
143-
if self.config.preprocess:
144-
_preprocess(soup, self.config.preprocess, file["output"])
145-
146-
output_file = Path(config.site_dir).joinpath(file["output"])
147-
output_file.parent.mkdir(parents=True, exist_ok=True)
148-
markdown = mdformat.text(converter.convert_soup(soup), options={"wrap": "no"})
149-
output_file.write_text(markdown, encoding="utf8")
150-
151-
_logger.info(f"Generated file /{file['output']}")
161+
output_file = Path(config.site_dir).joinpath("llms.txt")
162+
output_file.parent.mkdir(parents=True, exist_ok=True)
163+
markdown = f"# {config.site_name}\n\n"
164+
165+
if config.site_description is not None:
166+
markdown += f"> {config.site_description}\n\n"
167+
168+
if self.config.markdown_description is not None:
169+
markdown += f"{self.config.markdown_description}\n\n"
170+
171+
for section_name, file_list in self.md_pages.items():
172+
markdown += f"## {section_name}\n\n"
173+
for page_title, path_md, md_url, content in file_list:
174+
_logger.debug(f"Generating MD file to {path_md}")
175+
path_md.write_text(content, encoding="utf8")
176+
markdown += f"- [{page_title}]({md_url})\n"
177+
178+
output_file.write_text(markdown, encoding="utf8")
179+
_logger.info("Generated file / llms.txt")
180+
181+
182+
def _language_callback(tag: Tag) -> str:
183+
for css_class in chain(
184+
tag.get("class") or (), (tag.parent.get("class") or ()) if tag.parent else ()
185+
):
186+
if css_class.startswith("language-"):
187+
return css_class[9:]
188+
return ""
189+
190+
191+
# Module-level converter reused by every call to `generate_page_markdown`.
# It emits `-` bullets and ATX headings, leaves underscores unescaped, and
# resolves code-block languages through `_language_callback`.
_converter = MarkdownConverter(
    bullets="-",
    code_language_callback=_language_callback,
    escape_underscores=False,
    heading_style=ATX,
)
197+
198+
199+
def generate_page_markdown(
    html: str, should_autoclean: bool, preprocess: str | None
) -> str:
    """Convert HTML to Markdown.

    The HTML is parsed with BeautifulSoup, optionally cleaned and preprocessed,
    converted with the module-level Markdownify converter and finally
    normalized with mdformat (without line wrapping).

    Parameters:
        html: The HTML content.
        should_autoclean: Whether to autoclean the HTML.
        preprocess: An optional path of a Python module containing a `preprocess` function.

    Returns:
        The Markdown content.
    """
    soup = Soup(html, "html.parser")
    # Test the flag, not the `autoclean` function: a function object is always
    # truthy, which made the `autoclean: false` option a no-op.
    if should_autoclean:
        autoclean(soup)
    if preprocess:
        # NOTE(review): every page is reported to the preprocess hook as "llms.txt";
        # confirm the hook does not need the page's own output path.
        _preprocess(soup, preprocess, "llms.txt")
    return mdformat.text(_converter.convert_soup(soup), options={"wrap": "no"})

src/mkdocs_llmstxt/_internal/preprocess.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,4 +98,6 @@ def autoclean(soup: Soup) -> None:
9898

9999
# Remove line numbers from code blocks.
100100
for element in soup.find_all("table", attrs={"class": "highlighttable"}):
101-
element.replace_with(Soup(f"<pre>{element.find('code').get_text()}</pre>", "html.parser")) # type: ignore[union-attr]
101+
element.replace_with(
102+
Soup(f"<pre>{element.find('code').get_text()}</pre>", "html.parser")
103+
) # type: ignore[union-attr]

0 commit comments

Comments
 (0)