3
3
from __future__ import annotations
4
4
5
5
import fnmatch
6
+ from urllib .parse import urljoin
6
7
from collections import defaultdict
7
8
from itertools import chain
8
9
from pathlib import Path
9
- from typing import TYPE_CHECKING
10
+ from typing import TYPE_CHECKING , NamedTuple , cast
10
11
11
12
import mdformat
12
13
from bs4 import BeautifulSoup as Soup
13
14
from bs4 import Tag
14
15
from markdownify import ATX , MarkdownConverter
16
+ from mkdocs .structure .pages import Page
15
17
from mkdocs .config .defaults import MkDocsConfig
16
- from mkdocs .exceptions import PluginError
17
18
from mkdocs .plugins import BasePlugin
18
19
19
20
from mkdocs_llmstxt ._internal .config import _PluginConfig
31
32
_logger = _get_logger (__name__ )
32
33
33
34
35
class MDPageInfo(NamedTuple):
    """Markdown rendition of one documentation page, queued for the final build step."""

    # Text used as the link label for this page.
    title: str
    # Filesystem path the Markdown content is written to.
    path_md: Path
    # URL used as the link target for this page.
    md_url: str
    # Markdown text of the page.
    content: str
40
+
41
+
34
42
class MkdocsLLMsTxtPlugin (BasePlugin [_PluginConfig ]):
35
43
"""The MkDocs plugin to generate an `llms.txt` file.
36
44
@@ -47,6 +55,7 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
47
55
"""The global MkDocs configuration."""
48
56
49
57
def __init__ (self ) -> None :
58
+ self .md_pages : defaultdict [str , list [MDPageInfo ]] = defaultdict (list )
50
59
self .html_pages : dict [str , dict [str , str ]] = defaultdict (dict )
51
60
"""Dictionary to store the HTML contents of pages."""
52
61
@@ -72,6 +81,10 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
72
81
Returns:
73
82
The same, untouched config.
74
83
"""
84
+ if config .site_url is None :
85
+ raise ValueError (
86
+ "'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin"
87
+ )
75
88
self .mkdocs_config = config
76
89
return config
77
90
@@ -88,64 +101,117 @@ def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None: # no
88
101
Returns:
89
102
Modified collection or none.
90
103
"""
91
- for file in self .config .files :
92
- file ["inputs" ] = self ._expand_inputs (file ["inputs" ], page_uris = list (files .src_uris .keys ()))
104
+ page_uris = list (files .src_uris )
105
+
106
+ for section_name , file_list in list (self .config .sections .items ()):
107
+ self .config .sections [section_name ] = self ._expand_inputs (
108
+ file_list , page_uris = page_uris
109
+ )
110
+
93
111
return files
94
112
95
113
def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None:  # noqa: ARG002
    """Convert page content into a Markdown file and save the result to be processed in the `on_post_build` hook.

    Hook for the [`on_page_content` event](https://www.mkdocs.org/user-guide/plugins/#on_page_content).

    Parameters:
        html: The rendered HTML.
        page: The page object.

    Returns:
        The HTML that was passed in, unchanged.
    """
    for section_name, file_list in self.config.sections.items():
        # Only pages listed in this section are converted and recorded.
        if page.file.src_uri not in file_list:
            continue

        # The Markdown file sits next to the page's destination file, with an `.md` suffix.
        destination = Path(page.file.abs_dest_path).with_suffix(".md")
        markdown_text = generate_page_markdown(html, self.config.autoclean, self.config.preprocess)

        relative_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()
        if relative_url in (".", "./"):
            relative_url = ""
        # Guaranteed to exist as we require 'site_url' to be configured:
        base_url = cast(str, self.mkdocs_config.site_url)
        absolute_url = urljoin(base_url, relative_url)

        # Fall back to the source URI when the page has no title.
        label = page.title if page.title is not None else page.file.src_uri

        record = MDPageInfo(
            title=cast(str, label),
            path_md=destination,
            md_url=absolute_url,
            content=markdown_text,
        )
        self.md_pages[section_name].append(record)

    return html
110
151
111
def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None:  # noqa: ARG002
    """Create the final `llms.txt` file and the MD files for all selected pages.

    Hook for the [`on_post_build` event](https://www.mkdocs.org/user-guide/plugins/#on_post_build).

    The `llms.txt` file starts with the site name as a level-1 heading, followed by the
    site description as a blockquote and the plugin's `markdown_description` (each only
    when set), then one level-2 section per recorded section listing links to its pages.
    Each recorded page's Markdown content is written to its `path_md` along the way.

    Parameters:
        config: MkDocs configuration.
    """
    output_file = Path(config.site_dir).joinpath("llms.txt")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Collect the pieces and join once instead of growing a string with `+=`.
    header = [f"# {config.site_name}\n\n"]
    if config.site_description is not None:
        header.append(f"> {config.site_description}\n\n")
    if self.config.markdown_description is not None:
        header.append(f"{self.config.markdown_description}\n\n")

    sections: list[str] = []
    for section_name, file_list in self.md_pages.items():
        lines = [f"## {section_name}\n\n"]
        for page_title, path_md, md_url, content in file_list:
            _logger.debug(f"Generating MD file to {path_md}")
            # The destination directory may not exist yet (e.g. if the HTML page itself
            # was never written there), so create it before writing.
            path_md.parent.mkdir(parents=True, exist_ok=True)
            path_md.write_text(content, encoding="utf8")
            lines.append(f"- [{page_title}]({md_url})\n")
        sections.append("".join(lines))

    # Separate sections with a blank line so a list never runs straight into the next heading.
    markdown = "".join(header) + "\n".join(sections)

    output_file.write_text(markdown, encoding="utf8")
    _logger.info("Generated file /llms.txt")
180
+
181
+
182
def _language_callback(tag: Tag) -> str:
    """Return the code language named by a `language-*` CSS class on the tag or its parent.

    The tag's own classes are searched first, then the parent's (when there is a parent).

    Parameters:
        tag: The tag to inspect.

    Returns:
        The text following `language-` in the first matching class, or an empty string.
    """
    own_classes = tag.get("class") or ()
    parent_classes = (tag.parent.get("class") or ()) if tag.parent else ()
    matches = (
        # Strip the 9-character "language-" prefix.
        name[9:]
        for name in chain(own_classes, parent_classes)
        if name.startswith("language-")
    )
    return next(matches, "")
189
+
190
+
191
# Module-level converter instance used by `generate_page_markdown` to turn parsed HTML
# into Markdown; code-block languages are looked up through `_language_callback`.
_converter = MarkdownConverter(
    heading_style=ATX,
    bullets="-",
    escape_underscores=False,
    code_language_callback=_language_callback,
)
197
+
198
+
199
def generate_page_markdown(html: str, should_autoclean: bool, preprocess: str | None) -> str:
    """Convert HTML to Markdown.

    The HTML is parsed, optionally cleaned and preprocessed, converted with the
    module-level converter, and finally formatted with `mdformat` (no line wrapping).

    Parameters:
        html: The HTML content.
        should_autoclean: Whether to autoclean the HTML.
        preprocess: An optional path of a Python module containing a `preprocess` function.

    Returns:
        The Markdown content.
    """
    soup = Soup(html, "html.parser")
    # Check the caller's flag rather than the `autoclean` helper itself: a function
    # object is always truthy, which would make the cleanup unconditional.
    if should_autoclean:
        autoclean(soup)
    if preprocess:
        _preprocess(soup, preprocess, "llms.txt")
    return mdformat.text(_converter.convert_soup(soup), options={"wrap": "no"})
0 commit comments