diff --git a/README.md b/README.md index b080569..7626e72 100644 --- a/README.md +++ b/README.md @@ -119,3 +119,13 @@ def preprocess(soup: BeautifulSoup, output: str) -> None: The `output` argument lets you modify the soup *depending on which file is being generated*. Have a look at [our own cleaning function](https://pawamoy.github.io/mkdocs-llmstxt/reference/mkdocs_llmstxt/#mkdocs_llmstxt.autoclean) to get inspiration. + +## Fine-tuning + +You can fine-tune the output of the `llms-full.txt` file by setting the following configuration options: + +- `prefix_url_per_page` - (Optional) If set to `true`, the URL of each page is prepended to that page's content in the `llms-full.txt` file. This can be useful to provide context to the LLM. +- `use_section_separator` - (Optional) The string used to separate sections in the `llms-full.txt` file. It is wrapped with `\n` on both sides (if not empty). +- `use_section_pages_separator` - (Optional) The string used to separate pages within a section in the `llms-full.txt` file. It is wrapped with `\n` on both sides (if not empty). +- `prefix_url_base_url` - (Optional) The base URL used when building page URLs for the `llms-full.txt` file. If not set, `site_url` is used. +- `include_section_content_in_full_output` - (Optional) Set to `false` to omit the section headings (`# <section name>`) from the `llms-full.txt` file, so that only the page content is included. This can be useful if you want to keep the file size small. Defaults to `true`. 
\ No newline at end of file diff --git a/src/mkdocs_llmstxt/_internal/config.py b/src/mkdocs_llmstxt/_internal/config.py index a9f5877..7a3515c 100644 --- a/src/mkdocs_llmstxt/_internal/config.py +++ b/src/mkdocs_llmstxt/_internal/config.py @@ -14,3 +14,8 @@ class _PluginConfig(BaseConfig): markdown_description = mkconf.Optional(mkconf.Type(str)) full_output = mkconf.Optional(mkconf.Type(str)) sections = mkconf.DictOfItems(mkconf.ListOfItems(mkconf.Type(str))) + prefix_url_per_page = mkconf.Type(bool, default=False) + use_section_separator = mkconf.Optional(mkconf.Type(str)) + use_section_pages_separator = mkconf.Optional(mkconf.Type(str)) + prefix_url_base_url = mkconf.Optional(mkconf.Type(str)) + include_section_content_in_full_output = mkconf.Type(bool, default=True) diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py index e89f2bb..c01757a 100644 --- a/src/mkdocs_llmstxt/_internal/plugin.py +++ b/src/mkdocs_llmstxt/_internal/plugin.py @@ -132,6 +132,8 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None # Guaranteed to exist as we require `site_url` to be configured. 
base = cast("str", self.mkdocs_config.site_url) + if self.config.prefix_url_base_url: + base = cast("str", self.config.prefix_url_base_url) if not base.endswith("/"): base += "/" md_url = urljoin(base, md_url) @@ -165,7 +167,7 @@ def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None: # noqa if self.config.markdown_description is not None: markdown += f"{self.config.markdown_description}\n\n" - full_markdown = markdown + full_markdown = [markdown] for section_name, file_list in self.md_pages.items(): markdown += f"## {section_name}\n\n" @@ -181,9 +183,36 @@ def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None: # noqa if self.config.full_output is not None: full_output_file = Path(config.site_dir).joinpath(self.config.full_output) for section_name, file_list in self.md_pages.items(): - list_content = "\n".join(info.content for info in file_list) - full_markdown += f"# {section_name}\n\n{list_content}" - full_output_file.write_text(full_markdown, encoding="utf8") + section_parts = [] + section_content = [] + + if self.config.include_section_content_in_full_output: + section_parts = [f"# {section_name}"] + + # Process each file in the section + for info in file_list: + if self.config.prefix_url_per_page: + section_content.append(f"URL FOR THIS PAGE: {info.md_url}") + section_content.append(info.content) + if self.config.use_section_pages_separator: + section_content.append(f"\n{self.config.use_section_pages_separator}\n") + + # Join all section content and add to section parts + section_parts.append("\n".join(section_content)) + + # Add the section parts to the full markdown (joined by newlines) + full_markdown.append("\n\n".join(section_parts)) + + section_separator = "" + # If section content is included - use section separator, otherwise - page separator + # (as there are no sections) + if self.config.include_section_content_in_full_output: + if self.config.use_section_separator is not None: + section_separator = 
f"\n{self.config.use_section_separator}\n" + elif self.config.use_section_pages_separator: + section_separator = f"\n{self.config.use_section_pages_separator}\n" + full_markdown_as_string = section_separator.join(full_markdown) + full_output_file.write_text(full_markdown_as_string, encoding="utf8") _logger.debug(f"Generated file /{self.config.full_output}.txt")