[TOOL-3562] Portal: Add llms.txt and llms-full.txt generation script (#6372)

MananTank · MananTank · commit 8887e46a009f · 2025-02-28T19:13:59.000Z
<!-- start pr-codex -->

## PR-Codex overview
This PR adds build-time generation of `llms.txt` and `llms-full.txt` for the portal. To support this, it extends the search data extraction so that it also produces LLM-oriented content, and it updates the layout components with additional properties that control what is indexed and rendered.

### Detailed summary
- Added generated `public/llms.txt` and `public/llms-full.txt` files (both are git-ignored build outputs).
- Removed `data-noindex` from `<div>` in multiple layout files.
- Added `noLLM` property to `DocLayout` in various components.
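- Renamed `extractSearchData` to `extractContent`; it now returns LLM content alongside the search data.
- Updated the `scripts/extractSearchData.ts` script to write the new LLM data to `public/llms.txt` and `public/llms-full.txt`.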
- Adjusted `ArticleCard` and `Code` components to include `data-noindex`.
- Updated dependencies in `package.json` and `pnpm-lock.yaml`.

> ✨ Ask PR-Codex anything about this PR by commenting with `/codex {your question}`

<!-- end pr-codex -->
diff --git a/apps/portal/.gitignore b/apps/portal/.gitignore
@@ -39,6 +39,8 @@ next-env.d.ts
 
 # generated files
 searchIndex.json
+public/llms.txt
+public/llms-full.txt
 
 .env
 public/sitemap*.xml
diff --git a/apps/portal/package.json b/apps/portal/package.json
@@ -34,9 +34,11 @@
     "date-fns": "4.1.0",
     "flexsearch": "^0.7.43",
     "github-slugger": "^2.0.0",
+    "he": "^1.2.0",
     "lucide-react": "0.476.0",
     "next": "15.2.0",
     "nextjs-toploader": "^1.6.12",
+    "node-html-markdown": "^1.3.0",
     "node-html-parser": "^6.1.13",
     "posthog-js": "1.67.1",
     "prettier": "3.3.3",
@@ -55,6 +57,7 @@
   "devDependencies": {
     "@next/eslint-plugin-next": "15.2.0",
     "@types/flexsearch": "^0.7.6",
+    "@types/he": "^1.2.3",
     "@types/mdx": "^2.0.13",
     "@types/node": "22.13.5",
     "@types/react": "19.0.10",
diff --git a/apps/portal/scripts/extractSearchData.ts b/apps/portal/scripts/extractSearchData.ts
@@ -1,10 +1,13 @@
 import { writeFileSync } from "node:fs";
-import { extractSearchData } from "../src/app/api/search/extraction";
+import { extractContent } from "../src/app/api/search/extraction";
 
 async function main() {
   const rootDir = process.cwd();
-  const websiteData = await extractSearchData(rootDir);
-  writeFileSync("./searchIndex.json", JSON.stringify(websiteData, null, 2));
+  const { searchData, llmContent, llmFullContent } =
+    await extractContent(rootDir);
+  writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
+  writeFileSync("./public/llms.txt", llmContent);
+  writeFileSync("./public/llms-full.txt", llmFullContent);
 }
 
 main();
diff --git a/apps/portal/src/app/account/layout.tsx b/apps/portal/src/app/account/layout.tsx
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
 
 export default async function Layout(props: { children: React.ReactNode }) {
   return (
-    <DocLayout sideBar={sidebar} editPageButton={true}>
+    <DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
       {props.children}
     </DocLayout>
   );
diff --git a/apps/portal/src/app/api/search/extraction/index.ts b/apps/portal/src/app/api/search/extraction/index.ts
@@ -1,4 +1,6 @@
 import { readFile } from "node:fs/promises";
+import he from "he";
+import { NodeHtmlMarkdown } from "node-html-markdown";
 import {
   CommentNode as X_CommentNode,
   HTMLElement as X_HTMLElement,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
 import { ignoreHeadings } from "./settings";
 import { trimExtraSpace } from "./trimExtraSpace";
 
-export async function extractSearchData(rootDir: string): Promise<PageData[]> {
+type ExtractedContent = { // everything a single extractContent() run returns
+  searchData: PageData[]; // search-index entries, one per page that was indexed
+  llmContent: string; // full text written to public/llms.txt (header + one link line per page)
+  llmFullContent: string; // full text written to public/llms-full.txt (header + per-page markdown)
+};
+
+const llmsContentHeader = `\
+# thirdweb
+
+> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
+
+## Docs
+`; // preamble of llms.txt; the per-page link entries are appended after the "## Docs" heading
+
+const llmsFullContentHeader = `\
+# thirdweb
+
+> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
+`; // preamble of llms-full.txt; the per-page markdown is appended after it
+
+export async function extractContent(
+  rootDir: string,
+): Promise<ExtractedContent> {
   const nextOutputDir = `${rootDir}/.next/server/app`;
   const htmlFiles = getFilesRecursive(nextOutputDir, "html");
 
   const pages: PageData[] = [];
+  let llmContent = "";
+  let llmFullContent = "";
 
   const noMainFound: string[] = [];
   const noH1Found: string[] = [];
@@ -26,7 +52,7 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
       const mainEl = parse(htmlContent, {
         comment: false,
         blockTextElements: {
-          pre: false, // parse text inside <pre> elements instead of treating it as text
+          pre: true,
         },
       }).querySelector("main");
 
@@ -37,25 +63,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
         return;
       }
 
-      const noIndex = mainEl.getAttribute("data-noindex");
-
-      if (noIndex) {
+      if (mainEl.getAttribute("data-noindex") === "true") {
         return;
       }
 
       const pageTitle = mainEl.querySelector("h1")?.text;
-
       if (!pageTitle) {
         noH1Found.push(
           filePath.split(".next/server/app")[1]?.replace(".html", "") || "",
         );
       }
 
-      pages.push({
-        href: filePath.replace(nextOutputDir, "").replace(".html", ""),
-        title: pageTitle ? trimExtraSpace(pageTitle) : "",
-        sections: getPageSections(mainEl),
-      });
+      // Important: do the search index collection first - we will modify the main element in the next step
+      // Extract search data
+      const pageData = extractPageSearchData(
+        mainEl,
+        filePath,
+        nextOutputDir,
+        pageTitle,
+      );
+      if (pageData) {
+        pages.push(pageData);
+      }
+
+      // Extract LLM content
+      const { links, full } = extractPageLLMContent(
+        mainEl,
+        pageTitle,
+        filePath,
+        nextOutputDir,
+      );
+      llmContent += links ? `${links}\n` : "";
+      llmFullContent += full ? `${full}\n` : "";
     }),
   );
 
@@ -77,13 +116,147 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
     console.warn("\n");
   }
 
-  return pages;
+  return {
+    searchData: pages,
+    llmContent: `${llmsContentHeader}\n${llmContent}`,
+    llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
+  };
 }
 
-function getPageSections(main: X_HTMLElement): PageSectionData[] {
+/**
+ * Builds the search-index record for a single rendered page.
+ *
+ * @param main - The page's `<main>` element.
+ * @param filePath - Path of the rendered `.html` file.
+ * @param nextOutputDir - Output root; stripped from `filePath` to form the page href.
+ * @param pageTitle - Text of the page's `<h1>`, if one was found.
+ * @returns The page record, or `null` when `<main>` carries `data-noindex="true"`.
+ */
+function extractPageSearchData(
+  main: X_HTMLElement,
+  filePath: string,
+  nextOutputDir: string,
+  pageTitle: string | undefined,
+): PageData | null {
+  const optedOut = main.getAttribute("data-noindex") === "true";
+  if (optedOut) {
+    return null;
+  }
+
+  // Same evaluation order as the record's fields: href, title, then sections.
+  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
+  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
+  const sections = getPageSectionsForSearchIndex(main);
+
+  return { href, title, sections };
+}
+
+/** True when the element opts out of LLM output via `data-noindex` or `data-no-llm`. */
+function isExcludedFromLLM(element: X_HTMLElement): boolean {
+  return (
+    element.getAttribute("data-noindex") === "true" ||
+    element.getAttribute("data-no-llm") === "true"
+  );
+}
+
+/**
+ * Builds the LLM-facing output for a single rendered page.
+ *
+ * NOTE: `main` is mutated in place (elements are removed, headings retagged,
+ * links and code blocks rewritten), so anything that needs the original markup
+ * must read it before calling this function.
+ *
+ * @param main - The page's `<main>` element.
+ * @param pageTitle - Text of the page's `<h1>`, if one was found.
+ * @param filePath - Path of the rendered `.html` file.
+ * @param nextOutputDir - Output root; stripped from `filePath` to form the page URL.
+ * @returns `links` is one markdown list entry for the page and `full` is the
+ *   page converted to markdown. Both are empty strings when `<main>` itself is
+ *   marked `data-noindex` or `data-no-llm`.
+ */
+function extractPageLLMContent(
+  main: X_HTMLElement,
+  pageTitle: string | undefined,
+  filePath: string,
+  nextOutputDir: string,
+): { links: string; full: string } {
+  if (isExcludedFromLLM(main)) {
+    return { links: "", full: "" };
+  }
+
+  const htmlToMarkdown = new NodeHtmlMarkdown({
+    keepDataImages: false,
+  });
+
+  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");
+
+  // Remove excluded elements first, so that nothing nested inside them can be
+  // picked up as the page description or end up in the full content.
+  for (const element of main.querySelectorAll("*")) {
+    if (isExcludedFromLLM(element)) {
+      element.remove();
+    }
+  }
+
+  // Prefix site-relative links with the portal origin. The href is kept whole
+  // (including any hash), and protocol-relative `//host/...` URLs are left
+  // alone because they already point at another origin.
+  for (const link of main.querySelectorAll("a")) {
+    const href = link.getAttribute("href");
+    if (href?.startsWith("/") && !href.startsWith("//")) {
+      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
+    }
+  }
+
+  // The first paragraph that yields non-empty markdown becomes the description.
+  let description = "";
+  for (const p of main.querySelectorAll("p")) {
+    description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
+    if (description) {
+      break;
+    }
+  }
+
+  // Fall back to the URL when the page has no usable <h1>, instead of emitting
+  // the literal text "undefined" as the link label.
+  const linkTitle = (pageTitle ? trimExtraSpace(pageTitle) : "") || pageUrl;
+  const linksContent = `* [${linkTitle}](${pageUrl})${description ? `: ${description}` : ""}`;
+
+  // Shift every heading one level down (h1 > h2, h2 > h3, ...), capped at h6.
+  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
+    const headingLevel = Number.parseInt(heading.tagName.replace("H", ""), 10);
+    heading.tagName = `H${Math.min(headingLevel + 1, 6)}`;
+  }
+
+  // Rebuild each <pre> as <pre><code class="language-x">plain text</code></pre>
+  // so that node-html-markdown renders it as a fenced block with a language.
+  for (const preTag of main.querySelectorAll("pre")) {
+    // The <pre> content is re-parsed here to locate the nested <code> element.
+    const codeBlock = parse(preTag.innerHTML, {
+      comment: false,
+      blockTextElements: {
+        pre: true,
+      },
+    }).querySelector("code");
+
+    if (!codeBlock) {
+      continue;
+    }
+
+    // Presumably these nested divs are the per-line wrappers rendered by the
+    // Code component - verify against Code.tsx. When none match, use the raw
+    // text of <code> rather than emitting an empty code block.
+    const lineElements = codeBlock.querySelectorAll("div > div > div > div");
+    const code = (
+      lineElements.length > 0
+        ? lineElements.map((x) => x.textContent).join("\n")
+        : codeBlock.textContent
+    ).trim();
+
+    // Quote and escape the class value; omit the attribute when no language is set.
+    const lang = codeBlock.getAttribute("lang");
+    const classAttr = lang ? ` class="language-${he.encode(lang)}"` : "";
+
+    preTag.replaceWith(
+      parse(`<pre><code${classAttr}>${he.encode(code)}</code></pre>`),
+    );
+  }
+
+  // Convert the cleaned HTML to markdown.
+  return {
+    links: linksContent,
+    full: htmlToMarkdown.translate(main.toString()),
+  };
+}
+
+function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
   const sectionData: PageSectionData[] = [];
 
-  const ignoreTags = new Set(["code", "nav"].map((t) => t.toUpperCase()));
+  const ignoreTags = new Set(
+    ["code", "nav", "pre"].map((t) => t.toUpperCase()),
+  );
 
   function collector(node: X_Node) {
     if (node instanceof X_CommentNode) {
@@ -94,9 +267,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
         return;
       }
 
-      const noIndexAttribute = node.getAttribute("data-noindex");
-
-      if (noIndexAttribute === "true") {
+      if (node.getAttribute("data-noindex") === "true") {
         return;
       }
 
diff --git a/apps/portal/src/app/cli/layout.tsx b/apps/portal/src/app/cli/layout.tsx
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
 
 export default async function Layout(props: { children: React.ReactNode }) {
   return (
-    <DocLayout sideBar={sidebar} editPageButton={true}>
+    <DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
       {props.children}
     </DocLayout>
   );
diff --git a/apps/portal/src/app/page.tsx b/apps/portal/src/app/page.tsx
@@ -6,7 +6,7 @@ import DocsHeroLight from "./_images/docs-hero-light.png";
 
 export default function Page() {
   return (
-    <main className="container max-w-[900px] grow pb-20">
+    <main className="container max-w-[900px] grow pb-20" data-noindex>
       <Hero />
       <div className="grid grid-cols-1 gap-8">
         <FrontendSection />
diff --git a/apps/portal/src/app/react-native/v5/layout.tsx b/apps/portal/src/app/react-native/v5/layout.tsx
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
         </div>
       }
     >
-      <div data-noindex>{props.children}</div>
+      <div>{props.children}</div>
     </DocLayout>
   );
 }
diff --git a/apps/portal/src/app/react/v5/layout.tsx b/apps/portal/src/app/react/v5/layout.tsx
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
         </div>
       }
     >
-      <div data-noindex>{props.children}</div>
+      <div>{props.children}</div>
     </DocLayout>
   );
 }
diff --git a/apps/portal/src/app/typescript/v5/layout.tsx b/apps/portal/src/app/typescript/v5/layout.tsx
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
         </div>
       }
     >
-      <div data-noindex>{props.children}</div>
+      <div>{props.children}</div>
     </DocLayout>
   );
 }
diff --git a/apps/portal/src/components/Document/Cards/ArticleCard.tsx b/apps/portal/src/components/Document/Cards/ArticleCard.tsx
@@ -11,6 +11,7 @@ export function ArticleCard(props: {
   const isExternal = props.href.startsWith("http");
   return (
     <Link
+      data-noindex
       href={props.href}
       className="flex cursor-default bg-card"
       target={isExternal ? "_blank" : undefined}
@@ -38,6 +39,7 @@ export function ArticleIconCard(props: {
   const isExternal = props.href.startsWith("http");
   return (
     <Link
+      data-noindex
       href={props.href}
       className={cn(
         "flex items-center gap-4 rounded-lg border bg-card p-4 transition-colors hover:border-active-border",
diff --git a/apps/portal/src/components/Document/Code.tsx b/apps/portal/src/components/Document/Code.tsx
diff --git a/apps/portal/src/components/Layouts/DocLayout.tsx b/apps/portal/src/components/Layouts/DocLayout.tsx
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {`
`15`	`15`	`</div>`
`16`	`16`	`}`
`17`	`17`	`>`
`18`		`- <div data-noindex>{props.children}</div>`
	`18`	`+ <div>{props.children}</div>`
`19`	`19`	`</DocLayout>`
`20`	`20`	`);`
`21`	`21`	`}`