Skip to content

Commit 8887e46

Browse files
committed
[TOOL-3562] Portal: Add llms.txt and llms-full.txt generation script (#6372)
<!-- start pr-codex -->

## PR-Codex overview

This PR focuses on enhancing the layout components and search data extraction in the application. It introduces new files for LLM content and modifies existing components to include additional properties for better indexing and rendering.

### Detailed summary

- Added `public/llms.txt` and `public/llms-full.txt` files.
- Removed `data-noindex` from `<div>` in multiple layout files.
- Added `noLLM` property to `DocLayout` in various components.
- Updated `extractSearchData` to return LLM content.
- Modified `extractContent` to handle new LLM data.
- Adjusted `ArticleCard` and `Code` components to include `data-noindex`.
- Updated dependencies in `package.json` and `pnpm-lock.yaml`.

> ✨ Ask PR-Codex anything about this PR by commenting with `/codex {your question}`

<!-- end pr-codex -->
1 parent 56301bc commit 8887e46

File tree

14 files changed

+384
-261
lines changed

14 files changed

+384
-261
lines changed

apps/portal/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ next-env.d.ts
3939

4040
# generated files
4141
searchIndex.json
42+
public/llms.txt
43+
public/llms-full.txt
4244

4345
.env
4446
public/sitemap*.xml

apps/portal/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@
3434
"date-fns": "4.1.0",
3535
"flexsearch": "^0.7.43",
3636
"github-slugger": "^2.0.0",
37+
"he": "^1.2.0",
3738
"lucide-react": "0.476.0",
3839
"next": "15.2.0",
3940
"nextjs-toploader": "^1.6.12",
41+
"node-html-markdown": "^1.3.0",
4042
"node-html-parser": "^6.1.13",
4143
"posthog-js": "1.67.1",
4244
"prettier": "3.3.3",
@@ -55,6 +57,7 @@
5557
"devDependencies": {
5658
"@next/eslint-plugin-next": "15.2.0",
5759
"@types/flexsearch": "^0.7.6",
60+
"@types/he": "^1.2.3",
5861
"@types/mdx": "^2.0.13",
5962
"@types/node": "22.13.5",
6063
"@types/react": "19.0.10",
Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import { writeFileSync } from "node:fs";
2-
import { extractSearchData } from "../src/app/api/search/extraction";
2+
import { extractContent } from "../src/app/api/search/extraction";
33

44
async function main() {
55
const rootDir = process.cwd();
6-
const websiteData = await extractSearchData(rootDir);
7-
writeFileSync("./searchIndex.json", JSON.stringify(websiteData, null, 2));
6+
const { searchData, llmContent, llmFullContent } =
7+
await extractContent(rootDir);
8+
writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
9+
writeFileSync("./public/llms.txt", llmContent);
10+
writeFileSync("./public/llms-full.txt", llmFullContent);
811
}
912

1013
main();

apps/portal/src/app/account/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/api/search/extraction/index.ts

Lines changed: 188 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import { readFile } from "node:fs/promises";
2+
import he from "he";
3+
import { NodeHtmlMarkdown } from "node-html-markdown";
24
import {
35
CommentNode as X_CommentNode,
46
HTMLElement as X_HTMLElement,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113
import { ignoreHeadings } from "./settings";
1214
import { trimExtraSpace } from "./trimExtraSpace";
1315

14-
export async function extractSearchData(rootDir: string): Promise<PageData[]> {
16+
type ExtractedContent = {
17+
searchData: PageData[];
18+
llmContent: string;
19+
llmFullContent: string;
20+
};
21+
22+
const llmsContentHeader = `\
23+
# thirdweb
24+
25+
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
26+
27+
## Docs
28+
`;
29+
30+
const llmsFullContentHeader = `\
31+
# thirdweb
32+
33+
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
34+
`;
35+
36+
export async function extractContent(
37+
rootDir: string,
38+
): Promise<ExtractedContent> {
1539
const nextOutputDir = `${rootDir}/.next/server/app`;
1640
const htmlFiles = getFilesRecursive(nextOutputDir, "html");
1741

1842
const pages: PageData[] = [];
43+
let llmContent = "";
44+
let llmFullContent = "";
1945

2046
const noMainFound: string[] = [];
2147
const noH1Found: string[] = [];
@@ -26,7 +52,7 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2652
const mainEl = parse(htmlContent, {
2753
comment: false,
2854
blockTextElements: {
29-
pre: false, // parse text inside <pre> elements instead of treating it as text
55+
pre: true,
3056
},
3157
}).querySelector("main");
3258

@@ -37,25 +63,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3763
return;
3864
}
3965

40-
const noIndex = mainEl.getAttribute("data-noindex");
41-
42-
if (noIndex) {
66+
if (mainEl.getAttribute("data-noindex") === "true") {
4367
return;
4468
}
4569

4670
const pageTitle = mainEl.querySelector("h1")?.text;
47-
4871
if (!pageTitle) {
4972
noH1Found.push(
5073
filePath.split(".next/server/app")[1]?.replace(".html", "") || "",
5174
);
5275
}
5376

54-
pages.push({
55-
href: filePath.replace(nextOutputDir, "").replace(".html", ""),
56-
title: pageTitle ? trimExtraSpace(pageTitle) : "",
57-
sections: getPageSections(mainEl),
58-
});
77+
// Important: collect the search index data first - the LLM extraction below mutates the main element in place (removes elements, retags headings, replaces <pre> blocks)
78+
// Extract search data
79+
const pageData = extractPageSearchData(
80+
mainEl,
81+
filePath,
82+
nextOutputDir,
83+
pageTitle,
84+
);
85+
if (pageData) {
86+
pages.push(pageData);
87+
}
88+
89+
// Extract LLM content
90+
const { links, full } = extractPageLLMContent(
91+
mainEl,
92+
pageTitle,
93+
filePath,
94+
nextOutputDir,
95+
);
96+
llmContent += links ? `${links}\n` : "";
97+
llmFullContent += full ? `${full}\n` : "";
5998
}),
6099
);
61100

@@ -77,13 +116,147 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77116
console.warn("\n");
78117
}
79118

80-
return pages;
119+
return {
120+
searchData: pages,
121+
llmContent: `${llmsContentHeader}\n${llmContent}`,
122+
llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
123+
};
81124
}
82125

83-
function getPageSections(main: X_HTMLElement): PageSectionData[] {
126+
function extractPageSearchData(
127+
main: X_HTMLElement,
128+
filePath: string,
129+
nextOutputDir: string,
130+
pageTitle: string | undefined,
131+
): PageData | null {
132+
if (main.getAttribute("data-noindex") === "true") {
133+
return null;
134+
}
135+
136+
return {
137+
href: filePath.replace(nextOutputDir, "").replace(".html", ""),
138+
title: pageTitle ? trimExtraSpace(pageTitle) : "",
139+
sections: getPageSectionsForSearchIndex(main),
140+
};
141+
}
142+
143+
function extractPageLLMContent(
144+
main: X_HTMLElement,
145+
pageTitle: string | undefined,
146+
filePath: string,
147+
nextOutputDir: string,
148+
): { links: string; full: string } {
149+
if (
150+
main.getAttribute("data-noindex") === "true" ||
151+
main.getAttribute("data-no-llm") === "true"
152+
) {
153+
return { links: "", full: "" };
154+
}
155+
156+
const htmlToMarkdown = new NodeHtmlMarkdown({
157+
keepDataImages: false,
158+
});
159+
160+
let linksContent = "";
161+
let fullContent = "";
162+
163+
const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");
164+
165+
// Get first non-empty paragraph for description
166+
const paragraphs = main.querySelectorAll("p");
167+
let description = "";
168+
for (const p of paragraphs) {
169+
// skip noindex or no-llm paragraphs
170+
if (
171+
p.getAttribute("data-noindex") === "true" ||
172+
p.getAttribute("data-no-llm") === "true"
173+
) {
174+
continue;
175+
}
176+
177+
description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
178+
if (description) {
179+
break;
180+
}
181+
}
182+
183+
linksContent += `* [${pageTitle}](${pageUrl}): ${description}`;
184+
185+
// Remove noindex and no-llm elements
186+
const contentElements = main.querySelectorAll("*");
187+
for (const element of contentElements) {
188+
if (
189+
element.getAttribute("data-noindex") === "true" ||
190+
element.getAttribute("data-no-llm") === "true"
191+
) {
192+
element.remove();
193+
}
194+
}
195+
196+
// Shift every heading one level down (h1 -> h2, h2 -> h3, etc.); h6 stays h6
197+
const headings = main.querySelectorAll("h1, h2, h3, h4, h5, h6");
198+
for (const heading of headings) {
199+
const headingLevel = Number.parseInt(heading.tagName.replace("H", ""));
200+
const newLevel = Math.min(headingLevel + 1, 6);
201+
heading.tagName = `H${newLevel}`;
202+
}
203+
204+
// Prefix all root-relative links (href starting with "/") with `https://portal.thirdweb.com`
205+
const links = main.querySelectorAll("a");
206+
for (const link of links) {
207+
const [path, hash] = link.getAttribute("href")?.split("#") || [];
208+
if (path?.startsWith("/")) {
209+
link.setAttribute(
210+
"href",
211+
`https://portal.thirdweb.com${path}${hash ? `#${hash}` : ""}`,
212+
);
213+
}
214+
}
215+
216+
// for code blocks inside pre tags -> make them direct descendants of the pre tag
217+
// so they are parsed as blocks by node-html-markdown + add language class
218+
const preTags = main.querySelectorAll("pre");
219+
for (const preTag of preTags) {
220+
const codeBlock = parse(preTag.innerHTML.toString(), {
221+
comment: false,
222+
blockTextElements: {
223+
pre: true,
224+
},
225+
}).querySelector("code");
226+
227+
if (codeBlock) {
228+
const code = codeBlock
229+
.querySelectorAll("div > div > div > div")
230+
.map((x) => x.textContent)
231+
.join("\n")
232+
.trim();
233+
234+
const lang = codeBlock.getAttribute("lang");
235+
codeBlock.textContent = code;
236+
237+
const newCodePreBlock = parse(
238+
`<pre><code class=${lang ? `language-${lang}` : ""}>${he.encode(code)}</code></pre>`,
239+
);
240+
241+
preTag.replaceWith(newCodePreBlock);
242+
}
243+
}
244+
245+
// Convert the cleaned HTML to markdown
246+
fullContent += `${htmlToMarkdown.translate(main.toString())}`;
247+
248+
return {
249+
links: linksContent,
250+
full: fullContent,
251+
};
252+
}
253+
254+
function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
84255
const sectionData: PageSectionData[] = [];
85256

86-
const ignoreTags = new Set(["code", "nav"].map((t) => t.toUpperCase()));
257+
const ignoreTags = new Set(
258+
["code", "nav", "pre"].map((t) => t.toUpperCase()),
259+
);
87260

88261
function collector(node: X_Node) {
89262
if (node instanceof X_CommentNode) {
@@ -94,9 +267,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94267
return;
95268
}
96269

97-
const noIndexAttribute = node.getAttribute("data-noindex");
98-
99-
if (noIndexAttribute === "true") {
270+
if (node.getAttribute("data-noindex") === "true") {
100271
return;
101272
}
102273

apps/portal/src/app/cli/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/page.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import DocsHeroLight from "./_images/docs-hero-light.png";
66

77
export default function Page() {
88
return (
9-
<main className="container max-w-[900px] grow pb-20">
9+
<main className="container max-w-[900px] grow pb-20" data-noindex>
1010
<Hero />
1111
<div className="grid grid-cols-1 gap-8">
1212
<FrontendSection />

apps/portal/src/app/react-native/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/react/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/typescript/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

0 commit comments

Comments
 (0)