1
1
import { readFile } from "node:fs/promises" ;
2
+ import he from "he" ;
3
+ import { NodeHtmlMarkdown } from "node-html-markdown" ;
2
4
import {
3
5
CommentNode as X_CommentNode ,
4
6
HTMLElement as X_HTMLElement ,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
11
13
import { ignoreHeadings } from "./settings" ;
12
14
import { trimExtraSpace } from "./trimExtraSpace" ;
13
15
14
- export async function extractSearchData ( rootDir : string ) : Promise < PageData [ ] > {
16
/**
 * Everything `extractContent` produces from a single pass over the built HTML pages.
 */
type ExtractedContent = {
  /** One entry per indexed page (href, title and sections) for the search index. */
  searchData: PageData[];
  /** Link-index document: a fixed header followed by the per-page link lines. */
  llmContent: string;
  /** Full-content document: a fixed header followed by each page's content as markdown. */
  llmFullContent: string;
};
21
+
22
/**
 * Header of the full-content LLM document: project title plus the one-line tagline.
 * The leading `\` after the opening backtick suppresses the first newline.
 */
const llmsFullContentHeader = `\
# thirdweb

> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
`;

/**
 * Header of the link-index LLM document: the same title and tagline, followed by
 * the "Docs" section heading under which the per-page links are appended.
 */
const llmsContentHeader = `${llmsFullContentHeader}
## Docs
`;
35
+
36
+ export async function extractContent (
37
+ rootDir : string ,
38
+ ) : Promise < ExtractedContent > {
15
39
const nextOutputDir = `${ rootDir } /.next/server/app` ;
16
40
const htmlFiles = getFilesRecursive ( nextOutputDir , "html" ) ;
17
41
18
42
const pages : PageData [ ] = [ ] ;
43
+ let llmContent = "" ;
44
+ let llmFullContent = "" ;
19
45
20
46
const noMainFound : string [ ] = [ ] ;
21
47
const noH1Found : string [ ] = [ ] ;
@@ -26,7 +52,7 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
26
52
const mainEl = parse ( htmlContent , {
27
53
comment : false ,
28
54
blockTextElements : {
29
- pre : false , // parse text inside <pre> elements instead of treating it as text
55
+ pre : true ,
30
56
} ,
31
57
} ) . querySelector ( "main" ) ;
32
58
@@ -37,25 +63,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
37
63
return ;
38
64
}
39
65
40
- const noIndex = mainEl . getAttribute ( "data-noindex" ) ;
41
-
42
- if ( noIndex ) {
66
+ if ( mainEl . getAttribute ( "data-noindex" ) === "true" ) {
43
67
return ;
44
68
}
45
69
46
70
const pageTitle = mainEl . querySelector ( "h1" ) ?. text ;
47
-
48
71
if ( ! pageTitle ) {
49
72
noH1Found . push (
50
73
filePath . split ( ".next/server/app" ) [ 1 ] ?. replace ( ".html" , "" ) || "" ,
51
74
) ;
52
75
}
53
76
54
- pages . push ( {
55
- href : filePath . replace ( nextOutputDir , "" ) . replace ( ".html" , "" ) ,
56
- title : pageTitle ? trimExtraSpace ( pageTitle ) : "" ,
57
- sections : getPageSections ( mainEl ) ,
58
- } ) ;
77
+ // Important: do the search index collection first - we will modify the main element in the next step
78
+ // Extract search data
79
+ const pageData = extractPageSearchData (
80
+ mainEl ,
81
+ filePath ,
82
+ nextOutputDir ,
83
+ pageTitle ,
84
+ ) ;
85
+ if ( pageData ) {
86
+ pages . push ( pageData ) ;
87
+ }
88
+
89
+ // Extract LLM content
90
+ const { links, full } = extractPageLLMContent (
91
+ mainEl ,
92
+ pageTitle ,
93
+ filePath ,
94
+ nextOutputDir ,
95
+ ) ;
96
+ llmContent += links ? `${ links } \n` : "" ;
97
+ llmFullContent += full ? `${ full } \n` : "" ;
59
98
} ) ,
60
99
) ;
61
100
@@ -77,13 +116,147 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77
116
console . warn ( "\n" ) ;
78
117
}
79
118
80
- return pages ;
119
+ return {
120
+ searchData : pages ,
121
+ llmContent : `${ llmsContentHeader } \n${ llmContent } ` ,
122
+ llmFullContent : `${ llmsFullContentHeader } \n${ llmFullContent } ` ,
123
+ } ;
81
124
}
82
125
83
- function getPageSections ( main : X_HTMLElement ) : PageSectionData [ ] {
126
/**
 * Builds the search-index entry for a single page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the HTML file the element was parsed from.
 * @param nextOutputDir - Directory prefix to strip from `filePath` to obtain the href.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @returns The page entry, or `null` when the page is marked `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  if (main.getAttribute("data-noindex") === "true") {
    return null;
  }

  // Strip the output directory only when it is the actual prefix, and the
  // extension only when it is the actual suffix, so that a path segment that
  // merely contains ".html" (or the directory name) is left untouched.
  const relativePath = filePath.startsWith(nextOutputDir)
    ? filePath.slice(nextOutputDir.length)
    : filePath.replace(nextOutputDir, "");

  return {
    href: relativePath.replace(/\.html$/, ""),
    title: pageTitle ? trimExtraSpace(pageTitle) : "",
    sections: getPageSectionsForSearchIndex(main),
  };
}
142
+
143
/**
 * Builds the LLM-facing output for a single page.
 *
 * NOTE: this mutates `main` in place (removes excluded elements, shifts heading
 * levels, rewrites link hrefs, replaces `<pre>` blocks), so anything that needs
 * the original DOM must run before this function.
 *
 * @param main - The page's `<main>` element.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @param filePath - Path of the HTML file the element was parsed from.
 * @param nextOutputDir - Directory prefix to strip from `filePath` to obtain the page URL.
 * @returns `links`: a single markdown list item linking to the page, followed by
 *   its description when one exists; `full`: the cleaned page converted to markdown.
 *   Both are empty strings when the page is marked `data-noindex="true"` or
 *   `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  const isExcluded = (el: X_HTMLElement): boolean =>
    el.getAttribute("data-noindex") === "true" ||
    el.getAttribute("data-no-llm") === "true";

  if (isExcluded(main)) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
  });

  // Only strip ".html" when it is the file extension, not a substring of the path.
  const pageUrl = filePath.replace(nextOutputDir, "").replace(/\.html$/, "");

  // Remove excluded elements BEFORE picking the description, so a paragraph
  // nested inside an excluded container can never end up in the link line.
  for (const element of main.querySelectorAll("*")) {
    if (isExcluded(element)) {
      element.remove();
    }
  }

  // First paragraph that yields non-empty markdown becomes the description.
  let description = "";
  for (const paragraph of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(paragraph.toString()));
    if (description) {
      break;
    }
  }

  // Normalise the title like the search index does, and never emit the
  // literal text "undefined" as a link label when the page has no <h1>.
  const linkLabel = pageTitle ? trimExtraSpace(pageTitle) : pageUrl;
  const linksContent = `* [${linkLabel}](${pageUrl})${description ? `: ${description}` : ""}`;

  // Shift all heading elements one step down (h1 > h2, h2 > h3, ...), capped at h6.
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const level = Number.parseInt(heading.tagName.replace("H", ""), 10);
    heading.tagName = `H${Math.min(level + 1, 6)}`;
  }

  // Prefix root-relative links with the portal origin. Protocol-relative
  // ("//host/...") links already name a host and must be left alone; the full
  // href (query string and fragment included) is preserved as-is.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
    }
  }

  // For code blocks inside <pre> tags: rebuild them as <pre><code>text</code></pre>
  // so node-html-markdown parses them as fenced blocks, and carry the language
  // over as a `language-*` class.
  for (const preTag of main.querySelectorAll("pre")) {
    const codeBlock = parse(preTag.innerHTML, {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // Each rendered source line is expected at this nesting depth; if the
    // structure differs, fall back to the element's plain text rather than
    // silently replacing the block with an empty one.
    const lineElements = codeBlock.querySelectorAll("div > div > div > div");
    const code = (
      lineElements.length > 0
        ? lineElements.map((line) => line.textContent).join("\n")
        : codeBlock.textContent
    ).trim();

    // Quote and escape the attribute, and omit it entirely when no language is set.
    const lang = codeBlock.getAttribute("lang");
    const classAttribute = lang ? ` class="language-${he.encode(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttribute}>${he.encode(code)}</code></pre>`),
    );
  }

  return {
    links: linksContent,
    // Convert the cleaned HTML to markdown.
    full: htmlToMarkdown.translate(main.toString()),
  };
}
253
+
254
+ function getPageSectionsForSearchIndex ( main : X_HTMLElement ) : PageSectionData [ ] {
84
255
const sectionData : PageSectionData [ ] = [ ] ;
85
256
86
- const ignoreTags = new Set ( [ "code" , "nav" ] . map ( ( t ) => t . toUpperCase ( ) ) ) ;
257
+ const ignoreTags = new Set (
258
+ [ "code" , "nav" , "pre" ] . map ( ( t ) => t . toUpperCase ( ) ) ,
259
+ ) ;
87
260
88
261
function collector ( node : X_Node ) {
89
262
if ( node instanceof X_CommentNode ) {
@@ -94,9 +267,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94
267
return ;
95
268
}
96
269
97
- const noIndexAttribute = node . getAttribute ( "data-noindex" ) ;
98
-
99
- if ( noIndexAttribute === "true" ) {
270
+ if ( node . getAttribute ( "data-noindex" ) === "true" ) {
100
271
return ;
101
272
}
102
273
0 commit comments