#!/usr/bin/env node
/* eslint-disable no-console */
+import {ListObjectsV2Command, PutObjectCommand, S3Client} from '@aws-sdk/client-s3';
import {selectAll} from 'hast-util-select';
import {createHash} from 'node:crypto';
import {createReadStream, createWriteStream, existsSync} from 'node:fs';
-import {mkdir, opendir, readFile, rm} from 'node:fs/promises';
+import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
import {cpus} from 'node:os';
import * as path from 'node:path';
-import {Readable} from 'node:stream';
+import {compose, Readable} from 'node:stream';
+import {text} from 'node:stream/consumers';
import {pipeline} from 'node:stream/promises';
import {fileURLToPath} from 'node:url';
import {isMainThread, parentPort, Worker, workerData} from 'node:worker_threads';
@@ -23,17 +25,40 @@ import {unified} from 'unified';
import {remove} from 'unist-util-remove';

const CACHE_COMPRESS_LEVEL = 4;
+const R2_BUCKET = 'sentry-docs';
+const accessKeyId = process.env.R2_ACCESS_KEY_ID;
+const secretAccessKey = process.env.R2_SECRET_ACCESS_KEY;
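+// If the R2 credentials are not set in the environment, the R2 sync below is skipped
+// and the converted files are only written locally.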

-function taskFinishHandler(data) {
-  if (data.failedTasks.length === 0) {
-    console.log(
-      `💰 Worker[${data.id}]: Cache hits: ${data.cacheHits} (${Math.round((data.cacheHits / data.success) * 100)}%)`
-    );
-    console.log(`✅ Worker[${data.id}]: converted ${data.success} files successfully.`);
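+// Cloudflare R2 exposes an S3-compatible API, so the stock AWS SDK client works once it
+// is pointed at the account-specific R2 endpoint.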
+function getS3Client() {
+  return new S3Client({
+    endpoint: 'https://773afa1f62ff86c80db4f24f7ff1e9c8.r2.cloudflarestorage.com',
+    region: 'auto',
+    credentials: {
+      accessKeyId,
+      secretAccessKey,
+    },
+    retryMode: 'adaptive',
+  });
+}
+
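+// Uploads a single converted Markdown file to the R2 bucket under its path relative to
+// the output directory.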
+async function uploadToCFR2(s3Client, relativePath, data) {
+  const command = new PutObjectCommand({
+    Bucket: R2_BUCKET,
+    Key: relativePath,
+    Body: data,
+    ContentType: 'text/markdown',
+  });
+  await s3Client.send(command);
+  return;
+}
+
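+// Logs a per-worker summary; returns true when the worker reported failures so the
+// caller can fail the build.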
+function taskFinishHandler({id, success, failedTasks}) {
+  if (failedTasks.length === 0) {
+    console.log(`✅ Worker[${id}]: converted ${success} files successfully.`);
    return false;
  }
-  console.error(`❌ Worker[${data.id}]: ${data.failedTasks.length} files failed:`);
-  console.error(data.failedTasks);
+  console.error(`❌ Worker[${id}]: ${failedTasks.length} files failed:`);
+  console.error(failedTasks);
  return true;
}

@@ -68,13 +93,34 @@ async function createWork() {
  const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
  const workerTasks = new Array(numWorkers).fill(null).map(() => []);

+  // Map of object key -> ETag for everything already uploaded to R2.
+  let existingFilesOnR2 = null;
+  if (accessKeyId && secretAccessKey) {
+    existingFilesOnR2 = new Map();
+    console.log(`☁️ Getting existing hashes from R2...`);
+    const s3Client = getS3Client();
+    let continuationToken = undefined;
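+    // ListObjectsV2 returns at most 1000 keys per call, so page through with the
+    // continuation token until the full listing has been collected.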
+    do {
+      const response = await s3Client.send(
+        new ListObjectsV2Command({
+          Bucket: R2_BUCKET,
+          ContinuationToken: continuationToken,
+        })
+      );
+      continuationToken = response.NextContinuationToken;
+      for (const {Key, ETag} of response.Contents ?? []) {
+        existingFilesOnR2.set(Key, ETag.slice(1, -1)); // Remove quotes from ETag
+      }
+    } while (continuationToken);
+    console.log(`✅ Found ${existingFilesOnR2.size} existing files on R2.`);
+  }
+
  console.log(`🔎 Discovering files to convert...`);

  let numFiles = 0;
  let workerIdx = 0;
  // Need a high buffer size here otherwise Node skips some subdirectories!
  // See https://github.com/nodejs/node/issues/48820
  const dir = await opendir(INPUT_DIR, {recursive: true, bufferSize: 1024});
+
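+  // Walk the input directory and assign each HTML file to a worker task list round-robin.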
  for await (const dirent of dir) {
    if (dirent.name.endsWith('.html') && dirent.isFile()) {
      const sourcePath = path.join(dirent.parentPath || dirent.path, dirent.name);
@@ -84,7 +130,13 @@ async function createWork() {
      );
      await mkdir(targetDir, {recursive: true});
      const targetPath = path.join(targetDir, dirent.name.slice(0, -5) + '.md');
-      workerTasks[workerIdx].push({sourcePath, targetPath});
+      const relativePath = path.relative(OUTPUT_DIR, targetPath);
+      workerTasks[workerIdx].push({
+        sourcePath,
+        targetPath,
+        relativePath,
+        r2Hash: existingFilesOnR2 ? existingFilesOnR2.get(relativePath) : null,
+      });
      workerIdx = (workerIdx + 1) % numWorkers;
      numFiles++;
    }
@@ -96,7 +148,12 @@ async function createWork() {
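+  // Spawn numWorkers - 1 worker threads; the main thread works through the last task
+  // list itself (see below).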
  const workerPromises = new Array(numWorkers - 1).fill(null).map((_, id) => {
    return new Promise((resolve, reject) => {
      const worker = new Worker(selfPath, {
-        workerData: {id, noCache, cacheDir: CACHE_DIR, tasks: workerTasks[id]},
+        workerData: {
+          id,
+          noCache,
+          cacheDir: CACHE_DIR,
+          tasks: workerTasks[id],
+        },
      });
      let hasErrors = false;
      worker.on('message', data => (hasErrors = taskFinishHandler(data)));
@@ -113,10 +170,10 @@ async function createWork() {
  // The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
  workerPromises.push(
    processTaskList({
-      noCache,
-      cacheDir: CACHE_DIR,
-      tasks: workerTasks[workerTasks.length - 1],
      id: workerTasks.length - 1,
+      tasks: workerTasks[workerTasks.length - 1],
+      cacheDir: CACHE_DIR,
+      noCache,
    }).then(data => {
      if (taskFinishHandler(data)) {
        throw new Error(`Worker[${data.id}] had some errors.`);
@@ -133,25 +190,24 @@ async function createWork() {
const md5 = data => createHash('md5').update(data).digest('hex');

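+// Converts a single HTML file to Markdown, serving from the local brotli cache when
+// possible. Returns the generated Markdown plus a cacheHit flag; the caller needs the
+// data for the R2 hash comparison and the flag for cache statistics.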
async function genMDFromHTML(source, target, {cacheDir, noCache}) {
-  const text = (await readFile(source, {encoding: 'utf8'}))
+  const leanHTML = (await readFile(source, {encoding: 'utf8'}))
    // Remove all script tags, as they are not needed in markdown
    // and they are not stable across builds, causing cache misses
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
-  const hash = md5(text);
-  const cacheFile = path.join(cacheDir, hash);
+  const cacheKey = md5(leanHTML);
+  const cacheFile = path.join(cacheDir, cacheKey);
  if (!noCache) {
143
200
try {
144
- await pipeline (
145
- createReadStream ( cacheFile ) ,
146
- createBrotliDecompress ( ) ,
147
- createWriteStream ( target , {
148
- encoding : 'utf8' ,
149
- } )
201
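+      // The cached Markdown is now read into memory (instead of being streamed straight
+      // into the target file) because it also has to be returned for the R2 hash check.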
+      const data = await text(
+        compose(createReadStream(cacheFile), createBrotliDecompress())
      );
+      await writeFile(target, data, {encoding: 'utf8'});

-      return true;
-    } catch {
-      // pass
+      return {cacheHit: true, data};
+    } catch (err) {
+      if (err.code !== 'ENOENT') {
+        console.warn(`Error using cache file ${cacheFile}:`, err);
+      }
    }
  }

@@ -178,7 +234,7 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
      .use(() => tree => remove(tree, {type: 'inlineCode', value: ''}))
      .use(remarkGfm)
      .use(remarkStringify)
-      .process(text)
+      .process(leanHTML)
  );
  const reader = Readable.from(data);
@@ -203,23 +259,62 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
    ).catch(err => console.warn('Error writing cache file:', err)),
  ]);

-  return false;
+  return {cacheHit: false, data};
}

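+// Converts every HTML file in the task list to Markdown, tracks local cache misses, and
+// re-uploads any file whose MD5 no longer matches the ETag stored on R2.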
async function processTaskList({id, tasks, cacheDir, noCache}) {
+  const s3Client = getS3Client();
  const failedTasks = [];
-  let cacheHits = 0;
-  for (const {sourcePath, targetPath} of tasks) {
+  let cacheMisses = [];
+  let r2CacheMisses = [];
+  console.log(`🤖 Worker[${id}]: Starting to process ${tasks.length} files...`);
+  for (const {sourcePath, targetPath, relativePath, r2Hash} of tasks) {
    try {
-      cacheHits += await genMDFromHTML(sourcePath, targetPath, {
+      const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
        cacheDir,
        noCache,
      });
+      if (!cacheHit) {
+        cacheMisses.push(relativePath);
+      }
+
+      if (r2Hash !== null) {
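+        // For objects uploaded with a single PUT, R2's ETag is the MD5 of the body (as
+        // on S3), so comparing it with md5(data) tells us whether the remote copy is stale.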
+        const fileHash = md5(data);
+        if (r2Hash !== fileHash) {
+          r2CacheMisses.push(relativePath);
+          console.log(
+            `📤 Worker[${id}]: Uploading ${relativePath} to R2, hash mismatch: ${r2Hash} !== ${fileHash}`
+          );
+          await uploadToCFR2(s3Client, relativePath, data);
+        }
+      }
    } catch (error) {
      failedTasks.push({sourcePath, targetPath, error});
    }
  }
-  return {id, success: tasks.length - failedTasks.length, failedTasks, cacheHits};
+  const success = tasks.length - failedTasks.length;
+  if (r2CacheMisses.length / tasks.length > 0.1) {
+    console.warn(
+      `⚠️ Worker[${id}]: More than 10% of files had a different hash on R2, this might indicate a problem with the cache or the generation process.`
+    );
+  } else if (r2CacheMisses.length > 0) {
+    console.log(
+      `📤 Worker[${id}]: Updated the following files on R2: \n${r2CacheMisses.map(n => ` - ${n}`).join('\n')}`
+    );
+  }
+  if (cacheMisses.length / tasks.length > 0.1) {
+    console.warn(`⚠️ Worker[${id}]: More than 10% cache miss rate during build.`);
+  } else if (cacheMisses.length > 0) {
+    console.log(
+      `❇️ Worker[${id}]: Updated cache for the following files: \n${cacheMisses.map(n => ` - ${n}`).join('\n')}`
+    );
+  }
+
+  return {
+    id,
+    success,
+    failedTasks,
+  };
}

async function doWork(work) {