
Commit fd96deb

BYK authored and bitsandfoxes committed
ci(md): Upload md files to R2 (#14171)
1 parent 605273f · commit fd96deb

File tree: 3 files changed, +1202 -34 lines


package.json

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@
   },
   "dependencies": {
     "@ariakit/react": "^0.4.5",
+    "@aws-sdk/client-s3": "^3.837.0",
     "@emotion/core": "^11.0.0",
     "@emotion/react": "^11.11.0",
     "@emotion/styled": "^11.0.0",

scripts/generate-md-exports.mjs

Lines changed: 128 additions & 33 deletions
@@ -1,12 +1,14 @@
 #!/usr/bin/env node
 /* eslint-disable no-console */
+import {ListObjectsV2Command, PutObjectCommand, S3Client} from '@aws-sdk/client-s3';
 import {selectAll} from 'hast-util-select';
 import {createHash} from 'node:crypto';
 import {createReadStream, createWriteStream, existsSync} from 'node:fs';
-import {mkdir, opendir, readFile, rm} from 'node:fs/promises';
+import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
 import {cpus} from 'node:os';
 import * as path from 'node:path';
-import {Readable} from 'node:stream';
+import {compose, Readable} from 'node:stream';
+import {text} from 'node:stream/consumers';
 import {pipeline} from 'node:stream/promises';
 import {fileURLToPath} from 'node:url';
 import {isMainThread, parentPort, Worker, workerData} from 'node:worker_threads';
@@ -23,17 +25,40 @@ import {unified} from 'unified';
 import {remove} from 'unist-util-remove';

 const CACHE_COMPRESS_LEVEL = 4;
+const R2_BUCKET = 'sentry-docs';
+const accessKeyId = process.env.R2_ACCESS_KEY_ID;
+const secretAccessKey = process.env.R2_SECRET_ACCESS_KEY;

-function taskFinishHandler(data) {
-  if (data.failedTasks.length === 0) {
-    console.log(
-      `💰 Worker[${data.id}]: Cache hits: ${data.cacheHits} (${Math.round((data.cacheHits / data.success) * 100)}%)`
-    );
-    console.log(`✅ Worker[${data.id}]: converted ${data.success} files successfully.`);
+function getS3Client() {
+  return new S3Client({
+    endpoint: 'https://773afa1f62ff86c80db4f24f7ff1e9c8.r2.cloudflarestorage.com',
+    region: 'auto',
+    credentials: {
+      accessKeyId,
+      secretAccessKey,
+    },
+    retryMode: 'adaptive',
+  });
+}
+
+async function uploadToCFR2(s3Client, relativePath, data) {
+  const command = new PutObjectCommand({
+    Bucket: R2_BUCKET,
+    Key: relativePath,
+    Body: data,
+    ContentType: 'text/markdown',
+  });
+  await s3Client.send(command);
+  return;
+}
+
+function taskFinishHandler({id, success, failedTasks}) {
+  if (failedTasks.length === 0) {
+    console.log(`✅ Worker[${id}]: converted ${success} files successfully.`);
     return false;
   }
-  console.error(`❌ Worker[${data.id}]: ${data.failedTasks.length} files failed:`);
-  console.error(data.failedTasks);
+  console.error(`❌ Worker[${id}]: ${failedTasks.length} files failed:`);
+  console.error(failedTasks);
   return true;
 }

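The two helpers added above are the core of the change: Cloudflare R2 speaks the S3 API, so the stock `@aws-sdk/client-s3` client works once it is pointed at the account's R2 endpoint. A minimal standalone sketch of the same upload path, assuming the `R2_ACCESS_KEY_ID`/`R2_SECRET_ACCESS_KEY` environment variables are set; the endpoint placeholder and the file paths below are illustrative, not taken from the script:

```js
// Sketch: upload a single markdown file to an S3-compatible bucket
// (here, Cloudflare R2) using @aws-sdk/client-s3.
import {PutObjectCommand, S3Client} from '@aws-sdk/client-s3';
import {readFile} from 'node:fs/promises';

const client = new S3Client({
  // Account-specific R2 endpoint (placeholder); 'auto' is the region R2 expects.
  endpoint: 'https://<account-id>.r2.cloudflarestorage.com',
  region: 'auto',
  credentials: {
    accessKeyId: process.env.R2_ACCESS_KEY_ID,
    secretAccessKey: process.env.R2_SECRET_ACCESS_KEY,
  },
});

// Example source and destination paths, purely illustrative.
const body = await readFile('public/md-exports/platforms/javascript/index.md', 'utf8');
await client.send(
  new PutObjectCommand({
    Bucket: 'sentry-docs',
    Key: 'platforms/javascript/index.md',
    Body: body,
    ContentType: 'text/markdown',
  })
);
```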
@@ -68,13 +93,34 @@ async function createWork() {
   const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
   const workerTasks = new Array(numWorkers).fill(null).map(() => []);

+  const existingFilesOnR2 = null;
+  if (accessKeyId && secretAccessKey) {
+    console.log(`☁️ Getting existing hashes from R2...`);
+    const s3Client = getS3Client();
+    let continuationToken = undefined;
+    do {
+      const response = await s3Client.send(
+        new ListObjectsV2Command({
+          Bucket: R2_BUCKET,
+          ContinuationToken: continuationToken,
+        })
+      );
+      continuationToken = response.NextContinuationToken;
+      for (const {Key, ETag} of response.Contents) {
+        existingFilesOnR2.set(Key, ETag.slice(1, -1)); // Remove quotes from ETag
+      }
+    } while (continuationToken);
+    console.log(`✅ Found ${existingFilesOnR2.size} existing files on R2.`);
+  }
+
   console.log(`🔎 Discovering files to convert...`);

   let numFiles = 0;
   let workerIdx = 0;
   // Need a high buffer size here otherwise Node skips some subdirectories!
   // See https://github.com/nodejs/node/issues/48820
   const dir = await opendir(INPUT_DIR, {recursive: true, bufferSize: 1024});
+
   for await (const dirent of dir) {
     if (dirent.name.endsWith('.html') && dirent.isFile()) {
       const sourcePath = path.join(dirent.parentPath || dirent.path, dirent.name);
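The listing loop above walks the whole bucket page by page, following `NextContinuationToken` until it is exhausted and recording each object's ETag with its surrounding quotes stripped. The same pattern as an isolated helper (the name `listAllObjects` is illustrative; an empty bucket is handled by falling back to an empty `Contents` array):

```js
// Sketch: collect every object's key and quoted ETag from an S3-compatible
// bucket, paging with NextContinuationToken until the listing is exhausted.
import {ListObjectsV2Command} from '@aws-sdk/client-s3';

async function listAllObjects(s3Client, bucket) {
  const etagsByKey = new Map();
  let continuationToken;
  do {
    const response = await s3Client.send(
      new ListObjectsV2Command({Bucket: bucket, ContinuationToken: continuationToken})
    );
    for (const {Key, ETag} of response.Contents ?? []) {
      etagsByKey.set(Key, ETag.replaceAll('"', '')); // ETags come back quoted
    }
    continuationToken = response.NextContinuationToken;
  } while (continuationToken);
  return etagsByKey;
}
```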
@@ -84,7 +130,13 @@ async function createWork() {
       );
       await mkdir(targetDir, {recursive: true});
       const targetPath = path.join(targetDir, dirent.name.slice(0, -5) + '.md');
-      workerTasks[workerIdx].push({sourcePath, targetPath});
+      const relativePath = path.relative(OUTPUT_DIR, targetPath);
+      workerTasks[workerIdx].push({
+        sourcePath,
+        targetPath,
+        relativePath,
+        r2Hash: existingFilesOnR2 ? existingFilesOnR2.get(relativePath) : null,
+      });
       workerIdx = (workerIdx + 1) % numWorkers;
       numFiles++;
     }
@@ -96,7 +148,12 @@ async function createWork() {
   const workerPromises = new Array(numWorkers - 1).fill(null).map((_, id) => {
     return new Promise((resolve, reject) => {
       const worker = new Worker(selfPath, {
-        workerData: {id, noCache, cacheDir: CACHE_DIR, tasks: workerTasks[id]},
+        workerData: {
+          id,
+          noCache,
+          cacheDir: CACHE_DIR,
+          tasks: workerTasks[id],
+        },
       });
       let hasErrors = false;
       worker.on('message', data => (hasErrors = taskFinishHandler(data)));
@@ -113,10 +170,10 @@ async function createWork() {
   // The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
   workerPromises.push(
     processTaskList({
-      noCache,
-      cacheDir: CACHE_DIR,
-      tasks: workerTasks[workerTasks.length - 1],
       id: workerTasks.length - 1,
+      tasks: workerTasks[workerTasks.length - 1],
+      cacheDir: CACHE_DIR,
+      noCache,
     }).then(data => {
       if (taskFinishHandler(data)) {
         throw new Error(`Worker[${data.id}] had some errors.`);
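For context, the surrounding worker pool follows the usual `node:worker_threads` shape: the script re-launches itself as workers, hands each one its slice of tasks via `workerData`, and the main thread processes the last slice itself. A self-contained sketch of that pattern, assuming a made-up task payload rather than this script's real one:

```js
// Sketch: single-file worker pool where the main thread both spawns
// workers and processes its own share of the tasks.
import {fileURLToPath} from 'node:url';
import {isMainThread, parentPort, Worker, workerData} from 'node:worker_threads';

const selfPath = fileURLToPath(import.meta.url);

async function processTasks({id, tasks}) {
  // Real work would happen here; return a summary for the main thread.
  return {id, success: tasks.length, failedTasks: []};
}

if (isMainThread) {
  const slices = [['a', 'b'], ['c', 'd'], ['e']]; // example task slices
  const workers = slices.slice(0, -1).map(
    (tasks, id) =>
      new Promise((resolve, reject) => {
        const worker = new Worker(selfPath, {workerData: {id, tasks}});
        worker.on('message', resolve);
        worker.on('error', reject);
      })
  );
  // The main thread handles the last slice itself.
  workers.push(processTasks({id: slices.length - 1, tasks: slices.at(-1)}));
  console.log(await Promise.all(workers));
} else {
  parentPort.postMessage(await processTasks(workerData));
}
```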
@@ -133,25 +190,24 @@ async function createWork() {
 const md5 = data => createHash('md5').update(data).digest('hex');

 async function genMDFromHTML(source, target, {cacheDir, noCache}) {
-  const text = (await readFile(source, {encoding: 'utf8'}))
+  const leanHTML = (await readFile(source, {encoding: 'utf8'}))
     // Remove all script tags, as they are not needed in markdown
     // and they are not stable across builds, causing cache misses
     .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
-  const hash = md5(text);
-  const cacheFile = path.join(cacheDir, hash);
+  const cacheKey = md5(leanHTML);
+  const cacheFile = path.join(cacheDir, cacheKey);
   if (!noCache) {
     try {
-      await pipeline(
-        createReadStream(cacheFile),
-        createBrotliDecompress(),
-        createWriteStream(target, {
-          encoding: 'utf8',
-        })
+      const data = await text(
+        compose(createReadStream(cacheFile), createBrotliDecompress())
      );
+      await writeFile(target, data, {encoding: 'utf8'});

-      return true;
-    } catch {
-      // pass
+      return {cacheHit: true, data};
+    } catch (err) {
+      if (err.code !== 'ENOENT') {
+        console.warn(`Error using cache file ${cacheFile}:`, err);
+      }
     }
   }

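The cache-hit branch now materializes the decompressed markdown as a string (so the same data can later be hashed and uploaded) instead of piping it straight to the target file. A stripped-down sketch of that read path; `readCachedMarkdown` is an illustrative name, not a function in the script:

```js
// Sketch: read a Brotli-compressed cache entry into a string and also
// write it to the target file, mirroring the cache-hit branch above.
import {createReadStream} from 'node:fs';
import {writeFile} from 'node:fs/promises';
import {compose} from 'node:stream';
import {text} from 'node:stream/consumers';
import {createBrotliDecompress} from 'node:zlib';

async function readCachedMarkdown(cacheFile, target) {
  // compose() chains the file stream through the decompressor;
  // text() drains the resulting stream into a single string.
  const data = await text(compose(createReadStream(cacheFile), createBrotliDecompress()));
  await writeFile(target, data, {encoding: 'utf8'});
  return data;
}
```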
@@ -178,7 +234,7 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
       .use(() => tree => remove(tree, {type: 'inlineCode', value: ''}))
       .use(remarkGfm)
       .use(remarkStringify)
-      .process(text)
+      .process(leanHTML)
   );
   const reader = Readable.from(data);

@@ -203,23 +259,62 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
     ).catch(err => console.warn('Error writing cache file:', err)),
   ]);

-  return false;
+  return {cacheHit: false, data};
 }

 async function processTaskList({id, tasks, cacheDir, noCache}) {
+  const s3Client = getS3Client();
   const failedTasks = [];
-  let cacheHits = 0;
-  for (const {sourcePath, targetPath} of tasks) {
+  let cacheMisses = [];
+  let r2CacheMisses = [];
+  console.log(`🤖 Worker[${id}]: Starting to process ${tasks.length} files...`);
+  for (const {sourcePath, targetPath, relativePath, r2Hash} of tasks) {
     try {
-      cacheHits += await genMDFromHTML(sourcePath, targetPath, {
+      const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
        cacheDir,
        noCache,
      });
+      if (!cacheHit) {
+        cacheMisses.push(relativePath);
+      }
+
+      if (r2Hash !== null) {
+        const fileHash = md5(data);
+        if (r2Hash !== fileHash) {
+          r2CacheMisses.push(relativePath);
+          console.log(
+            `📤 Worker[${id}]: Uploading ${relativePath} to R2, hash mismatch: ${r2Hash} !== ${fileHash}`
+          );
+          await uploadToCFR2(s3Client, relativePath, data);
+        }
+      }
     } catch (error) {
       failedTasks.push({sourcePath, targetPath, error});
     }
   }
-  return {id, success: tasks.length - failedTasks.length, failedTasks, cacheHits};
+  const success = tasks.length - failedTasks.length;
+  if (r2CacheMisses.length / tasks.length > 0.1) {
+    console.warn(
+      `⚠️ Worker[${id}]: More than 10% of files had a different hash on R2, this might indicate a problem with the cache or the generation process.`
+    );
+  } else if (r2CacheMisses.length > 0) {
+    console.log(
+      `📤 Worker[${id}]: Updated the following files on R2: \n${r2CacheMisses.map(n => ` - ${n}`).join('\n')}`
+    );
+  }
+  if (cacheMisses.length / tasks.length > 0.1) {
+    console.warn(`⚠️ Worker[${id}]: More than 10% cache miss rate during build.`);
+  } else if (cacheMisses.length > 0) {
+    console.log(
+      `❇️ Worker[${id}]: Updated cache for the following files: \n${cacheMisses.map(n => ` - ${n}`).join('\n')}`
+    );
+  }
+
+  return {
+    id,
+    success,
+    failedTasks,
+  };
 }

 async function doWork(work) {
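The upload check in `processTaskList` leans on the fact that, for single-part uploads, an S3/R2 object's ETag is the MD5 of its body, so a matching hash means the copy on R2 is already current. A hedged, self-contained sketch of that comparison (`syncFileToR2` is an illustrative name):

```js
// Sketch: upload markdown to R2 only when its MD5 differs from the stored
// ETag (for single-part uploads the ETag is the MD5 of the object body).
import {PutObjectCommand} from '@aws-sdk/client-s3';
import {createHash} from 'node:crypto';

async function syncFileToR2(s3Client, bucket, key, data, remoteEtag) {
  const localHash = createHash('md5').update(data).digest('hex');
  if (remoteEtag === localHash) {
    return false; // object on R2 is already up to date
  }
  await s3Client.send(
    new PutObjectCommand({Bucket: bucket, Key: key, Body: data, ContentType: 'text/markdown'})
  );
  return true;
}
```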
