Skip to content

Commit ffe922f

Browse files
mongodben and Ben Perlmutter authored
(EAI-1033): include TOC index for snooty ingest (#734)
* include TOC index
* fix ai gen bad inclusion

---------

Co-authored-by: Ben Perlmutter <mongodben@mongodb.com>
1 parent b507177 commit ffe922f

File tree

2 files changed

+73
-7
lines changed

2 files changed

+73
-7
lines changed

packages/ingest-mongodb-public/src/sources/snooty/SnootyDataSource.test.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ describe("SnootyDataSource", () => {
3232
],
3333
};
3434
const snootyDataApiBaseUrl = "https://snooty-data-api.mongodb.com/prod/";
35+
3536
describe("makeSnootyDataSource()", () => {
3637
const sampleDataPath = Path.resolve(
3738
SRC_ROOT,
@@ -251,6 +252,17 @@ describe("SnootyDataSource", () => {
251252
expect(pages).toHaveLength(1);
252253
noIndexMock.done();
253254
});
255+
256+
it("includes tocIndex in metadata", async () => {
  // Build a data source for the shared test project and pull its pages.
  const dataSource = await makeSnootyDataSource({
    name: `snooty-test`,
    project,
    snootyDataApiBaseUrl,
  });
  const fetchedPages = await dataSource.fetchPages();
  const firstPage = fetchedPages[0];

  // The first fetched page is expected to sit at position 0 of the TOC order.
  expect(firstPage.metadata?.page?.tocIndex).toBe(0);
});
254266
});
255267
});
256268

@@ -270,6 +282,7 @@ describe("handlePage()", () => {
270282
baseUrl: "https://example.com",
271283
tags: ["a"],
272284
version: { label: "1.0", isCurrent: true },
285+
toc: [],
273286
});
274287
expect(result).toMatchObject({
275288
format: "openapi-yaml",
@@ -292,6 +305,7 @@ describe("handlePage()", () => {
292305
baseUrl: "https://example.com",
293306
tags: ["a"],
294307
version: { label: "1.0", isCurrent: true },
308+
toc: [],
295309
});
296310
expect(result).toMatchObject({
297311
format: "md",

packages/ingest-mongodb-public/src/sources/snooty/SnootyDataSource.ts

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,30 @@ export type SnootyPageEntry = SnootyManifestEntry & {
2828
data: SnootyPageData;
2929
};
3030

31+
/**
  A single node of the Snooty table-of-contents tree.

  Nodes nest recursively through `children`.
  NOTE(review): presumably `slug` points at a page inside the project and
  `url` at an external link — confirm against the Snooty manifest format.
 */
export interface SnootyTocEntry {
  /** Text nodes that make up the entry's title. */
  title: Array<{
    type: "text";
    position: { start: { line: number } };
    value: string;
  }>;
  slug?: string;
  url?: string;
  /** Nested entries; the property is required on every node. */
  children: SnootyTocEntry[];
  options?: { drawer?: boolean };
}
48+
3149
/**
3250
Represents metadata in a Snooty manifest file.
3351
*/
3452
export type SnootyMetadataEntry = SnootyManifestEntry & {
3553
type: "metadata";
36-
data: { title?: string };
54+
data: { title?: string; toctree: SnootyTocEntry; toctreeOrder: string[] };
3755
};
3856

3957
/**
@@ -202,6 +220,7 @@ export const makeSnootyDataSource = ({
202220
const linePromises: Promise<void>[] = [];
203221
const pages: Page[] = [];
204222
let siteTitle: string | undefined = undefined;
223+
const toc: string[] = [];
205224
await new Promise<void>((resolve, reject) => {
206225
stream.on("line", async (line) => {
207226
const entry = JSON.parse(line) as SnootyManifestEntry;
@@ -227,6 +246,7 @@ export const makeSnootyDataSource = ({
227246
...links,
228247
baseUrl: branchUrl,
229248
},
249+
toc,
230250
});
231251
if (page !== undefined) {
232252
pages.push(page);
@@ -249,6 +269,18 @@ export const makeSnootyDataSource = ({
249269
case "metadata": {
250270
const { data } = entry as SnootyMetadataEntry;
251271
siteTitle = data.title;
272+
const visitedUrls = new Set<string>();
273+
const tocUrls = data.toctreeOrder
274+
.filter((slug) => {
275+
const url = makeUrl(slug, branchUrl);
276+
if (visitedUrls.has(url)) {
277+
return false;
278+
}
279+
visitedUrls.add(url);
280+
return true;
281+
})
282+
.map((slug) => makeUrl(slug, branchUrl));
283+
toc.push(...tocUrls);
252284
return;
253285
}
254286
case "timestamp":
@@ -344,6 +376,7 @@ export const handlePage = async (
344376
productName,
345377
version,
346378
links,
379+
toc,
347380
}: {
348381
sourceName: string;
349382
baseUrl: string;
@@ -354,6 +387,7 @@ export const handlePage = async (
354387
isCurrent: boolean;
355388
};
356389
links?: RenderLinks;
390+
toc: string[];
357391
}
358392
): Promise<Page | undefined> => {
359393
// Strip first three path segments - according to Snooty team, they'll always
@@ -373,11 +407,8 @@ export const handlePage = async (
373407
let body = "";
374408
let title: string | undefined;
375409
let format: PageFormat;
376-
const baseUrlTrailingSlash = baseUrl.replace(/\/?$/, "/");
377-
const url = new URL(pagePath, baseUrlTrailingSlash).href.replace(
378-
/\/?$/, // Add trailing slash
379-
"/"
380-
);
410+
const url = makeUrl(pagePath, baseUrl);
411+
381412
if (page.ast.options?.template === "openapi") {
382413
format = "openapi-yaml";
383414
body = await snootyAstToOpenApiSpec(page.ast);
@@ -395,6 +426,9 @@ export const handlePage = async (
395426
return;
396427
}
397428

429+
const maybeTocIndex = toc.findIndex((tocUrl) => tocUrl === url);
430+
const tocIndex = maybeTocIndex === -1 ? undefined : maybeTocIndex;
431+
398432
return {
399433
url,
400434
sourceName,
@@ -403,10 +437,28 @@ export const handlePage = async (
403437
format,
404438
sourceType: "tech-docs",
405439
metadata: {
406-
page: pageMetadata,
440+
page: { ...pageMetadata, tocIndex },
407441
tags,
408442
productName,
409443
version,
410444
},
411445
};
412446
};
447+
448+
/**
  Builds the canonical, trailing-slash-terminated URL of a page.

  Callers compare the returned strings for equality, so the result must not
  depend on how many redundant slashes the inputs carry.

  @param pagePath - Page path or slug relative to `baseUrl`. Leading and
    trailing slashes are optional; an empty or slash-only value means the root.
  @param baseUrl - Base URL; a trailing slash is appended when missing.
  @returns `baseUrl` joined with `pagePath`, ending in exactly one `/`.
 */
function makeUrl(pagePath: string, baseUrl: string): string {
  // Ensure trailing slash for baseUrl
  const baseUrlTrailingSlash = baseUrl.replace(/\/?$/, "/");

  // Handle empty pagePath
  if (!pagePath) {
    return baseUrlTrailingSlash;
  }

  // Drop every leading slash so "/foo", "//foo" and "foo" are equivalent
  const relativePath = pagePath.replace(/^\/+/, "");

  // Slash-only paths ("/", "//", ...) all refer to the root
  if (relativePath === "") {
    return baseUrlTrailingSlash;
  }

  // Collapse any run of trailing slashes into exactly one, then join
  return baseUrlTrailingSlash + relativePath.replace(/\/*$/, "/");
}

0 commit comments

Comments
 (0)