Skip to content

Commit 3901211

Browse files
authored
(EAI-428) feature versioned docs (#739)
* (EAI-968) ingest multiple versions (#683) * remove snooty prefix * ingesting pages for all branches on each data source * do not ingest (and delete if already exists) pages on inactive branches * handle current version override * cleanup unused code from previous version override implementation, tests * update SnootyDataSource tests * remove override for docs current version * (EAI-969) query multiple versions (#693) * nearest neighbor search accepts filters, defaults to current version * parse filters to mdb query * (EAI-1003) get available versions for data source (#696) * get versions of a data source * get versions for multiple data sources * (EAI-922) ensure only current version on hugging face dataset (#698) * exclude old versions from dataset * add test case * fix other tests * fix return type of getDataSourceVersions - return object, not array * move QueryFilters type def to embedded content store * fix type * (EAI-1001) enable search by type (#703) * add sourceType to pages and embedded_content and ability to filter by it * test case * change sourceRegex filter to sourceType filter * lint
1 parent 131ac28 commit 3901211

26 files changed

+760
-375
lines changed

packages/datasets/src/bin/huggingFace/uploadCodeExampleDatasetToHuggingFace.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {
1010
} from "mongodb-rag-core";
1111
import {
1212
forbiddenUrls,
13-
publicDatasetSourceName,
13+
publicDatasetSourceTypes,
1414
} from "../../mongoDbDatasetConstants";
1515
import { uploadDatasetToHuggingFace } from "../../uploadDatasetToHuggingFace";
1616
import { HUGGINGFACE, HUGGINGFACE_DOCS_CODE_EXAMPLES } from "../../EnvVars";
@@ -21,7 +21,7 @@ import { CodeExampleDatasetEntry } from "../../codeExampleDataset/createCodeExam
2121
import { Filter } from "mongodb-rag-core/mongodb";
2222

2323
async function uploadCodeExampleDatasetToHuggingFace() {
24-
logger.info("Staring upload code example dataset to Hugging Face script");
24+
logger.info("Starting upload code example dataset to Hugging Face script");
2525

2626
const {
2727
HUGGINGFACE_ACCESS_TOKEN,
@@ -56,7 +56,7 @@ async function uploadCodeExampleDatasetToHuggingFace() {
5656
try {
5757
const pages = await pageStore.loadPages({
5858
query: makeLoadPagesFilter(
59-
publicDatasetSourceName,
59+
publicDatasetSourceTypes,
6060
Array.from(forbiddenUrls)
6161
),
6262
});
@@ -120,12 +120,16 @@ async function uploadCodeExampleDatasetToHuggingFace() {
120120
uploadCodeExampleDatasetToHuggingFace();
121121

122122
function makeLoadPagesFilter(
123-
publicDatasetSourceName: RegExp,
123+
publicDatasetSourceTypes: string[],
124124
forbiddenUrls: string[]
125125
): Filter<PersistedPage> {
126126
return {
127-
sourceName: publicDatasetSourceName,
127+
sourceType: { $in: publicDatasetSourceTypes },
128128
url: { $nin: forbiddenUrls },
129129
action: { $ne: "deleted" },
130+
$or: [
131+
{ "metadata.version.isCurrent": { $exists: false } },
132+
{ "metadata.version.isCurrent": true },
133+
],
130134
};
131135
}

packages/datasets/src/bin/huggingFace/uploadContentDatasetToHuggingFace.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import {
66
} from "mongodb-rag-core";
77
import {
88
forbiddenUrls,
9-
publicDatasetSourceName,
9+
publicDatasetSourceTypes,
1010
} from "../../mongoDbDatasetConstants";
1111
import { uploadDatasetToHuggingFace } from "../../uploadDatasetToHuggingFace";
1212
import { HUGGINGFACE, HUGGINGFACE_DOCS_CONTENT } from "../../EnvVars";
@@ -36,7 +36,7 @@ async function uploadContentDatasetToHuggingFace() {
3636
logger.info("Loading pages dataset from MongoDB");
3737
const dataset = await loadPagesDataset({
3838
pageStore,
39-
dataSourceRegex: publicDatasetSourceName,
39+
dataSourceTypes: publicDatasetSourceTypes,
4040
forbiddenUrls: Array.from(forbiddenUrls),
4141
});
4242
logger.info(

packages/datasets/src/mongoDbDatasetConstants.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ export const forbiddenUrls = new Set([
1717
]);
1818

1919
/**
20-
The {@link PersistedPage.sourceName} for public datasets should contain `snooty` (Docs) or `devcenter` (Developer Center).
20+
The {@link PersistedPage.sourceType} for public datasets should be either `tech-docs` (Docs) or `devcenter` (Developer Center).
2121
*/
22-
export const publicDatasetSourceName = /snooty|devcenter/;
22+
export const publicDatasetSourceTypes = ["tech-docs", "devcenter"];

packages/datasets/src/pageDataset/loadPageDataset.test.ts

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ describe("loadPagesDataset", () => {
2020
updated: new Date(),
2121
format: "html",
2222
action: "updated",
23+
sourceType: "tech-docs",
2324
},
2425
// This page should be filtered out by forbidden url
2526
{
@@ -31,6 +32,7 @@ describe("loadPagesDataset", () => {
3132
updated: new Date(),
3233
format: "html",
3334
action: "updated",
35+
sourceType: "tech-docs",
3436
},
3537
// This page has action deleted, so should not be returned
3638
{
@@ -42,8 +44,9 @@ describe("loadPagesDataset", () => {
4244
updated: new Date(),
4345
format: "html",
4446
action: "deleted",
47+
sourceType: "tech-docs",
4548
},
46-
// This page does not match regex (even though it is active)
49+
// This page does not match sourceType (even though it is active)
4750
{
4851
url: "https://example.com/page4",
4952
body: "Page 4 body",
@@ -53,6 +56,36 @@ describe("loadPagesDataset", () => {
5356
updated: new Date(),
5457
format: "html",
5558
action: "created",
59+
sourceType: "blog",
60+
},
61+
{
62+
url: "https://example.com/page5",
63+
body: "Page 5 body",
64+
metadata: {
65+
extra: "info1",
66+
version: { isCurrent: true, label: "current" },
67+
},
68+
title: "Page 5",
69+
sourceName: "SourceE",
70+
updated: new Date(),
71+
format: "html",
72+
action: "updated",
73+
sourceType: "tech-docs",
74+
},
75+
// This page represents an older version and should not be returned
76+
{
77+
url: "https://example.com/old-version/page5",
78+
body: "Page 5 body",
79+
metadata: {
80+
extra: "info5",
81+
version: { isCurrent: false, label: "old-version" },
82+
},
83+
title: "Page 5",
84+
sourceName: "SourceE",
85+
updated: new Date(),
86+
format: "html",
87+
action: "updated",
88+
sourceType: "tech-docs",
5689
},
5790
];
5891

@@ -72,21 +105,14 @@ describe("loadPagesDataset", () => {
72105
await pageStore.close();
73106
});
74107

75-
it("should only return pages matching a regex", async () => {
108+
it("should only return pages matching a sourceType", async () => {
76109
const dataset = await loadPagesDataset({
77110
pageStore,
78-
dataSourceRegex: /SourceA/,
79111
forbiddenUrls: [],
112+
dataSourceTypes: ["tech-docs"],
80113
});
81-
// page1 should be returned, page2 as well if not forbidden & not deleted, page3 is deleted.
82-
// page4 is filtered out because dataSource doesn't match.
114+
// page1 should be returned as it matches the sourceType; page4 should not, as its sourceType ("blog") does not match
83115
expect(dataset.map((p) => p.url)).toContain("https://example.com/page1");
84-
expect(dataset.map((p) => p.url)).not.toContain(
85-
"https://example.com/page2"
86-
);
87-
expect(dataset.map((p) => p.url)).not.toContain(
88-
"https://example.com/page3"
89-
);
90116
expect(dataset.map((p) => p.url)).not.toContain(
91117
"https://example.com/page4"
92118
);
@@ -95,8 +121,8 @@ describe("loadPagesDataset", () => {
95121
it("should exclude forbidden urls", async () => {
96122
const dataset = await loadPagesDataset({
97123
pageStore,
98-
dataSourceRegex: /SourceA|SourceB/,
99124
forbiddenUrls: [samplePages[1].url],
125+
dataSourceTypes: ["devcenter", "tech-docs"],
100126
});
101127
expect(dataset.map((p) => p.url)).toContain("https://example.com/page1");
102128
expect(dataset.map((p) => p.url)).not.toContain(
@@ -107,19 +133,32 @@ describe("loadPagesDataset", () => {
107133
it("should not include pages with action 'deleted'", async () => {
108134
const dataset = await loadPagesDataset({
109135
pageStore,
110-
dataSourceRegex: /foo/,
111136
forbiddenUrls: [],
137+
dataSourceTypes: ["devcenter", "tech-docs"],
112138
});
113139
const urls = dataset.map((p) => p.url);
140+
expect(urls.length).toBeGreaterThan(0);
114141
expect(urls).not.toContain("https://example.com/page3");
115142
});
116143

144+
it("should not include pages representing a non current version", async () => {
145+
const dataset = await loadPagesDataset({
146+
pageStore,
147+
forbiddenUrls: [],
148+
dataSourceTypes: ["tech-docs"],
149+
});
150+
const urls = dataset.map((p) => p.url);
151+
expect(urls).toContain("https://example.com/page5");
152+
expect(urls).not.toContain("https://example.com/old-version/page5");
153+
});
154+
117155
it("should only return the projected fields", async () => {
118156
const dataset = await loadPagesDataset({
119157
pageStore,
120-
dataSourceRegex: /foo/,
121158
forbiddenUrls: [],
159+
dataSourceTypes: ["devcenter", "tech-docs"],
122160
});
161+
expect(dataset.length).toBeGreaterThan(0);
123162
for (const page of dataset) {
124163
const pageKeys = Object.keys(page);
125164

packages/datasets/src/pageDataset/loadPageDataset.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ export interface LoadPagesDatasetParams {
1010
/**
1111
Source types to include; a page is kept only when its `sourceType` is one of these values
1212
*/
13-
dataSourceRegex: RegExp;
13+
dataSourceTypes: string[];
1414
/**
1515
Set of urls to exclude from the dataset
1616
*/
@@ -19,15 +19,19 @@ export interface LoadPagesDatasetParams {
1919

2020
export async function loadPagesDataset({
2121
pageStore,
22-
dataSourceRegex,
22+
dataSourceTypes,
2323
forbiddenUrls,
2424
}: LoadPagesDatasetParams): Promise<PageDatasetEntry[]> {
2525
return pageStore.aggregatePages<PageDatasetEntry>([
2626
{
2727
$match: {
28-
sourceName: dataSourceRegex,
28+
sourceType: { $in: dataSourceTypes },
2929
url: { $nin: forbiddenUrls },
3030
action: { $ne: "deleted" },
31+
$or: [
32+
{ "metadata.version.isCurrent": { $exists: false } },
33+
{ "metadata.version.isCurrent": true },
34+
],
3135
},
3236
},
3337
{

packages/ingest-mongodb-public/src/meta.config.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,10 @@ export const standardConfig = {
7171
}),
7272
dataSources: async () => {
7373
const source = makeSnootyDataSource({
74-
name: `snooty-${metaSnootyProject.name}`,
74+
name: metaSnootyProject.name,
7575
project: {
7676
...metaSnootyProject,
77-
currentBranch: metaSnootyProject.currentBranch,
7877
type: "snooty",
79-
baseUrl: metaSnootyProject.baseUrl?.replace(/\/?$/, "/"),
8078
},
8179
snootyDataApiBaseUrl,
8280
});

packages/ingest-mongodb-public/src/sources/DevCenterDataSource.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ describe("makeDevCenterPage()", () => {
5757
}),
5858
format: "md",
5959
sourceName: "devcenter",
60+
sourceType: "devcenter",
6061
metadata: {
6162
tags: ["Realm", "GitHub Actions", "JavaScript"],
6263
pageDescription: devCenterDoc.description,

packages/ingest-mongodb-public/src/sources/DevCenterDataSource.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ export function makeDevCenterPage(
7979
}),
8080
format: "md",
8181
sourceName: name,
82+
sourceType: "devcenter",
8283
metadata: {
8384
tags: extractTags(document.tags),
8485
pageDescription: document.description,

packages/ingest-mongodb-public/src/sources/mongoose.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export const mongooseSourceConstructor = async () => {
2626
metadata: {
2727
productName: "Mongoose ODM",
2828
tags: ["node.js", "community library", "mongoose", "odm"],
29-
version: "v7.x (current)",
29+
versionLabel: "v7.x (current)",
3030
},
3131
});
3232
};

0 commit comments

Comments
 (0)