Skip to content

Commit 235bf22

Browse files
authored
Revert "(EAI-428) versioned docs" (#737)
Revert "(EAI-428) versioned docs (#699)" This reverts commit 7566854.
1 parent 4310e1d commit 235bf22

26 files changed

+375
-760
lines changed

packages/datasets/src/bin/huggingFace/uploadCodeExampleDatasetToHuggingFace.ts

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {
1010
} from "mongodb-rag-core";
1111
import {
1212
forbiddenUrls,
13-
publicDatasetSourceTypes,
13+
publicDatasetSourceName,
1414
} from "../../mongoDbDatasetConstants";
1515
import { uploadDatasetToHuggingFace } from "../../uploadDatasetToHuggingFace";
1616
import { HUGGINGFACE, HUGGINGFACE_DOCS_CODE_EXAMPLES } from "../../EnvVars";
@@ -21,7 +21,7 @@ import { CodeExampleDatasetEntry } from "../../codeExampleDataset/createCodeExam
2121
import { Filter } from "mongodb-rag-core/mongodb";
2222

2323
async function uploadCodeExampleDatasetToHuggingFace() {
24-
logger.info("Starting upload code example dataset to Hugging Face script");
24+
logger.info("Staring upload code example dataset to Hugging Face script");
2525

2626
const {
2727
HUGGINGFACE_ACCESS_TOKEN,
@@ -56,7 +56,7 @@ async function uploadCodeExampleDatasetToHuggingFace() {
5656
try {
5757
const pages = await pageStore.loadPages({
5858
query: makeLoadPagesFilter(
59-
publicDatasetSourceTypes,
59+
publicDatasetSourceName,
6060
Array.from(forbiddenUrls)
6161
),
6262
});
@@ -120,16 +120,12 @@ async function uploadCodeExampleDatasetToHuggingFace() {
120120
uploadCodeExampleDatasetToHuggingFace();
121121

122122
function makeLoadPagesFilter(
123-
publicDatasetSourceTypes: string[],
123+
publicDatasetSourceName: RegExp,
124124
forbiddenUrls: string[]
125125
): Filter<PersistedPage> {
126126
return {
127-
sourceType: { $in: publicDatasetSourceTypes },
127+
sourceName: publicDatasetSourceName,
128128
url: { $nin: forbiddenUrls },
129129
action: { $ne: "deleted" },
130-
$or: [
131-
{ "metadata.version.isCurrent": { $exists: false } },
132-
{ "metadata.version.isCurrent": true },
133-
],
134130
};
135131
}

packages/datasets/src/bin/huggingFace/uploadContentDatasetToHuggingFace.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import {
66
} from "mongodb-rag-core";
77
import {
88
forbiddenUrls,
9-
publicDatasetSourceTypes,
9+
publicDatasetSourceName,
1010
} from "../../mongoDbDatasetConstants";
1111
import { uploadDatasetToHuggingFace } from "../../uploadDatasetToHuggingFace";
1212
import { HUGGINGFACE, HUGGINGFACE_DOCS_CONTENT } from "../../EnvVars";
@@ -36,7 +36,7 @@ async function uploadContentDatasetToHuggingFace() {
3636
logger.info("Loading pages dataset from MongoDB");
3737
const dataset = await loadPagesDataset({
3838
pageStore,
39-
dataSourceTypes: publicDatasetSourceTypes,
39+
dataSourceRegex: publicDatasetSourceName,
4040
forbiddenUrls: Array.from(forbiddenUrls),
4141
});
4242
logger.info(

packages/datasets/src/mongoDbDatasetConstants.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ export const forbiddenUrls = new Set([
1717
]);
1818

1919
/**
20-
The {@link PersistedPage.sourceType} for public datasets should be either `tech-docs` (Docs) or `devcenter` (Developer Center).
20+
The {@link PersistedPage.sourceName} for public datasets should contain `snooty` (Docs) or `devcenter` (Developer Center).
2121
*/
22-
export const publicDatasetSourceTypes = ["tech-docs", "devcenter"];
22+
export const publicDatasetSourceName = /snooty|devcenter/;

packages/datasets/src/pageDataset/loadPageDataset.test.ts

Lines changed: 14 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ describe("loadPagesDataset", () => {
2020
updated: new Date(),
2121
format: "html",
2222
action: "updated",
23-
sourceType: "tech-docs",
2423
},
2524
// This page should be filtered out by forbidden url
2625
{
@@ -32,7 +31,6 @@ describe("loadPagesDataset", () => {
3231
updated: new Date(),
3332
format: "html",
3433
action: "updated",
35-
sourceType: "tech-docs",
3634
},
3735
// This page has action deleted, so should not be returned
3836
{
@@ -44,9 +42,8 @@ describe("loadPagesDataset", () => {
4442
updated: new Date(),
4543
format: "html",
4644
action: "deleted",
47-
sourceType: "tech-docs",
4845
},
49-
// This page does not match sourceType (even though it is active)
46+
// This page does not match regex (even though it is active)
5047
{
5148
url: "https://example.com/page4",
5249
body: "Page 4 body",
@@ -56,36 +53,6 @@ describe("loadPagesDataset", () => {
5653
updated: new Date(),
5754
format: "html",
5855
action: "created",
59-
sourceType: "blog",
60-
},
61-
{
62-
url: "https://example.com/page5",
63-
body: "Page 5 body",
64-
metadata: {
65-
extra: "info1",
66-
version: { isCurrent: true, label: "current" },
67-
},
68-
title: "Page 5",
69-
sourceName: "SourceE",
70-
updated: new Date(),
71-
format: "html",
72-
action: "updated",
73-
sourceType: "tech-docs",
74-
},
75-
// This page represents an older version and should not be returned
76-
{
77-
url: "https://example.com/old-version/page5",
78-
body: "Page 5 body",
79-
metadata: {
80-
extra: "info5",
81-
version: { isCurrent: false, label: "old-version" },
82-
},
83-
title: "Page 5",
84-
sourceName: "SourceE",
85-
updated: new Date(),
86-
format: "html",
87-
action: "updated",
88-
sourceType: "tech-docs",
8956
},
9057
];
9158

@@ -105,14 +72,21 @@ describe("loadPagesDataset", () => {
10572
await pageStore.close();
10673
});
10774

108-
it("should only return pages matching a sourceType", async () => {
75+
it("should only return pages matching a regex", async () => {
10976
const dataset = await loadPagesDataset({
11077
pageStore,
78+
dataSourceRegex: /SourceA/,
11179
forbiddenUrls: [],
112-
dataSourceTypes: ["tech-docs"],
11380
});
114-
// page1 should be returned as it matches the sourceType, page 4 does not match
81+
// page1 matches the regex and should be returned; page2 and page3 are expected to be absent (page3 has action "deleted").
82+
// page4 is filtered out because its sourceName doesn't match the regex.
11583
expect(dataset.map((p) => p.url)).toContain("https://example.com/page1");
84+
expect(dataset.map((p) => p.url)).not.toContain(
85+
"https://example.com/page2"
86+
);
87+
expect(dataset.map((p) => p.url)).not.toContain(
88+
"https://example.com/page3"
89+
);
11690
expect(dataset.map((p) => p.url)).not.toContain(
11791
"https://example.com/page4"
11892
);
@@ -121,8 +95,8 @@ describe("loadPagesDataset", () => {
12195
it("should exclude forbidden urls", async () => {
12296
const dataset = await loadPagesDataset({
12397
pageStore,
98+
dataSourceRegex: /SourceA|SourceB/,
12499
forbiddenUrls: [samplePages[1].url],
125-
dataSourceTypes: ["devcenter", "tech-docs"],
126100
});
127101
expect(dataset.map((p) => p.url)).toContain("https://example.com/page1");
128102
expect(dataset.map((p) => p.url)).not.toContain(
@@ -133,32 +107,19 @@ describe("loadPagesDataset", () => {
133107
it("should not include pages with action 'deleted'", async () => {
134108
const dataset = await loadPagesDataset({
135109
pageStore,
110+
dataSourceRegex: /foo/,
136111
forbiddenUrls: [],
137-
dataSourceTypes: ["devcenter", "tech-docs"],
138112
});
139113
const urls = dataset.map((p) => p.url);
140-
expect(urls.length).toBeGreaterThan(0);
141114
expect(urls).not.toContain("https://example.com/page3");
142115
});
143116

144-
it("should not include pages representing a non current version", async () => {
145-
const dataset = await loadPagesDataset({
146-
pageStore,
147-
forbiddenUrls: [],
148-
dataSourceTypes: ["tech-docs"],
149-
});
150-
const urls = dataset.map((p) => p.url);
151-
expect(urls).toContain("https://example.com/page5");
152-
expect(urls).not.toContain("https://example.com/old-version/page5");
153-
});
154-
155117
it("should only return the projected fields", async () => {
156118
const dataset = await loadPagesDataset({
157119
pageStore,
120+
dataSourceRegex: /foo/,
158121
forbiddenUrls: [],
159-
dataSourceTypes: ["devcenter", "tech-docs"],
160122
});
161-
expect(dataset.length).toBeGreaterThan(0);
162123
for (const page of dataset) {
163124
const pageKeys = Object.keys(page);
164125

packages/datasets/src/pageDataset/loadPageDataset.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ export interface LoadPagesDatasetParams {
1010
/**
1111
Regular expression matched against `Page.sourceName` to select which pages are included
1212
*/
13-
dataSourceTypes: string[];
13+
dataSourceRegex: RegExp;
1414
/**
1515
Set of urls to exclude from the dataset
1616
*/
@@ -19,19 +19,15 @@ export interface LoadPagesDatasetParams {
1919

2020
export async function loadPagesDataset({
2121
pageStore,
22-
dataSourceTypes,
22+
dataSourceRegex,
2323
forbiddenUrls,
2424
}: LoadPagesDatasetParams): Promise<PageDatasetEntry[]> {
2525
return pageStore.aggregatePages<PageDatasetEntry>([
2626
{
2727
$match: {
28-
sourceType: { $in: dataSourceTypes },
28+
sourceName: dataSourceRegex,
2929
url: { $nin: forbiddenUrls },
3030
action: { $ne: "deleted" },
31-
$or: [
32-
{ "metadata.version.isCurrent": { $exists: false } },
33-
{ "metadata.version.isCurrent": true },
34-
],
3531
},
3632
},
3733
{

packages/ingest-mongodb-public/src/meta.config.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,12 @@ export const standardConfig = {
7171
}),
7272
dataSources: async () => {
7373
const source = makeSnootyDataSource({
74-
name: metaSnootyProject.name,
74+
name: `snooty-${metaSnootyProject.name}`,
7575
project: {
7676
...metaSnootyProject,
77+
currentBranch: metaSnootyProject.currentBranch,
7778
type: "snooty",
79+
baseUrl: metaSnootyProject.baseUrl?.replace(/\/?$/, "/"),
7880
},
7981
snootyDataApiBaseUrl,
8082
});

packages/ingest-mongodb-public/src/sources/DevCenterDataSource.test.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ describe("makeDevCenterPage()", () => {
5757
}),
5858
format: "md",
5959
sourceName: "devcenter",
60-
sourceType: "devcenter",
6160
metadata: {
6261
tags: ["Realm", "GitHub Actions", "JavaScript"],
6362
pageDescription: devCenterDoc.description,

packages/ingest-mongodb-public/src/sources/DevCenterDataSource.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ export function makeDevCenterPage(
7979
}),
8080
format: "md",
8181
sourceName: name,
82-
sourceType: "devcenter",
8382
metadata: {
8483
tags: extractTags(document.tags),
8584
pageDescription: document.description,

packages/ingest-mongodb-public/src/sources/mongoose.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export const mongooseSourceConstructor = async () => {
2626
metadata: {
2727
productName: "Mongoose ODM",
2828
tags: ["node.js", "community library", "mongoose", "odm"],
29-
versionLabel: "v7.x (current)",
29+
version: "v7.x (current)",
3030
},
3131
});
3232
};

0 commit comments

Comments
 (0)