Skip to content

Commit cb01231

Browse files
authored
(EAI-1044) Add sourceType to all sources (#756)
* Add sourceType to web sources
* Define source types (SourceTypeName)
* Fix failing test
* Fix undefined overwrite issue
* Update DB if sourceType changes
* Make sourceType name generic
* Improve grouping of web sources
1 parent e9f796c commit cb01231

20 files changed

+259
-143
lines changed

packages/datasets/src/pageDataset/loadPageDataset.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ describe("loadPagesDataset", () => {
4646
action: "deleted",
4747
sourceType: "tech-docs",
4848
},
49-
// This page does not match sourceType (even though it is active)
49+
// This page does not match sourceType = "tech-docs" (even though it is active)
5050
{
5151
url: "https://example.com/page4",
5252
body: "Page 4 body",
@@ -56,7 +56,7 @@ describe("loadPagesDataset", () => {
5656
updated: new Date(),
5757
format: "html",
5858
action: "created",
59-
sourceType: "blog",
59+
sourceType: "marketing",
6060
},
6161
{
6262
url: "https://example.com/page5",

packages/ingest-mongodb-public/src/sources/DevCenterDataSource.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import {
77
ProjectBase,
88
removeMarkdownImagesAndLinks,
99
} from "mongodb-rag-core/dataSources";
10+
import { SourceTypeName } from "./index";
1011

1112
export type DevCenterProjectConfig = ProjectBase & {
1213
type: "devcenter";
@@ -47,7 +48,7 @@ export const makeDevCenterDataSource = async ({
4748
const collection = db.collection<DevCenterEntry>(collectionName);
4849
const documents = collection.find();
4950

50-
const pages: Page[] = [];
51+
const pages: Page<SourceTypeName>[] = [];
5152
for await (const document of documents) {
5253
if (!document.content) {
5354
logger.warn(
@@ -69,7 +70,7 @@ export function makeDevCenterPage(
6970
document: DevCenterEntry,
7071
name: string,
7172
baseUrl: string
72-
): Page {
73+
): Page<SourceTypeName> {
7374
assert(document.content, "document.content must be defined");
7475
return {
7576
title: document.name,

packages/ingest-mongodb-public/src/sources/index.ts

Lines changed: 46 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,17 @@ export const devCenterProjectConfig: DevCenterProjectConfig = {
5252
connectionUri: DEVCENTER_CONNECTION_URI,
5353
};
5454

55+
/**
56+
Predefined values for sourceType that we want to use in our Pages.
57+
*/
58+
export type SourceTypeName =
59+
| "tech-docs"
60+
| "devcenter"
61+
| "marketing"
62+
| "university-content"
63+
| "tech-docs-external"
64+
| "book-external";
65+
5566
const mongoDbUniversitySourceConstructor = async () => {
5667
const universityDataApiKey = UNIVERSITY_DATA_API_KEY;
5768
assert(!!universityDataApiKey, "UNIVERSITY_DATA_API_KEY required");
@@ -67,34 +78,37 @@ const mongoDbUniversitySourceConstructor = async () => {
6778
return makeMongoDbUniversityDataSource(universityConfig);
6879
};
6980

70-
export const mongoDbCorpDataSourceConfig: MakeMdOnGithubDataSourceParams = {
71-
name: "mongodb-corp",
72-
repoUrl: "https://github.com/mongodb/chatbot/",
73-
repoLoaderOptions: {
74-
branch: "main",
75-
ignoreFiles: [/^(?!^\/mongodb-corp\/).*/, /^(mongodb-corp\/README\.md)$/],
76-
},
77-
pathToPageUrl(_, frontMatter) {
78-
if (!frontMatter?.url) {
79-
throw new Error("frontMatter.url must be specified");
80-
}
81-
return frontMatter?.url as string;
82-
},
83-
extractMetadata(_, frontMatter) {
84-
if (!frontMatter) {
85-
throw new Error("frontMatter must be specified");
86-
}
87-
const frontMatterCopy = { ...frontMatter };
88-
delete frontMatterCopy.url;
89-
return frontMatterCopy;
90-
},
91-
extractTitle: (_, frontmatter) => (frontmatter?.title as string) ?? null,
92-
};
81+
export const mongoDbCorpDataSourceConfig: MakeMdOnGithubDataSourceParams<SourceTypeName> =
82+
{
83+
name: "mongodb-corp",
84+
repoUrl: "https://github.com/mongodb/chatbot/",
85+
repoLoaderOptions: {
86+
branch: "main",
87+
ignoreFiles: [/^(?!^\/mongodb-corp\/).*/, /^(mongodb-corp\/README\.md)$/],
88+
},
89+
pathToPageUrl(_, frontMatter) {
90+
if (!frontMatter?.url) {
91+
throw new Error("frontMatter.url must be specified");
92+
}
93+
return frontMatter?.url as string;
94+
},
95+
extractMetadata(_, frontMatter) {
96+
if (!frontMatter) {
97+
throw new Error("frontMatter must be specified");
98+
}
99+
const frontMatterCopy = { ...frontMatter };
100+
delete frontMatterCopy.url;
101+
return frontMatterCopy;
102+
},
103+
extractTitle: (_, frontmatter) => (frontmatter?.title as string) ?? null,
104+
};
93105
const mongoDbCorpDataSource = async () => {
94-
return await makeMdOnGithubDataSource(mongoDbCorpDataSourceConfig);
106+
return await makeMdOnGithubDataSource<SourceTypeName>(
107+
mongoDbCorpDataSourceConfig
108+
);
95109
};
96110

97-
export const mongoDbUniMetadataDataSourceConfig: MakeMdOnGithubDataSourceParams =
111+
export const mongoDbUniMetadataDataSourceConfig: MakeMdOnGithubDataSourceParams<SourceTypeName> =
98112
{
99113
name: "university-meta",
100114
repoUrl: "https://github.com/mongodb/chatbot/",
@@ -117,24 +131,28 @@ export const mongoDbUniMetadataDataSourceConfig: MakeMdOnGithubDataSourceParams
117131
return frontMatterCopy;
118132
},
119133
extractTitle: (_, frontmatter) => (frontmatter?.title as string) ?? null,
134+
sourceType: "university-content",
120135
metadata: {
121136
siteTitle: "MongoDB University",
122137
},
123138
};
124139
const mongoDbUniMetadataSource = async () => {
125-
return await makeMdOnGithubDataSource(mongoDbUniMetadataDataSourceConfig);
140+
return await makeMdOnGithubDataSource<SourceTypeName>(
141+
mongoDbUniMetadataDataSourceConfig
142+
);
126143
};
127144

128145
export const terraformProviderSourceConstructor = async () => {
129146
const siteBaseUrl =
130147
"https://registry.terraform.io/providers/mongodb/mongodbatlas/latest/docs";
131-
return await makeGitDataSource({
148+
return await makeGitDataSource<SourceTypeName>({
132149
name: "atlas-terraform-provider",
133150
repoUri: "https://github.com/mongodb/terraform-provider-mongodbatlas.git",
134151
repoOptions: {
135152
"--depth": 1,
136153
"--branch": "master",
137154
},
155+
sourceType: "tech-docs-external",
138156
metadata: {
139157
productName: "mongodbatlas Terraform Provider",
140158
tags: ["docs", "terraform", "atlas", "hcl"],
@@ -147,7 +165,7 @@ export const terraformProviderSourceConstructor = async () => {
147165
);
148166
const url = getTerraformPageUrl(siteBaseUrl, path);
149167

150-
const page: Omit<Page, "sourceName"> = {
168+
const page: Omit<Page<SourceTypeName>, "sourceName"> = {
151169
body: removeMarkdownImagesAndLinks(body),
152170
format: "md",
153171
url: url,

packages/ingest-mongodb-public/src/sources/mongodb-university/makeUniversityPages.ts

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import {
33
TiCatalogItem,
44
UniversityVideo,
55
} from "./MongoDbUniversityDataApiClient";
6+
import { SourceTypeName } from "../index";
67

78
export const UNI_BASE_URL = "https://learn.mongodb.com";
89

@@ -20,7 +21,7 @@ export function makeUniversityPages({
2021
tiCatalogItems: TiCatalogItem[];
2122
videos: UniversityVideo[];
2223
metadata?: PageMetadata;
23-
}): Page[] {
24+
}): Page<SourceTypeName>[] {
2425
// Create a dictionary of videos keyed by their hashed ID.
2526
// This is used to efficiently look up the video for a lesson.
2627
const videoDict = makeVideosDictionary(videos);
@@ -44,8 +45,8 @@ function makeCatalogItemPages({
4445
tiCatalogItems: TiCatalogItem[];
4546
videoDict: VideosDict;
4647
metadata?: PageMetadata;
47-
}): Page[] {
48-
const pages: Page[] = [];
48+
}): Page<SourceTypeName>[] {
49+
const pages: Page<SourceTypeName>[] = [];
4950
for (const catalogItem of tiCatalogItems) {
5051
/* Create page for higher level courses.
5152
* Higher level courses are Learning Paths and Courses that have nested content.
@@ -56,14 +57,15 @@ function makeCatalogItemPages({
5657
catalogItem.learning_format === "Learning Path" ||
5758
catalogItem.learning_format === "Course"
5859
) {
59-
const page: Page = {
60+
const page: Page<SourceTypeName> = {
6061
sourceName,
6162
url: `${UNI_BASE_URL}/learning-paths/${catalogItem.slug}`,
6263
title: catalogItem.name,
6364
format: "md",
6465
body: generateContentDescriptionMarkdown({
6566
tiCatalogItem: catalogItem,
6667
}),
68+
sourceType: "university-content",
6769
metadata: {
6870
...(metadata ?? {}),
6971
tags: [...(metadata?.tags ?? []), "landing page"],
@@ -90,7 +92,7 @@ function makeCatalogItemPages({
9092
if (body.length === 0) {
9193
continue;
9294
}
93-
const page: Page = {
95+
const page: Page<SourceTypeName> = {
9496
sourceName,
9597
url: makeUniversityPageUrl({
9698
catalogItemSlug: catalogItem.slug,
@@ -104,6 +106,7 @@ function makeCatalogItemPages({
104106
}),
105107
format: "txt",
106108
body,
109+
sourceType: "university-content",
107110
metadata: {
108111
...(metadata ?? {}),
109112
// We choose to not include tags returned by the API (i.e.

packages/ingest-mongodb-public/src/sources/mongodbDotCom/WebDataSource.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ interface WebSourceParams extends WebSource {
1616
export function makeWebDataSource({
1717
name,
1818
urls,
19+
sourceType,
1920
staticMetadata,
2021
makeBrowser,
2122
}: WebSourceParams): DataSource {
@@ -37,6 +38,7 @@ export function makeWebDataSource({
3738
url,
3839
format: "md",
3940
sourceName: name,
41+
sourceType,
4042
...content,
4143
metadata: { ...content.metadata, ...staticMetadata },
4244
});

0 commit comments

Comments
 (0)