@@ -2,12 +2,27 @@ import {
2
2
updateEmbeddedContent ,
3
3
updateEmbeddedContentForPage ,
4
4
} from "./updateEmbeddedContent" ;
5
- import { persistPages } from "." ;
5
+ import {
6
+ makeMongoDbEmbeddedContentStore ,
7
+ makeMongoDbPageStore ,
8
+ MongoDbEmbeddedContentStore ,
9
+ MongoDbPageStore ,
10
+ persistPages ,
11
+ updatePages ,
12
+ } from "." ;
6
13
import { makeMockPageStore } from "../test/MockPageStore" ;
7
14
import * as chunkPageModule from "../chunk/chunkPage" ;
8
- import { EmbeddedContentStore , EmbeddedContent } from "./EmbeddedContent" ;
15
+ import {
16
+ EmbeddedContentStore ,
17
+ EmbeddedContent ,
18
+ GetSourcesMatchParams ,
19
+ } from "./EmbeddedContent" ;
9
20
import { Embedder } from "../embed" ;
10
21
import { Page , PersistedPage } from "." ;
22
+ import { strict as assert } from "assert" ;
23
+ import { MongoMemoryReplSet } from "mongodb-memory-server" ;
24
+ import { DataSource } from "../dataSources" ;
25
+ import { MongoClient } from "mongodb" ;
11
26
12
27
export const makeMockEmbeddedContentStore = ( ) : EmbeddedContentStore => {
13
28
const content : Map < string /* page url */ , EmbeddedContent [ ] > = new Map ( ) ;
@@ -29,6 +44,9 @@ export const makeMockEmbeddedContentStore = (): EmbeddedContentStore => {
29
44
metadata : {
30
45
embeddingName : "test" ,
31
46
} ,
47
// Mock implementation: the match query is accepted but not consulted, and
// the result is always an empty list of data source names.
async getDataSources(matchQuery: GetSourcesMatchParams): Promise<string[]> {
  const noDataSources: string[] = [];
  return noDataSources;
},
32
50
} ;
33
51
} ;
34
52
@@ -49,6 +67,7 @@ const embedder = {
49
67
} ,
50
68
} ;
51
69
70
+ // TODO: deprecate mock store and use mongodb-memory-server instead. https://jira.mongodb.org/browse/EAI-935
52
71
describe ( "updateEmbeddedContent" , ( ) => {
53
72
it ( "deletes embedded content for deleted page" , async ( ) => {
54
73
const pageStore = makeMockPageStore ( ) ;
@@ -207,6 +226,7 @@ describe("updateEmbeddedContent", () => {
207
226
store : embeddedContentStore ,
208
227
page,
209
228
concurrencyOptions : { createChunks : 2 } ,
229
+ chunkAlgoHash : "testchunkalgohash" ,
210
230
} ) ;
211
231
212
232
const embeddedContent = await embeddedContentStore . loadEmbeddedContent ( {
@@ -276,3 +296,231 @@ describe("updateEmbeddedContent", () => {
276
296
} ) ;
277
297
} ) ;
278
298
} ) ;
299
+
300
// These tests run against a real MongoDB instance provided by
// "mongodb-memory-server" rather than the in-memory mock store returned by
// makeMockEmbeddedContentStore.
describe("updateEmbeddedContent updates chunks based on changes to copy or changes to the chunk algo", () => {
  // Single database shared by the page store, the embedded content store and
  // the raw MongoClient used to rewrite `updated` dates.
  const databaseName = "test-all-command";

  let mongod: MongoMemoryReplSet | undefined;
  let pageStore: MongoDbPageStore;
  let embedStore: MongoDbEmbeddedContentStore;
  let uri: string;
  let mongoClient: MongoClient;
  // Embedded content created by beforeEach for pages[0] and pages[1].
  let page1Embedding: EmbeddedContent[], page2Embedding: EmbeddedContent[];
  // Persisted pages, ordered to match `mockDataSourceNames` (see beforeEach).
  let pages: PersistedPage[] = [];

  // Deterministic embedder so the tests never call a real embedding service.
  const embedder = {
    async embed() {
      return { embedding: [1, 2, 3] };
    },
  };
  // Two sources with exactly one page each.
  const mockDataSources: DataSource[] = [
    {
      name: "source1",
      fetchPages: async () => [
        {
          url: "test1.com",
          format: "html",
          sourceName: "source1",
          body: "hello source 1",
        },
      ],
    },
    {
      name: "source2",
      fetchPages: async () => [
        {
          url: "test2.com",
          format: "html",
          sourceName: "source2",
          body: "hello source 2",
        },
      ],
    },
  ];
  const mockDataSourceNames = mockDataSources.map(
    (dataSource) => dataSource.name
  );

  /**
    Sets the `updated` date of a persisted page and of one embedded content
    document from the same source, writing directly to the database.

    Fails loudly when either filter matches nothing, so a test can never run
    against dates that were silently left unchanged.
   */
  const setUpdatedDate = async (
    page: PersistedPage,
    updated: Date
  ): Promise<void> => {
    const db = mongoClient.db(databaseName);
    // Match on identifying fields only instead of spreading the whole
    // in-memory page into the filter.
    const pageResult = await db
      .collection("pages")
      .updateOne(
        { url: page.url, sourceName: page.sourceName },
        { $set: { updated } }
      );
    assert.strictEqual(
      pageResult.matchedCount,
      1,
      `expected to match exactly one page for ${page.url}`
    );
    const embeddedContentResult = await db
      .collection("embedded_content")
      .updateOne({ sourceName: page.sourceName }, { $set: { updated } });
    assert.strictEqual(
      embeddedContentResult.matchedCount,
      1,
      `expected to match embedded content for source ${page.sourceName}`
    );
  };

  beforeAll(async () => {
    mongod = await MongoMemoryReplSet.create();
    uri = mongod.getUri();
    mongoClient = new MongoClient(uri);
    await mongoClient.connect();
  });

  beforeEach(async () => {
    // set up the page store and the embedded content store
    embedStore = makeMongoDbEmbeddedContentStore({
      connectionUri: uri,
      databaseName,
      searchIndex: { embeddingName: "test-embedding" },
    });
    pageStore = makeMongoDbPageStore({
      connectionUri: uri,
      databaseName,
    });
    // create pages and verify that they have been created
    await updatePages({ sources: mockDataSources, pageStore });
    const loadedPages = await pageStore.loadPages();
    assert.strictEqual(loadedPages.length, mockDataSources.length);
    // The tests address pages by index and sources by name, so pin the page
    // order to the source order rather than relying on the order in which
    // loadPages() happens to return documents.
    pages = mockDataSourceNames.map((sourceName) => {
      const page = loadedPages.find(
        (loadedPage) => loadedPage.sourceName === sourceName
      );
      if (page === undefined) {
        throw new Error(`No persisted page found for source "${sourceName}"`);
      }
      return page;
    });
    // create embeddings for the pages and verify that they have been created
    await updateEmbeddedContent({
      since: new Date(0),
      embeddedContentStore: embedStore,
      pageStore,
      sourceNames: mockDataSourceNames,
      embedder,
    });
    page1Embedding = await embedStore.loadEmbeddedContent({
      page: pages[0],
    });
    page2Embedding = await embedStore.loadEmbeddedContent({
      page: pages[1],
    });
    assert(page1Embedding.length);
    assert(page2Embedding.length);
  });

  afterEach(async () => {
    await pageStore?.drop();
    await embedStore?.drop();
    // The stores are re-created in every beforeEach, so they are closed per
    // test; closing them only in afterAll would release just the last pair.
    // NOTE(review): presumably each store holds its own client connection;
    // confirm against the store implementations.
    await pageStore?.close();
    await embedStore?.close();
  });

  afterAll(async () => {
    await mongoClient?.close();
    await mongod?.stop();
  });

  it("should update embedded content only for pages that have been updated (copy change) after the 'since' date provided", async () => {
    // Modify dates of pages and embedded content for testing
    const sinceDate = new Date("2024-01-01");
    const beforeSinceDate = new Date("2023-01-01");
    const afterSinceDate = new Date("2025-01-01");
    // set pages[0] to be last updated before sinceDate (should not be modified)
    await setUpdatedDate(pages[0], beforeSinceDate);
    // set pages[1] to be last updated after sinceDate (should be re-chunked)
    await setUpdatedDate(pages[1], afterSinceDate);
    const originalPage1Embedding = await embedStore.loadEmbeddedContent({
      page: pages[0],
    });
    const originalPage2Embedding = await embedStore.loadEmbeddedContent({
      page: pages[1],
    });

    await updateEmbeddedContent({
      since: sinceDate,
      embeddedContentStore: embedStore,
      pageStore,
      sourceNames: mockDataSourceNames,
      embedder,
    });

    const updatedPage1Embedding = await embedStore.loadEmbeddedContent({
      page: pages[0],
    });
    const updatedPage2Embedding = await embedStore.loadEmbeddedContent({
      page: pages[1],
    });
    assert(updatedPage1Embedding.length);
    assert(updatedPage2Embedding.length);
    // page 1 keeps its timestamp, page 2 gets a new one
    expect(updatedPage1Embedding[0].updated.getTime()).toBe(
      originalPage1Embedding[0].updated.getTime()
    );
    expect(updatedPage2Embedding[0].updated.getTime()).not.toBe(
      originalPage2Embedding[0].updated.getTime()
    );
  });

  it("should update embedded content when only chunk algo has changed", async () => {
    // change the chunking algo for the second page, but not the first
    await updateEmbeddedContent({
      since: new Date(),
      embeddedContentStore: embedStore,
      pageStore,
      sourceNames: [mockDataSourceNames[0]],
      embedder,
    });
    await updateEmbeddedContent({
      since: new Date(),
      embeddedContentStore: embedStore,
      pageStore,
      sourceNames: [mockDataSourceNames[1]],
      embedder,
      chunkOptions: { chunkOverlap: 2 },
    });

    const updatedPage1Embedding = await embedStore.loadEmbeddedContent({
      page: pages[0],
    });
    const updatedPage2Embedding = await embedStore.loadEmbeddedContent({
      page: pages[1],
    });
    assert(updatedPage1Embedding.length);
    assert(updatedPage2Embedding.length);
    // compare against the embeddings created in beforeEach
    expect(updatedPage1Embedding[0].chunkAlgoHash).toBe(
      page1Embedding[0].chunkAlgoHash
    );
    expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe(
      page2Embedding[0].chunkAlgoHash
    );
  });

  it("should update embedded content when either chunk algo has changed or copy has changed", async () => {
    // SETUP: Modify dates of pages and embedded content for this test case
    const sinceDate = new Date("2024-01-01");
    const afterSinceDate = new Date("2025-01-01");
    await setUpdatedDate(pages[0], afterSinceDate);
    const originalPage1Embedding = await embedStore.loadEmbeddedContent({
      page: pages[0],
    });
    // END SETUP

    await updateEmbeddedContent({
      since: sinceDate,
      embeddedContentStore: embedStore,
      pageStore,
      sourceNames: mockDataSourceNames,
      embedder,
      chunkOptions: { chunkOverlap: 2 },
    });

    const updatedPage1Embedding = await embedStore.loadEmbeddedContent({
      page: pages[0],
    });
    const updatedPage2Embedding = await embedStore.loadEmbeddedContent({
      page: pages[1],
    });
    assert(updatedPage1Embedding.length);
    assert(updatedPage2Embedding.length);
    // both pages should be updated
    expect(updatedPage1Embedding[0].chunkAlgoHash).not.toBe(
      originalPage1Embedding[0].chunkAlgoHash
    );
    expect(updatedPage2Embedding[0].chunkAlgoHash).not.toBe(
      page2Embedding[0].chunkAlgoHash
    );
  });
});
0 commit comments