@@ -387,9 +387,8 @@ def upsert(
387387 doc .embed (embedder = self .embedder )
388388 cleaned_content = self ._clean_content (doc .content )
389389 content_hash = md5 (cleaned_content .encode ()).hexdigest ()
390- _id = doc .id or content_hash
391390 record = {
392- "id" : _id ,
391+ "id" : content_hash , # use content_hash as a reproducible id to avoid duplicates during upsert
393392 "name" : doc .name ,
394393 "meta_data" : doc .meta_data ,
395394 "filters" : filters ,
@@ -406,15 +405,15 @@ def upsert(
406405 insert_stmt = postgresql .insert (self .table ).values (batch_records )
407406 upsert_stmt = insert_stmt .on_conflict_do_update (
408407 index_elements = ["id" ],
409- set_ = dict (
410- name = insert_stmt .excluded .name ,
411- meta_data = insert_stmt .excluded .meta_data ,
412- filters = insert_stmt .excluded .filters ,
413- content = insert_stmt .excluded .content ,
414- embedding = insert_stmt .excluded .embedding ,
415- usage = insert_stmt .excluded .usage ,
416- content_hash = insert_stmt .excluded .content_hash ,
417- ) ,
408+ set_ = {
409+ " name" : insert_stmt .excluded .name ,
410+ " meta_data" : insert_stmt .excluded .meta_data ,
411+ " filters" : insert_stmt .excluded .filters ,
412+ " content" : insert_stmt .excluded .content ,
413+ " embedding" : insert_stmt .excluded .embedding ,
414+ " usage" : insert_stmt .excluded .usage ,
415+ " content_hash" : insert_stmt .excluded .content_hash ,
416+ } ,
418417 )
419418 sess .execute (upsert_stmt )
420419 sess .commit () # Commit batch independently
0 commit comments