You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I recently picked up langchain/langgraph and I am learning by creating a small RAG application (duh! almost everyone does that :) ).
So I implemented this ParentDocumentRetrieverPlus which allows saving documents in parent and child vector-stores which in my case are two different collections in my PGVector store.
I got really good results after this.
I am wondering whether toy code like this would be worth contributing to the community part of LangChain.
import logging
import uuid
from typing import List, Optional
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from langchain_core.vectorstores import VectorStore
# Logger for this module, named after the module itself.
logger = logging.getLogger(__name__)
# NOTE(review): this sets the logger's own level to DEBUG at import time, so it
# no longer inherits the level configured on its parent loggers. Handy while
# experimenting; confirm it is intended before shipping this as library code.
logger.setLevel(logging.DEBUG)
class ParentDocumentRetrieverPlus(BaseRetriever):
    """Retriever that searches small child chunks but returns their parents.

    Parent documents are written to ``vector_store`` under generated ids.
    Each parent is additionally split into child chunks that carry the
    parent's id in ``metadata["parent_id"]`` and are written to
    ``child_vector_store``. A query is similarity-searched against the child
    store, and the parents referenced by the hits are fetched by id.
    """

    # Store holding the parent documents (looked up with ``get_by_ids``).
    vector_store: VectorStore
    # Store holding the child chunks (the one that is similarity-searched).
    child_vector_store: VectorStore
    # Splitter used to produce child chunks. When not provided, a
    # RecursiveCharacterTextSplitter built from ``child_split_length`` and
    # ``child_split_overlap`` is used instead.
    child_splitter: Optional[TextSplitter] = None
    # Chunk size / overlap for the default child splitter only.
    child_split_length: Optional[int] = 128
    child_split_overlap: Optional[int] = 32
    # Optional splitter applied to the input documents before they are stored
    # as parents. When not provided, the documents are stored as given.
    parent_splitter: Optional[TextSplitter] = None

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return the parent documents whose child chunks match ``query``.

        Args:
            query: Text to similarity-search against the child vector store.

        Returns:
            The matching parent documents, ordered by the rank of their best
            matching child when the store reports document ids. An empty list
            is returned when nothing matches.
        """
        logger.debug("_get_relevant_documents: query: %s", query)
        child_docs = self.child_vector_store.similarity_search(query)
        logger.debug("_get_relevant_documents: child docs: %s", child_docs)

        if not child_docs:
            logger.debug("_get_relevant_documents: no child docs found:")
            # Always hand back a list so callers can iterate unconditionally.
            return []

        # De-duplicate while keeping the similarity ranking of the children
        # (a set would produce an arbitrary order). Chunks without a parent id
        # were not written by this retriever and are skipped, not fatal.
        parent_ids = list(
            dict.fromkeys(
                child_doc.metadata["parent_id"]
                for child_doc in child_docs
                if child_doc.metadata.get("parent_id") is not None
            )
        )
        if not parent_ids:
            logger.debug("_get_relevant_documents: no child docs found:")
            return []

        logger.debug(
            "_get_relevant_documents: getting documents with parent ids: %s",
            parent_ids,
        )
        docs = list(self.vector_store.get_by_ids(parent_ids))

        # get_by_ids does not promise to preserve the order of the requested
        # ids, so restore the relevance order via each document's id. The sort
        # is stable: documents without a recognised id keep the store's order
        # and go last.
        rank = {parent_id: pos for pos, parent_id in enumerate(parent_ids)}
        docs.sort(key=lambda doc: rank.get(getattr(doc, "id", None), len(rank)))

        logger.debug("_get_relevant_documents: returning parent docs: %s", len(docs))
        return docs

    def add_documents(self, documents: List[Document]) -> None:
        """Store ``documents`` as parents and their chunks as children.

        Args:
            documents: Documents to index. They are first split with
                ``parent_splitter`` when one is configured.
        """
        parent_docs = list(documents)
        logger.debug("original documents: %s", len(parent_docs))

        # First split the parent documents if needed.
        if self.parent_splitter is not None:
            parent_docs = self.parent_splitter.split_documents(parent_docs)
            logger.debug("original documents after split: %s", len(parent_docs))

        if not parent_docs:
            # Nothing to index; avoid calling the stores with empty batches.
            logger.debug("add_documents: no documents to add")
            return

        # Build the default splitter locally instead of caching it on the
        # instance, so later changes to child_split_length/overlap take effect.
        child_splitter = self.child_splitter
        if child_splitter is None:
            child_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.child_split_length,
                chunk_overlap=self.child_split_overlap,
            )

        # One fresh id per parent; every child records the id of its parent.
        ids = [str(uuid.uuid4()) for _ in parent_docs]
        child_docs: List[Document] = []
        for parent_id, parent_doc in zip(ids, parent_docs):
            chunks = child_splitter.split_documents([parent_doc])
            for chunk in chunks:
                chunk.metadata["parent_id"] = parent_id
            child_docs.extend(chunks)

        logger.debug("parent documents: %s", len(parent_docs))
        logger.debug("child documents : %s", len(child_docs))

        # Save the parents under the generated ids, then the children.
        self.vector_store.add_documents(parent_docs, ids=ids)
        if child_docs:
            self.child_vector_store.add_documents(child_docs)
        logger.debug("parent and child document saved:")
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
Uh oh!
There was an error while loading. Please reload this page.
-
Hi,
I recently picked up langchain/langgraph and I am learning by creating a small RAG application (duh! almost everyone does that :) ).
So I implemented this ParentDocumentRetrieverPlus which allows saving documents in parent and child vector-stores which in my case are two different collections in my PGVector store.
I got really good results after this.
I am wondering whether toy code like this would be worth contributing to the community part of LangChain.
Beta Was this translation helpful? Give feedback.
All reactions