Merge branch 'main' of https://github.com/neo4j/neo4j-genai-python into feature/kg_builder

stellasia · stellasia · commit 197afe2ec6bd · 2024-08-08T13:27:33.000+02:00
# Conflicts:
#	CHANGELOG.md
diff --git a/.github/workflows/pr-e2e-tests.yaml b/.github/workflows/pr-e2e-tests.yaml
@@ -15,11 +15,10 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.8', '3.12']
         neo4j-version:
           - 5
         neo4j-edition:
-          - community
           - enterprise
     services:
       t2v-transformers:
@@ -50,6 +49,10 @@ jobs:
     steps:
       - name: Check out repository code
         uses: actions/checkout@v4
+      - name: Docker Prune
+        run: |
+          docker system prune -af
+          docker volume prune -f
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:
diff --git a/.github/workflows/scheduled-e2e-tests.yaml b/.github/workflows/scheduled-e2e-tests.yaml
@@ -0,0 +1,92 @@
+name: 'Neo4j-GenAI Scheduled E2E Tests'
+
+on:
+  schedule:
+    - cron:  '0 6,9,12,15,18 * * 1-5'  # Runs every 3 hours from 06:00 to 18:00 UTC, Monday to Friday
+  push:
+    branches:
+      - main
+
+jobs:
+  e2e-tests:
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 6
+      matrix:
+        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        neo4j-version:
+          - 5
+        neo4j-edition:
+          - community
+          - enterprise
+    services:
+      t2v-transformers:
+        image: cr.weaviate.io/semitechnologies/transformers-inference:sentence-transformers-all-MiniLM-L6-v2-onnx
+        env:
+          ENABLE_CUDA: '0'
+      weaviate:
+        image: cr.weaviate.io/semitechnologies/weaviate:1.25.1
+        env:
+          TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080'
+          AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
+          DEFAULT_VECTORIZER_MODULE: 'text2vec-transformers'
+          ENABLE_MODULES: 'text2vec-transformers'
+          CLUSTER_HOSTNAME: 'node1'
+        ports:
+          - 8080:8080
+          - 50051:50051
+      neo4j:
+        image: neo4j:${{ matrix.neo4j-version }}-${{ matrix.neo4j-edition }}
+        env:
+          NEO4J_AUTH: neo4j/password
+          NEO4J_ACCEPT_LICENSE_AGREEMENT: 'eval'
+          NEO4J_PLUGINS: '["apoc"]'
+        ports:
+          - 7687:7687
+          - 7474:7474
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+      - name: Docker Prune
+        run: |
+          docker system prune -af
+          docker volume prune -f
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install Poetry
+        uses: snok/install-poetry@v1
+        with:
+          virtualenvs-create: true
+          virtualenvs-in-project: true
+          installer-parallel: true
+      - name: Set Python version for Poetry
+        run: poetry env use python${{ matrix.python-version }}
+      - name: Load cached venv
+        id: cached-poetry-dependencies
+        uses: actions/cache@v4
+        with:
+          path: .venv
+          key: ${{ runner.os }}-venv-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }}
+      - name: Install dependencies
+        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
+        run: poetry install --no-interaction --no-root
+      - name: Install root project
+        run: poetry install --no-interaction
+      - name: Install dev dependencies
+        run: poetry install --with dev
+      - name: Wait for Weaviate to start
+        shell: bash
+        run: |
+          set +e
+          count=0; until curl -s --fail localhost:8080/v1/.well-known/ready; do ((count++)); [ $count -ge 10 ] && echo "Reached maximum retry limit" && exit 1; sleep 15; done
+      - name: Run tests
+        shell: bash
+        run: |
+          if [[ "${{ matrix.neo4j-edition }}" == "community" ]]; then
+              poetry run pytest -m 'not enterprise_only' ./tests/e2e
+          else
+              poetry run pytest ./tests/e2e
+          fi
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,8 +3,13 @@
 ## Next
 
 ### Added
+- Add optional custom_prompt arg to the Text2CypherRetriever class.
 - Introduced support for Component/Pipeline flexible architecture
 
+### Changed
+- `GraphRAG.search` method first parameter has been renamed `query_text` (was `query`) for consistency with the retrievers interface.
+- Made `GraphRAG.search` method backwards compatible with the query parameter, raising warnings to encourage using query_text instead.
+
 ## 0.3.1
 
 ### Fixed
diff --git a/README.md b/README.md
@@ -68,7 +68,7 @@ Assumption: Neo4j running with a defined vector index
 
 ```python
 from neo4j import GraphDatabase
-from neo4j_genai.indexes import upsert_query
+from neo4j_genai.indexes import upsert_vector
 
 URI = "neo4j://localhost:7687"
 AUTH = ("neo4j", "password")
@@ -78,7 +78,7 @@ driver = GraphDatabase.driver(URI, auth=AUTH)
 
 # Upsert the vector
 vector = ...
-upsert_query(
+upsert_vector(
     driver,
     node_id=1,
     embedding_property="vectorProperty",
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -115,7 +115,7 @@ Note that the below example is not the only way you can upsert data into your Ne
 .. code:: python
 
     from neo4j import GraphDatabase
-    from neo4j_genai.indexes import upsert_query
+    from neo4j_genai.indexes import upsert_vector
 
     URI = "neo4j://localhost:7687"
     AUTH = ("neo4j", "password")
@@ -125,7 +125,7 @@ Note that the below example is not the only way you can upsert data into your Ne
 
     # Upsert the vector
     vector = ...
-    upsert_query(
+    upsert_vector(
         driver,
         node_id=1,
         embedding_property="vectorProperty",
diff --git a/examples/graphrag_custom_prompt.py b/examples/graphrag_custom_prompt.py
@@ -60,7 +60,7 @@ def formatter(record: neo4j.Record) -> RetrieverResultItem:
     {context}
 
     Question:
-    {query}
+    {query_text}
 
     Answer:
     """
diff --git a/src/neo4j_genai/generation/graphrag.py b/src/neo4j_genai/generation/graphrag.py
@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import logging
+import warnings
 from typing import Any, Optional
 
 from pydantic import ValidationError
@@ -53,43 +54,60 @@ def __init__(
 
     def search(
         self,
-        query: str,
+        query_text: str = "",
         examples: str = "",
         retriever_config: Optional[dict[str, Any]] = None,
         return_context: bool = False,
+        query: Optional[str] = None,
     ) -> RagResultModel:
         """This method performs a full RAG search:
         1. Retrieval: context retrieval
         2. Augmentation: prompt formatting
         3. Generation: answer generation with LLM
 
         Args:
-            query (str): The user question
+            query_text (str): The user question
             examples: Examples added to the LLM prompt.
             retriever_config (Optional[dict]): Parameters passed to the retriever
                 search method; e.g.: top_k
             return_context (bool): Whether to return the retriever result (default: False)
+            query (Optional[str]): The user question. Will be deprecated in favor of query_text.
 
         Returns:
             RagResultModel: The LLM-generated answer
 
         """
         try:
+            if query is not None:  # legacy 'query' keyword was supplied
+                if query_text:
+                    warnings.warn(
+                        "Both 'query' and 'query_text' are provided, 'query_text' will be used.",
+                        DeprecationWarning,
+                        stacklevel=2,
+                    )
+                elif isinstance(query, str):  # only 'query' given: honor it, but warn
+                    warnings.warn(
+                        "'query' is deprecated and will be removed in a future version, please use 'query_text' instead.",
+                        DeprecationWarning,
+                        stacklevel=2,
+                    )
+                    query_text = query
+
             validated_data = RagSearchModel(
-                query=query,
+                query_text=query_text,
                 examples=examples,
                 retriever_config=retriever_config or {},
                 return_context=return_context,
             )
         except ValidationError as e:
             raise SearchValidationError(e.errors())
-        query = validated_data.query
+        query_text = validated_data.query_text
         retriever_result: RetrieverResult = self.retriever.search(
-            query_text=query, **validated_data.retriever_config
+            query_text=query_text, **validated_data.retriever_config
         )
         context = "\n".join(item.content for item in retriever_result.items)
         prompt = self.prompt_template.format(
-            query=query, context=context, examples=validated_data.examples
+            query_text=query_text, context=context, examples=validated_data.examples
         )
         logger.debug(f"RAG: retriever_result={retriever_result}")
         logger.debug(f"RAG: prompt={prompt}")
diff --git a/src/neo4j_genai/generation/prompts.py b/src/neo4j_genai/generation/prompts.py
@@ -87,14 +87,14 @@ class RagTemplate(PromptTemplate):
 {examples}
 
 Question:
-{query}
+{query_text}
 
 Answer:
 """
-    EXPECTED_INPUTS = ["context", "query", "examples"]
+    EXPECTED_INPUTS = ["context", "query_text", "examples"]
 
-    def format(self, query: str, context: str, examples: str) -> str:
-        return super().format(query=query, context=context, examples=examples)
+    def format(self, query_text: str, context: str, examples: str) -> str:
+        return super().format(query_text=query_text, context=context, examples=examples)
 
 
 class Text2CypherTemplate(PromptTemplate):
diff --git a/src/neo4j_genai/generation/types.py b/src/neo4j_genai/generation/types.py
@@ -39,7 +39,7 @@ def check_llm(cls, value: Any) -> Any:
 
 
 class RagSearchModel(BaseModel):
-    query: str
+    query_text: str
     examples: str = ""
     retriever_config: dict[str, Any] = {}
     return_context: bool = False
diff --git a/src/neo4j_genai/retrievers/text2cypher.py b/src/neo4j_genai/retrievers/text2cypher.py
@@ -55,6 +55,7 @@ class Text2CypherRetriever(Retriever):
         llm (neo4j_genai.generation.llm.LLMInterface): LLM object to generate the Cypher query.
         neo4j_schema (Optional[str]): Neo4j schema used to generate the Cypher query.
         examples (Optional[list[str], optional): Optional user input/query pairs for the LLM to use as examples.
+        custom_prompt (Optional[str]): Optional custom prompt to use instead of the auto-generated prompt. If provided, the neo4j_schema and examples args are not included in the prompt.
 
     Raises:
         RetrieverInitializationError: If validation of the input arguments fail.
@@ -69,6 +70,7 @@ def __init__(
         result_formatter: Optional[
             Callable[[neo4j.Record], RetrieverResultItem]
         ] = None,
+        custom_prompt: Optional[str] = None,
     ) -> None:
         try:
             driver_model = Neo4jDriverModel(driver=driver)
@@ -82,6 +84,7 @@ def __init__(
                 neo4j_schema_model=neo4j_schema_model,
                 examples=examples,
                 result_formatter=result_formatter,
+                custom_prompt=custom_prompt,
             )
         except ValidationError as e:
             raise RetrieverInitializationError(e.errors()) from e
@@ -90,12 +93,17 @@ def __init__(
         self.llm = validated_data.llm_model.llm
         self.examples = validated_data.examples
         self.result_formatter = validated_data.result_formatter
+        self.custom_prompt = validated_data.custom_prompt
         try:
-            self.neo4j_schema = (
-                validated_data.neo4j_schema_model.neo4j_schema
-                if validated_data.neo4j_schema_model
-                else get_schema(validated_data.driver_model.driver)
-            )
+            if (
+                not validated_data.custom_prompt
+            ):  # don't need schema for a custom prompt
+                self.neo4j_schema = (
+                    validated_data.neo4j_schema_model.neo4j_schema
+                    if validated_data.neo4j_schema_model
+                    else get_schema(validated_data.driver_model.driver)
+                )
+
         except (Neo4jError, DriverError) as e:
             error_message = getattr(e, "message", str(e))
             raise SchemaFetchError(
@@ -124,12 +132,16 @@ def get_search_results(
         except ValidationError as e:
             raise SearchValidationError(e.errors()) from e
 
-        prompt_template = Text2CypherTemplate()
-        prompt = prompt_template.format(
-            schema=self.neo4j_schema,
-            examples="\n".join(self.examples) if self.examples else "",
-            query=validated_data.query_text,
-        )
+        if not self.custom_prompt:
+            prompt_template = Text2CypherTemplate()
+            prompt = prompt_template.format(
+                schema=self.neo4j_schema,
+                examples="\n".join(self.examples) if self.examples else "",
+                query=validated_data.query_text,
+            )
+        else:
+            prompt = self.custom_prompt
+
         logger.debug("Text2CypherRetriever prompt: %s", prompt)
 
         try:
diff --git a/src/neo4j_genai/types.py b/src/neo4j_genai/types.py
@@ -240,3 +240,4 @@ class Text2CypherRetrieverModel(BaseModel):
     neo4j_schema_model: Optional[Neo4jSchemaModel] = None
     examples: Optional[list[str]] = None
     result_formatter: Optional[Callable[[neo4j.Record], RetrieverResultItem]] = None
+    custom_prompt: Optional[str] = None
diff --git a/tests/e2e/test_graphrag_e2e.py b/tests/e2e/test_graphrag_e2e.py
@@ -52,7 +52,7 @@ def test_graphrag_happy_path(
     llm.invoke.return_value = LLMResponse(content="some text")
 
     result = rag.search(
-        query="biology",
+        query_text="biology",
         retriever_config={
             "top_k": 2,
         },
@@ -96,7 +96,7 @@ def test_graphrag_happy_path_return_context(
     llm.invoke.return_value = LLMResponse(content="some text")
 
     result = rag.search(
-        query="biology",
+        query_text="biology",
         retriever_config={
             "top_k": 2,
         },
@@ -142,7 +142,7 @@ def test_graphrag_happy_path_examples(
     llm.invoke.return_value = LLMResponse(content="some text")
 
     result = rag.search(
-        query="biology",
+        query_text="biology",
         retriever_config={
             "top_k": 2,
         },
@@ -186,7 +186,7 @@ def test_graphrag_llm_error(
 
     with pytest.raises(LLMGenerationError):
         rag.search(
-            query="biology",
+            query_text="biology",
         )
 
 
@@ -203,5 +203,5 @@ def test_graphrag_retrieval_error(
 
     with pytest.raises(TypeError):
         rag.search(
-            query="biology",
+            query_text="biology",
         )
diff --git a/tests/unit/retrievers/test_text2cypher.py b/tests/unit/retrievers/test_text2cypher.py
diff --git a/tests/unit/test_graphrag.py b/tests/unit/test_graphrag.py