Improve default schema extraction prompt and add examples

NathalieCharbel · NathalieCharbel · commit 8458b75b877d · 2025-04-29T17:23:14.000+03:00
diff --git a/examples/README.md b/examples/README.md
@@ -3,6 +3,7 @@
 This folder contains examples usage for the different features
 supported by the `neo4j-graphrag` package:
 
+- [Automatic Schema Extraction](#schema-extraction) from PDF or text
 - [Build Knowledge Graph](#build-knowledge-graph) from PDF or text
 - [Retrieve](#retrieve) information from the graph
 - [Question Answering](#answer-graphrag) (Q&A)
@@ -122,6 +123,7 @@ are listed in [the last section of this file](#customize).
 - [Chunk embedder]()
 - Schema Builder:
   - [User-defined](./customize/build_graph/components/schema_builders/schema.py)
+  - [Automatic schema extraction](./automatic_schema_extraction/schema_from_text.py)
 - Entity Relation Extractor:
   - [LLM-based](./customize/build_graph/components/extractors/llm_entity_relation_extractor.py)
   - [LLM-based with custom prompt](./customize/build_graph/components/extractors/llm_entity_relation_extractor_with_custom_prompt.py)
diff --git a/examples/automatic_schema_extraction/schema_from_text.py b/examples/automatic_schema_extraction/schema_from_text.py
@@ -0,0 +1,130 @@
+"""This example demonstrates how to use the SchemaFromText component 
+to automatically extract a schema from text and save it to JSON and YAML files.
+
+The SchemaFromText component uses an LLM to analyze the text and identify entities,
+relations, and their properties.
+
+Note: This example requires an OpenAI API key to be set in the .env file.
+"""
+
+import asyncio
+import logging
+import os
+from dotenv import load_dotenv
+
+from neo4j_graphrag.experimental.components.schema import SchemaFromText, SchemaConfig
+from neo4j_graphrag.llm import OpenAILLM
+
+# Load environment variables (such as OPENAI_API_KEY) from the .env file
+load_dotenv()
+
+# Configure logging and show INFO-level messages from the neo4j_graphrag logger
+logging.basicConfig()
+logging.getLogger("neo4j_graphrag").setLevel(logging.INFO)
+
+# Fail early, before any LLM call is attempted, if the OpenAI API key is missing
+if not os.getenv("OPENAI_API_KEY"):
+    raise ValueError(
+        "OPENAI_API_KEY environment variable not found. "
+        "Please set it in the .env file in the root directory."
+    )
+
+# Sample text to extract schema from - it's about a company and its employees
+TEXT = """
+Acme Corporation was founded in 1985 by John Smith in New York City. 
+The company specializes in manufacturing high-quality widgets and gadgets 
+for the consumer electronics industry.
+
+Sarah Johnson joined Acme in 2010 as a Senior Engineer and was promoted to 
+Engineering Director in 2015. She oversees a team of 12 engineers working on 
+next-generation products. Sarah holds a PhD in Electrical Engineering from MIT 
+and has filed 5 patents during her time at Acme.
+
+The company expanded to international markets in 2012, opening offices in London, 
+Tokyo, and Berlin. Each office is managed by a regional director who reports 
+directly to the CEO, Michael Brown, who took over leadership in 2008.
+
+Acme's most successful product, the SuperWidget X1, was launched in 2018 and 
+has sold over 2 million units worldwide. The product was developed by a team led 
+by Robert Chen, who joined the company in 2016 after working at TechGiant for 8 years.
+
+The company currently employs 250 people across its 4 locations and had a revenue 
+of $75 million in the last fiscal year. Acme is planning to go public in 2024 
+with an estimated valuation of $500 million.
+"""
+
+# Output paths: files go to a "data" directory one level above this script's own directory
+OUTPUT_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data")
+JSON_FILE_PATH = os.path.join(OUTPUT_DIR, "extracted_schema.json")
+YAML_FILE_PATH = os.path.join(OUTPUT_DIR, "extracted_schema.yaml")
+
+
+async def extract_and_save_schema() -> SchemaConfig:
+    """Extract schema from text and save it to JSON and YAML files."""
+    
+    # Define LLM parameters
+    llm_model_params = {
+        "max_tokens": 2000,
+        "response_format": {"type": "json_object"},
+        "temperature": 0,  # Lower temperature for more consistent output
+    }
+    
+    # Create the LLM instance
+    llm = OpenAILLM(
+        model_name="gpt-4o",
+        model_params=llm_model_params,
+    )
+    
+    try:
+        # Create a SchemaFromText component with the default template
+        schema_extractor = SchemaFromText(llm=llm)
+        
+        print("Extracting schema from text...")
+        # Extract schema from text
+        inferred_schema = await schema_extractor.run(text=TEXT)
+        
+        # Ensure the output directory exists
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        
+        print(f"Saving schema to JSON file: {JSON_FILE_PATH}")
+        # Save the schema to JSON file
+        inferred_schema.store_as_json(JSON_FILE_PATH)
+        
+        print(f"Saving schema to YAML file: {YAML_FILE_PATH}")
+        # Save the schema to YAML file
+        inferred_schema.store_as_yaml(YAML_FILE_PATH)
+        
+        print("\nExtracted Schema Summary:")
+        print(f"Entities: {list(inferred_schema.entities.keys())}")
+        print(f"Relations: {list(inferred_schema.relations.keys() if inferred_schema.relations else [])}")
+        
+        if inferred_schema.potential_schema:
+            print("\nPotential Schema:")
+            for entity1, relation, entity2 in inferred_schema.potential_schema:
+                print(f"  {entity1} --[{relation}]--> {entity2}")
+        
+        return inferred_schema
+    
+    finally:
+        # Close the LLM client
+        await llm.async_client.close()
+
+
+async def main() -> None:
+    """Run the example."""
+    
+    # Extract schema and save to files
+    schema_config = await extract_and_save_schema()
+    
+    print(f"\nSchema files have been saved to:")
+    print(f"  - JSON: {JSON_FILE_PATH}")
+    print(f"  - YAML: {YAML_FILE_PATH}")
+    
+    print("\nExample of how to load the schema from files:")
+    print("  from neo4j_graphrag.experimental.components.schema import SchemaConfig")
+    print(f"  schema_from_json = SchemaConfig.from_file('{JSON_FILE_PATH}')")
+    print(f"  schema_from_yaml = SchemaConfig.from_file('{YAML_FILE_PATH}')")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())  # run the async entry point to completion
diff --git a/src/neo4j_graphrag/generation/prompts.py b/src/neo4j_graphrag/generation/prompts.py
@@ -207,24 +207,48 @@ class SchemaExtractionTemplate(PromptTemplate):
 You are a top-tier algorithm designed for extracting a labeled property graph schema in 
 structured formats.
 
-Generate the generalized graph schema based on input text. Identify key entity types,
-their relationship types, and property types whenever it is possible. Return only 
-abstract schema information, no concrete instances. Use singular PascalCase labels for 
-entity types and UPPER_SNAKE_CASE for relationship types. Include property definitions 
-only when the type can be confidently inferred, otherwise omit the properties. 
+Generate a generalized graph schema based on the input text. Identify key entity types,
+their relationship types, and property types.
+
+IMPORTANT RULES:
+1. Return only abstract schema information, not concrete instances.
+2. Use singular PascalCase labels for entity types (e.g., Person, Company, Product).
+3. Use UPPER_SNAKE_CASE for relationship types (e.g., WORKS_FOR, MANAGES).
+4. Include property definitions only when the type can be confidently inferred, otherwise omit them.
+5. When defining potential_schema, ensure that every entity and relation mentioned exists in your entities and relations lists.
+6. Do not create entity types that aren't clearly mentioned in the text.
+7. Keep your schema minimal and focused on clearly identifiable patterns in the text.
+
 Accepted property types are: BOOLEAN, DATE, DURATION, FLOAT, INTEGER, LIST, 
-LOCAL DATETIME, LOCAL TIME, POINT, STRING, ZONED DATETIME, ZONED TIME.
-Do not add extra keys or explanatory text. Return a valid JSON object without 
-back‑ticks, markdown, or comments.
- 
-For example, if the text says "Alice lives in London", the output JSON object should 
-adhere to the following format: 
-{{"entities": [{{"label": "Person", "properties": [{{"name": "name", "type": "STRING"}}]}}, 
-{{"label": "City", "properties":[{{"name": "name", "type": "STRING"}}]}}],
-"relations": [{{"label": "LIVES_IN"}}],
-"potential_schema":[[ "Person", "LIVES_IN", "City"]]}}
-
-More examples:
+LOCAL_DATETIME, LOCAL_TIME, POINT, STRING, ZONED_DATETIME, ZONED_TIME.
+
+Return a valid JSON object that follows this precise structure:
+{{
+  "entities": [
+    {{
+      "label": "Person",
+      "properties": [
+        {{
+          "name": "name",
+          "type": "STRING"
+        }}
+      ]
+    }},
+    ...
+  ],
+  "relations": [
+    {{
+      "label": "WORKS_FOR"
+    }},
+    ...
+  ],
+  "potential_schema": [
+    ["Person", "WORKS_FOR", "Company"],
+    ...
+  ]
+}}
+
+Examples:
 {examples}
 
 Input text: