1
+ """This example demonstrates how to use the SchemaFromText component
2
+ to automatically extract a schema from text and save it to JSON and YAML files.
3
+
4
+ The SchemaFromText component uses an LLM to analyze the text and identify entities,
5
+ relations, and their properties.
6
+
7
+ Note: This example requires an OpenAI API key to be set in the .env file.
8
+ """
9
+
10
+ import asyncio
11
+ import logging
12
+ import os
13
+ from dotenv import load_dotenv
14
+
15
+ from neo4j_graphrag .experimental .components .schema import SchemaFromText , SchemaConfig
16
+ from neo4j_graphrag .llm import OpenAILLM
17
+
18
# Pull environment variables (the OpenAI key in particular) from the .env file.
load_dotenv()

# Install the default root handler and show INFO-level messages emitted by
# the neo4j_graphrag package.
logging.basicConfig()
logging.getLogger("neo4j_graphrag").setLevel(logging.INFO)

# Fail fast: without an API key the example cannot call the LLM.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError(
        "OPENAI_API_KEY environment variable not found. "
        "Please set it in the .env file in the root directory."
    )
31
+
32
# Sample input for schema extraction: a short narrative about a company, its
# employees, offices and products. It is passed unchanged to the schema
# extractor in extract_and_save_schema().
TEXT = """
Acme Corporation was founded in 1985 by John Smith in New York City.
The company specializes in manufacturing high-quality widgets and gadgets
for the consumer electronics industry.

Sarah Johnson joined Acme in 2010 as a Senior Engineer and was promoted to
Engineering Director in 2015. She oversees a team of 12 engineers working on
next-generation products. Sarah holds a PhD in Electrical Engineering from MIT
and has filed 5 patents during her time at Acme.

The company expanded to international markets in 2012, opening offices in London,
Tokyo, and Berlin. Each office is managed by a regional director who reports
directly to the CEO, Michael Brown, who took over leadership in 2008.

Acme's most successful product, the SuperWidget X1, was launched in 2018 and
has sold over 2 million units worldwide. The product was developed by a team led
by Robert Chen, who joined the company in 2016 after working at TechGiant for 8 years.

The company currently employs 250 people across its 4 locations and had a revenue
of $75 million in the last fiscal year. Acme is planning to go public in 2024
with an estimated valuation of $500 million.
"""
55
+
56
# Schema files are written to a "data" directory located next to the
# directory that contains this script (i.e. <script dir>/../data).
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_DIR = os.path.join(os.path.dirname(_SCRIPT_DIR), "data")

# One output file per supported serialization format.
JSON_FILE_PATH = os.path.join(OUTPUT_DIR, "extracted_schema.json")
YAML_FILE_PATH = os.path.join(OUTPUT_DIR, "extracted_schema.yaml")
60
+
61
+
62
async def extract_and_save_schema() -> SchemaConfig:
    """Infer a schema from ``TEXT`` with an LLM and persist it to disk.

    Runs ``SchemaFromText`` over the module-level ``TEXT`` using a ``gpt-4o``
    ``OpenAILLM``, stores the result at ``JSON_FILE_PATH`` and
    ``YAML_FILE_PATH`` (creating ``OUTPUT_DIR`` if it does not exist), and
    prints a short summary of the extracted entities, relations and
    potential schema. The LLM's async client is closed whether or not the
    extraction succeeds.

    Returns:
        The schema object returned by ``SchemaFromText.run``.
    """
    # JSON-object responses at temperature 0 for more consistent output.
    llm = OpenAILLM(
        model_name="gpt-4o",
        model_params={
            "max_tokens": 2000,
            "response_format": {"type": "json_object"},
            "temperature": 0,
        },
    )

    try:
        # Only the LLM is supplied, so the component's default template applies.
        extractor = SchemaFromText(llm=llm)

        print("Extracting schema from text...")
        schema = await extractor.run(text=TEXT)

        # The target directory may not exist on a first run.
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        print(f"Saving schema to JSON file: {JSON_FILE_PATH} ")
        schema.store_as_json(JSON_FILE_PATH)

        print(f"Saving schema to YAML file: {YAML_FILE_PATH} ")
        schema.store_as_yaml(YAML_FILE_PATH)

        print("\n Extracted Schema Summary:")
        entity_names = list(schema.entities.keys())
        print(f"Entities: {entity_names} ")
        # Relations may be missing or empty; report an empty list in that case.
        if schema.relations:
            relation_names = list(schema.relations.keys())
        else:
            relation_names = []
        print(f"Relations: {relation_names} ")

        triples = schema.potential_schema
        if triples:
            print("\n Potential Schema:")
            for source, rel_type, target in triples:
                print(f" {source} --[{rel_type} ]--> {target} ")

        return schema

    finally:
        # Release the underlying HTTP client even when extraction fails.
        await llm.async_client.close()
111
+
112
+
113
async def main() -> None:
    """Run the example end to end.

    Extracts the schema and writes it to disk via
    ``extract_and_save_schema``, then prints where the files were saved
    together with a snippet showing how to load them back with
    ``SchemaConfig.from_file``.
    """
    # Only the files written as a side effect are needed here, so the
    # returned schema object is intentionally not bound to a name.
    await extract_and_save_schema()

    print("\n Schema files have been saved to:")
    print(f" - JSON: {JSON_FILE_PATH} ")
    print(f" - YAML: {YAML_FILE_PATH} ")

    print("\n Example of how to load the schema from files:")
    print(" from neo4j_graphrag.experimental.components.schema import SchemaConfig")
    print(f" schema_from_json = SchemaConfig.from_file('{JSON_FILE_PATH} ')")
    print(f" schema_from_yaml = SchemaConfig.from_file('{YAML_FILE_PATH} ')")
127
+
128
+
129
# Run the async example only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
0 commit comments