Skip to content

Commit 39fd4f7

Browse files
authored
Add ability to load PDF from HTTP + specify file system from string (useful for config files) (#230)
* Fix PdfLoader, add ability to specify file system from string
* Make PdfLoader work with config files
1 parent f8092fc commit 39fd4f7

File tree

8 files changed

+221
-12
lines changed

8 files changed

+221
-12
lines changed

examples/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ are listed in [the last section of this file](#customize).
1515
- [End to end PDF to graph simple pipeline](build_graph/simple_kg_builder_from_pdf.py)
1616
- [End to end text to graph simple pipeline](build_graph/simple_kg_builder_from_text.py)
1717
- [Build KG pipeline from config file](build_graph/from_config_files/simple_kg_pipeline_from_config_file.py)
18+
- [Build KG pipeline with PDF URL](build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py)
1819

1920

2021
## Retrieve
examples/build_graph/from_config_files/simple_kg_pipeline_config_url.json

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
{
2+
"version_": "1",
3+
"template_": "SimpleKGPipeline",
4+
"neo4j_config": {
5+
"params_": {
6+
"uri": {
7+
"resolver_": "ENV",
8+
"var_": "NEO4J_URI"
9+
},
10+
"user": {
11+
"resolver_": "ENV",
12+
"var_": "NEO4J_USER"
13+
},
14+
"password": {
15+
"resolver_": "ENV",
16+
"var_": "NEO4J_PASSWORD"
17+
}
18+
}
19+
},
20+
"llm_config": {
21+
"class_": "OpenAILLM",
22+
"params_": {
23+
"api_key": {
24+
"resolver_": "ENV",
25+
"var_": "OPENAI_API_KEY"
26+
},
27+
"model_name": "gpt-4o",
28+
"model_params": {
29+
"temperature": 0,
30+
"max_tokens": 2000,
31+
"response_format": {"type": "json_object"}
32+
}
33+
}
34+
},
35+
"embedder_config": {
36+
"class_": "OpenAIEmbeddings",
37+
"params_": {
38+
"api_key": {
39+
"resolver_": "ENV",
40+
"var_": "OPENAI_API_KEY"
41+
}
42+
}
43+
},
44+
"from_pdf": true,
45+
"entities": [
46+
"Person",
47+
{
48+
"label": "House",
49+
"description": "Family the person belongs to",
50+
"properties": [
51+
{
52+
"name": "name",
53+
"type": "STRING"
54+
}
55+
]
56+
},
57+
{
58+
"label": "Planet",
59+
"properties": [
60+
{
61+
"name": "name",
62+
"type": "STRING"
63+
},
64+
{
65+
"name": "weather",
66+
"type": "STRING"
67+
}
68+
]
69+
}
70+
],
71+
"relations": [
72+
"PARENT_OF",
73+
{
74+
"label": "HEIR_OF",
75+
"description": "Used for inheritor relationship between father and sons"
76+
},
77+
{
78+
"label": "RULES",
79+
"properties": [
80+
{
81+
"name": "fromYear",
82+
"type": "INTEGER"
83+
}
84+
]
85+
}
86+
],
87+
"potential_schema": [
88+
[
89+
"Person",
90+
"PARENT_OF",
91+
"Person"
92+
],
93+
[
94+
"Person",
95+
"HEIR_OF",
96+
"House"
97+
],
98+
[
99+
"House",
100+
"RULES",
101+
"Planet"
102+
]
103+
],
104+
"text_splitter": {
105+
"class_": "text_splitters.fixed_size_splitter.FixedSizeSplitter",
106+
"params_": {
107+
"chunk_size": 100,
108+
"chunk_overlap": 10
109+
}
110+
},
111+
"pdf_loader": {
112+
"class_": "pdf_loader.PdfLoader",
113+
"run_params_": {
114+
"fs": "http"
115+
}
116+
},
117+
"perform_entity_resolution": true
118+
}
examples/build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""In this example, the pipeline is defined in a JSON ('simple_kg_pipeline_config.json')
2+
or YAML ('simple_kg_pipeline_config.yaml') file. This variant loads 'simple_kg_pipeline_config_url.json' and reads the PDF to process from a URL.
3+
4+
According to the configuration file, some parameters will be read from the env vars
5+
(Neo4j credentials and the OpenAI API key).
6+
"""
7+
8+
import asyncio
9+
import logging
10+
11+
## If env vars are in a .env file, uncomment:
12+
## (requires pip install python-dotenv)
13+
# from dotenv import load_dotenv
14+
# load_dotenv()
15+
# env vars manually set for testing:
16+
import os
17+
from pathlib import Path
18+
19+
from neo4j_graphrag.experimental.pipeline.config.runner import PipelineRunner
20+
from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult
21+
22+
logging.basicConfig()
23+
logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG)
24+
25+
os.environ["NEO4J_URI"] = "bolt://localhost:7687"
26+
os.environ["NEO4J_USER"] = "neo4j"
27+
os.environ["NEO4J_PASSWORD"] = "password"
28+
# os.environ["OPENAI_API_KEY"] = "sk-..."
29+
30+
31+
root_dir = Path(__file__).parent
32+
file_path = root_dir / "simple_kg_pipeline_config_url.json"
33+
34+
35+
# File to process
36+
URL = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf"
37+
38+
39+
async def main() -> PipelineResult:
40+
pipeline = PipelineRunner.from_config_file(file_path)
41+
return await pipeline.run({"file_path": URL})
42+
43+
44+
if __name__ == "__main__":
45+
print(asyncio.run(main()))
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
"""Use the PdfLoader component to extract text from a remote PDF file."""
2+
3+
import asyncio
4+
5+
from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader
6+
7+
url = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf"
8+
9+
10+
async def main() -> None:
11+
loader = PdfLoader()
12+
document = await loader.run(filepath=url, fs="http")
13+
print(document.text[:100])
14+
15+
16+
if __name__ == "__main__":
17+
asyncio.run(main())

src/neo4j_graphrag/experimental/components/pdf_loader.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,10 @@ def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
6262
class PdfLoader(DataLoader):
6363
@staticmethod
6464
def load_file(
65-
file: Union[Path, str],
65+
file: str,
6666
fs: AbstractFileSystem,
6767
) -> str:
6868
"""Parse PDF file and return text."""
69-
if not isinstance(file, Path):
70-
file = Path(file)
71-
7269
try:
7370
with fs.open(file, "rb") as fp:
7471
stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
@@ -85,16 +82,21 @@ def load_file(
8582

8683
async def run(
8784
self,
88-
filepath: Path,
85+
filepath: Union[str, Path],
8986
metadata: Optional[Dict[str, str]] = None,
90-
fs: Optional[AbstractFileSystem] = None,
87+
fs: Optional[Union[AbstractFileSystem, str]] = None,
9188
) -> PdfDocument:
92-
fs = fs or LocalFileSystem()
89+
if not isinstance(filepath, str):
90+
filepath = str(filepath)
91+
if isinstance(fs, str):
92+
fs = fsspec.filesystem(fs)
93+
elif fs is None:
94+
fs = LocalFileSystem()
9395
text = self.load_file(filepath, fs)
9496
return PdfDocument(
9597
text=text,
9698
document_info=DocumentInfo(
97-
path=str(filepath),
99+
path=filepath,
98100
metadata=self.get_document_metadata(text, metadata),
99101
),
100102
)

src/neo4j_graphrag/experimental/pipeline/config/object_config.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,10 @@ class ComponentConfig(ObjectConfig[Component]):
254254
DEFAULT_MODULE = "neo4j_graphrag.experimental.components"
255255
INTERFACE = Component
256256

257+
def get_run_params(self, resolved_data: dict[str, Any]) -> dict[str, Any]:
258+
self._global_data = resolved_data
259+
return self.resolve_params(self.run_params_)
260+
257261

258262
class ComponentType(RootModel): # type: ignore[type-arg]
259263
root: Union[Component, ComponentConfig]
@@ -264,3 +268,8 @@ def parse(self, resolved_data: dict[str, Any] | None = None) -> Component:
264268
if isinstance(self.root, Component):
265269
return self.root
266270
return self.root.parse(resolved_data)
271+
272+
def get_run_params(self, resolved_data: dict[str, Any]) -> dict[str, Any]:
273+
if isinstance(self.root, Component):
274+
return {}
275+
return self.root.get_run_params(resolved_data)

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,23 @@ def _get_pdf_loader(self) -> Optional[PdfLoader]:
9090
return self.pdf_loader.parse(self._global_data) # type: ignore
9191
return PdfLoader()
9292

93+
def _get_run_params_for_pdf_loader(self) -> dict[str, Any]:
94+
if not self.from_pdf:
95+
return {}
96+
if self.pdf_loader:
97+
return self.pdf_loader.get_run_params(self._global_data)
98+
return {}
99+
93100
def _get_splitter(self) -> TextSplitter:
94101
if self.text_splitter:
95102
return self.text_splitter.parse(self._global_data) # type: ignore
96103
return FixedSizeSplitter()
97104

105+
def _get_run_params_for_splitter(self) -> dict[str, Any]:
106+
if self.text_splitter:
107+
return self.text_splitter.get_run_params(self._global_data)
108+
return {}
109+
98110
def _get_chunk_embedder(self) -> TextChunkEmbedder:
99111
return TextChunkEmbedder(embedder=self.get_default_embedder())
100112

@@ -123,6 +135,11 @@ def _get_writer(self) -> KGWriter:
123135
neo4j_database=self.neo4j_database,
124136
)
125137

138+
def _get_run_params_for_writer(self) -> dict[str, Any]:
139+
if self.kg_writer:
140+
return self.kg_writer.get_run_params(self._global_data)
141+
return {}
142+
126143
def _get_resolver(self) -> Optional[EntityResolver]:
127144
if not self.perform_entity_resolution:
128145
return None

tests/unit/experimental/components/test_pdf_loader.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,17 @@ def pdf_loader() -> PdfLoader:
3030

3131

3232
@pytest.fixture
33-
def dummy_pdf_path() -> Path:
34-
return BASE_DIR / "sample_data/lorem_ipsum.pdf"
33+
def dummy_pdf_path() -> str:
34+
return str(BASE_DIR / "sample_data/lorem_ipsum.pdf")
3535

3636

37-
def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
37+
def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
3838
expected_content = "Lorem ipsum dolor sit amet."
3939
actual_content = pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
4040
assert actual_content == expected_content
4141

4242

43-
def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
43+
def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
4444
with patch(
4545
"fsspec.implementations.local.LocalFileSystem.open",
4646
side_effect=Exception("Failed to open"),

0 commit comments

Comments
 (0)