Skip to content

Commit 9718210

Browse files
authored
Llama index readers gitbook (#16862)
1 parent 00c27f6 commit 9718210

File tree

19 files changed

+5651
-0
lines changed

19 files changed

+5651
-0
lines changed
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
llama_index/_static
2+
.DS_Store
3+
# Byte-compiled / optimized / DLL files
4+
__pycache__/
5+
*.py[cod]
6+
*$py.class
7+
8+
# C extensions
9+
*.so
10+
11+
# Distribution / packaging
12+
.Python
13+
bin/
14+
build/
15+
develop-eggs/
16+
dist/
17+
downloads/
18+
eggs/
19+
.eggs/
20+
etc/
21+
include/
22+
lib/
23+
lib64/
24+
parts/
25+
sdist/
26+
share/
27+
var/
28+
wheels/
29+
pip-wheel-metadata/
30+
share/python-wheels/
31+
*.egg-info/
32+
.installed.cfg
33+
*.egg
34+
MANIFEST
35+
36+
# PyInstaller
37+
# Usually these files are written by a python script from a template
38+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
39+
*.manifest
40+
*.spec
41+
42+
# Installer logs
43+
pip-log.txt
44+
pip-delete-this-directory.txt
45+
46+
# Unit test / coverage reports
47+
htmlcov/
48+
.tox/
49+
.nox/
50+
.coverage
51+
.coverage.*
52+
.cache
53+
nosetests.xml
54+
coverage.xml
55+
*.cover
56+
*.py,cover
57+
.hypothesis/
58+
.pytest_cache/
59+
.ruff_cache
60+
61+
# Translations
62+
*.mo
63+
*.pot
64+
65+
# Django stuff:
66+
*.log
67+
local_settings.py
68+
db.sqlite3
69+
db.sqlite3-journal
70+
71+
# Flask stuff:
72+
instance/
73+
.webassets-cache
74+
75+
# Scrapy stuff:
76+
.scrapy
77+
78+
# Sphinx documentation
79+
docs/_build/
80+
81+
# PyBuilder
82+
target/
83+
84+
# Jupyter Notebook
85+
.ipynb_checkpoints
86+
notebooks/
87+
88+
# IPython
89+
profile_default/
90+
ipython_config.py
91+
92+
# pyenv
93+
.python-version
94+
95+
# pipenv
96+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
98+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
99+
# install all needed dependencies.
100+
#Pipfile.lock
101+
102+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
103+
__pypackages__/
104+
105+
# Celery stuff
106+
celerybeat-schedule
107+
celerybeat.pid
108+
109+
# SageMath parsed files
110+
*.sage.py
111+
112+
# Environments
113+
.env
114+
.venv
115+
env/
116+
venv/
117+
ENV/
118+
env.bak/
119+
venv.bak/
120+
pyvenv.cfg
121+
122+
# Spyder project settings
123+
.spyderproject
124+
.spyproject
125+
126+
# Rope project settings
127+
.ropeproject
128+
129+
# mkdocs documentation
130+
/site
131+
132+
# mypy
133+
.mypy_cache/
134+
.dmypy.json
135+
dmypy.json
136+
137+
# Pyre type checker
138+
.pyre/
139+
140+
# Jetbrains
141+
.idea
142+
modules/
143+
*.swp
144+
145+
# VsCode
146+
.vscode
147+
148+
# pipenv
149+
Pipfile
150+
Pipfile.lock
151+
152+
# pyright
153+
pyrightconfig.json
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
poetry_requirements(
2+
name="poetry",
3+
)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# CHANGELOG
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
2+
3+
help: ## Show all Makefile targets.
4+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
5+
6+
format: ## Run code autoformatters (black).
7+
pre-commit install
8+
git ls-files | xargs pre-commit run black --files
9+
10+
lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
11+
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
12+
13+
test: ## Run tests via pytest.
14+
python -m unittest discover tests
15+
16+
watch-docs: ## Build and watch documentation.
17+
sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# LlamaIndex Readers Integration: Gitbook
2+
3+
## Overview
4+
5+
Simple Gitbook Reader loads data from a Gitbook space. It collects the pages of the space and converts their contents into documents used by LlamaIndex.
6+
7+
### Installation
8+
9+
You can install Gitbook Reader via pip:
10+
11+
```bash
12+
pip install llama-index-readers-gitbook
13+
```
14+
15+
### Usage
16+
17+
```python
18+
from llama_index.readers.gitbook import SimpleGitbookReader
19+
20+
# Initialize SimpleGitbookReader
21+
reader = SimpleGitbookReader(
22+
api_token="<Gitbook API Token>", # Gitbook API Token
23+
)
24+
25+
# load data from Gitbook
26+
documents = reader.load_data(
27+
space_id="<Gitbook Space Id>", # Id of the gitbook space
28+
metadata_names=None, # Names of the fields to add to metadata attribute (available: 'path', 'title', 'description', 'parent')
29+
)
30+
```
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
python_sources()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from llama_index.readers.gitbook.base import SimpleGitbookReader
2+
3+
__all__ = ["SimpleGitbookReader"]
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
from typing import List, Optional
2+
3+
from llama_index.core.readers.base import BaseReader
4+
from llama_index.core.schema import Document
5+
6+
from llama_index.readers.gitbook.gitbook_client import GitbookClient
7+
8+
VALID_METADATA_FIELDS = {"path", "title", "description", "parent"}
9+
10+
11+
class SimpleGitbookReader(BaseReader):
    """Simple gitbook reader.

    Convert each gitbook page into Document used by LlamaIndex.

    Args:
        api_token (str): Gitbook API Token.
        api_url (Optional[str]): Gitbook API Endpoint. Passed through to
            ``GitbookClient`` unchanged; defaults to None.
    """

    def __init__(self, api_token: str, api_url: Optional[str] = None) -> None:
        """Initialize with parameters."""
        self.client = GitbookClient(api_token, api_url)

    def load_data(
        self,
        space_id: str,
        metadata_names: Optional[List[str]] = None,
        show_progress: bool = False,
    ) -> List[Document]:
        """Load the pages of a Gitbook space as documents.

        Each page returned by the client for ``space_id`` is fetched as
        markdown and wrapped in a Document whose ``id_`` is the page id.
        Pages for which no content is returned are skipped, and a warning
        is printed.

        Args:
            space_id (str): Gitbook space id
            metadata_names (Optional[List[str]]): names of the fields to be added
                to the metadata attribute of the Document.
                only 'path', 'title', 'description', 'parent' are available
                Defaults to None, in which case only 'path' is added.
            show_progress (bool, optional): Show progress bar. Defaults to False

        Returns:
            List[Document]: A list of documents.

        Raises:
            ValueError: If ``metadata_names`` contains a field that is not
                one of the available metadata fields.

        """
        if metadata_names:
            invalid_fields = set(metadata_names) - VALID_METADATA_FIELDS
            if invalid_fields:
                # Sort so the error message is deterministic (set order is not).
                invalid_list = ", ".join(sorted(invalid_fields))
                raise ValueError(f"Invalid metadata fields: {invalid_list}")

        # With no explicit selection, only the page path is kept as metadata.
        selected_names = ["path"] if metadata_names is None else metadata_names

        pages = self.client.list_pages(space_id)

        if show_progress:
            # Imported lazily so tqdm is only needed when a progress bar is requested.
            from tqdm import tqdm

            iterator = tqdm(pages, desc="Downloading pages")
        else:
            iterator = pages

        documents = []
        for page in iterator:
            page_id = page.get("id")
            content = self.client.get_page_markdown(space_id, page_id)
            if not content:
                print(f"Warning: No content found for page ID {page_id}. Skipping...")
                continue

            # Fields missing from the page are stored as None rather than raising.
            metadata = {name: page.get(name) for name in selected_names}
            documents.append(Document(text=content, id_=page_id, metadata=metadata))

        return documents
83+
84+
85+
if __name__ == "__main__":
    # Manual smoke test: read credentials from a .env file (or the
    # environment) and print the documents loaded from one Gitbook space.
    import os
    import sys

    def load_env_file():
        """Load environment variables from .env file.

        Reads ``KEY=VALUE`` lines from the ``.env`` file three directories
        above this module, if that file exists. Blank lines, ``#`` comments
        and lines that are not of the form ``KEY=VALUE`` are ignored.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        env_path = os.path.join(current_dir, "../../../.env")
        if not os.path.exists(env_path):
            return

        with open(env_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                key, sep, value = line.partition("=")
                key = key.strip()
                # Skip malformed lines (no "=" or empty key) instead of
                # crashing while unpacking or assigning to os.environ.
                if not sep or not key:
                    continue
                os.environ[key] = value.strip()

    load_env_file()
    api_token = os.getenv("GITBOOK_API_TOKEN")
    space_id = os.getenv("GITBOOK_SPACE_ID")

    if not api_token or not space_id:
        print("Error: GITBOOK_API_TOKEN and GITBOOK_SPACE_ID must be set in .env file")
        sys.exit(1)

    reader = SimpleGitbookReader(api_token)
    print(reader.load_data(space_id, show_progress=True))

0 commit comments

Comments
 (0)