Skip to content

Commit 3c7c021

Browse files
authored
Merge pull request #27 from TrueSelph/0.1.12
Added import and export of documents and knodes
2 parents 32d167d + de64a53 commit 3c7c021

File tree

8 files changed

+391
-30
lines changed

8 files changed

+391
-30
lines changed

deepdoc_client_action/CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,8 @@
7474
- Updated docs
7575

7676
# 0.1.11
77-
- Increase timeout and add logs
77+
- Increase timeout and add logs
78+
79+
# 0.1.12
80+
- Added import and export of documents and knodes
81+
- Added TOCChunker

deepdoc_client_action/add_documents.jac

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ walker add_documents(agent_graph_walker) {
2222
has with_embeddings:bool = False; # whether to generate embeddings for the documents
2323
has response:str = "";
2424
has reporting:bool = True;
25+
has chunker_type:str = "hybrid";
2526

2627
# set up logger
2728
static has logger:Logger = logging.getLogger(__name__);
@@ -89,7 +90,8 @@ walker add_documents(agent_graph_walker) {
8990
to_page=self.to_page,
9091
lang=self.lang,
9192
with_embeddings=self.with_embeddings,
92-
callback_url=callback_url
93+
callback_url=callback_url,
94+
chunker_type=self.chunker_type
9395
);
9496

9597
if self.reporting {

deepdoc_client_action/app/app.py

Lines changed: 216 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import Dict
88

99
import streamlit as st
10+
import yaml
1011
from jvclient.lib.utils import call_api, get_reports_payload
1112
from jvclient.lib.widgets import app_header, app_update_action
1213
from streamlit_router import StreamlitRouter
@@ -21,6 +22,8 @@ def render(router: StreamlitRouter, agent_id: str, action_id: str, info: dict) -
2122
:param info: Additional information.
2223
"""
2324
(model_key, module_root) = app_header(agent_id, action_id, info)
25+
if "job_id_details" not in st.session_state:
26+
st.session_state.job_id_details = ""
2427

2528
# add documents section
2629
with st.expander("Configure", False):
@@ -112,6 +115,11 @@ def render(router: StreamlitRouter, agent_id: str, action_id: str, info: dict) -
112115
value=True,
113116
)
114117

118+
chunker_type = st.selectbox(
119+
"Chunker type",
120+
options=["toc", "hybrid", "hierarchical"],
121+
key=f"{model_key}_chunker_type",
122+
)
115123
# Process inputs
116124
url_list = [url.strip() for url in doc_urls.split("\n") if url.strip()]
117125
metadata_list = []
@@ -149,6 +157,7 @@ def render(router: StreamlitRouter, agent_id: str, action_id: str, info: dict) -
149157
"to_page": int(to_page) if to_page is not None else 0,
150158
"lang": str(lang),
151159
"with_embeddings": with_embeddings,
160+
"chunker_type": chunker_type,
152161
}
153162

154163
# Add optional fields only if they exist
@@ -271,6 +280,105 @@ def get_status_badge(status: str) -> str:
271280
color = color_map.get(status, "gray")
272281
return f"<span style='background-color: {color}; color: white; padding: 2px 6px; border-radius: 4px;'>{status}</span>"
273282

283+
with st.expander("Export document", False):
284+
# Request a document export for this agent from the export_documents walker (optionally with embeddings)
285+
with_embeddings = st.toggle(
286+
"Export with Embeddings", value=True, key=f"{model_key}_with_embeddings"
287+
)
288+
result = call_api(
289+
endpoint="action/walker/deepdoc_client_action/export_documents",
290+
json_data={
291+
"agent_id": agent_id,
292+
"reporting": True,
293+
"with_embeddings": with_embeddings,
294+
},
295+
timeout=120,
296+
)
297+
298+
if result and result.status_code == 200:
299+
payload = get_reports_payload(result)
300+
if payload:
301+
st.download_button(
302+
label="Download Documents",
303+
data=json.dumps(payload, indent=2, ensure_ascii=False),
304+
file_name="deepdoc_documents.json",
305+
mime="application/json",
306+
)
307+
else:
308+
st.error("No job ID returned from the API. Please try again.")
309+
310+
with st.expander("Import document", False):
311+
knode_source = st.radio(
312+
"Choose data source:",
313+
("Text input", "Upload file"),
314+
key=f"{model_key}_knode_source",
315+
)
316+
317+
purge_collection = st.toggle(
318+
"Purge Collection",
319+
value=False,
320+
key=f"{model_key}_purge_collection",
321+
)
322+
323+
data_to_import = ""
324+
if knode_source == "Text input":
325+
data_to_import = st.text_area(
326+
"Document in YAML or JSON",
327+
value="",
328+
height=170,
329+
key=f"{model_key}_knode_data",
330+
)
331+
332+
uploaded_file = None
333+
if knode_source == "Upload file":
334+
uploaded_file = st.file_uploader(
335+
"Upload file (YAML or JSON)",
336+
type=["yaml", "json"],
337+
key=f"{model_key}_document_upload",
338+
)
339+
340+
with_embeddings = st.toggle(
341+
"Import with Embeddings",
342+
value=True,
343+
key=f"{model_key}_import_embeddings",
344+
)
345+
346+
if st.button("Import", key=f"{model_key}_btn_import_document"):
347+
if uploaded_file:
348+
try:
349+
file_content = uploaded_file.read().decode(
350+
"utf-8", errors="replace"
351+
)
352+
if uploaded_file.type == "application/json":
353+
data_to_import = json.loads(file_content)
354+
else:
355+
data_to_import = yaml.safe_load(file_content)
356+
data_to_import = json.dumps(data_to_import, ensure_ascii=False)
357+
except Exception as e:
358+
st.error(f"Error loading file: {e}")
359+
360+
if data_to_import:
361+
result = call_api(
362+
endpoint="action/walker/deepdoc_client_action/import_documents",
363+
json_data={
364+
"agent_id": agent_id,
365+
"data": data_to_import,
366+
"with_embeddings": with_embeddings,
367+
"purge": purge_collection,
368+
},
369+
)
370+
371+
if result:
372+
st.success("Agent documents imported successfully")
373+
else:
374+
st.error(
375+
"Failed to import document. Ensure valid YAML/JSON format."
376+
)
377+
else:
378+
st.error(
379+
"No data to import. Please provide valid text or upload a file."
380+
)
381+
274382
with st.expander("Document List", True):
275383
# Initialize session state variables for pagination
276384
if "current_page" not in st.session_state:
@@ -493,13 +601,23 @@ def get_status_badge(status: str) -> str:
493601
if st.button("No, Keep Job"):
494602
st.session_state.confirm_state = {"active": False}
495603
st.rerun()
496-
elif st.button("Delete Job", key=f"delete_job_{job_id}"):
497-
st.session_state.confirm_state = {
498-
"active": True,
499-
"type": "delete_job",
500-
"job_id": job_id,
501-
}
502-
st.rerun()
604+
605+
elif status == "COMPLETED":
606+
col1, col2 = st.columns(2)
607+
with col1:
608+
if st.button("Delete Job", key=f"delete_job_{job_id}"):
609+
st.session_state.confirm_state = {
610+
"active": True,
611+
"type": "delete_job",
612+
"job_id": job_id,
613+
}
614+
st.rerun()
615+
with col2:
616+
if st.button("View Job", key=f"view_job_{job_id}"):
617+
st.session_state.current_page = 3
618+
st.session_state.job_id_details = job_id
619+
st.session_state.job_details = documents
620+
st.rerun()
503621

504622
# Display each document in the job
505623
for document in documents:
@@ -618,9 +736,94 @@ def get_status_badge(status: str) -> str:
618736
time.sleep(5)
619737
st.rerun()
620738

621-
else:
622-
st.info(
623-
"No documents found. Your uploaded documents will be shown here."
624-
)
625-
else:
626-
st.info("No documents found. Your uploaded documents will be shown here.")
739+
if st.session_state.job_id_details:
740+
st.write("---")
741+
st.write("## Job Details")
742+
743+
if "page" not in st.session_state[model_key]:
744+
st.session_state[model_key]["page"] = 1
745+
if "per_page" not in st.session_state[model_key]:
746+
st.session_state[model_key]["per_page"] = 10
747+
748+
# Items per page selection
749+
per_page_options = [10, 20, 30, 50, 100]
750+
new_per_page = st.selectbox(
751+
"Documents per page:",
752+
per_page_options,
753+
index=per_page_options.index(st.session_state[model_key]["per_page"]),
754+
)
755+
756+
# Reset page if per_page changes
757+
if new_per_page != st.session_state[model_key]["per_page"]:
758+
st.session_state[model_key]["per_page"] = new_per_page
759+
st.session_state[model_key]["page"] = 1
760+
st.rerun()
761+
762+
st.session_state[model_key]["pages_input"] = st.text_input(
763+
"Enter page numbers (comma or space separated):",
764+
value="", # optional default value
765+
placeholder="e.g., 1,2,3",
766+
)
767+
768+
st.session_state[model_key]["pages_input"] = [
769+
p.strip()
770+
for p in st.session_state[model_key]["pages_input"]
771+
.replace(",", " ")
772+
.split()
773+
if p.strip().isdigit()
774+
]
775+
st.session_state[model_key][
776+
"filter_by"
777+
] = f'metadata.job_id:="{st.session_state.job_id_details}"'
778+
779+
if st.session_state[model_key]["pages_input"]:
780+
st.session_state[model_key][
781+
"filter_by"
782+
] += f' && metadata.page:=[{",".join(st.session_state[model_key]["pages_input"])}]'
783+
784+
params = {
785+
"page": st.session_state[model_key].get("page", 1),
786+
"per_page": st.session_state[model_key].get("per_page", 10),
787+
"filter_by": st.session_state[model_key]["filter_by"],
788+
"agent_id": agent_id,
789+
}
790+
791+
response = call_api(
792+
endpoint="action/walker/typesense_vector_store_action/list_documents",
793+
json_data=params,
794+
)
795+
796+
if response and response.status_code == 200:
797+
result = get_reports_payload(response)
798+
documents = result.get("documents", [])
799+
800+
for doc in documents:
801+
if doc["metadata"].get("title"):
802+
title = doc["metadata"]["title"][0].strip()
803+
else:
804+
title = doc["text"]
805+
title = title.split("\n")[0].strip()
806+
807+
title = title[:40]
808+
page = doc["metadata"].get("page", "N/A")
809+
810+
with st.expander(f"{title} (Page {page})", expanded=False):
811+
812+
st.write(doc["text"])
813+
st.write("---")
814+
815+
col1, col2 = st.columns([5, 1]) # first column 5x width of second
816+
with col1:
817+
st.markdown(f"**Page:** {page}")
818+
with col2:
819+
# Delete button
820+
if st.button("Delete", key=f"delete_{doc['id']}"):
821+
args = {"id": doc["id"], "agent_id": agent_id}
822+
result = call_api(
823+
endpoint="action/walker/typesense_vector_store_action/delete_document",
824+
json_data=args,
825+
)
826+
827+
if result and result.status_code == 200:
828+
get_reports_payload(result)
829+
st.rerun()

0 commit comments

Comments
 (0)