18 changes: 18 additions & 0 deletions graphgen/configs/protein_qa_config.yaml
@@ -0,0 +1,18 @@
read:
input_file: resources/input_examples/protein_demo.json # input file path; supports json, jsonl, txt, pdf (see resources/input_examples for examples)
split:
chunk_size: 1024 # chunk size for text splitting
chunk_overlap: 100 # chunk overlap for text splitting
search: # web search configuration
enabled: false # whether to enable web search
search_types: ["google"] # search engine types; supported: google, bing, uniprot, wikipedia
quiz_and_judge: # quiz the trainee LLM and judge whether it has mastered the knowledge points
enabled: false
partition: # graph partition configuration
method: anchor_bfs # partition method
method_params:
anchor_type: protein # node type to select anchor nodes
max_units_per_community: 10 # maximum nodes/edges per community (1 yields an atomic partition)
generate:
mode: protein_qa # atomic, aggregated, multi_hop, cot, vqa, protein_qa
data_format: ChatML # Alpaca, Sharegpt, ChatML
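
Reviewer note: for readers trying the new config, here is a minimal sketch of loading it with PyYAML and reading the stage parameters shown above. The loader function and script name are illustrative assumptions, not part of this diff.

    # load_config.py -- hypothetical helper, not part of this PR
    import yaml

    def load_config(path: str = "graphgen/configs/protein_qa_config.yaml") -> dict:
        """Read the pipeline config and return it as a plain dict."""
        with open(path, encoding="utf-8") as f:
            return yaml.safe_load(f)

    config = load_config()
    # Each top-level key corresponds to one pipeline stage in the YAML above.
    print(config["split"]["chunk_size"])                        # 1024
    print(config["partition"]["method"])                        # anchor_bfs
    print(config["partition"]["method_params"]["anchor_type"])  # protein
    print(config["generate"]["mode"])                           # protein_qa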
173 changes: 55 additions & 118 deletions graphgen/graphgen.py
@@ -1,7 +1,6 @@
import asyncio
import os
import time
from dataclasses import dataclass
from typing import Dict, cast

import gradio as gr
@@ -16,8 +15,7 @@
Tokenizer,
)
from graphgen.operators import (
build_mm_kg,
build_text_kg,
build_kg,
chunk_documents,
generate_qas,
judge_statement,
@@ -31,26 +29,26 @@
sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))


@dataclass
class GraphGen:
unique_id: int = int(time.time())
working_dir: str = os.path.join(sys_path, "cache")

# llm
tokenizer_instance: Tokenizer = None
synthesizer_llm_client: OpenAIClient = None
trainee_llm_client: OpenAIClient = None

# webui
progress_bar: gr.Progress = None

def __post_init__(self):
self.tokenizer_instance: Tokenizer = self.tokenizer_instance or Tokenizer(
def __init__(
self,
unique_id: int = int(time.time()),
working_dir: str = os.path.join(sys_path, "cache"),
tokenizer_instance: Tokenizer = None,
synthesizer_llm_client: OpenAIClient = None,
trainee_llm_client: OpenAIClient = None,
progress_bar: gr.Progress = None,
):
self.unique_id = unique_id
self.working_dir = working_dir

# llm
self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer(
model_name=os.getenv("TOKENIZER_MODEL")
)

self.synthesizer_llm_client: OpenAIClient = (
self.synthesizer_llm_client
synthesizer_llm_client
or OpenAIClient(
model_name=os.getenv("SYNTHESIZER_MODEL"),
api_key=os.getenv("SYNTHESIZER_API_KEY"),
@@ -59,7 +57,7 @@ def __post_init__(self):
)
)

self.trainee_llm_client: OpenAIClient = self.trainee_llm_client or OpenAIClient(
self.trainee_llm_client: OpenAIClient = trainee_llm_client or OpenAIClient(
model_name=os.getenv("TRAINEE_MODEL"),
api_key=os.getenv("TRAINEE_API_KEY"),
base_url=os.getenv("TRAINEE_BASE_URL"),
@@ -86,6 +84,9 @@ def __post_init__(self):
namespace="qa",
)

# webui
self.progress_bar: gr.Progress = progress_bar

@async_to_sync_method
async def insert(self, read_config: Dict, split_config: Dict):
"""
@@ -104,109 +105,45 @@ async def insert(self, read_config: Dict, split_config: Dict):
new_docs = {compute_mm_hash(doc, prefix="doc-"): doc for doc in data}
_add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
new_text_docs = {k: v for k, v in new_docs.items() if v.get("type") == "text"}
new_mm_docs = {k: v for k, v in new_docs.items() if v.get("type") != "text"}

await self.full_docs_storage.upsert(new_docs)

async def _insert_text_docs(text_docs):
if len(text_docs) == 0:
logger.warning("All text docs are already in the storage")
return
logger.info("[New Docs] inserting %d text docs", len(text_docs))
# Step 2.1: Split chunks and filter existing ones
inserting_chunks = await chunk_documents(
text_docs,
split_config["chunk_size"],
split_config["chunk_overlap"],
self.tokenizer_instance,
self.progress_bar,
)

_add_chunk_keys = await self.chunks_storage.filter_keys(
list(inserting_chunks.keys())
)
inserting_chunks = {
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
}

if len(inserting_chunks) == 0:
logger.warning("All text chunks are already in the storage")
return

logger.info("[New Chunks] inserting %d text chunks", len(inserting_chunks))
await self.chunks_storage.upsert(inserting_chunks)

# Step 2.2: Extract entities and relations from text chunks
logger.info("[Text Entity and Relation Extraction] processing ...")
_add_entities_and_relations = await build_text_kg(
llm_client=self.synthesizer_llm_client,
kg_instance=self.graph_storage,
chunks=[
Chunk(id=k, content=v["content"], type="text")
for k, v in inserting_chunks.items()
],
progress_bar=self.progress_bar,
)
if not _add_entities_and_relations:
logger.warning("No entities or relations extracted from text chunks")
return

await self._insert_done()
return _add_entities_and_relations

async def _insert_multi_modal_docs(mm_docs):
if len(mm_docs) == 0:
logger.warning("No multi-modal documents to insert")
return

logger.info("[New Docs] inserting %d multi-modal docs", len(mm_docs))

# Step 3.1: Transform multi-modal documents into chunks and filter existing ones
inserting_chunks = await chunk_documents(
mm_docs,
split_config["chunk_size"],
split_config["chunk_overlap"],
self.tokenizer_instance,
self.progress_bar,
)
if len(new_docs) == 0:
logger.warning("All documents are already in the storage")
return

_add_chunk_keys = await self.chunks_storage.filter_keys(
list(inserting_chunks.keys())
)
inserting_chunks = {
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
}
inserting_chunks = await chunk_documents(
new_docs,
split_config["chunk_size"],
split_config["chunk_overlap"],
self.tokenizer_instance,
self.progress_bar,
)

if len(inserting_chunks) == 0:
logger.warning("All multi-modal chunks are already in the storage")
return
_add_chunk_keys = await self.chunks_storage.filter_keys(
list(inserting_chunks.keys())
)
inserting_chunks = {
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
}

logger.info(
"[New Chunks] inserting %d multimodal chunks", len(inserting_chunks)
)
await self.chunks_storage.upsert(inserting_chunks)

# Step 3.2: Extract multi-modal entities and relations from chunks
logger.info("[Multi-modal Entity and Relation Extraction] processing ...")
_add_entities_and_relations = await build_mm_kg(
llm_client=self.synthesizer_llm_client,
kg_instance=self.graph_storage,
chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()],
progress_bar=self.progress_bar,
)
if not _add_entities_and_relations:
logger.warning(
"No entities or relations extracted from multi-modal chunks"
)
return
await self._insert_done()
return _add_entities_and_relations

# Step 2: Insert text documents
await _insert_text_docs(new_text_docs)
# Step 3: Insert multi-modal documents
await _insert_multi_modal_docs(new_mm_docs)
if len(inserting_chunks) == 0:
logger.warning("All chunks are already in the storage")
return

logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
await self.chunks_storage.upsert(inserting_chunks)

_add_entities_and_relations = await build_kg(
llm_client=self.synthesizer_llm_client,
kg_instance=self.graph_storage,
chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()],
progress_bar=self.progress_bar,
)
if not _add_entities_and_relations:
logger.warning("No entities or relations extracted from text chunks")
Copilot AI Oct 24, 2025

The warning message refers to 'text chunks' but this code path handles all chunk types (both text and multi-modal). The message should be updated to 'No entities or relations extracted from chunks' to accurately reflect the unified processing.

Suggested change:
-        logger.warning("No entities or relations extracted from text chunks")
+        logger.warning("No entities or relations extracted from chunks")
return

await self._insert_done()
return _add_entities_and_relations

async def _insert_done(self):
tasks = []
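
Reviewer note: with the dataclass replaced by an explicit __init__ and the two per-modality insert paths collapsed into one, a typical driver would look roughly like the sketch below. It assumes the environment variables referenced in the diff (TOKENIZER_MODEL, SYNTHESIZER_MODEL, SYNTHESIZER_API_KEY, TRAINEE_MODEL, and so on) are set, and that the config dict shapes mirror protein_qa_config.yaml; none of this driver code is part of the PR.

    # hypothetical driver for the refactored GraphGen -- not part of this PR
    from graphgen.graphgen import GraphGen

    gg = GraphGen(working_dir="cache")  # tokenizer and LLM clients built from env vars

    # insert() is wrapped by @async_to_sync_method, so it can be called synchronously.
    # One call now covers text and multi-modal docs alike: chunk, deduplicate
    # against chunks_storage, then run build_kg over the surviving chunks.
    gg.insert(
        read_config={"input_file": "resources/input_examples/protein_demo.json"},
        split_config={"chunk_size": 1024, "chunk_overlap": 100},
    )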
23 changes: 23 additions & 0 deletions graphgen/models/kg_builder/mo_kg_builder.py
@@ -0,0 +1,23 @@
from typing import Dict, List, Tuple

from graphgen.bases import Chunk

from .light_rag_kg_builder import LightRAGKGBuilder


class MOKGBuilder(LightRAGKGBuilder):
async def extract(
self, chunk: Chunk
) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]:
"""
Multi-Omics Knowledge Graph Builder
Step1: Extract and output a JSON object containing protein information from the given chunk.
Step2: Get more details about the protein by querying external databases if necessary.
Step3: Construct entities and relationships for the protein knowledge graph.
Step4: Return the entities and relationships.
:param chunk
Copilot AI Oct 24, 2025

Missing description for the chunk parameter in the docstring. Should document the expected type and purpose of this parameter.

Suggested change:
-        :param chunk
+        :param chunk: Chunk: The input data chunk containing information to extract protein entities and relationships from.
:return: Tuple containing entities and relationships.
"""
# TODO: Implement the multi-omics KG extraction logic here
print(chunk)
return {}, {}
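
Reviewer note: since extract() is still a stub, here is a hedged sketch of what Step 1 and Step 3 could look like. The JSON field names and the entity/relation dict shapes are assumptions; the actual contract comes from LightRAGKGBuilder, which is not shown in this diff.

    # hypothetical body for MOKGBuilder.extract -- shapes assumed, not from this PR
    import json
    from typing import Dict, List, Tuple

    from graphgen.bases import Chunk

    async def extract_sketch(
        chunk: Chunk,
    ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]:
        # Step 1 (assumed input shape): chunk.content carries a JSON protein record.
        record = json.loads(chunk.content)
        name = record.get("protein_name", "UNKNOWN")

        # Step 3: one entity for the protein itself ...
        entities: Dict[str, List[dict]] = {
            name: [{"entity_type": "protein",
                    "description": record.get("function", ""),
                    "source_id": chunk.id}]
        }
        # ... and one relationship per annotated interaction partner.
        relations: Dict[Tuple[str, str], List[dict]] = {}
        for partner in record.get("interacts_with", []):
            relations[(name, partner)] = [{"description": "protein-protein interaction",
                                           "source_id": chunk.id}]
        return entities, relations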
2 changes: 1 addition & 1 deletion graphgen/operators/__init__.py
@@ -1,4 +1,4 @@
from .build_kg import build_mm_kg, build_text_kg
from .build_kg import build_kg
from .generate import generate_qas
from .judge import judge_statement
from .partition import partition_kg
3 changes: 1 addition & 2 deletions graphgen/operators/build_kg/__init__.py
@@ -1,2 +1 @@
from .build_mm_kg import build_mm_kg
from .build_text_kg import build_text_kg
from .build_kg import build_kg
52 changes: 52 additions & 0 deletions graphgen/operators/build_kg/build_kg.py
@@ -0,0 +1,52 @@
from typing import List

import gradio as gr

from graphgen.bases.base_storage import BaseGraphStorage
from graphgen.bases.datatypes import Chunk
from graphgen.models import OpenAIClient
from graphgen.utils import logger

from .build_mm_kg import build_mm_kg
from .build_text_kg import build_text_kg


async def build_kg(
llm_client: OpenAIClient,
kg_instance: BaseGraphStorage,
chunks: List[Chunk],
progress_bar: gr.Progress = None,
):
"""
Build knowledge graph (KG) and merge into kg_instance
:param llm_client: Synthesizer LLM model to extract entities and relationships
:param kg_instance
:param chunks
Comment on lines +25 to +26

Copilot AI Oct 24, 2025

Missing descriptions for the kg_instance and chunks parameters in the docstring. These should document the expected types and purposes of these parameters.

Suggested change:
-    :param kg_instance
-    :param chunks
+    :param kg_instance: BaseGraphStorage instance where the extracted knowledge graph will be merged.
+    :param chunks: List of Chunk objects to process for entity and relation extraction.
:param progress_bar: Gradio progress bar to show the progress of the extraction
:return:
"""

text_chunks = [chunk for chunk in chunks if chunk.type == "text"]
mm_chunks = [chunk for chunk in chunks if chunk.type != "text"]

if len(text_chunks) == 0:
logger.info("All text chunks are already in the storage")
else:
logger.info("[Text Entity and Relation Extraction] processing ...")
await build_text_kg(
llm_client=llm_client,
kg_instance=kg_instance,
chunks=text_chunks,
progress_bar=progress_bar,
)
if len(mm_chunks) == 0:
logger.info("All multi-modal chunks are already in the storage")
else:
logger.info("[Multi-modal Entity and Relation Extraction] processing ...")
await build_mm_kg(
llm_client=llm_client,
kg_instance=kg_instance,
chunks=mm_chunks,
progress_bar=progress_bar,
)
return kg_instance
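
Reviewer note: the dispatch above is a plain split on chunk.type, so wiring in the new build_mo_kg would only need a third branch inside build_kg before the return, sketched below. The "protein" chunk type is an assumption, the mm filter above would have to exclude it, and this wiring is not part of the PR.

    # hypothetical third branch for build_kg -- not part of this PR
    mo_chunks = [chunk for chunk in chunks if chunk.type == "protein"]  # assumed tag
    if len(mo_chunks) == 0:
        logger.info("No multi-omics chunks to process")
    else:
        logger.info("[Multi-Omics Entity and Relation Extraction] processing ...")
        await build_mo_kg(
            llm_client=llm_client,
            kg_instance=kg_instance,
            chunks=mo_chunks,
            progress_bar=progress_bar,
        )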
29 changes: 29 additions & 0 deletions graphgen/operators/build_kg/build_mo_kg.py
@@ -0,0 +1,29 @@
from typing import List

import gradio as gr

from graphgen.bases.base_storage import BaseGraphStorage
from graphgen.bases.datatypes import Chunk
from graphgen.models import OpenAIClient


async def build_mo_kg(
llm_client: OpenAIClient,
kg_instance: BaseGraphStorage,
chunks: List[Chunk],
progress_bar: gr.Progress = None,
):
"""
Build multi-omics KG and merge into kg_instance. (Multi-Omics: genomics, proteomics, metabolomics, etc.)
:param llm_client: Synthesizer LLM model to extract entities and relationships
:param kg_instance
:param chunks
Comment on lines +22 to +23

Copilot AI Oct 24, 2025

Missing descriptions for the kg_instance and chunks parameters in the docstring. These should document the expected types and purposes of these parameters.

Suggested change:
-    :param kg_instance
-    :param chunks
+    :param kg_instance: BaseGraphStorage instance where the multi-omics knowledge graph will be merged.
+    :param chunks: List of Chunk objects representing the input data to extract entities and relationships from.
:param progress_bar: Gradio progress bar to show the progress of the extraction
:return:
"""
# TODO: implement multi-omics KG building logic here
print("llm_client:", llm_client)
print("kg_instance:", kg_instance)
print("chunks:", chunks)
print("progress_bar:", progress_bar)
return kg_instance
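
Reviewer note: because the body is still a TODO, calling build_mo_kg today only echoes its arguments. Below is a minimal invocation sketch with placeholder credentials; kg_instance is left as None only because the stub never touches it, and the diff shows no concrete BaseGraphStorage implementation to use instead.

    # hypothetical smoke test for the stub -- placeholders throughout
    import asyncio

    from graphgen.bases.datatypes import Chunk
    from graphgen.models import OpenAIClient
    from graphgen.operators.build_kg.build_mo_kg import build_mo_kg

    llm = OpenAIClient(
        model_name="your-model",              # placeholder
        api_key="your-api-key",               # placeholder
        base_url="https://your-endpoint/v1",  # placeholder
    )
    chunks = [Chunk(id="chunk-0", content='{"protein_name": "TP53"}', type="protein")]

    # The stub just prints its arguments and returns kg_instance unchanged.
    asyncio.run(build_mo_kg(llm_client=llm, kg_instance=None, chunks=chunks))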