-
Notifications
You must be signed in to change notification settings - Fork 37
feat: add protein_qa generation #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
c8ada4c
5d5012a
e783736
27ab285
2192ee8
96be73a
256acc1
51c12ce
fa6e32a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| read: | ||
| input_file: resources/input_examples/protein_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples | ||
| split: | ||
| chunk_size: 1024 # chunk size for text splitting | ||
| chunk_overlap: 100 # chunk overlap for text splitting | ||
| search: # web search configuration | ||
| enabled: false # whether to enable web search | ||
| search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia | ||
| quiz_and_judge: # quiz and test whether the LLM masters the knowledge points | ||
| enabled: false | ||
| partition: # graph partition configuration | ||
| method: anchor_bfs # partition method | ||
| method_params: | ||
| anchor_type: protein # node type to select anchor nodes | ||
| max_units_per_community: 10 # atomic partition, one node or edge per community | ||
| generate: | ||
| mode: protein_qa # atomic, aggregated, multi_hop, cot, vqa | ||
| data_format: ChatML # Alpaca, Sharegpt, ChatML |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,23 @@ | ||||||
| from typing import Dict, List, Tuple | ||||||
|
|
||||||
| from graphgen.bases import Chunk | ||||||
|
|
||||||
| from .light_rag_kg_builder import LightRAGKGBuilder | ||||||
|
|
||||||
|
|
||||||
| class MOKGBuilder(LightRAGKGBuilder): | ||||||
| async def extract( | ||||||
| self, chunk: Chunk | ||||||
| ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: | ||||||
| """ | ||||||
| Multi-Omics Knowledge Graph Builder | ||||||
| Step1: Extract and output a JSON object containing protein information from the given chunk. | ||||||
| Step2: Get more details about the protein by querying external databases if necessary. | ||||||
| Step3: Construct entities and relationships for the protein knowledge graph. | ||||||
| Step4: Return the entities and relationships. | ||||||
| :param chunk | ||||||
|
||||||
| :param chunk | |
| :param chunk: Chunk: The input data chunk containing information to extract protein entities and relationships from. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1 @@ | ||
| from .build_mm_kg import build_mm_kg | ||
| from .build_text_kg import build_text_kg | ||
| from .build_kg import build_kg |
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,52 @@ | ||||||||||
| from typing import List | ||||||||||
|
|
||||||||||
| import gradio as gr | ||||||||||
|
|
||||||||||
| from graphgen.bases.base_storage import BaseGraphStorage | ||||||||||
| from graphgen.bases.datatypes import Chunk | ||||||||||
| from graphgen.models import OpenAIClient | ||||||||||
| from graphgen.utils import logger | ||||||||||
|
|
||||||||||
| from .build_mm_kg import build_mm_kg | ||||||||||
| from .build_text_kg import build_text_kg | ||||||||||
|
|
||||||||||
|
|
||||||||||
| async def build_kg( | ||||||||||
| llm_client: OpenAIClient, | ||||||||||
| kg_instance: BaseGraphStorage, | ||||||||||
| chunks: List[Chunk], | ||||||||||
| progress_bar: gr.Progress = None, | ||||||||||
| ): | ||||||||||
| """ | ||||||||||
| Build knowledge graph (KG) and merge into kg_instance | ||||||||||
| :param llm_client: Synthesizer LLM model to extract entities and relationships | ||||||||||
| :param kg_instance | ||||||||||
| :param chunks | ||||||||||
|
Comment on lines
+25
to
+26
|
||||||||||
| :param kg_instance | |
| :param chunks | |
| :param kg_instance: BaseGraphStorage instance where the extracted knowledge graph will be merged. | |
| :param chunks: List of Chunk objects to process for entity and relation extraction. |
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,29 @@ | ||||||||||
| from typing import List | ||||||||||
|
|
||||||||||
| import gradio as gr | ||||||||||
|
|
||||||||||
| from graphgen.bases.base_storage import BaseGraphStorage | ||||||||||
| from graphgen.bases.datatypes import Chunk | ||||||||||
| from graphgen.models import OpenAIClient | ||||||||||
|
|
||||||||||
|
|
||||||||||
| async def build_mo_kg( | ||||||||||
| llm_client: OpenAIClient, | ||||||||||
| kg_instance: BaseGraphStorage, | ||||||||||
| chunks: List[Chunk], | ||||||||||
| progress_bar: gr.Progress = None, | ||||||||||
| ): | ||||||||||
| """ | ||||||||||
| Build multi-omics KG and merge into kg_instance. (Multi-Omics: genomics, proteomics, metabolomics, etc.) | ||||||||||
| :param llm_client: Synthesizer LLM model to extract entities and relationships | ||||||||||
| :param kg_instance | ||||||||||
| :param chunks | ||||||||||
|
Comment on lines
+22
to
+23
|
||||||||||
| :param kg_instance | |
| :param chunks | |
| :param kg_instance: BaseGraphStorage instance where the multi-omics knowledge graph will be merged. | |
| :param chunks: List of Chunk objects representing the input data to extract entities and relationships from. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The warning message refers to 'text chunks' but this code path handles all chunk types (both text and multi-modal). The message should be updated to 'No entities or relations extracted from chunks' to accurately reflect the unified processing.