
Commit e9f66df

feat: add multi-hop generation
1 parent 2b80fd5 commit e9f66df

File tree: 7 files changed (+185, -9 lines)


configs/graphgen_config.yaml

Lines changed: 4 additions & 4 deletions
@@ -1,4 +1,4 @@
-qa_form: atomic
+qa_form: multi_hop
 data_type: raw
 input_file: resources/examples/raw_demo.jsonl
 tokenizer: cl100k_base
@@ -10,9 +10,9 @@ traverse_strategy:
     - medium
     - medium
   edge_sampling: max_loss
-  expand_method: max_tokens
+  expand_method: max_width
   isolated_node_strategy: ignore
-  max_depth: 5
-  max_extra_edges: 5
+  max_depth: 1
+  max_extra_edges: 2
   max_tokens: 256
 web_search: false
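Taken together with the TraverseStrategy change below, this config is roughly equivalent to constructing the strategy as follows. A minimal sketch, assuming the keys under traverse_strategy map one-to-one onto TraverseStrategy fields (only qa_form and expand_method are confirmed by the dataclass diff in this commit).

# Sketch only: field names other than qa_form and expand_method are assumed
# to mirror the YAML keys under traverse_strategy.
from models import TraverseStrategy

strategy = TraverseStrategy(
    qa_form="multi_hop",        # was "atomic"
    expand_method="max_width",  # expand by edge count rather than token budget
    max_depth=1,
    max_extra_edges=2,
    max_tokens=256,
    edge_sampling="max_loss",
    isolated_node_strategy="ignore",
)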

graphgen/graphgen.py

Lines changed: 8 additions & 1 deletion
@@ -5,13 +5,14 @@
 import time
 from typing import List, cast, Union
 from dataclasses import dataclass
+
 from tqdm.asyncio import tqdm as tqdm_async

 from models import Chunk, JsonKVStorage, OpenAIModel, NetworkXStorage, WikiSearch, Tokenizer, TraverseStrategy
 from models.storage.base_storage import StorageNameSpace
 from utils import create_event_loop, logger, compute_content_hash
 from .operators import (extract_kg, search_wikipedia, quiz, judge_statement, traverse_graph_by_edge,
-                        traverse_graph_atomically)
+                        traverse_graph_atomically, traverse_graph_for_multi_hop)


 sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -195,6 +196,12 @@ async def async_traverse(self):
                                                          self.graph_storage,
                                                          self.traverse_strategy,
                                                          self.text_chunks_storage)
+            elif self.traverse_strategy.qa_form == "multi_hop":
+                results = await traverse_graph_for_multi_hop(self.synthesizer_llm_client,
+                                                             self.tokenizer_instance,
+                                                             self.graph_storage,
+                                                             self.traverse_strategy,
+                                                             self.text_chunks_storage)
             else:
                 results = await traverse_graph_by_edge(self.synthesizer_llm_client, self.tokenizer_instance,
                                                        self.graph_storage, self.traverse_strategy, self.text_chunks_storage)
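Since all three traversal operators are called with the same five arguments, the growing if/elif chain could also be written as a lookup table. A minimal sketch of that alternative (hypothetical helper, not part of this commit):

# Hypothetical refactor of the dispatch above; assumes the three operators
# share the call signature used in async_traverse.
from graphgen.operators import (traverse_graph_atomically, traverse_graph_by_edge,
                                traverse_graph_for_multi_hop)

TRAVERSAL_BY_QA_FORM = {
    "atomic": traverse_graph_atomically,
    "multi_hop": traverse_graph_for_multi_hop,
}

async def _dispatch_traverse(self):
    traverse_fn = TRAVERSAL_BY_QA_FORM.get(self.traverse_strategy.qa_form,
                                           traverse_graph_by_edge)
    return await traverse_fn(self.synthesizer_llm_client,
                             self.tokenizer_instance,
                             self.graph_storage,
                             self.traverse_strategy,
                             self.text_chunks_storage)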

graphgen/operators/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -2,13 +2,14 @@
 from .quiz import quiz
 from .judge import judge_statement
 from .search_wikipedia import search_wikipedia
-from .traverse_graph import traverse_graph_by_edge, traverse_graph_atomically
+from .traverse_graph import traverse_graph_by_edge, traverse_graph_atomically, traverse_graph_for_multi_hop

 __all__ = [
     "extract_kg",
     "quiz",
     "judge_statement",
     "search_wikipedia",
     "traverse_graph_by_edge",
-    "traverse_graph_atomically"
+    "traverse_graph_atomically",
+    "traverse_graph_for_multi_hop"
 ]

graphgen/operators/traverse_graph.py

Lines changed: 108 additions & 1 deletion
@@ -3,7 +3,7 @@
 from tqdm.asyncio import tqdm as tqdm_async

 from models import OpenAIModel, NetworkXStorage, TraverseStrategy, Tokenizer, JsonKVStorage
-from templates import ANSWER_REPHRASING_PROMPT, QUESTION_GENERATION_PROMPT
+from templates import ANSWER_REPHRASING_PROMPT, QUESTION_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT
 from utils import detect_main_language, compute_content_hash, logger
 from graphgen.operators.split_graph import get_batches_with_strategy

@@ -399,3 +399,110 @@ async def _generate_question(
         except Exception as e:  # pylint: disable=broad-except
             logger.error("Error occurred while generating questions: %s", e)
     return results
+
+async def traverse_graph_for_multi_hop(
+    llm_client: OpenAIModel,
+    tokenizer: Tokenizer,
+    graph_storage: NetworkXStorage,
+    traverse_strategy: TraverseStrategy,
+    text_chunks_storage: JsonKVStorage,
+    max_concurrent: int = 1000
+) -> dict:
+    """
+    Traverse the graph for multi-hop
+
+    :param llm_client
+    :param tokenizer
+    :param graph_storage
+    :param traverse_strategy
+    :param text_chunks_storage
+    :param max_concurrent
+    :return: question and answer
+    """
+    assert traverse_strategy.qa_form == "multi_hop"
+
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    results = {}
+    edges = list(await graph_storage.get_all_edges())
+    nodes = list(await graph_storage.get_all_nodes())
+
+    edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
+
+    processing_batches = await get_batches_with_strategy(
+        nodes,
+        edges,
+        graph_storage,
+        traverse_strategy
+    )
+
+    processing_batches = assign_difficulty(processing_batches, traverse_strategy.difficulty_order)
+
+    async def _process_single_batch(
+        _process_batch: tuple
+    ) -> dict:
+        async with semaphore:
+            try:
+                language = "Chinese" if detect_main_language(_process_batch[0][0]['description']) == "zh" else "English"
+
+                _process_nodes = _process_batch[0]
+                _process_edges = _process_batch[1]
+
+                entities = [
+                    f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes
+                ]
+
+                relations = [
+                    f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
+                    for _process_edge in _process_edges
+                ]
+
+                entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)])
+                relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)])
+
+                prompt = MULTI_HOP_GENERATION_PROMPT[language].format(
+                    entities=entities_str,
+                    relationships=relations_str
+                )
+
+                context = await llm_client.generate_answer(prompt)
+
+                # post-process the context
+                if "Question:" in context and "Answer:" in context:
+                    question = context.split("Question:")[1].split("Answer:")[0].strip()
+                    answer = context.split("Answer:")[1].strip()
+                elif "问题:" in context and "答案:" in context:
+                    question = context.split("问题:")[1].split("答案:")[0].strip()
+                    answer = context.split("答案:")[1].strip()
+                else:
+                    return {}
+
+                question = question.strip("\"")
+                answer = answer.strip("\"")
+
+                logger.info("Question: %s", question)
+                logger.info("Answer: %s", answer)
+
+                return {
+                    compute_content_hash(question): {
+                        "question": question,
+                        "answer": answer,
+                        "loss": get_average_loss(_process_batch),
+                        "difficulty": _process_batch[2],
+                    }
+                }
+
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error("Error occurred while processing batch: %s", e)
+                return {}
+
+    for result in tqdm_async(
+        asyncio.as_completed([_process_single_batch(batch) for batch in processing_batches]),
+        total=len(processing_batches),
+        desc="Processing batches"
+    ):
+        try:
+            results.update(await result)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("Error occurred while processing batches: %s", e)
+    return results
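The fragile part of the new operator is parsing the raw completion back into a question/answer pair. The same logic, pulled out into a standalone helper (hypothetical name parse_qa) for illustration:

# Minimal sketch of the post-processing above: split on the English or Chinese
# Question/Answer markers, strip surrounding quotes, or return None if neither
# marker pair is present.
from typing import Optional, Tuple

def parse_qa(context: str) -> Optional[Tuple[str, str]]:
    for q_marker, a_marker in (("Question:", "Answer:"), ("问题:", "答案:")):
        if q_marker in context and a_marker in context:
            question = context.split(q_marker)[1].split(a_marker)[0].strip().strip('"')
            answer = context.split(a_marker)[1].strip().strip('"')
            return question, answer
    return None

# parse_qa("Question: What does eating apples provide?\nAnswer: Vitamin C")
# -> ("What does eating apples provide?", "Vitamin C")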

models/strategy/travserse_strategy.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 @dataclass
 class TraverseStrategy(BaseStrategy):
     # 生成的QA形式:原子、多跳、开放性
-    qa_form: str = "atomic"
+    qa_form: str = "multi_hop"  # "atomic" or "multi_hop" or "open"
     # 最大边数和最大token数方法中选择一个生效
     expand_method: str = "max_tokens"  # "max_width" or "max_tokens"
     # 单向拓展还是双向拓展

templates/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,4 @@
 from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
 from .answer_rephrasing import ANSWER_REPHRASING_PROMPT
 from .question_generation import QUESTION_GENERATION_PROMPT
+from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT

templates/multi_hop_generation.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+# pylint: disable=C0301
+
+TEMPLATE_ZH: str = """请基于以下知识子图生成多跳推理问题和答案。你将获得一个知识子图,其中包含一系列实体、关系和事实。你的任务是提出一个问题,该问题需要经过多次推理才能回答。问题的答案应该是从给定的知识子图中推断出来的。确保问题的难度适中,需要多步推理才能回答。
+
+例如:
+########
+--实体--
+1. 苹果
+2. 水果
+3. 维生素C
+########
+--关系--
+1. 苹果-水果:苹果是一种水果
+2. 水果-维生素C:水果中富含维生素C
+########
+问题:通过吃苹果补充的什么物质,有助于维持健康?
+答案:维生素C
+########
+
+#########
+--实体--
+{entities}
+#########
+--关系--
+{relationships}
+#########
+直接输出生成的问题和答案,请不要直接复制示例问题和答案,不要输出无关内容。
+"""
+
+TEMPLATE_EN: str = """Please generate a multi-hop reasoning question and answer based on the following knowledge subgraph. You will be provided with a knowledge subgraph that contains a series of entities, relations, and facts. Your task is to generate a question that requires multiple steps of reasoning to answer. The answer to the question should be inferred from the given knowledge subgraph. Ensure that the question is of moderate difficulty and requires multiple steps of reasoning to answer.
+
+For example:
+########
+--Entities--
+1. Apple
+2. Fruit
+3. Vitamin C
+########
+--Relations--
+1. Apple-Fruit: Apple is a type of fruit
+2. Fruit-Vitamin C: Fruits are rich in Vitamin C
+########
+Question: What substance, obtained through eating apples, helps maintain health?
+Answer: Vitamin C
+########
+
+########
+--Entities--
+{entities}
+########
+--Relations--
+{relationships}
+########
+Output the generated question and answer directly, please do not copy the example question and answer directly, and do not provide irrelevant information.
+"""
+
+MULTI_HOP_GENERATION_PROMPT = {
+    "English": TEMPLATE_EN,
+    "Chinese": TEMPLATE_ZH
+}
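For reference, traverse_graph_for_multi_hop fills this template by numbering the entity and relation descriptions, joining them with newlines, and substituting them into the {entities} and {relationships} placeholders. A small usage sketch with made-up strings:

# Usage sketch; the entity/relation strings below are illustrative only.
from templates import MULTI_HOP_GENERATION_PROMPT

entities = ["Apple: a common fruit", "Vitamin C: a nutrient found in fruit"]
relations = ["Apple -- Fruit: Apple is a type of fruit",
             "Fruit -- Vitamin C: Fruits are rich in Vitamin C"]

entities_str = "\n".join(f"{i + 1}. {e}" for i, e in enumerate(entities))
relations_str = "\n".join(f"{i + 1}. {r}" for i, r in enumerate(relations))

prompt = MULTI_HOP_GENERATION_PROMPT["English"].format(
    entities=entities_str,
    relationships=relations_str,
)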
