refactor: split quiz and judge

ChenZiHong-Gavin · ChenZiHong-Gavin · commit 3840d6817ad9 · 2025-01-13T16:59:11.000+08:00
diff --git a/generate.py b/generate.py
@@ -73,7 +73,9 @@
 
     graph_gen.insert(data, args.data_type)
 
-    graph_gen.judge(re_judge=True, max_samples=3)
+    graph_gen.quiz(max_samples=3)
+
+    graph_gen.judge(re_judge=True)
 
     graph_gen.traverse()
     with open(os.path.join(sys_path, "cache", "configs", f"graphgen_{unique_id}.yaml"), "w", encoding='utf-8') as f:
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
@@ -7,10 +7,10 @@
 from dataclasses import dataclass
 from tqdm.asyncio import tqdm as tqdm_async
 
-from .operators import *
 from models import Chunk, JsonKVStorage, OpenAIModel, NetworkXStorage, WikiSearch, Tokenizer, TraverseStrategy
 from utils import create_event_loop, logger, compute_content_hash
 from models.storage.base_storage import StorageNameSpace
+from .operators import *
 
 
 sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -81,7 +81,8 @@ async def async_split_chunks(self, data: Union[List[list], List[dict]], data_typ
                     compute_content_hash(dp["content"], prefix="chunk-"): {
                         **dp,
                         'full_doc_id': doc_key
-                    } for dp in self.tokenizer_instance.chunk_by_token_size(doc["content"], self.chunk_overlap_size, self.chunk_size)
+                    } for dp in self.tokenizer_instance.chunk_by_token_size(doc["content"],
+                                                                            self.chunk_overlap_size, self.chunk_size)
                 }
                 inserting_chunks.update(chunks)
             _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys()))
@@ -156,21 +157,29 @@ async def async_insert(self, data: Union[List[list], List[dict]], data_type: str
 
     async def _insert_done(self):
         tasks = []
-        for storage_instance in [self.full_docs_storage, self.text_chunks_storage, self.graph_storage, self.wiki_storage]:
+        for storage_instance in [self.full_docs_storage, self.text_chunks_storage,
+                                 self.graph_storage, self.wiki_storage]:
             if storage_instance is None:
                 continue
             tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
         await asyncio.gather(*tasks)
 
-    def judge(self, re_judge=False, max_samples=1):
+    def quiz(self, max_samples=1):
         loop = create_event_loop()
-        loop.run_until_complete(self.async_judge(re_judge, max_samples))
+        loop.run_until_complete(self.async_quiz(max_samples))
+
+    async def async_quiz(self, max_samples=1):
+        await quiz_relations(self.teacher_llm_client, self.graph_storage, self.rephrase_storage, max_samples)
+        await self.rephrase_storage.index_done_callback()
 
-    async def async_judge(self, re_judge=False, max_samples=1):
+    def judge(self, re_judge=False):
+        loop = create_event_loop()
+        loop.run_until_complete(self.async_judge(re_judge))
+
+    async def async_judge(self, re_judge=False):
         _update_relations = await judge_relations(self.teacher_llm_client, self.student_llm_client,
-                                                  self.graph_storage, self.rephrase_storage, re_judge, max_samples)
+                                                  self.graph_storage, self.rephrase_storage, re_judge)
         await _update_relations.index_done_callback()
-        await self.rephrase_storage.index_done_callback()
 
     def traverse(self):
         loop = create_event_loop()
diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py
@@ -1,10 +1,12 @@
 from .extract_kg import extract_kg
+from .quiz_relations import quiz_relations
 from .judge_relations import judge_relations
 from .search_wikipedia import search_wikipedia
 from .traverse_graph import traverse_graph_by_edge
 
 __all__ = [
     "extract_kg",
+    "quiz_relations",
     "judge_relations",
     "search_wikipedia",
     "traverse_graph_by_edge"
diff --git a/graphgen/operators/judge_relations.py b/graphgen/operators/judge_relations.py
@@ -12,7 +12,6 @@ async def judge_relations(
         graph_storage: NetworkXStorage,
         rephrase_storage: JsonKVStorage,
         re_judge: bool = False,
-        max_samples: int = 1,
         max_concurrent: int = 1000) -> NetworkXStorage:
     """
     Get all edges and judge them
@@ -22,7 +21,6 @@ async def judge_relations(
     :param graph_storage: graph storage instance
     :param rephrase_storage: rephrase storage instance
     :param re_judge: re-judge the relations
-    :param max_samples: max samples for each edge
     :param max_concurrent: max concurrent
     :return:
     """
@@ -38,34 +36,14 @@ async def _judge_single_relation(
             edge_data = edge[2]
 
             if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
-                logger.info(f"Edge {source_id} -> {target_id} already judged, loss: {edge_data['loss']}, skip")
+                logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"])
                 return source_id, target_id, edge_data
 
             description = edge_data["description"]
-            language = "English" if detect_main_language(description) == "en" else "Chinese"
 
             try:
-                # 如果在rephrase_storage中已经存在，直接取出
                 descriptions = await rephrase_storage.get_by_id(description)
-                if not descriptions:
-                    # 多次采样，取平均
-                    descriptions = [(description, 'yes')]
-                    for i in range(max_samples):
-                        if i > 0:
-                            new_description = await teacher_llm_client.generate_answer(
-                                DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format(input_sentence=description),
-                                temperature=1
-                            )
-                            descriptions.append((new_description, 'yes'))
-                        new_anti_description = await teacher_llm_client.generate_answer(
-                            DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format(input_sentence=description),
-                            temperature=1
-                        )
-                        descriptions.append((new_anti_description, 'no'))
-
-                    descriptions = list(set(descriptions))
-
-                    await rephrase_storage.upsert({description: descriptions})
+                assert descriptions is not None
 
                 judgements = []
                 gts = [gt for _, gt in descriptions]
@@ -81,7 +59,7 @@ async def _judge_single_relation(
 
                 edge_data["loss"] = loss
             except Exception as e: # pylint: disable=broad-except
-                logger.error(f"Error in judging relation {source_id} -> {target_id}: {e}")
+                logger.error("Error in judging relation %s -> %s: %s", source_id, target_id, e)
                 logger.info("Use default loss 0.1")
                 edge_data["loss"] = -math.log(0.1)
 
diff --git a/graphgen/operators/quiz_relations.py b/graphgen/operators/quiz_relations.py
@@ -0,0 +1,78 @@
+import asyncio
+
+from tqdm.asyncio import tqdm as tqdm_async
+from models import JsonKVStorage, OpenAIModel, NetworkXStorage
+from utils import logger, detect_main_language
+from templates import DESCRIPTION_REPHRASING_PROMPT
+
+
+async def quiz_relations(
+        teacher_llm_client: OpenAIModel,
+        graph_storage: NetworkXStorage,
+        rephrase_storage: JsonKVStorage,
+        max_samples: int = 1,
+        max_concurrent: int = 1000) -> JsonKVStorage:
+    """
+    Get all edges and quiz them
+
+    :param teacher_llm_client: generate statements
+    :param graph_storage: graph storage instance
+    :param rephrase_storage: rephrase storage instance
+    :param max_samples: max samples for each edge
+    :param max_concurrent: max concurrent
+    :return:
+    """
+
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def _quiz_single_relation(
+        edge: tuple,
+    ):
+        async with semaphore:
+            source_id = edge[0]
+            target_id = edge[1]
+            edge_data = edge[2]
+
+            description = edge_data["description"]
+            language = "English" if detect_main_language(description) == "en" else "Chinese"
+
+            try:
+                # If an entry already exists in rephrase_storage, reuse it directly
+                descriptions = await rephrase_storage.get_by_id(description)
+                if not descriptions:
+                    # Sample multiple times and take the average
+                    descriptions = [(description, 'yes')]
+                    for i in range(max_samples):
+                        if i > 0:
+                            new_description = await teacher_llm_client.generate_answer(
+                                DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format(input_sentence=description),
+                                temperature=1
+                            )
+                            descriptions.append((new_description, 'yes'))
+                        new_anti_description = await teacher_llm_client.generate_answer(
+                            DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format(input_sentence=description),
+                            temperature=1
+                        )
+                        descriptions.append((new_anti_description, 'no'))
+
+                    descriptions = list(set(descriptions))
+            except Exception as e: # pylint: disable=broad-except
+                logger.error(f"Error when quizzing edge {source_id} -> {target_id}: {e}")
+                descriptions = [(description, 'yes')]
+
+            await rephrase_storage.upsert({description: descriptions})
+
+            return {description: descriptions}
+
+
+    edges = await graph_storage.get_all_edges()
+
+    results = []
+    for result in tqdm_async(
+            asyncio.as_completed([_quiz_single_relation(edge) for edge in edges]),
+            total=len(edges),
+            desc="Quizzing relations"
+    ):
+        results.append(await result)
+
+    return rephrase_storage