
Commit e70fe78

fix: fix lint errors
1 parent: 8b99b57

File tree

7 files changed: +45, -33 lines

graphgen/operators/judge.py

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
 from templates import STATEMENT_JUDGEMENT_PROMPT


-async def judge_statement(
+async def judge_statement(  # pylint: disable=too-many-statements
     student_llm_client: OpenAIModel,
     graph_storage: NetworkXStorage,
     rephrase_storage: JsonKVStorage,

graphgen/operators/split_graph.py

Lines changed: 1 addition & 1 deletion

@@ -224,7 +224,7 @@ def _sort_edges(edges: list, edge_sampling: str) -> list:
         raise ValueError(f"Invalid edge sampling: {edge_sampling}")
     return edges

-async def get_batches_with_strategy(
+async def get_batches_with_strategy(  # pylint: disable=too-many-arguments
     nodes: list,
     edges: list,
     graph_storage: NetworkXStorage,
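
Note: the two changes above only append inline pylint pragmas; the function bodies are untouched. A minimal illustration (with a hypothetical function, not from this repo) of how such a pragma scopes to a single definition; pylint's default limit for too-many-arguments (R0913) is five parameters:

def build_report(a, b, c, d, e, f):  # pylint: disable=too-many-arguments
    # The disable comment silences R0913 for this definition only;
    # the rest of the module is still checked.
    return [a, b, c, d, e, f]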

graphgen/operators/traverse_graph.py

Lines changed: 28 additions & 18 deletions

@@ -100,6 +100,33 @@ def get_loss_tercile(losses: list) -> (float, float):

    return losses[q1_index], losses[q2_index]

+def assign_difficulty(subgraphs: list, difficulty_order: list) -> list:
+    """
+    Assign difficulty to subgraphs based on the loss
+
+    :param subgraphs
+    :param difficulty_order
+    :return
+    """
+    losses = []
+    for subgraph in subgraphs:
+        loss = get_average_loss(subgraph)
+        losses.append(loss)
+    q1, q2 = get_loss_tercile(losses)
+
+    for i, subgraph in enumerate(subgraphs):
+        loss = get_average_loss(subgraph)
+        if loss < q1:
+            # easy
+            subgraphs[i] = (subgraph[0], subgraph[1], difficulty_order[0])
+        elif loss < q2:
+            # medium
+            subgraphs[i] = (subgraph[0], subgraph[1], difficulty_order[1])
+        else:
+            # hard
+            subgraphs[i] = (subgraph[0], subgraph[1], difficulty_order[2])
+    return subgraphs
+
 def get_average_loss(batch: tuple) -> float:
     if loss_strategy == "only_edge":
         return sum(edge[2]['loss'] for edge in batch[1]) / len(batch[1])

@@ -258,24 +285,7 @@ async def _process_single_batch(
        traverse_strategy
    )

-    losses = []
-    for batch in processing_batches:
-        loss = get_average_loss(batch)
-        losses.append(loss)
-    q1, q2 = get_loss_tercile(losses)
-
-    difficulty_order = traverse_strategy.difficulty_order
-    for i, batch in enumerate(processing_batches):
-        loss = get_average_loss(batch)
-        if loss < q1:
-            # easy
-            processing_batches[i] = (batch[0], batch[1], difficulty_order[0])
-        elif loss < q2:
-            # medium
-            processing_batches[i] = (batch[0], batch[1], difficulty_order[1])
-        else:
-            # hard
-            processing_batches[i] = (batch[0], batch[1], difficulty_order[2])
+    processing_batches = assign_difficulty(processing_batches, traverse_strategy.difficulty_order)

    for result in tqdm_async(asyncio.as_completed(
        [_process_single_batch(batch) for batch in processing_batches]
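
For context, a self-contained sketch of the tercile split that assign_difficulty performs. The loss values are invented, and the body of get_loss_tercile is an assumption (the diff above only shows its return statement), so treat this as an illustration rather than the repository's exact code:

def get_loss_tercile(losses: list) -> (float, float):
    # Assumed implementation: index the sorted losses at the 1/3 and 2/3 marks.
    losses = sorted(losses)
    q1_index = len(losses) // 3
    q2_index = 2 * len(losses) // 3
    return losses[q1_index], losses[q2_index]

losses = [0.12, 0.35, 0.48, 0.70, 0.91, 1.20]  # toy per-subgraph average losses
q1, q2 = get_loss_tercile(losses)
difficulty_order = ["easy", "medium", "hard"]
labels = [difficulty_order[0] if loss < q1
          else difficulty_order[1] if loss < q2
          else difficulty_order[2]
          for loss in losses]
print(labels)  # ['easy', 'easy', 'medium', 'medium', 'hard', 'hard']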

judge.py

Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,7 @@
 from dotenv import load_dotenv

 from models import NetworkXStorage, JsonKVStorage, OpenAIModel
-from graphgen.operators import judge_relations
+from graphgen.operators import judge_statement

 sys_path = os.path.abspath(os.path.dirname(__file__))

@@ -33,7 +33,7 @@
     namespace="rephrase"
 )

-new_graph = asyncio.run(judge_relations(llm_client, graph_storage, rephrase_storage, re_judge=True))
+new_graph = asyncio.run(judge_statement(llm_client, graph_storage, rephrase_storage, re_judge=True))

 graph_file = asyncio.run(graph_storage.get_graph())
webui/app.py

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@
 import gradio as gr

 from models import TraverseStrategy, NetworkXStorage, Tokenizer
-from charts import plot_pre_length_distribution, plot_post_synth_length_distribution, plot_loss_distribution
+from webui.charts import plot_pre_length_distribution, plot_post_synth_length_distribution, plot_loss_distribution
 from graphgen.operators.split_graph import get_batches_with_strategy
 from utils import create_event_loop
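
A hedged note on this import change: the chart helpers live in the webui package, so the package-qualified form resolves when the app is launched from the project root, while the bare form only works if the webui/ directory itself happens to be on sys.path. Illustrative sketch:

# Hypothetical reproduction of the lint failure the old import triggers when
# pylint is run from the repository root (E0401, import-error):
#   from charts import plot_loss_distribution   # fails: no top-level `charts`
from webui.charts import plot_loss_distribution  # resolves via the package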

webui/charts/plot_metric_trend.py

Lines changed: 4 additions & 4 deletions

@@ -3,15 +3,15 @@
 import numpy as np
 from scipy.interpolate import make_interp_spline

-def plot_metric_trend(df, x, y):
-    fig = px.line(df, x=x, y=y,
+def plot_metric_trend(dataframe, x, y):
+    fig = px.line(dataframe, x=x, y=y,
                   color='max length',
                   markers=True,
                   color_discrete_sequence=['#925EB0', '#7E99F4', '#CC7C71', '#7AB656'])  # A5AEB7

-    fig.update_xaxes(tickvals=df[x], ticktext=[f'{int(val * 100)}%' for val in df[x].unique()])
+    fig.update_xaxes(tickvals=dataframe[x], ticktext=[f'{int(val * 100)}%' for val in dataframe[x].unique()])

-    avg = df.groupby(x)[y].mean().reset_index()
+    avg = dataframe.groupby(x)[y].mean().reset_index()
     avg['max length'] = 'Average'

     x_smooth = np.linspace(avg[x].min(), avg[x].max(), 500)
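
The rename from df to dataframe is a pure naming fix; behavior is unchanged. For reference, a runnable sketch of the smoothed-average pattern this module appears to implement; the column names and values below are invented for illustration, and the plotly styling is omitted:

import numpy as np
import pandas as pd
from scipy.interpolate import make_interp_spline

dataframe = pd.DataFrame({
    'ratio':      [0.1, 0.1, 0.3, 0.3, 0.5, 0.5],        # hypothetical x column
    'score':      [0.62, 0.58, 0.71, 0.69, 0.80, 0.78],  # hypothetical metric
    'max length': [256, 512, 256, 512, 256, 512],
})
# Average the metric per x value, then draw a smooth curve through the means.
avg = dataframe.groupby('ratio')['score'].mean().reset_index()
x_smooth = np.linspace(avg['ratio'].min(), avg['ratio'].max(), 500)
y_smooth = make_interp_spline(avg['ratio'], avg['score'], k=2)(x_smooth)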

webui/charts/plot_rephrase_process.py

Lines changed: 8 additions & 6 deletions

@@ -40,12 +40,14 @@ def analyse_log(log_info: dict) -> list:

     logs = [log_item for log_item in logs if log_item['log_level'] == 'INFO']

+    break_index = 0
     for i, log_item in enumerate(logs):
         match = re.search(r'(\d+) nodes and (\d+) edges processed', log_item['message'])
         if match:
+            break_index = i
             break

-    logs = logs[i:]
+    logs = logs[break_index:]
     assert len(logs) % 3 == 0

     # Group the log entries in threes

@@ -93,8 +95,8 @@ def plot_pre_length_distribution(stats: list[dict]):
     length_distribution = defaultdict(int)

     # Complete all the counting in a single pass
-    for item in stats:
-        bin_start = (item['pre_length'] // bin_size) * bin_size
+    for stat in stats:
+        bin_start = (stat['pre_length'] // bin_size) * bin_size
         bin_key = f"{bin_start}-{bin_start + bin_size}"
         length_distribution[bin_key] += 1

@@ -145,16 +147,16 @@ def plot_post_synth_length_distribution(stats: list[dict]):
         return go.Figure()

     # Compute the maximum length and determine the bins
-    max_length = max(item['post_length'] for item in stats)
+    max_length = max(stat['post_length'] for stat in stats)
     bin_size = 50
     max_length = ((max_length // bin_size) + 1) * bin_size

     # Use a defaultdict to avoid missing-key checks
     length_distribution = defaultdict(int)

     # Complete all the counting in a single pass
-    for item in stats:
-        bin_start = (item['post_length'] // bin_size) * bin_size
+    for stat in stats:
+        bin_start = (stat['post_length'] // bin_size) * bin_size
         bin_key = f"{bin_start}-{bin_start + bin_size}"
         length_distribution[bin_key] += 1
0 commit comments