Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 12 additions & 15 deletions evaluators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,21 @@
# 評価器モジュール - 統一インターフェース
# 評価器モジュール - 統一インターフェース (すべて非同期バージョンを使用)

from .base import BaseEvaluator
from .ragas_ollama import RagasOllamaEvaluator # Re-enabled with compatible versions
# from .academic_evaluator import AcademicEvaluator # Removed in favor of async version
from .factory import EvaluatorFactory, EvaluatorManager
from .async_base import AsyncBaseEvaluator
from .async_academic_evaluator import AsyncAcademicEvaluator
from .async_ragas_evaluator import AsyncRagasEvaluator
from .async_factory import AsyncEvaluatorFactory, AsyncEvaluatorManager
from .base_evaluator import BaseEvaluator as AsyncBaseEvaluator
from .academic_evaluator import AcademicEvaluator
from .ragas_evaluator import RagasEvaluator
from .factory import EvaluatorFactory, EvaluatorManager # Legacy classes
from .evaluator_factory import EvaluatorFactory as AsyncEvaluatorFactory, EvaluatorManager as AsyncEvaluatorManager

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Broken import: .base does not exist.

`from .base import BaseEvaluator` will raise ImportError; the file also re-exports BaseEvaluator twice, which you don't want. Import solely from base_evaluator and alias.

-from .base import BaseEvaluator
-from .base_evaluator import BaseEvaluator as AsyncBaseEvaluator
+from .base_evaluator import BaseEvaluator
+AsyncBaseEvaluator = BaseEvaluator
 from .academic_evaluator import AcademicEvaluator
 from .ragas_evaluator import RagasEvaluator
-from .factory import EvaluatorFactory, EvaluatorManager  # Legacy classes
-from .evaluator_factory import EvaluatorFactory as AsyncEvaluatorFactory, EvaluatorManager as AsyncEvaluatorManager
+from .factory import EvaluatorFactory, EvaluatorManager  # Legacy classes
+from .evaluator_factory import EvaluatorFactory as AsyncEvaluatorFactory, EvaluatorManager as AsyncEvaluatorManager
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
from .base import BaseEvaluator
from .ragas_ollama import RagasOllamaEvaluator # Re-enabled with compatible versions
# from .academic_evaluator import AcademicEvaluator # Removed in favor of async version
from .factory import EvaluatorFactory, EvaluatorManager
from .async_base import AsyncBaseEvaluator
from .async_academic_evaluator import AsyncAcademicEvaluator
from .async_ragas_evaluator import AsyncRagasEvaluator
from .async_factory import AsyncEvaluatorFactory, AsyncEvaluatorManager
from .base_evaluator import BaseEvaluator as AsyncBaseEvaluator
from .academic_evaluator import AcademicEvaluator
from .ragas_evaluator import RagasEvaluator
from .factory import EvaluatorFactory, EvaluatorManager # Legacy classes
from .evaluator_factory import EvaluatorFactory as AsyncEvaluatorFactory, EvaluatorManager as AsyncEvaluatorManager
from .base_evaluator import BaseEvaluator
AsyncBaseEvaluator = BaseEvaluator
from .academic_evaluator import AcademicEvaluator
from .ragas_evaluator import RagasEvaluator
from .factory import EvaluatorFactory, EvaluatorManager # Legacy classes
from .evaluator_factory import EvaluatorFactory as AsyncEvaluatorFactory, EvaluatorManager as AsyncEvaluatorManager
🤖 Prompt for AI Agents
In evaluators/__init__.py around lines 3 to 9, remove the broken import "from
.base import BaseEvaluator" and instead import BaseEvaluator only from
.base_evaluator (aliasing if you need an async name), ensuring you don't
re-export the same symbol twice; keep the other evaluator and factory imports
as-is and export a single BaseEvaluator symbol (plus any intentional aliases)
from base_evaluator.

# 注: すべての評価器は非同期APIをサポートしています

__all__ = [
'BaseEvaluator',
'RagasOllamaEvaluator', # Re-enabled with compatible versions
'AsyncRagasEvaluator', # Renamed from AsyncRagasOllamaEvaluator
# 'AcademicEvaluator', # Removed in favor of async version
'EvaluatorFactory',
'EvaluatorManager',
'AsyncBaseEvaluator',
'AsyncAcademicEvaluator',
'AsyncRagasOllamaEvaluator',
'AcademicEvaluator',
'RagasEvaluator',
'EvaluatorFactory', # Legacy
'EvaluatorManager', # Legacy
'AsyncEvaluatorFactory',
'AsyncEvaluatorManager'
]
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
# 增强异步学术评估器 - 合并学术和混合模型优势
# 增强学术评估器 - 合并学术和混合模型优势

from typing import Dict, List, Any, Optional
from langchain_openai import ChatOpenAI
from .async_base import AsyncBaseEvaluator
from .base import BaseEvaluator
from .base_evaluator import BaseEvaluator
import json
import re
import asyncio
Expand All @@ -12,12 +11,12 @@

logger = logging.getLogger(__name__)

class AsyncAcademicEvaluator(AsyncBaseEvaluator):
"""增强异步学术评估器 - 支持可选的嵌入模型辅助评估"""
class AcademicEvaluator(BaseEvaluator):
"""增强学术评估器 - 支持可选的嵌入模型辅助评估"""

def __init__(self, config: Dict[str, Any]):
"""初始化增强异步学术评估器"""
super().__init__("AsyncAcademic", config)
"""初始化增强学术评估器"""
super().__init__("Academic", config)

try:
# 初始化聊天模型(主要评估模型)
Expand All @@ -39,9 +38,9 @@ def __init__(self, config: Dict[str, Any]):
self.evaluation_mode = config.get("evaluation_mode", "pure_chat")

self._available = True
print(f"✅ {self.name}增强异步评估器初始化成功 (模式: {self.evaluation_mode})")
print(f"✅ {self.name}增强评估器初始化成功 (模式: {self.evaluation_mode})")
except Exception as e:
print(f"❌ {self.name}增强异步评估器初始化失败: {e}")
print(f"❌ {self.name}增强评估器初始化失败: {e}")
self._available = False

async def evaluate_answers_async(self, questions: List[str], answers: List[str],
Expand Down Expand Up @@ -215,9 +214,13 @@ async def _evaluate_pure_chat_mode(self, question: str, answer: str, ground_trut
return self._get_enhanced_default_scores()

async def _calculate_semantic_similarity(self, answer: str, ground_truth: str) -> float:
"""使用嵌入模型计算语义相似度(混合模式用)"""
"""使用嵌入模型计算语义相似度(混合模式用)- 优化版本"""

try:
# 如果没有嵌入配置,直接使用文本相似度
if not self.embedding_config.get("api_key"):
return self._calculate_text_similarity(answer, ground_truth)

# 并发获取两个文本的嵌入向量
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
headers = {
Expand All @@ -228,24 +231,24 @@ async def _calculate_semantic_similarity(self, answer: str, ground_truth: str) -
# 获取回答的嵌入向量
answer_payload = {
"model": self.embedding_config["model"],
"prompt": answer
"input": answer # 使用标准的 input 字段
}

# 获取标准答案的嵌入向量
ground_truth_payload = {
"model": self.embedding_config["model"],
"prompt": ground_truth
"input": ground_truth
}

# 并发请求两个嵌入向量
answer_task = session.post(
f"{self.embedding_config['base_url'].rstrip('/')}/api/embeddings",
f"{self.embedding_config['base_url'].rstrip('/')}/embeddings",
headers=headers,
json=answer_payload
)

ground_truth_task = session.post(
f"{self.embedding_config['base_url'].rstrip('/')}/api/embeddings",
f"{self.embedding_config['base_url'].rstrip('/')}/embeddings",
headers=headers,
json=ground_truth_payload
)
Expand All @@ -264,7 +267,8 @@ async def _calculate_semantic_similarity(self, answer: str, ground_truth: str) -
return self._calculate_text_similarity(answer, ground_truth)

answer_result = await answer_response.json()
answer_embedding = answer_result.get("embedding", [])
# 支持多种API格式
answer_embedding = answer_result.get("data", [{}])[0].get("embedding", []) if "data" in answer_result else answer_result.get("embedding", [])

# 处理标准答案嵌入向量
if isinstance(ground_truth_response, Exception):
Expand All @@ -276,15 +280,15 @@ async def _calculate_semantic_similarity(self, answer: str, ground_truth: str) -
return self._calculate_text_similarity(answer, ground_truth)

ground_truth_result = await ground_truth_response.json()
ground_truth_embedding = ground_truth_result.get("embedding", [])
ground_truth_embedding = ground_truth_result.get("data", [{}])[0].get("embedding", []) if "data" in ground_truth_result else ground_truth_result.get("embedding", [])

# 计算余弦相似度
if len(answer_embedding) > 0 and len(ground_truth_embedding) > 0:
similarity = self._calculate_cosine_similarity(answer_embedding, ground_truth_embedding)
print(f"🔍 嵌入向量语义相似度: {similarity}")
print(f"🔍 嵌入向量语义相似度: {similarity:.4f}")
return similarity
else:
print(f"❌ 嵌入向量为空")
print(f"❌ 嵌入向量为空 - answer: {len(answer_embedding)}, ground_truth: {len(ground_truth_embedding)}")
return self._calculate_text_similarity(answer, ground_truth)

except Exception as e:
Expand Down
10 changes: 5 additions & 5 deletions evaluators/async_base.py → evaluators/base_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# 异步评价器基类 - 为评价系统提供异步接口
# 评价器基类 - 为评价系统提供异步接口

from abc import ABC, abstractmethod
from typing import Dict, List, Any, Optional
Expand All @@ -7,12 +7,12 @@

logger = logging.getLogger(__name__)

class AsyncBaseEvaluator(ABC):
"""异步评价器基类"""
class BaseEvaluator(ABC):
"""评价器基类 - 支持异步API"""

def __init__(self, name: str, config: Dict[str, Any]):
"""
初始化异步评价器
初始化评价器

Args:
name: 评价器名称
Expand All @@ -23,7 +23,7 @@ def __init__(self, name: str, config: Dict[str, Any]):
self.timeout = config.get('timeout', 45)
self._available = False

logger.info(f"Async evaluator initialized: {name}")
logger.info(f"Evaluator initialized: {name}")

@abstractmethod
async def evaluate_answers_async(self, questions: List[str], answers: List[str],
Expand Down
57 changes: 27 additions & 30 deletions evaluators/async_factory.py → evaluators/evaluator_factory.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
# 异步评估器工厂 - 异步评估器的创建和管理
# 评估器工厂 - 评估器的创建和管理

from typing import Dict, List, Any, Optional
from .async_base import AsyncBaseEvaluator
from .async_academic_evaluator import AsyncAcademicEvaluator
from .async_ragas_evaluator import AsyncRagasEvaluator
from .base_evaluator import BaseEvaluator
from .academic_evaluator import AcademicEvaluator
from .ragas_evaluator import RagasEvaluator
import asyncio
import logging

logger = logging.getLogger(__name__)

class AsyncEvaluatorFactory:
"""异步评估器工厂类"""
class EvaluatorFactory:
"""评估器工厂类"""

# 可用的异步评估器类型
# 可用的评估器类型
EVALUATOR_TYPES = {
"async_academic": AsyncAcademicEvaluator,
"async_ragas": AsyncRagasEvaluator
"academic": AcademicEvaluator,
"ragas": RagasEvaluator
}

# 默认评估器优先级
DEFAULT_PRIORITY = ["async_ragas", "async_academic"]
DEFAULT_PRIORITY = ["ragas", "academic"]

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

类属性应为不可变并显式声明 ClassVar

避免可变类属性被实例共享篡改;同时用 Type[BaseEvaluator] 标注。

-    # 可用的评估器类型
-    EVALUATOR_TYPES = {
-        "academic": AcademicEvaluator,
-        "ragas": RagasEvaluator
-    }
+    # 可用的评估器类型
+    EVALUATOR_TYPES: ClassVar[Dict[str, Type[BaseEvaluator]]] = {
+        "academic": AcademicEvaluator,
+        "ragas": RagasEvaluator,
+    }
 ...
-    # 默认评估器优先级
-    DEFAULT_PRIORITY = ["ragas", "academic"]
+    # 默认评估器优先级(使用不可变元组)
+    DEFAULT_PRIORITY: ClassVar[Tuple[str, ...]] = ("ragas", "academic")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# 可用的评估器类型
EVALUATOR_TYPES = {
"async_academic": AsyncAcademicEvaluator,
"async_ragas": AsyncRagasEvaluator
"academic": AcademicEvaluator,
"ragas": RagasEvaluator
}
# 默认评估器优先级
DEFAULT_PRIORITY = ["async_ragas", "async_academic"]
DEFAULT_PRIORITY = ["ragas", "academic"]
# 可用的评估器类型
EVALUATOR_TYPES: ClassVar[Dict[str, Type[BaseEvaluator]]] = {
"academic": AcademicEvaluator,
"ragas": RagasEvaluator,
}
# 默认评估器优先级(使用不可变元组)
DEFAULT_PRIORITY: ClassVar[Tuple[str, ...]] = ("ragas", "academic")
🧰 Tools
🪛 Ruff (0.12.2)

16-19: Mutable class attributes should be annotated with typing.ClassVar

(RUF012)


22-22: Mutable class attributes should be annotated with typing.ClassVar

(RUF012)

🤖 Prompt for AI Agents
In evaluators/evaluator_factory.py around lines 15 to 23, the class-level
EVALUATOR_TYPES and DEFAULT_PRIORITY are currently mutable and lack proper
typing; change their declarations to use typing.ClassVar with explicit types
(EVALUATOR_TYPES: ClassVar[Mapping[str, Type[BaseEvaluator]]] and
DEFAULT_PRIORITY: ClassVar[Tuple[str, ...]]), replace the mutable dict with an
immutable Mapping (or wrap with types.MappingProxyType) and use a tuple (or
frozenset/tuple) for DEFAULT_PRIORITY to prevent instance-level mutation and
ensure correct static typing.

@classmethod
async def create_evaluator_async(cls, evaluator_type: str, config: Dict[str, Any]) -> Optional[AsyncBaseEvaluator]:
async def create_evaluator_async(cls, evaluator_type: str, config: Dict[str, Any]) -> Optional[BaseEvaluator]:
"""异步创建评估器"""
if evaluator_type not in cls.EVALUATOR_TYPES:
raise ValueError(f"不支持的评估器类型: {evaluator_type}")
Expand All @@ -34,15 +34,15 @@ async def create_evaluator_async(cls, evaluator_type: str, config: Dict[str, Any
if evaluator.is_available():
return evaluator
else:
print(f"⚠️ {evaluator_type}异步评估器不可用")
print(f"⚠️ {evaluator_type}评估器不可用")
return None
except Exception as e:
print(f"❌ {evaluator_type}异步评估器创建失败: {e}")
print(f"❌ {evaluator_type}评估器创建失败: {e}")
return None

@classmethod
async def create_all_evaluators_async(cls, config: Dict[str, Any],
types: Optional[List[str]] = None) -> Dict[str, AsyncBaseEvaluator]:
types: Optional[List[str]] = None) -> Dict[str, BaseEvaluator]:
"""异步创建所有可用的评估器"""
if types is None:
types = cls.DEFAULT_PRIORITY
Expand Down Expand Up @@ -82,15 +82,13 @@ def get_evaluator_info(cls) -> Dict[str, Dict[str, Any]]:
info[name] = {
"name": temp_evaluator.name,
"supported_metrics": temp_evaluator.get_supported_metrics(),
"description": cls._get_evaluator_description(name),
"async": True
"description": cls._get_evaluator_description(name)
}
except:
info[name] = {
"name": name,
"supported_metrics": [],
"description": cls._get_evaluator_description(name),
"async": True
"description": cls._get_evaluator_description(name)
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

避免 bare except;记录异常上下文

bare except 会吞掉系统异常。改为捕获 Exception 并记录。

-            except:
-                info[name] = {
+            except Exception as e:
+                logger.debug("获取评估器信息失败: %s (%s)", name, e)
+                info[name] = {
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
except:
info[name] = {
"name": name,
"supported_metrics": [],
"description": cls._get_evaluator_description(name),
"async": True
"description": cls._get_evaluator_description(name)
}
except Exception as e:
logger.debug("获取评估器信息失败: %s (%s)", name, e)
info[name] = {
"name": name,
"supported_metrics": [],
"description": cls._get_evaluator_description(name)
}
🧰 Tools
🪛 Ruff (0.12.2)

87-87: Do not use bare except

(E722)

🤖 Prompt for AI Agents
In evaluators/evaluator_factory.py around lines 87 to 92, replace the bare
"except:" with "except Exception as e" and record the exception context before
continuing to populate info[name]; use the module logger (or import logging) and
call logger.exception or logging.exception with a message that includes the
evaluator name so the stack trace and error message are preserved, then keep the
existing info[name] assignment unchanged.


return info
Expand All @@ -99,22 +97,22 @@ def get_evaluator_info(cls) -> Dict[str, Dict[str, Any]]:
def _get_evaluator_description(cls, evaluator_type: str) -> str:
"""获取评估器描述"""
descriptions = {
"async_academic": "增强异步学术评估器 - 支持6维度质量评估(相关性、正确性、完整性、清晰度、连贯性、有用性)",
"async_ragas": "异步Ragas框架评估器 - 完整的RAG评估指标集(相关性、正确性、忠实性、上下文精度、上下文召回率)"
"academic": "增强学术评估器 - 支持6维度质量评估(相关性、正确性、完整性、清晰度、连贯性、有用性)",
"ragas": "Ragas框架评估器 - 完整的RAG评估指标集(相关性、正确性、忠实性、上下文精度、上下文召回率)"
}
return descriptions.get(evaluator_type, "无描述")

class AsyncEvaluatorManager:
"""异步评估器管理器"""
class EvaluatorManager:
"""评估器管理器"""

def __init__(self, chat_config: Dict[str, Any], embedding_config: Dict[str, Any]):
"""初始化异步评估器管理器"""
"""初始化评估器管理器"""
# 为混合模型评估器准备两种配置
self.chat_config = chat_config.copy()
self.embedding_config = embedding_config.copy()
self.evaluators = {} # 将在初始化时异步创建

print(f"🔧 异步评估器管理器初始化完成")
print(f"🔧 评估器管理器初始化完成")

async def initialize_async(self):
"""异步初始化所有评估器"""
Expand All @@ -130,20 +128,20 @@ async def initialize_async(self):
"evaluation_mode": "hybrid" # 使用混合模式:embedding计算相关性,聊天模型评估质量
}

self.evaluators = await AsyncEvaluatorFactory.create_all_evaluators_async(enhanced_config)
self.evaluators = await EvaluatorFactory.create_all_evaluators_async(enhanced_config)

if not self.evaluators:
raise ValueError("没有可用的异步评估器")
raise ValueError("没有可用的评估器")

print(f"🔧 可用的异步评估器: {list(self.evaluators.keys())}")
print(f"🔧 可用的评估器: {list(self.evaluators.keys())}")

async def evaluate_all_async(self, questions: List[str], answers: List[str],
ground_truths: List[str], contexts: List[List[str]] = None) -> Dict[str, Dict[str, List[float]]]:
"""异步执行所有评估器评估"""
all_results = {}

for evaluator_name, evaluator in self.evaluators.items():
print(f"\n📊 使用{evaluator_name}异步评估器评估中...")
print(f"\n📊 使用{evaluator_name}评估器评估中...")

try:
# 使用带超时的异步评估
Expand All @@ -166,8 +164,7 @@ def get_evaluator_summary(self) -> Dict[str, Any]:
summary = {
"total_evaluators": len(self.evaluators),
"available_evaluators": list(self.evaluators.keys()),
"evaluator_details": {},
"async": True
"evaluator_details": {}
}

for name, evaluator in self.evaluators.items():
Expand Down
Loading
Loading