
Commit 58914cb

TomeHirata and Copilot authored

Move dsp/metrics into evaluate/metrics (#8402)

* move dsp/metrics into evaluate/metrics
* fix tests
* Update dspy/evaluate/metrics.py
* fix circular import

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

1 parent ea60c75 commit 58914cb

File tree

8 files changed: +158 −158 lines changed


dspy/dsp/utils/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -1,4 +1,3 @@
 from dspy.dsp.utils.dpr import *
-from dspy.dsp.utils.metrics import *
 from dspy.dsp.utils.settings import *
 from dspy.dsp.utils.utils import *
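With the star import of the deleted metrics module gone, the metric names no longer surface through dspy.dsp.utils. A hedged before/after for downstream imports (the new path follows from the dspy/evaluate/__init__.py diff further down):

# Before this commit:
# from dspy.dsp.utils import EM, normalize_text   # re-exported via the star import

# After this commit, import from the new home (also re-exported by dspy.evaluate):
from dspy.evaluate.metrics import EM, normalize_text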

dspy/dsp/utils/metrics.py

Lines changed: 0 additions & 113 deletions
This file was deleted.
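Per the commit title, the 113 deleted lines are the classic answer metrics, which now live in dspy/evaluate/metrics.py (that file's diff is not shown on this page). As a rough sketch only, assuming the SQuAD-style normalization these helpers are conventionally based on, not a verbatim copy of the moved code:

import re
import string
import unicodedata


def normalize_text(s):
    # SQuAD-style normalization: unicode-normalize, lowercase, strip punctuation,
    # drop English articles, and collapse whitespace.
    s = unicodedata.normalize("NFD", s).lower()
    s = "".join(ch for ch in s if ch not in string.punctuation)
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())


def EM(prediction, answers_list):
    # Exact match: does the normalized prediction equal any normalized gold answer?
    return max(normalize_text(prediction) == normalize_text(ans) for ans in answers_list)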

dspy/evaluate/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
-from dspy.dsp.utils import EM, normalize_text
 from dspy.evaluate.auto_evaluation import CompleteAndGrounded, SemanticF1
 from dspy.evaluate.evaluate import Evaluate
-from dspy.evaluate.metrics import answer_exact_match, answer_passage_match
+from dspy.evaluate.metrics import EM, answer_exact_match, answer_passage_match, normalize_text
 
 __all__ = [
     "EM",

dspy/evaluate/auto_evaluation.py

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,50 @@
1-
import dspy
1+
from dspy.predict.chain_of_thought import ChainOfThought
2+
from dspy.primitives import Module
3+
from dspy.signatures import InputField, OutputField, Signature
24

35

4-
class SemanticRecallPrecision(dspy.Signature):
6+
class SemanticRecallPrecision(Signature):
57
"""
68
Compare a system's response to the ground truth to compute its recall and precision.
79
If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.
810
"""
911

10-
question: str = dspy.InputField()
11-
ground_truth: str = dspy.InputField()
12-
system_response: str = dspy.InputField()
13-
recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
14-
precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
12+
question: str = InputField()
13+
ground_truth: str = InputField()
14+
system_response: str = InputField()
15+
recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
16+
precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
1517

1618

17-
class DecompositionalSemanticRecallPrecision(dspy.Signature):
19+
class DecompositionalSemanticRecallPrecision(Signature):
1820
"""
1921
Compare a system's response to the ground truth to compute recall and precision of key ideas.
2022
You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.
2123
"""
2224

23-
question: str = dspy.InputField()
24-
ground_truth: str = dspy.InputField()
25-
system_response: str = dspy.InputField()
26-
ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
27-
system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
28-
discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
29-
recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
30-
precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
25+
question: str = InputField()
26+
ground_truth: str = InputField()
27+
system_response: str = InputField()
28+
ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
29+
system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
30+
discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
31+
recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
32+
precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
3133

3234

3335
def f1_score(precision, recall):
3436
precision, recall = max(0.0, min(1.0, precision)), max(0.0, min(1.0, recall))
3537
return 0.0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)
3638

3739

38-
class SemanticF1(dspy.Module):
40+
class SemanticF1(Module):
3941
def __init__(self, threshold=0.66, decompositional=False):
4042
self.threshold = threshold
4143

4244
if decompositional:
43-
self.module = dspy.ChainOfThought(DecompositionalSemanticRecallPrecision)
45+
self.module = ChainOfThought(DecompositionalSemanticRecallPrecision)
4446
else:
45-
self.module = dspy.ChainOfThought(SemanticRecallPrecision)
47+
self.module = ChainOfThought(SemanticRecallPrecision)
4648

4749
def forward(self, example, pred, trace=None):
4850
scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
@@ -55,42 +57,42 @@ def forward(self, example, pred, trace=None):
5557
###########
5658

5759

58-
class AnswerCompleteness(dspy.Signature):
60+
class AnswerCompleteness(Signature):
5961
"""
6062
Estimate the completeness of a system's responses, against the ground truth.
6163
You will first enumerate key ideas in each response, discuss their overlap, and then report completeness.
6264
"""
6365

64-
question: str = dspy.InputField()
65-
ground_truth: str = dspy.InputField()
66-
system_response: str = dspy.InputField()
67-
ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
68-
system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
69-
discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
70-
completeness: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
66+
question: str = InputField()
67+
ground_truth: str = InputField()
68+
system_response: str = InputField()
69+
ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
70+
system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
71+
discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
72+
completeness: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
7173

7274

7375

74-
class AnswerGroundedness(dspy.Signature):
76+
class AnswerGroundedness(Signature):
7577
"""
7678
Estimate the groundedness of a system's responses, against real retrieved documents written by people.
7779
You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then
7880
discuss the extent to which some or all of them can be deduced from the retrieved context and basic commonsense.
7981
"""
8082

81-
question: str = dspy.InputField()
82-
retrieved_context: str = dspy.InputField()
83-
system_response: str = dspy.InputField()
84-
system_response_claims: str = dspy.OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
85-
discussion: str = dspy.OutputField(desc="discussion of how supported the claims are by the retrieved context")
86-
groundedness: float = dspy.OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")
83+
question: str = InputField()
84+
retrieved_context: str = InputField()
85+
system_response: str = InputField()
86+
system_response_claims: str = OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
87+
discussion: str = OutputField(desc="discussion of how supported the claims are by the retrieved context")
88+
groundedness: float = OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")
8789

8890

89-
class CompleteAndGrounded(dspy.Module):
91+
class CompleteAndGrounded(Module):
9092
def __init__(self, threshold=0.66):
9193
self.threshold = threshold
92-
self.completeness_module = dspy.ChainOfThought(AnswerCompleteness)
93-
self.groundedness_module = dspy.ChainOfThought(AnswerGroundedness)
94+
self.completeness_module = ChainOfThought(AnswerCompleteness)
95+
self.groundedness_module = ChainOfThought(AnswerGroundedness)
9496

9597
def forward(self, example, pred, trace=None):
9698
completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response)
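The substance of this diff is mechanical: every dspy.X attribute access becomes a direct submodule import (ChainOfThought, Module, InputField, OutputField, Signature). Per the commit message this fixes a circular import: dspy/__init__.py pulls in dspy.evaluate, so this module can no longer import dspy at load time. Call sites are unaffected; a small usage sketch (the example texts and the score are made up, and an LM must first be configured via dspy.configure):

import dspy
from dspy.evaluate import SemanticF1

# Hypothetical data: SemanticF1.forward reads example.question, example.response,
# and pred.response.
example = dspy.Example(question="What is DSPy?", response="A framework for programming language models.")
pred = dspy.Prediction(response="DSPy is a framework for programming LMs.")

metric = SemanticF1(decompositional=False)
score = metric(example, pred)  # f1_score of the judged precision/recall,
                               # e.g. f1_score(0.8, 0.5) == 0.8 / 1.3 ≈ 0.615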

dspy/evaluate/evaluate.py

Lines changed: 1 addition & 1 deletion
@@ -204,7 +204,7 @@ def process_item(example):
         return round(100 * ncorrect / ntotal, 2)
 
     def _construct_result_table(
-        self, results: list[Tuple[dspy.Example, dspy.Example, Any]], metric_name: str
+        self, results: list[Tuple["dspy.Example", "dspy.Example", Any]], metric_name: str
     ) -> "pd.DataFrame":
         """
         Construct a pandas DataFrame from the specified result list.
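The one-line fix quotes the dspy.Example annotations, turning them into forward references: the strings are stored unevaluated instead of being looked up on the partially initialized dspy module during the import cycle. A minimal standalone illustration of the pattern (names are hypothetical stand-ins, not the real modules):

from typing import Any, Tuple, get_type_hints


class Example:  # stand-in for dspy.Example, for illustration only
    pass


def _construct_result_table(results: list[Tuple["Example", "Example", Any]]) -> None:
    # The quoted "Example" is stored as a forward reference (a plain string),
    # so nothing is looked up while this function is being defined -- exactly
    # what a module that participates in an import cycle needs.
    pass


# The reference is only resolved on demand, e.g. by typing.get_type_hints:
print(get_type_hints(_construct_result_table))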
