fix issue #782 + add test_chrf

varisd · varisd · commit 81bbdbbbcf3e · 2019-01-02T15:41:00.000+01:00
diff --git a/neuralmonkey/evaluators/chrf.py b/neuralmonkey/evaluators/chrf.py
@@ -1,5 +1,6 @@
 from typing import List, Dict
 from typeguard import check_argument_types
+import numpy as np
 from neuralmonkey.evaluators.evaluator import Evaluator
 
 # pylint: disable=invalid-name
@@ -25,7 +26,6 @@ def __init__(self,
         super().__init__(name)
 
         self.n = n
-        self.max_ord = n
         self.beta_2 = beta**2
 
         self.ignored = []  # type: List[str]
@@ -58,44 +58,39 @@ def score_instance(self,
                 / ((self.beta_2 * precision) + recall))
 
     def chr_r(self, hyp_ngrams: NGramDicts, ref_ngrams: NGramDicts) -> float:
-        recall = 0.0
+        count_all = np.zeros(self.n)
+        count_matched = np.zeros(self.n)
         for m in range(1, self.n + 1):
-            count_all = 0
-            count_matched = 0
             for ngr in ref_ngrams[m - 1]:
                 ref_count = ref_ngrams[m - 1][ngr]
-                count_all += ref_count
+                count_all[m - 1] += ref_count
                 if ngr in hyp_ngrams[m - 1]:
-                    count_matched += min(ref_count, hyp_ngrams[m - 1][ngr])
-            # Catch division by zero
-            if count_all != 0.0:
-                recall += count_matched / count_all
-        return recall / float(self.max_ord)
+                    count_matched[m - 1] += min(
+                        ref_count, hyp_ngrams[m - 1][ngr])
+        return np.mean(np.divide(
+            count_matched, count_all, out=np.ones_like(count_all),
+            where=(count_all!=0)))
 
     def chr_p(self, hyp_ngrams: NGramDicts, ref_ngrams: NGramDicts) -> float:
-        precision = 0.0
+        count_all = np.zeros(self.n)
+        count_matched = np.zeros(self.n)
         for m in range(1, self.n + 1):
-            count_all = 0
-            count_matched = 0
             for ngr in hyp_ngrams[m - 1]:
                 hyp_count = hyp_ngrams[m - 1][ngr]
-                count_all += hyp_count
+                count_all[m - 1] += hyp_count
                 if ngr in ref_ngrams[m - 1]:
-                    count_matched += min(hyp_count, ref_ngrams[m - 1][ngr])
-            # Catch division by zero
-            if count_all != 0.0:
-                precision += count_matched / count_all
-
-        return precision / float(self.max_ord)
+                    count_matched[m - 1] += min(
+                        hyp_count, ref_ngrams[m - 1][ngr])
+        return np.mean(np.divide(
+            count_matched, count_all, out=np.ones_like(count_all),
+            where=(count_all!=0)))
 
     def _get_ngrams(self, tokens: List[str], n: int) -> NGramDicts:
-        if len(tokens) < n:
-            self.max_ord = len(tokens)
-
         ngr_dicts = []
         for m in range(1, n + 1):
             ngr_dict = {}  # type: Dict[str, int]
-            for i in range(m, len(tokens)):
+            # if m > len(tokens), the range is empty and ngr_dict stays empty
+            for i in range(m, len(tokens) + 1):
                 ngr = "".join(tokens[i - m:i])
                 ngr_dict[ngr] = ngr_dict.setdefault(ngr, 0) + 1
             ngr_dicts.append(ngr_dict)
diff --git a/neuralmonkey/tests/test_chrf.py b/neuralmonkey/tests/test_chrf.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3.5
+
+
+import unittest
+
+from neuralmonkey.evaluators.chrf import ChrFEvaluator
+
+
+CORPUS_DECODED = [
+    "colorful thoughts furiously sleep",
+    "little piglet slept all night",
+    "working working working working working be be be be be be be",
+    "ich bin walrus",
+    "walrus for präsident"
+]
+
+CORPUS_REFERENCE = [
+    "the colorless ideas slept furiously",
+    "pooh slept all night",
+    "working class hero is something to be",
+    "I am the working class walrus",
+    "walrus for president"
+]
+
+TOKENS = ["a", "b", "a"]
+NGRAMS = [
+    {"a": 2, "b" : 1},
+    {"ab": 1, "ba" : 1},
+    {"aba" : 1},
+    {}]
+            
+
+DECODED = [d.split() for d in CORPUS_DECODED]
+REFERENCE = [r.split() for r in CORPUS_REFERENCE]
+
+FUNC = ChrFEvaluator()
+FUNC_P = FUNC.chr_p
+FUNC_R = FUNC.chr_r
+FUNC_NGRAMS = FUNC._get_ngrams
+
+class TestChrF(unittest.TestCase):
+
+    def test_empty_decoded(self):
+        # Recall == 0.0
+        self.assertEqual(FUNC([[] for _ in DECODED], REFERENCE), 0.0)
+
+    def test_empty_reference(self):
+        # Precision == 0.0 (NOTE(review): args look swapped - confirm)
+        self.assertEqual(FUNC([[] for _ in REFERENCE], DECODED), 0.0)
+
+    def test_identical(self):
+        self.assertEqual(FUNC(REFERENCE, REFERENCE), 1.0)
+
+    def test_empty_sentence(self):
+        ref_empty = REFERENCE + [[]]
+        out_empty = DECODED + [["something"]]
+        score = FUNC(out_empty, ref_empty)
+        self.assertAlmostEqual(score, 0.38, delta=10)
+
+    def test_chrf(self):
+        score = FUNC(DECODED, REFERENCE)
+        self.assertAlmostEqual(score, 0.46, delta=10)
+
+    def test_get_ngrams(self):
+        tokens = ["a", "b", "a"]
+        ngrams_out = FUNC_NGRAMS(tokens, 4)
+        self.assertEqual(len(ngrams_out), 4)
+        for i, _ in enumerate(NGRAMS):
+            self.assertDictEqual(ngrams_out[i], NGRAMS[i])
+
+if __name__ == "__main__":
+    unittest.main()