Rework country simplification (#78)

StuartBertram · web-flow · commit 7bbf3101f55f · 2025-09-11T08:43:17.000+12:00
* Rework country simplification (#77): We now use lists of words and drop terms, rather than applying many string mutations that might produce values that aren't in the list of terms. * Rework to reduce iteration complexity: single-word terms can be dropped in a single pass. * Fix typo and reduce iteration further: take the first non-dropped word rather than checking every word for whether it needs to be dropped and then taking the first of what remains. * Test and fix the empty-string and all-words-removed cases. * Handle additional whitespace. * Simplify the logic further: we can loop over an enumerated list and break when we find a word. Multi-part terms use index-based access, but they are rare enough that the performance hit of indexing the list should be low. * Linting. * Add more comments and optimise: moved the term building inside the `if` block because we don't need to build the lists if we don't have any candidate words. * Fix test failure.
diff --git a/src/hdx/location/country.py b/src/hdx/location/country.py
@@ -1,10 +1,8 @@
 """Country location"""
 
-import copy
 import logging
 import os.path
 import re
-from string import punctuation
 from typing import Dict, List, Optional, Tuple, Union
 
 import hxl
@@ -62,6 +60,7 @@ class Country:
         "BOLIVARIAN",
         "PLURINATIONAL",
         "PEOPLE'S",
+        "PEOPLES",
         "DUTCH PART",
         "FRENCH PART",
         "MALVINAS",
@@ -721,41 +720,90 @@ def simplify_countryname(cls, country: str) -> (str, List[str]):
         Returns:
             Tuple[str, List[str]]: Uppercase simplified country name and list of removed words
         """
-        countryupper = country.upper()
+        # Convert the input into an upper-cased list of words
+        countryupper = country.upper().strip()
         words = get_words_in_sentence(countryupper)
+
+        # Strip common patterns
         index = countryupper.find(",")
         if index != -1:
             countryupper = countryupper[:index]
         index = countryupper.find(":")
         if index != -1:
             countryupper = countryupper[:index]
-        regex = re.compile(r"\(.+?\)")
-        countryupper = regex.sub("", countryupper)
-        remove = copy.deepcopy(cls.simplifications)
-        for simplification1, simplification2 in cls.abbreviations.items():
-            countryupper = countryupper.replace(simplification1, "")
-            remove.append(simplification2)
-        for (
-            simplification1,
-            simplifications,
-        ) in cls.multiple_abbreviations.items():
-            countryupper = countryupper.replace(simplification1, "")
-            for simplification2 in simplifications:
-                remove.append(simplification2)
-        remove = "|".join(remove)
-        regex = re.compile(
-            r"[" + punctuation.replace("'", "") + r"]|\b(" + remove + r")\b",
-            flags=re.IGNORECASE,
-        )
-        countryupper = regex.sub("", countryupper)
-        countryupper = countryupper.strip()
-        countryupper_words = get_words_in_sentence(countryupper)
-        if len(countryupper_words) > 1:
-            countryupper = countryupper_words[0]
-        if countryupper:
-            countryupper = countryupper.strip(punctuation)
-            words.remove(countryupper)
-        return countryupper, words
+
+        if countryupper and not (countryupper[0] == "(" and countryupper[-1] == ")"):
+            regex = re.compile(r"\(.+?\)")
+            countryupper = regex.sub("", countryupper)
+
+        # Find the words that remain as candidates for the simplified name.
+        # These are guaranteed to be a subset of `words` because we have only pruned
+        # parts from the sentence and not done any transformative processing.
+        candidate_words = get_words_in_sentence(countryupper)
+
+        if candidate_words:
+            # Make the simplifying terms indexable for efficient lookup
+            multiword_terms = {}
+            singleword_terms = set()
+
+            for terms in [
+                cls.simplifications,
+                cls.abbreviations.keys(),
+                cls.abbreviations.values(),
+                cls.multiple_abbreviations.keys(),
+            ] + list(cls.multiple_abbreviations.values()):
+                for term in terms:
+                    if " " in term:
+                        # Index each multi-word term by its first word, mapping to its list of words
+                        term_parts = term.split(" ")
+                        multiword_terms[term_parts[0]] = term_parts
+                    else:
+                        # Add single word terms to the set, and add their dot-less form as well
+                        singleword_terms.add(term)
+                        if term[-1] == ".":
+                            singleword_terms.add(term.strip("."))
+
+            num_candidate_words = len(candidate_words)
+            simplified_term = ""
+            enumerated_words = enumerate(candidate_words)
+            default = (num_candidate_words, "")
+
+            # Iterate through the candidate terms until we a) find a non-simplified word
+            # or b) hit the end of the list of words
+            while (val := next(enumerated_words, default)) != default:
+                i, word = val
+                if word in singleword_terms:
+                    # If the word was a single word simplification term then skip it
+                    continue
+                if (
+                    # If the current term is the first word in a multi-part term
+                    (term_parts := multiword_terms.get(word))
+                    # And there are enough words left in the sentence
+                    and i + len(term_parts) <= num_candidate_words
+                    # And all of the words in the multi-word phrase are in sequence
+                    # in the candidate term starting at the current position
+                    and all(
+                        candidate_words[i + j] == term_part
+                        for j, term_part in enumerate(term_parts)
+                    )
+                ):
+                    # Then skip the other words in the term and continue
+                    for _ in range(len(term_parts) - 1):
+                        next(enumerated_words)
+
+                    continue
+                # Else we found a word that we aren't dropping - it is our simplified word.
+                # Take it and break.
+                simplified_term = word
+                break
+
+            if simplified_term:
+                # We found a simplified term. Remove it from the list of other terms
+                words.remove(simplified_term)
+        else:
+            simplified_term = ""
+
+        return simplified_term, words
 
     @classmethod
     def get_iso3_country_code(
diff --git a/tests/hdx/location/test_country.py b/tests/hdx/location/test_country.py
@@ -644,38 +644,144 @@ def test_expand_countryname_abbrevs(self):
         ]
 
     def test_simplify_countryname(self):
-        assert Country.simplify_countryname("jpn") == ("JPN", list())
+        # Test that we handle the empty string case
+        assert Country.simplify_countryname("") == ("", [])
+
+        # Test that country codes and arbitrary words return just the word but capitalised
+        assert Country.simplify_countryname("jpn") == ("JPN", [])
+        assert Country.simplify_countryname("test") == ("TEST", [])
+
+        # Test simplified terms are removed, including abbreviations
         assert Country.simplify_countryname("United Rep. of Tanzania") == (
             "TANZANIA",
             ["UNITED", "REP", "OF"],
         )
+        assert Country.simplify_countryname(
+            "The former Yugoslav Republic of Macedonia"
+        ) == ("MACEDONIA", ["THE", "FORMER", "YUGOSLAV", "REPUBLIC", "OF"])
+
+        # Test different word orderings and bracketing are consistent
         assert Country.simplify_countryname("Micronesia (Federated States of)") == (
             "MICRONESIA",
             ["FEDERATED", "STATES", "OF"],
         )
+        assert Country.simplify_countryname("Federated States of Micronesia") == (
+            "MICRONESIA",
+            ["FEDERATED", "STATES", "OF"],
+        )
+        assert Country.simplify_countryname("(Federated States of) Micronesia") == (
+            "MICRONESIA",
+            ["FEDERATED", "STATES", "OF"],
+        )
+
+        # Test that the simplified terms on their own are dropped and that we handle
+        # the "no simplified term" case
+        assert Country.simplify_countryname("Federated States") == (
+            "",
+            ["FEDERATED", "STATES"],
+        )
+
+        # Test that multi-word simplifications are dropped
+        assert Country.simplify_countryname("French Part of Saint Martin") == (
+            "MARTIN",
+            ["FRENCH", "PART", "OF", "SAINT"],
+        )
+        assert Country.simplify_countryname("French Part of Saint-Martin") == (
+            "MARTIN",
+            ["FRENCH", "PART", "OF", "SAINT"],
+        )
+        # "French Part" is a simplification and so can't be the simplified term
+        assert Country.simplify_countryname("French Part") == ("", ["FRENCH", "PART"])
+        # But the words must be consecutive for multi-part terms,
+        # so we don't drop "French" and "part" here
+        assert Country.simplify_countryname("French and Part") == (
+            "FRENCH",
+            ["AND", "PART"],
+        )
+
+        # Test that we handle abbreviations with and without punctuation
         assert Country.simplify_countryname("Dem. Rep. of the Congo") == (
             "CONGO",
             ["DEM", "REP", "OF", "THE"],
         )
+        assert Country.simplify_countryname("Dem Rep of the Congo") == (
+            "CONGO",
+            ["DEM", "REP", "OF", "THE"],
+        )
+
+        # Test that we handle the "Country, Specifics" comma format
         assert Country.simplify_countryname(
             "Korea, Democratic People's Republic of"
         ) == ("KOREA", ["DEMOCRATIC", "PEOPLE'S", "REPUBLIC", "OF"])
         assert Country.simplify_countryname(
             "Democratic People's Republic of Korea"
         ) == ("KOREA", ["DEMOCRATIC", "PEOPLE'S", "REPUBLIC", "OF"])
+
+        # Test that we handle more bracketed formats
         assert Country.simplify_countryname("Korea (the Republic of))") == (
             "KOREA",
             ["THE", "REPUBLIC", "OF"],
         )
+        # Regression test for bug #70 - partial brackets
         assert Country.simplify_countryname("Korea (the Republic of") == (
             "KOREA",
             ["THE", "REPUBLIC", "OF"],
         )
-        assert Country.simplify_countryname(
-            "The former Yugoslav Republic of Macedonia"
-        ) == ("MACEDONIA", ["THE", "FORMER", "YUGOSLAV", "REPUBLIC", "OF"])
+
+        # Test that we don't strip everything just because it's bracketed, even if the brackets
+        # are surrounded by whitespace
+        assert Country.simplify_countryname("(the Republic of Korea)") == (
+            "KOREA",
+            ["THE", "REPUBLIC", "OF"],
+        )
+        assert Country.simplify_countryname("   (the Republic of Korea)   ") == (
+            "KOREA",
+            ["THE", "REPUBLIC", "OF"],
+        )
+
+        # Test that we're actually stripping the brackets and that it's not all just been
+        # simplified words that we'd drop anyway, even if they weren't in brackets
+        assert Country.simplify_countryname("(Sometimes) Korea") == (
+            "KOREA",
+            ["SOMETIMES"],
+        )
+
+        # Regression test for bug #75 - apostrophes in simplified term
         assert Country.simplify_countryname("d'Ivoire Côte") == ("D'IVOIRE", ["CÔTE"])
 
+        # Regression test for bug #77 - other punctuation in simplified term
+        assert Country.simplify_countryname("Guinea-Bissau") == ("GUINEA", ["BISSAU"])
+
+        # Test simplification of terms with apostrophes, and the non-apostrophe form
+        assert Country.simplify_countryname("People's Republic of Bangladesh") == (
+            "BANGLADESH",
+            ["PEOPLE'S", "REPUBLIC", "OF"],
+        )
+        assert Country.simplify_countryname("Peoples Republic of Bangladesh") == (
+            "BANGLADESH",
+            ["PEOPLES", "REPUBLIC", "OF"],
+        )
+        # Known limitation with "smart quote" handling
+        assert Country.simplify_countryname("People’s Republic of Bangladesh") == (
+            "PEOPLE’S",
+            ["REPUBLIC", "OF", "BANGLADESH"],
+        )
+
+        # Simplification assumes the input isn't an address: it keeps only the text before
+        # the first comma, even if that text isn't a country
+        assert Country.simplify_countryname("Paris, France") == (
+            "PARIS",
+            ["FRANCE"],
+        )
+
+        # Some people supply strings that aren't countries
+        # (often indirectly via `get_iso3_country_code_fuzzy()`)
+        # Ensure the function doesn't error, even if the value is meaningless.
+        assert Country.simplify_countryname("3.1 Global scores and ranking") == (
+            "3",
+            ["1", "GLOBAL", "SCORES", "AND", "RANKING"],
+        )
+
     def test_get_iso3_country_code(self):
         assert Country.get_iso3_country_code("jpn") == "JPN"
         assert Country.get_iso3_country_code("Dem. Rep. of the Congo") == "COD"