Skip to content

Commit 7bbf310

Browse files
Rework country simplification (#78)
* Rework country simplification (#77)
  We now use lists of words and drop terms rather than trying to do lots of string mutations that might give us different values that aren't in the list of terms.
* Rework to reduce iteration complexity
  Single word terms can be dropped in a single pass
* Fix typo and reduce iteration further
  Take the first non-dropped word rather than checking all words for whether they need to be dropped and then taking the first from what remains.
* Test and fix empty string and all removed cases
* Handle additional whitespace
* Simplify the logic further
  We can loop over an enumerated list and break when we find a word. Multi-part words do index-based access but it's rare enough that the performance hit for indexing an array should be low.
* Linting
* Add more comments and optimise
  Moved the term building inside the IF block because we don't need to build the lists if we don't have any candidate words!
* Fix test failure
1 parent 84f5e15 commit 7bbf310

File tree

2 files changed

+188
-34
lines changed

2 files changed

+188
-34
lines changed

src/hdx/location/country.py

Lines changed: 78 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,8 @@
11
"""Country location"""
22

3-
import copy
43
import logging
54
import os.path
65
import re
7-
from string import punctuation
86
from typing import Dict, List, Optional, Tuple, Union
97

108
import hxl
@@ -62,6 +60,7 @@ class Country:
6260
"BOLIVARIAN",
6361
"PLURINATIONAL",
6462
"PEOPLE'S",
63+
"PEOPLES",
6564
"DUTCH PART",
6665
"FRENCH PART",
6766
"MALVINAS",
@@ -721,41 +720,90 @@ def simplify_countryname(cls, country: str) -> (str, List[str]):
721720
Returns:
722721
Tuple[str, List[str]]: Uppercase simplified country name and list of removed words
723722
"""
724-
countryupper = country.upper()
723+
# Convert the input into an upper-cased list of words
724+
countryupper = country.upper().strip()
725725
words = get_words_in_sentence(countryupper)
726+
727+
# Strip common patterns
726728
index = countryupper.find(",")
727729
if index != -1:
728730
countryupper = countryupper[:index]
729731
index = countryupper.find(":")
730732
if index != -1:
731733
countryupper = countryupper[:index]
732-
regex = re.compile(r"\(.+?\)")
733-
countryupper = regex.sub("", countryupper)
734-
remove = copy.deepcopy(cls.simplifications)
735-
for simplification1, simplification2 in cls.abbreviations.items():
736-
countryupper = countryupper.replace(simplification1, "")
737-
remove.append(simplification2)
738-
for (
739-
simplification1,
740-
simplifications,
741-
) in cls.multiple_abbreviations.items():
742-
countryupper = countryupper.replace(simplification1, "")
743-
for simplification2 in simplifications:
744-
remove.append(simplification2)
745-
remove = "|".join(remove)
746-
regex = re.compile(
747-
r"[" + punctuation.replace("'", "") + r"]|\b(" + remove + r")\b",
748-
flags=re.IGNORECASE,
749-
)
750-
countryupper = regex.sub("", countryupper)
751-
countryupper = countryupper.strip()
752-
countryupper_words = get_words_in_sentence(countryupper)
753-
if len(countryupper_words) > 1:
754-
countryupper = countryupper_words[0]
755-
if countryupper:
756-
countryupper = countryupper.strip(punctuation)
757-
words.remove(countryupper)
758-
return countryupper, words
734+
735+
if countryupper and not (countryupper[0] == "(" and countryupper[-1] == ")"):
736+
regex = re.compile(r"\(.+?\)")
737+
countryupper = regex.sub("", countryupper)
738+
739+
# Find the words that remain as candidates for the simplified name.
740+
# These are guaranteed to be a subset of `words` because we have only pruned
741+
# parts from the sentence and not done any transformative processing.
742+
candidate_words = get_words_in_sentence(countryupper)
743+
744+
if candidate_words:
745+
# Make the simplifying terms indexable for efficient lookup
746+
multiword_terms = {}
747+
singleword_terms = set()
748+
749+
for terms in [
750+
cls.simplifications,
751+
cls.abbreviations.keys(),
752+
cls.abbreviations.values(),
753+
cls.multiple_abbreviations.keys(),
754+
] + list(cls.multiple_abbreviations.values()):
755+
for term in terms:
756+
if " " in term:
757+
# Index multi-word terms by the first term against a list of the terms
758+
term_parts = term.split(" ")
759+
multiword_terms[term_parts[0]] = term_parts
760+
else:
761+
# Add single word terms to the set, and add their dot-less form as well
762+
singleword_terms.add(term)
763+
if term[-1] == ".":
764+
singleword_terms.add(term.strip("."))
765+
766+
num_candidate_words = len(candidate_words)
767+
simplified_term = ""
768+
enumerated_words = enumerate(candidate_words)
769+
default = (num_candidate_words, "")
770+
771+
# Iterate through the candidate terms until we a) find a non-simplified word
772+
# or b) hit the end of the list of words
773+
while (val := next(enumerated_words, default)) != default:
774+
i, word = val
775+
if word in singleword_terms:
776+
# If the word was a single word simplification term then skip it
777+
continue
778+
if (
779+
# If the current term is the first word in a multi-part term
780+
(term_parts := multiword_terms.get(word))
781+
# And there are enough words left in the sentence
782+
and i + len(term_parts) <= num_candidate_words
783+
# And all of the words in the multi-word phrase are in sequence
784+
# in the candidate term starting at the current position
785+
and all(
786+
candidate_words[i + j] == term_part
787+
for j, term_part in enumerate(term_parts)
788+
)
789+
):
790+
# Then skip the other words in the term and continue
791+
for _ in range(len(term_parts) - 1):
792+
next(enumerated_words)
793+
794+
continue
795+
# Else we found a word that we aren't dropping - it is our simplified word.
796+
# Take it and break.
797+
simplified_term = word
798+
break
799+
800+
if simplified_term:
801+
# We found a simplified term. Remove it from the list of other terms
802+
words.remove(simplified_term)
803+
else:
804+
simplified_term = ""
805+
806+
return simplified_term, words
759807

760808
@classmethod
761809
def get_iso3_country_code(

tests/hdx/location/test_country.py

Lines changed: 110 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -644,38 +644,144 @@ def test_expand_countryname_abbrevs(self):
644644
]
645645

646646
def test_simplify_countryname(self):
647-
assert Country.simplify_countryname("jpn") == ("JPN", list())
647+
# Test that we handle the empty string case
648+
assert Country.simplify_countryname("") == ("", [])
649+
650+
# Test that country codes and arbitrary words return just the word but capitalised
651+
assert Country.simplify_countryname("jpn") == ("JPN", [])
652+
assert Country.simplify_countryname("test") == ("TEST", [])
653+
654+
# Test simplified terms are removed, including abbreviations
648655
assert Country.simplify_countryname("United Rep. of Tanzania") == (
649656
"TANZANIA",
650657
["UNITED", "REP", "OF"],
651658
)
659+
assert Country.simplify_countryname(
660+
"The former Yugoslav Republic of Macedonia"
661+
) == ("MACEDONIA", ["THE", "FORMER", "YUGOSLAV", "REPUBLIC", "OF"])
662+
663+
# Test different word orderings and bracketing are consistent
652664
assert Country.simplify_countryname("Micronesia (Federated States of)") == (
653665
"MICRONESIA",
654666
["FEDERATED", "STATES", "OF"],
655667
)
668+
assert Country.simplify_countryname("Federated States of Micronesia") == (
669+
"MICRONESIA",
670+
["FEDERATED", "STATES", "OF"],
671+
)
672+
assert Country.simplify_countryname("(Federated States of) Micronesia") == (
673+
"MICRONESIA",
674+
["FEDERATED", "STATES", "OF"],
675+
)
676+
677+
# Test that the simplified terms on their own are dropped and that we handle
678+
# the "no simplified term" case
679+
assert Country.simplify_countryname("Federated States") == (
680+
"",
681+
["FEDERATED", "STATES"],
682+
)
683+
684+
# Test that multi-word simplifications are dropped
685+
assert Country.simplify_countryname("French Part of Saint Martin") == (
686+
"MARTIN",
687+
["FRENCH", "PART", "OF", "SAINT"],
688+
)
689+
assert Country.simplify_countryname("French Part of Saint-Martin") == (
690+
"MARTIN",
691+
["FRENCH", "PART", "OF", "SAINT"],
692+
)
693+
# "French Part" is a simplification and so can't be the simplified term
694+
assert Country.simplify_countryname("French Part") == ("", ["FRENCH", "PART"])
695+
# But the words must be consecutive for multi-part terms,
696+
# so we don't drop "French" and "part" here
697+
assert Country.simplify_countryname("French and Part") == (
698+
"FRENCH",
699+
["AND", "PART"],
700+
)
701+
702+
# Test that we handle abbreviations with and without punctuation
656703
assert Country.simplify_countryname("Dem. Rep. of the Congo") == (
657704
"CONGO",
658705
["DEM", "REP", "OF", "THE"],
659706
)
707+
assert Country.simplify_countryname("Dem Rep of the Congo") == (
708+
"CONGO",
709+
["DEM", "REP", "OF", "THE"],
710+
)
711+
712+
# Test that we handle the "Country, Specifics" comma format
660713
assert Country.simplify_countryname(
661714
"Korea, Democratic People's Republic of"
662715
) == ("KOREA", ["DEMOCRATIC", "PEOPLE'S", "REPUBLIC", "OF"])
663716
assert Country.simplify_countryname(
664717
"Democratic People's Republic of Korea"
665718
) == ("KOREA", ["DEMOCRATIC", "PEOPLE'S", "REPUBLIC", "OF"])
719+
720+
# Test that we handle more bracketed formats
666721
assert Country.simplify_countryname("Korea (the Republic of))") == (
667722
"KOREA",
668723
["THE", "REPUBLIC", "OF"],
669724
)
725+
# Regression test for bug #70 - partial brackets
670726
assert Country.simplify_countryname("Korea (the Republic of") == (
671727
"KOREA",
672728
["THE", "REPUBLIC", "OF"],
673729
)
674-
assert Country.simplify_countryname(
675-
"The former Yugoslav Republic of Macedonia"
676-
) == ("MACEDONIA", ["THE", "FORMER", "YUGOSLAV", "REPUBLIC", "OF"])
730+
731+
# Test that we don't strip everything just because it's bracketed, even if the brackets
732+
# are surrounded by whitespace
733+
assert Country.simplify_countryname("(the Republic of Korea)") == (
734+
"KOREA",
735+
["THE", "REPUBLIC", "OF"],
736+
)
737+
assert Country.simplify_countryname(" (the Republic of Korea) ") == (
738+
"KOREA",
739+
["THE", "REPUBLIC", "OF"],
740+
)
741+
742+
# Test that we're actually stripping the brackets and that it's not all just been
743+
# simplified words that we'd drop anyway, even if they weren't in brackets
744+
assert Country.simplify_countryname("(Sometimes) Korea") == (
745+
"KOREA",
746+
["SOMETIMES"],
747+
)
748+
749+
# Regression test for bug #75 - apostrophes in simplified term
677750
assert Country.simplify_countryname("d'Ivoire Côte") == ("D'IVOIRE", ["CÔTE"])
678751

752+
# Regression test for bug #77 - other punctuation in simplified term
753+
assert Country.simplify_countryname("Guinea-Bissau") == ("GUINEA", ["BISSAU"])
754+
755+
# Test simplification of terms with apostrophes, and the non-apostrophe form
756+
assert Country.simplify_countryname("People's Republic of Bangladesh") == (
757+
"BANGLADESH",
758+
["PEOPLE'S", "REPUBLIC", "OF"],
759+
)
760+
assert Country.simplify_countryname("Peoples Republic of Bangladesh") == (
761+
"BANGLADESH",
762+
["PEOPLES", "REPUBLIC", "OF"],
763+
)
764+
# Known limitation with "smart quote" handling
765+
assert Country.simplify_countryname("People’s Republic of Bangladesh") == (
766+
"PEOPLE’S",
767+
["REPUBLIC", "OF", "BANGLADESH"],
768+
)
769+
770+
# Simplifying assumes that it isn't getting an address and simplifies to the first
771+
# part around commas, even if it isn't a country
772+
assert Country.simplify_countryname("Paris, France") == (
773+
"PARIS",
774+
["FRANCE"],
775+
)
776+
777+
# Some people supply strings that aren't countries
778+
# (often indirectly via `get_iso3_country_code_fuzzy()`)
779+
# Ensure the function doesn't error, even if the value is meaningless.
780+
assert Country.simplify_countryname("3.1 Global scores and ranking") == (
781+
"3",
782+
["1", "GLOBAL", "SCORES", "AND", "RANKING"],
783+
)
784+
679785
def test_get_iso3_country_code(self):
680786
assert Country.get_iso3_country_code("jpn") == "JPN"
681787
assert Country.get_iso3_country_code("Dem. Rep. of the Congo") == "COD"

0 commit comments

Comments
 (0)