@@ -644,38 +644,144 @@ def test_expand_countryname_abbrevs(self):
644644 ]
645645
def test_simplify_countryname(self):
    """Check ``Country.simplify_countryname`` against a table of inputs.

    Every entry pairs a raw name with the exact tuple the call is expected
    to return: the simplified term (upper-cased) followed by the list of
    other words. Cases run in order and the first mismatch fails the test.
    """
    cases = [
        # Test that we handle the empty string case
        ("", ("", [])),
        # Test that country codes and arbitrary words return just the word
        # but capitalised
        ("jpn", ("JPN", [])),
        ("test", ("TEST", [])),
        # Test simplified terms are removed, including abbreviations
        ("United Rep. of Tanzania", ("TANZANIA", ["UNITED", "REP", "OF"])),
        (
            "The former Yugoslav Republic of Macedonia",
            ("MACEDONIA", ["THE", "FORMER", "YUGOSLAV", "REPUBLIC", "OF"]),
        ),
        # Test different word orderings and bracketing are consistent
        (
            "Micronesia (Federated States of)",
            ("MICRONESIA", ["FEDERATED", "STATES", "OF"]),
        ),
        (
            "Federated States of Micronesia",
            ("MICRONESIA", ["FEDERATED", "STATES", "OF"]),
        ),
        (
            "(Federated States of) Micronesia",
            ("MICRONESIA", ["FEDERATED", "STATES", "OF"]),
        ),
        # Test that the simplified terms on their own are dropped and that
        # we handle the "no simplified term" case
        ("Federated States", ("", ["FEDERATED", "STATES"])),
        # Test that multi-word simplifications are dropped
        (
            "French Part of Saint Martin",
            ("MARTIN", ["FRENCH", "PART", "OF", "SAINT"]),
        ),
        (
            "French Part of Saint-Martin",
            ("MARTIN", ["FRENCH", "PART", "OF", "SAINT"]),
        ),
        # "French Part" is a simplification and so can't be the simplified
        # term
        ("French Part", ("", ["FRENCH", "PART"])),
        # But the words must be consecutive for multi-part terms,
        # so we don't drop "French" and "part" here
        ("French and Part", ("FRENCH", ["AND", "PART"])),
        # Test that we handle abbreviations with and without punctuation
        ("Dem. Rep. of the Congo", ("CONGO", ["DEM", "REP", "OF", "THE"])),
        ("Dem Rep of the Congo", ("CONGO", ["DEM", "REP", "OF", "THE"])),
        # Test that we handle the "Country, Specifics" comma format
        (
            "Korea, Democratic People's Republic of",
            ("KOREA", ["DEMOCRATIC", "PEOPLE'S", "REPUBLIC", "OF"]),
        ),
        (
            "Democratic People's Republic of Korea",
            ("KOREA", ["DEMOCRATIC", "PEOPLE'S", "REPUBLIC", "OF"]),
        ),
        # Test that we handle more bracketed formats
        ("Korea (the Republic of))", ("KOREA", ["THE", "REPUBLIC", "OF"])),
        # Regression test for bug #70 - partial brackets
        ("Korea (the Republic of", ("KOREA", ["THE", "REPUBLIC", "OF"])),
        # Test that we don't strip everything just because it's bracketed,
        # even if the brackets are surrounded by whitespace
        ("(the Republic of Korea)", ("KOREA", ["THE", "REPUBLIC", "OF"])),
        (" (the Republic of Korea) ", ("KOREA", ["THE", "REPUBLIC", "OF"])),
        # Test that we're actually stripping the brackets and that it's not
        # all just been simplified words that we'd drop anyway, even if they
        # weren't in brackets
        ("(Sometimes) Korea", ("KOREA", ["SOMETIMES"])),
        # Regression test for bug #75 - apostrophes in simplified term
        ("d'Ivoire Côte", ("D'IVOIRE", ["CÔTE"])),
        # Regression test for bug #77 - other punctuation in simplified term
        ("Guinea-Bissau", ("GUINEA", ["BISSAU"])),
        # Test simplification of terms with apostrophes, and the
        # non-apostrophe form
        (
            "People's Republic of Bangladesh",
            ("BANGLADESH", ["PEOPLE'S", "REPUBLIC", "OF"]),
        ),
        (
            "Peoples Republic of Bangladesh",
            ("BANGLADESH", ["PEOPLES", "REPUBLIC", "OF"]),
        ),
        # Known limitation with "smart quote" handling
        (
            "People’s Republic of Bangladesh",
            ("PEOPLE’S", ["REPUBLIC", "OF", "BANGLADESH"]),
        ),
        # Simplifying assumes that it isn't getting an address and simplifies
        # to the first part around commas, even if it isn't a country
        ("Paris, France", ("PARIS", ["FRANCE"])),
        # Some people supply strings that aren't countries
        # (often indirectly via `get_iso3_country_code_fuzzy()`)
        # Ensure the function doesn't error, even if the value is meaningless.
        (
            "3.1 Global scores and ranking",
            ("3", ["1", "GLOBAL", "SCORES", "AND", "RANKING"]),
        ),
    ]

    for countryname, expected in cases:
        assert Country.simplify_countryname(countryname) == expected
784+
679785 def test_get_iso3_country_code (self ):
680786 assert Country .get_iso3_country_code ("jpn" ) == "JPN"
681787 assert Country .get_iso3_country_code ("Dem. Rep. of the Congo" ) == "COD"
0 commit comments