Skip to content

Commit 3367906

Browse files
authored
Improve correctness of POSIX locale conversion (#6575)
Closes #6573
1 parent c524ac2 commit 3367906

File tree

5 files changed

+39
-190
lines changed

5 files changed

+39
-190
lines changed

utils/env_preferences/src/parse/aliases.rs

Lines changed: 0 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -4,63 +4,6 @@
44

55
//! Platform-specific conversion from locale strings to BCP-47 identifiers.
66
7-
/// Find a BCP-47 language/region from a list of POSIX locale aliases.
8-
///
9-
/// Some of these conversions are approximate and not exact (e.g. "C"->"und").
10-
/// This is based on GNU libc's `intl/locale.alias` file, with some manual processing
11-
/// to remove duplicates. The original file is available at:
12-
/// <https://sourceware.org/git/?p=glibc.git;a=blob;f=intl/locale.alias;hb=HEAD>
13-
pub fn find_posix_alias(
14-
alias: &str,
15-
) -> Option<(
16-
icu_locale_core::subtags::Language,
17-
Option<icu_locale_core::subtags::Region>,
18-
)> {
19-
use icu_locale_core::subtags::{language, region, Language};
20-
21-
match alias {
22-
"C" | "POSIX" => Some((Language::UNKNOWN, None)),
23-
"bokmal" => Some((language!("nb"), Some(region!("NO")))),
24-
"catalan" => Some((language!("ca"), Some(region!("ES")))),
25-
"croatian" => Some((language!("hr"), Some(region!("HR")))),
26-
"czech" => Some((language!("cs"), Some(region!("CZ")))),
27-
"danish" => Some((language!("da"), Some(region!("DK")))),
28-
"dansk" => Some((language!("da"), Some(region!("DK")))),
29-
"deutsch" => Some((language!("de"), Some(region!("DE")))),
30-
"dutch" => Some((language!("nl"), Some(region!("NL")))),
31-
"eesti" => Some((language!("et"), Some(region!("EE")))),
32-
"estonian" => Some((language!("et"), Some(region!("EE")))),
33-
"finnish" => Some((language!("fi"), Some(region!("FI")))),
34-
"french" => Some((language!("fr"), Some(region!("FR")))),
35-
"galego" => Some((language!("gl"), Some(region!("ES")))),
36-
"galician" => Some((language!("gl"), Some(region!("ES")))),
37-
"german" => Some((language!("de"), Some(region!("DE")))),
38-
"greek" => Some((language!("el"), Some(region!("GR")))),
39-
"hebrew" => Some((language!("he"), Some(region!("IL")))),
40-
"hrvatski" => Some((language!("hr"), Some(region!("HR")))),
41-
"hungarian" => Some((language!("hu"), Some(region!("HU")))),
42-
"icelandic" => Some((language!("is"), Some(region!("IS")))),
43-
"italian" => Some((language!("it"), Some(region!("IT")))),
44-
"japanese" => Some((language!("ja"), Some(region!("JP")))),
45-
"korean" => Some((language!("ko"), Some(region!("KR")))),
46-
"lithuanian" => Some((language!("lt"), Some(region!("LT")))),
47-
"norwegian" => Some((language!("nb"), Some(region!("NO")))),
48-
"nynorsk" => Some((language!("nn"), Some(region!("NO")))),
49-
"polish" => Some((language!("pl"), Some(region!("PL")))),
50-
"portuguese" => Some((language!("pt"), Some(region!("PT")))),
51-
"romanian" => Some((language!("ro"), Some(region!("RO")))),
52-
"russian" => Some((language!("ru"), Some(region!("RU")))),
53-
"slovak" => Some((language!("sk"), Some(region!("SK")))),
54-
"slovene" => Some((language!("sl"), Some(region!("SI")))),
55-
"slovenian" => Some((language!("sl"), Some(region!("SI")))),
56-
"spanish" => Some((language!("es"), Some(region!("ES")))),
57-
"swedish" => Some((language!("sv"), Some(region!("SE")))),
58-
"thai" => Some((language!("th"), Some(region!("TH")))),
59-
"turkish" => Some((language!("tr"), Some(region!("TR")))),
60-
_ => None,
61-
}
62-
}
63-
647
/// Strip any Windows "Sort Order Identifier" and return a matching CLDR collation value.
658
///
669
/// The full table is available at:

utils/env_preferences/src/parse/posix.rs

Lines changed: 26 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
//! # use env_preferences::LocaleError;
1313
//! # fn main() -> Result<(), LocaleError> {
1414
//! let posix_locale = PosixLocale::try_from_str("en_US")?;
15-
//! assert_eq!(posix_locale.try_convert_lossy()?, locale!("en-US-posix"));
15+
//! assert_eq!(posix_locale.try_convert_lossy()?, locale!("en-US"));
1616
//! # Ok(())
1717
//! # }
1818
//! ```
@@ -21,12 +21,10 @@ use displaydoc::Display;
2121
use icu_locale_core::extensions::unicode::{key, value};
2222
use icu_locale_core::extensions::Extensions;
2323
use icu_locale_core::subtags::{language, script, variant, Language, Region, Variants};
24-
use icu_locale_core::{LanguageIdentifier, Locale};
24+
use icu_locale_core::{locale, LanguageIdentifier, Locale};
2525

2626
use crate::ParseError;
2727

28-
use super::aliases::find_posix_alias;
29-
3028
#[derive(Display, Debug, PartialEq)]
3129
/// An error while parsing a POSIX locale identifier
3230
pub enum PosixParseError {
@@ -208,17 +206,17 @@ impl<'src> PosixLocale<'src> {
208206
/// // Regular locales are converted directly, without a `posix` variant
209207
/// assert_eq!(
210208
/// PosixLocale::try_from_str("en_US")?.try_convert_lossy()?,
211-
/// locale!("en-US-posix")
209+
/// locale!("en-US")
212210
/// );
213211
/// // The codeset field will be ignored
214212
/// assert_eq!(
215213
/// PosixLocale::try_from_str("en_US.iso88591")?.try_convert_lossy()?,
216-
/// locale!("en-US-posix")
214+
/// locale!("en-US")
217215
/// );
218216
/// // Any unknown modifiers will be ignored
219217
/// assert_eq!(
220218
/// PosixLocale::try_from_str("en_US@unknown")?.try_convert_lossy()?,
221-
/// locale!("en-US-posix")
219+
/// locale!("en-US")
222220
/// );
223221
/// # Ok(())
224222
/// # }
@@ -230,63 +228,57 @@ impl<'src> PosixLocale<'src> {
230228
/// # use env_preferences::parse::posix::PosixLocale;
231229
/// # use env_preferences::LocaleError;
232230
/// # fn main() -> Result<(), LocaleError> {
233-
/// // The default "C"/"POSIX" locale will be converted to "und"
231+
/// // The default "C"/"POSIX" locale will be converted to "en-US-posix"
234232
/// assert_eq!(
235233
/// PosixLocale::try_from_str("C")?.try_convert_lossy()?,
236-
/// locale!("und-posix")
234+
/// locale!("en-US-posix")
237235
/// );
238236
/// assert_eq!(
239237
/// PosixLocale::try_from_str("POSIX")?.try_convert_lossy()?,
240-
/// locale!("und-posix")
241-
/// );
242-
///
243-
/// // Known language aliases will be converted to the matching BCP-47 identifier
244-
/// assert_eq!(
245-
/// PosixLocale::try_from_str("french")?.try_convert_lossy()?,
246-
/// locale!("fr-FR-posix")
238+
/// locale!("en-US-posix")
247239
/// );
248240
///
249241
/// // Known script modifiers will be converted to the matching BCP-47 script subtags
250242
/// assert_eq!(
251243
/// PosixLocale::try_from_str("uz_UZ@cyrillic")?.try_convert_lossy()?,
252-
/// locale!("uz-Cyrl-UZ-posix")
244+
/// locale!("uz-Cyrl-UZ")
253245
/// );
254246
/// assert_eq!(
255247
/// PosixLocale::try_from_str("ks_IN@devanagari")?.try_convert_lossy()?,
256-
/// locale!("ks-Deva-IN-posix")
248+
/// locale!("ks-Deva-IN")
257249
/// );
258250
/// assert_eq!(
259251
/// PosixLocale::try_from_str("be_BY@latin")?.try_convert_lossy()?,
260-
/// locale!("be-Latn-BY-posix")
252+
/// locale!("be-Latn-BY")
261253
/// );
262254
///
263255
/// // Other known modifiers are handled accordingly
264256
/// assert_eq!(
265257
/// PosixLocale::try_from_str("en_US@euro")?.try_convert_lossy()?,
266-
/// locale!("en-US-posix-u-cu-eur")
258+
/// locale!("en-US-u-cu-eur")
267259
/// );
268260
/// assert_eq!(
269261
/// PosixLocale::try_from_str("aa_ER@saaho")?.try_convert_lossy()?,
270-
/// locale!("ssy-ER-posix")
262+
/// locale!("ssy-ER")
271263
/// );
272264
/// # Ok(())
273265
/// # }
274266
/// ```
275267
pub fn try_convert_lossy(&self) -> Result<Locale, ParseError> {
276-
// Check if the language matches a known alias (e.g. "nynorsk"->("nn", "NO"))
277-
let (mut language, region) = match find_posix_alias(self.language) {
278-
Some((language, region)) => (language, region),
279-
None => {
280-
let language = Language::try_from_str(self.language)?;
281-
let region = self.territory.map(Region::try_from_str).transpose()?;
282-
283-
(language, region)
284-
}
285-
};
268+
// The default "C"/"POSIX" locale should map to "en-US-posix",
269+
// which is the default behaviour in ICU4C:
270+
// https://github.com/unicode-org/icu/blob/795d7ac82c4b29cf721d0ad62c0b178347d453bf/icu4c/source/common/putil.cpp#L1738
271+
if self.language == "C" || self.language == "POSIX" {
272+
return Ok(locale!("en-US-posix"));
273+
}
286274

287275
let mut extensions = Extensions::new();
288276
let mut script = None;
289-
let mut variants = vec![variant!("posix")];
277+
let mut variant = None;
278+
279+
// Parse the language/region
280+
let mut language = Language::try_from_str(self.language)?;
281+
let region = self.territory.map(Region::try_from_str).transpose()?;
290282

291283
if let Some(modifier) = self.modifier {
292284
match modifier.to_ascii_lowercase().as_str() {
@@ -300,8 +292,7 @@ impl<'src> PosixLocale<'src> {
300292
// Saaho seems to be the only "legacy variant" that appears as a modifier:
301293
// https://www.unicode.org/reports/tr35/#table-legacy-variant-mappings
302294
"saaho" => language = language!("ssy"),
303-
// This keeps `variants` sorted; "-posix" comes before "-valencia"
304-
"valencia" => variants.push(variant!("valencia")),
295+
"valencia" => variant = Some(variant!("valencia")),
305296
// Some modifiers are known but can't be expressed as a BCP-47 identifier
306297
// e.g. "@abegede", "@iqtelif"
307298
_ => (),
@@ -313,7 +304,7 @@ impl<'src> PosixLocale<'src> {
313304
language,
314305
region,
315306
script,
316-
variants: Variants::from_vec_unchecked(variants),
307+
variants: variant.map_or_else(Variants::new, Variants::from_variant),
317308
},
318309
extensions,
319310
})

utils/env_preferences/tests/datasets/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
/// List of real-world POSIX locales, based on GNU libc's localization data
88
/// Data used:
99
/// - Filenames in https://sourceware.org/git/?p=glibc.git;a=tree;f=localedata/locales;hb=HEAD
10-
/// - Aliases: https://sourceware.org/git/?p=glibc.git;a=blob;f=intl/locale.alias;hb=HEAD
1110
/// - Legacy `@saaho` modifier: https://sourceware.org/git/?p=glibc.git;a=blob;f=ChangeLog.old/ChangeLog.28;hb=HEAD#l356
1211
const POSIX_DATASET: &str = include_str!("posix.txt");
1312
/// List of real-world Windows locales, based on Microsoft's LCID reference.

0 commit comments

Comments
 (0)