|
1 | 1 | #!/usr/bin/perl -w
|
2 | 2 |
|
| 3 | +use v5.41; |
| 4 | +no feature 'signatures'; # For the time being, to avoid converting |
3 | 5 | use Text::Tabs;
|
4 | 6 |
|
5 | 7 | # Unconditionally regenerate:
|
@@ -2195,48 +2197,42 @@ sub dictionary_order {
|
2195 | 2197 |
|
2196 | 2198 | no warnings 'non_unicode';
|
2197 | 2199 |
|
2198 |
| - local $a = $a; |
2199 |
| - local $b = $b; |
| 2200 | + my $mod_string_for_dictionary_order = sub { |
| 2201 | + my $string = shift; |
2200 | 2202 |
|
2201 |
| - # Convert all digit sequences to same length with leading zeros, so for |
2202 |
| - # example, 8 will compare less than 16 (using a fill length value that |
2203 |
| - # should be longer than any sequence in the input). |
2204 |
| - $a =~ s/(\d+)/sprintf "%06d", $1/ge; |
2205 |
| - $b =~ s/(\d+)/sprintf "%06d", $1/ge; |
| 2203 | + # Convert all digit sequences to be the same length with leading |
| 2204 | + # zeros, so that, for example '8' will sort before '16' (using a fill |
| 2205 | + # length value that should be longer than any sequence in the input). |
| 2206 | + $string =~ s/(\d+)/sprintf "%06d", $1/ge; |
2206 | 2207 |
|
2207 |
| - # Translate any underscores and digits so they compare after all Unicode |
2208 |
| - # characters |
2209 |
| - $a =~ tr[_0-9]/\x{110000}-\x{11000A}/; |
2210 |
| - $b =~ tr[_0-9]/\x{110000}-\x{11000A}/; |
| 2208 | + # Translate any underscores so they sort lowest. This causes |
| 2209 | + # 'word1_word2' to sort before 'word1word2' for all words. And |
| 2210 | + # translate any digits so they come after anything else. This causes |
| 2211 | + # digits to sort highest) |
| 2212 | + $string =~ tr[_0-9]/\0\x{110000}-\x{110009}/; |
2211 | 2213 |
|
2212 |
| - use feature 'state'; |
2213 |
| - # Modify \w, \W to reflect the changes. |
2214 |
| - state $ud = '\x{110000}-\x{11000A}'; # xlated underscore, digits |
2215 |
| - state $w = "\\w$ud"; # new \w string |
2216 |
| - state $mod_w = qr/[$w]/; |
2217 |
| - state $mod_W = qr/[^$w]/; |
| 2214 | + # Then move leading underscores to the end, translating them to above |
| 2215 | + # everything else. This causes '_word_' to compare just after 'word_' |
| 2216 | + $string .= "\x{11000A}" x length $1 if $string =~ s/ ^ (\0+) //x; |
2218 | 2217 |
|
2219 |
| - # Only \w for initial comparison |
2220 |
| - my $a_only_word = uc($a =~ s/$mod_W//gr); |
2221 |
| - my $b_only_word = uc($b =~ s/$mod_W//gr); |
| 2218 | + return $string; |
| 2219 | + }; |
2222 | 2220 |
|
2223 |
| - # And not initial nor interior underscores nor digits (by squeezing them |
2224 |
| - # out) |
2225 |
| - my $a_stripped = $a_only_word =~ s/ (*atomic:[$ud]+) (*pla: $mod_w ) //grxx; |
2226 |
| - my $b_stripped = $b_only_word =~ s/ (*atomic:[$ud]+) (*pla: $mod_w ) //grxx; |
| 2221 | + # Modify \w, \W to reflect what the above sub does. |
| 2222 | + state $w = "\\w\0\x{110000}-\x{11000A}"; # new \w string |
| 2223 | + state $mod_w = qr/[$w]/; |
| 2224 | + state $mod_W = qr/[^$w]/; |
2227 | 2225 |
|
2228 |
| - # If the stripped versions differ, use that as the comparison. |
2229 |
| - my $cmp = $a_stripped cmp $b_stripped; |
2230 |
| - return $cmp if $cmp; |
| 2226 | + local $a = $mod_string_for_dictionary_order->($a); |
| 2227 | + local $b = $mod_string_for_dictionary_order->($b); |
2231 | 2228 |
|
2232 |
| - # For the first tie breaker, repeat, but consider initial and interior |
2233 |
| - # underscores and digits, again having those compare after all Unicode |
2234 |
| - # characters |
2235 |
| - $cmp = $a_only_word cmp $b_only_word; |
| 2229 | + # If the strings stripped of \W differ, use that as the comparison. |
| 2230 | + my $cmp = lc ($a =~ s/$mod_W//gr) cmp lc ($b =~ s/$mod_W//gr); |
2236 | 2231 | return $cmp if $cmp;
|
2237 | 2232 |
|
2238 |
| - # Next tie breaker is just a caseless comparison |
2239 |
| - $cmp = uc($a) cmp uc($b); |
| 2233 | + # For the first tie breaker use a plain caseless comparison of the |
| 2234 | + # modified strings |
| 2235 | + $cmp = lc $a cmp lc $b; |
2240 | 2236 | return $cmp if $cmp;
|
2241 | 2237 |
|
2242 | 2238 | # Finally a straight comparison
|
|
0 commit comments