Skip to content

Commit 9c334eb

Browse files
sipraga authored and torvalds committed
get_maintainer: correctly parse UTF-8 encoded names in files
While the script correctly extracts UTF-8 encoded names from the MAINTAINERS file, the regular expressions damage my name when parsing from .yaml files. Fix this by replacing the Latin-1-compatible regular expressions with the Unicode property matcher \p{L}, which matches on any letter according to the Unicode General Category of letters.

The proposed solution only works if the script uses proper string encoding from the outset, so instruct Perl to unconditionally open all files with UTF-8 encoding. This should be safe, as the entire source tree is either UTF-8 or ASCII encoded anyway. See [1] for a detailed analysis.

Furthermore, to prevent the \w expression from matching non-ASCII characters when checking whether a name should be escaped with quotes, add the /a flag to the regular expression. The escaping logic was duplicated in two places, so it has been factored out into its own function.

The original issue was also identified on the tools mailing list [2]. This should solve the observed side effects there as well.

Link: https://lore.kernel.org/all/dzn6uco4c45oaa3ia4u37uo5mlt33obecv7gghj2l756fr4hdh@mt3cprft3tmq/ [1]
Link: https://lore.kernel.org/tools/20230726-gush-slouching-a5cd41@meerkat/ [2]
Signed-off-by: Alvin Šipraga <alsi@bang-olufsen.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 453f5db commit 9c334eb

File tree

1 file changed

+17
-13
lines changed

1 file changed

+17
-13
lines changed

scripts/get_maintainer.pl

Lines changed: 17 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
use Cwd;
2121
use File::Find;
2222
use File::Spec::Functions;
23+
use open qw(:std :encoding(UTF-8));
2324

2425
my $cur_path = fastgetcwd() . '/';
2526
my $lk_path = "./";
@@ -445,7 +446,7 @@ sub maintainers_in_file {
445446
my $text = do { local($/) ; <$f> };
446447
close($f);
447448

448-
my @poss_addr = $text =~ m$[A-Za-zÀ-ÿ\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g;
449+
my @poss_addr = $text =~ m$[\p{L}\"\' \,\.\+-]*\s*[\,]*\s*[\(\<\{]{0,1}[A-Za-z0-9_\.\+-]+\@[A-Za-z0-9\.-]+\.[A-Za-z0-9]+[\)\>\}]{0,1}$g;
449450
push(@file_emails, clean_file_emails(@poss_addr));
450451
}
451452
}
@@ -1152,6 +1153,17 @@ sub top_of_kernel_tree {
11521153
return 0;
11531154
}
11541155

1156+
sub escape_name { ##return the name, wrapped in double quotes when it contains must-quote chars
1157+
my ($name) = @_; ##callers visible in this diff strip surrounding whitespace and outer quotes first
1158+
1159+
if ($name =~ /[^\w \-]/ai) { ##has "must quote" chars; the a flag keeps the word class ASCII-only, so non-ASCII letters also force quoting
1160+
$name =~ s/(?<!\\)"/\\"/g; ##escape quotes that are not already preceded by a backslash
1161+
$name = "\"$name\""; ##wrap the whole name in double quotes
1162+
}
1163+
1164+
return $name; ##unchanged when no quoting was needed
1165+
}
1166+
11551167
sub parse_email {
11561168
my ($formatted_email) = @_;
11571169

@@ -1169,13 +1181,9 @@ sub parse_email {
11691181

11701182
$name =~ s/^\s+|\s+$//g;
11711183
$name =~ s/^\"|\"$//g;
1184+
$name = escape_name($name);
11721185
$address =~ s/^\s+|\s+$//g;
11731186

1174-
if ($name =~ /[^\w \-]/i) { ##has "must quote" chars
1175-
$name =~ s/(?<!\\)"/\\"/g; ##escape quotes
1176-
$name = "\"$name\"";
1177-
}
1178-
11791187
return ($name, $address);
11801188
}
11811189

@@ -1186,13 +1194,9 @@ sub format_email {
11861194

11871195
$name =~ s/^\s+|\s+$//g;
11881196
$name =~ s/^\"|\"$//g;
1197+
$name = escape_name($name);
11891198
$address =~ s/^\s+|\s+$//g;
11901199

1191-
if ($name =~ /[^\w \-]/i) { ##has "must quote" chars
1192-
$name =~ s/(?<!\\)"/\\"/g; ##escape quotes
1193-
$name = "\"$name\"";
1194-
}
1195-
11961200
if ($usename) {
11971201
if ("$name" eq "") {
11981202
$formatted_email = "$address";
@@ -2462,13 +2466,13 @@ sub clean_file_emails {
24622466
$name = "";
24632467
}
24642468

2465-
my @nw = split(/[^A-Za-zÀ-ÿ\'\,\.\+-]/, $name);
2469+
my @nw = split(/[^\p{L}\'\,\.\+-]/, $name);
24662470
if (@nw > 2) {
24672471
my $first = $nw[@nw - 3];
24682472
my $middle = $nw[@nw - 2];
24692473
my $last = $nw[@nw - 1];
24702474

2471-
if (((length($first) == 1 && $first =~ m/[A-Za-z]/) ||
2475+
if (((length($first) == 1 && $first =~ m/\p{L}/) ||
24722476
(length($first) == 2 && substr($first, -1) eq ".")) ||
24732477
(length($middle) == 1 ||
24742478
(length($middle) == 2 && substr($middle, -1) eq "."))) {

0 commit comments

Comments
 (0)