Skip to content

Commit 79f995f

Browse files
committed
Complete rewrite of the script
Notable changes: - takes into account attachment directories - checks for files that exist in one archive and not in the other - compares all content in the archives - changed the heuristics to exclude the generated date/ version footer in generated messages - option to ignore files using regex - progress / quiet modes - takes into account symlinks
1 parent 8d9b198 commit 79f995f

File tree

1 file changed

+300
-56
lines changed

1 file changed

+300
-56
lines changed

tests/diff_hypermail_archives.pl

Lines changed: 300 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,314 @@
1-
#!/usr/local/bin/perl
1+
#!/usr/bin/perl
22

3+
# diff_hypermail_archives
34
#
4-
# Compare 2 directories with archives produced by Hypermail.
5+
# Compare two directories with archives produced by Hypermail.
56
# Intended primarily to test changes to Hypermail.
6-
# Written by Peter McCluskey (pcm@rahul.net).
77
#
8+
# Originally written by Peter McCluskey (pcm@rahul.net).
9+
# Rewritten and extended by Jose Kahan (jose.kahan@w3.org).
10+
#
11+
# TODO: If more portability is needed, try switch the sytem
12+
# diff and grep to File::Compare
13+
# https://perldoc.perl.org/File/Compare.html
814

9-
$dir1 = $ARGV[0];
10-
$dir2 = $ARGV[1];
15+
use strict;
16+
use warnings;
1117

12-
opendir(FDDIR,$dir1) || die("cant open $dir1");
13-
@files1 = sort readdir(FDDIR);
18+
use File::Find;
19+
use FindBin '$Script';
20+
use Cwd 'abs_path';
1421

15-
foreach $file (@files1)
16-
{
17-
my $file1name = "$dir1/$file";
18-
my $file2name = "$dir2/$file";
19-
if(!(-e $file2name))
20-
{
21-
print "$file2name is missing\n";
22-
next;
23-
}
24-
if(-d $file1name)
25-
{
26-
print "$file2name is not a directory\n" if(!(-d $file2name));
27-
next;
28-
}
29-
open(FD, "diff $file1name $file2name |") || die("cannot diff $file1name $file2name");
30-
while($line = <FD>)
31-
{
32-
if($line =~ /\d+c\d+/)
33-
{
34-
if(&non_date_diffs($diff1, $diff2))
35-
{
36-
print "$file: $diffs";
22+
use Getopt::Std;
23+
24+
##
25+
## configurable options
26+
##
27+
28+
# path to diff command
29+
our $diff_cmd = "/usr/bin/diff";
30+
# path to grep command
31+
our $grep_cmd= "/bin/grep";
32+
33+
##
34+
## End of configurable options
35+
##
36+
37+
##
38+
## Global variables
39+
##
40+
41+
# don't print progress messages
42+
our $quiet;
43+
# print a "." for each processed file
44+
our $show_progress;
45+
# enable to print extra messages
46+
our $debug;
47+
# ignore all content below the footer trailer
48+
our $ignore_footer;
49+
# hash with filenames, dirnames that must be ignored
50+
our @ignore_regex;
51+
# thw two dirs that need to be compared
52+
our $dir1;
53+
our $dir2;
54+
# global difference counter (we consider them errors)
55+
our $errors = 0;
56+
# attachmed dir prefix, hard-coded into hypermail
57+
our $attachment_dir_prefix = "att-";
58+
# archive generated by hypermail generated text blurb
59+
our $hmail_generated_by_text = q(This archive was generated by <a.*hypermail-project.org/");
60+
# footer trailer generated by hypermail
61+
our $footer_trailer = q(<!-- trailer="footer" -->);
62+
63+
# returns the line number of the footer or of the generated_by hypermail
64+
# blurb if found
65+
sub get_hypermail_generated_by_lines {
66+
my $filename = shift;
67+
my $counter = 0;
68+
my $expected_counter = ($ignore_footer) ? 1 : 2;
69+
my %generated_by_lines;
70+
71+
my $needle = $ignore_footer ? $footer_trailer : $hmail_generated_by_text;
72+
my @grep_args = ($grep_cmd, $ignore_footer ? "-A0" : "-A1", "-n", $needle, $filename);
73+
74+
open (my $fh, "-|", @grep_args) || die("cannot @grep_args\n");
75+
76+
while (my $line = <$fh>) {
77+
if ($line =~ m/^\d+-?:\ ?/) {
78+
my $line_nb = (split /:/, $line)[0];
79+
$line_nb =~ s/-$//;
80+
$generated_by_lines{$line_nb} = 1;
81+
$counter++;
82+
}
83+
}
84+
close ($fh);
85+
86+
return ($counter == $expected_counter) ? \%generated_by_lines : {};
87+
88+
} # get_hypermail_generated_by_lines
89+
90+
# checks if the filenames exist in both directories and the type
91+
sub compare_filenames {
92+
my ($filename1, $filename2) = @_;
93+
my $res = 0;
94+
95+
if (!-e $filename2) {
96+
$errors++;
97+
$res = -1;
98+
print "\n" if $show_progress;
99+
print "[$errors] $filename2 does not exist\n" unless $quiet;
100+
101+
} elsif (-d $filename1) {
102+
if (!-d $filename2) {
103+
print "\n" if $show_progress;
104+
print "[$errors] $filename2 is not a directory\n" unless $quiet;
105+
$errors++;
106+
}
107+
$res = -1;
108+
}
109+
110+
return $res;
111+
112+
} # compare_filenames
113+
114+
# filter out files we're not interested in
115+
sub filter_filenames {
116+
my $filename = shift;
117+
my $res = 0;
118+
119+
foreach my $regex (@ignore_regex) {
120+
if ($filename =~ m/$regex/) {
121+
$res = -1;
122+
print "$filename is ignored per regex: " . $regex . "\n" unless $quiet;
123+
}
124+
}
125+
126+
return $res;
127+
128+
} # filter_filenames
129+
130+
# does a diff on existing directories, files, and file content
131+
sub diff_files_complete {
132+
my $file = $_;
133+
my $filename1 = $File::Find::name;
134+
my $filename2 = $filename1;
135+
$filename2 =~ s/$dir1/$dir2/;
136+
my $generated_by_lines;
137+
my $footer_line;
138+
my $diffs = "";
139+
my $local_errors = 0;
140+
141+
print "." if $show_progress;
142+
143+
if (compare_filenames ($filename1, $filename2)
144+
|| filter_filenames ($filename1)) {
145+
return;
146+
}
147+
148+
my $is_attachment_dir = $filename1 =~ m/\/$attachment_dir_prefix/;
149+
150+
if (!$is_attachment_dir) {
151+
if ($filename1 =~ m/\.html$/) {
152+
$generated_by_lines = get_hypermail_generated_by_lines ($filename1);
153+
if ($ignore_footer) {
154+
$footer_line = (keys %{ $generated_by_lines } )[0];
37155
}
38-
$diffs = '';
39-
$diff1 = '';
40-
$diff2 = '';
156+
}
157+
}
158+
159+
print "comparing $filename1\n" if $debug;
160+
161+
my @diff_args = ($diff_cmd, $filename1, $filename2);
162+
open (my $fh, "-|", @diff_args) || die("cannot diff $filename1 $filename2\n");
163+
164+
while (my $line = <$fh>) {
165+
166+
if ($line eq "") {
167+
next;
168+
}
169+
170+
# for hypermail generated messages and indexes, if the diff
171+
# finds the the generated_by blurb, we assume that the only
172+
# things that changed are the version number and/or the
173+
# generation date. We ignore the rest of the diff output at
174+
# this point.
175+
if ($line =~ /\d+c\d+/) {
176+
if (!$is_attachment_dir) {
177+
my ($ln_1, $ln_2) = split /c/, $line, 2;
178+
179+
if ($ln_1 eq $ln_2 && %{ $generated_by_lines }) {
180+
if ($ignore_footer) {
181+
if ($ln_1 >= $footer_line) {
182+
last;
183+
}
184+
} elsif ($$generated_by_lines{$ln_1}) {
185+
last;
186+
}
187+
}
188+
}
189+
$local_errors++;
41190
}
42191
$diffs .= $line;
43-
$diff1 .= $line if(substr($line,0,1) eq '<');
44-
$diff2 .= $line if(substr($line,0,1) eq '>');
45192
}
46-
if(&non_date_diffs($diff1, $diff2))
47-
{
48-
print "$file: $diffs";
193+
close ($fh);
194+
195+
if ($diffs ne "" && !$quiet) {
196+
$errors++;
197+
print "\n" if $show_progress;
198+
print "[$errors] $filename1\n[$errors] $filename2: found $local_errors difference" . ($errors == 1 ? "" : "s") . "\n";
199+
print "$diffs\n";
49200
}
50-
$diffs = '';
51-
$diff1 = '';
52-
$diff2 = '';
53-
}
201+
202+
} # diff_files_complete
54203

55-
sub non_date_diffs
204+
# only does a diff to see if the same directories and files exist.
205+
# Ignores content differences.
206+
sub diff_files_dir {
207+
my $filename1 = $File::Find::name;
208+
my $filename2 = $filename1;
209+
$filename2 =~ s/$dir1/$dir2/;
210+
211+
print "." if $show_progress;
212+
213+
compare_filenames ($filename1, $filename2);
214+
215+
return;
216+
217+
} # diff_files
218+
219+
sub process_options {
220+
my %options=();
221+
222+
getopts("qhfpdi:", \%options);
223+
224+
$dir1 = $ARGV[0];
225+
$dir2 = $ARGV[1];
226+
227+
if (defined $options{d}) {
228+
$debug = 1;
229+
}
230+
if (defined $options{q}) {
231+
$quiet = 1;
232+
}
233+
234+
if (defined $options{p} && !$quiet && !$debug) {
235+
$show_progress = 1;
236+
}
237+
238+
if (defined $options{f}) {
239+
$ignore_footer = 1;
240+
}
241+
242+
if (defined $options{h} || !defined $dir1 || !defined $dir2) {
243+
die ("\nUsage: $Script [-q -h -i foo:bar] dir1 dir2\n"
244+
. "\t-q quiet mode\n"
245+
. "\t-p show processing progress\n"
246+
. "\t-h help prints this message\n"
247+
. "\t-f ignore all content below the footer trailer comment\n"
248+
. "\t-i list of colon separated regex corresponding to directories/filenames to ignore\n"
249+
. "\tdir1, dir2 paths to the two directories to compare\n\n");
250+
}
251+
252+
# remove trailing / if given
253+
$dir1 =~ s/\/+$//;
254+
$dir2 =~ s/\/+$//;
255+
256+
if (-l $dir1) {
257+
$dir1 = abs_path ($dir1);
258+
}
259+
260+
if (-l $dir2) {
261+
$dir2 = abs_path ($dir2);
262+
}
263+
264+
if (!-d $dir1) {
265+
die ("directory $dir1 doesn't exist\n");
266+
}
267+
268+
if (!-d $dir2) {
269+
die ("directory $dir2 doesn't exist\n");
270+
}
271+
272+
if (defined $options{i}) {
273+
@ignore_regex = split (/:/, $options{i});
274+
}
275+
276+
} # process_options
277+
278+
# main
56279
{
57-
my ($diff1, $diff2) = @_;
58-
my $i;
59-
return 1 if(length($diff1) != length($diff2));
60-
for($i = 0; $i < length($diff2); ++$i)
61-
{
62-
my $c1 = substr($diff1, $i, 1);
63-
my $c2 = substr($diff2, $i, 1);
64-
next if($c1 eq $c2);
65-
next if(($c1 eq "<") && ($c2 eq ">"));
66-
next if(($c1 =~ /\d/) && ($c2 =~ /\d/));
67-
return 1;
68-
}
69-
return 0;
70-
}
280+
# read command-line options
281+
process_options();
282+
283+
my %find_options = ('follow' => 1,
284+
'wanted' => \&diff_files_complete,
285+
);
286+
287+
print "\n" unless $quiet;
288+
print "comparing $dir1 against $dir2\n" unless $quiet;
289+
print "\n" if $debug && !$quiet;
290+
291+
find(\%find_options, $dir1);
292+
293+
# do the opposite diff too, to make sure we are not generating new files
294+
print "\n\n" if $show_progress || $debug;
295+
296+
print "comparing $dir2 filenames against $dir1\n" unless $quiet;
297+
298+
($dir1, $dir2) = ($dir2, $dir1);
299+
300+
$find_options{wanted} = \&diff_files_dir;
301+
find(\%find_options, $dir1);
302+
303+
print "\n" if $show_progress;
304+
print "\n" unless $quiet;
305+
306+
if ($errors) {
307+
print "=> $dir1 and $dir2 differ ($errors difference", $errors > 1 ? "s" : "", ")\n\n" unless $quiet;
308+
} else {
309+
print "=> Archives are identical\n\n" unless $quiet;
310+
}
311+
312+
exit (($errors == 0) ? 0 : -1);
313+
314+
} # main

0 commit comments

Comments
 (0)