
Commit b55c4b3

Merge pull request #64 from Mark-Simulacrum/custom-recompress
Replace rayon with manual parallelism
2 parents 151315f + 3ff132d

File tree

1 file changed: +144 -126 lines


src/recompress.rs

Lines changed: 144 additions & 126 deletions
@@ -11,7 +11,6 @@
 //! finish in a reasonable amount of time.
 
 use crate::Context;
-use rayon::prelude::*;
 use std::fmt::Write as FmtWrite;
 use std::fs::{self, File};
 use std::io::{self, Read, Write};
@@ -24,7 +23,7 @@ impl Context {
         println!(
             "starting to recompress {} files across {} threads",
             to_recompress.len(),
-            to_recompress.len().min(rayon::current_num_threads()),
+            to_recompress.len().min(self.config.num_threads),
         );
         println!(
             "gz recompression enabled: {} (note: may occur anyway for missing gz artifacts)",
@@ -38,135 +37,154 @@
         let compression_level = flate2::Compression::new(self.config.gzip_compression_level);
 
         // Query the length of each file, and sort by length. This puts the smallest files
-        // toward the end of the array, which will generally deprioritize them in the next
-        // parallel loop, avoiding as much of a long-tail on the compression work
-        // (smallest files are fastest to recompress typically).
-        //
-        // FIXME: Rayon's documentation on par_iter isn't very detailed in terms of whether this
-        // does any good. We may want to replace this with our own manual thread pool
-        // implementation that guarantees this property - each task is large enough that just
-        // popping from a single Mutex<Vec<...>> will be plenty fast enough.
-        to_recompress.sort_by_cached_key(|path| {
-            std::cmp::Reverse(fs::metadata(path).map(|m| m.len()).unwrap_or(0))
-        });
-
-        to_recompress
-            .par_iter()
-            .map(|xz_path| {
-                println!("recompressing {}...", xz_path.display());
-                let file_start = Instant::now();
-                let gz_path = xz_path.with_extension("gz");
-
-                let mut destinations: Vec<(&str, Box<dyn io::Write>)> = Vec::new();
-
-                // Produce gzip if explicitly enabled or the destination file doesn't exist.
-                if recompress_gz || !gz_path.is_file() {
-                    let gz = File::create(gz_path)?;
-                    destinations.push((
-                        "gz",
-                        Box::new(flate2::write::GzEncoder::new(gz, compression_level)),
-                    ));
-                }
-
-                // xz recompression with more aggressive settings than we want to take the time
-                // for in rust-lang/rust CI. This cuts 5-15% off of the produced tarballs.
-                //
-                // Note that this is using a single-threaded compressor as we're parallelizing
-                // via rayon already. In rust-lang/rust we were trying to use parallel
-                // compression, but the default block size for that is 3*dict_size so we
-                // weren't actually using more than one core in most of the builders with
-                // <192MB uncompressed tarballs. In promote-release since we're recompressing
-                // 100s of tarballs there's no need for each individual compression to be
-                // parallel.
-                let xz_recompressed = xz_path.with_extension("xz_recompressed");
-                if recompress_xz {
-                    let mut filters = xz2::stream::Filters::new();
-                    let mut lzma_ops = xz2::stream::LzmaOptions::new_preset(9).unwrap();
-                    // This sets the overall dictionary size, which is also how much memory (baseline)
-                    // is needed for decompression.
-                    lzma_ops.dict_size(64 * 1024 * 1024);
-                    // Use the best match finder for compression ratio.
-                    lzma_ops.match_finder(xz2::stream::MatchFinder::BinaryTree4);
-                    lzma_ops.mode(xz2::stream::Mode::Normal);
-                    // Set nice len to the maximum for best compression ratio
-                    lzma_ops.nice_len(273);
-                    // Set depth to a reasonable value, 0 means auto, 1000 is somewhat high but gives
-                    // good results.
-                    lzma_ops.depth(1000);
-                    // 2 is the default and does well for most files
-                    lzma_ops.position_bits(2);
-                    // 0 is the default and does well for most files
-                    lzma_ops.literal_position_bits(0);
-                    // 3 is the default and does well for most files
-                    lzma_ops.literal_context_bits(3);
-
-                    filters.lzma2(&lzma_ops);
-
-                    // FIXME: Do we want a checksum as part of compression?
-                    let stream =
-                        xz2::stream::Stream::new_stream_encoder(&filters, xz2::stream::Check::None)
+        // toward the start of the array, which will make us pop them last. Smaller units of work
+        // are less likely to lead to a long tail of a single thread doing work while others are
+        // idle, so we want to schedule them last (i.e., in the tail of the build).
+        to_recompress.sort_by_cached_key(|path| fs::metadata(path).map(|m| m.len()).unwrap_or(0));
+
+        let total_length = to_recompress.len();
+
+        // Manually parallelize across freshly spawned worker threads. rayon is nice, but since we
+        // care about the scheduling order and have very large units of work (>500ms, typically 10s
+        // of seconds) the more efficient parallelism in rayon isn't desirable. (Scheduling order
+        // is the particular problem for us.)
+        let to_recompress = std::sync::Mutex::new(to_recompress);
+        std::thread::scope(|s| {
+            // Spawn num_threads workers...
+            let mut tasks = Vec::new();
+            for _ in 0..self.config.num_threads {
+                tasks.push(s.spawn(|| {
+                    while let Some(xz_path) = {
+                        // Extra block is needed to make sure the lock guard drops before we enter the
+                        // loop iteration, because while-let is desugared to a loop + match, and match
+                        // scopes live until the end of the match.
+                        let path = to_recompress.lock().unwrap().pop();
+                        path
+                    } {
+                        println!("recompressing {}...", xz_path.display());
+                        let file_start = Instant::now();
+                        let gz_path = xz_path.with_extension("gz");
+
+                        let mut destinations: Vec<(&str, Box<dyn io::Write>)> = Vec::new();
+
+                        // Produce gzip if explicitly enabled or the destination file doesn't exist.
+                        if recompress_gz || !gz_path.is_file() {
+                            let gz = File::create(gz_path)?;
+                            destinations.push((
+                                "gz",
+                                Box::new(flate2::write::GzEncoder::new(gz, compression_level)),
+                            ));
+                        }
+
+                        // xz recompression with more aggressive settings than we want to take the time
+                        // for in rust-lang/rust CI. This cuts 5-15% off of the produced tarballs.
+                        //
+                        // Note that this is using a single-threaded compressor as we're parallelizing
+                        // across files already. In rust-lang/rust we were trying to use parallel
+                        // compression, but the default block size for that is 3*dict_size so we
+                        // weren't actually using more than one core in most of the builders with
+                        // <192MB uncompressed tarballs. In promote-release since we're recompressing
+                        // 100s of tarballs there's no need for each individual compression to be
+                        // parallel.
+                        let xz_recompressed = xz_path.with_extension("xz_recompressed");
+                        if recompress_xz {
+                            let mut filters = xz2::stream::Filters::new();
+                            let mut lzma_ops = xz2::stream::LzmaOptions::new_preset(9).unwrap();
+                            // This sets the overall dictionary size, which is also how much memory (baseline)
+                            // is needed for decompression.
+                            lzma_ops.dict_size(64 * 1024 * 1024);
+                            // Use the best match finder for compression ratio.
+                            lzma_ops.match_finder(xz2::stream::MatchFinder::BinaryTree4);
+                            lzma_ops.mode(xz2::stream::Mode::Normal);
+                            // Set nice len to the maximum for best compression ratio
+                            lzma_ops.nice_len(273);
+                            // Set depth to a reasonable value, 0 means auto, 1000 is somewhat high but gives
+                            // good results.
+                            lzma_ops.depth(1000);
+                            // 2 is the default and does well for most files
+                            lzma_ops.position_bits(2);
+                            // 0 is the default and does well for most files
+                            lzma_ops.literal_position_bits(0);
+                            // 3 is the default and does well for most files
+                            lzma_ops.literal_context_bits(3);
+
+                            filters.lzma2(&lzma_ops);
+
+                            // FIXME: Do we want a checksum as part of compression?
+                            let stream = xz2::stream::Stream::new_stream_encoder(
+                                &filters,
+                                xz2::stream::Check::None,
+                            )
                             .unwrap();
-                    let xz_out = File::create(&xz_recompressed)?;
-                    destinations.push((
-                        "xz",
-                        Box::new(xz2::write::XzEncoder::new_stream(
-                            std::io::BufWriter::new(xz_out),
-                            stream,
-                        )),
-                    ));
-                }
-
-                // We only decompress once and then write into each of the compressors before
-                // moving on.
-                //
-                // This code assumes that compression with `write_all` will never fail (i.e., we
-                // can take arbitrary amounts of data as input). That seems like a reasonable
-                // assumption though.
-                let mut decompressor = XzDecoder::new(File::open(xz_path)?);
-                let mut buffer = vec![0u8; 4 * 1024 * 1024];
-                let mut decompress_time = Duration::ZERO;
-                let mut time_by_dest = vec![Duration::ZERO; destinations.len()];
-                loop {
-                    let start = Instant::now();
-                    let length = decompressor.read(&mut buffer)?;
-                    decompress_time += start.elapsed();
-                    if length == 0 {
-                        break;
-                    }
-                    for (idx, (_, destination)) in destinations.iter_mut().enumerate() {
-                        let start = std::time::Instant::now();
-                        destination.write_all(&buffer[..length])?;
-                        time_by_dest[idx] += start.elapsed();
+                            let xz_out = File::create(&xz_recompressed)?;
+                            destinations.push((
+                                "xz",
+                                Box::new(xz2::write::XzEncoder::new_stream(
+                                    std::io::BufWriter::new(xz_out),
+                                    stream,
+                                )),
+                            ));
+                        }
+
+                        // We only decompress once and then write into each of the compressors before
+                        // moving on.
+                        //
+                        // This code assumes that compression with `write_all` will never fail (i.e., we
+                        // can take arbitrary amounts of data as input). That seems like a reasonable
+                        // assumption though.
+                        let mut decompressor = XzDecoder::new(File::open(&xz_path)?);
+                        let mut buffer = vec![0u8; 4 * 1024 * 1024];
+                        let mut decompress_time = Duration::ZERO;
+                        let mut time_by_dest = vec![Duration::ZERO; destinations.len()];
+                        loop {
+                            let start = Instant::now();
+                            let length = decompressor.read(&mut buffer)?;
+                            decompress_time += start.elapsed();
+                            if length == 0 {
+                                break;
+                            }
+                            for (idx, (_, destination)) in destinations.iter_mut().enumerate() {
+                                let start = std::time::Instant::now();
+                                destination.write_all(&buffer[..length])?;
+                                time_by_dest[idx] += start.elapsed();
+                            }
+                        }
+
+                        let mut compression_times = String::new();
+                        for (idx, (name, _)) in destinations.iter().enumerate() {
+                            write!(
+                                compression_times,
+                                ", {:.2?} {} compression",
+                                time_by_dest[idx], name
+                            )?;
+                        }
+                        println!(
+                            "recompressed {}: {:.2?} total, {:.2?} decompression{}",
+                            xz_path.display(),
+                            file_start.elapsed(),
+                            decompress_time,
+                            compression_times
+                        );
+
+                        if recompress_xz {
+                            fs::rename(&xz_recompressed, xz_path)?;
+                        }
                     }
-                }
-
-                let mut compression_times = String::new();
-                for (idx, (name, _)) in destinations.iter().enumerate() {
-                    write!(
-                        compression_times,
-                        ", {:.2?} {} compression",
-                        time_by_dest[idx], name
-                    )?;
-                }
-                println!(
-                    "recompressed {}: {:.2?} total, {:.2?} decompression{}",
-                    xz_path.display(),
-                    file_start.elapsed(),
-                    decompress_time,
-                    compression_times
-                );
-
-                if recompress_xz {
-                    fs::rename(&xz_recompressed, xz_path)?;
-                }
-
-                Ok::<(), anyhow::Error>(())
-            })
-            .collect::<anyhow::Result<Vec<()>>>()?;
+
+                    Ok::<_, anyhow::Error>(())
+                }));
+            }
+
+            for task in tasks {
+                task.join().expect("no panics")?;
+            }
+
+            Ok::<_, anyhow::Error>(())
+        })?;
 
         println!(
             "finished recompressing {} files in {:.2?}",
-            to_recompress.len(),
+            total_length,
             recompress_start.elapsed(),
         );
         Ok(())
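
The scheduling trick at the heart of this diff is worth spelling out: the queue is sorted ascending by file size and workers take items with `Vec::pop`, so the largest (slowest) files are claimed first and the small ones fill in the tail of the build. Below is a minimal standalone sketch of that pattern; the `process` function and the `run` signature are hypothetical stand-ins for the recompression work, not code from this repository.

```rust
use std::path::PathBuf;
use std::sync::Mutex;

// Hypothetical stand-in for the per-file work (decompress + recompress).
fn process(path: &PathBuf) -> anyhow::Result<()> {
    println!("processing {}", path.display());
    Ok(())
}

fn run(mut queue: Vec<PathBuf>, num_threads: usize) -> anyhow::Result<()> {
    // Ascending sort: Vec::pop removes from the end, so workers claim the
    // largest files first and the smallest land in the tail of the build.
    queue.sort_by_cached_key(|path| std::fs::metadata(path).map(|m| m.len()).unwrap_or(0));

    let queue = Mutex::new(queue);
    std::thread::scope(|s| {
        let mut tasks = Vec::new();
        for _ in 0..num_threads {
            tasks.push(s.spawn(|| -> anyhow::Result<()> {
                // Pop inside a block so the MutexGuard drops before the
                // long-running body; see the desugaring note below.
                while let Some(path) = {
                    let next = queue.lock().unwrap().pop();
                    next
                } {
                    process(&path)?;
                }
                Ok(())
            }));
        }
        // join() surfaces worker panics; `?` propagates their errors.
        for task in tasks {
            task.join().expect("worker panicked")?;
        }
        Ok(())
    })
}
```

Contention on the single Mutex is negligible here because, as the diff's comment notes, each unit of work runs for hundreds of milliseconds to tens of seconds.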
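The "extra block" comment in the diff points at a real footgun: `while let` desugars to a `loop` plus a `match`, and a temporary created in the match scrutinee (here, the `MutexGuard` returned by `lock()`) lives until the end of the whole match. A sketch of the difference, using a toy `Mutex<Vec<u32>>` queue:

```rust
use std::sync::Mutex;

fn drain(queue: &Mutex<Vec<u32>>) {
    // `while let Some(x) = queue.lock().unwrap().pop() { ... }` desugars to
    // roughly this; the temporary MutexGuard from lock() lives until the end
    // of the match, so the lock stays held across the entire loop body and
    // every other worker blocks in the meantime.
    loop {
        match queue.lock().unwrap().pop() {
            Some(x) => {
                // ...long-running work here, with the lock STILL held...
                let _ = x;
            }
            None => break,
        }
    }

    // Binding the popped value in an inner block ends the guard's lifetime
    // at the end of that block, so the lock is released before the body runs.
    while let Some(x) = {
        let item = queue.lock().unwrap().pop();
        item // the guard is already dropped here
    } {
        // ...long-running work here, lock NOT held...
        let _ = x;
    }
}
```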
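The xz encoder settings can also be exercised in isolation. The following is a hypothetical round-trip check (not part of the commit) that mirrors the diff's filter chain using the same `xz2` calls; note that `Check::None` means the output embeds no integrity checksum, which is what the FIXME is about:

```rust
use std::io::{Read, Write};

fn xz_roundtrip(data: &[u8]) -> anyhow::Result<Vec<u8>> {
    let mut filters = xz2::stream::Filters::new();
    let mut lzma_ops = xz2::stream::LzmaOptions::new_preset(9)?;
    // Same aggressive settings as the diff: a 64MB dictionary (the baseline
    // memory needed to decompress) and the maximum nice_len.
    lzma_ops.dict_size(64 * 1024 * 1024);
    lzma_ops.nice_len(273);
    filters.lzma2(&lzma_ops);

    let stream = xz2::stream::Stream::new_stream_encoder(&filters, xz2::stream::Check::None)?;
    let mut encoder = xz2::write::XzEncoder::new_stream(Vec::new(), stream);
    encoder.write_all(data)?;
    let compressed = encoder.finish()?;

    // Verify the stream decodes back to the original bytes.
    let mut decompressed = Vec::new();
    xz2::read::XzDecoder::new(&compressed[..]).read_to_end(&mut decompressed)?;
    assert_eq!(data, &decompressed[..]);
    Ok(compressed)
}
```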
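Finally, the read loop is a decompress-once, write-to-many fan-out. Factored out as a sketch (the `fan_out` helper is ours, not the repo's), the shape is:

```rust
use std::io::{self, Read, Write};

/// Stream `reader` into every writer in `writers`, reading the input once.
fn fan_out<R: Read>(mut reader: R, writers: &mut [Box<dyn Write>]) -> io::Result<()> {
    // Same 4MB chunk size as the diff.
    let mut buffer = vec![0u8; 4 * 1024 * 1024];
    loop {
        let length = reader.read(&mut buffer)?;
        if length == 0 {
            return Ok(());
        }
        for writer in writers.iter_mut() {
            // write_all retries short writes internally, matching the diff's
            // assumption that the compressors accept arbitrarily sized input.
            writer.write_all(&buffer[..length])?;
        }
    }
}
```

One caveat when reusing this shape with the diff's encoders: the write-side encoders in flate2 and xz2 finish their streams on drop and discard any error at that point, so calling an explicit finish() is the safer way to surface failures.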
