Commit ef1e8d1

Make rayon independent of worker threading
Moves it to a separate module. Rayon now spawns in-place closures with no explicit communication, instead of relying on dedicated worker threads. This can parallelize decoding by spawning tasks for multiple rows of data at the same time.
1 parent 50766ba commit ef1e8d1
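In outline, the new scheduling works like this: every MCU row of coefficients becomes its own closure spawned inside a rayon scope, the heavy per-row work runs without holding any lock, and a shared output buffer is locked only briefly to copy finished samples back. A minimal, self-contained sketch of that pattern follows; the names process_row, decode_rows_in_parallel, rows, and output are illustrative and not part of the crate.

use std::sync::Mutex;

// Illustrative stand-in for the per-row dequantize/IDCT work; the real
// crate operates on MCU-row coefficient buffers.
fn process_row(row: &[i16]) -> Vec<u8> {
    row.iter().map(|&c| c as u8).collect()
}

// Spawn one task per row inside a rayon scope; no channels, no dedicated
// worker threads. Each `row` is assumed to hold exactly `row_len` values.
fn decode_rows_in_parallel(rows: Vec<Vec<i16>>, row_len: usize) -> Vec<u8> {
    let output = Mutex::new(vec![0u8; rows.len() * row_len]);

    rayon::in_place_scope(|scope| {
        for (i, row) in rows.into_iter().enumerate() {
            let output = &output;
            scope.spawn(move |_| {
                let decoded = process_row(&row);      // heavy work, no lock held
                let mut out = output.lock().unwrap(); // lock only for the write-back
                out[i * row_len..(i + 1) * row_len].copy_from_slice(&decoded);
            });
        }
        // The scope waits here until every spawned row task has finished.
    });

    output.into_inner().unwrap()
}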

5 files changed: 147 additions & 84 deletions

src/decoder.rs

Lines changed: 11 additions & 6 deletions
@@ -560,14 +560,17 @@ impl<R: Read> Decoder<R> {
                 let coefficients_per_mcu_row = usize::from(component.block_size.width)
                     * usize::from(component.vertical_sampling_factor)
                     * 64;
-                for mcu_y in 0..frame.mcu_size.height {
-                    let row_coefficients = {
+
+                let mut tasks = (0..frame.mcu_size.height)
+                    .map(|mcu_y| {
                         let offset = usize::from(mcu_y) * coefficients_per_mcu_row;
-                        self.coefficients[i][offset..offset + coefficients_per_mcu_row].to_vec()
-                    };
+                        let row_coefficients = self.coefficients[i][offset..offset + coefficients_per_mcu_row].to_vec();
+                        (i, row_coefficients)
+                    });
 
-                    worker.append_row((i, row_coefficients))?;
-                }
+                // FIXME: additional potential work stealing opportunities for rayon case if we
+                // also internally can parallelize over components.
+                worker.append_rows(&mut tasks)?;
                 planes[i] = worker.get_result(i)?;
             }
         }
@@ -871,6 +874,8 @@ impl<R: Read> Decoder<R> {
                     )
                 };
 
+                // FIXME: additional potential work stealing opportunities for rayon case if we
+                // also internally can parallelize over components.
                 worker.append_row((i, row_coefficients))?;
             }
         }

src/worker/immediate.rs

Lines changed: 4 additions & 4 deletions
@@ -10,10 +10,10 @@ use crate::parser::Component;
 use super::{RowData, Worker};
 
 pub struct ImmediateWorker {
-    offsets: [usize; MAX_COMPONENTS],
-    results: Vec<Vec<u8>>,
-    components: Vec<Option<Component>>,
-    quantization_tables: Vec<Option<Arc<[u16; 64]>>>,
+    pub(crate) offsets: [usize; MAX_COMPONENTS],
+    pub(crate) results: Vec<Vec<u8>>,
+    pub(crate) components: Vec<Option<Component>>,
+    pub(crate) quantization_tables: Vec<Option<Arc<[u16; 64]>>>,
 }
 
 pub fn with_immediate<T>(f: impl FnOnce(&mut dyn Worker) -> T) -> T {

src/worker/mod.rs

Lines changed: 14 additions & 0 deletions
@@ -1,5 +1,7 @@
 mod immediate;
 mod multithreaded;
+#[cfg(feature = "rayon")]
+mod rayon;
 
 use alloc::sync::Arc;
 use alloc::vec::Vec;
@@ -16,6 +18,15 @@ pub trait Worker {
     fn start(&mut self, row_data: RowData) -> Result<()>;
     fn append_row(&mut self, row: (usize, Vec<i16>)) -> Result<()>;
     fn get_result(&mut self, index: usize) -> Result<Vec<u8>>;
+    /// Default implementation for spawning multiple tasks.
+    fn append_rows(&mut self, row: &mut dyn Iterator<Item=(usize, Vec<i16>)>)
+        -> Result<()>
+    {
+        for item in row {
+            self.append_row(item)?;
+        }
+        Ok(())
+    }
 }
 
 pub enum PreferWorkerKind {
@@ -26,6 +37,9 @@ pub enum PreferWorkerKind {
 /// Execute something with a worker system.
 pub fn with_worker<T>(prefer: PreferWorkerKind, f: impl FnOnce(&mut dyn Worker) -> T) -> T {
     match prefer {
+        #[cfg(not(any(target_arch = "asmjs", target_arch = "wasm32")))]
+        #[cfg(feature = "rayon")]
+        PreferWorkerKind::Multithreaded => self::rayon::with_rayon(f),
         #[cfg(not(any(target_arch = "asmjs", target_arch = "wasm32")))]
         PreferWorkerKind::Multithreaded => self::multithreaded::with_multithreading(f),
         _ => self::immediate::with_immediate(f),
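The `append_rows` addition is backwards compatible: its default body just drains the iterator through `append_row`, so the immediate and std-thread workers need no changes, while the rayon worker overrides it to fan rows out in parallel. A toy sketch of that trait shape (the Item alias, the Logger type, and main are hypothetical, not crate code):

// The task type mirrors what the crate passes around: (component index, row coefficients).
type Item = (usize, Vec<i16>);

trait Worker {
    fn append_row(&mut self, row: Item);

    /// Default: hand every task to `append_row`, one at a time.
    fn append_rows(&mut self, rows: &mut dyn Iterator<Item = Item>) {
        for row in rows {
            self.append_row(row);
        }
    }
}

// An implementor that only defines `append_row` gets `append_rows` for free.
struct Logger(Vec<usize>);

impl Worker for Logger {
    fn append_row(&mut self, row: Item) {
        let (index, data) = row;
        println!("component {}: {} coefficients", index, data.len());
        self.0.push(index);
    }
}

fn main() {
    let mut worker = Logger(Vec::new());
    let mut tasks = (0..3usize).map(|i| (i, vec![0i16; 64]));
    worker.append_rows(&mut tasks);
    assert_eq!(worker.0, vec![0, 1, 2]);
}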

src/worker/multithreaded.rs

Lines changed: 1 addition & 74 deletions
@@ -11,13 +11,7 @@ use super::{RowData, Worker};
 use super::immediate::ImmediateWorker;
 
 pub fn with_multithreading<T>(f: impl FnOnce(&mut dyn Worker) -> T) -> T {
-    #[cfg(not(feature = "rayon"))]
-    return self::enter_threads(f);
-
-    #[cfg(feature = "rayon")]
-    return jpeg_rayon::enter(|mut worker| {
-        f(&mut worker)
-    });
+    self::enter_threads(f)
 }
 
 enum WorkerMsg {
@@ -133,70 +127,3 @@ fn enter_threads<T>(f: impl FnOnce(&mut dyn Worker) -> T) -> T {
     let mut worker = StdThreadWorker(MpscWorker::default());
     f(&mut worker)
 }
-
-
-#[cfg(feature = "rayon")]
-mod jpeg_rayon {
-    use crate::error::Result;
-    use super::{MpscWorker, RowData};
-
-    pub struct Scoped<'r, 'scope> {
-        fifo: &'r rayon::ScopeFifo<'scope>,
-        inner: MpscWorker,
-    }
-
-    pub fn enter<T>(f: impl FnOnce(Scoped) -> T) -> T {
-        // Note: Must be at least two threads. Otherwise, we may deadlock, due to ordering
-        // constraints that we can not impose properly. Note that `append_row` creates a new task
-        // while in `get_result` we wait for all tasks of a component. The only way for rayon to
-        // impose this wait __and get a result__ is by ending an in_place_scope.
-        //
-        // However, the ordering of tasks is not as FIFO as the name would suggest. Indeed, even
-        // though tasks are spawned in `start` _before_ the task spawned in `get_result`, the
-        // `in_place_scope_fifo` will wait for ITS OWN results in fifo order. This implies, unless
-        // there is some other thread capable of stealing the worker the work task will in fact not
-        // get executed and the result will wait forever. It is impossible to otherwise schedule
-        // the worker tasks specifically (e.g. join handle would be cool *cough* if you read this
-        // and work on rayon) before while yielding from the current thread.
-        //
-        // So: we need at least one more worker thread that is _not_ occupied.
-        let threads = rayon::ThreadPoolBuilder::new().num_threads(4).build().unwrap();
-
-        threads.in_place_scope_fifo(|fifo| {
-            f(Scoped { fifo, inner: MpscWorker::default() })
-        })
-    }
-
-    impl super::Worker for Scoped<'_, '_> {
-        fn start(&mut self, row_data: RowData) -> Result<()> {
-            let fifo = &mut self.fifo;
-            self.inner.start_with(row_data, |_| {
-                let (tx, worker) = super::create_worker();
-                fifo.spawn_fifo(move |_| {
-                    worker()
-                });
-                Ok(tx)
-            })
-        }
-
-        fn append_row(&mut self, row: (usize, Vec<i16>)) -> Result<()> {
-            self.inner.append_row(row)
-        }
-
-        fn get_result(&mut self, index: usize) -> Result<Vec<u8>> {
-            self.inner.get_result_with(index, |rx| {
-                let mut result = vec![];
-                let deliver_result = &mut result;
-
-                rayon::in_place_scope_fifo(|scope| {
-                    scope.spawn_fifo(move |_| {
-                        *deliver_result = rx.recv().expect("jpeg-decoder worker thread error");
-                    });
-                });
-
-                result
-            })
-        }
-    }
-}
-

src/worker/rayon.rs

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+use core::convert::TryInto;
+use crate::error::Result;
+use crate::idct::dequantize_and_idct_block;
+
+use std::sync::Mutex;
+
+use super::{RowData, Worker};
+use crate::worker::immediate::ImmediateWorker;
+
+pub struct Scoped {
+    inner: Mutex<ImmediateWorker>,
+}
+
+pub fn with_rayon<T>(f: impl FnOnce(&mut dyn Worker) -> T) -> T {
+    rayon::in_place_scope(|_| {
+        let inner = ImmediateWorker::new_immediate();
+        f(&mut Scoped { inner: Mutex::new(inner) })
+    })
+}
+
+impl Scoped {
+    pub fn append_row_locked(
+        mutex: &Mutex<ImmediateWorker>,
+        (index, data): (usize, Vec<i16>),
+        result_offset: usize,
+    ) {
+        // Convert coefficients from a MCU row to samples.
+        let quantization_table;
+        let block_count;
+        let line_stride;
+        let block_size;
+        let dct_scale;
+
+        {
+            let inner = mutex.lock().unwrap();
+            let component = inner.components[index].as_ref().unwrap();
+            quantization_table = inner.quantization_tables[index].as_ref().unwrap().clone();
+
+            block_size = component.block_size;
+            block_count = block_size.width as usize * component.vertical_sampling_factor as usize;
+            line_stride = block_size.width as usize * component.dct_scale;
+            dct_scale = component.dct_scale;
+        }
+
+        assert_eq!(data.len(), block_count * 64);
+
+        let mut output_buffer = [0; 64];
+        for i in 0..block_count {
+            let x = (i % block_size.width as usize) * dct_scale;
+            let y = (i / block_size.width as usize) * dct_scale;
+
+            let coefficients: &[i16; 64] = &data[i * 64..(i + 1) * 64].try_into().unwrap();
+
+            // Write to a temporary intermediate buffer, a 8x8 'image'.
+            dequantize_and_idct_block(dct_scale, coefficients, &*quantization_table, 8, &mut output_buffer);
+
+            // Lock the mutex only for this write back, not the main computation.
+            // FIXME: we are only copying image data. Can we use some atomic backing buffer and a
+            // `Relaxed` write instead?
+            let mut write_back = mutex.lock().unwrap();
+            let write_back = &mut write_back.results[index][result_offset + y * line_stride + x..];
+
+            let buffered_lines = output_buffer.chunks_mut(8);
+            let back_lines = write_back.chunks_mut(line_stride);
+
+            for (buf, back) in buffered_lines.zip(back_lines).take(dct_scale) {
+                back[..dct_scale].copy_from_slice(&buf[..dct_scale]);
+            }
+        }
+    }
+}
+
+impl super::Worker for Scoped {
+    fn start(&mut self, row_data: RowData) -> Result<()> {
+        self.inner.get_mut().unwrap().start_immediate(row_data);
+        Ok(())
+    }
+
+    fn append_row(&mut self, row: (usize, Vec<i16>)) -> Result<()> {
+        self.inner.get_mut().unwrap().append_row_immediate(row);
+        Ok(())
+    }
+
+    fn get_result(&mut self, index: usize) -> Result<Vec<u8>> {
+        let result = self.inner.get_mut().unwrap().get_result_immediate(index);
+        Ok(result)
+    }
+
+    // Magic sauce, these _may_ run in parallel.
+    fn append_rows(&mut self, iter: &mut dyn Iterator<Item=(usize, Vec<i16>)>)
+        -> Result<()>
+    {
+        rayon::in_place_scope(|scope| {
+            let mut inner = self.inner.lock().unwrap();
+            // First we schedule everything, making sure their index is right etc.
+            for (index, data) in iter {
+                let component = inner.components[index].as_ref().unwrap();
+
+                let block_size = component.block_size;
+                let block_count = block_size.width as usize * component.vertical_sampling_factor as usize;
+                let dct_scale = component.dct_scale;
+
+                let result_offset = inner.offsets[index];
+                inner.offsets[index] += block_count * dct_scale * dct_scale;
+
+                let mutex = &self.inner;
+                scope.spawn(move |_| {
+                    Scoped::append_row_locked(mutex, (index, data), result_offset)
+                });
+            }
+
+            // Then the mutex is released, allowing all tasks to run.
+        });
+
+        Ok(())
+    }
+}
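The write-back at the end of append_row_locked is just a strided copy: the 8x8 (or downscaled) IDCT output lands at result_offset + y * line_stride + x in the component plane and is copied line by line, dct_scale samples per line. A simplified, self-contained sketch of that indexing, with result_offset folded into the slice start and all names illustrative rather than taken from the crate:

/// Copy a `scale`x`scale` sub-block of an 8x8 buffer into a row-major plane.
/// `x`, `y` are the block's top-left corner inside the plane and
/// `line_stride` is the plane's width in samples.
fn write_back_block(
    plane: &mut [u8],
    block: &[u8; 64],
    x: usize,
    y: usize,
    line_stride: usize,
    scale: usize,
) {
    let dst = &mut plane[y * line_stride + x..];
    for (src_line, dst_line) in block.chunks(8).zip(dst.chunks_mut(line_stride)).take(scale) {
        dst_line[..scale].copy_from_slice(&src_line[..scale]);
    }
}

fn main() {
    // A 16x16 plane holding a 2x2 grid of 8x8 blocks (dct_scale = 8).
    let mut plane = vec![0u8; 16 * 16];
    let block = [255u8; 64];
    // Block index 3 in a 2-block-wide grid lands at x = 8, y = 8.
    let (i, width_in_blocks, scale) = (3usize, 2usize, 8usize);
    let x = (i % width_in_blocks) * scale;
    let y = (i / width_in_blocks) * scale;
    write_back_block(&mut plane, &block, x, y, 16, scale);
    assert_eq!(plane[8 * 16 + 8], 255);
    assert_eq!(plane[0], 0);
}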
