awxkee
diff --git a/‎README.md‎
Lines changed: 0 additions & 2 deletions b/‎README.md‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎app/src/main.rs‎
Lines changed: 15 additions & 9 deletions b/‎app/src/main.rs‎
Lines changed: 15 additions & 9 deletions
diff --git a/‎src/alpha.rs‎
Lines changed: 83 additions & 99 deletions b/‎src/alpha.rs‎
Lines changed: 83 additions & 99 deletions
@@ -12,8 +12,6 @@ let img = ImageReader::open("./assets/nasa-4928x3279.png")
 let dimensions = img.dimensions();
 let transient = img.to_rgb8();
 
-let start = Instant::now();
-
 let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize);
 let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
 
 
@@ -35,8 +35,9 @@ use image::{
     ImageReader, Rgb, RgbImage,
 };
 use pic_scale_safe::{
-    resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8, resize_rgb_f32,
-    resize_rgba16, resize_rgba8, ImageSize, ResamplingFunction,
+    premultiply_rgba8, resize_fixed_point, resize_floating_point, resize_rgb16, resize_rgb8,
+    resize_rgb_f32, resize_rgba16, resize_rgba8, unpremultiply_rgba8, ImageSize,
+    ResamplingFunction,
 };
 use std::ops::{BitXor, Shr};
 use std::time::Instant;
@@ -47,26 +48,31 @@ fn main() {
         .decode()
         .unwrap();
     let dimensions = img.dimensions();
-    let transient = img.to_rgb8();
+    let transient = img.to_rgba8();
 
     let mut working_store = transient.to_vec();
 
     let start = Instant::now();
 
     let src_size = ImageSize::new(dimensions.0 as usize, dimensions.1 as usize);
-    let dst_size = ImageSize::new(
-        (dimensions.0 as f32 + 1.) as usize,
-        (dimensions.1 as f32 + 1.) as usize,
-    );
+    let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2);
 
-    let mut resized = resize_rgb8(
+    let start_mul = Instant::now();
+
+    premultiply_rgba8(&mut working_store);
+
+    println!("Alpha mul time {:?}", start_mul.elapsed());
+
+    let mut resized = resize_rgba8(
         &working_store,
         src_size,
         dst_size,
         ResamplingFunction::Lanczos3,
     )
     .unwrap();
 
+    // unpremultiply_rgba8(&mut resized);
+
     println!("Working time {:?}", start.elapsed());
 
     // let rgba_image = DynamicImage::ImageRgb16(ImageBuffer::<Rgb<u16>, Vec<u16>>::from_vec(dimensions.0 * 4, dimensions.1 / 4, resized).unwrap());
@@ -82,7 +88,7 @@ fn main() {
         &resized,
         dst_size.width as u32,
         dst_size.height as u32,
-        image::ColorType::Rgb8,
+        image::ColorType::Rgba8,
     )
     .unwrap();
 
 
@@ -27,6 +27,11 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#[inline]
+fn div_by_255(v: u16) -> u8 {
+    ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8
+}
+
 /// Associate alpha in place
 ///
 /// Note, for scaling alpha must be *associated*
@@ -36,17 +41,14 @@
 /// * `in_place`: Slice to where premultiply
 ///
 pub fn premultiply_rgba8(in_place: &mut [u8]) {
-    for chunk in in_place.chunks_mut(4) {
+    // Exact 4-byte chunks with a store to every lane are meant to help auto-vectorisation.
+    // Alpha is stored back unchanged: `div_by_255(a * 255)` is exactly `a`.
+    for chunk in in_place.chunks_exact_mut(4) {
         let a = chunk[3] as u16;
-        let mut r = chunk[0] as u16;
-        let mut g = chunk[1] as u16;
-        let mut b = chunk[2] as u16;
-        r = (r * a) / 255;
-        g = (g * a) / 255;
-        b = (b * a) / 255;
-        chunk[0] = r as u8;
-        chunk[1] = g as u8;
-        chunk[2] = b as u8;
+        chunk[0] = div_by_255(chunk[0] as u16 * a);
+        chunk[1] = div_by_255(chunk[1] as u16 * a);
+        chunk[2] = div_by_255(chunk[2] as u16 * a);
+        chunk[3] = div_by_255(a * 255);
     }
 }
 
@@ -60,23 +62,17 @@ pub fn premultiply_rgba8(in_place: &mut [u8]) {
 ///
 ///
 pub fn unpremultiply_rgba8(in_place: &mut [u8]) {
-    for chunk in in_place.chunks_mut(4) {
-        let a = chunk[3] as u16;
-        let mut r = chunk[0] as u16;
-        let mut g = chunk[1] as u16;
-        let mut b = chunk[2] as u16;
-        if a == 0 {
-            r = 0;
-            g = 0;
-            b = 0;
-        } else {
-            r = (r * 255) / a;
-            g = (g * 255) / a;
-            b = (b * 255) / a;
+    // Exact 4-byte chunks with a store to every lane are meant to help auto-vectorisation.
+    // Pixels with zero alpha are left as stored; alpha itself is written back unchanged.
+    for chunk in in_place.chunks_exact_mut(4) {
+        let a = chunk[3];
+        if a != 0 {
+            let a_recip = 1. / a as f32;
+            chunk[0] = ((chunk[0] as f32 * 255.) * a_recip) as u8;
+            chunk[1] = ((chunk[1] as f32 * 255.) * a_recip) as u8;
+            chunk[2] = ((chunk[2] as f32 * 255.) * a_recip) as u8;
+            chunk[3] = a;
         }
-        chunk[0] = r as u8;
-        chunk[1] = g as u8;
-        chunk[2] = b as u8;
     }
 }
 
@@ -89,11 +85,12 @@ pub fn unpremultiply_rgba8(in_place: &mut [u8]) {
 /// * `in_place`: Slice to where premultiply
 ///
 pub fn premultiply_la8(in_place: &mut [u8]) {
-    for chunk in in_place.chunks_mut(2) {
+    // Exact 2-byte chunks with a store to every lane are meant to help auto-vectorisation.
+    // Alpha is stored back unchanged: `div_by_255(a * 255)` is exactly `a`.
+    for chunk in in_place.chunks_exact_mut(2) {
         let a = chunk[1] as u16;
-        let mut r = chunk[0] as u16;
-        r = (r * a) / 255;
-        chunk[0] = r as u8;
+        chunk[0] = div_by_255(chunk[0] as u16 * a);
+        chunk[1] = div_by_255(a * 255);
     }
 }
 
@@ -107,15 +104,15 @@ pub fn premultiply_la8(in_place: &mut [u8]) {
 ///
 ///
 pub fn unpremultiply_la8(in_place: &mut [u8]) {
-    for chunk in in_place.chunks_mut(2) {
-        let a = chunk[1] as u16;
-        let mut r = chunk[0] as u16;
-        if a == 0 {
-            r = 0;
-        } else {
-            r = (r * 255) / a;
+    // Exact 2-byte chunks with a store to every lane are meant to help auto-vectorisation.
+    // Pixels with zero alpha are left as stored; alpha itself is written back unchanged.
+    for chunk in in_place.chunks_exact_mut(2) {
+        let a = chunk[1];
+        if a != 0 {
+            let a_recip = 1. / a as f32;
+            chunk[0] = ((chunk[0] as f32 * 255.) * a_recip) as u8;
+            chunk[1] = a;
         }
-        chunk[0] = r as u8;
     }
 }
 
@@ -129,19 +126,20 @@ pub fn unpremultiply_la8(in_place: &mut [u8]) {
 /// * `bit_depth`: Bit-depth of the image
 ///
 pub fn premultiply_rgba16(in_place: &mut [u16], bit_depth: u32) {
+    // Exact 4-element chunks with a store to every lane are meant to help auto-vectorisation.
+    // Colour lanes are clamped to the bit-depth maximum; alpha is stored back unchanged.
     assert!(bit_depth > 0 && bit_depth <= 16);
     let max_colors = (1 << bit_depth) - 1;
-    for chunk in in_place.chunks_mut(4) {
+    let recip_max_colors = 1. / max_colors as f32;
+    for chunk in in_place.chunks_exact_mut(4) {
         let a = chunk[3] as u32;
-        let mut r = chunk[0] as u32;
-        let mut g = chunk[1] as u32;
-        let mut b = chunk[2] as u32;
-        r = (r * a) / max_colors;
-        g = (g * a) / max_colors;
-        b = (b * a) / max_colors;
-        chunk[0] = r as u16;
-        chunk[1] = g as u16;
-        chunk[2] = b as u16;
+        chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32)
+            as u16;
+        chunk[1] = (((chunk[1] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32)
+            as u16;
+        chunk[2] = (((chunk[2] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32)
+            as u16;
+        chunk[3] = a as u16;
     }
 }
 
@@ -155,13 +153,16 @@ pub fn premultiply_rgba16(in_place: &mut [u16], bit_depth: u32) {
 /// * `bit_depth`: Bit-depth of the image
 ///
 pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) {
+    // Exact 2-element (luma, alpha) chunks with a store to every lane are meant to help
+    // auto-vectorisation. Luma is clamped to the bit-depth maximum; alpha is left unchanged.
     assert!(bit_depth > 0 && bit_depth <= 16);
     let max_colors = (1 << bit_depth) - 1;
-    for chunk in in_place.chunks_mut(2) {
+    let recip_max_colors = 1. / max_colors as f32;
+    for chunk in in_place.chunks_exact_mut(2) {
         let a = chunk[1] as u32;
-        let mut r = chunk[0] as u32;
-        r = (r * a) / max_colors;
-        chunk[0] = r as u16;
+        chunk[0] = (((chunk[0] as u32 * a) as f32 * recip_max_colors) as u32).min(max_colors as u32)
+            as u16;
+        chunk[1] = a as u16;
     }
 }
 
@@ -176,17 +177,17 @@ pub fn premultiply_la16(in_place: &mut [u16], bit_depth: u32) {
 ///
 ///
 pub fn unpremultiply_la16(in_place: &mut [u16], bit_depth: u32) {
+    // Exact 2-element chunks with a store to every lane are meant to help auto-vectorisation.
+    // Pixels with zero alpha are left as stored; alpha itself is written back unchanged.
     assert!(bit_depth > 0 && bit_depth <= 16);
     let max_colors = (1 << bit_depth) - 1;
-    for chunk in in_place.chunks_mut(2) {
+    for chunk in in_place.chunks_exact_mut(2) {
         let a = chunk[1] as u32;
-        let mut r = chunk[0] as u32;
-        if a == 0 {
-            r = 0;
-        } else {
-            r = (r * max_colors) / a;
+        if a != 0 {
+            let a_recip = 1. / a as f32;
+            chunk[0] = ((chunk[0] as u32 * max_colors) as f32 * a_recip) as u16;
+            chunk[1] = a as u16;
         }
-        chunk[0] = r as u16;
     }
 }
 
@@ -201,25 +202,19 @@ pub fn unpremultiply_la16(in_place: &mut [u16], bit_depth: u32) {
 ///
 ///
 pub fn unpremultiply_rgba16(in_place: &mut [u16], bit_depth: u32) {
+    // Exact 4-element chunks with a store to every lane are meant to help auto-vectorisation.
+    // Pixels with zero alpha are left as stored; alpha itself is written back unchanged.
     assert!(bit_depth > 0 && bit_depth <= 16);
     let max_colors = (1 << bit_depth) - 1;
-    for chunk in in_place.chunks_mut(4) {
+    for chunk in in_place.chunks_exact_mut(4) {
         let a = chunk[3] as u32;
-        let mut r = chunk[0] as u32;
-        let mut g = chunk[1] as u32;
-        let mut b = chunk[2] as u32;
-        if a == 0 {
-            r = 0;
-            g = 0;
-            b = 0;
-        } else {
-            r = (r * max_colors) / a;
-            g = (g * max_colors) / a;
-            b = (b * max_colors) / a;
+        if a != 0 {
+            let a_recip = 1. / a as f32;
+            chunk[0] = ((chunk[0] as u32 * max_colors) as f32 * a_recip) as u16;
+            chunk[1] = ((chunk[1] as u32 * max_colors) as f32 * a_recip) as u16;
+            chunk[2] = ((chunk[2] as u32 * max_colors) as f32 * a_recip) as u16;
+            chunk[3] = a as u16;
         }
-        chunk[0] = r as u16;
-        chunk[1] = g as u16;
-        chunk[2] = b as u16;
     }
 }
 
@@ -232,17 +227,14 @@ pub fn unpremultiply_rgba16(in_place: &mut [u16], bit_depth: u32) {
 /// * `in_place`: Slice to where premultiply
 ///
 pub fn premultiply_rgba_f32(in_place: &mut [f32]) {
-    for chunk in in_place.chunks_mut(4) {
+    // Exact 4-element chunks with a store to every lane are meant to help auto-vectorisation.
+    // Alpha is stored back unchanged.
+    for chunk in in_place.chunks_exact_mut(4) {
         let a = chunk[3];
-        let mut r = chunk[0];
-        let mut g = chunk[1];
-        let mut b = chunk[2];
-        r *= a;
-        g *= a;
-        b *= a;
-        chunk[0] = r;
-        chunk[1] = g;
-        chunk[2] = b;
+        chunk[0] *= a;
+        chunk[1] *= a;
+        chunk[2] *= a;
+        chunk[3] = a;
     }
 }
 
@@ -256,22 +248,14 @@ pub fn premultiply_rgba_f32(in_place: &mut [f32]) {
 ///
 ///
 pub fn unpremultiply_rgba_f32(in_place: &mut [f32]) {
-    for chunk in in_place.chunks_mut(4) {
+    for chunk in in_place.chunks_exact_mut(4) {
         let a = chunk[3];
-        let mut r = chunk[0];
-        let mut g = chunk[1];
-        let mut b = chunk[2];
-        if a == 0. {
-            r = 0.;
-            g = 0.;
-            b = 0.;
-        } else {
-            r /= a;
-            g /= a;
-            b /= a;
+        if a != 0. {
+            let a_recip = 1. / a;
+            chunk[0] *= a_recip;
+            chunk[1] *= a_recip;
+            chunk[2] *= a_recip;
+            chunk[3] = a;
         }
-        chunk[0] = r;
-        chunk[1] = g;
-        chunk[2] = b;
     }
 }