Added stackblur, alpha handling improvements

awxkee · Jun 11, 2024 · 01ec35e · 01ec35e
1 parent 9d186b1
commit 01ec35e
Show file tree

Hide file tree

Showing 9 changed files with 737 additions and 54 deletions.
diff --git a/assets/abstract_alpha.png b/assets/abstract_alpha.png
diff --git a/src/lib/fast_gaussian.rs b/src/lib/fast_gaussian.rs
@@ -63,7 +63,10 @@ fn fast_gaussian_vertical_pass<
             );
             return;
         }
-        #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), target_feature = "sse4.1"))]
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "sse4.1"
+        ))]
         {
             let slice: &UnsafeSlice<'_, u8> = unsafe { std::mem::transmute(bytes) };
             sse_support::fast_gaussian_vertical_pass_sse_u8::<CHANNELS_CONFIGURATION>(
@@ -75,6 +78,7 @@ fn fast_gaussian_vertical_pass<
     let mut buffer_r: [i32; 1024] = [0; 1024];
     let mut buffer_g: [i32; 1024] = [0; 1024];
     let mut buffer_b: [i32; 1024] = [0; 1024];
+    let mut buffer_a: [i32; 1024] = [0; 1024];
     let radius_64 = radius as i64;
     let height_wide = height as i64;
     let mul_value = MUL_TABLE_DOUBLE[radius as usize];
@@ -87,6 +91,8 @@ fn fast_gaussian_vertical_pass<
         let mut sum_g: i32 = initial;
         let mut dif_b: i32 = 0;
         let mut sum_b: i32 = initial;
+        let mut dif_a: i32 = 0;
+        let mut sum_a: i32 = initial;
 
         let current_px = (x * CHANNELS_CONFIGURATION as u32) as usize;
 
@@ -105,6 +111,11 @@ fn fast_gaussian_vertical_pass<
                     bytes.write(current_y + current_px, new_r);
                     bytes.write(current_y + current_px + 1, new_g);
                     bytes.write(current_y + current_px + 2, new_b);
+                    if CHANNELS_CONFIGURATION == 4 {
+                        let new_a = T::from_u32((sum_a * mul_value) as u32 >> shr_value)
+                            .unwrap_or_default();
+                        bytes.write(current_y + current_px + 3, new_a);
+                    }
                 }
 
                 let arr_index = ((y - radius_64) & 1023) as usize;
@@ -115,11 +126,18 @@ fn fast_gaussian_vertical_pass<
                     - 2 * unsafe { *buffer_g.get_unchecked(d_arr_index) };
                 dif_b += unsafe { *buffer_b.get_unchecked(arr_index) }
                     - 2 * unsafe { *buffer_b.get_unchecked(d_arr_index) };
+                if CHANNELS_CONFIGURATION == 4 {
+                    dif_a += unsafe { *buffer_a.get_unchecked(arr_index) }
+                        - 2 * unsafe { *buffer_a.get_unchecked(d_arr_index) };
+                }
             } else if y + radius_64 >= 0 {
                 let arr_index = (y & 1023) as usize;
                 dif_r -= 2 * unsafe { *buffer_r.get_unchecked(arr_index) };
                 dif_g -= 2 * unsafe { *buffer_g.get_unchecked(arr_index) };
                 dif_b -= 2 * unsafe { *buffer_b.get_unchecked(arr_index) };
+                if CHANNELS_CONFIGURATION == 4 {
+                    dif_a -= 2 * unsafe { *buffer_a.get_unchecked(arr_index) };
+                }
             }
 
             let next_row_y = (std::cmp::min(std::cmp::max(y + radius_64, 0), height_wide - 1)
@@ -152,6 +170,15 @@ fn fast_gaussian_vertical_pass<
             unsafe {
                 *buffer_b.get_unchecked_mut(arr_index) = ub8.into();
             }
+
+            if CHANNELS_CONFIGURATION == 4 {
+                let ua8 = bytes[px_idx + 3];
+                dif_a += ua8.into();
+                sum_a += dif_a;
+                unsafe {
+                    *buffer_a.get_unchecked_mut(arr_index) = ua8.into();
+                }
+            }
         }
     }
 }
@@ -195,6 +222,7 @@ fn fast_gaussian_horizontal_pass<
     let mut buffer_r: [i32; 1024] = [0; 1024];
     let mut buffer_g: [i32; 1024] = [0; 1024];
     let mut buffer_b: [i32; 1024] = [0; 1024];
+    let mut buffer_a: [i32; 1024] = [0; 1024];
     let radius_64 = radius as i64;
     let width_wide = width as i64;
     let mul_value = MUL_TABLE_DOUBLE[radius as usize];
@@ -211,21 +239,32 @@ fn fast_gaussian_horizontal_pass<
         let mut sum_g: i32 = initial_sum;
         let mut dif_b: i32 = 0;
         let mut sum_b: i32 = initial_sum;
+        let mut dif_a: i32 = 0;
+        let mut sum_a: i32 = initial_sum;
 
         let current_y = ((y as i64) * (stride as i64)) as usize;
 
         let start_x = 0 - 2 * radius_64;
         for x in start_x..(width as i64) {
             if x >= 0 {
                 let current_px = ((std::cmp::max(x, 0) as u32) * channels_count) as usize;
-                let new_r = T::from_u32((sum_r * mul_value) as u32 >> shr_value).unwrap_or_default();
-                let new_g = T::from_u32((sum_g * mul_value) as u32 >> shr_value).unwrap_or_default();
-                let new_b = T::from_u32((sum_b * mul_value) as u32 >> shr_value).unwrap_or_default();
+                let new_r =
+                    T::from_u32((sum_r * mul_value) as u32 >> shr_value).unwrap_or_default();
+                let new_g =
+                    T::from_u32((sum_g * mul_value) as u32 >> shr_value).unwrap_or_default();
+                let new_b =
+                    T::from_u32((sum_b * mul_value) as u32 >> shr_value).unwrap_or_default();
 
                 unsafe {
-                    bytes.write(current_y + current_px, new_r);
-                    bytes.write(current_y + current_px + 1, new_g);
-                    bytes.write(current_y + current_px + 2, new_b);
+                    let offset = current_y + current_px;
+                    bytes.write(offset, new_r);
+                    bytes.write(offset + 1, new_g);
+                    bytes.write(offset + 2, new_b);
+                    if CHANNEL_CONFIGURATION == 4 {
+                        let new_a = T::from_u32((sum_a * mul_value) as u32 >> shr_value)
+                            .unwrap_or_default();
+                        bytes.write(offset + 3, new_a);
+                    }
                 }
 
                 let arr_index = ((x - radius_64) & 1023) as usize;
@@ -236,21 +275,30 @@ fn fast_gaussian_horizontal_pass<
                     - 2 * unsafe { *buffer_g.get_unchecked(d_arr_index) };
                 dif_b += unsafe { *buffer_b.get_unchecked(arr_index) }
                     - 2 * unsafe { *buffer_b.get_unchecked(d_arr_index) };
+                if CHANNEL_CONFIGURATION == 4 {
+                    dif_a += unsafe { *buffer_a.get_unchecked(arr_index) }
+                        - 2 * unsafe { *buffer_a.get_unchecked(d_arr_index) };
+                }
             } else if x + radius_64 >= 0 {
                 let arr_index = (x & 1023) as usize;
                 dif_r -= 2 * unsafe { *buffer_r.get_unchecked(arr_index) };
                 dif_g -= 2 * unsafe { *buffer_g.get_unchecked(arr_index) };
                 dif_b -= 2 * unsafe { *buffer_b.get_unchecked(arr_index) };
+                if CHANNEL_CONFIGURATION == 4 {
+                    dif_a -= 2 * unsafe { *buffer_a.get_unchecked(arr_index) };
+                }
             }
 
             let next_row_y = (y as usize) * (stride as usize);
             let next_row_x = ((std::cmp::min(std::cmp::max(x + radius_64, 0), width_wide - 1)
                 as u32)
                 * channels_count) as usize;
 
-            let ur8 = bytes[next_row_y + next_row_x];
-            let ug8 = bytes[next_row_y + next_row_x + 1];
-            let ub8 = bytes[next_row_y + next_row_x + 2];
+            let bytes_offset = next_row_y + next_row_x;
+
+            let ur8 = bytes[bytes_offset];
+            let ug8 = bytes[bytes_offset + 1];
+            let ub8 = bytes[bytes_offset + 2];
 
             let arr_index = ((x + radius_64) & 1023) as usize;
 
@@ -271,6 +319,15 @@ fn fast_gaussian_horizontal_pass<
             unsafe {
                 *buffer_b.get_unchecked_mut(arr_index) = ub8.into();
             }
+
+            if CHANNEL_CONFIGURATION == 4 {
+                let ua8 = bytes[bytes_offset + 3];
+                dif_a += ua8.into();
+                sum_a += dif_a;
+                unsafe {
+                    *buffer_a.get_unchecked_mut(arr_index) = ua8.into();
+                }
+            }
         }
     }
 }
@@ -415,7 +472,6 @@ pub fn fast_gaussian_u16(
 /// * `stride` - Lane length, default is width * channels_count if not aligned
 /// * `radius` - almost any radius is supported
 #[no_mangle]
-#[allow(dead_code)]
 pub fn fast_gaussian_f32(
     bytes: &mut [f32],
     stride: u32,
@@ -434,7 +490,6 @@ pub fn fast_gaussian_f32(
 /// * `stride` - Lane length, default is width * channels_count if not aligned
 /// * `radius` - almost any radius is supported
 #[no_mangle]
-#[allow(dead_code)]
 pub fn fast_gaussian_f16(
     bytes: &mut [u16],
     stride: u32,

diff --git a/src/lib/fast_gaussian_neon.rs b/src/lib/fast_gaussian_neon.rs
@@ -75,12 +75,20 @@ pub mod neon_support {
 
                     let casted_u32 = unsafe { vreinterpret_u32_u8(prepared_u8) };
                     let pixel = unsafe { vget_lane_u32::<0>(casted_u32) };
-                    let bits = pixel.to_le_bytes();
-
-                    unsafe {
-                        bytes.write(current_y + current_px, bits[0]);
-                        bytes.write(current_y + current_px + 1, bits[1]);
-                        bytes.write(current_y + current_px + 2, bits[2]);
+                    let offset = current_y + current_px;
+                    if CHANNELS_COUNT == 4 {
+                        unsafe {
+                            let dst_ptr = (bytes.slice.as_ptr() as *mut u8).add(offset) as *mut u32;
+                            *dst_ptr = pixel;
+                        }
+                    } else {
+                        let bits = pixel.to_le_bytes();
+
+                        unsafe {
+                            bytes.write(offset, bits[0]);
+                            bytes.write(offset + 1, bits[1]);
+                            bytes.write(offset + 2, bits[2]);
+                        }
                     }
 
                     let arr_index = ((x - radius_64) & 1023) as usize;
@@ -165,12 +173,23 @@ pub mod neon_support {
 
                     let casted_u32 = unsafe { vreinterpret_u32_u8(prepared_u8) };
                     let pixel = unsafe { vget_lane_u32::<0>(casted_u32) };
-                    let bits = pixel.to_le_bytes();
 
-                    unsafe {
-                        bytes.write(current_y + current_px, bits[0]);
-                        bytes.write(current_y + current_px + 1, bits[1]);
-                        bytes.write(current_y + current_px + 2, bits[2]);
+                    let bytes_offset = current_y + current_px;
+
+                    if CHANNELS_COUNT == 4 {
+                        unsafe {
+                            let dst_ptr =
+                                (bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut u32;
+                            *dst_ptr = pixel;
+                        }
+                    } else {
+                        let bits = pixel.to_le_bytes();
+
+                        unsafe {
+                            bytes.write(bytes_offset, bits[0]);
+                            bytes.write(bytes_offset + 1, bits[1]);
+                            bytes.write(bytes_offset + 2, bits[2]);
+                        }
                     }
 
                     let arr_index = ((y - radius_64) & 1023) as usize;