Bugfix, added best fast gaussian

awxkee · Apr 28, 2024 · 116ff71 · 116ff71
1 parent 831d69e
commit 116ff71
Show file tree

Hide file tree

Showing 6 changed files with 337 additions and 13 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/src/lib/Cargo.toml b/src/lib/Cargo.toml
@@ -1,14 +1,14 @@
 [package]
 name = "libblur"
-version = "0.9.3"
+version = "0.9.4"
 edition = "2021"
 description = "High performance blur in pure rust"
 readme = "../../README.md"
 keywords = ["blur", "gauss_blur", "image_blur", "fast_blur", "box_blur"]
 license = "Apache-2.0 OR BSD-3-Clause"
 authors = ["Radzivon Bartoshyk"]
 documentation = "https://github.com/awxkee/libblur"
-categories = ["multimedia::images", "multimedia::video"]
+categories = ["multimedia::images", "multimedia::video", "algorithms"]
 homepage = "https://github.com/awxkee/libblur"
 repository = "https://github.com/awxkee/libblur.git"
 exclude = ["*.jpg"]

diff --git a/src/lib/box_blur_neon.rs b/src/lib/box_blur_neon.rs
@@ -112,7 +112,7 @@ pub mod neon_support {
                 // add next
                 {
                     let next_x =
-                        std::cmp::min(x + half_kernel, width - 1) as usize * channels_count as usize;
+                        std::cmp::min(x + half_kernel, width - 1) as usize;
 
                     let next = next_x * channels_count as usize;
 

diff --git a/src/lib/fast_gaussian_superior.rs b/src/lib/fast_gaussian_superior.rs
@@ -0,0 +1,322 @@
+use crate::FastBlurChannels;
+
+mod fast_gaussian_superior {
+    use num_traits::{FromPrimitive, ToPrimitive};
+    use crate::FastBlurChannels;
+    use crate::unsafe_slice::UnsafeSlice;
+
+    /// Runs the vertical pass of the blur in place over columns `start..min(width, end)`.
+    ///
+    /// Every column is walked from top to bottom while four chained accumulators
+    /// (`dif` -> `der_2` -> `der_1` -> `sum`) are updated per row. The walk starts
+    /// `4 * radius` rows above the image so the accumulators are primed before the
+    /// first real row is written. Only the first three elements of each pixel are
+    /// read and written; with `Channels4` the fourth element is left untouched.
+    ///
+    /// # Arguments
+    ///
+    /// * `bytes` - Image buffer; it is both the source and the destination.
+    /// * `stride` - Offset between the starts of two consecutive rows, in elements of `T`.
+    /// * `width` - Image width in pixels; caps the column range.
+    /// * `height` - Image height in pixels. `0` is a no-op.
+    /// * `radius` - Blur radius. `0` is a no-op. The ring buffers hold 2048 slots and
+    ///   the taps span `4 * radius` positions, so `4 * radius` must stay below 2048.
+    /// * `start`, `end` - Half-open column range handled by this call.
+    /// * `channels` - Pixel layout, used to derive the number of elements per pixel.
+    fn fast_gaussian_vertical_pass<T: FromPrimitive + ToPrimitive + Default + Into<i64> + Send + Sync>(
+        bytes: &UnsafeSlice<T>,
+        stride: u32,
+        width: u32,
+        height: u32,
+        radius: u32,
+        start: u32,
+        end: u32,
+        channels: FastBlurChannels,
+    ) where
+        T: std::ops::AddAssign + std::ops::SubAssign + Copy,
+    {
+        // A zero radius would make `weight` infinite (1 / 0), and a zero height would
+        // clamp the source row to -1 during warm-up. Both mean there is nothing to blur.
+        if radius == 0 || height == 0 {
+            return;
+        }
+
+        // Ring buffers of the most recent source samples, indexed by `position & 2047`.
+        // They are not cleared between columns: the warm-up guards below only read
+        // slots that were already rewritten for the current column.
+        let mut buffer_r: [i64; 2048] = [0; 2048];
+        let mut buffer_g: [i64; 2048] = [0; 2048];
+        let mut buffer_b: [i64; 2048] = [0; 2048];
+        let radius_64 = radius as i64;
+        let height_wide = height as i64;
+        let radius_2d = (radius as f32) * (radius as f32);
+        // Normalization factor 1 / radius^4 applied to the running sum.
+        let weight = 1.0f32 / (radius_2d * radius_2d);
+        let channels_count = match channels {
+            FastBlurChannels::Channels3 => 3,
+            FastBlurChannels::Channels4 => 4,
+        };
+        for x in start..std::cmp::min(width, end) {
+            let mut dif_r: i64 = 0;
+            let mut der_1_r: i64 = 0;
+            let mut der_2_r: i64 = 0;
+            let mut sum_r: i64 = 0;
+            let mut dif_g: i64 = 0;
+            let mut der_1_g: i64 = 0;
+            let mut der_2_g: i64 = 0;
+            let mut sum_g: i64 = 0;
+            let mut dif_b: i64 = 0;
+            let mut der_1_b: i64 = 0;
+            let mut der_2_b: i64 = 0;
+            let mut sum_b: i64 = 0;
+
+            // Element offset of this column inside any row.
+            let current_px = (x * channels_count) as usize;
+
+            let start_y = 0i64 - 4i64 * radius_64;
+            for y in start_y..height_wide {
+                if y >= 0 {
+                    // Only meaningful for real rows, so it is computed inside this branch.
+                    let current_y = (y * (stride as i64)) as usize;
+
+                    // The float is truncated by the cast; values that do not fit `T`
+                    // fall back to `T::default()`.
+                    let new_r = T::from_u32(((sum_r as f32) * weight) as u32).unwrap_or_default();
+                    let new_g = T::from_u32(((sum_g as f32) * weight) as u32).unwrap_or_default();
+                    let new_b = T::from_u32(((sum_b as f32) * weight) as u32).unwrap_or_default();
+
+                    // SAFETY (assumed, confirm against `UnsafeSlice::write`): this call only
+                    // touches columns in `start..end`, which the caller is expected to keep
+                    // disjoint between concurrently running workers.
+                    unsafe {
+                        bytes.write(current_y + current_px, new_r);
+                        bytes.write(current_y + current_px + 1, new_g);
+                        bytes.write(current_y + current_px + 2, new_b);
+                    }
+
+                    let arr_index_3 = (y & 2047) as usize;
+                    let arr_index_2 = ((y + radius_64) & 2047) as usize;
+                    let arr_index_1 = ((y - radius_64) & 2047) as usize;
+                    let arr_index_4 = ((y - 2 * radius_64) & 2047) as usize;
+
+                    dif_r += -4 * (buffer_r[arr_index_1] + buffer_r[arr_index_2]) + 6 * buffer_r[arr_index_3] + buffer_r[arr_index_4];
+                    dif_g += -4 * (buffer_g[arr_index_1] + buffer_g[arr_index_2]) + 6 * buffer_g[arr_index_3] + buffer_g[arr_index_4];
+                    dif_b += -4 * (buffer_b[arr_index_1] + buffer_b[arr_index_2]) + 6 * buffer_b[arr_index_3] + buffer_b[arr_index_4];
+                } else {
+                    // Warm-up: each tap is enabled only once its slot has been filled
+                    // by an earlier iteration of this column.
+                    if y + 3 * radius_64 >= 0 {
+                        let arr_index = ((y + radius_64) & 2047) as usize;
+                        dif_r -= 4 * buffer_r[arr_index];
+                        dif_g -= 4 * buffer_g[arr_index];
+                        dif_b -= 4 * buffer_b[arr_index];
+                    }
+                    if y + 2 * radius_64 >= 0 {
+                        let arr_index = (y & 2047) as usize;
+                        dif_r += 6 * buffer_r[arr_index];
+                        dif_g += 6 * buffer_g[arr_index];
+                        dif_b += 6 * buffer_b[arr_index];
+                    }
+                    if y + radius_64 >= 0 {
+                        let arr_index = ((y - radius_64) & 2047) as usize;
+                        dif_r -= 4 * buffer_r[arr_index];
+                        dif_g -= 4 * buffer_g[arr_index];
+                        dif_b -= 4 * buffer_b[arr_index];
+                    }
+                }
+
+                // Source row feeding the accumulators, clamped to the image so the
+                // edge rows are repeated outside of it.
+                let next_row_y = (std::cmp::min(
+                    std::cmp::max(y + 2 * radius_64 - 1, 0),
+                    height_wide - 1,
+                ) as usize)
+                    * (stride as usize);
+
+                let px_idx = next_row_y + current_px;
+
+                let ur8 = bytes[px_idx];
+                let ug8 = bytes[px_idx + 1];
+                let ub8 = bytes[px_idx + 2];
+
+                let arr_index = ((y + 2 * radius_64) & 2047) as usize;
+
+                dif_r += ur8.into();
+                der_2_r += dif_r;
+                der_1_r += der_2_r;
+                sum_r += der_1_r;
+                buffer_r[arr_index] = ur8.into();
+
+                dif_g += ug8.into();
+                der_2_g += dif_g;
+                der_1_g += der_2_g;
+                sum_g += der_1_g;
+                buffer_g[arr_index] = ug8.into();
+
+                dif_b += ub8.into();
+                der_2_b += dif_b;
+                der_1_b += der_2_b;
+                sum_b += der_1_b;
+                buffer_b[arr_index] = ub8.into();
+            }
+        }
+    }
+
+    /// Runs the horizontal pass of the blur in place over rows `start..min(height, end)`.
+    ///
+    /// Every row is walked from left to right while four chained accumulators
+    /// (`dif` -> `der_2` -> `der_1` -> `sum`) are updated per pixel. The walk starts
+    /// `4 * radius` pixels left of the image so the accumulators are primed before
+    /// the first real pixel is written. Only the first three elements of each pixel
+    /// are read and written; with `Channels4` the fourth element is left untouched.
+    ///
+    /// # Arguments
+    ///
+    /// * `bytes` - Image buffer; it is both the source and the destination.
+    /// * `stride` - Offset between the starts of two consecutive rows, in elements of `T`.
+    /// * `width` - Image width in pixels. `0` is a no-op.
+    /// * `height` - Image height in pixels; caps the row range.
+    /// * `radius` - Blur radius. `0` is a no-op. The ring buffers hold 2048 slots and
+    ///   the taps span `4 * radius` positions, so `4 * radius` must stay below 2048.
+    /// * `start`, `end` - Half-open row range handled by this call.
+    /// * `channels` - Pixel layout, used to derive the number of elements per pixel.
+    fn fast_gaussian_horizontal_pass<T: FromPrimitive + ToPrimitive + Default + Into<i64> + Send + Sync>(
+        bytes: &UnsafeSlice<T>,
+        stride: u32,
+        width: u32,
+        height: u32,
+        radius: u32,
+        start: u32,
+        end: u32,
+        channels: FastBlurChannels,
+    ) where
+        T: std::ops::AddAssign + std::ops::SubAssign + Copy,
+    {
+        // A zero radius would make `weight` infinite (1 / 0), and a zero width would
+        // clamp the source column to -1 during warm-up. Both mean there is nothing to blur.
+        if radius == 0 || width == 0 {
+            return;
+        }
+
+        // Ring buffers of the most recent source samples, indexed by `position & 2047`.
+        // They are not cleared between rows: the warm-up guards below only read
+        // slots that were already rewritten for the current row.
+        let mut buffer_r: [i64; 2048] = [0; 2048];
+        let mut buffer_g: [i64; 2048] = [0; 2048];
+        let mut buffer_b: [i64; 2048] = [0; 2048];
+        let radius_64 = radius as i64;
+        let width_wide = width as i64;
+        let radius_2d = (radius as f32) * (radius as f32);
+        // Normalization factor 1 / radius^4 applied to the running sum.
+        let weight = 1.0f32 / (radius_2d * radius_2d);
+        let channels_count = match channels {
+            FastBlurChannels::Channels3 => 3,
+            FastBlurChannels::Channels4 => 4,
+        };
+        for y in start..std::cmp::min(height, end) {
+            let mut dif_r: i64 = 0;
+            let mut der_1_r: i64 = 0;
+            let mut der_2_r: i64 = 0;
+            let mut sum_r: i64 = 0;
+            let mut dif_g: i64 = 0;
+            let mut der_1_g: i64 = 0;
+            let mut der_2_g: i64 = 0;
+            let mut sum_g: i64 = 0;
+            let mut dif_b: i64 = 0;
+            let mut der_1_b: i64 = 0;
+            let mut der_2_b: i64 = 0;
+            let mut sum_b: i64 = 0;
+
+            // Element offset of the row; shared by the reads and the writes below.
+            let current_y = ((y as i64) * (stride as i64)) as usize;
+
+            for x in (0i64 - 4i64 * radius_64)..width_wide {
+                if x >= 0 {
+                    // `x` is non-negative here, so the cast cannot wrap.
+                    let current_px = ((x as u32) * channels_count) as usize;
+
+                    // The float is truncated by the cast; values that do not fit `T`
+                    // fall back to `T::default()`.
+                    let new_r = T::from_u32(((sum_r as f32) * weight) as u32).unwrap_or_default();
+                    let new_g = T::from_u32(((sum_g as f32) * weight) as u32).unwrap_or_default();
+                    let new_b = T::from_u32(((sum_b as f32) * weight) as u32).unwrap_or_default();
+
+                    // SAFETY (assumed, confirm against `UnsafeSlice::write`): this call only
+                    // touches rows in `start..end`, which the caller is expected to keep
+                    // disjoint between concurrently running workers.
+                    unsafe {
+                        bytes.write(current_y + current_px, new_r);
+                        bytes.write(current_y + current_px + 1, new_g);
+                        bytes.write(current_y + current_px + 2, new_b);
+                    }
+
+                    let arr_index_3 = (x & 2047) as usize;
+                    let arr_index_2 = ((x + radius_64) & 2047) as usize;
+                    let arr_index_1 = ((x - radius_64) & 2047) as usize;
+                    let arr_index_4 = ((x - 2 * radius_64) & 2047) as usize;
+
+                    dif_r += -4 * (buffer_r[arr_index_1] + buffer_r[arr_index_2]) + 6 * buffer_r[arr_index_3] + buffer_r[arr_index_4];
+                    dif_g += -4 * (buffer_g[arr_index_1] + buffer_g[arr_index_2]) + 6 * buffer_g[arr_index_3] + buffer_g[arr_index_4];
+                    dif_b += -4 * (buffer_b[arr_index_1] + buffer_b[arr_index_2]) + 6 * buffer_b[arr_index_3] + buffer_b[arr_index_4];
+                } else {
+                    // Warm-up: each tap is enabled only once its slot has been filled
+                    // by an earlier iteration of this row.
+                    if x + 3 * radius_64 >= 0 {
+                        let arr_index = ((x + radius_64) & 2047) as usize;
+                        dif_r -= 4 * buffer_r[arr_index];
+                        dif_g -= 4 * buffer_g[arr_index];
+                        dif_b -= 4 * buffer_b[arr_index];
+                    }
+                    if x + 2 * radius_64 >= 0 {
+                        let arr_index = (x & 2047) as usize;
+                        dif_r += 6 * buffer_r[arr_index];
+                        dif_g += 6 * buffer_g[arr_index];
+                        dif_b += 6 * buffer_b[arr_index];
+                    }
+                    if x + radius_64 >= 0 {
+                        let arr_index = ((x - radius_64) & 2047) as usize;
+                        dif_r -= 4 * buffer_r[arr_index];
+                        dif_g -= 4 * buffer_g[arr_index];
+                        dif_b -= 4 * buffer_b[arr_index];
+                    }
+                }
+
+                // Source column feeding the accumulators, clamped to the image so the
+                // edge pixels are repeated outside of it.
+                let next_row_x =
+                    ((std::cmp::min(std::cmp::max(x + 2 * radius_64 - 1, 0), width_wide - 1) as u32)
+                        * channels_count) as usize;
+
+                let px_idx = current_y + next_row_x;
+
+                let ur8 = bytes[px_idx];
+                let ug8 = bytes[px_idx + 1];
+                let ub8 = bytes[px_idx + 2];
+
+                let arr_index = ((x + 2 * radius_64) & 2047) as usize;
+
+                dif_r += ur8.into();
+                der_2_r += dif_r;
+                der_1_r += der_2_r;
+                sum_r += der_1_r;
+                buffer_r[arr_index] = ur8.into();
+
+                dif_g += ug8.into();
+                der_2_g += dif_g;
+                der_1_g += der_2_g;
+                sum_g += der_1_g;
+                buffer_g[arr_index] = ug8.into();
+
+                dif_b += ub8.into();
+                der_2_b += dif_b;
+                der_1_b += der_2_b;
+                sum_b += der_1_b;
+                buffer_b[arr_index] = ub8.into();
+            }
+        }
+    }
+
+    /// Blurs `bytes` in place: a vertical pass over column segments, then a
+    /// horizontal pass over row segments, each segment on its own worker.
+    ///
+    /// The two `pool.scope` calls run one after the other, so the horizontal pass
+    /// only starts once every vertical worker has finished.
+    ///
+    /// # Arguments
+    ///
+    /// * `bytes` - Image buffer, modified in place.
+    /// * `stride` - Offset between the starts of two consecutive rows, in elements of `T`.
+    /// * `width`, `height` - Image dimensions in pixels.
+    /// * `radius` - Blur radius forwarded to both passes.
+    /// * `channels` - Pixel layout forwarded to both passes.
+    ///
+    /// Returns immediately, leaving `bytes` untouched, when `radius`, `width` or
+    /// `height` is zero.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the rayon thread pool cannot be built.
+    pub(crate) fn fast_gaussian_impl<T: FromPrimitive + ToPrimitive + Default + Into<i64> + Send + Sync>(
+        bytes: &mut Vec<T>,
+        stride: u32,
+        width: u32,
+        height: u32,
+        radius: u32,
+        channels: FastBlurChannels,
+    ) where
+        T: std::ops::AddAssign + std::ops::SubAssign + Copy,
+    {
+        // Nothing to blur; also keeps the passes away from a 1 / 0 weight and from
+        // empty-dimension index clamping.
+        if radius == 0 || width == 0 || height == 0 {
+            return;
+        }
+
+        let unsafe_image = UnsafeSlice::new(bytes);
+        // One worker per 256x256 block of pixels, at least 1 and at most 12. The
+        // product is taken in u64 so that very large images cannot overflow u32.
+        let thread_count = ((width as u64 * height as u64) / (256 * 256)).clamp(1, 12) as u32;
+        let pool = rayon::ThreadPoolBuilder::new()
+            .num_threads(thread_count as usize)
+            .build()
+            .expect("failed to build the rayon thread pool for the blur");
+        pool.scope(|scope| {
+            let segment_size = width / thread_count;
+
+            for i in 0..thread_count {
+                let start_x = i * segment_size;
+                let mut end_x = (i + 1) * segment_size;
+                // The last worker also takes the remainder columns.
+                if i == thread_count - 1 {
+                    end_x = width;
+                }
+                scope.spawn(move |_| {
+                    fast_gaussian_vertical_pass::<T>(
+                        &unsafe_image,
+                        stride,
+                        width,
+                        height,
+                        radius,
+                        start_x,
+                        end_x,
+                        channels,
+                    );
+                });
+            }
+        });
+        pool.scope(|scope| {
+            let segment_size = height / thread_count;
+
+            for i in 0..thread_count {
+                let start_y = i * segment_size;
+                let mut end_y = (i + 1) * segment_size;
+                // The last worker also takes the remainder rows.
+                if i == thread_count - 1 {
+                    end_y = height;
+                }
+                scope.spawn(move |_| {
+                    fast_gaussian_horizontal_pass::<T>(
+                        &unsafe_image,
+                        stride,
+                        width,
+                        height,
+                        radius,
+                        start_y,
+                        end_y,
+                        channels,
+                    );
+                });
+            }
+        });
+    }
+}
+
+/// Fast gaussian approximation for 8-bit images, applied in place.
+///
+/// The result is almost a true gaussian blur. Significantly slower than the
+/// alternatives. Complexity is O(1) with respect to `radius`.
+///
+/// # Arguments
+///
+/// * `bytes` - Image data; overwritten with the blurred result.
+/// * `stride` - Lane length, default is `width * channels_count` if not aligned.
+/// * `width` - Image width in pixels.
+/// * `height` - Image height in pixels.
+/// * `radius` - Blur radius. Values above 312 are clamped to 312; `0` leaves the
+///   image unchanged.
+/// * `channels` - Pixel layout of `bytes`.
+///
+/// NOTE(review): the function is `extern "C"` but takes `&mut Vec<u8>`, which is a
+/// Rust-only type — confirm whether a C-callable signature is actually intended.
+#[no_mangle]
+#[allow(dead_code)]
+pub extern "C" fn fast_gaussian_superior(
+    bytes: &mut Vec<u8>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    channels: FastBlurChannels,
+) {
+    // A zero radius means "no blur": return before any work is scheduled.
+    if radius == 0 {
+        return;
+    }
+    let acq_radius = std::cmp::min(radius, 312);
+    fast_gaussian_superior::fast_gaussian_impl::<u8>(bytes, stride, width, height, acq_radius, channels);
+}
diff --git a/src/lib/lib.rs b/src/lib/lib.rs
@@ -43,6 +43,7 @@ mod gaussian_f16;
 mod gaussian_helper;
 mod fast_gaussian_f16;
 mod fast_gaussian_next_f16;
+mod fast_gaussian_superior;
 
 pub use box_blur::box_blur;
 pub use box_blur::box_blur_u16;
@@ -64,3 +65,4 @@ pub use gaussian::gaussian_blur_u16;
 pub use gaussian::gaussian_blur_f16;
 pub use gaussian::gaussian_blur_f32;
 pub use median_blur::median_blur;
+pub use fast_gaussian_superior::fast_gaussian_superior;