From 116ff71fb01bbe259a14713f798a3b13c7f336c7 Mon Sep 17 00:00:00 2001
From: awxkee <radzivon.bartoshyk@proton.me>
Date: Sun, 28 Apr 2024 01:05:56 +0100
Subject: [PATCH] Bugfix, added best fast gaussian

---
 Cargo.lock                        |   2 +-
 src/lib/Cargo.toml                |   4 +-
 src/lib/box_blur_neon.rs          |   2 +-
 src/lib/fast_gaussian_superior.rs | 322 ++++++++++++++++++++++++++++++
 src/lib/lib.rs                    |   2 +
 src/main.rs                       |  18 +-
 6 files changed, 337 insertions(+), 13 deletions(-)
 create mode 100644 src/lib/fast_gaussian_superior.rs
diff --git a/Cargo.lock b/Cargo.lock
index fb6cf0f..dc6de6f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -394,7 +394,7 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8"
 
 [[package]]
 name = "libblur"
-version = "0.9.3"
+version = "0.9.4"
 dependencies = [
  "half",
  "num-traits",
diff --git a/src/lib/Cargo.toml b/src/lib/Cargo.toml
index 69973eb..6143f1d 100644
--- a/src/lib/Cargo.toml
+++ b/src/lib/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "libblur"
-version = "0.9.3"
+version = "0.9.4"
 edition = "2021"
 description = "High performance blur in pure rust"
 readme = "../../README.md"
@@ -8,7 +8,7 @@ keywords = ["blur", "gauss_blur", "image_blur", "fast_blur", "box_blur"]
 license = "Apache-2.0 OR BSD-3-Clause"
 authors = ["Radzivon Bartoshyk"]
 documentation = "https://github.com/awxkee/libblur"
-categories = ["multimedia::images", "multimedia::video"]
+categories = ["multimedia::images", "multimedia::video", "algorithms"]
 homepage = "https://github.com/awxkee/libblur"
 repository = "https://github.com/awxkee/libblur.git"
 exclude = ["*.jpg"]
diff --git a/src/lib/box_blur_neon.rs b/src/lib/box_blur_neon.rs
index e9d5de9..a771ffc 100644
--- a/src/lib/box_blur_neon.rs
+++ b/src/lib/box_blur_neon.rs
@@ -112,7 +112,7 @@ pub mod neon_support {
                 // add next
                 {
                     let next_x =
-                        std::cmp::min(x + half_kernel, width - 1) as usize * channels_count as usize;
+                        std::cmp::min(x + half_kernel, width - 1) as usize;
 
                     let next = next_x * channels_count as usize;
 
diff --git a/src/lib/fast_gaussian_superior.rs b/src/lib/fast_gaussian_superior.rs
new file mode 100644
index 0000000..f2fd943
--- /dev/null
+++ b/src/lib/fast_gaussian_superior.rs
@@ -0,0 +1,322 @@
+use crate::FastBlurChannels;
+
+mod fast_gaussian_superior {
+    use num_traits::{FromPrimitive, ToPrimitive};
+    use crate::FastBlurChannels;
+    use crate::unsafe_slice::UnsafeSlice;
+
+    fn fast_gaussian_vertical_pass<T: FromPrimitive + ToPrimitive + Default + Into<i64> + Send + Sync>(
+        bytes: &UnsafeSlice<T>,
+        stride: u32,
+        width: u32,
+        height: u32,
+        radius: u32,
+        start: u32,
+        end: u32,
+        channels: FastBlurChannels,
+    ) where
+        T: std::ops::AddAssign + std::ops::SubAssign + Copy,
+    {
+        let mut buffer_r: [i64; 2048] = [0; 2048];
+        let mut buffer_g: [i64; 2048] = [0; 2048];
+        let mut buffer_b: [i64; 2048] = [0; 2048];
+        let radius_64 = radius as i64;
+        let height_wide = height as i64;
+        let radius_2d = (radius as f32) * (radius as f32);
+        let weight = 1.0f32 / (radius_2d * radius_2d);
+        let channels_count = match channels {
+            FastBlurChannels::Channels3 => 3,
+            FastBlurChannels::Channels4 => 4,
+        };
+        for x in start..std::cmp::min(width, end) {
+            let mut dif_r: i64 = 0;
+            let mut der_1_r: i64 = 0;
+            let mut der_2_r: i64 = 0;
+            let mut sum_r: i64 = 0;
+            let mut dif_g: i64 = 0;
+            let mut der_1_g: i64 = 0;
+            let mut der_2_g: i64 = 0;
+            let mut sum_g: i64 = 0;
+            let mut dif_b: i64 = 0;
+            let mut der_1_b: i64 = 0;
+            let mut der_2_b: i64 = 0;
+            let mut sum_b: i64 = 0;
+
+            let current_px = (x * channels_count) as usize;
+
+            let start_y = 0i64 - 4i64 * radius as i64;
+            for y in start_y..height_wide {
+                let current_y = (y * (stride as i64)) as usize;
+                if y >= 0 {
+                    let new_r = T::from_u32(((sum_r as f32) * weight) as u32).unwrap_or_default();
+                    let new_g = T::from_u32(((sum_g as f32) * weight) as u32).unwrap_or_default();
+                    let new_b = T::from_u32(((sum_b as f32) * weight) as u32).unwrap_or_default();
+
+                    unsafe {
+                        bytes.write(current_y + current_px, new_r);
+                        bytes.write(current_y + current_px + 1, new_g);
+                        bytes.write(current_y + current_px + 2, new_b);
+                    }
+
+                    let arr_index_3 = (y & 2047) as usize;
+                    let arr_index_2 = ((y + radius_64) & 2047) as usize;
+                    let arr_index_1 = ((y - radius_64) & 2047) as usize;
+                    let arr_index_4 = ((y - 2 * radius_64) & 2047) as usize;
+
+                    dif_r += -4 * (buffer_r[arr_index_1] + buffer_r[arr_index_2]) + 6 * buffer_r[arr_index_3] + buffer_r[arr_index_4];
+                    dif_g += -4 * (buffer_g[arr_index_1] + buffer_g[arr_index_2]) + 6 * buffer_g[arr_index_3] + buffer_g[arr_index_4];
+                    dif_b += -4 * (buffer_b[arr_index_1] + buffer_b[arr_index_2]) + 6 * buffer_b[arr_index_3] + buffer_b[arr_index_4];
+                } else {
+                    if y + 3 * radius_64 >= 0 {
+                        let arr_index = ((y + radius_64) & 2047) as usize;
+                        dif_r -= 4 * buffer_r[arr_index];
+                        dif_g -= 4 * buffer_g[arr_index];
+                        dif_b -= 4 * buffer_b[arr_index];
+                    }
+                    if y + 2 * radius_64 >= 0 {
+                        let arr_index = (y & 2047) as usize;
+                        dif_r += 6 * buffer_r[arr_index];
+                        dif_g += 6 * buffer_g[arr_index];
+                        dif_b += 6 * buffer_b[arr_index];
+                    }
+                    if y + radius_64 >= 0 {
+                        let arr_index = ((y - radius_64) & 2047) as usize;
+                        dif_r -= 4 * buffer_r[arr_index];
+                        dif_g -= 4 * buffer_g[arr_index];
+                        dif_b -= 4 * buffer_b[arr_index];
+                    }
+                }
+
+                let next_row_y = (std::cmp::min(
+                    std::cmp::max(y + 2 * radius_64 - 1, 0),
+                    height_wide - 1,
+                ) as usize)
+                    * (stride as usize);
+                let next_row_x = (x * channels_count) as usize;
+
+                let px_idx = next_row_y + next_row_x;
+
+                let ur8 = bytes[px_idx];
+                let ug8 = bytes[px_idx + 1];
+                let ub8 = bytes[px_idx + 2];
+
+                let arr_index = ((y + 2 * radius_64) & 2047) as usize;
+
+                dif_r += ur8.into();
+                der_2_r += dif_r;
+                der_1_r += der_2_r;
+                sum_r += der_1_r;
+                buffer_r[arr_index] = ur8.into();
+
+                dif_g += ug8.into();
+                der_2_g += dif_g;
+                der_1_g += der_2_g;
+                sum_g += der_1_g;
+                buffer_g[arr_index] = ug8.into();
+
+                dif_b += ub8.into();
+                der_2_b += dif_b;
+                der_1_b += der_2_b;
+                sum_b += der_1_b;
+                buffer_b[arr_index] = ub8.into();
+            }
+        }
+    }
+
+    fn fast_gaussian_horizontal_pass<T: FromPrimitive + ToPrimitive + Default + Into<i64> + Send + Sync>(
+        bytes: &UnsafeSlice<T>,
+        stride: u32,
+        width: u32,
+        height: u32,
+        radius: u32,
+        start: u32,
+        end: u32,
+        channels: FastBlurChannels,
+    ) where
+        T: std::ops::AddAssign + std::ops::SubAssign + Copy,
+    {
+        let mut buffer_r: [i64; 2048] = [0; 2048];
+        let mut buffer_g: [i64; 2048] = [0; 2048];
+        let mut buffer_b: [i64; 2048] = [0; 2048];
+        let radius_64 = radius as i64;
+        let width_wide = width as i64;
+        let radius_2d = (radius as f32) * (radius as f32);
+        let weight = 1.0f32 / (radius_2d * radius_2d);
+        let channels_count = match channels {
+            FastBlurChannels::Channels3 => 3,
+            FastBlurChannels::Channels4 => 4,
+        };
+        for y in start..std::cmp::min(height, end) {
+            let mut dif_r: i64 = 0;
+            let mut der_1_r: i64 = 0;
+            let mut der_2_r: i64 = 0;
+            let mut sum_r: i64 = 0;
+            let mut dif_g: i64 = 0;
+            let mut der_1_g: i64 = 0;
+            let mut der_2_g: i64 = 0;
+            let mut sum_g: i64 = 0;
+            let mut dif_b: i64 = 0;
+            let mut der_1_b: i64 = 0;
+            let mut der_2_b: i64 = 0;
+            let mut sum_b: i64 = 0;
+
+            let current_y = ((y as i64) * (stride as i64)) as usize;
+
+            for x in (0i64 - 4i64 * radius_64)..(width as i64) {
+                if x >= 0 {
+                    let current_px = ((std::cmp::max(x, 0) as u32) * channels_count) as usize;
+                    let new_r = T::from_u32(((sum_r as f32) * weight) as u32).unwrap_or_default();
+                    let new_g = T::from_u32(((sum_g as f32) * weight) as u32).unwrap_or_default();
+                    let new_b = T::from_u32(((sum_b as f32) * weight) as u32).unwrap_or_default();
+
+                    unsafe {
+                        bytes.write(current_y + current_px, new_r);
+                        bytes.write(current_y + current_px + 1, new_g);
+                        bytes.write(current_y + current_px + 2, new_b);
+                    }
+
+                    let arr_index_3 = (x & 2047) as usize;
+                    let arr_index_2 = ((x + radius_64) & 2047) as usize;
+                    let arr_index_1 = ((x - radius_64) & 2047) as usize;
+                    let arr_index_4 = ((x - 2 * radius_64) & 2047) as usize;
+
+                    dif_r += -4 * (buffer_r[arr_index_1] + buffer_r[arr_index_2]) + 6 * buffer_r[arr_index_3] + buffer_r[arr_index_4];
+                    dif_g += -4 * (buffer_g[arr_index_1] + buffer_g[arr_index_2]) + 6 * buffer_g[arr_index_3] + buffer_g[arr_index_4];
+                    dif_b += -4 * (buffer_b[arr_index_1] + buffer_b[arr_index_2]) + 6 * buffer_b[arr_index_3] + buffer_b[arr_index_4];
+                } else {
+                    if x + 3 * radius_64 >= 0 {
+                        let arr_index = ((x + radius_64) & 2047) as usize;
+                        dif_r -= 4 * buffer_r[arr_index];
+                        dif_g -= 4 * buffer_g[arr_index];
+                        dif_b -= 4 * buffer_b[arr_index];
+                    }
+                    if x + 2 * radius_64 >= 0 {
+                        let arr_index = (x & 2047) as usize;
+                        dif_r += 6 * buffer_r[arr_index];
+                        dif_g += 6 * buffer_g[arr_index];
+                        dif_b += 6 * buffer_b[arr_index];
+                    }
+                    if x + radius_64 >= 0 {
+                        let arr_index = ((x - radius_64) & 2047) as usize;
+                        dif_r -= 4 * buffer_r[arr_index];
+                        dif_g -= 4 * buffer_g[arr_index];
+                        dif_b -= 4 * buffer_b[arr_index];
+                    }
+                }
+
+                let next_row_y = (y as usize) * (stride as usize);
+                let next_row_x =
+                    ((std::cmp::min(std::cmp::max(x + 2 * radius_64 - 1, 0), width_wide - 1) as u32)
+                        * channels_count) as usize;
+
+                let ur8 = bytes[next_row_y + next_row_x];
+                let ug8 = bytes[next_row_y + next_row_x + 1];
+                let ub8 = bytes[next_row_y + next_row_x + 2];
+
+                let arr_index = ((x + 2 * radius_64) & 2047) as usize;
+
+                dif_r += ur8.into();
+                der_2_r += dif_r;
+                der_1_r += der_2_r;
+                sum_r += der_1_r;
+                buffer_r[arr_index] = ur8.into();
+
+                dif_g += ug8.into();
+                der_2_g += dif_g;
+                der_1_g += der_2_g;
+                sum_g += der_1_g;
+                buffer_g[arr_index] = ug8.into();
+
+                dif_b += ub8.into();
+                der_2_b += dif_b;
+                der_1_b += der_2_b;
+                sum_b += der_1_b;
+                buffer_b[arr_index] = ub8.into();
+            }
+        }
+    }
+
+    pub(crate) fn fast_gaussian_impl<T: FromPrimitive + ToPrimitive + Default + Into<i64> + Send + Sync>(
+        bytes: &mut Vec<T>,
+        stride: u32,
+        width: u32,
+        height: u32,
+        radius: u32,
+        channels: FastBlurChannels,
+    ) where
+        T: std::ops::AddAssign + std::ops::SubAssign + Copy,
+    {
+        let unsafe_image = UnsafeSlice::new(bytes);
+        let thread_count = std::cmp::max(std::cmp::min(width * height / (256 * 256), 12), 1);
+        let pool = rayon::ThreadPoolBuilder::new()
+            .num_threads(thread_count as usize)
+            .build()
+            .unwrap();
+        pool.scope(|scope| {
+            let segment_size = width / thread_count;
+
+            for i in 0..thread_count {
+                let start_x = i * segment_size;
+                let mut end_x = (i + 1) * segment_size;
+                if i == thread_count - 1 {
+                    end_x = width;
+                }
+                scope.spawn(move |_| {
+                    fast_gaussian_vertical_pass::<T>(
+                        &unsafe_image,
+                        stride,
+                        width,
+                        height,
+                        radius,
+                        start_x,
+                        end_x,
+                        channels,
+                    );
+                });
+            }
+        });
+        pool.scope(|scope| {
+            let segment_size = height / thread_count;
+
+            for i in 0..thread_count {
+                let start_y = i * segment_size;
+                let mut end_y = (i + 1) * segment_size;
+                if i == thread_count - 1 {
+                    end_y = height;
+                }
+                scope.spawn(move |_| {
+                    fast_gaussian_horizontal_pass::<T>(
+                        &unsafe_image,
+                        stride,
+                        width,
+                        height,
+                        radius,
+                        start_y,
+                        end_y,
+                        channels,
+                    );
+                });
+            }
+        });
+    }
+}
+
+/// Fast gaussian approximation. This is almost gaussian blur. Significantly slower than alternatives.
+/// # Arguments
+///
+/// * `stride` - Lane length, default is width * channels_count if not aligned
+/// * `radius` - Radius more than ~312 is not supported.
+/// O(1) complexity.
+#[no_mangle]
+#[allow(dead_code)]
+pub extern "C" fn fast_gaussian_superior(
+    bytes: &mut Vec<u8>,
+    stride: u32,
+    width: u32,
+    height: u32,
+    radius: u32,
+    channels: FastBlurChannels,
+) {
+    let acq_radius = std::cmp::min(radius, 312);
+    fast_gaussian_superior::fast_gaussian_impl::<u8>(bytes, stride, width, height, acq_radius, channels);
+}
\ No newline at end of file
diff --git a/src/lib/lib.rs b/src/lib/lib.rs
index 4a8856e..004fbfe 100644
--- a/src/lib/lib.rs
+++ b/src/lib/lib.rs
@@ -43,6 +43,7 @@ mod gaussian_f16;
 mod gaussian_helper;
 mod fast_gaussian_f16;
 mod fast_gaussian_next_f16;
+mod fast_gaussian_superior;
 
 pub use box_blur::box_blur;
 pub use box_blur::box_blur_u16;
@@ -64,3 +65,4 @@ pub use gaussian::gaussian_blur_u16;
 pub use gaussian::gaussian_blur_f16;
 pub use gaussian::gaussian_blur_f32;
 pub use median_blur::median_blur;
+pub use fast_gaussian_superior::fast_gaussian_superior;
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index cd86a91..aa2bccb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,7 +15,7 @@ fn f16_to_f32(bytes: Vec<u16>) -> Vec<f32> {
 
 
 fn main() {
-    let img = ImageReader::open("assets/filirovska.jpeg")
+    let img = ImageReader::open("assets/test_image_1.jpg")
         .unwrap()
         .decode()
         .unwrap();
@@ -29,27 +29,27 @@ fn main() {
     for i in 0..dimensions.1 as usize * stride {
         bytes.push(src_bytes[i]);
     }
-    let mut dst_bytes: Vec<u16> = Vec::with_capacity(dimensions.1 as usize * stride);
+    let mut dst_bytes: Vec<u8> = Vec::with_capacity(dimensions.1 as usize * stride);
     dst_bytes.resize(dimensions.1 as usize * stride, 0);
     let start_time = Instant::now();
 
-    libblur::fast_gaussian(
+    libblur::fast_gaussian_superior(
         &mut bytes,
         stride as u32,
         dimensions.0,
         dimensions.1,
-        151,
+        512,
         FastBlurChannels::Channels3,
     );
-    // libblur::gaussian_blur_f16(
-    //     &f16_bytes,
+    // libblur::gaussian_blur(
+    //     &bytes,
     //     stride as u32,
     //     &mut dst_bytes,
     //     stride as u32,
     //     dimensions.0,
     //     dimensions.1,
-    //     151,
-    //     151f32 / 3f32,
+    //     127*2+1,
+    //     256f32 / 6f32,
     //     FastBlurChannels::Channels3,
     // );
     // libblur::median_blur(
@@ -62,7 +62,7 @@ fn main() {
     //     36,
     //     FastBlurChannels::Channels3,
     // );
-    // libblur::gaussian_box_blur(&bytes, stride as u32, &mut dst_bytes, stride as u32, dimensions.0, dimensions.1, 128, FastBlurChannels::Channels4);
+    // libblur::gaussian_box_blur(&bytes, stride as u32, &mut dst_bytes, stride as u32, dimensions.0, dimensions.1, 128, FastBlurChannels::Channels3);
 
     let elapsed_time = start_time.elapsed();
     // Print the elapsed time in milliseconds