From 116ff71fb01bbe259a14713f798a3b13c7f336c7 Mon Sep 17 00:00:00 2001 From: awxkee Date: Sun, 28 Apr 2024 01:05:56 +0100 Subject: [PATCH] Bugfix, added best fast gaussian --- Cargo.lock | 2 +- src/lib/Cargo.toml | 4 +- src/lib/box_blur_neon.rs | 2 +- src/lib/fast_gaussian_superior.rs | 322 ++++++++++++++++++++++++++++++ src/lib/lib.rs | 2 + src/main.rs | 18 +- 6 files changed, 337 insertions(+), 13 deletions(-) create mode 100644 src/lib/fast_gaussian_superior.rs diff --git a/Cargo.lock b/Cargo.lock index fb6cf0f..dc6de6f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -394,7 +394,7 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" [[package]] name = "libblur" -version = "0.9.3" +version = "0.9.4" dependencies = [ "half", "num-traits", diff --git a/src/lib/Cargo.toml b/src/lib/Cargo.toml index 69973eb..6143f1d 100644 --- a/src/lib/Cargo.toml +++ b/src/lib/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libblur" -version = "0.9.3" +version = "0.9.4" edition = "2021" description = "High performance blur in pure rust" readme = "../../README.md" @@ -8,7 +8,7 @@ keywords = ["blur", "gauss_blur", "image_blur", "fast_blur", "box_blur"] license = "Apache-2.0 OR BSD-3-Clause" authors = ["Radzivon Bartoshyk"] documentation = "https://github.com/awxkee/libblur" -categories = ["multimedia::images", "multimedia::video"] +categories = ["multimedia::images", "multimedia::video", "algorithms"] homepage = "https://github.com/awxkee/libblur" repository = "https://github.com/awxkee/libblur.git" exclude = ["*.jpg"] diff --git a/src/lib/box_blur_neon.rs b/src/lib/box_blur_neon.rs index e9d5de9..a771ffc 100644 --- a/src/lib/box_blur_neon.rs +++ b/src/lib/box_blur_neon.rs @@ -112,7 +112,7 @@ pub mod neon_support { // add next { let next_x = - std::cmp::min(x + half_kernel, width - 1) as usize * channels_count as usize; + std::cmp::min(x + half_kernel, width - 1) as usize; let next = next_x * channels_count as usize; diff --git a/src/lib/fast_gaussian_superior.rs b/src/lib/fast_gaussian_superior.rs new file mode 100644 index 0000000..f2fd943 --- /dev/null +++ b/src/lib/fast_gaussian_superior.rs @@ -0,0 +1,322 @@ +use crate::FastBlurChannels; + +mod fast_gaussian_superior { + use num_traits::{FromPrimitive, ToPrimitive}; + use crate::FastBlurChannels; + use crate::unsafe_slice::UnsafeSlice; + + fn fast_gaussian_vertical_pass + Send + Sync>( + bytes: &UnsafeSlice, + stride: u32, + width: u32, + height: u32, + radius: u32, + start: u32, + end: u32, + channels: FastBlurChannels, + ) where + T: std::ops::AddAssign + std::ops::SubAssign + Copy, + { + let mut buffer_r: [i64; 2048] = [0; 2048]; + let mut buffer_g: [i64; 2048] = [0; 2048]; + let mut buffer_b: [i64; 2048] = [0; 2048]; + let radius_64 = radius as i64; + let height_wide = height as i64; + let radius_2d = (radius as f32) * (radius as f32); + let weight = 1.0f32 / (radius_2d * radius_2d); + let channels_count = match channels { + FastBlurChannels::Channels3 => 3, + FastBlurChannels::Channels4 => 4, + }; + for x in start..std::cmp::min(width, end) { + let mut dif_r: i64 = 0; + let mut der_1_r: i64 = 0; + let mut der_2_r: i64 = 0; + let mut sum_r: i64 = 0; + let mut dif_g: i64 = 0; + let mut der_1_g: i64 = 0; + let mut der_2_g: i64 = 0; + let mut sum_g: i64 = 0; + let mut dif_b: i64 = 0; + let mut der_1_b: i64 = 0; + let mut der_2_b: i64 = 0; + let mut sum_b: i64 = 0; + + let current_px = (x * channels_count) as usize; + + let start_y = 0i64 - 4i64 * radius as i64; + for y in start_y..height_wide { + let current_y = (y * (stride as i64)) as usize; + if y >= 0 { + let new_r = T::from_u32(((sum_r as f32) * weight) as u32).unwrap_or_default(); + let new_g = T::from_u32(((sum_g as f32) * weight) as u32).unwrap_or_default(); + let new_b = T::from_u32(((sum_b as f32) * weight) as u32).unwrap_or_default(); + + unsafe { + bytes.write(current_y + current_px, new_r); + bytes.write(current_y + current_px + 1, new_g); + bytes.write(current_y + current_px + 2, new_b); + } + + let arr_index_3 = (y & 2047) as usize; + let arr_index_2 = ((y + radius_64) & 2047) as usize; + let arr_index_1 = ((y - radius_64) & 2047) as usize; + let arr_index_4 = ((y - 2 * radius_64) & 2047) as usize; + + dif_r += -4 * (buffer_r[arr_index_1] + buffer_r[arr_index_2]) + 6 * buffer_r[arr_index_3] + buffer_r[arr_index_4]; + dif_g += -4 * (buffer_g[arr_index_1] + buffer_g[arr_index_2]) + 6 * buffer_g[arr_index_3] + buffer_g[arr_index_4]; + dif_b += -4 * (buffer_b[arr_index_1] + buffer_b[arr_index_2]) + 6 * buffer_b[arr_index_3] + buffer_b[arr_index_4]; + } else { + if y + 3 * radius_64 >= 0 { + let arr_index = ((y + radius_64) & 2047) as usize; + dif_r -= 4 * buffer_r[arr_index]; + dif_g -= 4 * buffer_g[arr_index]; + dif_b -= 4 * buffer_b[arr_index]; + } + if y + 2 * radius_64 >= 0 { + let arr_index = (y & 2047) as usize; + dif_r += 6 * buffer_r[arr_index]; + dif_g += 6 * buffer_g[arr_index]; + dif_b += 6 * buffer_b[arr_index]; + } + if y + radius_64 >= 0 { + let arr_index = ((y - radius_64) & 2047) as usize; + dif_r -= 4 * buffer_r[arr_index]; + dif_g -= 4 * buffer_g[arr_index]; + dif_b -= 4 * buffer_b[arr_index]; + } + } + + let next_row_y = (std::cmp::min( + std::cmp::max(y + 2 * radius_64 - 1, 0), + height_wide - 1, + ) as usize) + * (stride as usize); + let next_row_x = (x * channels_count) as usize; + + let px_idx = next_row_y + next_row_x; + + let ur8 = bytes[px_idx]; + let ug8 = bytes[px_idx + 1]; + let ub8 = bytes[px_idx + 2]; + + let arr_index = ((y + 2 * radius_64) & 2047) as usize; + + dif_r += ur8.into(); + der_2_r += dif_r; + der_1_r += der_2_r; + sum_r += der_1_r; + buffer_r[arr_index] = ur8.into(); + + dif_g += ug8.into(); + der_2_g += dif_g; + der_1_g += der_2_g; + sum_g += der_1_g; + buffer_g[arr_index] = ug8.into(); + + dif_b += ub8.into(); + der_2_b += dif_b; + der_1_b += der_2_b; + sum_b += der_1_b; + buffer_b[arr_index] = ub8.into(); + } + } + } + + fn fast_gaussian_horizontal_pass + Send + Sync>( + bytes: &UnsafeSlice, + stride: u32, + width: u32, + height: u32, + radius: u32, + start: u32, + end: u32, + channels: FastBlurChannels, + ) where + T: std::ops::AddAssign + std::ops::SubAssign + Copy, + { + let mut buffer_r: [i64; 2048] = [0; 2048]; + let mut buffer_g: [i64; 2048] = [0; 2048]; + let mut buffer_b: [i64; 2048] = [0; 2048]; + let radius_64 = radius as i64; + let width_wide = width as i64; + let radius_2d = (radius as f32) * (radius as f32); + let weight = 1.0f32 / (radius_2d * radius_2d); + let channels_count = match channels { + FastBlurChannels::Channels3 => 3, + FastBlurChannels::Channels4 => 4, + }; + for y in start..std::cmp::min(height, end) { + let mut dif_r: i64 = 0; + let mut der_1_r: i64 = 0; + let mut der_2_r: i64 = 0; + let mut sum_r: i64 = 0; + let mut dif_g: i64 = 0; + let mut der_1_g: i64 = 0; + let mut der_2_g: i64 = 0; + let mut sum_g: i64 = 0; + let mut dif_b: i64 = 0; + let mut der_1_b: i64 = 0; + let mut der_2_b: i64 = 0; + let mut sum_b: i64 = 0; + + let current_y = ((y as i64) * (stride as i64)) as usize; + + for x in (0i64 - 4i64 * radius_64)..(width as i64) { + if x >= 0 { + let current_px = ((std::cmp::max(x, 0) as u32) * channels_count) as usize; + let new_r = T::from_u32(((sum_r as f32) * weight) as u32).unwrap_or_default(); + let new_g = T::from_u32(((sum_g as f32) * weight) as u32).unwrap_or_default(); + let new_b = T::from_u32(((sum_b as f32) * weight) as u32).unwrap_or_default(); + + unsafe { + bytes.write(current_y + current_px, new_r); + bytes.write(current_y + current_px + 1, new_g); + bytes.write(current_y + current_px + 2, new_b); + } + + let arr_index_3 = (x & 2047) as usize; + let arr_index_2 = ((x + radius_64) & 2047) as usize; + let arr_index_1 = ((x - radius_64) & 2047) as usize; + let arr_index_4 = ((x - 2 * radius_64) & 2047) as usize; + + dif_r += -4 * (buffer_r[arr_index_1] + buffer_r[arr_index_2]) + 6 * buffer_r[arr_index_3] + buffer_r[arr_index_4]; + dif_g += -4 * (buffer_g[arr_index_1] + buffer_g[arr_index_2]) + 6 * buffer_g[arr_index_3] + buffer_g[arr_index_4]; + dif_b += -4 * (buffer_b[arr_index_1] + buffer_b[arr_index_2]) + 6 * buffer_b[arr_index_3] + buffer_b[arr_index_4]; + } else { + if x + 3 * radius_64 >= 0 { + let arr_index = ((x + radius_64) & 2047) as usize; + dif_r -= 4 * buffer_r[arr_index]; + dif_g -= 4 * buffer_g[arr_index]; + dif_b -= 4 * buffer_b[arr_index]; + } + if x + 2 * radius_64 >= 0 { + let arr_index = (x & 2047) as usize; + dif_r += 6 * buffer_r[arr_index]; + dif_g += 6 * buffer_g[arr_index]; + dif_b += 6 * buffer_b[arr_index]; + } + if x + radius_64 >= 0 { + let arr_index = ((x - radius_64) & 2047) as usize; + dif_r -= 4 * buffer_r[arr_index]; + dif_g -= 4 * buffer_g[arr_index]; + dif_b -= 4 * buffer_b[arr_index]; + } + } + + let next_row_y = (y as usize) * (stride as usize); + let next_row_x = + ((std::cmp::min(std::cmp::max(x + 2 * radius_64 - 1, 0), width_wide - 1) as u32) + * channels_count) as usize; + + let ur8 = bytes[next_row_y + next_row_x]; + let ug8 = bytes[next_row_y + next_row_x + 1]; + let ub8 = bytes[next_row_y + next_row_x + 2]; + + let arr_index = ((x + 2 * radius_64) & 2047) as usize; + + dif_r += ur8.into(); + der_2_r += dif_r; + der_1_r += der_2_r; + sum_r += der_1_r; + buffer_r[arr_index] = ur8.into(); + + dif_g += ug8.into(); + der_2_g += dif_g; + der_1_g += der_2_g; + sum_g += der_1_g; + buffer_g[arr_index] = ug8.into(); + + dif_b += ub8.into(); + der_2_b += dif_b; + der_1_b += der_2_b; + sum_b += der_1_b; + buffer_b[arr_index] = ub8.into(); + } + } + } + + pub(crate) fn fast_gaussian_impl + Send + Sync>( + bytes: &mut Vec, + stride: u32, + width: u32, + height: u32, + radius: u32, + channels: FastBlurChannels, + ) where + T: std::ops::AddAssign + std::ops::SubAssign + Copy, + { + let unsafe_image = UnsafeSlice::new(bytes); + let thread_count = std::cmp::max(std::cmp::min(width * height / (256 * 256), 12), 1); + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(thread_count as usize) + .build() + .unwrap(); + pool.scope(|scope| { + let segment_size = width / thread_count; + + for i in 0..thread_count { + let start_x = i * segment_size; + let mut end_x = (i + 1) * segment_size; + if i == thread_count - 1 { + end_x = width; + } + scope.spawn(move |_| { + fast_gaussian_vertical_pass::( + &unsafe_image, + stride, + width, + height, + radius, + start_x, + end_x, + channels, + ); + }); + } + }); + pool.scope(|scope| { + let segment_size = height / thread_count; + + for i in 0..thread_count { + let start_y = i * segment_size; + let mut end_y = (i + 1) * segment_size; + if i == thread_count - 1 { + end_y = height; + } + scope.spawn(move |_| { + fast_gaussian_horizontal_pass::( + &unsafe_image, + stride, + width, + height, + radius, + start_y, + end_y, + channels, + ); + }); + } + }); + } +} + +/// Fast gaussian approximation. This is almost gaussian blur. Significantly slower than alternatives. +/// # Arguments +/// +/// * `stride` - Lane length, default is width * channels_count if not aligned +/// * `radius` - Radius more than ~312 is not supported. +/// O(1) complexity. +#[no_mangle] +#[allow(dead_code)] +pub extern "C" fn fast_gaussian_superior( + bytes: &mut Vec, + stride: u32, + width: u32, + height: u32, + radius: u32, + channels: FastBlurChannels, +) { + let acq_radius = std::cmp::min(radius, 312); + fast_gaussian_superior::fast_gaussian_impl::(bytes, stride, width, height, acq_radius, channels); +} \ No newline at end of file diff --git a/src/lib/lib.rs b/src/lib/lib.rs index 4a8856e..004fbfe 100644 --- a/src/lib/lib.rs +++ b/src/lib/lib.rs @@ -43,6 +43,7 @@ mod gaussian_f16; mod gaussian_helper; mod fast_gaussian_f16; mod fast_gaussian_next_f16; +mod fast_gaussian_superior; pub use box_blur::box_blur; pub use box_blur::box_blur_u16; @@ -64,3 +65,4 @@ pub use gaussian::gaussian_blur_u16; pub use gaussian::gaussian_blur_f16; pub use gaussian::gaussian_blur_f32; pub use median_blur::median_blur; +pub use fast_gaussian_superior::fast_gaussian_superior; \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index cd86a91..aa2bccb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,7 @@ fn f16_to_f32(bytes: Vec) -> Vec { fn main() { - let img = ImageReader::open("assets/filirovska.jpeg") + let img = ImageReader::open("assets/test_image_1.jpg") .unwrap() .decode() .unwrap(); @@ -29,27 +29,27 @@ fn main() { for i in 0..dimensions.1 as usize * stride { bytes.push(src_bytes[i]); } - let mut dst_bytes: Vec = Vec::with_capacity(dimensions.1 as usize * stride); + let mut dst_bytes: Vec = Vec::with_capacity(dimensions.1 as usize * stride); dst_bytes.resize(dimensions.1 as usize * stride, 0); let start_time = Instant::now(); - libblur::fast_gaussian( + libblur::fast_gaussian_superior( &mut bytes, stride as u32, dimensions.0, dimensions.1, - 151, + 512, FastBlurChannels::Channels3, ); - // libblur::gaussian_blur_f16( - // &f16_bytes, + // libblur::gaussian_blur( + // &bytes, // stride as u32, // &mut dst_bytes, // stride as u32, // dimensions.0, // dimensions.1, - // 151, - // 151f32 / 3f32, + // 127*2+1, + // 256f32 / 6f32, // FastBlurChannels::Channels3, // ); // libblur::median_blur( @@ -62,7 +62,7 @@ fn main() { // 36, // FastBlurChannels::Channels3, // ); - // libblur::gaussian_box_blur(&bytes, stride as u32, &mut dst_bytes, stride as u32, dimensions.0, dimensions.1, 128, FastBlurChannels::Channels4); + // libblur::gaussian_box_blur(&bytes, stride as u32, &mut dst_bytes, stride as u32, dimensions.0, dimensions.1, 128, FastBlurChannels::Channels3); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds