diff --git a/Cargo.lock b/Cargo.lock index f3382fa..5d8041f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -394,7 +394,7 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" [[package]] name = "libblur" -version = "0.10.6" +version = "0.10.7" dependencies = [ "half", "num-traits", diff --git a/src/lib/Cargo.toml b/src/lib/Cargo.toml index 33e694d..284fec2 100644 --- a/src/lib/Cargo.toml +++ b/src/lib/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libblur" -version = "0.10.6" +version = "0.10.7" edition = "2021" description = "High performance blur in pure rust" readme = "../../README.md" @@ -11,7 +11,7 @@ documentation = "https://github.com/awxkee/libblur" categories = ["multimedia::images", "multimedia::video", "algorithms"] homepage = "https://github.com/awxkee/libblur" repository = "https://github.com/awxkee/libblur.git" -exclude = ["*.jpg", "../../assets/*"] +exclude = ["*.jpg", "../../assets/*", "*.png"] [lib] name = "libblur" diff --git a/src/lib/fast_gaussian_next_sse.rs b/src/lib/fast_gaussian_next_sse.rs index f696c67..5866e04 100644 --- a/src/lib/fast_gaussian_next_sse.rs +++ b/src/lib/fast_gaussian_next_sse.rs @@ -72,12 +72,23 @@ pub mod sse_support { unsafe { _mm_packus_epi16(prepared_u16, prepared_u16) }; let pixel = unsafe { _mm_extract_epi32::<0>(prepared_u8) }; - let bits = pixel.to_le_bytes(); - unsafe { - bytes.write(current_y + current_px, bits[0]); - bytes.write(current_y + current_px + 1, bits[1]); - bytes.write(current_y + current_px + 2, bits[2]); + let bytes_offset = current_y + current_px; + + if CHANNELS_COUNT == 4 { + unsafe { + let dst_ptr = + (bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut i32; + *dst_ptr = pixel; + } + } else { + let bits = pixel.to_le_bytes(); + + unsafe { + bytes.write(bytes_offset, bits[0]); + bytes.write(bytes_offset + 1, bits[1]); + bytes.write(bytes_offset + 2, bits[2]); + } } let d_arr_index_1 = ((y + radius_64) & 1023) as usize; @@ -177,12 +188,23 @@ pub mod sse_support { unsafe { _mm_packus_epi16(prepared_u16, prepared_u16) }; let pixel = unsafe { _mm_extract_epi32::<0>(prepared_u8) }; - let bits = pixel.to_le_bytes(); - unsafe { - bytes.write(current_y + current_px, bits[0]); - bytes.write(current_y + current_px + 1, bits[1]); - bytes.write(current_y + current_px + 2, bits[2]); + let bytes_offset = current_y + current_px; + + if CHANNELS_COUNT == 4 { + unsafe { + let dst_ptr = + (bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut i32; + *dst_ptr = pixel; + } + } else { + let bits = pixel.to_le_bytes(); + + unsafe { + bytes.write(bytes_offset, bits[0]); + bytes.write(bytes_offset + 1, bits[1]); + bytes.write(bytes_offset + 2, bits[2]); + } } let d_arr_index_1 = ((x + radius_64) & 1023) as usize; diff --git a/src/lib/fast_gaussian_sse.rs b/src/lib/fast_gaussian_sse.rs index d311cee..a613c15 100644 --- a/src/lib/fast_gaussian_sse.rs +++ b/src/lib/fast_gaussian_sse.rs @@ -71,12 +71,23 @@ pub mod sse_support { let prepared_u8 = unsafe { _mm_packus_epi16(prepared_u16, prepared_u16) }; let pixel = unsafe { _mm_extract_epi32::<0>(prepared_u8) }; - let bits = pixel.to_le_bytes(); - unsafe { - bytes.write(current_y + current_px, bits[0]); - bytes.write(current_y + current_px + 1, bits[1]); - bytes.write(current_y + current_px + 2, bits[2]); + let bytes_offset = current_y + current_px; + + if CHANNELS_COUNT == 4 { + unsafe { + let dst_ptr = + (bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut i32; + *dst_ptr = pixel; + } + } else { + let bits = pixel.to_le_bytes(); + + unsafe { + bytes.write(current_y + current_px, bits[0]); + bytes.write(current_y + current_px + 1, bits[1]); + bytes.write(current_y + current_px + 2, bits[2]); + } } let arr_index = ((x - radius_64) & 1023) as usize; @@ -155,12 +166,23 @@ pub mod sse_support { unsafe { _mm_packus_epi16(prepared_u16, prepared_u16) }; let pixel = unsafe { _mm_extract_epi32::<0>(prepared_u8) }; - let bits = pixel.to_le_bytes(); - unsafe { - bytes.write(current_y + current_px, bits[0]); - bytes.write(current_y + current_px + 1, bits[1]); - bytes.write(current_y + current_px + 2, bits[2]); + let bytes_offset = current_y + current_px; + + if CHANNELS_COUNT == 4 { + unsafe { + let dst_ptr = + (bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut i32; + *dst_ptr = pixel; + } + } else { + let bits = pixel.to_le_bytes(); + + unsafe { + bytes.write(bytes_offset, bits[0]); + bytes.write(bytes_offset + 1, bits[1]); + bytes.write(bytes_offset + 2, bits[2]); + } } let arr_index = ((y - radius_64) & 1023) as usize; diff --git a/src/lib/fast_gaussian_superior.rs b/src/lib/fast_gaussian_superior.rs index d4e7cf5..5a7d358 100644 --- a/src/lib/fast_gaussian_superior.rs +++ b/src/lib/fast_gaussian_superior.rs @@ -1,12 +1,13 @@ use crate::{FastBlurChannels, ThreadingPolicy}; mod fast_gaussian_superior { - use crate::unsafe_slice::UnsafeSlice; - use crate::{FastBlurChannels, ThreadingPolicy}; use num_traits::{FromPrimitive, ToPrimitive}; + use crate::ThreadingPolicy; + use crate::unsafe_slice::UnsafeSlice; + fn fast_gaussian_vertical_pass< - T: FromPrimitive + ToPrimitive + Default + Into + Send + Sync, + T: FromPrimitive + ToPrimitive + Default + Into + Send + Sync, const CHANNELS_COUNT: usize, >( bytes: &UnsafeSlice, stride: u32, @@ -15,21 +16,17 @@ mod fast_gaussian_superior { radius: u32, start: u32, end: u32, - channels: FastBlurChannels, ) where T: std::ops::AddAssign + std::ops::SubAssign + Copy, { let mut buffer_r: [i64; 2048] = [0; 2048]; let mut buffer_g: [i64; 2048] = [0; 2048]; let mut buffer_b: [i64; 2048] = [0; 2048]; + let mut buffer_a: [i64; 2048] = [0; 2048]; let radius_64 = radius as i64; let height_wide = height as i64; let radius_2d = (radius as f32) * (radius as f32); let weight = 1.0f32 / (radius_2d * radius_2d); - let channels_count = match channels { - FastBlurChannels::Channels3 => 3, - FastBlurChannels::Channels4 => 4, - }; for x in start..std::cmp::min(width, end) { let mut dif_r: i64 = 0; let mut der_1_r: i64 = 0; @@ -43,8 +40,12 @@ mod fast_gaussian_superior { let mut der_1_b: i64 = 0; let mut der_2_b: i64 = 0; let mut sum_b: i64 = 0; + let mut dif_a: i64 = 0; + let mut der_1_a: i64 = 0; + let mut der_2_a: i64 = 0; + let mut sum_a: i64 = 0; - let current_px = (x * channels_count) as usize; + let current_px = x as usize * CHANNELS_COUNT; let start_y = 0i64 - 4i64 * radius as i64; for y in start_y..height_wide { @@ -54,10 +55,16 @@ mod fast_gaussian_superior { let new_g = T::from_u32(((sum_g as f32) * weight) as u32).unwrap_or_default(); let new_b = T::from_u32(((sum_b as f32) * weight) as u32).unwrap_or_default(); + let bytes_offset = current_y + current_px; + unsafe { - bytes.write(current_y + current_px, new_r); - bytes.write(current_y + current_px + 1, new_g); - bytes.write(current_y + current_px + 2, new_b); + bytes.write(bytes_offset, new_r); + bytes.write(bytes_offset + 1, new_g); + bytes.write(bytes_offset + 2, new_b); + if CHANNELS_COUNT == 4 { + let new_a = T::from_u32(((sum_a as f32) * weight) as u32).unwrap_or_default(); + bytes.write(bytes_offset + 3, new_a); + } } let arr_index_3 = (y & 2047) as usize; @@ -81,6 +88,13 @@ mod fast_gaussian_superior { + (*buffer_b.get_unchecked(arr_index_2))) + 6 * (*buffer_b.get_unchecked(arr_index_3)) + (*buffer_b.get_unchecked(arr_index_4)); + if CHANNELS_COUNT == 4 { + dif_a += -4 + * ((*buffer_a.get_unchecked(arr_index_1)) + + (*buffer_a.get_unchecked(arr_index_2))) + + 6 * (*buffer_a.get_unchecked(arr_index_3)) + + (*buffer_a.get_unchecked(arr_index_4)); + } }; } else { if y + 3 * radius_64 >= 0 { @@ -88,18 +102,27 @@ mod fast_gaussian_superior { dif_r -= 4 * unsafe { *buffer_r.get_unchecked(arr_index) }; dif_g -= 4 * unsafe { *buffer_g.get_unchecked(arr_index) }; dif_b -= 4 * unsafe { *buffer_b.get_unchecked(arr_index) }; + if CHANNELS_COUNT == 4 { + dif_a -= 4 * unsafe { *buffer_a.get_unchecked(arr_index) }; + } } if y + 2 * radius_64 >= 0 { let arr_index = (y & 2047) as usize; dif_r += 6 * unsafe { *buffer_r.get_unchecked(arr_index) }; dif_g += 6 * unsafe { *buffer_g.get_unchecked(arr_index) }; dif_b += 6 * unsafe { *buffer_b.get_unchecked(arr_index) }; + if CHANNELS_COUNT == 4 { + dif_a += 6 * unsafe { *buffer_a.get_unchecked(arr_index) }; + } } if y + radius_64 >= 0 { let arr_index = ((y - radius_64) & 2047) as usize; dif_r -= 4 * unsafe { *buffer_r.get_unchecked(arr_index) }; dif_g -= 4 * unsafe { *buffer_g.get_unchecked(arr_index) }; dif_b -= 4 * unsafe { *buffer_b.get_unchecked(arr_index) }; + if CHANNELS_COUNT == 4 { + dif_a -= 4 * unsafe { *buffer_a.get_unchecked(arr_index) }; + } } } @@ -107,7 +130,7 @@ mod fast_gaussian_superior { (std::cmp::min(std::cmp::max(y + 2 * radius_64 - 1, 0), height_wide - 1) as usize) * (stride as usize); - let next_row_x = (x * channels_count) as usize; + let next_row_x = x as usize * CHANNELS_COUNT; let px_idx = next_row_y + next_row_x; @@ -140,12 +163,23 @@ mod fast_gaussian_superior { unsafe { *buffer_b.get_unchecked_mut(arr_index) = ub8.into(); } + + if CHANNELS_COUNT == 4 { + let ua8 = bytes[px_idx + 3]; + dif_a += ua8.into(); + der_2_a += dif_a; + der_1_a += der_2_a; + sum_a += der_1_a; + unsafe { + *buffer_a.get_unchecked_mut(arr_index) = ua8.into(); + } + } } } } fn fast_gaussian_horizontal_pass< - T: FromPrimitive + ToPrimitive + Default + Into + Send + Sync, + T: FromPrimitive + ToPrimitive + Default + Into + Send + Sync, const CHANNELS_COUNT: usize, >( bytes: &UnsafeSlice, stride: u32, @@ -154,21 +188,17 @@ mod fast_gaussian_superior { radius: u32, start: u32, end: u32, - channels: FastBlurChannels, ) where T: std::ops::AddAssign + std::ops::SubAssign + Copy, { let mut buffer_r: [i64; 2048] = [0; 2048]; let mut buffer_g: [i64; 2048] = [0; 2048]; let mut buffer_b: [i64; 2048] = [0; 2048]; + let mut buffer_a: [i64; 2048] = [0; 2048]; let radius_64 = radius as i64; let width_wide = width as i64; let radius_2d = (radius as f32) * (radius as f32); let weight = 1.0f32 / (radius_2d * radius_2d); - let channels_count = match channels { - FastBlurChannels::Channels3 => 3, - FastBlurChannels::Channels4 => 4, - }; for y in start..std::cmp::min(height, end) { let mut dif_r: i64 = 0; let mut der_1_r: i64 = 0; @@ -182,20 +212,30 @@ mod fast_gaussian_superior { let mut der_1_b: i64 = 0; let mut der_2_b: i64 = 0; let mut sum_b: i64 = 0; + let mut dif_a: i64 = 0; + let mut der_1_a: i64 = 0; + let mut der_2_a: i64 = 0; + let mut sum_a: i64 = 0; let current_y = ((y as i64) * (stride as i64)) as usize; for x in (0i64 - 4i64 * radius_64)..(width as i64) { if x >= 0 { - let current_px = ((std::cmp::max(x, 0) as u32) * channels_count) as usize; + let current_px = (std::cmp::max(x, 0) as u32) as usize * CHANNELS_COUNT; let new_r = T::from_u32(((sum_r as f32) * weight) as u32).unwrap_or_default(); let new_g = T::from_u32(((sum_g as f32) * weight) as u32).unwrap_or_default(); let new_b = T::from_u32(((sum_b as f32) * weight) as u32).unwrap_or_default(); + let bytes_offset = current_y + current_px; + unsafe { - bytes.write(current_y + current_px, new_r); - bytes.write(current_y + current_px + 1, new_g); - bytes.write(current_y + current_px + 2, new_b); + bytes.write(bytes_offset, new_r); + bytes.write(bytes_offset + 1, new_g); + bytes.write(bytes_offset + 2, new_b); + if CHANNELS_COUNT == 4 { + let new_a = T::from_u32(((sum_a as f32) * weight) as u32).unwrap_or_default(); + bytes.write(bytes_offset + 3, new_a); + } } let arr_index_3 = (x & 2047) as usize; @@ -206,19 +246,26 @@ mod fast_gaussian_superior { unsafe { dif_r += -4 * ((*buffer_r.get_unchecked(arr_index_1)) - + (*buffer_r.get_unchecked(arr_index_2))) + + (*buffer_r.get_unchecked(arr_index_2))) + 6 * (*buffer_r.get_unchecked(arr_index_3)) + (*buffer_r.get_unchecked(arr_index_4)); dif_g += -4 * ((*buffer_g.get_unchecked(arr_index_1)) - + (*buffer_g.get_unchecked(arr_index_2))) + + (*buffer_g.get_unchecked(arr_index_2))) + 6 * (*buffer_g.get_unchecked(arr_index_3)) + (*buffer_g.get_unchecked(arr_index_4)); dif_b += -4 * ((*buffer_b.get_unchecked(arr_index_1)) - + (*buffer_b.get_unchecked(arr_index_2))) + + (*buffer_b.get_unchecked(arr_index_2))) + 6 * (*buffer_b.get_unchecked(arr_index_3)) + (*buffer_b.get_unchecked(arr_index_4)); + if CHANNELS_COUNT == 4 { + dif_a += -4 + * ((*buffer_a.get_unchecked(arr_index_1)) + + (*buffer_a.get_unchecked(arr_index_2))) + + 6 * (*buffer_a.get_unchecked(arr_index_3)) + + (*buffer_a.get_unchecked(arr_index_4)); + } } } else { if x + 3 * radius_64 >= 0 { @@ -226,30 +273,41 @@ mod fast_gaussian_superior { dif_r -= 4 * unsafe { *buffer_r.get_unchecked(arr_index) }; dif_g -= 4 * unsafe { *buffer_g.get_unchecked(arr_index) }; dif_b -= 4 * unsafe { *buffer_b.get_unchecked(arr_index) }; + if CHANNELS_COUNT == 4 { + dif_a -= 4 * unsafe { *buffer_a.get_unchecked(arr_index) }; + } } if x + 2 * radius_64 >= 0 { let arr_index = (x & 2047) as usize; dif_r += 6 * unsafe { *buffer_r.get_unchecked(arr_index) }; dif_g += 6 * unsafe { *buffer_g.get_unchecked(arr_index) }; dif_b += 6 * unsafe { *buffer_b.get_unchecked(arr_index) }; + if CHANNELS_COUNT == 4 { + dif_a += 6 * unsafe { *buffer_a.get_unchecked(arr_index) }; + } } if x + radius_64 >= 0 { let arr_index = ((x - radius_64) & 2047) as usize; dif_r -= 4 * unsafe { *buffer_r.get_unchecked(arr_index) }; dif_g -= 4 * unsafe { *buffer_g.get_unchecked(arr_index) }; dif_b -= 4 * unsafe { *buffer_b.get_unchecked(arr_index) }; + if CHANNELS_COUNT == 4 { + dif_a -= 4 * unsafe { *buffer_a.get_unchecked(arr_index) }; + } } } let next_row_y = (y as usize) * (stride as usize); let next_row_x = - ((std::cmp::min(std::cmp::max(x + 2 * radius_64 - 1, 0), width_wide - 1) - as u32) - * channels_count) as usize; + (std::cmp::min(std::cmp::max(x + 2 * radius_64 - 1, 0), width_wide - 1) + as u32) as usize + * CHANNELS_COUNT; + + let bytes_offset = next_row_y + next_row_x; - let ur8 = bytes[next_row_y + next_row_x]; - let ug8 = bytes[next_row_y + next_row_x + 1]; - let ub8 = bytes[next_row_y + next_row_x + 2]; + let ur8 = bytes[bytes_offset]; + let ug8 = bytes[bytes_offset + 1]; + let ub8 = bytes[bytes_offset + 2]; let arr_index = ((x + 2 * radius_64) & 2047) as usize; @@ -276,19 +334,29 @@ mod fast_gaussian_superior { unsafe { *buffer_b.get_unchecked_mut(arr_index) = ub8.into(); } + + if CHANNELS_COUNT == 4 { + let ua8 = bytes[bytes_offset + 3]; + dif_a += ua8.into(); + der_2_a += dif_a; + der_1_a += der_2_a; + sum_a += der_1_a; + unsafe { + *buffer_a.get_unchecked_mut(arr_index) = ua8.into(); + } + } } } } pub(crate) fn fast_gaussian_impl< - T: FromPrimitive + ToPrimitive + Default + Into + Send + Sync, + T: FromPrimitive + ToPrimitive + Default + Into + Send + Sync, const CHANNELS_COUNT: usize, >( bytes: &mut [T], stride: u32, width: u32, height: u32, radius: u32, - channels: FastBlurChannels, threading_policy: ThreadingPolicy, ) where T: std::ops::AddAssign + std::ops::SubAssign + Copy, @@ -309,7 +377,7 @@ mod fast_gaussian_superior { end_x = width; } scope.spawn(move |_| { - fast_gaussian_vertical_pass::( + fast_gaussian_vertical_pass::( &unsafe_image, stride, width, @@ -317,7 +385,6 @@ mod fast_gaussian_superior { radius, start_x, end_x, - channels, ); }); } @@ -332,7 +399,7 @@ mod fast_gaussian_superior { end_y = height; } scope.spawn(move |_| { - fast_gaussian_horizontal_pass::( + fast_gaussian_horizontal_pass::( &unsafe_image, stride, width, @@ -340,7 +407,6 @@ mod fast_gaussian_superior { radius, start_y, end_y, - channels, ); }); } @@ -366,13 +432,26 @@ pub fn fast_gaussian_superior( threading_policy: ThreadingPolicy, ) { let acq_radius = std::cmp::min(radius, 256); - fast_gaussian_superior::fast_gaussian_impl::( - bytes, - stride, - width, - height, - acq_radius, - channels, - threading_policy, - ); + match channels { + FastBlurChannels::Channels3 => { + fast_gaussian_superior::fast_gaussian_impl::( + bytes, + stride, + width, + height, + acq_radius, + threading_policy, + ); + } + FastBlurChannels::Channels4 => { + fast_gaussian_superior::fast_gaussian_impl::( + bytes, + stride, + width, + height, + acq_radius, + threading_policy, + ); + } + } } diff --git a/src/lib/mul_table.rs b/src/lib/mul_table.rs index 179df31..634674e 100644 --- a/src/lib/mul_table.rs +++ b/src/lib/mul_table.rs @@ -87,7 +87,7 @@ pub(crate) const MUL_TABLE_STACK_BLUR: [i32; 255] = [ 320, 318, 315, 312, 310, 307, 304, 302, 299, 297, 294, 292, 289, 287, 285, 282, 280, 278, 275, 273, 271, 269, 267, 265, 263, 261, 259, ]; -pub(crate) const SHR_TABLE_STACKBLUR: [i32; 255] = [ +pub(crate) const SHR_TABLE_STACK_BLUR: [i32; 255] = [ 9, 11, 12, 13, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, diff --git a/src/lib/stack_blur.rs b/src/lib/stack_blur.rs index 56914ab..9890d6b 100644 --- a/src/lib/stack_blur.rs +++ b/src/lib/stack_blur.rs @@ -1,4 +1,4 @@ -use crate::mul_table::{MUL_TABLE_STACK_BLUR, SHR_TABLE_STACKBLUR}; +use crate::mul_table::{MUL_TABLE_STACK_BLUR, SHR_TABLE_STACK_BLUR}; use crate::FastBlurChannels; #[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)] @@ -26,7 +26,7 @@ enum StackBlurPass { VERTICAL, } -fn stack_blur_pass<'a, const COMPONENTS: usize>( +fn stack_blur_pass( pixels: &mut [u8], stride: u32, width: u32, @@ -60,10 +60,9 @@ fn stack_blur_pass<'a, const COMPONENTS: usize>( let wm = width - 1; let hm = height - 1; - // let w4 = width as usize * components; let div = (radius * 2) + 1; let mul_sum = MUL_TABLE_STACK_BLUR[radius as usize]; - let shr_sum = SHR_TABLE_STACKBLUR[radius as usize]; + let shr_sum = SHR_TABLE_STACK_BLUR[radius as usize]; let mut src_ptr; let mut dst_ptr; @@ -88,33 +87,36 @@ fn stack_blur_pass<'a, const COMPONENTS: usize>( src_ptr = stride as usize * y; // start of line (0,y) + let src_r = unsafe { *pixels.get_unchecked(src_ptr + 0) as i32 }; + let src_g = unsafe { *pixels.get_unchecked(src_ptr + 1) as i32 }; + let src_b = unsafe { *pixels.get_unchecked(src_ptr + 2) as i32 }; + let src_a = if COMPONENTS == 4 { + unsafe { *pixels.get_unchecked(src_ptr + 3) as i32 } + } else { + 0i32 + }; + for i in 0..=radius { let stack_value = unsafe { &mut *stacks.get_unchecked_mut(i as usize) }; - unsafe { - stack_value.r = *pixels.get_unchecked(src_ptr + 0) as i32; - stack_value.g = *pixels.get_unchecked(src_ptr + 1) as i32; - stack_value.b = *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - stack_value.a = *pixels.get_unchecked(src_ptr + 3) as i32; - } + stack_value.r = src_r; + stack_value.g = src_g; + stack_value.b = src_b; + if COMPONENTS == 4 { + stack_value.a = src_a; } - unsafe { - sum_r += *pixels.get_unchecked(src_ptr + 0) as i32 * (i + 1) as i32; - sum_g += *pixels.get_unchecked(src_ptr + 1) as i32 * (i + 1) as i32; - sum_b += *pixels.get_unchecked(src_ptr + 2) as i32 * (i + 1) as i32; - if COMPONENTS == 4 { - sum_a += *pixels.get_unchecked(src_ptr + 3) as i32 * (i + 1) as i32; - } + sum_r += src_r * (i + 1) as i32; + sum_g += src_g * (i + 1) as i32; + sum_b += src_b * (i + 1) as i32; + if COMPONENTS == 4 { + sum_a += src_a * (i + 1) as i32; } - unsafe { - sum_out_r += *pixels.get_unchecked(src_ptr + 0) as i32; - sum_out_g += *pixels.get_unchecked(src_ptr + 1) as i32; - sum_out_b += *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - sum_out_a += *pixels.get_unchecked(src_ptr + 3) as i32; - } + sum_out_r += src_r; + sum_out_g += src_g; + sum_out_b += src_b; + if COMPONENTS == 4 { + sum_out_a += src_a; } } @@ -123,30 +125,35 @@ fn stack_blur_pass<'a, const COMPONENTS: usize>( src_ptr += COMPONENTS; } let stack_ptr = unsafe { &mut *stacks.get_unchecked_mut((i + radius) as usize) }; - unsafe { - stack_ptr.r = *pixels.get_unchecked(src_ptr + 0) as i32; - stack_ptr.g = *pixels.get_unchecked(src_ptr + 1) as i32; - stack_ptr.b = *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - stack_ptr.a = *pixels.get_unchecked(src_ptr + 3) as i32; - } + + let src_r = unsafe { *pixels.get_unchecked(src_ptr + 0) as i32 }; + let src_g = unsafe { *pixels.get_unchecked(src_ptr + 1) as i32 }; + let src_b = unsafe { *pixels.get_unchecked(src_ptr + 2) as i32 }; + let src_a = if COMPONENTS == 4 { + unsafe { *pixels.get_unchecked(src_ptr + 3) as i32 } + } else { + 0i32 + }; + + stack_ptr.r = src_r; + stack_ptr.g = src_g; + stack_ptr.b = src_b; + if COMPONENTS == 4 { + stack_ptr.a = src_a; } - unsafe { - sum_r += *pixels.get_unchecked(src_ptr + 0) as i32 * (radius + 1 - i) as i32; - sum_g += *pixels.get_unchecked(src_ptr + 1) as i32 * (radius + 1 - i) as i32; - sum_b += *pixels.get_unchecked(src_ptr + 2) as i32 * (radius + 1 - i) as i32; - if COMPONENTS == 4 { - sum_a += - *pixels.get_unchecked(src_ptr + 3) as i32 * (radius + 1 - i) as i32; - } + + sum_r += src_r * (radius + 1 - i) as i32; + sum_g += src_g * (radius + 1 - i) as i32; + sum_b += src_b * (radius + 1 - i) as i32; + if COMPONENTS == 4 { + sum_a += src_a * (radius + 1 - i) as i32; } - unsafe { - sum_in_r += *pixels.get_unchecked(src_ptr + 0) as i32; - sum_in_g += *pixels.get_unchecked(src_ptr + 1) as i32; - sum_in_b += *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - sum_in_a += *pixels.get_unchecked(src_ptr + 3) as i32; - } + + sum_in_r += src_r; + sum_in_g += src_g; + sum_in_b += src_b; + if COMPONENTS == 4 { + sum_in_a += src_a; } } @@ -155,6 +162,7 @@ fn stack_blur_pass<'a, const COMPONENTS: usize>( if xp > wm { xp = wm; } + src_ptr = COMPONENTS * xp as usize + y * stride as usize; dst_ptr = y * stride as usize; for _ in 0..width { @@ -192,23 +200,28 @@ fn stack_blur_pass<'a, const COMPONENTS: usize>( xp += 1; } - unsafe { - stack.r = *pixels.get_unchecked(src_ptr + 0) as i32; - stack.g = *pixels.get_unchecked(src_ptr + 1) as i32; - stack.b = *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - stack.a = *pixels.get_unchecked(src_ptr + 3) as i32; - } + let src_r = unsafe { *pixels.get_unchecked(src_ptr + 0) as i32 }; + let src_g = unsafe { *pixels.get_unchecked(src_ptr + 1) as i32 }; + let src_b = unsafe { *pixels.get_unchecked(src_ptr + 2) as i32 }; + let src_a = if COMPONENTS == 4 { + unsafe { *pixels.get_unchecked(src_ptr + 3) as i32 } + } else { + 0i32 + }; + stack.r = src_r; + stack.g = src_g; + stack.b = src_b; + if COMPONENTS == 4 { + stack.a = src_a; } - unsafe { - sum_in_r += *pixels.get_unchecked(src_ptr + 0) as i32; - sum_in_g += *pixels.get_unchecked(src_ptr + 1) as i32; - sum_in_b += *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - sum_in_a += *pixels.get_unchecked(src_ptr + 3) as i32; - } + sum_in_r += src_r; + sum_in_g += src_g; + sum_in_b += src_b; + if COMPONENTS == 4 { + sum_in_a += src_a; } + sum_r += sum_in_r; sum_g += sum_in_g; sum_b += sum_in_b; @@ -249,65 +262,74 @@ fn stack_blur_pass<'a, const COMPONENTS: usize>( sum_out_a = 0; src_ptr = COMPONENTS * x; // x,0 + + let src_r = unsafe { *pixels.get_unchecked(src_ptr + 0) as i32 }; + let src_g = unsafe { *pixels.get_unchecked(src_ptr + 1) as i32 }; + let src_b = unsafe { *pixels.get_unchecked(src_ptr + 2) as i32 }; + let src_a = if COMPONENTS == 4 { + unsafe { *pixels.get_unchecked(src_ptr + 3) as i32 } + } else { + 0i32 + }; + for i in 0..=radius { let stack_value = unsafe { &mut *stacks.get_unchecked_mut(i as usize) }; - unsafe { - stack_value.r = *pixels.get_unchecked(src_ptr + 0) as i32; - stack_value.g = *pixels.get_unchecked(src_ptr + 1) as i32; - stack_value.b = *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - stack_value.a = *pixels.get_unchecked(src_ptr + 3) as i32; - } + stack_value.r = src_r; + stack_value.g = src_g; + stack_value.b = src_b; + if COMPONENTS == 4 { + stack_value.a = src_a; } - unsafe { - sum_r += *pixels.get_unchecked(src_ptr + 0) as i32 * (i + 1) as i32; - sum_g += *pixels.get_unchecked(src_ptr + 1) as i32 * (i + 1) as i32; - sum_b += *pixels.get_unchecked(src_ptr + 2) as i32 * (i + 1) as i32; - if COMPONENTS == 4 { - sum_a += *pixels.get_unchecked(src_ptr + 3) as i32 * (i + 1) as i32; - } + sum_r += src_r * (i + 1) as i32; + sum_g += src_g * (i + 1) as i32; + sum_b += src_b * (i + 1) as i32; + if COMPONENTS == 4 { + sum_a += src_a * (i + 1) as i32; } - unsafe { - sum_out_r += *pixels.get_unchecked(src_ptr + 0) as i32; - sum_out_g += *pixels.get_unchecked(src_ptr + 1) as i32; - sum_out_b += *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - sum_out_a += *pixels.get_unchecked(src_ptr + 3) as i32; - } + sum_out_r += src_r; + sum_out_g += src_g; + sum_out_b += src_b; + if COMPONENTS == 4 { + sum_out_a += src_a; } } + for i in 1..=radius { if i <= hm { src_ptr += stride as usize; } let stack_ptr = unsafe { &mut *stacks.get_unchecked_mut((i + radius) as usize) }; - unsafe { - stack_ptr.r = *pixels.get_unchecked(src_ptr + 0) as i32; - stack_ptr.g = *pixels.get_unchecked(src_ptr + 1) as i32; - stack_ptr.b = *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - stack_ptr.a = *pixels.get_unchecked(src_ptr + 3) as i32; - } + + let src_r = unsafe { *pixels.get_unchecked(src_ptr + 0) as i32 }; + let src_g = unsafe { *pixels.get_unchecked(src_ptr + 1) as i32 }; + let src_b = unsafe { *pixels.get_unchecked(src_ptr + 2) as i32 }; + let src_a = if COMPONENTS == 4 { + unsafe { *pixels.get_unchecked(src_ptr + 3) as i32 } + } else { + 0i32 + }; + + stack_ptr.r = src_r; + stack_ptr.g = src_g; + stack_ptr.b = src_b; + if COMPONENTS == 4 { + stack_ptr.a = src_a; } - unsafe { - sum_r += *pixels.get_unchecked(src_ptr + 0) as i32 * (radius + 1 - i) as i32; - sum_g += *pixels.get_unchecked(src_ptr + 1) as i32 * (radius + 1 - i) as i32; - sum_b += *pixels.get_unchecked(src_ptr + 2) as i32 * (radius + 1 - i) as i32; - if COMPONENTS == 4 { - sum_a += - *pixels.get_unchecked(src_ptr + 3) as i32 * (radius + 1 - i) as i32; - } + + sum_r += src_r * (radius + 1 - i) as i32; + sum_g += src_g * (radius + 1 - i) as i32; + sum_b += src_b * (radius + 1 - i) as i32; + if COMPONENTS == 4 { + sum_a += src_a * (radius + 1 - i) as i32; } - unsafe { - sum_in_r += *pixels.get_unchecked(src_ptr + 0) as i32; - sum_in_g += *pixels.get_unchecked(src_ptr + 1) as i32; - sum_in_b += *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - sum_in_a += *pixels.get_unchecked(src_ptr + 3) as i32; - } + sum_in_r += src_r; + sum_in_g += src_g; + sum_in_b += src_b; + if COMPONENTS == 4 { + sum_in_a += src_a; } } @@ -355,23 +377,29 @@ fn stack_blur_pass<'a, const COMPONENTS: usize>( yp += 1; } - unsafe { - stack_ptr.r = *pixels.get_unchecked(src_ptr + 0) as i32; - stack_ptr.g = *pixels.get_unchecked(src_ptr + 1) as i32; - stack_ptr.b = *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - stack_ptr.a = *pixels.get_unchecked(src_ptr + 3) as i32; - } + let src_r = unsafe { *pixels.get_unchecked(src_ptr + 0) as i32 }; + let src_g = unsafe { *pixels.get_unchecked(src_ptr + 1) as i32 }; + let src_b = unsafe { *pixels.get_unchecked(src_ptr + 2) as i32 }; + let src_a = if COMPONENTS == 4 { + unsafe { *pixels.get_unchecked(src_ptr + 3) as i32 } + } else { + 0i32 + }; + + stack_ptr.r = src_r; + stack_ptr.g = src_g; + stack_ptr.b = src_b; + if COMPONENTS == 4 { + stack_ptr.a = src_a; } - unsafe { - sum_in_r += *pixels.get_unchecked(src_ptr + 0) as i32; - sum_in_g += *pixels.get_unchecked(src_ptr + 1) as i32; - sum_in_b += *pixels.get_unchecked(src_ptr + 2) as i32; - if COMPONENTS == 4 { - sum_in_a += *pixels.get_unchecked(src_ptr + 3) as i32; - } + sum_in_r += src_r; + sum_in_g += src_g; + sum_in_b += src_b; + if COMPONENTS == 4 { + sum_in_a += src_a; } + sum_r += sum_in_r; sum_g += sum_in_g; sum_b += sum_in_b; @@ -403,8 +431,8 @@ fn stack_blur_pass<'a, const COMPONENTS: usize>( } #[no_mangle] -pub fn stack_blur<'a>( - pixels: &mut [u8], +pub fn stack_blur( + in_place: &mut [u8], stride: u32, width: u32, height: u32, @@ -414,7 +442,7 @@ pub fn stack_blur<'a>( match channels { FastBlurChannels::Channels3 => { stack_blur_pass::<3>( - pixels, + in_place, stride, width, height, @@ -424,7 +452,7 @@ pub fn stack_blur<'a>( 1, ); stack_blur_pass::<3>( - pixels, + in_place, stride, width, height, @@ -436,7 +464,7 @@ pub fn stack_blur<'a>( } FastBlurChannels::Channels4 => { stack_blur_pass::<4>( - pixels, + in_place, stride, width, height, @@ -446,7 +474,7 @@ pub fn stack_blur<'a>( 1, ); stack_blur_pass::<4>( - pixels, + in_place, stride, width, height, diff --git a/src/main.rs b/src/main.rs index b060b62..c9edc5a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,7 +56,7 @@ fn main() { FastBlurChannels::Channels4, ); - // libblur::fast_gaussian_next( + // libblur::fast_gaussian_superior( // &mut bytes, // stride as u32, // dimensions.0, @@ -77,7 +77,7 @@ fn main() { // FastBlurChannels::Channels4, // ThreadingPolicy::Single, // ); - // bytes = dst_bytes; + bytes = dst_bytes; // libblur::gaussian_blur( // &bytes, // stride as u32,