Skip to content

Commit

Permalink
Improve gaussian speed
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jun 10, 2024
1 parent 1a708bc commit 76c96d0
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 31 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "libblur"
version = "0.10.4"
version = "0.10.5"
edition = "2021"
description = "High performance blur in pure rust"
readme = "../../README.md"
Expand Down
20 changes: 10 additions & 10 deletions src/lib/box_blur_sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@
target_feature = "sse4.1"
))]
pub mod sse_support {
use crate::mul_table::{
MUL_TABLE_DOUBLE, MUL_TABLE_TWICE_RAD, SHR_TABLE_DOUBLE, SHR_TABLE_TWICE_RAD,
};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::mul_table::{
MUL_TABLE_TWICE_RAD, SHR_TABLE_TWICE_RAD,
};
use crate::sse_utils::sse_utils::load_u8_s32_fast;
use crate::unsafe_slice::UnsafeSlice;

Expand All @@ -61,7 +61,7 @@ pub mod sse_support {
let mul_value = MUL_TABLE_TWICE_RAD[radius as usize];
let shr_value = SHR_TABLE_TWICE_RAD[radius as usize];
let v_mul_value = unsafe { _mm_set1_epi32(mul_value) };
let v_shr_value = unsafe { _mm_set1_epi32(shr_value) };
let v_shr_value = unsafe { _mm_setr_epi32(shr_value,0,0,0) };

let kernel_size = radius * 2 + 1;
let edge_count = (kernel_size / 2) + 1;
Expand Down Expand Up @@ -117,7 +117,7 @@ pub mod sse_support {
}

let scale_store =
unsafe { _mm_sra_epi32(_mm_mullo_epi32(store, v_mul_value), v_shr_value) };
unsafe { _mm_srl_epi32(_mm_mullo_epi32(store, v_mul_value), v_shr_value) };
let px_16 = unsafe { _mm_packus_epi32(scale_store, scale_store) };
let px_8 = unsafe { _mm_packus_epi16(px_16, px_16) };
let pixel = unsafe { _mm_extract_epi32::<0>(px_8) };
Expand Down Expand Up @@ -157,7 +157,7 @@ pub mod sse_support {
let mul_value = MUL_TABLE_TWICE_RAD[radius as usize];
let shr_value = SHR_TABLE_TWICE_RAD[radius as usize];
let v_mul_value = unsafe { _mm_set1_epi32(mul_value) };
let v_shr_value = unsafe { _mm_set1_epi32(shr_value) };
let v_shr_value = unsafe { _mm_setr_epi32(shr_value,0,0,0) };

let kernel_size = radius * 2 + 1;
let edge_count = (kernel_size / 2) + 1;
Expand Down Expand Up @@ -226,9 +226,9 @@ pub mod sse_support {
}

let scale_store_0 =
unsafe { _mm_sra_epi32(_mm_mullo_epi32(store_0, v_mul_value), v_shr_value) };
unsafe { _mm_srl_epi32(_mm_mullo_epi32(store_0, v_mul_value), v_shr_value) };
let scale_store_1 =
unsafe { _mm_sra_epi32(_mm_mullo_epi32(store_1, v_mul_value), v_shr_value) };
unsafe { _mm_srl_epi32(_mm_mullo_epi32(store_1, v_mul_value), v_shr_value) };

if CHANNELS == 3 {
let px_16 = unsafe { _mm_packus_epi32(scale_store_0, scale_store_0) };
Expand Down Expand Up @@ -322,7 +322,7 @@ pub mod sse_support {
}

let scale_store =
unsafe { _mm_sra_epi32(_mm_mullo_epi32(store, v_mul_value), v_shr_value) };
unsafe { _mm_srl_epi32(_mm_mullo_epi32(store, v_mul_value), v_shr_value) };
let px_16 = unsafe { _mm_packus_epi32(scale_store, scale_store) };
let px_8 = unsafe { _mm_packus_epi16(px_16, px_16) };

Expand All @@ -348,8 +348,8 @@ pub mod sse_support {
target_feature = "sse4.1"
)))]
pub mod sse_support {
use crate::unsafe_slice::UnsafeSlice;
use crate::FastBlurChannels;
use crate::unsafe_slice::UnsafeSlice;

#[allow(dead_code)]
pub(crate) fn box_blur_horizontal_pass_sse(
Expand Down
10 changes: 4 additions & 6 deletions src/lib/fast_gaussian_sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pub mod sse_support {
let mul_value = MUL_TABLE_DOUBLE[radius as usize];
let shr_value = SHR_TABLE_DOUBLE[radius as usize];
let v_mul_value = unsafe { _mm_set1_epi32(mul_value) };
let v_shr_value = unsafe { _mm_set1_epi32(shr_value) };
let v_shr_value = unsafe { _mm_setr_epi32(shr_value,0,0,0) };
for y in start..std::cmp::min(height, end) {
let mut diffs = unsafe { _mm_set1_epi32(0) };
let mut summs = unsafe { _mm_set1_epi32(initial_sum) };
Expand All @@ -64,9 +64,8 @@ pub mod sse_support {
if x >= 0 {
let current_px = ((std::cmp::max(x, 0) as u32) * CHANNELS_COUNT as u32) as usize;

const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
let prepared_px_s32 = unsafe {
_mm_sra_epi32(_mm_mullo_epi32(summs, v_mul_value), v_shr_value)
_mm_srl_epi32(_mm_mullo_epi32(summs, v_mul_value), v_shr_value)
};
let prepared_u16 = unsafe { _mm_packus_epi32(prepared_px_s32, prepared_px_s32) };
let prepared_u8 =
Expand Down Expand Up @@ -137,7 +136,7 @@ pub mod sse_support {
let mul_value = MUL_TABLE_DOUBLE[radius as usize];
let shr_value = SHR_TABLE_DOUBLE[radius as usize];
let v_mul_value = unsafe { _mm_set1_epi32(mul_value) };
let v_shr_value = unsafe { _mm_set1_epi32(shr_value) };
let v_shr_value = unsafe { _mm_setr_epi32(shr_value,0,0,0) };
for x in start..std::cmp::min(width, end) {
let mut diffs = unsafe { _mm_set1_epi32(0) };
let mut summs = unsafe { _mm_set1_epi32(initial_sum) };
Expand All @@ -148,9 +147,8 @@ pub mod sse_support {

if y >= 0 {
let current_px = ((std::cmp::max(x, 0)) * CHANNELS_COUNT as u32) as usize;
const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
let prepared_px_s32 = unsafe {
_mm_sra_epi32(_mm_mullo_epi32(summs, v_mul_value), v_shr_value)
_mm_srl_epi32(_mm_mullo_epi32(summs, v_mul_value), v_shr_value)
};
let prepared_u16 = unsafe { _mm_packus_epi32(prepared_px_s32, prepared_px_s32) };
let prepared_u8 =
Expand Down
26 changes: 13 additions & 13 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,23 +43,23 @@ fn main() {
// stride as u32,
// dimensions.0,
// dimensions.1,
// 319,
// 32,
// FastBlurChannels::Channels3,
// ThreadingPolicy::Adaptive,
// );

// libblur::gaussian_box_blur(
// &bytes,
// stride as u32,
// &mut dst_bytes,
// stride as u32,
// dimensions.0,
// dimensions.1,
// 32,
// FastBlurChannels::Channels3,
// ThreadingPolicy::Single,
// );
// bytes = dst_bytes;
libblur::gaussian_box_blur(
&bytes,
stride as u32,
&mut dst_bytes,
stride as u32,
dimensions.0,
dimensions.1,
35,
FastBlurChannels::Channels3,
ThreadingPolicy::Single,
);
bytes = dst_bytes;
// libblur::gaussian_blur(
// &bytes,
// stride as u32,
Expand Down

0 comments on commit 76c96d0

Please sign in to comment.