Skip to content

Commit

Permalink
Improve SSE, improve Stackblur
Browse files: browse the repository at this point in the history
  • Loading branch information
awxkee committed Jun 11, 2024
1 parent 01ec35e commit beb2c4f
Show file tree
Hide file tree
Showing 8 changed files with 348 additions and 197 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions src/lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "libblur"
version = "0.10.6"
version = "0.10.7"
edition = "2021"
description = "High performance blur in pure rust"
readme = "../../README.md"
Expand All @@ -11,7 +11,7 @@ documentation = "https://github.com/awxkee/libblur"
categories = ["multimedia::images", "multimedia::video", "algorithms"]
homepage = "https://github.com/awxkee/libblur"
repository = "https://github.com/awxkee/libblur.git"
exclude = ["*.jpg", "../../assets/*"]
exclude = ["*.jpg", "../../assets/*", "*.png"]

[lib]
name = "libblur"
Expand Down
42 changes: 32 additions & 10 deletions src/lib/fast_gaussian_next_sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,23 @@ pub mod sse_support {
unsafe { _mm_packus_epi16(prepared_u16, prepared_u16) };

let pixel = unsafe { _mm_extract_epi32::<0>(prepared_u8) };
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(current_y + current_px, bits[0]);
bytes.write(current_y + current_px + 1, bits[1]);
bytes.write(current_y + current_px + 2, bits[2]);
let bytes_offset = current_y + current_px;

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr =
(bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut i32;
*dst_ptr = pixel;
}
} else {
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(bytes_offset, bits[0]);
bytes.write(bytes_offset + 1, bits[1]);
bytes.write(bytes_offset + 2, bits[2]);
}
}

let d_arr_index_1 = ((y + radius_64) & 1023) as usize;
Expand Down Expand Up @@ -177,12 +188,23 @@ pub mod sse_support {
unsafe { _mm_packus_epi16(prepared_u16, prepared_u16) };

let pixel = unsafe { _mm_extract_epi32::<0>(prepared_u8) };
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(current_y + current_px, bits[0]);
bytes.write(current_y + current_px + 1, bits[1]);
bytes.write(current_y + current_px + 2, bits[2]);
let bytes_offset = current_y + current_px;

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr =
(bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut i32;
*dst_ptr = pixel;
}
} else {
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(bytes_offset, bits[0]);
bytes.write(bytes_offset + 1, bits[1]);
bytes.write(bytes_offset + 2, bits[2]);
}
}

let d_arr_index_1 = ((x + radius_64) & 1023) as usize;
Expand Down
42 changes: 32 additions & 10 deletions src/lib/fast_gaussian_sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,23 @@ pub mod sse_support {
let prepared_u8 =
unsafe { _mm_packus_epi16(prepared_u16, prepared_u16) };
let pixel = unsafe { _mm_extract_epi32::<0>(prepared_u8) };
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(current_y + current_px, bits[0]);
bytes.write(current_y + current_px + 1, bits[1]);
bytes.write(current_y + current_px + 2, bits[2]);
let bytes_offset = current_y + current_px;

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr =
(bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut i32;
*dst_ptr = pixel;
}
} else {
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(current_y + current_px, bits[0]);
bytes.write(current_y + current_px + 1, bits[1]);
bytes.write(current_y + current_px + 2, bits[2]);
}
}

let arr_index = ((x - radius_64) & 1023) as usize;
Expand Down Expand Up @@ -155,12 +166,23 @@ pub mod sse_support {
unsafe { _mm_packus_epi16(prepared_u16, prepared_u16) };

let pixel = unsafe { _mm_extract_epi32::<0>(prepared_u8) };
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(current_y + current_px, bits[0]);
bytes.write(current_y + current_px + 1, bits[1]);
bytes.write(current_y + current_px + 2, bits[2]);
let bytes_offset = current_y + current_px;

if CHANNELS_COUNT == 4 {
unsafe {
let dst_ptr =
(bytes.slice.as_ptr() as *mut u8).add(bytes_offset) as *mut i32;
*dst_ptr = pixel;
}
} else {
let bits = pixel.to_le_bytes();

unsafe {
bytes.write(bytes_offset, bits[0]);
bytes.write(bytes_offset + 1, bits[1]);
bytes.write(bytes_offset + 2, bits[2]);
}
}

let arr_index = ((y - radius_64) & 1023) as usize;
Expand Down
Loading

0 comments on commit beb2c4f

Please sign in to comment.