Skip to content

Commit

Permalink
Saturating, rounding, bugfixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
awxkee committed Sep 5, 2024
1 parent fbab0cb commit 10ce13e
Show file tree
Hide file tree
Showing 19 changed files with 221 additions and 154 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ workspace = { members = ["app"] }

[package]
name = "yuvutils-rs"
version = "0.4.4"
version = "0.4.5"
edition = "2021"
description = "High performance utilities for YUV format handling and conversion."
readme = "README.md"
Expand Down
13 changes: 7 additions & 6 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ fn main() {
let end_time = Instant::now().sub(start_time);
println!("yuv_nv12_to_rgb time: {:?}", end_time);
let start_time = Instant::now();
rgb_to_yuv420(
rgb_to_sharp_yuv420(
&mut y_plane,
y_stride as u32,
&mut u_plane,
Expand All @@ -123,8 +123,9 @@ fn main() {
width * components,
width,
height,
YuvRange::TV,
YuvStandardMatrix::Bt709,
YuvRange::Full,
YuvStandardMatrix::Bt601,
SharpYuvGammaTransfer::Srgb,
);

// let mut y_plane_16 = vec![0u16; width as usize * height as usize];
Expand Down Expand Up @@ -250,8 +251,8 @@ fn main() {
rgba_stride as u32,
width,
height,
YuvRange::TV,
YuvStandardMatrix::Bt709,
YuvRange::Full,
YuvStandardMatrix::Bt601,
);

let end_time = Instant::now().sub(start_time);
Expand Down Expand Up @@ -287,7 +288,7 @@ fn main() {
// rgba = Vec::from(gbr);

image::save_buffer(
"converted.png",
"converted_sharp.png",
rgba.as_bytes(),
dimensions.0,
dimensions.1,
Expand Down
6 changes: 5 additions & 1 deletion src/avx512bw/y_to_rgb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub unsafe fn avx512_y_to_rgb_row<const DESTINATION_CHANNELS: u8>(
let v_luma_coeff = _mm512_set1_epi16(transform.y_coef as i16);
let v_min_values = _mm512_setzero_si512();
let v_alpha = _mm512_set1_epi8(255u8 as i8);
let rounding_const = _mm512_set1_epi16(1 << 5);

while cx + 64 < width {
let y_values = _mm512_subs_epi8(
Expand All @@ -53,7 +54,10 @@ pub unsafe fn avx512_y_to_rgb_row<const DESTINATION_CHANNELS: u8>(
v_luma_coeff,
);

let r_low = _mm512_srli_epi16::<6>(_mm512_max_epi16(y_low, v_min_values));
let r_low = _mm512_srli_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(y_low, v_min_values),
rounding_const,
));

let r_values = avx512_pack_u16(r_low, r_high);

Expand Down
37 changes: 19 additions & 18 deletions src/avx512bw/ycgco_to_rgb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ pub unsafe fn avx512_ycgco_to_rgb_row<const DESTINATION_CHANNELS: u8, const SAMP
let uv_reduction = _mm512_set1_epi16(range_reduction_uv as i16);
let v_alpha = _mm512_set1_epi16(-128);
let v_min_zeros = _mm512_setzero_si512();
let rounding_const = _mm512_set1_epi16(1 << 5);

while cx + 64 < width {
let y_values = _mm512_loadu_si512(y_ptr.add(cx) as *const i32);
Expand Down Expand Up @@ -109,17 +110,17 @@ pub unsafe fn avx512_ycgco_to_rgb_row<const DESTINATION_CHANNELS: u8, const SAMP

let t_high = _mm512_subs_epi16(y_high, cg_high);

let r_high = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(t_high, co_high),
v_min_zeros,
let r_high = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_adds_epi16(t_high, co_high), v_min_zeros),
rounding_const,
));
let b_high = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_subs_epi16(t_high, co_high),
v_min_zeros,
let b_high = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_subs_epi16(t_high, co_high), v_min_zeros),
rounding_const,
));
let g_high = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(y_high, cg_high),
v_min_zeros,
let g_high = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_adds_epi16(y_high, cg_high), v_min_zeros),
rounding_const,
));

let cg_low = _mm512_mullo_epi16(
Expand All @@ -140,17 +141,17 @@ pub unsafe fn avx512_ycgco_to_rgb_row<const DESTINATION_CHANNELS: u8, const SAMP

let t_low = _mm512_subs_epi16(y_low, cg_low);

let r_low = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(t_low, co_low),
v_min_zeros,
let r_low = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_adds_epi16(t_low, co_low), v_min_zeros),
rounding_const,
));
let b_low = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_subs_epi16(t_low, co_low),
v_min_zeros,
let b_low = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_subs_epi16(t_low, co_low), v_min_zeros),
rounding_const,
));
let g_low = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(y_low, cg_low),
v_min_zeros,
let g_low = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_adds_epi16(y_low, cg_low), v_min_zeros),
rounding_const,
));

let r_values = avx512_pack_u16(r_low, r_high);
Expand Down
39 changes: 20 additions & 19 deletions src/avx512bw/ycgco_to_rgba_alpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ pub unsafe fn avx512_ycgco_to_rgba_alpha<const DESTINATION_CHANNELS: u8, const S
let uv_reduction = _mm512_set1_epi16(range_reduction_uv as i16);
let v_alpha = _mm512_set1_epi16(-128);
let v_min_zeros = _mm512_setzero_si512();
let rounding_const = _mm512_set1_epi16(1 << 5);

while cx + 64 < width {
let y_values = _mm512_loadu_si512(y_ptr.add(cx) as *const i32);
Expand Down Expand Up @@ -114,17 +115,17 @@ pub unsafe fn avx512_ycgco_to_rgba_alpha<const DESTINATION_CHANNELS: u8, const S

let t_high = _mm512_subs_epi16(y_high, cg_high);

let r_high = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(t_high, co_high),
v_min_zeros,
let r_high = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_adds_epi16(t_high, co_high), v_min_zeros),
rounding_const,
));
let b_high = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_subs_epi16(t_high, co_high),
v_min_zeros,
let b_high = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_subs_epi16(t_high, co_high), v_min_zeros),
rounding_const,
));
let g_high = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(y_high, cg_high),
v_min_zeros,
let g_high = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_adds_epi16(y_high, cg_high), v_min_zeros),
rounding_const,
));

let cg_low = _mm512_mullo_epi16(
Expand All @@ -145,17 +146,17 @@ pub unsafe fn avx512_ycgco_to_rgba_alpha<const DESTINATION_CHANNELS: u8, const S

let t_low = _mm512_subs_epi16(y_low, cg_low);

let r_low = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(t_low, co_low),
v_min_zeros,
let r_low = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_adds_epi16(t_low, co_low), v_min_zeros),
rounding_const,
));
let b_low = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_subs_epi16(t_low, co_low),
v_min_zeros,
let b_low = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_subs_epi16(t_low, co_low), v_min_zeros),
rounding_const,
));
let g_low = _mm512_srai_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(y_low, cg_low),
v_min_zeros,
let g_low = _mm512_srai_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(_mm512_adds_epi16(y_low, cg_low), v_min_zeros),
rounding_const,
));

let (r_values, g_values, b_values);
Expand Down Expand Up @@ -223,5 +224,5 @@ pub unsafe fn avx512_ycgco_to_rgba_alpha<const DESTINATION_CHANNELS: u8, const S
}
}

return ProcessedOffset { cx, ux: uv_x };
ProcessedOffset { cx, ux: uv_x }
}
67 changes: 43 additions & 24 deletions src/avx512bw/yuv_nv_to_rgba.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ pub unsafe fn avx512_yuv_nv_to_rgba<
let v_g_coeff_1 = _mm512_set1_epi16(-1 * transform.g_coeff_1 as i16);
let v_g_coeff_2 = _mm512_set1_epi16(-1 * transform.g_coeff_2 as i16);
let v_alpha = _mm512_set1_epi8(255u8 as i8);
let rounding_const = _mm512_set1_epi16(1 << 5);

while cx + 32 < width {
let y_values = _mm512_subs_epu8(
Expand Down Expand Up @@ -120,23 +121,32 @@ pub unsafe fn avx512_yuv_nv_to_rgba<
v_luma_coeff,
);

let r_high = _mm512_srli_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(y_high, _mm512_mullo_epi16(v_high, v_cr_coeff)),
v_min_values,
let r_high = _mm512_srli_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(
_mm512_adds_epi16(y_high, _mm512_mullo_epi16(v_high, v_cr_coeff)),
v_min_values,
),
rounding_const,
));
let b_high = _mm512_srli_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(y_high, _mm512_mullo_epi16(u_high, v_cb_coeff)),
v_min_values,
let b_high = _mm512_srli_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(
_mm512_adds_epi16(y_high, _mm512_mullo_epi16(u_high, v_cb_coeff)),
v_min_values,
),
rounding_const,
));
let g_high = _mm512_srli_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(
y_high,
let g_high = _mm512_srli_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(
_mm512_adds_epi16(
_mm512_mullo_epi16(v_high, v_g_coeff_1),
_mm512_mullo_epi16(u_high, v_g_coeff_2),
y_high,
_mm512_adds_epi16(
_mm512_mullo_epi16(v_high, v_g_coeff_1),
_mm512_mullo_epi16(u_high, v_g_coeff_2),
),
),
v_min_values,
),
v_min_values,
rounding_const,
));

let u_low = _mm512_subs_epi16(_mm512_cvtepu8_epi16(u_low_u8), uv_corr);
Expand All @@ -146,23 +156,32 @@ pub unsafe fn avx512_yuv_nv_to_rgba<
v_luma_coeff,
);

let r_low = _mm512_srli_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(y_low, _mm512_mullo_epi16(v_low, v_cr_coeff)),
v_min_values,
let r_low = _mm512_srli_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(
_mm512_adds_epi16(y_low, _mm512_mullo_epi16(v_low, v_cr_coeff)),
v_min_values,
),
rounding_const,
));
let b_low = _mm512_srli_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(y_low, _mm512_mullo_epi16(u_low, v_cb_coeff)),
v_min_values,
let b_low = _mm512_srli_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(
_mm512_adds_epi16(y_low, _mm512_mullo_epi16(u_low, v_cb_coeff)),
v_min_values,
),
rounding_const,
));
let g_low = _mm512_srli_epi16::<6>(_mm512_max_epi16(
_mm512_adds_epi16(
y_low,
let g_low = _mm512_srli_epi16::<6>(_mm512_adds_epi16(
_mm512_max_epi16(
_mm512_adds_epi16(
_mm512_mullo_epi16(v_low, v_g_coeff_1),
_mm512_mullo_epi16(u_low, v_g_coeff_2),
y_low,
_mm512_adds_epi16(
_mm512_mullo_epi16(v_low, v_g_coeff_1),
_mm512_mullo_epi16(u_low, v_g_coeff_2),
),
),
v_min_values,
),
v_min_values,
rounding_const,
));

let r_values = avx512_pack_u16(r_low, r_high);
Expand Down
Loading

0 comments on commit 10ce13e

Please sign in to comment.