Skip to content

Commit

Permalink
AVX-2 encoding improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Dec 20, 2024
1 parent 2939263 commit 487366b
Show file tree
Hide file tree
Showing 2 changed files with 172 additions and 1 deletion.
50 changes: 50 additions & 0 deletions src/avx2/avx2_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,56 @@ pub(crate) unsafe fn _mm256_load_deinterleave_rgb_for_yuv<const ORIGINS: u8>(
(r_values, g_values, b_values)
}

/// Loads 16 interleaved 8-bit pixels and splits them into per-channel vectors.
///
/// This is the "half" counterpart of the full-width loader: it reads 48 bytes
/// (one 256-bit load followed by one 128-bit load) for the 3-channel layouts
/// and 64 bytes (two 256-bit loads) for the 4-channel layouts. The remaining
/// inputs of the deinterleave helpers are filled with zero vectors.
///
/// The returned tuple is always ordered `(r, g, b)`: for `Bgr`/`Bgra` the
/// first and third deinterleaved planes are swapped. The fourth plane produced
/// for 4-channel layouts is discarded.
///
/// # Safety
///
/// * `ptr` must be valid for unaligned reads of 48 bytes (`Rgb`/`Bgr`) or
///   64 bytes (`Rgba`/`Bgra`).
/// * The body uses AVX2 intrinsics; the caller must ensure the CPU supports them.
#[inline(always)]
pub(crate) unsafe fn _mm256_load_deinterleave_half_rgb_for_yuv<const ORIGINS: u8>(
    ptr: *const u8,
) -> (__m256i, __m256i, __m256i) {
    let layout: YuvSourceChannels = ORIGINS.into();
    let zero = _mm256_setzero_si256();

    // Planes in memory order: `first` is whichever channel is stored first.
    let (first, second, third) = match layout {
        YuvSourceChannels::Rgb | YuvSourceChannels::Bgr => {
            let head = _mm256_loadu_si256(ptr as *const __m256i);
            let tail = _mm_loadu_si128(ptr.add(32) as *const __m128i);

            // NOTE(review): `_mm256_castsi128_si256` leaves the upper 128 bits
            // undefined per Intel's documentation. Presumably only the lanes
            // derived from the 48 loaded bytes are consumed downstream -
            // confirm against `avx2_deinterleave_rgb`, or zero-extend instead.
            avx2_deinterleave_rgb(head, _mm256_castsi128_si256(tail), zero)
        }
        YuvSourceChannels::Rgba | YuvSourceChannels::Bgra => {
            let head = _mm256_loadu_si256(ptr as *const __m256i);
            let tail = _mm256_loadu_si256(ptr.add(32) as *const __m256i);

            let (c0, c1, c2, _alpha) = _mm256_deinterleave_rgba_epi8(head, tail, zero, zero);
            (c0, c1, c2)
        }
    };

    // Normalise to (R, G, B) regardless of the storage order.
    match layout {
        YuvSourceChannels::Rgb | YuvSourceChannels::Rgba => (first, second, third),
        YuvSourceChannels::Bgr | YuvSourceChannels::Bgra => (third, second, first),
    }
}

#[inline(always)]
pub(crate) unsafe fn _mm256_store_interleave_rgb16_for_yuv<const ORIGINS: u8>(
ptr: *mut u16,
Expand Down
123 changes: 122 additions & 1 deletion src/avx2/rgba_to_yuv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
*/

use crate::avx2::avx2_utils::{
_mm256_load_deinterleave_rgb_for_yuv, avx2_pack_u16, avx_pairwise_avg_epi16_epi8,
_mm256_load_deinterleave_half_rgb_for_yuv, _mm256_load_deinterleave_rgb_for_yuv, avx2_pack_u16,
avx_pairwise_avg_epi16_epi8,
};
use crate::internals::ProcessedOffset;
use crate::yuv_support::{
Expand Down Expand Up @@ -280,5 +281,125 @@ unsafe fn avx2_rgba_to_yuv_impl<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
cx += 32;
}

while cx + 16 < width {
let px = cx * channels;
let (r_values, g_values, b_values) =
_mm256_load_deinterleave_half_rgb_for_yuv::<ORIGIN_CHANNELS>(rgba_ptr.add(px));

let r_low =
_mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_values)));
let g_low =
_mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_values)));
let b_low =
_mm256_slli_epi16::<V_SCALE>(_mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_values)));

let y_l = _mm256_min_epi16(
_mm256_add_epi16(
y_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhrs_epi16(r_low, v_yr),
_mm256_mulhrs_epi16(g_low, v_yg),
),
_mm256_mulhrs_epi16(b_low, v_yb),
),
),
i_cap_y,
);

let y_yuv = avx2_pack_u16(y_l, y_l);
_mm_storeu_si128(y_ptr.add(cx) as *mut __m128i, _mm256_castsi256_si128(y_yuv));

if chroma_subsampling == YuvChromaSubsampling::Yuv444 {
let cb_l = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_add_epi16(
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhrs_epi16(r_low, v_cb_r),
_mm256_mulhrs_epi16(g_low, v_cb_g),
),
_mm256_mulhrs_epi16(b_low, v_cb_b),
),
),
i_cap_uv,
),
y_bias,
);
let cr_l = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_add_epi16(
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhrs_epi16(r_low, v_cr_r),
_mm256_mulhrs_epi16(g_low, v_cr_g),
),
_mm256_mulhrs_epi16(b_low, v_cr_b),
),
),
i_cap_uv,
),
y_bias,
);

let cb = avx2_pack_u16(cb_l, cb_l);
let cr = avx2_pack_u16(cr_l, cr_l);

_mm_storeu_si128(u_ptr.add(uv_x) as *mut __m128i, _mm256_castsi256_si128(cb));
_mm_storeu_si128(v_ptr.add(uv_x) as *mut __m128i, _mm256_castsi256_si128(cr));
uv_x += 16;
} else if chroma_subsampling == YuvChromaSubsampling::Yuv422
|| (chroma_subsampling == YuvChromaSubsampling::Yuv420)
{
let r1 = _mm256_slli_epi16::<V_SCALE>(avx_pairwise_avg_epi16_epi8(r_values));
let g1 = _mm256_slli_epi16::<V_SCALE>(avx_pairwise_avg_epi16_epi8(g_values));
let b1 = _mm256_slli_epi16::<V_SCALE>(avx_pairwise_avg_epi16_epi8(b_values));
let cb = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_add_epi16(
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhrs_epi16(r1, v_cb_r),
_mm256_mulhrs_epi16(g1, v_cb_g),
),
_mm256_mulhrs_epi16(b1, v_cb_b),
),
),
i_cap_uv,
),
y_bias,
);
let cr = _mm256_max_epi16(
_mm256_min_epi16(
_mm256_add_epi16(
uv_bias,
_mm256_add_epi16(
_mm256_add_epi16(
_mm256_mulhrs_epi16(r1, v_cr_r),
_mm256_mulhrs_epi16(g1, v_cr_g),
),
_mm256_mulhrs_epi16(b1, v_cr_b),
),
),
i_cap_uv,
),
y_bias,
);

let cb = _mm256_castsi256_si128(avx2_pack_u16(cb, cb));
let cr = _mm256_castsi256_si128(avx2_pack_u16(cr, cr));

std::ptr::copy_nonoverlapping(&cb as *const _ as *const u8, u_ptr.add(uv_x), 8);
std::ptr::copy_nonoverlapping(&cr as *const _ as *const u8, v_ptr.add(uv_x), 8);

uv_x += 8;
}

cx += 16;
}

ProcessedOffset { cx, ux: uv_x }
}

0 comments on commit 487366b

Please sign in to comment.