Skip to content

Commit

Permalink
Merge pull request #16 from awxkee/dev
Browse files (browse the repository at this point in the history)
Sharp yuv improvements, MSRV improvements
  • Loading branch information
awxkee authored Nov 28, 2024
2 parents 9502dbf + ad64a2a commit a4c0264
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 57 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ categories = ["multimedia::images", "multimedia::video"]
homepage = "https://github.com/awxkee/yuvutils-rs"
repository = "https://github.com/awxkee/yuvutils-rs"
exclude = ["*.jpg", "assets/*", "*.png"]
rust-version = "1.73.0"

[dependencies]
num-traits = "0.2.19"
Expand Down
1 change: 1 addition & 0 deletions src/from_identity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
#[cfg(feature = "rayon")]
use rayon::prelude::{ParallelSlice, ParallelSliceMut};
use std::fmt::Debug;
use std::mem::size_of;

#[inline]
fn gbr_to_rgbx_impl<
Expand Down
1 change: 1 addition & 0 deletions src/from_identity_alpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
#[cfg(feature = "rayon")]
use rayon::prelude::{ParallelSlice, ParallelSliceMut};
use std::fmt::Debug;
use std::mem::size_of;

#[inline]
fn gbr_to_rgbx_alpha_impl<
Expand Down
1 change: 1 addition & 0 deletions src/images.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ use crate::YuvError;
use std::fmt::Debug;

#[derive(Debug)]
/// Shared storage type
pub enum BufferStoreMut<'a, T: Copy + Debug> {
Borrowed(&'a mut [T]),
Owned(Vec<T>),
Expand Down
2 changes: 1 addition & 1 deletion src/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ pub(crate) use rgb_to_yuv_p16::{neon_rgba_to_yuv_p16, neon_rgba_to_yuv_p16_rdm};
pub(crate) use rgba_to_nv::{neon_rgbx_to_nv_row, neon_rgbx_to_nv_row_rdm};
pub(crate) use rgba_to_yuv::{neon_rgba_to_yuv, neon_rgba_to_yuv_rdm};
pub(crate) use rgba_to_yuv420::{neon_rgba_to_yuv420, neon_rgba_to_yuv_rdm420};
pub(crate) use y_p16_to_rgba16::neon_y_p16_to_rgba16_row;
pub(crate) use y_p16_to_rgba16::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
pub(crate) use y_to_rgb::{neon_y_to_rgb_row, neon_y_to_rgb_row_rdm};
pub(crate) use ycgco_to_rgb::neon_ycgco_to_rgb_row;
pub(crate) use ycgco_to_rgb_alpha::neon_ycgco_to_rgb_alpha_row;
Expand Down
146 changes: 111 additions & 35 deletions src/neon/y_p16_to_rgba16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,17 @@
use std::arch::aarch64::*;

use crate::internals::ProcessedOffset;
use crate::yuv_support::{
CbCrInverseTransform, YuvBytesPacking, YuvChromaRange, YuvEndianness, YuvSourceChannels,
};
use crate::neon::neon_simd_support::vldq_s16_endian;
use crate::yuv_support::{CbCrInverseTransform, YuvChromaRange, YuvSourceChannels};

pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
#[target_feature(enable = "rdm")]
pub(crate) unsafe fn neon_y_p16_to_rgba16_rdm<
const DESTINATION_CHANNELS: u8,
const ENDIANNESS: u8,
const BYTES_POSITION: u8,
const PRECISION: i32,
>(
y_ld_ptr: *const u16,
rgba: *mut u16,
y_ld_ptr: &[u16],
rgba: &mut [u16],
width: u32,
range: &YuvChromaRange,
transform: &CbCrInverseTransform<i32>,
Expand All @@ -50,66 +49,143 @@ pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
) -> ProcessedOffset {
let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
let channels = destination_channels.get_channels_count();
let endianness: YuvEndianness = ENDIANNESS.into();
let bytes_position: YuvBytesPacking = BYTES_POSITION.into();
let dst_ptr = rgba;

let y_corr = vdupq_n_s16(range.bias_y as i16);
let v_luma_coeff = vdupq_n_s16(transform.y_coef as i16);
let y_corr = vdupq_n_u16(range.bias_y as u16);
let v_min_values = vdupq_n_s16(0i16);
let v_alpha = vdupq_n_u16((1 << bit_depth) - 1);
let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16);

let mut cx = start_cx;

const V_SCALE: i32 = 2;

while cx + 8 < width as usize {
let y_values: int16x8_t;

match endianness {
YuvEndianness::BigEndian => {
let mut y_u_values = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(
vld1q_u16(y_ld_ptr.add(cx)),
)));
if bytes_position == YuvBytesPacking::MostSignificantBytes {
y_u_values = vshlq_u16(y_u_values, v_msb_shift);
}
y_values = vsubq_s16(vreinterpretq_s16_u16(y_u_values), y_corr);
let y_values: int16x8_t = vreinterpretq_s16_u16(vqsubq_u16(
vreinterpretq_u16_s16(vldq_s16_endian::<ENDIANNESS, BYTES_POSITION>(
y_ld_ptr.get_unchecked(cx..).as_ptr(),
v_msb_shift,
)),
y_corr,
));

let y_high = vqrdmulhq_n_s16(vshlq_n_s16::<V_SCALE>(y_values), transform.y_coef as i16);

let r_values = vminq_u16(
vreinterpretq_u16_s16(vmaxq_s16(y_high, v_min_values)),
v_alpha,
);

match destination_channels {
YuvSourceChannels::Rgb => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Bgr => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Rgba => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvEndianness::LittleEndian => {
let mut y_vl = vld1q_u16(y_ld_ptr.add(cx));
if bytes_position == YuvBytesPacking::MostSignificantBytes {
y_vl = vshlq_u16(y_vl, v_msb_shift);
}
y_values = vsubq_s16(vreinterpretq_s16_u16(y_vl), y_corr);
YuvSourceChannels::Bgra => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
}

cx += 8;
}

ProcessedOffset { cx, ux: 0 }
}

pub(crate) unsafe fn neon_y_p16_to_rgba16_row<
const DESTINATION_CHANNELS: u8,
const ENDIANNESS: u8,
const BYTES_POSITION: u8,
const PRECISION: i32,
>(
y_ld_ptr: &[u16],
rgba: &mut [u16],
width: u32,
range: &YuvChromaRange,
transform: &CbCrInverseTransform<i32>,
start_cx: usize,
bit_depth: usize,
) -> ProcessedOffset {
let destination_channels: YuvSourceChannels = DESTINATION_CHANNELS.into();
let channels = destination_channels.get_channels_count();
let dst_ptr = rgba;

let y_corr = vdupq_n_u16(range.bias_y as u16);
let v_luma_coeff = vdupq_n_s16(transform.y_coef as i16);
let v_alpha = vdupq_n_u16((1 << bit_depth) - 1);
let v_max_values = vdupq_n_s32((1 << bit_depth) - 1);
let v_msb_shift = vdupq_n_s16(bit_depth as i16 - 16);

let mut cx = start_cx;

while cx + 8 < width as usize {
let y_values: int16x8_t = vreinterpretq_s16_u16(vqsubq_u16(
vreinterpretq_u16_s16(vldq_s16_endian::<ENDIANNESS, BYTES_POSITION>(
y_ld_ptr.get_unchecked(cx..).as_ptr(),
v_msb_shift,
)),
y_corr,
));

let y_high = vmull_high_s16(y_values, v_luma_coeff);

let r_high = vrshrn_n_s32::<PRECISION>(y_high);
let r_high = vqmovun_s32(vminq_s32(vrshrq_n_s32::<PRECISION>(y_high), v_max_values));

let y_low = vmull_s16(vget_low_s16(y_values), vget_low_s16(v_luma_coeff));

let r_low = vrshrn_n_s32::<PRECISION>(y_low);
let r_low = vqmovun_s32(vminq_s32(vrshrq_n_s32::<PRECISION>(y_low), v_max_values));

let r_values = vreinterpretq_u16_s16(vmaxq_s16(vcombine_s16(r_low, r_high), v_min_values));
let r_values = vcombine_u16(r_low, r_high);

match destination_channels {
YuvSourceChannels::Rgb => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
vst3q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Bgr => {
let dst_pack = uint16x8x3_t(r_values, r_values, r_values);
vst3q_u16(dst_ptr.add(cx * channels), dst_pack);
vst3q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Rgba => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
vst4q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
YuvSourceChannels::Bgra => {
let dst_pack = uint16x8x4_t(r_values, r_values, r_values, v_alpha);
vst4q_u16(dst_ptr.add(cx * channels), dst_pack);
vst4q_u16(
dst_ptr.get_unchecked_mut(cx * channels..).as_mut_ptr(),
dst_pack,
);
}
}

Expand Down
24 changes: 12 additions & 12 deletions src/sharpyuv/sharp_rgba_to_yuv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -495,10 +495,10 @@ fn rgbx_to_sharp_yuv<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
};
sharpen_row420::<ORIGIN_CHANNELS, SAMPLING, PRECISION>(
y,
rgba,
y_plane,
u_plane,
v_plane,
&rgba[0..planar_image.width as usize * src_chans.get_channels_count()],
&mut y_plane[0..planar_image.width as usize],
&mut u_plane[0..(planar_image.width as usize).div_ceil(2)],
&mut v_plane[0..(planar_image.width as usize).div_ceil(2)],
rgb_layout_lane,
rgb_layout_next_lane,
&gamma_map_table,
Expand All @@ -513,10 +513,10 @@ fn rgbx_to_sharp_yuv<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
let rgb_layout_lane = &rgb_layout
[rgb_layout_start..((planar_image.width as usize) * 3 + rgb_layout_start)];
sharpen_row422::<ORIGIN_CHANNELS, SAMPLING, PRECISION>(
rgba,
y_plane,
u_plane,
v_plane,
&rgba[0..planar_image.width as usize * src_chans.get_channels_count()],
&mut y_plane[0..planar_image.width as usize],
&mut u_plane[0..(planar_image.width as usize).div_ceil(2)],
&mut v_plane[0..(planar_image.width as usize).div_ceil(2)],
rgb_layout_lane,
&gamma_map_table,
&chroma_range,
Expand Down Expand Up @@ -564,10 +564,10 @@ fn rgbx_to_sharp_yuv<const ORIGIN_CHANNELS: u8, const SAMPLING: u8>(
let rgb_layout_next_lane: &[u16] = rgb_layout;
sharpen_row420::<ORIGIN_CHANNELS, SAMPLING, PRECISION>(
y,
rgba,
y_plane,
u_plane,
v_plane,
&rgba[0..planar_image.width as usize * src_chans.get_channels_count()],
&mut y_plane[0..planar_image.width as usize],
&mut u_plane[0..(planar_image.width as usize).div_ceil(2)],
&mut v_plane[0..(planar_image.width as usize).div_ceil(2)],
rgb_layout_lane,
rgb_layout_next_lane,
&gamma_map_table,
Expand Down
1 change: 1 addition & 0 deletions src/to_identity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ use crate::yuv_support::{get_yuv_range, YuvSourceChannels};
use crate::{YuvChromaSubsampling, YuvError, YuvPlanarImageMut, YuvRange};
use num_traits::AsPrimitive;
use std::fmt::Debug;
use std::mem::size_of;

#[inline]
fn rgbx_to_gbr_impl<
Expand Down
22 changes: 13 additions & 9 deletions src/y_p16_to_rgb16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
*/
use crate::built_coefficients::get_built_inverse_transform;
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::neon::neon_y_p16_to_rgba16_row;
use crate::neon::{neon_y_p16_to_rgba16_rdm, neon_y_p16_to_rgba16_row};
use crate::yuv_support::*;
use crate::{YuvError, YuvGrayImage};
#[cfg(feature = "rayon")]
Expand Down Expand Up @@ -92,6 +92,15 @@ fn yuv400_p16_to_rgbx<
.zip(image.y_plane.chunks_exact(image.y_stride as usize));
}

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
let neon_wide_handler = if is_rdm_available && bit_depth <= 12 {
neon_y_p16_to_rgba16_rdm::<DESTINATION_CHANNELS, ENDIANNESS, BYTES_POSITION>
} else {
neon_y_p16_to_rgba16_row::<DESTINATION_CHANNELS, ENDIANNESS, BYTES_POSITION, PRECISION>
};

match range {
YuvRange::Limited => {
iter.for_each(|(rgba16, y_plane)| {
Expand All @@ -101,14 +110,9 @@ fn yuv400_p16_to_rgbx<
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
unsafe {
let offset = neon_y_p16_to_rgba16_row::<
DESTINATION_CHANNELS,
ENDIANNESS,
BYTES_POSITION,
PRECISION,
>(
y_plane.as_ptr(),
rgba16.as_mut_ptr(),
let offset = neon_wide_handler(
y_plane,
rgba16,
image.width,
&chroma_range,
&i_transform,
Expand Down

0 comments on commit a4c0264

Please sign in to comment.