diff --git a/README.md b/README.md index 5a5b2b0..c7aa60a 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ YUV 8 bit-depth conversion `aarch64` tested on Mac Pro M3. AVX2 tests performed on `Digital Ocean Shared Premium Intel 2 vCPU` droplet. +AVX2 Win test performed on Windows 11 Intel Core i9-14900HX. ```bash cargo bench --bench yuv8 --manifest-path ./app/Cargo.toml @@ -87,31 +88,31 @@ cargo +nightly bench --bench yuv8 --manifest-path ./app/Cargo.toml --features ni ### Encoding 8-bit -| Conversion | time(NEON) | Time(AVX2) | Time(AVX-512) | -|------------------------|:----------:|:----------:|:-------------:| -| utils RGB->YUV 4:2:0 | 3.16ms | 16.28ms | 8.93ms | -| libyuv RGB->YUV 4:2:0 | 3.58ms | 17.64ms | 12.83ms | -| utils RGBA->YUV 4:2:0 | 4.04ms | 12.63ms | 10.27ms | -| libyuv RGBA->YUV 4:2:0 | 4.87ms | 11.27ms | 10.73ms | -| utils RGBA->YUV 4:2:2 | 4.34ms | 24.02ms | 18.53ms | -| libyuv RGBA->YUV 4:2:2 | 5.90ms | 19.43ms | 17.94ms | -| utils RGBA->YUV 4:4:4 | 4.49ms | 29.18ms | 22.08ms | +| Conversion | time(NEON) | Time(AVX2 Win) | Time(AVX2) | Time(AVX-512) | +|------------------------|:----------:|:--------------:|:----------:|:-------------:| +| utils RGB->YUV 4:2:0 | 3.16ms | 5.54ms | 16.28ms | 8.93ms | +| libyuv RGB->YUV 4:2:0 | 3.58ms | 34.30ms | 17.64ms | 12.83ms | +| utils RGBA->YUV 4:2:0 | 4.04ms | 5.78ms | 12.63ms | 10.27ms | +| libyuv RGBA->YUV 4:2:0 | 4.87ms | 25.29ms | 11.27ms | 10.73ms | +| utils RGBA->YUV 4:2:2 | 4.34ms | 7.35ms | 24.02ms | 18.53ms | +| libyuv RGBA->YUV 4:2:2 | 5.90ms | 37.65ms | 19.43ms | 17.94ms | +| utils RGBA->YUV 4:4:4 | 4.49ms | 8.97ms | 29.18ms | 22.08ms | ### Decoding 8-bit -| Conversion | time(NEON) | Time(AVX2) | Time(AVX-512) | -|------------------------|:----------:|:----------:|:-------------:| -| utils YUV NV12->RGBA | 3.91ms | 22.59ms | 18.55ms | -| utils YUV NV12->RGB | 3.28ms | 17.56ms | 13.64ms | -| libyuv YUV NV12->RGB | 5.20ms | 22.27ms | 18.55ms | -| utils YUV 4:2:0->RGB | 3.15ms | 17.69ms | 13.70ms | -| libyuv YUV 4:2:0->RGB | 5.70ms | 23.91ms | 20.07ms | -| utils YUV 4:2:0->RGBA | 3.70ms | 20.81ms | 18.84ms | -| libyuv YUV 4:2:0->RGBA | 6.13ms | 24.32ms | 18.50ms | -| utils YUV 4:2:2->RGBA | 4.05ms | 24.44ms | 22.05ms | -| libyuv YUV 4:2:2->RGBA | 5.91ms | 23.72ms | 18.71ms | -| utils YUV 4:4:4->RGBA | 3.91ms | 27.58ms | 22.85ms | -| libyuv YUV 4:4:4->RGBA | 4.82ms | 34.60ms | 21.47ms | +| Conversion | time(NEON) | Time(AVX2 Win) | Time(AVX2) | Time(AVX-512) | +|------------------------|:----------:|:--------------:|:----------:|:-------------:| +| utils YUV NV12->RGBA | 3.91ms | 5.15ms | 22.59ms | 18.55ms | +| utils YUV NV12->RGB | 3.28ms | 6.71ms | 17.56ms | 13.64ms | +| libyuv YUV NV12->RGB | 5.20ms | 50.16ms | 22.27ms | 18.55ms | +| utils YUV 4:2:0->RGB | 3.15ms | 5.15ms | 17.69ms | 13.70ms | +| libyuv YUV 4:2:0->RGB | 5.70ms | 48.52ms | 23.91ms | 20.07ms | +| utils YUV 4:2:0->RGBA | 3.70ms | 6.70ms | 20.81ms | 18.84ms | +| libyuv YUV 4:2:0->RGBA | 6.13ms | 7.20ms | 24.32ms | 18.50ms | +| utils YUV 4:2:2->RGBA | 4.05ms | 7.61ms | 24.44ms | 22.05ms | +| libyuv YUV 4:2:2->RGBA | 5.91ms | 7.48ms | 23.72ms | 18.71ms | +| utils YUV 4:4:4->RGBA | 3.91ms | 7.65ms | 27.58ms | 22.85ms | +| libyuv YUV 4:4:4->RGBA | 4.82ms | 7.55ms | 34.60ms | 21.47ms | YUV 16 bit-depth conversion @@ -123,29 +124,29 @@ cargo bench --bench yuv16 --manifest-path ./app/Cargo.toml 10-bit encoding is not implemented in `libyuv` -| | time(NEON) | Time(AVX2) | -|----------------------------|:----------:|:----------:| -| utils RGB10->YUV10 4:2:0 | 4.98ms | 33.88ms | -| libyuv RGB10->YUV10 4:2:0 | x | x | -| utils RGBA10->YUV10 4:2:0 | 6.03ms | 32.69ms | -| libyuv RGBA10->YUV10 4:2:0 | x | x | -| utils RGBA10->YUV10 4:2:2 | 5.99ms | 50.26ms | -| libyuv RGBA10->YUV10 4:2:2 | x | x | -| utils RGBA10->YUV10 4:4:4 | 4.84ms | 70.11ms | +| | time(NEON) | Time(AVX2 Win) | Time(AVX2) | +|----------------------------|:----------:|:--------------:|:----------:| +| utils RGB10->YUV10 4:2:0 | 4.98ms | 9.13ms | 33.88ms | +| libyuv RGB10->YUV10 4:2:0 | x | x | x | +| utils RGBA10->YUV10 4:2:0 | 6.03ms | 10.82ms | 32.69ms | +| libyuv RGBA10->YUV10 4:2:0 | x | x | x | +| utils RGBA10->YUV10 4:2:2 | 5.99ms | 14.74ms | 50.26ms | +| libyuv RGBA10->YUV10 4:2:2 | x | x | x | +| utils RGBA10->YUV10 4:4:4 | 4.84ms | 16.49ms | 70.11ms | ### Decoding 10-bit -| | time(NEON) | Time(AVX2) | -|----------------------------|:----------:|:----------:| -| utils YUV10 4:2:0->RGB10 | 5.64ms | 45.58ms | -| libyuv YUV10 4:2:0->RGB10 | - | - | -| utils YUV10 4:2:0->RGBA10 | 6.03ms | 65.95ms | -| utils YUV10 4:2:0->RGBA8 | 6.94ms | 31.15ms | -| libyuv YUV10 4:2:0->RGBA8 | 12.39ms | 24.59ms | -| utils YUV10 4:2:2->RGBA10 | 5.88ms | 59.44ms | -| utils YUV10 4:2:2->RGBA8 | 7.33ms | 29.15ms | -| libyuv YUV10 4:2:2->RGBA10 | 12.40ms | 29.96ms | -| utils YUV10 4:4:4->RGBA10 | 6.01ms | 70.84ms | +| | time(NEON) | Time(AVX2 Win) | Time(AVX2) | +|---------------------------|:----------:|:--------------:|:----------:| +| utils YUV10 4:2:0->RGB10 | 5.64ms | 11.06ms | 45.58ms | +| libyuv YUV10 4:2:0->RGB10 | - | - | - | +| utils YUV10 4:2:0->RGBA10 | 6.03ms | 14.85ms | 65.95ms | +| utils YUV10 4:2:0->RGBA8 | 6.94ms | 8.77ms | 31.15ms | +| libyuv YUV10 4:2:0->RGBA8 | 12.39ms | 62.01ms | 24.59ms | +| utils YUV10 4:2:2->RGBA10 | 5.88ms | 15.92ms | 59.44ms | +| utils YUV10 4:2:2->RGBA8 | 7.33ms | 8.76ms | 29.15ms | +| libyuv YUV10 4:2:2->RGBA8 | 12.40ms | 61.28ms | 29.96ms | +| utils YUV10 4:4:4->RGBA10 | 6.01ms | 16.09ms | 70.84ms | This project is licensed under either of diff --git a/src/avx512bw/rgba_to_yuv.rs b/src/avx512bw/rgba_to_yuv.rs index 467cfb2..0ba66fe 100644 --- a/src/avx512bw/rgba_to_yuv.rs +++ b/src/avx512bw/rgba_to_yuv.rs @@ -49,20 +49,10 @@ pub(crate) fn avx512_rgba_to_yuv( start_cx: usize, start_ux: usize, width: usize, - compute_uv_row: bool, ) -> ProcessedOffset { unsafe { avx512_rgba_to_yuv_impl::( - transform, - range, - y_plane, - u_plane, - v_plane, - rgba, - start_cx, - start_ux, - width, - compute_uv_row, + transform, range, y_plane, u_plane, v_plane, rgba, start_cx, start_ux, width, ) } } @@ -78,7 +68,6 @@ unsafe fn avx512_rgba_to_yuv_impl start_cx: usize, start_ux: usize, width: usize, - compute_uv_row: bool, ) -> ProcessedOffset { let chroma_subsampling: YuvChromaSubsampling = SAMPLING.into(); let source_channels: YuvSourceChannels = ORIGIN_CHANNELS.into(); @@ -238,11 +227,11 @@ unsafe fn avx512_rgba_to_yuv_impl _mm512_storeu_si512(v_ptr.add(uv_x) as *mut i32, cr); uv_x += 64; } else if chroma_subsampling == YuvChromaSubsampling::Yuv422 - || (chroma_subsampling == YuvChromaSubsampling::Yuv420 && compute_uv_row) + || (chroma_subsampling == YuvChromaSubsampling::Yuv420) { - let r1 = avx512_pairwise_avg_epi8(r_values); - let g1 = avx512_pairwise_avg_epi8(g_values); - let b1 = avx512_pairwise_avg_epi8(b_values); + let r1 = _mm512_slli_epi16::(avx512_pairwise_avg_epi8(r_values)); + let g1 = _mm512_slli_epi16::(avx512_pairwise_avg_epi8(g_values)); + let b1 = _mm512_slli_epi16::(avx512_pairwise_avg_epi8(b_values)); let cbk = _mm512_max_epi16( _mm512_min_epi16( diff --git a/src/avx512bw/rgba_to_yuv420.rs b/src/avx512bw/rgba_to_yuv420.rs index 477feb6..d96f136 100644 --- a/src/avx512bw/rgba_to_yuv420.rs +++ b/src/avx512bw/rgba_to_yuv420.rs @@ -209,9 +209,10 @@ unsafe fn avx512_rgba_to_yuv_impl420( y_plane1.get_unchecked_mut(cx..).as_mut_ptr() as *mut i32, y_yuv1, ); - let r1 = avx512_pairwise_avg_epi16_epi8(r_values0, r_values1); - let g1 = avx512_pairwise_avg_epi16_epi8(g_values0, g_values1); - let b1 = avx512_pairwise_avg_epi16_epi8(b_values0, b_values1); + + let r1 = _mm512_slli_epi16::(avx512_pairwise_avg_epi16_epi8(r_values0, r_values1)); + let g1 = _mm512_slli_epi16::(avx512_pairwise_avg_epi16_epi8(g_values0, g_values1)); + let b1 = _mm512_slli_epi16::(avx512_pairwise_avg_epi16_epi8(b_values0, b_values1)); let cbk = _mm512_max_epi16( _mm512_min_epi16( diff --git a/src/rgba_to_yuv.rs b/src/rgba_to_yuv.rs index 498f8bb..80f8f18 100644 --- a/src/rgba_to_yuv.rs +++ b/src/rgba_to_yuv.rs @@ -140,7 +140,6 @@ fn rgbx_to_yuv8( _offset.cx, _offset.ux, image.width as usize, - compute_uv_row, ); _offset = processed_offset; }