Diffstat (limited to 'vendor/jpeg-decoder/src/arch')
-rw-r--r-- | vendor/jpeg-decoder/src/arch/mod.rs   |  46
-rw-r--r-- | vendor/jpeg-decoder/src/arch/neon.rs  | 221
-rw-r--r-- | vendor/jpeg-decoder/src/arch/ssse3.rs | 288
3 files changed, 0 insertions, 555 deletions
diff --git a/vendor/jpeg-decoder/src/arch/mod.rs b/vendor/jpeg-decoder/src/arch/mod.rs
deleted file mode 100644
index 15b46c5..0000000
--- a/vendor/jpeg-decoder/src/arch/mod.rs
+++ /dev/null
@@ -1,46 +0,0 @@
-#![allow(unsafe_code)]
-
-mod neon;
-mod ssse3;
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-use std::is_x86_feature_detected;
-
-/// Arch-specific implementation of YCbCr conversion. Returns the number of pixels that were
-/// converted.
-pub fn get_color_convert_line_ycbcr() -> Option<unsafe fn(&[u8], &[u8], &[u8], &mut [u8]) -> usize>
-{
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    #[allow(unsafe_code)]
-    {
-        if is_x86_feature_detected!("ssse3") {
-            return Some(ssse3::color_convert_line_ycbcr);
-        }
-    }
-    // Runtime detection is not needed on aarch64.
-    #[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
-    {
-        return Some(neon::color_convert_line_ycbcr);
-    }
-    #[allow(unreachable_code)]
-    None
-}
-
-/// Arch-specific implementation of 8x8 IDCT.
-pub fn get_dequantize_and_idct_block_8x8(
-) -> Option<unsafe fn(&[i16; 64], &[u16; 64], usize, &mut [u8])> {
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    #[allow(unsafe_code)]
-    {
-        if is_x86_feature_detected!("ssse3") {
-            return Some(ssse3::dequantize_and_idct_block_8x8);
-        }
-    }
-    // Runtime detection is not needed on aarch64.
-    #[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
-    {
-        return Some(neon::dequantize_and_idct_block_8x8);
-    }
-    #[allow(unreachable_code)]
-    None
-}
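
For context, a minimal caller-side sketch of how such a dispatcher is typically consumed (illustrative only, not part of this diff; color_convert_scalar is a hypothetical scalar fallback):

    fn convert_line(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) {
        // Ask the arch module for a SIMD routine; None means no suitable SIMD support.
        let done = match arch::get_color_convert_line_ycbcr() {
            // Safety: a pointer is only returned when the required target
            // feature (SSSE3 or NEON) is known to be available.
            Some(f) => unsafe { f(y, cb, cr, output) },
            None => 0,
        };
        // Convert whatever pixels the SIMD path did not cover with scalar code.
        color_convert_scalar(&y[done..], &cb[done..], &cr[done..], &mut output[done * 3..]);
    }

Returning an Option of a function pointer keeps the unsafe feature detection in one place while the hot loop stays branch-free.
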
diff --git a/vendor/jpeg-decoder/src/arch/neon.rs b/vendor/jpeg-decoder/src/arch/neon.rs
deleted file mode 100644
index 4843578..0000000
--- a/vendor/jpeg-decoder/src/arch/neon.rs
+++ /dev/null
@@ -1,221 +0,0 @@
-#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
-use core::arch::aarch64::*;
-
-#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
-#[target_feature(enable = "neon")]
-unsafe fn idct8(data: &mut [int16x8_t; 8]) {
-    // The fixed-point constants here are obtained by taking the fractional part of the constants
-    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
-    // vqrdmulhq_n_s16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
-    // slight differences in rounding).
-
-    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
-    // doesn't apply any further scaling and fixed point constants have a different precision.
-
-    let p2 = data[2];
-    let p3 = data[6];
-    let p1 = vqrdmulhq_n_s16(vqaddq_s16(p2, p3), 17734); // 0.5411961
-    let t2 = vqsubq_s16(
-        vqsubq_s16(p1, p3),
-        vqrdmulhq_n_s16(p3, 27779), // 0.847759065
-    );
-    let t3 = vqaddq_s16(p1, vqrdmulhq_n_s16(p2, 25079)); // 0.765366865
-
-    let p2 = data[0];
-    let p3 = data[4];
-    let t0 = vqaddq_s16(p2, p3);
-    let t1 = vqsubq_s16(p2, p3);
-
-    let x0 = vqaddq_s16(t0, t3);
-    let x3 = vqsubq_s16(t0, t3);
-    let x1 = vqaddq_s16(t1, t2);
-    let x2 = vqsubq_s16(t1, t2);
-
-    let t0 = data[7];
-    let t1 = data[5];
-    let t2 = data[3];
-    let t3 = data[1];
-
-    let p3 = vqaddq_s16(t0, t2);
-    let p4 = vqaddq_s16(t1, t3);
-    let p1 = vqaddq_s16(t0, t3);
-    let p2 = vqaddq_s16(t1, t2);
-    let p5 = vqaddq_s16(p3, p4);
-    let p5 = vqaddq_s16(p5, vqrdmulhq_n_s16(p5, 5763)); // 0.175875602
-
-    let t0 = vqrdmulhq_n_s16(t0, 9786); // 0.298631336
-    let t1 = vqaddq_s16(
-        vqaddq_s16(t1, t1),
-        vqrdmulhq_n_s16(t1, 1741), // 0.053119869
-    );
-    let t2 = vqaddq_s16(
-        vqaddq_s16(t2, vqaddq_s16(t2, t2)),
-        vqrdmulhq_n_s16(t2, 2383), // 0.072711026
-    );
-    let t3 = vqaddq_s16(t3, vqrdmulhq_n_s16(t3, 16427)); // 0.501321110
-
-    let p1 = vqsubq_s16(p5, vqrdmulhq_n_s16(p1, 29490)); // 0.899976223
-    let p2 = vqsubq_s16(
-        vqsubq_s16(vqsubq_s16(p5, p2), p2),
-        vqrdmulhq_n_s16(p2, 18446), // 0.562915447
-    );
-
-    let p3 = vqsubq_s16(
-        vqrdmulhq_n_s16(p3, -31509), // -0.961570560
-        p3,
-    );
-    let p4 = vqrdmulhq_n_s16(p4, -12785); // -0.390180644
-
-    let t3 = vqaddq_s16(vqaddq_s16(p1, p4), t3);
-    let t2 = vqaddq_s16(vqaddq_s16(p2, p3), t2);
-    let t1 = vqaddq_s16(vqaddq_s16(p2, p4), t1);
-    let t0 = vqaddq_s16(vqaddq_s16(p1, p3), t0);
-
-    data[0] = vqaddq_s16(x0, t3);
-    data[7] = vqsubq_s16(x0, t3);
-    data[1] = vqaddq_s16(x1, t2);
-    data[6] = vqsubq_s16(x1, t2);
-    data[2] = vqaddq_s16(x2, t1);
-    data[5] = vqsubq_s16(x2, t1);
-    data[3] = vqaddq_s16(x3, t0);
-    data[4] = vqsubq_s16(x3, t0);
-}
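
A quick sanity check on the fixed-point constants in idct8 above (illustrative arithmetic, not part of the diff): each multiplier is the fractional part of a scalar IDCT constant scaled by 1 << 15, and constants >= 1 contribute their integer part through separate additions.

    // 0.5411961   * 32768 = 17733.9 -> 17734
    // 0.847759065 * 32768 = 27779.3 -> 27779
    // 1.501321110 = 1 + 0.501321110; 0.501321110 * 32768 = 16427.3 -> 16427,
    // so t3 + vqrdmulhq_n_s16(t3, 16427) computes t3 * 1.501321110.
    // vqrdmulhq_n_s16(a, b) itself is a saturating, rounding "high half"
    // multiply: roughly (2 * a * b + (1 << 15)) >> 16, i.e. ~(a * b) >> 15.
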
-
-#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
-#[target_feature(enable = "neon")]
-unsafe fn transpose8(data: &mut [int16x8_t; 8]) {
-    // Use NEON's 2x2 matrix transposes (vtrn) to do the transposition in each 4x4 block, then
-    // combine the 4x4 blocks.
-    let a01 = vtrnq_s16(data[0], data[1]);
-    let a23 = vtrnq_s16(data[2], data[3]);
-
-    let four0 = vtrnq_s32(vreinterpretq_s32_s16(a01.0), vreinterpretq_s32_s16(a23.0));
-    let four1 = vtrnq_s32(vreinterpretq_s32_s16(a01.1), vreinterpretq_s32_s16(a23.1));
-
-    let a45 = vtrnq_s16(data[4], data[5]);
-    let a67 = vtrnq_s16(data[6], data[7]);
-
-    let four2 = vtrnq_s32(vreinterpretq_s32_s16(a45.0), vreinterpretq_s32_s16(a67.0));
-    let four3 = vtrnq_s32(vreinterpretq_s32_s16(a45.1), vreinterpretq_s32_s16(a67.1));
-
-    data[0] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.0), vget_low_s32(four2.0)));
-    data[1] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.0), vget_low_s32(four3.0)));
-    data[2] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four0.1), vget_low_s32(four2.1)));
-    data[3] = vreinterpretq_s16_s32(vcombine_s32(vget_low_s32(four1.1), vget_low_s32(four3.1)));
-    data[4] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.0), vget_high_s32(four2.0)));
-    data[5] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.0), vget_high_s32(four3.0)));
-    data[6] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four0.1), vget_high_s32(four2.1)));
-    data[7] = vreinterpretq_s16_s32(vcombine_s32(vget_high_s32(four1.1), vget_high_s32(four3.1)));
-}
-
-#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
-#[target_feature(enable = "neon")]
-pub unsafe fn dequantize_and_idct_block_8x8(
-    coefficients: &[i16; 64],
-    quantization_table: &[u16; 64],
-    output_linestride: usize,
-    output: &mut [u8],
-) {
-    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
-    // for 0<=i<8. Thus, the last accessed position is at an offset of output_linestride * 7 + 7,
-    // and if that position is in-bounds, so are all other accesses.
-    assert!(
-        output.len()
-            > output_linestride
-                .checked_mul(7)
-                .unwrap()
-                .checked_add(7)
-                .unwrap()
-    );
-
-    const SHIFT: i32 = 3;
-
-    // Read the DCT coefficients, scale them up and dequantize them.
-    let mut data = [vdupq_n_s16(0); 8];
-    for i in 0..8 {
-        data[i] = vshlq_n_s16(
-            vmulq_s16(
-                vld1q_s16(coefficients.as_ptr().wrapping_add(i * 8)),
-                vreinterpretq_s16_u16(vld1q_u16(quantization_table.as_ptr().wrapping_add(i * 8))),
-            ),
-            SHIFT,
-        );
-    }
-
-    // Usual column IDCT - transpose - column IDCT - transpose approach.
-    idct8(&mut data);
-    transpose8(&mut data);
-    idct8(&mut data);
-    transpose8(&mut data);
-
-    for i in 0..8 {
-        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
-        // increased by 3.
-        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
-        // We add 128 with the appropriate shift for that purpose.
-        const OFFSET: i16 = 128 << (SHIFT + 3);
-        // We want a rounding right shift, so we should add (1/2) << (SHIFT + 3) before shifting.
-        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
-
-        let data_with_offset = vqaddq_s16(data[i], vdupq_n_s16(OFFSET + ROUNDING_BIAS));
-
-        vst1_u8(
-            output.as_mut_ptr().wrapping_add(output_linestride * i),
-            vqshrun_n_s16(data_with_offset, SHIFT + 3),
-        );
-    }
-}
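
The OFFSET/ROUNDING_BIAS pattern above is the usual rounding-right-shift trick. A scalar sketch of the assumed semantics, for illustration only:

    // Adding half of 1 << n before an arithmetic right shift by n rounds to
    // nearest instead of truncating toward negative infinity.
    fn rounding_shr(x: i32, n: u32) -> i32 {
        (x + (1 << (n - 1))) >> n
    }
    // rounding_shr(13, 2) == 3 (13/4 = 3.25), rounding_shr(14, 2) == 4 (3.5 rounds up).
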
-
-#[cfg(all(feature = "nightly_aarch64_neon", target_arch = "aarch64"))]
-#[target_feature(enable = "neon")]
-pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
-    assert!(output.len() % 3 == 0);
-    let num = output.len() / 3;
-    assert!(num <= y.len());
-    assert!(num <= cb.len());
-    assert!(num <= cr.len());
-    let num_vecs = num / 8;
-
-    for i in 0..num_vecs {
-        const SHIFT: i32 = 6;
-        // Load.
-        let y = vld1_u8(y.as_ptr().wrapping_add(i * 8));
-        let cb = vld1_u8(cb.as_ptr().wrapping_add(i * 8));
-        let cr = vld1_u8(cr.as_ptr().wrapping_add(i * 8));
-
-        // Convert to 16 bit and shift.
-        let y = vreinterpretq_s16_u16(vshll_n_u8(y, SHIFT));
-        let cb = vreinterpretq_s16_u16(vshll_n_u8(cb, SHIFT));
-        let cr = vreinterpretq_s16_u16(vshll_n_u8(cr, SHIFT));
-
-        // Add offsets
-        let y = vqaddq_s16(y, vdupq_n_s16((1 << SHIFT) >> 1));
-        let c128 = vdupq_n_s16(128 << SHIFT);
-        let cb = vqsubq_s16(cb, c128);
-        let cr = vqsubq_s16(cr, c128);
-
-        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
-        let cr_140200 = vqaddq_s16(vqrdmulhq_n_s16(cr, 13173), cr);
-        let cb_034414 = vqrdmulhq_n_s16(cb, 11276);
-        let cr_071414 = vqrdmulhq_n_s16(cr, 23401);
-        let cb_177200 = vqaddq_s16(vqrdmulhq_n_s16(cb, 25297), cb);
-
-        // Last conversion step.
-        let r = vqaddq_s16(y, cr_140200);
-        let g = vqsubq_s16(y, vqaddq_s16(cb_034414, cr_071414));
-        let b = vqaddq_s16(y, cb_177200);
-
-        // Shift back and convert to u8.
-        let r = vqshrun_n_s16(r, SHIFT);
-        let g = vqshrun_n_s16(g, SHIFT);
-        let b = vqshrun_n_s16(b, SHIFT);
-
-        // Shuffle + store.
-        vst3_u8(
-            output.as_mut_ptr().wrapping_add(24 * i),
-            uint8x8x3_t(r, g, b),
-        );
-    }
-
-    num_vecs * 8
-}
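
The constants 1.402, 0.34414, 0.71414 and 1.772 above are the standard JFIF YCbCr-to-RGB coefficients. A scalar floating-point reference for comparison (a sketch, not code from this crate):

    fn ycbcr_to_rgb(y: u8, cb: u8, cr: u8) -> [u8; 3] {
        let (y, cb, cr) = (y as f32, cb as f32 - 128.0, cr as f32 - 128.0);
        // Clamp to the u8 range, mirroring the saturating narrowing above.
        let clamp = |v: f32| v.round().max(0.0).min(255.0) as u8;
        [
            clamp(y + 1.402 * cr),                  // R
            clamp(y - 0.34414 * cb - 0.71414 * cr), // G
            clamp(y + 1.772 * cb),                  // B
        ]
    }
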
diff --git a/vendor/jpeg-decoder/src/arch/ssse3.rs b/vendor/jpeg-decoder/src/arch/ssse3.rs
deleted file mode 100644
index 374a70c..0000000
--- a/vendor/jpeg-decoder/src/arch/ssse3.rs
+++ /dev/null
@@ -1,288 +0,0 @@
-#[cfg(target_arch = "x86")]
-use std::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[target_feature(enable = "ssse3")]
-unsafe fn idct8(data: &mut [__m128i; 8]) {
-    // The fixed-point constants here are obtained by taking the fractional part of the constants
-    // from the non-SIMD implementation and scaling them up by 1<<15. This is because
-    // _mm_mulhrs_epi16(a, b) is effectively equivalent to (a*b)>>15 (except for possibly some
-    // slight differences in rounding).
-
-    // The code here is effectively equivalent to the calls to "kernel" in idct.rs, except that it
-    // doesn't apply any further scaling and fixed point constants have a different precision.
-
-    let p2 = data[2];
-    let p3 = data[6];
-    let p1 = _mm_mulhrs_epi16(_mm_adds_epi16(p2, p3), _mm_set1_epi16(17734)); // 0.5411961
-    let t2 = _mm_subs_epi16(
-        _mm_subs_epi16(p1, p3),
-        _mm_mulhrs_epi16(p3, _mm_set1_epi16(27779)), // 0.847759065
-    );
-    let t3 = _mm_adds_epi16(p1, _mm_mulhrs_epi16(p2, _mm_set1_epi16(25079))); // 0.765366865
-
-    let p2 = data[0];
-    let p3 = data[4];
-    let t0 = _mm_adds_epi16(p2, p3);
-    let t1 = _mm_subs_epi16(p2, p3);
-
-    let x0 = _mm_adds_epi16(t0, t3);
-    let x3 = _mm_subs_epi16(t0, t3);
-    let x1 = _mm_adds_epi16(t1, t2);
-    let x2 = _mm_subs_epi16(t1, t2);
-
-    let t0 = data[7];
-    let t1 = data[5];
-    let t2 = data[3];
-    let t3 = data[1];
-
-    let p3 = _mm_adds_epi16(t0, t2);
-    let p4 = _mm_adds_epi16(t1, t3);
-    let p1 = _mm_adds_epi16(t0, t3);
-    let p2 = _mm_adds_epi16(t1, t2);
-    let p5 = _mm_adds_epi16(p3, p4);
-    let p5 = _mm_adds_epi16(p5, _mm_mulhrs_epi16(p5, _mm_set1_epi16(5763))); // 0.175875602
-
-    let t0 = _mm_mulhrs_epi16(t0, _mm_set1_epi16(9786)); // 0.298631336
-    let t1 = _mm_adds_epi16(
-        _mm_adds_epi16(t1, t1),
-        _mm_mulhrs_epi16(t1, _mm_set1_epi16(1741)), // 0.053119869
-    );
-    let t2 = _mm_adds_epi16(
-        _mm_adds_epi16(t2, _mm_adds_epi16(t2, t2)),
-        _mm_mulhrs_epi16(t2, _mm_set1_epi16(2383)), // 0.072711026
-    );
-    let t3 = _mm_adds_epi16(t3, _mm_mulhrs_epi16(t3, _mm_set1_epi16(16427))); // 0.501321110
-
-    let p1 = _mm_subs_epi16(p5, _mm_mulhrs_epi16(p1, _mm_set1_epi16(29490))); // 0.899976223
-    let p2 = _mm_subs_epi16(
-        _mm_subs_epi16(_mm_subs_epi16(p5, p2), p2),
-        _mm_mulhrs_epi16(p2, _mm_set1_epi16(18446)), // 0.562915447
-    );
-
-    let p3 = _mm_subs_epi16(
-        _mm_mulhrs_epi16(p3, _mm_set1_epi16(-31509)), // -0.961570560
-        p3,
-    );
-    let p4 = _mm_mulhrs_epi16(p4, _mm_set1_epi16(-12785)); // -0.390180644
-
-    let t3 = _mm_adds_epi16(_mm_adds_epi16(p1, p4), t3);
-    let t2 = _mm_adds_epi16(_mm_adds_epi16(p2, p3), t2);
-    let t1 = _mm_adds_epi16(_mm_adds_epi16(p2, p4), t1);
-    let t0 = _mm_adds_epi16(_mm_adds_epi16(p1, p3), t0);
-
-    data[0] = _mm_adds_epi16(x0, t3);
-    data[7] = _mm_subs_epi16(x0, t3);
-    data[1] = _mm_adds_epi16(x1, t2);
-    data[6] = _mm_subs_epi16(x1, t2);
-    data[2] = _mm_adds_epi16(x2, t1);
-    data[5] = _mm_subs_epi16(x2, t1);
-    data[3] = _mm_adds_epi16(x3, t0);
-    data[4] = _mm_subs_epi16(x3, t0);
-}
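
For checking the constants above on any host, a scalar model of _mm_mulhrs_epi16 following its documented definition (a sketch; saturation of the i16 edge case is omitted):

    fn mulhrs(a: i16, b: i16) -> i16 {
        // Multiply, shift right by 14, add 1 to round, then drop the extra bit.
        ((((a as i32) * (b as i32) >> 14) + 1) >> 1) as i16
    }
    // mulhrs(1000, 17734) == 541, i.e. 1000 * 0.5411961 rounded to nearest.
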
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[target_feature(enable = "ssse3")]
-unsafe fn transpose8(data: &mut [__m128i; 8]) {
-    // Transpose an 8x8 matrix with a sequence of interleaving operations.
-    // Naming: dABl contains elements from the *l*ower halves of vectors A and B, interleaved, i.e.
-    // A0 B0 A1 B1 ...
-    // dABCDll contains elements from the lower quarter (ll) of vectors A, B, C, D, interleaved -
-    // A0 B0 C0 D0 A1 B1 C1 D1 ...
-    let d01l = _mm_unpacklo_epi16(data[0], data[1]);
-    let d23l = _mm_unpacklo_epi16(data[2], data[3]);
-    let d45l = _mm_unpacklo_epi16(data[4], data[5]);
-    let d67l = _mm_unpacklo_epi16(data[6], data[7]);
-    let d01h = _mm_unpackhi_epi16(data[0], data[1]);
-    let d23h = _mm_unpackhi_epi16(data[2], data[3]);
-    let d45h = _mm_unpackhi_epi16(data[4], data[5]);
-    let d67h = _mm_unpackhi_epi16(data[6], data[7]);
-    // Operating on 32 bits will interleave *consecutive pairs* of 16-bit integers.
-    let d0123ll = _mm_unpacklo_epi32(d01l, d23l);
-    let d0123lh = _mm_unpackhi_epi32(d01l, d23l);
-    let d4567ll = _mm_unpacklo_epi32(d45l, d67l);
-    let d4567lh = _mm_unpackhi_epi32(d45l, d67l);
-    let d0123hl = _mm_unpacklo_epi32(d01h, d23h);
-    let d0123hh = _mm_unpackhi_epi32(d01h, d23h);
-    let d4567hl = _mm_unpacklo_epi32(d45h, d67h);
-    let d4567hh = _mm_unpackhi_epi32(d45h, d67h);
-    // Operating on 64 bits will interleave *consecutive quadruples* of 16-bit integers.
-    data[0] = _mm_unpacklo_epi64(d0123ll, d4567ll);
-    data[1] = _mm_unpackhi_epi64(d0123ll, d4567ll);
-    data[2] = _mm_unpacklo_epi64(d0123lh, d4567lh);
-    data[3] = _mm_unpackhi_epi64(d0123lh, d4567lh);
-    data[4] = _mm_unpacklo_epi64(d0123hl, d4567hl);
-    data[5] = _mm_unpackhi_epi64(d0123hl, d4567hl);
-    data[6] = _mm_unpacklo_epi64(d0123hh, d4567hh);
-    data[7] = _mm_unpackhi_epi64(d0123hh, d4567hh);
-}
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[target_feature(enable = "ssse3")]
-pub unsafe fn dequantize_and_idct_block_8x8(
-    coefficients: &[i16; 64],
-    quantization_table: &[u16; 64],
-    output_linestride: usize,
-    output: &mut [u8],
-) {
-    // The loop below will write to positions [output_linestride * i, output_linestride * i + 8)
-    // for 0<=i<8. Thus, the last accessed position is at an offset of output_linestride * 7 + 7,
-    // and if that position is in-bounds, so are all other accesses.
-    assert!(
-        output.len()
-            > output_linestride
-                .checked_mul(7)
-                .unwrap()
-                .checked_add(7)
-                .unwrap()
-    );
-
-    #[cfg(target_arch = "x86")]
-    use std::arch::x86::*;
-    #[cfg(target_arch = "x86_64")]
-    use std::arch::x86_64::*;
-
-    const SHIFT: i32 = 3;
-
-    // Read the DCT coefficients, scale them up and dequantize them.
-    let mut data = [_mm_setzero_si128(); 8];
-    for i in 0..8 {
-        data[i] = _mm_slli_epi16(
-            _mm_mullo_epi16(
-                _mm_loadu_si128(coefficients.as_ptr().wrapping_add(i * 8) as *const _),
-                _mm_loadu_si128(quantization_table.as_ptr().wrapping_add(i * 8) as *const _),
-            ),
-            SHIFT,
-        );
-    }
-
-    // Usual column IDCT - transpose - column IDCT - transpose approach.
-    idct8(&mut data);
-    transpose8(&mut data);
-    idct8(&mut data);
-    transpose8(&mut data);
-
-    for i in 0..8 {
-        let mut buf = [0u8; 16];
-        // The two passes of the IDCT algorithm give us a factor of 8, so the shift here is
-        // increased by 3.
-        // As values will be stored in a u8, they need to be 128-centered and not 0-centered.
-        // We add 128 with the appropriate shift for that purpose.
-        const OFFSET: i16 = 128 << (SHIFT + 3);
-        // We want a rounding right shift, so we should add (1/2) << (SHIFT + 3) before shifting.
-        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
-
-        let data_with_offset = _mm_adds_epi16(data[i], _mm_set1_epi16(OFFSET + ROUNDING_BIAS));
-
-        _mm_storeu_si128(
-            buf.as_mut_ptr() as *mut _,
-            _mm_packus_epi16(
-                _mm_srai_epi16(data_with_offset, SHIFT + 3),
-                _mm_setzero_si128(),
-            ),
-        );
-        std::ptr::copy_nonoverlapping::<u8>(
-            buf.as_ptr(),
-            output.as_mut_ptr().wrapping_add(output_linestride * i) as *mut _,
-            8,
-        );
-    }
-}
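
Two details worth noting in the row store above: _mm_packus_epi16 both narrows and saturates to [0, 255], and the 16-byte staging buffer exists because _mm_storeu_si128 always writes 16 bytes while each row needs only 8. A scalar equivalent of the per-sample step (a sketch under the same SHIFT convention):

    fn idct_sample_to_u8(x: i16) -> u8 {
        const SHIFT: i32 = 3;
        const OFFSET: i16 = 128 << (SHIFT + 3);
        const ROUNDING_BIAS: i16 = (1 << (SHIFT + 3)) >> 1;
        // Saturating add, rounding shift, then clamp like packus does.
        (x.saturating_add(OFFSET + ROUNDING_BIAS) >> (SHIFT + 3)).clamp(0, 255) as u8
    }
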
-
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[target_feature(enable = "ssse3")]
-pub unsafe fn color_convert_line_ycbcr(y: &[u8], cb: &[u8], cr: &[u8], output: &mut [u8]) -> usize {
-    assert!(output.len() % 3 == 0);
-    let num = output.len() / 3;
-    assert!(num <= y.len());
-    assert!(num <= cb.len());
-    assert!(num <= cr.len());
-    // _mm_loadu_si64 generates incorrect code for Rust <1.58. To circumvent this, we use a full
-    // 128-bit load, but that requires leaving an extra vector of border to the scalar code.
-    // From Rust 1.58 on, the _mm_loadu_si128 can be replaced with _mm_loadu_si64 and this
-    // .saturating_sub() can be removed.
-    let num_vecs = (num / 8).saturating_sub(1);
-
-    for i in 0..num_vecs {
-        const SHIFT: i32 = 6;
-        // Load.
-        let y = _mm_loadu_si128(y.as_ptr().wrapping_add(i * 8) as *const _);
-        let cb = _mm_loadu_si128(cb.as_ptr().wrapping_add(i * 8) as *const _);
-        let cr = _mm_loadu_si128(cr.as_ptr().wrapping_add(i * 8) as *const _);
-
-        // Convert to 16 bit.
-        let shuf16 = _mm_setr_epi8(
-            0, -0x7F, 1, -0x7F, 2, -0x7F, 3, -0x7F, 4, -0x7F, 5, -0x7F, 6, -0x7F, 7, -0x7F,
-        );
-        let y = _mm_slli_epi16(_mm_shuffle_epi8(y, shuf16), SHIFT);
-        let cb = _mm_slli_epi16(_mm_shuffle_epi8(cb, shuf16), SHIFT);
-        let cr = _mm_slli_epi16(_mm_shuffle_epi8(cr, shuf16), SHIFT);
-
-        // Add offsets
-        let c128 = _mm_set1_epi16(128 << SHIFT);
-        let y = _mm_adds_epi16(y, _mm_set1_epi16((1 << SHIFT) >> 1));
-        let cb = _mm_subs_epi16(cb, c128);
-        let cr = _mm_subs_epi16(cr, c128);
-
-        // Compute cr * 1.402, cb * 0.34414, cr * 0.71414, cb * 1.772
-        let cr_140200 = _mm_adds_epi16(_mm_mulhrs_epi16(cr, _mm_set1_epi16(13173)), cr);
-        let cb_034414 = _mm_mulhrs_epi16(cb, _mm_set1_epi16(11276));
-        let cr_071414 = _mm_mulhrs_epi16(cr, _mm_set1_epi16(23401));
-        let cb_177200 = _mm_adds_epi16(_mm_mulhrs_epi16(cb, _mm_set1_epi16(25297)), cb);
-
-        // Last conversion step.
-        let r = _mm_adds_epi16(y, cr_140200);
-        let g = _mm_subs_epi16(y, _mm_adds_epi16(cb_034414, cr_071414));
-        let b = _mm_adds_epi16(y, cb_177200);
-
-        // Shift back and convert to u8.
-        let zero = _mm_setzero_si128();
-        let r = _mm_packus_epi16(_mm_srai_epi16(r, SHIFT), zero);
-        let g = _mm_packus_epi16(_mm_srai_epi16(g, SHIFT), zero);
-        let b = _mm_packus_epi16(_mm_srai_epi16(b, SHIFT), zero);
-
-        // Shuffle rrrrrrrrggggggggbbbbbbbb to rgbrgbrgb...
-
-        // Control vectors for _mm_shuffle_epi8. -0x7F is selected so that the resulting position
-        // after _mm_shuffle_epi8 will be filled with 0, so that the r, g, and b vectors can then
-        // be OR-ed together.
-        let shufr = _mm_setr_epi8(
-            0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F, -0x7F, 5,
-        );
-        let shufg = _mm_setr_epi8(
-            -0x7F, 0, -0x7F, -0x7F, 1, -0x7F, -0x7F, 2, -0x7F, -0x7F, 3, -0x7F, -0x7F, 4, -0x7F,
-            -0x7F,
-        );
-        let shufb = _mm_alignr_epi8(shufg, shufg, 15);
-
-        let rgb_low = _mm_or_si128(
-            _mm_shuffle_epi8(r, shufr),
-            _mm_or_si128(_mm_shuffle_epi8(g, shufg), _mm_shuffle_epi8(b, shufb)),
-        );
-
-        // For the next part of the rgb vectors, we need to select R values from 6 up, G and B from
-        // 5 up. The highest bit of -0x7F + 6 is still set, so the corresponding location will
-        // still be 0.
-        let shufr1 = _mm_add_epi8(shufb, _mm_set1_epi8(6));
-        let shufg1 = _mm_add_epi8(shufr, _mm_set1_epi8(5));
-        let shufb1 = _mm_add_epi8(shufg, _mm_set1_epi8(5));
-
-        let rgb_hi = _mm_or_si128(
-            _mm_shuffle_epi8(r, shufr1),
-            _mm_or_si128(_mm_shuffle_epi8(g, shufg1), _mm_shuffle_epi8(b, shufb1)),
-        );
-
-        let mut data = [0u8; 32];
-        _mm_storeu_si128(data.as_mut_ptr() as *mut _, rgb_low);
-        _mm_storeu_si128(data.as_mut_ptr().wrapping_add(16) as *mut _, rgb_hi);
-        std::ptr::copy_nonoverlapping::<u8>(
-            data.as_ptr(),
-            output.as_mut_ptr().wrapping_add(24 * i),
-            24,
-        );
-    }
-
-    num_vecs * 8
-}
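
Finally, a hedged end-to-end check (a hypothetical test, not from the crate's test suite): whichever SIMD path the dispatcher selects should agree with the floating-point ycbcr_to_rgb reference sketched earlier to within fixed-point rounding error.

    #[test]
    fn simd_color_convert_matches_reference() {
        let y: Vec<u8> = (0..64u16).map(|i| (i * 4) as u8).collect();
        let cb = vec![90u8; 64];
        let cr = vec![200u8; 64];
        let mut out = vec![0u8; 64 * 3];
        if let Some(f) = arch::get_color_convert_line_ycbcr() {
            // Safety: `f` is only Some when the needed CPU features exist.
            let n = unsafe { f(&y, &cb, &cr, &mut out) };
            for i in 0..n {
                let expected = ycbcr_to_rgb(y[i], cb[i], cr[i]);
                for c in 0..3 {
                    // Allow a small tolerance for fixed-point rounding.
                    assert!((out[3 * i + c] as i16 - expected[c] as i16).abs() <= 2);
                }
            }
        }
    }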